Esempio n. 1
0
def compare_speed():
    """
    Profile the MRG and CURAND random streams against each other.

    Four Theano functions are compiled, each of which overwrites one shared
    vector with freshly drawn uniform or normal variates from one of the two
    generators. Their graphs are printed with `debugprint`, and then every
    function is called 100 times with profiling enabled.

    To run this speed comparison:
        cd <directory of this file>
        THEANO_FLAGS=device=gpu \
          python -c 'import test_rng_curand; test_rng_curand.compare_speed()'
    """
    mrg = MRG_RandomStreams()
    crn = CURAND_RandomStreams(234)

    N = 1000 * 100

    # Single destination buffer that every compiled function writes into.
    dest = theano.shared(numpy.zeros(N, dtype=theano.config.floatX))

    def _compile(samples, label):
        # No inputs, no outputs: the only effect is the update of `dest`.
        return theano.function([], [], updates={dest: samples},
                               profile=label)

    mrg_u = _compile(mrg.uniform((N,)), 'mrg uniform')
    crn_u = _compile(crn.uniform((N,)), 'crn uniform')
    mrg_n = _compile(mrg.normal((N,)), 'mrg normal')
    crn_n = _compile(crn.normal((N,)), 'crn normal')

    compiled = (mrg_u, crn_u, mrg_n, crn_n)

    # Dump the compiled graph of each function for inspection.
    for fn in compiled:
        print('DEBUGPRINT')
        print('----------')
        theano.printing.debugprint(fn)

    for call_idx in range(100):
        # don't time the first call, it has some startup cost
        timing_enabled = (call_idx > 0)
        for fn in compiled:
            fn.fn.time_thunks = timing_enabled
            fn()
Esempio n. 2
0
def compare_speed():
    """
    Compare the speed of MRG_RandomStreams and CURAND_RandomStreams.

    Compiles one Theano function per (generator, distribution) pair; each
    function takes no inputs, returns nothing, and simply updates a shared
    vector with new uniform or normal samples. The graphs are printed, then
    all functions are run 100 times so the attached profiles accumulate.

    To run this speed comparison:
        cd <directory of this file>
        THEANO_FLAGS=device=gpu \
          python -c 'import test_rng_curand; test_rng_curand.compare_speed()'
    """
    mrg = MRG_RandomStreams()
    crn = CURAND_RandomStreams(234)

    N = 1000 * 100
    size = (N,)

    # Shared vector that receives the samples of every function.
    dest = theano.shared(numpy.zeros(N, dtype=theano.config.floatX))

    # Draw-then-compile, one generator/distribution pair at a time.
    mrg_u = theano.function(
        [], [], updates={dest: mrg.uniform(size)}, profile='mrg uniform')
    crn_u = theano.function(
        [], [], updates={dest: crn.uniform(size)}, profile='crn uniform')
    mrg_n = theano.function(
        [], [], updates={dest: mrg.normal(size)}, profile='mrg normal')
    crn_n = theano.function(
        [], [], updates={dest: crn.normal(size)}, profile='crn normal')

    samplers = [mrg_u, crn_u, mrg_n, crn_n]

    # Print the compiled graph of each sampler.
    for sampler_fn in samplers:
        print('DEBUGPRINT')
        print('----------')
        theano.printing.debugprint(sampler_fn)

    for iteration in range(100):
        for sampler_fn in samplers:
            # don't time the first call, it has some startup cost
            sampler_fn.fn.time_thunks = (iteration > 0)
            sampler_fn()
Esempio n. 3
0
def check_normal_basic(shape_as_symbolic, dim_as_symbolic=False):
    """
    Run a basic sanity check on `CURAND_RandomStreams.normal`.

    Checks that the variates

     * have a mean in the right neighbourhood (near 0)
     * are of the specified shape
     * differ between successive calls

    Parameters
    ----------
    shape_as_symbolic : boolean
        If `True`, test the case that the shape tuple is a symbolic
        variable rather than known at compile-time.

    dim_as_symbolic : boolean
        If `True`, test the case that an element of the shape
        tuple is a Theano symbolic. Irrelevant if `shape_as_symbolic`
        is `True`.
    """
    rng = CURAND_RandomStreams(234)

    if shape_as_symbolic:
        # The whole shape is a TensorConstant with the value (10, 10).
        shape = constant((10, 10))
    elif dim_as_symbolic:
        # Only one dimension is symbolic, the other is a plain int.
        shape = (10, constant(10))
    else:
        shape = (10, 10)

    u0 = rng.normal(shape)
    u1 = rng.normal(shape)

    f0 = theano.function([], u0, mode=mode_with_gpu)
    f1 = theano.function([], u1, mode=mode_with_gpu)

    # Three draws from each compiled sampler (all f0 calls happen first).
    v0list = [f0() for _ in range(3)]
    v1list = [f1() for _ in range(3)]

    # Successive calls, and the two different samplers, must all disagree
    # element-wise.
    first_v0, second_v0 = v0list[0], v0list[1]
    first_v1, second_v1 = v1list[0], v1list[1]
    assert numpy.all(first_v0 != second_v0)
    assert numpy.all(first_v1 != second_v1)
    assert numpy.all(first_v0 != first_v1)

    for sample in v0list:
        assert sample.shape == (10, 10)
        assert sample.min() < sample.max()
        assert -.5 <= sample.mean() <= .5
Esempio n. 4
0
def check_normal_basic(shape_as_symbolic, dim_as_symbolic=False):
    """
    Sanity-check the `normal` method of a `CURAND_RandomStreams` object.

    The drawn variates are required to

     * have a mean in the right neighbourhood (near 0)
     * be of the specified shape
     * change from one call to the next

    Parameters
    ----------
    shape_as_symbolic : boolean
        If `True`, test the case that the shape tuple is a symbolic
        variable rather than known at compile-time.

    dim_as_symbolic : boolean
        If `True`, test the case that an element of the shape
        tuple is a Theano symbolic. Irrelevant if `shape_as_symbolic`
        is `True`.
    """
    rng = CURAND_RandomStreams(234)

    # Default: fully concrete shape; overridden below for symbolic cases.
    shape = (10, 10)
    if shape_as_symbolic:
        # instantiate a TensorConstant with the value (10, 10)
        shape = constant((10, 10))
    elif dim_as_symbolic:
        # Only one dimension is symbolic, with the others known
        shape = (10, constant(10))

    u0 = rng.normal(shape)
    u1 = rng.normal(shape)

    f0 = theano.function([], u0, mode=mode_with_gpu)
    f1 = theano.function([], u1, mode=mode_with_gpu)

    v0list = []
    for _ in range(3):
        v0list.append(f0())
    v1list = []
    for _ in range(3):
        v1list.append(f1())

    # assert that elements are different in a few ways
    assert numpy.all(v0list[0] != v0list[1])
    assert numpy.all(v1list[0] != v1list[1])
    assert numpy.all(v0list[0] != v1list[0])

    expected_shape = (10, 10)
    for draw in v0list:
        assert draw.shape == expected_shape
        assert draw.min() < draw.max()
        assert -.5 <= draw.mean() <= .5
Esempio n. 5
0
    def sampler(self, mu, log_sigma):
        """
        Build a symbolic sample from `mu` and `log_sigma` using Gaussian noise.

        A CURAND-backed random stream is used when the configured Theano
        device name contains "gpu"; otherwise the standard shared random
        stream is used. The result is
        `mu + (exp(0.5 * log_sigma) - 1) * eps * 0.5`, where `eps` is normal
        noise with the same shape as `mu`.

        NOTE(review): `seed` is not defined in this method; it is presumably
        a module-level or enclosing-scope name -- confirm.
        """
        use_gpu_stream = "gpu" in theano.config.device
        if not use_gpu_stream:
            srng = T.shared_randomstreams.RandomStreams(seed=seed)
        else:
            # Imported lazily so CPU-only setups never touch the CUDA sandbox.
            from theano.sandbox.cuda.rng_curand import CURAND_RandomStreams
            srng = CURAND_RandomStreams(seed=seed)

        noise = srng.normal(mu.shape)

        # Reparametrize
        scale = T.exp(0.5 * log_sigma) - 1
        return mu + scale * noise * 5e-1
Esempio n. 6
0
class DiscLayer(object):
    """
    Single-output linear layer whose output is squashed by a scaled tanh.

    The layer computes `20 * tanh((input . W + b) / 20)`, which keeps the
    output inside (-20, 20). `W` has shape (in_dim, 1) and `b` has shape
    (1,) when they are created here; either may instead be supplied by the
    caller.

    Parameters:
        rng: random generator providing `randint` and `normal` (used to seed
             the layer's own random stream and to draw initial weights)
        input: symbolic input to the layer
        in_dim: dimension of the input
        W: optional pre-built shared weights
        b: optional pre-built shared bias
        W_scale: multiplier applied to the randomly drawn initial weights
    """
    def __init__(self, rng, input, in_dim, W=None, b=None, W_scale=1.0):
        # Layer-private random stream, seeded from the caller's generator.
        self.rng = RandStream(rng.randint(1000000))

        self.input = input
        self.in_dim = in_dim

        # Create whichever parameters the caller did not provide.
        if W is None:
            raw_weights = rng.normal(size=(self.in_dim, 1))
            W_init = 1.0 * np.asarray(raw_weights,
                                      dtype=theano.config.floatX)
            W = theano.shared(value=(W_scale * W_init))
        if b is None:
            b = theano.shared(
                value=np.zeros((1, ), dtype=theano.config.floatX))

        self.W = W
        self.b = b

        # Linear response, softly clamped to (-20, 20) by the scaled tanh.
        pre_activation = T.dot(self.input, self.W) + self.b
        self.linear_output = 20.0 * T.tanh(pre_activation / 20.0)

        # No further non-linearity: the output is the clamped linear response.
        self.output = self.linear_output

        # Sum of squared outputs divided by the size of the first output
        # axis, for regularization.
        self.act_l2_sum = T.sum(self.output**2.0) / self.output.shape[0]

        # Parameters exposed to the optimizer.
        self.params = [self.W, self.b]

    def _noisy_params(self, P, noise_lvl=0.):
        """Return P plus zero-mean Gaussian noise with std `noise_lvl`."""
        noise = self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl,
                                dtype=theano.config.floatX)
        return P + noise
class DiscLayer(object):
    """
    Linear layer with one output unit and a scaled-tanh squashing.

    Output is `20 * tanh((input . W + b) / 20)`, i.e. bounded to (-20, 20).
    Weights of shape (in_dim, 1) and a bias of shape (1,) are created unless
    the caller passes existing shared variables.

    Parameters:
        rng: random generator providing `randint` and `normal`
        input: symbolic input to the layer
        in_dim: dimension of the input
        W: optional pre-built shared weights
        b: optional pre-built shared bias
        W_scale: multiplier applied to the randomly drawn initial weights
    """
    def __init__(self, rng, input, in_dim, W=None, b=None, W_scale=1.0):
        # Setup a shared random generator for this layer
        stream_seed = rng.randint(1000000)
        self.rng = RandStream(stream_seed)

        self.input = input
        self.in_dim = in_dim

        # Fall back to freshly initialized parameters where none were given.
        if W is None:
            weight_shape = (self.in_dim, 1)
            W_init = 1.0 * np.asarray(rng.normal(size=weight_shape),
                                      dtype=theano.config.floatX)
            W = theano.shared(value=(W_scale * W_init))
        if b is None:
            zero_bias = np.zeros((1,), dtype=theano.config.floatX)
            b = theano.shared(value=zero_bias)

        self.W = W
        self.b = b

        # Scaled tanh keeps the linear response inside (-20, 20).
        affine = T.dot(self.input, self.W) + self.b
        self.linear_output = 20.0 * T.tanh(affine / 20.0)

        # The layer applies no additional activation function.
        self.output = self.linear_output

        # Sum of squared outputs divided by the size of the first output
        # axis, for regularization.
        self.act_l2_sum = T.sum(self.output**2.0) / self.output.shape[0]

        # Conveniently package layer parameters
        self.params = [self.W, self.b]

    def _noisy_params(self, P, noise_lvl=0.):
        """Return P plus zero-mean Gaussian noise (std `noise_lvl`).

        The noise is wrapped in `DCG` before being added.
        NOTE(review): `DCG` is defined outside this block; presumably it
        blocks gradients through the noise -- confirm.
        """
        noise = self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl,
                                dtype=theano.config.floatX)
        return P + DCG(noise)
Esempio n. 8
0
class InfNet(object):
    """
    A net that tries to infer an approximate posterior for some observation,
    given some deep, directed generative model. The output of this network
    comprises two constructs: an approximate mean vector and an approximate
    standard deviation vector (i.e. diagonal matrix) for a Gaussian posterior.

    The network is made of a shared stack of layers followed by two heads
    ("mu" and "sigma") that both read the output of the shared stack. The
    sigma head produces a log-variance; the standard deviation is derived
    from it as exp(0.5 * logvar).

    Parameters:
        rng: a numpy.random RandomState object
        Xd: symbolic input matrix for inputting observable data
        Xc: symbolic input matrix for inputting control data
        Xm: symbolic input matrix for a mask on which values to take
                    from Xc and which to take from Xd
        prior_sigma: standard deviation of isotropic Gaussian prior that our
                     inferred posteriors will be penalized for deviating from.
        params: a dict of parameters describing the desired ensemble:
            lam_l2a: L2 regularization weight on neuron activations
            vis_drop: drop rate to use on observable variables
            hid_drop: drop rate to use on hidden layer activations
                -- note: vis_drop/hid_drop are optional, with defaults 0.0/0.0
            input_noise: standard dev for noise on the input of this net
                         (optional, default 0.0)
            bias_noise: standard dev for noise on the biases of hidden layers
                        (optional, default 0.0)
            shared_config: list of "layer descriptions" for shared part
            mu_config: list of "layer descriptions" for mu part
            sigma_config: list of "layer descriptions" for sigma part
            activation: "function handle" for the desired non-linearity
                        (optional, default relu_actfun)
            init_scale: scaling factor for hidden layer weights (__ * 0.01)
                        (optional, default 1.0)
        shared_param_dicts: parameters for the MLP controlled by this InfNet
    """
    def __init__(self,
            rng=None,
            Xd=None,
            Xc=None,
            Xm=None,
            prior_sigma=None,
            params=None,
            shared_param_dicts=None):
        # Setup a shared random generator for this network
        self.rng = RandStream(rng.randint(1000000))
        # Grab the symbolic input matrices
        self.Xd = Xd
        self.Xc = Xc
        self.Xm = Xm
        self.prior_sigma = prior_sigma
        #####################################################
        # Process user-supplied parameters for this network #
        #####################################################
        self.params = params
        self.lam_l2a = params['lam_l2a']
        self.vis_drop = params['vis_drop'] if 'vis_drop' in params else 0.0
        self.hid_drop = params['hid_drop'] if 'hid_drop' in params else 0.0
        self.input_noise = \
            params['input_noise'] if 'input_noise' in params else 0.0
        self.bias_noise = \
            params['bias_noise'] if 'bias_noise' in params else 0.0
        self.init_scale = \
            params['init_scale'] if 'init_scale' in params else 1.0
        # Check if the params for this net were given a priori. This option
        # will be used for creating "clones" of an inference network, with all
        # of the network parameters shared between clones.
        if shared_param_dicts is None:
            # This is not a clone, and we will need to make a dict for
            # referring to the parameters of each network layer
            self.shared_param_dicts = {'shared': [], 'mu': [], 'sigma': []}
            self.is_clone = False
        else:
            # This is a clone, and its layer parameters can be found by
            # referring to the given param dict (i.e. shared_param_dicts).
            self.shared_param_dicts = shared_param_dicts
            self.is_clone = True
        # Get the configuration/prototype for this network. The config is a
        # list of layer descriptions, including a description for the input
        # layer, which is typically just the dimension of the inputs. So, the
        # depth of the mlp is one less than the number of layer configs.
        self.shared_config = params['shared_config']
        self.mu_config = params['mu_config']
        self.sigma_config = params['sigma_config']
        if 'activation' in params:
            self.activation = params['activation']
        else:
            self.activation = relu_actfun
        #########################################
        # Initialize the shared part of network #
        #########################################
        # Construct input by combining data input and control input, taking
        # unmasked values from data input and others from the control input
        net_input = ((1.0 - self.Xm) * self.Xd) + (self.Xm * self.Xc)
        self.shared_layers = self._build_layers(
            rng, self.shared_config, 'shared', "share_layer_{0:d}",
            net_input, is_input_stack=True)
        #####################################
        # Initialize the mu part of network #
        #####################################
        # Take input from the output of the shared network
        self.mu_layers = self._build_layers(
            rng, self.mu_config, 'mu', "mu_layer_{0:d}",
            self.shared_layers[-1].output)
        ########################################
        # Initialize the sigma part of network #
        ########################################
        # Take input from the output of the shared network
        self.sigma_layers = self._build_layers(
            rng, self.sigma_config, 'sigma', "sigma_layer_{0:d}",
            self.shared_layers[-1].output)

        # Mash all the parameters together, into a list.
        self.mlp_params = []
        for layer in self._all_layers():
            self.mlp_params.extend(layer.params)

        # The output of this inference network is given by the noisy output
        # of the final layers of its mu and sigma networks.
        self.output_mu = self.mu_layers[-1].noisy_linear
        self.output_logvar = self.sigma_layers[-1].noisy_linear
        self.output_sigma = T.exp(0.5 * self.output_logvar)
        # We'll also construct an output containing a single sample from each
        # of the distributions represented by the rows of self.output_mu and
        # self.output_sigma.
        self.output = self._construct_post_samples()
        self.out_dim = self.sigma_layers[-1].out_dim
        # Get simple regularization penalty to moderate activation dynamics
        self.act_reg_cost = self.lam_l2a * self._act_reg_cost()
        # Construct a function for penalizing KL divergence between the
        # approximate posteriors produced by this model and some isotropic
        # Gaussian distribution.
        self.kld_cost = self._construct_kld_cost()
        # Construct a theano function for sampling from the approximate
        # posteriors inferred by this model for some collection of points
        # in the "data space".
        self.sample_posterior = self._construct_sample_posterior()
        self.mean_posterior = theano.function(
            [self.Xd, self.Xc, self.Xm], outputs=self.output_mu)

    def _build_layers(self, rng, config, group, name_fmt, next_input,
                      is_input_stack=False):
        """
        Build one stack of HiddenLayer objects and return it as a list.

        Parameters:
            rng: generator handed to every HiddenLayer
            config: list of layer descriptions; consecutive entries form the
                    (input, output) description of each layer. An entry is
                    either a plain dimension or a list/tuple whose first item
                    is the dimension and (for outputs) whose second item is
                    the pool size.
            group: key into self.shared_param_dicts ('shared', 'mu', 'sigma')
            name_fmt: format string producing the layer name from its index
            next_input: symbolic input fed to the first layer of the stack
            is_input_stack: if True, the first layer of the stack gets the
                    visible drop rate and input noise instead of the hidden
                    drop rate and bias noise
        """
        layers = []
        # Iterate over the zip directly: no len() is needed, which keeps this
        # working on Python 3 where zip() returns an iterator.
        layer_def_pairs = zip(config[:-1], config[1:])
        for layer_num, (in_def, out_def) in enumerate(layer_def_pairs):
            if isinstance(in_def, (list, tuple)):
                # Receiving input from a poolish layer...
                in_dim = in_def[0]
            else:
                # Receiving input from a normal layer...
                in_dim = in_def
            if isinstance(out_def, (list, tuple)):
                # Applying some sort of pooling in this layer...
                out_dim = out_def[0]
                pool_size = out_def[1]
            else:
                # Not applying any pooling in this layer...
                out_dim = out_def
                pool_size = 0
            # Select the appropriate noise to add to this layer
            if is_input_stack and (layer_num == 0):
                d_rate = self.vis_drop
                i_noise = self.input_noise
                b_noise = 0.0
            else:
                d_rate = self.hid_drop
                i_noise = 0.0
                b_noise = self.bias_noise
            layer_kwargs = dict(
                rng=rng, input=next_input,
                activation=self.activation, pool_size=pool_size,
                drop_rate=d_rate, input_noise=i_noise, bias_noise=b_noise,
                in_dim=in_dim, out_dim=out_dim,
                name=name_fmt.format(layer_num), W_scale=self.init_scale)
            if self.is_clone:
                # Initialize the layer with the already-existing parameters.
                # W/b are only passed in this case, so a fresh layer still
                # relies on HiddenLayer's own defaults.
                init_params = self.shared_param_dicts[group][layer_num]
                layer_kwargs['W'] = init_params['W']
                layer_kwargs['b'] = init_params['b']
            new_layer = HiddenLayer(**layer_kwargs)
            layers.append(new_layer)
            if not self.is_clone:
                # Record the new parameters so that clones can share them.
                self.shared_param_dicts[group].append(
                    {'W': new_layer.W, 'b': new_layer.b})
            next_input = new_layer.output
        return layers

    def _all_layers(self):
        """Return every layer: shared stack, then mu head, then sigma head."""
        return self.shared_layers + self.mu_layers + self.sigma_layers

    def _act_reg_cost(self):
        """
        Apply L2 regularization to the activations in each net.
        """
        act_sq_sums = [layer.act_l2_sum for layer in self._all_layers()]
        full_act_sq_sum = T.sum(act_sq_sums)
        return full_act_sq_sum

    def _construct_post_samples(self):
        """
        Draw a single sample from each of the approximate posteriors encoded
        in self.output_mu and self.output_sigma.
        """
        unit_noise = self.rng.normal(
            size=self.output_sigma.shape, avg=0.0, std=1.0,
            dtype=theano.config.floatX)
        post_samples = self.output_mu + (self.output_sigma * unit_noise)
        return post_samples

    def _construct_kld_cost(self):
        """
        Compute (analytically) the KL divergence between each approximate
        posterior encoded by self.output_mu/self.output_logvar and the
        isotropic Gaussian distribution with mean 0 and standard deviation
        self.prior_sigma. The sum runs over axis 1 with keepdims=True.
        """
        prior_sigma_sq = self.prior_sigma**2.0
        prior_log_sigma_sq = np.log(prior_sigma_sq)
        kld_cost = 0.5 * T.sum(
            ((self.output_mu**2.0 / prior_sigma_sq) +
             (T.exp(self.output_logvar) / prior_sigma_sq) -
             (self.output_logvar - prior_log_sigma_sq) - 1.0),
            axis=1, keepdims=True)
        return kld_cost

    def _construct_sample_posterior(self):
        """
        Construct a sampler that draws a single sample from the inferred
        posterior for some set of inputs.
        """
        psample = theano.function(
            [self.Xd, self.Xc, self.Xm], outputs=self.output)
        return psample

    def init_biases(self, b_init=0.0):
        """
        Initialize the biases in all hidden layers to some constant.

        The final layer of the mu head and of the sigma head are left
        untouched.
        """
        hidden_layers = (self.shared_layers + self.mu_layers[:-1] +
                         self.sigma_layers[:-1])
        for layer in hidden_layers:
            # Zero a copy of the current biases to keep shape and dtype.
            b_vec = (0.0 * layer.b.get_value(borrow=False)) + b_init
            layer.b.set_value(b_vec)

    def shared_param_clone(self, rng=None, Xd=None, Xc=None, Xm=None):
        """
        Return a clone of this network, with shared parameters but with
        different symbolic input variables.

        This can be used for "unrolling" a generate->infer->generate->infer...
        loop. Then, we can do backprop through time for various objectives.
        """
        clone_net = InfNet(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm,
                           prior_sigma=self.prior_sigma, params=self.params,
                           shared_param_dicts=self.shared_param_dicts)
        return clone_net
class GenFCModule(object):
    """
    Generator module mapping random values through one fully connected
    layer (with relu) and then a linear transform, each optionally batch
    normalized, with an optional final relu.

    Parameters:
        rand_dim: dimension of the random input
        out_dim: dimension of the module output
        fc_dim: dimension of the fully connected hidden layer
        apply_bn_1: whether to batch normalize the hidden layer
        apply_bn_2: whether to batch normalize the output transform
        init_func: weight initializer; defaults to inits.Normal(scale=0.02)
        rand_type: 'normal' for Gaussian noise, anything else for uniform
                   noise on [-1, 1]
        final_relu: whether to apply relu to the output
        mod_name: prefix used when naming the parameters
    """
    def __init__(self, rand_dim, out_dim, fc_dim,
                 apply_bn_1=True, apply_bn_2=True,
                 init_func=None, rand_type='normal',
                 final_relu=True, mod_name='dm_fc'):
        self.rand_dim = rand_dim
        self.out_dim = out_dim
        self.fc_dim = fc_dim
        self.apply_bn_1 = apply_bn_1
        self.apply_bn_2 = apply_bn_2
        self.mod_name = mod_name
        self.rand_type = rand_type
        self.final_relu = final_relu
        # Random stream used when apply() has to draw its own noise.
        self.rng = RandStream(123)
        self.init_func = \
            inits.Normal(scale=0.02) if init_func is None else init_func
        # Create the weights (and batch-norm gains/biases, if enabled).
        self._init_params()

    def _bn_gain_and_bias(self, dim, idx):
        """Create the batch-norm gain and bias for transform number `idx`."""
        gain_ifn = inits.Normal(loc=1., scale=0.02)
        bias_ifn = inits.Constant(c=0.)
        gain = gain_ifn(dim, "{}_g{}".format(self.mod_name, idx))
        bias = bias_ifn(dim, "{}_b{}".format(self.mod_name, idx))
        return gain, bias

    def _init_params(self):
        """
        Initialize parameters for the layers in this generator module.
        """
        prefix = self.mod_name
        self.w1 = self.init_func((self.rand_dim, self.fc_dim),
                                 "{}_w1".format(prefix))
        self.w2 = self.init_func((self.fc_dim, self.out_dim),
                                 "{}_w2".format(prefix))
        self.params = [self.w1, self.w2]
        # Gains and biases exist only for transforms that get batch normed.
        if self.apply_bn_1:
            self.g1, self.b1 = self._bn_gain_and_bias(self.fc_dim, 1)
            self.params.extend([self.g1, self.b1])
        if self.apply_bn_2:
            self.g2, self.b2 = self._bn_gain_and_bias(self.out_dim, 2)
            self.params.extend([self.g2, self.b2])

    def apply(self, batch_size=None, rand_vals=None):
        """
        Apply this generator module. Pass _either_ batch_size or rand_vals.

        When rand_vals is omitted, noise of shape (batch_size, rand_dim) is
        drawn according to self.rand_type. Given rand_vals are reshaped to
        (rand_vals.shape[0], rand_dim) before use.
        """
        assert (batch_size is not None) or (rand_vals is not None), "need either batch_size or rand_vals"
        if rand_vals is not None:
            rand_shape = (rand_vals.shape[0], self.rand_dim)
        else:
            rand_shape = (batch_size, self.rand_dim)
            if self.rand_type == 'normal':
                rand_vals = self.rng.normal(
                    size=rand_shape, avg=0.0, std=1.0,
                    dtype=theano.config.floatX)
            else:
                rand_vals = self.rng.uniform(
                    size=rand_shape, low=-1.0, high=1.0,
                    dtype=theano.config.floatX)
        rand_vals = rand_vals.reshape(rand_shape)
        # Random values -> fully connected layer.
        hidden = T.dot(rand_vals, self.w1)
        if self.apply_bn_1:
            hidden = batchnorm(hidden, g=self.g1, b=self.b1)
        hidden = relu(hidden)
        # Fully connected layer -> output.
        out = T.dot(hidden, self.w2)
        if self.apply_bn_2:
            out = batchnorm(out, g=self.g2, b=self.b2)
        if self.final_relu:
            out = relu(out)
        return out
class WalkoutModel(object):
    """
    Controller for training a forwards-backwards chainy model.

    Parameters:
        rng: numpy.random.RandomState (for reproducibility)
        x_out: the goal state for forwards-backwards walking process
        p_z_given_x: InfNet for stochastic part of step
        p_x_given_z: HydraNet for deterministic part of step
        params: REQUIRED PARAMS SHOWN BELOW
                x_dim: dimension of observations to construct
                z_dim: dimension of latent space for policy wobble
                walkout_steps: number of steps to walk out
                x_type: can be "bernoulli" or "gaussian"
                x_transform: can be 'none' or 'sigmoid'
    """
    def __init__(self, rng=None,
            x_out=None, \
            p_z_given_x=None, \
            p_x_given_z=None, \
            params=None, \
            shared_param_dicts=None):
        """
        Build the symbolic walk-out graph, the costs, the parameter updates
        and the compiled theano functions for this model.

        rng: object providing randint(); used to seed this model's RandStream
        x_out: symbolic variable used as the initial state x0 of the walk
        p_z_given_x: network whose apply(x, do_samples=False) returns a
                     (mean, logvar) pair for z
        p_x_given_z: network whose apply(z, do_samples=False) returns a
                     (mean, logvar) pair for x
        params: dict with keys 'x_dim', 'z_dim', 'walkout_steps', 'x_type'
                and, optionally, 'x_transform'
        shared_param_dicts: optional dict holding a shared 'obs_logvar'
                            variable to reuse instead of creating a new one

        NOTE(review): several attributes read below (self.step_type,
        self.xi_zmuv, self.s0, self.step_scales, self.p_zi_given_xi,
        self.p_sip1_given_zi, self.p_x_given_si, self.q_zi_given_xi,
        self.nlli and the self.kldi_* values used by _construct_kld_costs)
        are not assigned anywhere in this class, so construction looks like
        it cannot complete as written -- confirm against the intended design.
        """
        # setup a rng for this WalkoutModel
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_dim = self.params['x_dim']
        self.z_dim = self.params['z_dim']
        self.walkout_steps = self.params['walkout_steps']
        self.x_type = self.params['x_type']
        self.shared_param_dicts = shared_param_dicts
        # choose the transform applied to predicted x means: 'sigmoid' or
        # 'none' when given explicitly, sigmoid when the key is absent
        if 'x_transform' in self.params:
            assert((self.params['x_transform'] == 'sigmoid') or \
                    (self.params['x_transform'] == 'none'))
            if self.params['x_transform'] == 'sigmoid':
                self.x_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.x_transform = lambda x: x
        else:
            self.x_transform = lambda x: T.nnet.sigmoid(x)
        # bernoulli observations always use the sigmoid transform, which
        # overrides any 'x_transform' setting chosen above
        if self.x_type == 'bernoulli':
            self.x_transform = lambda x: T.nnet.sigmoid(x)
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        # NOTE(review): self.step_type is never assigned in this class, so
        # this assert looks like it raises AttributeError -- confirm.
        assert((self.step_type == 'add') or (self.step_type == 'jump'))

        # grab handles to the relevant networks
        self.p_z_given_x = p_z_given_x
        self.p_x_given_z = p_x_given_z

        # record the symbolic variables that will provide inputs to the
        # computation graph created for this WalkoutModel
        self.x_out = x_out           # target output for generation
        self.zi_zmuv = T.tensor3()   # ZMUV gauss noise for walk-out wobble

        if self.shared_param_dicts is None:
            # initialize the parameters "owned" by this model
            zero_ary = to_fX( np.zeros((1,)) )
            self.obs_logvar = theano.shared(value=zero_ary, name='obs_logvar')
            # tanh squashing keeps the effective log-variance inside (-8, 8)
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar[0])
            self.shared_param_dicts = {}
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            # grab the parameters required by this model from a given dict
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar[0])

        ###############################################################
        # Setup the forwards (i.e. training) walk-out loop using scan #
        ###############################################################
        # scan passes the per-step slices of the sequences first (xi_zmuv,
        # zi_zmuv), followed by the recurrent outputs of the previous step
        # (xi_fw, zi_fw)
        def forwards_loop(xi_zmuv, zi_zmuv, xi_fw, zi_fw):
            # get samples of next zi, according to the forwards model
            zi_fw_mean, zi_fw_logvar = self.p_z_given_x.apply(xi_fw, \
                                       do_samples=False)
            zi_fw = zi_fw_mean + (T.exp(0.5 * zi_fw_logvar) * zi_zmuv)

            # check reverse direction probability p(xi_fw | zi_fw)
            # NOTE(review): _construct_nll_costs negates the result of
            # log_prob_gaussian2 to obtain an NLL; here (and for nll_zi_bw
            # below) the value is stored under an nll_* name without
            # negation -- confirm the intended sign.
            xi_bw_mean, xi_bw_logvar = self.p_x_given_z.apply(zi_fw, \
                                       do_samples=False)
            xi_bw_mean = self.x_transform(xi_bw_mean)
            nll_xi_bw = log_prob_gaussian2(xi_fw, xi_bw_mean, \
                        log_vars=xi_bw_logvar, mask=None)
            nll_xi_bw = nll_xi_bw.flatten()

            # get samples of next xi, according to the forwards model
            # (this repeats the p_x_given_z.apply(zi_fw) call made just above)
            xi_fw_mean, xi_fw_logvar = self.p_x_given_z.apply(zi_fw, \
                                       do_samples=False)
            xi_fw_mean = self.x_transform(xi_fw_mean)
            xi_fw = xi_fw_mean + (T.exp(0.5 * xi_fw_logvar) * xi_zmuv)

            # check reverse direction probability p(zi_fw | xi_fw)
            zi_bw_mean, zi_bw_logvar = self.p_z_given_x.apply(xi_fw, \
                                       do_samples=False)
            nll_zi_bw = log_prob_gaussian2(zi_fw, zi_bw_mean, \
                        log_vars=zi_bw_logvar, mask=None)
            nll_zi_bw = nll_zi_bw.flatten()

            # each loop iteration produces the following values:
            #   xi_fw: xi generated from zi by forwards walk
            #   zi_fw: zi generated from xi by forwards walk
            #   xi_fw_mean: ----
            #   xi_fw_logvar: ----
            #   zi_fw_mean: ----
            #   zi_fw_logvar: ----
            #   nll_xi_bw: NLL for reverse step zi_fw -> xi_fw
            #   nll_zi_bw: NLL for reverse step xi_fw -> zi_fw
            return xi_fw, zi_fw, xi_fw_mean, xi_fw_logvar, zi_fw_mean, zi_fw_logvar, nll_xi_bw, nll_zi_bw

        # initialize states for x/z
        self.x0 = self.x_out
        self.z0 = T.alloc(0.0, self.x0.shape[0], self.z_dim)
        # setup initial values to pass to scan op; only the first two
        # outputs (xi_fw, zi_fw) are recurrent, the None entries are not
        # fed back into the loop
        outputs_init = [self.x0, self.z0, None, None, None, None, None, None]
        # NOTE(review): only self.zi_zmuv is created above; self.xi_zmuv is
        # not assigned in this class -- confirm where it should come from.
        sequences_init = [self.xi_zmuv, self.zi_zmuv]
        # apply scan op for the sequential walk-out loop
        self.scan_results, self.scan_updates = theano.scan(forwards_loop, \
                    outputs_info=outputs_init, \
                    sequences=sequences_init)

        # grab results of the scan op. all values are computed for each step
        self.xi = self.scan_results[0]
        self.zi = self.scan_results[1]
        self.xi_fw_mean = self.scan_results[2]
        self.xi_fw_logvar = self.scan_results[3]
        self.zi_fw_mean = self.scan_results[4]
        self.zi_fw_logvar = self.scan_results[5]
        self.nll_xi_bw = self.scan_results[6]
        self.nll_zi_bw = self.scan_results[7]

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX( np.zeros((1,)) )
        self.lr = theano.shared(value=zero_ary, name='srr_lr')
        # shared var momentum parameters for ADAM optimization
        self.mom_1 = theano.shared(value=zero_ary, name='srr_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='srr_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared vars for weighting prior kld against reconstruction
        self.lam_kld_p = theano.shared(value=zero_ary, name='srr_lam_kld_p')
        self.lam_kld_q = theano.shared(value=zero_ary, name='srr_lam_kld_q')
        self.lam_kld_g = theano.shared(value=zero_ary, name='srr_lam_kld_g')
        self.lam_kld_s = theano.shared(value=zero_ary, name='srr_lam_kld_s')
        self.set_lam_kld(lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0, lam_kld_s=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='srr_lam_l2w')
        self.set_lam_l2w(1e-5)

        # grab all of the "optimizable" parameters from the base networks
        # NOTE(review): self.s0, self.step_scales and the four networks used
        # below are not assigned in this class (only self.p_z_given_x and
        # self.p_x_given_z are) -- confirm which parameters belong here.
        self.joint_params = [self.s0, self.obs_logvar, self.step_scales]
        self.joint_params.extend(self.p_zi_given_xi.mlp_params)
        self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
        self.joint_params.extend(self.p_x_given_si.mlp_params)
        self.joint_params.extend(self.q_zi_given_xi.mlp_params)

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_p, self.kld_q, self.kld_g, self.kld_s = self._construct_kld_costs(p=1.0)
        self.kld_costs = (self.lam_kld_p[0] * self.kld_p) + \
                         (self.lam_kld_q[0] * self.kld_q) + \
                         (self.lam_kld_g[0] * self.kld_g) + \
                         (self.lam_kld_s[0] * self.kld_s)
        self.kld_cost = T.mean(self.kld_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        # NOTE(review): self.nlli is not assigned in this class; the scan
        # above produces self.nll_xi_bw / self.nll_zi_bw instead -- confirm.
        self.nll_costs = T.sum(self.nlli, axis=0) # sum the per-step NLLs
        self.nll_cost = T.mean(self.nll_costs)
        self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
        self.nll_bound = T.mean(self.nll_bounds)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-TRIAL COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the generator and inferencer networks
        self.joint_updates = get_adam_updates(params=self.joint_params, \
                grads=self.joint_grads, alpha=self.lr, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        # fold the updates returned by theano.scan into the training updates
        for k, v in self.scan_updates.items():
            self.joint_updates[k] = v

        # Construct theano functions for training and diagnostic computations
        print("Compiling cost computer...")
        self.compute_raw_costs = self._construct_raw_costs()
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling sequence sampler...")
        self.sequence_sampler = self._construct_sequence_sampler()
        # make easy access points for some interesting parameters
        #self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W
        return

    def set_sgd_params(self, lr=0.01, mom_1=0.9, mom_2=0.999):
        """
        Update the shared optimizer hyperparameters.

        Each value is written into its shared variable (self.lr, self.mom_1,
        self.mom_2) as a length-1 array passed through to_fX.
        """
        def _as_shared_value(scalar):
            # shared hyperparameters are stored as arrays of shape (1,)
            return to_fX(np.zeros((1,)) + scalar)

        # learning rate
        self.lr.set_value(_as_shared_value(lr))
        # first and second order "momentum" terms
        self.mom_1.set_value(_as_shared_value(mom_1))
        self.mom_2.set_value(_as_shared_value(mom_2))
        return

    def set_lam_kld(self, lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0, lam_kld_s=0.0):
        """
        Store the four KL-divergence cost weights in their shared variables.

        The weights scale self.kld_p, self.kld_q, self.kld_g and self.kld_s
        respectively when the combined KLD cost is formed.
        """
        def _as_shared_value(weight):
            # weights are stored as arrays of shape (1,)
            return to_fX(np.zeros((1,)) + weight)

        self.lam_kld_p.set_value(_as_shared_value(lam_kld_p))
        self.lam_kld_q.set_value(_as_shared_value(lam_kld_q))
        self.lam_kld_g.set_value(_as_shared_value(lam_kld_g))
        self.lam_kld_s.set_value(_as_shared_value(lam_kld_s))
        return

    def set_lam_l2w(self, lam_l2w=1e-3):
        """
        Store the weight of the l2 penalty on the model parameters.
        """
        # the weight lives in a shared array of shape (1,)
        weight_ary = np.zeros((1,)) + lam_l2w
        self.lam_l2w.set_value(to_fX(weight_ary))
        return

    def set_train_switch(self, switch_val=0.0):
        """
        Flip the shared switch between training and sampling behavior.

        switch_val is snapped to 0.0 when it is below 0.5 and to 1.0
        otherwise before being stored in self.train_switch.
        """
        snapped = 0.0 if (switch_val < 0.5) else 1.0
        # the switch lives in a shared array of shape (1,)
        self.train_switch.set_value(to_fX(np.zeros((1,)) + snapped))
        return

    def _construct_zi_zmuv(self, xo):
        """
        Draw the zero-mean, unit-variance gaussian noise used to generate
        trajectories for the input matrix xo.

        The noise has shape (self.total_steps, xo.shape[0], self.z_dim).
        """
        noise_shape = (self.total_steps, xo.shape[0], self.z_dim)
        return self.rng.normal(size=noise_shape, avg=0.0, std=1.0,
                               dtype=theano.config.floatX)

    def _construct_rev_masks(self, xo):
        """
        Build the sequential revelation masks for the input batch xo.

        Returns [pmasks, qmasks]: the mask sequence for the primary policy
        (p) and the mask sequence for the guide policy (q).
        """
        if self.use_rev_masks:
            # tile the fixed masks in self.rev_masks_p / self.rev_masks_q
            # along a new batch axis (axis 1)
            pmasks = self.rev_masks_p.dimshuffle(0,'x',1).repeat(xo.shape[0], axis=1)
            qmasks = self.rev_masks_q.dimshuffle(0,'x',1).repeat(xo.shape[0], axis=1)
            return [pmasks, qmasks]

        # otherwise sample an independent mask for each revelation block
        p_steps = []
        q_steps = []
        # an all-zero mask, i.e. one that reveals nothing
        hidden_mask = T.alloc(0.0, 1, xo.shape[0], xo.shape[1])
        for block in self.rev_sched:
            # block[0] is the number of steps in the block and block[1] is
            # the rate at which the random binary mask contains ones
            uniform_vals = self.rng.uniform(
                    size=(1, xo.shape[0], xo.shape[1]),
                    low=0.0, high=1.0, dtype=theano.config.floatX)
            reveal_mask = uniform_vals < block[1]
            # The guide policy (q) sees the block's revealed values from
            # the first step of the block onward, so it can steer every
            # step. The primary policy (p) only sees them at the final step
            # of the block; within a step, values are revealed to q at the
            # beginning and to p at the end.
            #
            # With a single block of one step and a reveal rate of 1.0 this
            # reduces to a standard variational auto-encoder.
            lead_steps = block[0] - 1
            p_steps.extend([hidden_mask] * lead_steps)
            q_steps.extend([reveal_mask] * lead_steps)
            p_steps.append(reveal_mask)
            q_steps.append(reveal_mask)
        # stack the per-step masks along axis 0 to get one 3-tensor each
        pmasks = T.cast(T.concatenate(p_steps, axis=0), 'floatX')
        qmasks = T.cast(T.concatenate(q_steps, axis=0), 'floatX')
        return [pmasks, qmasks]

    def _construct_nll_costs(self, si, xo, nll_mask):
        """
        Build the negative log-likelihood of xo under the state si.

        nll_mask is forwarded to the log-probability helper as its mask
        argument. The bernoulli helper is used when self.x_type is
        'bernoulli'; otherwise the gaussian helper is used with
        self.bounded_logvar as the log-variance.
        """
        x_hat = self._from_si_to_x(si)
        if self.x_type == 'bernoulli':
            log_probs = log_prob_bernoulli(xo, x_hat, mask=nll_mask)
        else:
            log_probs = log_prob_gaussian2(xo, x_hat,
                    log_vars=self.bounded_logvar, mask=nll_mask)
        # negate the flattened log-likelihoods to get costs
        return -log_probs.flatten()

    def _construct_kld_s(self, s_i, s_j):
        """
        Compute KL(s_i || s_j), treating the x-space images of both states
        as bernoulli means. The divergence is summed over axis 1.
        """
        p_on = self._from_si_to_x(s_i)
        q_on = self._from_si_to_x(s_j)
        # contribution of the "on" and "off" outcomes of each bernoulli
        on_term = p_on * (T.log(p_on)  - T.log(q_on))
        off_term = (1.0 - p_on) * (T.log(1.0-p_on) - T.log(1.0-q_on))
        return T.sum(on_term + off_term, axis=1)

    def _construct_kld_costs(self, p=1.0):
        """
        Build the KL-divergence parts of the cost to minimize.

        For every step the per-dimension KLds in self.kldi_p2q,
        self.kldi_q2p and self.kldi_p2g are raised to the power p and summed
        over axis 1; a fourth term measures KL(si[i] || si[i-1]), with the
        initial state standing in for the predecessor of the first step.

        Returns [kld_p, kld_q, kld_g, kld_s], each summed over all steps.
        """
        per_step_p = []
        per_step_q = []
        per_step_g = []
        per_step_s = []
        # 0.0*self.si[0] gives self.s0 the same shape as the scanned states
        prev_state = 0.0*self.si[0] + self.s0
        for i in range(self.total_steps):
            per_step_p.append(T.sum(self.kldi_p2q[i]**p, axis=1))
            per_step_q.append(T.sum(self.kldi_q2p[i]**p, axis=1))
            per_step_g.append(T.sum(self.kldi_p2g[i]**p, axis=1))
            per_step_s.append(self._construct_kld_s(self.si[i], prev_state))
            prev_state = self.si[i]
        # add up the per-step terms to get the batch-wise costs
        return [sum(per_step_p), sum(per_step_q),
                sum(per_step_g), sum(per_step_s)]

    def _construct_reg_costs(self):
        """
        Build the basic l2 regularization cost: the sum of squared entries
        over every parameter in self.joint_params.
        """
        param_reg_cost = 0
        for param in self.joint_params:
            param_reg_cost = param_reg_cost + T.sum(param**2.0)
        return param_reg_cost

    def _construct_compute_fe_terms(self):
        """
        Construct a function for computing terms in variational free energy.

        Returns fe_term_estimator(XO, sample_count=20, use_guide_policy=True),
        which averages sample_count one-sample estimates and returns
        [mean_nll, mean_kld], one entry per row of XO. The train switch is
        set to 1.0 (guide policy) or 0.0 (primary policy) while sampling and
        is always restored afterwards, even if sampling raises.
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # construct values to output
        nll = self.nll_costs.flatten()
        kld = self.kld_q.flatten()
        # compile theano function for a one-sample free-energy estimate
        fe_term_sample = theano.function(inputs=[ xo ],
                outputs=[nll, kld],
                givens={self.x_out: xo,
                        self.zi_zmuv: zizmuv,
                        self.p_masks: pmasks,
                        self.q_masks: qmasks},
                updates=self.scan_updates,
                on_unused_input='ignore')

        # construct a wrapper function for multi-sample free-energy estimate
        def fe_term_estimator(XO, sample_count=20, use_guide_policy=True):
            # NOTE(review): unlike the other compiled wrappers in this class,
            # XO is passed through without to_fX(...) -- confirm that callers
            # already provide floatX data.
            old_switch = self.train_switch.get_value(borrow=False)
            # 1.0 samples from the guide policy, 0.0 from the primary policy
            self.set_train_switch(switch_val=(1.0 if use_guide_policy else 0.0))
            nll_sum = np.zeros((XO.shape[0],))
            kld_sum = np.zeros((XO.shape[0],))
            try:
                # accumulate a multi-sample estimate of the free-energy terms
                for _ in range(sample_count):
                    sample_nll, sample_kld = fe_term_sample(XO)
                    nll_sum += sample_nll.ravel()
                    kld_sum += sample_kld.ravel()
            finally:
                # put the model back in its previous mode no matter what
                self.set_train_switch(switch_val=old_switch)
            mean_nll = nll_sum / float(sample_count)
            mean_kld = kld_sum / float(sample_count)
            if not use_guide_policy:
                # no KLd if samples are from the primary policy...
                mean_kld = 0.0 * mean_kld
            return [mean_nll, mean_kld]
        return fe_term_estimator

    def _construct_raw_costs(self):
        """
        Construct a function computing the raw costs, i.e. the costs before
        any lambda weighting is applied.

        Returns raw_cost_computer(XO), which evaluates the per-step costs
        and returns [_step_nlls, _step_klds, _kld_q2p, _kld_p2q, _kld_p2g].
        """
        # symbolic inputs and the noise/masks derived from them
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # compile theano function for computing the costs
        step_cost_vars = [self.nlli, self.kldi_q2p, self.kldi_p2q, self.kldi_p2g]
        cost_func = theano.function(inputs=[ xo ],
                    outputs=step_cost_vars,
                    givens={self.x_out: xo,
                            self.zi_zmuv: zizmuv,
                            self.p_masks: pmasks,
                            self.q_masks: qmasks},
                    updates=self.scan_updates,
                    on_unused_input='ignore')

        def _per_dim_kld(step_klds):
            # mean over axis 1, then sum over axis 0 (presumably the batch
            # and step axes respectively -- confirm), leaving one value per
            # latent dimension
            return np.sum(np.mean(step_klds, axis=1, keepdims=True), axis=0)

        # batch-based estimates of the costs:
        #   _step_nlls: the expected NLL cost for each step
        #   _step_klds: the expected KL(q||p) cost for each step
        #   _kld_q2p: the expected KL(q||p) cost for each latent dim
        #   _kld_p2q: the expected KL(p||q) cost for each latent dim
        #   _kld_p2g: the expected KL(p||N(0,I)) cost for each latent dim
        def raw_cost_computer(XO):
            nlli, kld_q2p, kld_p2q, kld_p2g = cost_func(to_fX(XO))
            step_klds = np.mean(np.sum(kld_q2p, axis=2, keepdims=True), axis=1)
            step_nlls = np.mean(nlli, axis=1)
            _step_klds = to_fX( np.asarray(list(step_klds)) )
            _step_nlls = to_fX( np.asarray(list(step_nlls)) )
            return [_step_nlls, _step_klds, _per_dim_kld(kld_q2p),
                    _per_dim_kld(kld_p2q), _per_dim_kld(kld_p2g)]
        return raw_cost_computer

    def _construct_train_joint(self):
        """
        Compile the theano function that trains all networks jointly.

        The compiled function takes one input matrix, applies
        self.joint_updates and returns [joint_cost, nll_bound, nll_cost,
        kld_cost, reg_cost, obs_costs].
        """
        # symbolic input plus the noise and masks derived from it
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # swap the model's placeholder inputs for values derived from xo
        replacements = {self.x_out: xo,
                        self.zi_zmuv: zizmuv,
                        self.p_masks: pmasks,
                        self.q_masks: qmasks}
        # values reported back after each training step
        reported = [self.joint_cost, self.nll_bound, self.nll_cost,
                    self.kld_cost, self.reg_cost, self.obs_costs]
        return theano.function(inputs=[ xo ],
                outputs=reported,
                givens=replacements,
                updates=self.joint_updates,
                on_unused_input='ignore')

    def _construct_sequence_sampler(self):
        """
        Construct a function that samples state/mask trajectories.

        Returns sample_func(XO, use_guide_policy=False), which runs the
        model on XO and returns [xm_seq, xi_seq, mi_seq], each of shape
        (self.total_steps + 1, XO.shape[0], XO.shape[1]):
            xi_seq: the generated x at every step (initial state first)
            mi_seq: the mask in effect at every step
            xm_seq: XO where the mask is 1, the generated x elsewhere

        The compiled function only applies self.scan_updates, matching the
        other diagnostic functions in this class, so drawing samples does
        not run the optimizer updates in self.joint_updates.
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # collect the outputs to return from this function: first all the
        # states (initial state, then one per step), then all the masks
        states = [self._from_si_to_x(self.s0_full)] + \
                 [self._from_si_to_x(self.si[i]) for i in range(self.total_steps)]
        masks = [self.m0_full] + [self.mi_p[i] for i in range(self.total_steps)]
        outputs = states + masks
        # compile the theano function
        func = theano.function(inputs=[ xo ],
                outputs=outputs,
                givens={self.x_out: xo,
                        self.zi_zmuv: zizmuv,
                        self.p_masks: pmasks,
                        self.q_masks: qmasks},
                updates=self.scan_updates,
                on_unused_input='ignore')

        # visualize trajectories generated by the model
        def sample_func(XO, use_guide_policy=False):
            old_switch = self.train_switch.get_value(borrow=False)
            # 1.0 samples from the guide policy, 0.0 from the primary policy
            self.set_train_switch(switch_val=(1.0 if use_guide_policy else 0.0))
            try:
                # get belief states and masks generated by the scan loop
                scan_vals = func(to_fX(XO))
            finally:
                # put the model back in its previous mode no matter what
                self.set_train_switch(switch_val=old_switch)
            step_count = self.total_steps + 1
            seq_shape = (step_count, XO.shape[0], XO.shape[1])
            xm_seq = np.zeros(seq_shape).astype(theano.config.floatX)
            xi_seq = np.zeros(seq_shape).astype(theano.config.floatX)
            mi_seq = np.zeros(seq_shape).astype(theano.config.floatX)
            for i in range(step_count):
                # outputs are laid out as step_count states, then step_count masks
                _xi = scan_vals[i]
                _mi = scan_vals[i + step_count]
                xm_seq[i,:,:] = (_mi * XO) + ((1.0 - _mi) * _xi)
                xi_seq[i,:,:] = _xi
                mi_seq[i,:,:] = _mi
            return [xm_seq, xi_seq, mi_seq]
        return sample_func

    def save_to_file(self, f_name=None):
        """
        Dump important stuff to a Python pickle, so that we can reload this
        model later.

        Three objects are pickled to f_name, in this order:
            1. self.params (a dict of "simple" python values)
            2. a dict with the numpy value of each shared variable in
               self.shared_param_dicts
            3. a dict with the save_to_dict() result of each child model

        f_name: path of the file to write; must not be None.

        NOTE(review): the child networks saved here (p_zi_given_xi,
        p_sip1_given_zi, p_x_given_si, q_zi_given_xi) are not assigned in
        this class's __init__ -- confirm which networks should be saved.
        """
        assert(not (f_name is None))
        # collect everything before opening the file, so that a failure
        # while gathering values cannot leave a truncated pickle behind
        numpy_param_dicts = {}
        for key, shared_var in self.shared_param_dicts.items():
            # store plain numpy copies in place of the theano shared variables
            numpy_param_dicts[key] = shared_var.get_value(borrow=False)
        child_model_dicts = {
            'p_zi_given_xi': self.p_zi_given_xi.save_to_dict(),
            'p_sip1_given_zi': self.p_sip1_given_zi.save_to_dict(),
            'p_x_given_si': self.p_x_given_si.save_to_dict(),
            'q_zi_given_xi': self.q_zi_given_xi.save_to_dict(),
        }
        # open() replaces the Python-2-only file() builtin, and the with
        # block closes the handle even when a dump fails
        with open(f_name, 'wb') as f_handle:
            cPickle.dump(self.params, f_handle, protocol=-1)
            cPickle.dump(numpy_param_dicts, f_handle, protocol=-1)
            cPickle.dump(child_model_dicts, f_handle, protocol=-1)
        return
Esempio n. 11
0
class HiddenLayer(object):
    """
    Fully-connected layer with optional gaussian input noise, input
    dropout, gaussian pre-activation noise and maxout pooling.

    Parameters:
        rng: object providing randint(); used to seed this layer's RandStream
        input: symbolic input to the layer
        in_dim: input dimension (number of rows of W)
        out_dim: output dimension after any pooling
        activation: activation function; relu_actfun when None. Ignored
                    when pool_size > 1, where maxout pooling is used instead
        pool_size: maxout pool size; values <= 1 disable pooling
        drop_rate: probability of dropping each input element
        input_noise: scale of the gaussian noise added to the input
        bias_noise: scale of the gaussian noise added to the pre-activation
        W, b: optional shared weights/biases to use instead of new ones
        b_in, s_in: optional shared input bias/scale (stored in params, but
                    not applied to the input by the current code)
        name: prefix for the names of the shared variables
        W_scale: gain passed to ortho_matrix when initializing W
    """
    def __init__(self, rng, input, in_dim, out_dim, \
                 activation=None, pool_size=0, \
                 drop_rate=0., input_noise=0., bias_noise=0., \
                 W=None, b=None, b_in=None, s_in=None,
                 name="", W_scale=1.0):

        # Setup a shared random generator for this layer
        self.rng = RandStream(rng.randint(1000000))

        # setup scale and bias params for the input
        if b_in is None:
            # input biases are always initialized to zero
            ary = np.zeros((in_dim, ), dtype=theano.config.floatX)
            b_in = theano.shared(value=ary, name="{0:s}_b_in".format(name))
        if s_in is None:
            # input scales start at 0.541325, for which softplus(s_in) is
            # approximately one (see the disabled rescaling below)
            ary = 0.541325 * np.ones((in_dim, ), dtype=theano.config.floatX)
            s_in = theano.shared(value=ary, name="{0:s}_s_in".format(name))
        self.b_in = b_in
        self.s_in = s_in

        # allow an early shift and rescale for inputs to this layer
        #self.clean_input = T.nnet.softplus(self.s_in) * (input + self.b_in)
        # use the input directly
        self.clean_input = input

        # noise levels and drop rate live in shared arrays of shape (1,)
        zero_ary = np.zeros((1, )).astype(theano.config.floatX)
        self.input_noise = theano.shared(value=(zero_ary+input_noise),
                name="{0:s}_input_noise".format(name))
        self.bias_noise = theano.shared(value=(zero_ary+bias_noise),
                name="{0:s}_bias_noise".format(name))
        # named after what it holds (it used to reuse the bias_noise name)
        self.drop_rate = theano.shared(value=(zero_ary+drop_rate),
                name="{0:s}_drop_rate".format(name))

        # Add gaussian noise to the input (if desired)
        self.fuzzy_input = self.clean_input + (self.input_noise[0] *
                self.rng.normal(size=self.clean_input.shape, avg=0.0, std=1.0,
                dtype=theano.config.floatX))

        # Apply masking noise to the input (if desired)
        self.noisy_input = self._drop_from_input(self.fuzzy_input,
                self.drop_rate[0])

        # Set some basic layer properties
        self.pool_size = pool_size
        self.in_dim = in_dim
        self.out_dim = out_dim
        if self.pool_size <= 1:
            self.filt_count = self.out_dim
        else:
            # with pooling, each output is fed by pool_size linear filters
            self.filt_count = self.out_dim * self.pool_size
        # integer division keeps the count an int on Python 2 and Python 3
        self.pool_count = self.filt_count // max(self.pool_size, 1)
        if activation is None:
            activation = relu_actfun
        if self.pool_size <= 1:
            self.activation = activation
        else:
            self.activation = lambda x: \
                    maxout_actfun(x, self.pool_size, self.filt_count)

        # Get some random initial weights and biases, if not given
        if W is None:
            # Generate initial filters using orthogonal random trick
            W_init = ortho_matrix(shape=(self.in_dim, self.filt_count),
                    gain=W_scale)
            W_init = W_init.astype(theano.config.floatX)
            W = theano.shared(value=W_init, name="{0:s}_W".format(name))
        if b is None:
            b_init = np.zeros((self.filt_count, ), dtype=theano.config.floatX)
            b = theano.shared(value=b_init, name="{0:s}_b".format(name))

        # Set layer weights and biases
        self.W = W
        self.b = b

        # Compute linear "pre-activation" for this layer
        self.linear_output = T.dot(self.noisy_input, self.W) + self.b

        # Add noise to the pre-activation features (if desired)
        self.noisy_linear = self.linear_output + (self.bias_noise[0] *
                self.rng.normal(size=self.linear_output.shape, avg=0.0,
                std=1.0, dtype=theano.config.floatX))

        # Apply activation function
        self.output = self.activation(self.noisy_linear)

        # Compute some properties of the activations, probably to regularize
        self.act_l2_sum = T.sum(self.noisy_linear**2.) / self.output.size

        # Conveniently package layer parameters
        self.params = [self.W, self.b, self.b_in, self.s_in]
        # Layer construction complete...
        return

    def _drop_from_input(self, input, p):
        """
        Apply dropout to input, where p is the probability of dropping each
        element. Surviving elements are scaled by 1 / (1 - p).
        """
        # get a drop mask that drops things with probability p
        drop_rnd = self.rng.uniform(size=input.shape, low=0.0, high=1.0,
                dtype=theano.config.floatX)
        drop_mask = drop_rnd > p
        # get a scaling factor to keep expectations fixed after droppage
        drop_scale = 1. / (1. - p)
        # apply dropout mask and rescaling factor to the input
        droppy_input = drop_scale * input * drop_mask
        return droppy_input

    def _noisy_params(self, P, noise_lvl=0.):
        """Noisy weights, like convolving energy surface with a gaussian."""
        # add zero-mean gaussian noise with std noise_lvl to every entry of P
        P_nz = P + self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl,
                dtype=theano.config.floatX)
        return P_nz
Esempio n. 12
0
class VCGLoop(object):
    """
    Controller for training a self-looping VAE using guidance provided by a
    classifier. The classifier tries to discriminate between samples generated
    by the looped VAE while the VAE minimizes a variational generative model
    objective and also shifts mass away from regions where the classifier can
    discern that the generated data is denser than the training data.

    The generator must be an instance of the InfNet class implemented in
    "InfNet.py". The discriminator must be an instance of the PeaNet class,
    as implemented in "PeaNet.py". The inferencer must be an instance of the
    InfNet class implemented in "InfNet.py".

    Parameters:
        rng: numpy.random.RandomState (for reproducibility)
        x_d: symbolic var for providing points for starting the Markov Chain
        x_t: symbolic var for providing samples from the target distribution
        i_net: The InfNet instance that will serve as the inferencer
        g_net: The HydraNet instance that will serve as the generator
        d_net: The PeaNet instance that will serve as the discriminator
        chain_len: number of steps to unroll the VAE Markov Chain
        data_dim: dimension of the generated data
        z_dim: dimension of the model prior
        params: a dict of parameters for controlling various costs
            x_type: can be "bernoulli" or "gaussian"
            xt_transform: optional transform for gaussian means
            logvar_bound: optional bound on gaussian output logvar
            cost_decay: rate of decay for VAE costs in unrolled chain
            chain_type: can be 'walkout' or 'walkback'
            lam_l2d: regularization on squared discriminator output
    """
    def __init__(self, rng=None, x_d=None, x_t=None, \
                 i_net=None, g_net=None, d_net=None, \
                 chain_len=None, data_dim=None, z_dim=None, \
                 params=None):
        """
        Build the unrolled chain, the adversarial/variational costs, their
        gradients and Adam updates, then compile the joint training function.

        See the class docstring for a description of the arguments. Although
        every argument defaults to None, the constructor as written needs:
            rng: must provide randint (used to seed the symbolic RNG)
            chain_len: must work with range(), since the chain outputs are
                       indexed step-by-step in Python
            params: must contain 'x_type' and 'lam_l2d', which are read
                    unconditionally; the other keys are optional
        """
        # seed the symbolic random stream from the given rng, and record the
        # data/latent dimensions and the prior's mean/logvar (both 0.0)
        self.rng = RandStream(rng.randint(100000))
        self.data_dim = data_dim
        self.z_dim = z_dim
        self.p_z_mean = 0.0
        self.p_z_logvar = 0.0
        if params is None:
            self.params = {}
        else:
            self.params = params
        # cost_decay: per-step decay of the chain cost weights (default 0.1)
        if 'cost_decay' in self.params:
            self.cost_decay = self.params['cost_decay']
        else:
            self.cost_decay = 0.1
        # chain_type: selects the reconstruction target used at each step of
        # the chain (see chain_step_func below); defaults to 'walkout'
        if 'chain_type' in self.params:
            assert((self.params['chain_type'] == 'walkback') or \
                (self.params['chain_type'] == 'walkout'))
            self.chain_type = self.params['chain_type']
        else:
            self.chain_type = 'walkout'
        # xt_transform: maps the generator's raw output to the observation
        # "mean"; either a sigmoid (the default) or the identity ('none')
        if 'xt_transform' in self.params:
            assert((self.params['xt_transform'] == 'sigmoid') or \
                    (self.params['xt_transform'] == 'none'))
            if self.params['xt_transform'] == 'sigmoid':
                self.xt_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.xt_transform = lambda x: x
        else:
            self.xt_transform = lambda x: T.nnet.sigmoid(x)
        # logvar_bound: stored here with a default of 10
        # NOTE(review): not read again in this constructor; the bounded
        # logvar actually used below is taken from the OneStageModel.
        if 'logvar_bound' in self.params:
            self.logvar_bound = self.params['logvar_bound']
        else:
            self.logvar_bound = 10
        #
        # x_type: this tells if we're using bernoulli or gaussian model for
        #         the observations (required key -- no default is provided)
        #
        self.x_type = self.params['x_type']
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))

        # grab symbolic input variables
        self.x_d = x_d             # initial input for starting the chain
        self.x_t = x_t             # samples from target distribution
        self.z_zmuv = T.tensor3()  # ZMUV gaussian samples for use in scan

        # get the number of steps for chain unrolling
        self.chain_len = chain_len 

        # symbolic indices of the rows that come from the target distribution
        self.It = T.arange(self.x_t.shape[0])
        # symbolic indices of the rows that come from the generated chain;
        # these follow the target rows, offset by x_t.shape[0]
        self.Id = T.arange(self.chain_len * self.x_d.shape[0]) + self.x_t.shape[0]

        # wrap the generator/inferencer in a OneStageModel, and keep handles
        # to its networks and transforms for easy access
        self.OSM = OneStageModel(rng=rng, x_in=self.x_d, \
                                 p_x_given_z=g_net, q_z_given_x=i_net, \
                                 x_dim=self.data_dim, z_dim=self.z_dim, \
                                 params=self.params)
        self.IN = self.OSM.q_z_given_x
        self.GN = self.OSM.p_x_given_z
        self.transform_x_to_z = self.OSM.transform_x_to_z
        self.transform_z_to_x = self.OSM.transform_z_to_x
        self.bounded_logvar = self.OSM.bounded_logvar

        ##################################################
        # self-loop the VAE into a multi-step Markov chain.
        # ** Each step maps the previous x through the inferencer to get
        #    the mean/logvar of z, forms z from the supplied ZMUV noise,
        #    and maps z through the generator to get the next x.
        ##################################################
        # Setup the iterative generation loop using scan #
        ##################################################
        def chain_step_func(zi_zmuv, xim1):
            # zi_zmuv: ZMUV noise for this step (one slice of self.z_zmuv)
            # xim1: the x produced by the previous step (self.x_d at step 0)
            # get mean and logvar of z samples for this step
            zi_mean, zi_logvar = self.IN.apply(xim1, do_samples=False)
            # transform ZMUV samples to get desired samples
            zi = (T.exp(0.5 * zi_logvar) * zi_zmuv) + zi_mean
            # get the next generated xi (pre-transformation); only the last
            # of the generator's outputs is used
            outputs = self.GN.apply(zi)
            xti = outputs[-1]
            # apply the observation "mean" transform
            xgi = self.xt_transform(xti)
            # compute NLL for this step: 'walkout' always reconstructs the
            # chain's starting point, while 'walkback' reconstructs the
            # previous step's x
            if self.chain_type == 'walkout':
                x_true = self.x_d
            else:
                x_true = xim1
            nlli = self._log_prob(x_true, xgi).flatten()
            # KL divergence from the prior, summed over axis 1
            kldi = T.sum(gaussian_kld(zi_mean, zi_logvar, \
                         self.p_z_mean, self.p_z_logvar), axis=1)
            return xgi, nlli, kldi

        # apply the scan op: only the first output (xgi) is fed back into
        # the next step; the nll/kld outputs are just collected
        init_values = [self.x_d, None, None]
        self.scan_results, self.scan_updates = \
                theano.scan(chain_step_func, outputs_info=init_values, \
                            sequences=self.z_zmuv)
        # get the outputs of the scan op
        self.xgi = self.scan_results[0]
        self.nlli = self.scan_results[1]
        self.kldi = self.scan_results[2]
        # per-step generated batches, indexed in Python (so chain_len must
        # be usable with range())
        self.xgi_list = [self.xgi[i] for i in range(self.chain_len)]

        # make a clone of the desired discriminator network, which will try
        # to discriminate between samples from the training data and samples
        # generated by the self-looped VAE chain. Its input stacks x_t first
        # and then each step's generated batch, matching self.It / self.Id.
        self.DN = d_net.shared_param_clone(rng=rng, \
                          Xd=T.vertical_stack(self.x_t, *self.xgi_list))

        # all weights/rates below are stored as length-1 floatX vectors
        zero_ary = np.zeros((1,)).astype(theano.config.floatX)
        # init shared var for weighting nll of data given posterior sample
        self.lam_chain_nll = theano.shared(value=zero_ary, name='vcg_lam_chain_nll')
        self.set_lam_chain_nll(lam_chain_nll=1.0)
        # init shared var for weighting posterior KL-div from prior
        self.lam_chain_kld = theano.shared(value=zero_ary, name='vcg_lam_chain_kld')
        self.set_lam_chain_kld(lam_chain_kld=1.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='vcg_lam_l2w')
        self.set_lam_l2w(lam_l2w=1e-4)
        # shared var learning rates for all networks
        self.lr_dn = theano.shared(value=zero_ary, name='vcg_lr_dn')
        self.lr_gn = theano.shared(value=zero_ary, name='vcg_lr_gn')
        self.lr_in = theano.shared(value=zero_ary, name='vcg_lr_in')
        # shared var momentum parameters for all networks
        self.mom_1 = theano.shared(value=zero_ary, name='vcg_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='vcg_mom_2')
        # shared var weights for adversarial classification objective
        self.dw_dn = theano.shared(value=zero_ary, name='vcg_dw_dn')
        self.dw_gn = theano.shared(value=zero_ary, name='vcg_dw_gn')
        # init parameters for controlling learning dynamics
        self.set_all_sgd_params()
        # init adversarial cost weights for GN/DN
        self.set_disc_weights()  
        # set a shared var for regularizing the output of the discriminator
        # NOTE(review): this reads the raw `params` argument rather than
        # self.params, and 'lam_l2d' is a required key.
        self.lam_l2d = theano.shared(value=(zero_ary + params['lam_l2d']), \
                                     name='vcg_lam_l2d')

        # Grab the full set of "optimizable" parameters from the generator,
        # inferencer and discriminator networks that we'll be working with.
        # We need to ignore parameters in the final layers of the
        # proto-networks in the discriminator network (a generalized
        # pseudo-ensemble). We ignore them because the VCGLoop requires that
        # they be "bypassed" in favor of some binary classification layers
        # that will be managed by this VCGLoop.
        self.dn_params = []
        for pn in self.DN.proto_nets:
            for pnl in pn[0:-1]:
                self.dn_params.extend(pnl.params)
        self.in_params = [p for p in self.IN.mlp_params]
        self.gn_params = [p for p in self.GN.mlp_params]
        # NOTE(review): joint_params is a new list built before the disc
        # layers are constructed below, so it does not pick up the
        # disc-layer params later appended to dn_params -- confirm intended.
        self.joint_params = self.in_params + self.gn_params + self.dn_params

        # Now construct a binary discriminator layer for each spawn-net in
        # the discriminator network. Their params are appended to dn_params.
        self._construct_disc_layers(rng)
        self.disc_reg_cost = self.lam_l2d[0] * \
                T.sum([dl.act_l2_sum for dl in self.disc_layers])

        # Construct costs for the generator and discriminator networks based 
        # on adversarial binary classification
        self.disc_cost_dn, self.disc_cost_gn = self._construct_disc_costs()

        # first, build the cost to be optimized by the discriminator network,
        # in general this will be treated somewhat independently of the
        # optimization of the generator and inferencer networks.
        self.dn_cost = self.disc_cost_dn + self.disc_reg_cost

        # construct costs relevant to the optimization of the generator and
        # inferencer networks
        self.chain_nll_cost = self.lam_chain_nll[0] * \
                self._construct_chain_nll_cost(cost_decay=self.cost_decay)
        self.chain_kld_cost = self.lam_chain_kld[0] * \
                self._construct_chain_kld_cost(cost_decay=self.cost_decay)
        self.other_reg_cost = self._construct_other_reg_cost()
        self.osm_cost = self.disc_cost_gn + self.chain_nll_cost + \
                        self.chain_kld_cost + self.other_reg_cost
        # compute total cost on the discriminator and VB generator/inferencer
        self.joint_cost = self.dn_cost + self.osm_cost

        print("Computing VCGLoop joint_grad...")
        # grab the gradients for all parameters to optimize: discriminator
        # params follow dn_cost, inferencer/generator params follow osm_cost
        self.joint_grads = OrderedDict()
        for p in self.dn_params:
            self.joint_grads[p] = T.grad(self.dn_cost, p)
        for p in self.in_params:
            self.joint_grads[p] = T.grad(self.osm_cost, p)
        for p in self.gn_params:
            self.joint_grads[p] = T.grad(self.osm_cost, p)

        # construct the updates for the discriminator, generator and 
        # inferencer networks. all networks share the same first/second
        # moment momentum parameters (mom_1/mom_2). the networks each have
        # their own learning rates, which lets you turn their learning on/off.
        self.dn_updates = get_adam_updates(params=self.dn_params, \
                grads=self.joint_grads, alpha=self.lr_dn, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)
        self.in_updates = get_adam_updates(params=self.in_params, \
                grads=self.joint_grads, alpha=self.lr_in, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)
        self.gn_updates = get_adam_updates(params=self.gn_params, \
                grads=self.joint_grads, alpha=self.lr_gn, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)

        # bag up all the updates required for training (discriminator first,
        # then inferencer, then generator)
        self.joint_updates = OrderedDict()
        for k in self.dn_updates:
            self.joint_updates[k] = self.dn_updates[k]
        for k in self.in_updates:
            self.joint_updates[k] = self.in_updates[k]
        for k in self.gn_updates:
            self.joint_updates[k] = self.gn_updates[k]

        print("Compiling VCGLoop train_joint...")
        # construct the function for training on training data
        self.train_joint = self._construct_train_joint()
        return

    def set_dn_sgd_params(self, learn_rate=0.01):
        """
        Store learn_rate as the learning rate for the discriminator network.

        The value is written to the shared variable lr_dn as a floatX array.
        """
        lr_value = (np.zeros((1,)) + learn_rate).astype(theano.config.floatX)
        self.lr_dn.set_value(lr_value)

    def set_in_sgd_params(self, learn_rate=0.01):
        """
        Store learn_rate as the learning rate for the inferencer network.

        The value is written to the shared variable lr_in as a floatX array.
        """
        lr_value = (np.zeros((1,)) + learn_rate).astype(theano.config.floatX)
        self.lr_in.set_value(lr_value)

    def set_gn_sgd_params(self, learn_rate=0.01):
        """
        Store learn_rate as the learning rate for the generator network.

        The value is written to the shared variable lr_gn as a floatX array.
        """
        lr_value = (np.zeros((1,)) + learn_rate).astype(theano.config.floatX)
        self.lr_gn.set_value(lr_value)

    def set_all_sgd_params(self, learn_rate=0.01, mom_1=0.9, mom_2=0.999):
        """
        Set one learning rate for all three networks, along with the
        first/second moment momentum parameters shared by their updates.
        """
        float_x = theano.config.floatX
        base = np.zeros((1,))
        # the discriminator, generator and inferencer get the same rate
        lr_value = base + learn_rate
        for lr_var in (self.lr_dn, self.lr_gn, self.lr_in):
            lr_var.set_value(lr_value.astype(float_x))
        # then the first/second moment momentum parameters
        mom_1_value = base + mom_1
        mom_2_value = base + mom_2
        self.mom_1.set_value(mom_1_value.astype(float_x))
        self.mom_2.set_value(mom_2_value.astype(float_x))

    def set_disc_weights(self, dweight_gn=1.0, dweight_dn=1.0):
        """
        Set weights for the adversarial classification cost.

        Parameters:
            dweight_gn: weight stored in dw_gn, which scales the generator's
                        adversarial cost
            dweight_dn: weight stored in dw_dn, which scales the
                        discriminator's adversarial cost
        """
        # Cast to floatX *after* adding the weight, as the other setters in
        # this class do. Casting first lets a float64 numpy scalar or array
        # promote the sum back to float64, which a float32 shared variable
        # will not accept.
        zero_ary = np.zeros((1,))
        new_dw_dn = zero_ary + dweight_dn
        self.dw_dn.set_value(new_dw_dn.astype(theano.config.floatX))
        new_dw_gn = zero_ary + dweight_gn
        self.dw_gn.set_value(new_dw_gn.astype(theano.config.floatX))
        return

    def set_lam_chain_nll(self, lam_chain_nll=1.0):
        """
        Set the weight applied to the chain's data likelihood (NLL) cost.
        """
        lam_value = np.zeros((1,)) + lam_chain_nll
        self.lam_chain_nll.set_value(lam_value.astype(theano.config.floatX))

    def set_lam_chain_kld(self, lam_chain_kld=1.0):
        """
        Set the weight applied to the KL-divergence cost on the continuous
        posterior variables. A value of 1.0 gives KL(posterior || prior) its
        standard role in variational learning.
        """
        lam_value = np.zeros((1,)) + lam_chain_kld
        self.lam_chain_kld.set_value(lam_value.astype(theano.config.floatX))

    def set_lam_l2w(self, lam_l2w=1e-3):
        """
        Set the weight applied to the l2 regularization on network params.
        """
        lam_value = np.zeros((1,)) + lam_l2w
        self.lam_l2w.set_value(lam_value.astype(theano.config.floatX))

    def _construct_zmuv_samples(self, xi, br):
        """
        Build the (symbolic) zero-mean, unit-variance gaussian samples needed
        for computing through this VCGLoop for input (symbolic) matrix xi.

        The samples have shape (chain_len, xi.shape[0] * br, z_dim).
        """
        sample_shape = (self.chain_len, xi.shape[0]*br, self.z_dim)
        return self.rng.normal(size=sample_shape, avg=0.0, std=1.0,
                               dtype=theano.config.floatX)

    def _construct_disc_layers(self, rng):
        """
        Build one binary discrimination layer (DiscLayer) per spawn-net in the
        underlying discriminator pseudo-ensemble.

        Fills self.disc_layers and self.disc_outputs, and appends each new
        layer's params to self.dn_params.
        """
        self.disc_layers = []
        self.disc_outputs = []
        w_scale = self.DN.init_scale
        for spawn_net in self.DN.spawn_nets:
            # the DiscLayer sits on top of the spawn net, reading the same
            # (noisy) input as the spawn net's final layer
            final_layer = spawn_net[-1]
            disc_layer = DiscLayer(rng=rng, input=final_layer.noisy_input,
                                   in_dim=final_layer.in_dim,
                                   W_scale=w_scale)
            self.disc_layers.append(disc_layer)
            # keep the (linear) output of the DiscLayer, for possible reuse
            self.disc_outputs.append(disc_layer.linear_output)
            # collect the DiscLayer's params, for convenient optimization
            self.dn_params.extend(disc_layer.params)

    def _construct_disc_costs(self):
        """
        Build the adversarial costs for the discriminator and the generator.

        Returns a two-element list: [discriminator cost, generator cost].
        """
        per_net_dn = []
        per_net_gn = []
        for disc_out in self.disc_outputs:
            # split predictions into target-data rows and generated rows
            preds_data = disc_out.take(self.It, axis=0)
            preds_noise = disc_out.take(self.Id, axis=0)
            n_data = T.cast(self.It.size, 'floatX')
            n_noise = T.cast(self.Id.size, 'floatX')
            # discriminator cost: data labelled 1.0, generated labelled -1.0
            data_term = logreg_loss(preds_data, 1.0) / n_data
            noise_term = logreg_loss(preds_noise, -1.0) / n_noise
            disc_net_cost = data_term + noise_term
            # generator cost: hinge plus squared hinge on generated rows
            hinge_total = hinge_loss(preds_noise, 0.0) + \
                          hinge_sq_loss(preds_noise, 0.0)
            gen_net_cost = hinge_total / (2.0 * n_noise)
            per_net_dn.append(disc_net_cost)
            per_net_gn.append(gen_net_cost)
        total_dn = self.dw_dn[0] * T.sum(per_net_dn)
        total_gn = self.dw_gn[0] * T.sum(per_net_gn)
        return [total_dn, total_gn]

    def _log_prob(self, x_true, x_apprx):
        """
        Return the negative log-likelihood of x_true given x_apprx, using the
        bernoulli or gaussian form selected by self.x_type.
        """
        if self.x_type == 'bernoulli':
            return -log_prob_bernoulli(x_true, x_apprx)
        # any other x_type falls through to the gaussian form
        return -log_prob_gaussian2(x_true, x_apprx,
                                   log_vars=self.bounded_logvar)

    def _construct_chain_nll_cost(self, cost_decay=0.1):
        """
        Construct the negative log-likelihood part of cost to minimize.

        This is for operation in "free chain" mode, where a seed point is used
        to initialize a long(ish) running markov chain.
        """
        assert((cost_decay > 0.0) and (cost_decay < 1.0))
        nll_costs = []
        step_weight = 1.0
        step_weights = []
        step_decay = cost_decay
        for i in range(self.chain_len):
            c = T.mean(self.nlli[i])
            nll_costs.append(step_weight * c)
            step_weights.append(step_weight)
            step_weight = step_weight * step_decay
        nll_cost = sum(nll_costs) / sum(step_weights)
        return nll_cost

    def _construct_chain_kld_cost(self, cost_decay=0.1):
        """
        Construct the posterior KLd from prior part of cost to minimize.

        This is for operation in "free chain" mode, where a seed point is used
        to initialize a long(ish) running markov chain.
        """
        assert((cost_decay > 0.0) and (cost_decay < 1.0))
        kld_costs = []
        step_weight = 1.0
        step_weights = []
        step_decay = cost_decay
        for i in range(self.chain_len):
            # sum and reweight the KLd cost for this step in the chain
            c = T.mean(self.kldi[i])
            kld_costs.append(step_weight * c)
            step_weights.append(step_weight)
            step_weight = step_weight * step_decay
        kld_cost = sum(kld_costs) / sum(step_weights)
        return kld_cost

    def _construct_other_reg_cost(self):
        """
        Construct the cost for low-level basic regularization. E.g. for
        applying l2 regularization to the network parameters.
        """
        gp_cost = sum([T.sum(par**2.0) for par in self.gn_params])
        ip_cost = sum([T.sum(par**2.0) for par in self.in_params])
        other_reg_cost = self.lam_l2w[0] * (gp_cost + ip_cost)
        return other_reg_cost

    def _construct_train_joint(self):
        """
        Compile the theano function that trains the generator, inferencer and
        discriminator jointly.

        The compiled function takes (xd, xt, br): chain seed points, target
        samples, and the number of times each row of xd is repeated. It
        applies self.joint_updates and returns the joint cost followed by the
        chain NLL, chain KLd, generator/discriminator adversarial costs and
        the parameter regularization cost.
        """
        # symbolic vars for passing input to training function
        xd = T.matrix()
        xt = T.matrix()
        br = T.lscalar()
        zzmuv = self._construct_zmuv_samples(xd, br)
        # costs reported back to the caller
        reported_costs = [self.joint_cost, self.chain_nll_cost,
                          self.chain_kld_cost, self.disc_cost_gn,
                          self.disc_cost_dn, self.other_reg_cost]
        # substitute the function inputs for the model's symbolic inputs
        substitutions = {self.x_d: xd.repeat(br, axis=0),
                         self.x_t: xt,
                         self.z_zmuv: zzmuv}
        return theano.function(inputs=[xd, xt, br],
                               outputs=reported_costs,
                               updates=self.joint_updates,
                               givens=substitutions)

    def sample_from_chain(self, X_d, X_c=None, X_m=None, loop_iters=5, \
                          sigma_scale=None):
        """
        Sample for several rounds through the I<->G loop, initialized with the
        the "data variable" samples in X_d.
        """
        result = self.OSM.sample_from_chain(X_d, X_c=X_c, X_m=X_m, \
                        loop_iters=loop_iters, sigma_scale=sigma_scale)
        return result

    def sample_from_prior(self, samp_count):
        """
        Draw independent samples from the model's prior.
        """
        Xs = self.OSM.sample_from_prior(samp_count)
        return Xs
Esempio n. 13
0
class GenNet(object):
    """
    A net that transforms a simple distribution so that it matches some
    more complicated distribution, for some definition of match....

    Parameters:
        rng: a numpy.random RandomState object
        Xp: symbolic matrix for inputting latent variable samples
        prior_sigma: standard deviation of isotropic Gaussian prior that this
                     generator will transform to match some other distribution
        params: a dict of parameters describing the desired network:
            lam_l2a: L2 regularization weight on neuron activations
            vis_drop: drop rate to use on the latent variable space
            hid_drop: drop rate to use on the hidden layer activations
                -- note: vis_drop/hid_drop are optional, with defaults 0.0/0.0
            bias_noise: standard dev for noise on the biases of hidden layers
            mlp_config: list of "layer descriptions"
            out_type: set this to "bernoulli" for generating outputs to match
                      bernoulli-valued observations and set it to "gaussian" to
                      match general real-valued observations.
            activation: "function handle" for the desired non-linearity
            init_scale: scaling factor for hidden layer weights (__ * 0.01)
        shared_param_dicts: parameters for the MLP controlled by this GenNet
    """
    def __init__(self, \
            rng=None, \
            Xp=None, \
            prior_sigma=None, \
            params=None, \
            shared_param_dicts=None):
        # First, setup a shared random number generator for this layer
        self.rng = RandStream(rng.randint(1000000))

        # Grab the symbolic input matrix
        self.Xp = Xp
        self.prior_sigma = prior_sigma
        #####################################################
        # Process user-supplied parameters for this network #
        #####################################################
        assert(not (params is None))
        self.params = params
        lam_l2a = self.params['lam_l2a']
        if 'vis_drop' in self.params:
            # Drop rate on the latent variables
            self.vis_drop = self.params['vis_drop']
        else:
            self.vis_drop = 0.0
        if 'hid_drop' in self.params:
            # Drop rate on hidden layer activations
            self.hid_drop = self.params['hid_drop']
        else:
            self.hid_drop = 0.0
        if 'bias_noise' in self.params:
            # Noise sigma for hidden layer biases
            self.bias_noise = self.params['bias_noise']
        else:
            self.bias_noise = 0.0
        if 'init_scale' in params:
            self.init_scale = params['init_scale']
        else:
            self.init_scale = 1.0
        if 'out_type' in params:
            # check which type of output distribution to generate
            self.out_type = params['out_type']
            assert((self.out_type == 'bernoulli') or \
                    (self.out_type == 'gaussian'))
        else:
            # default to bernoulli-valued outputs
            self.out_type = 'bernoulli'
        # Check if the params for this net were given a priori. This option
        # will be used for creating "clones" of a generative network, with all
        # of the network parameters shared between clones.
        if shared_param_dicts is None:
            # This is not a clone, and we will need to make a dict for
            # referring to the parameters of each network layer
            self.shared_param_dicts = []
            self.is_clone = False
        else:
            # This is a clone, and its layer parameters can be found by
            # referring to the given param dict (i.e. shared_param_dicts).
            self.shared_param_dicts = shared_param_dicts
            self.is_clone = True
        # Get the configuration/prototype for this network. The config is a
        # list of layer descriptions, including a description for the input
        # layer, which is typically just the dimension of the inputs. So, the
        # depth of the mlp is one less than the number of layer configs.
        self.mlp_config = params['mlp_config']
        if 'activation' in params:
            self.activation = params['activation']
        else:
            self.activation = relu_actfun
        self.mlp_depth = len(self.mlp_config) - 1
        self.latent_dim = self.mlp_config[0]
        self.data_dim = self.mlp_config[-1]

        ##########################
        # Initialize the network #
        ##########################
        self.mlp_layers = []
        self.logvar_layer = None
        layer_def_pairs = zip(self.mlp_config[:-1],self.mlp_config[1:])
        layer_num = 0
        next_input = self.Xp
        for in_def, out_def in layer_def_pairs:
            first_layer = (layer_num == 0)
            last_layer = (layer_num == (len(layer_def_pairs) - 1))
            l_name = "gn_layer_{0:d}".format(layer_num)
            if (type(in_def) is list) or (type(in_def) is tuple):
                # Receiving input from a poolish layer...
                in_dim = in_def[0]
            else:
                # Receiving input from a normal layer...
                in_dim = in_def
            if (type(out_def) is list) or (type(out_def) is tuple):
                # Applying some sort of pooling in this layer...
                out_dim = out_def[0]
                pool_size = out_def[1]
            else:
                # Not applying any pooling in this layer...
                out_dim = out_def
                pool_size = 0
            # Select the appropriate noise to add to this layer
            if first_layer:
                d_rate = self.vis_drop
            else:
                d_rate = self.hid_drop
            b_noise = self.bias_noise
            if not self.is_clone:
                ##########################################
                # Initialize a layer with new parameters #
                ##########################################
                new_layer = HiddenLayer(rng=rng, input=next_input, \
                        activation=self.activation, pool_size=pool_size, \
                        drop_rate=d_rate, input_noise=0., bias_noise=b_noise, \
                        in_dim=in_dim, out_dim=out_dim, \
                        name=l_name, W_scale=self.init_scale)
                self.mlp_layers.append(new_layer)
                self.shared_param_dicts.append({'W': new_layer.W, 'b': new_layer.b})
                if (last_layer and (self.out_type == 'gaussian')):
                    # add an extra layer/transform for encoding log-variance
                    lv_layer = HiddenLayer(rng=rng, input=next_input, \
                        activation=self.activation, pool_size=pool_size, \
                        drop_rate=d_rate, input_noise=0., bias_noise=b_noise, \
                        in_dim=in_dim, out_dim=out_dim, \
                        name=l_name+'_logvar', W_scale=self.init_scale)
                    self.logvar_layer = lv_layer
                    self.mlp_layers.append(lv_layer)
                    self.shared_param_dicts.append({'W': lv_layer.W, 'b': lv_layer.b})
            else:
                ##################################################
                # Initialize a layer with some shared parameters #
                ##################################################
                init_params = self.shared_param_dicts[layer_num]
                self.mlp_layers.append(HiddenLayer(rng=rng, input=next_input, \
                        activation=self.activation, pool_size=pool_size, \
                        drop_rate=d_rate, input_noise=0., bias_noise=b_noise, \
                        in_dim=in_dim, out_dim=out_dim, \
                        W=init_params['W'], b=init_params['b'], \
                        name=l_name, W_scale=self.init_scale))
                if (last_layer and (self.out_type == 'gaussian')):
                    init_params = self.shared_param_dicts[layer_num+1]
                    self.mlp_layers.append(HiddenLayer(rng=rng, input=next_input, \
                        activation=self.activation, pool_size=pool_size, \
                        drop_rate=d_rate, input_noise=0., bias_noise=b_noise, \
                        in_dim=in_dim, out_dim=out_dim, \
                        W=init_params['W'], b=init_params['b'], \
                        name=l_name, W_scale=self.init_scale))
            next_input = self.mlp_layers[-1].output
            # Acknowledge layer completion
            layer_num = layer_num + 1

        # construct a mask for deciding which output dimensions to keep/ignore
        if self.is_clone:
            self.output_mask = self.shared_param_dicts[-1]['output_mask']
            self.output_bias = self.shared_param_dicts[-1]['output_bias']
        else:
            row_mask = np.ones((self.data_dim,)).astype(theano.config.floatX)
            self.output_mask = theano.shared(value=row_mask, name='gn_output_mask')
            row_mask = 0.0 * row_mask
            self.output_bias = theano.shared(value=row_mask, name='gn_output_bias')
            op_dict = {'output_mask': self.output_mask, \
                       'output_bias': self.output_bias}
            self.shared_param_dicts.append(op_dict)

        # Mash all the parameters together, into a list.
        self.mlp_params = []
        for layer in self.mlp_layers:
            self.mlp_params.extend(layer.params)
        # add the output bias vector to the param list
        self.mlp_params.append(self.output_bias)


        # The output of this generator network is given by the noisy output
        # of its final layer. We will keep a running estimate of the mean and
        # covariance of the distribution induced by combining this network's
        # latent noise source with its deep non-linear transform. These will
        # be used to encourage the induced distribution to match the first and
        # second-order moments of the distribution we are trying to match.
        if self.out_type == 'bernoulli':
            self.output = (T.nnet.sigmoid(self.mlp_layers[-1].linear_output + self.output_bias) * \
                    self.output_mask)
            self.output_mu = self.output
            self.output_logvar = self.output
            self.output_sigma = self.output
        else:
            self.output_mu = self.mlp_layers[-1].linear_output + self.output_bias
            self.output_logvar = self.mlp_layers[-2].linear_output
            self.output_sigma = T.sqrt(T.exp(self.output_logvar))
            self.output = self._construct_post_samples() * self.output_mask
        self.out_dim = self.mlp_layers[-1].out_dim
        C_init = np.zeros((self.out_dim,self.out_dim)).astype(theano.config.floatX)
        m_init = np.zeros((self.out_dim,)).astype(theano.config.floatX)
        self.dist_mean = theano.shared(m_init, name='gn_dist_mean')
        self.dist_cov = theano.shared(C_init, name='gn_dist_cov')
        # Get simple regularization penalty to moderate activation dynamics
        self.act_reg_cost = lam_l2a * self._act_reg_cost()
        # Construct a sampler for drawing independent samples from this model's
        # isotropic Gaussian prior, and a sampler for the model distribution.
        self.sample_from_prior = self._construct_prior_sampler()
        self.sample_from_model = self._construct_model_sampler()
        # Construct a function for passing points from the latent/prior space
        # through the transform induced by the current model parameters.
        self.transform_prior = self._construct_transform_prior()
        return

    def _act_reg_cost(self):
        """
        Combine the per-layer activation penalties into one symbolic term.

        Collects the act_l2_sum attribute of every layer in self.mlp_layers
        and returns the T.sum over that collection.
        """
        layer_penalties = [mlp_layer.act_l2_sum for mlp_layer in self.mlp_layers]
        return T.sum(layer_penalties)

    def _construct_post_samples(self):
        """
        Build one sample per output entry from the distributions described by
        self.output_mu and self.output_sigma.

        The sample is output_mu plus output_sigma times noise drawn from
        self.rng.normal with avg=0.0 and std=1.0, shaped like output_sigma.
        """
        unit_noise = self.rng.normal(size=self.output_sigma.shape, \
                avg=0.0, std=1.0, dtype=theano.config.floatX)
        return self.output_mu + (self.output_sigma * unit_noise)

    def _construct_prior_sampler(self):
        """
        Compile a sampler for this model's prior.

        The returned theano function takes a sample count and outputs a
        (count, self.latent_dim) array of avg=0.0 / std=1.0 draws from
        self.rng.normal, scaled by self.prior_sigma.
        """
        n_samples = T.lscalar()
        unit_noise = self.rng.normal( \
                size=(n_samples, self.latent_dim), avg=0.0, std=1.0, \
                dtype=theano.config.floatX)
        scaled_noise = self.prior_sigma * unit_noise
        return theano.function([n_samples], outputs=scaled_noise)

    def _construct_model_sampler(self):
        """
        Compile a sampler for this model's output distribution.

        The returned theano function takes a sample count, draws that many
        latent points (self.prior_sigma times avg=0.0 / std=1.0 noise from
        self.rng.normal), substitutes them for self.Xp, and outputs
        self.output.
        """
        n_samples = T.lscalar()
        latent_draws = self.prior_sigma * self.rng.normal( \
                size=(n_samples, self.latent_dim), avg=0.0, std=1.0, \
                dtype=theano.config.floatX)
        model_sampler = theano.function([n_samples], outputs=self.output, \
                givens={self.Xp: latent_draws})
        return model_sampler

    def _construct_transform_prior(self):
        """
        Compile a function that applies the transform induced by the current
        model parameters to points in the latent/prior space.

        The returned theano function takes a value for self.Xp and outputs
        self.output.
        """
        return theano.function([self.Xp], outputs=self.output)

    def _batch_moments(self):
        """
        Build symbolic first and second moments of the current outputs.

        Returns [mu, sigma], where mu is the mean of self.output over axis 0
        (dims kept) and sigma is the product of the centered outputs with
        themselves. Note that sigma is not divided by the batch size here.
        """
        batch_mean = T.mean(self.output, axis=0, keepdims=True)
        centered_t = self.output.T - batch_mean.T
        centered = self.output - batch_mean
        scatter = T.dot(centered_t, centered)
        return [batch_mean, scatter]

    def init_biases(self, b_init=0.0):
        """
        Reset the biases of every layer except the last to a constant.

        For each layer in self.mlp_layers[:-1], the current bias value is
        zeroed (keeping its shape and type) and b_init is added before the
        result is written back with set_value.
        """
        all_but_last = self.mlp_layers[:-1]
        for mlp_layer in all_but_last:
            current_bias = mlp_layer.b.get_value(borrow=False)
            mlp_layer.b.set_value((0.0 * current_bias) + b_init)
        return

    def init_moments(self, X_noise):
        """
        Seed the running mean/covariance estimates from a batch of outputs.

        X_noise is cast to floatX, substituted for self.Xp, and pushed
        through the network. The mean over axis 0 and the covariance
        (centered product divided by the number of rows) of the resulting
        outputs are stored in self.dist_mean and self.dist_cov.
        """
        latent_sym = T.matrix()
        forward = theano.function(inputs=[ latent_sym ], \
                outputs=[ self.output ], \
                givens={self.Xp: latent_sym})
        # outputs produced for the supplied latent noise matrix
        samples = forward(X_noise.astype(theano.config.floatX))[0]
        # empirical moments of those outputs
        sample_mean = np.mean(samples, axis=0)
        centered = samples - sample_mean
        sample_cov = np.dot(centered.T, centered) / samples.shape[0]
        # overwrite the running estimates
        self.dist_cov.set_value(sample_cov.astype(theano.config.floatX))
        self.dist_mean.set_value(sample_mean.astype(theano.config.floatX))
        return

    def set_output_mask(self, output_mask):
        """
        Replace the mask applied to the output dimensions.

        output_mask must contain exactly self.data_dim entries; it is
        flattened to shape (self.data_dim,), cast to floatX, and written
        into the shared variable self.output_mask.
        """
        assert(output_mask.size == self.data_dim)
        flat_mask = output_mask.reshape((self.data_dim,))
        flat_mask = flat_mask.astype(theano.config.floatX)
        self.output_mask.set_value(flat_mask)
        return

    def compute_log_prob(self, Xd=None):
        """
        Score the data in Xd under the current output distributions.

        For 'bernoulli' outputs this returns log_prob_bernoulli applied to
        self.output; otherwise it returns log_prob_gaussian2 applied to
        self.output_mu and self.output_logvar. All entries of Xd are scored,
        subject to self.output_mask.

        NOTE(review): the helper names suggest a log-probability rather than
        a negative log-likelihood -- confirm the sign against the helpers.
        """
        if (self.out_type != 'bernoulli'):
            return log_prob_gaussian2(Xd, self.output_mu, \
                    les_logvars=self.output_logvar, mask=self.output_mask)
        return log_prob_bernoulli(Xd, self.output, mask=self.output_mask)

    def masked_log_prob(self, Xc=None, Xm=None):
        """
        Score the data in Xc under the current output distributions, but
        only for the entries that Xm does not mark.

        The log-pdf helpers are given the inverse mask (1 - Xm), so entries
        where Xm[i] == 1 get a zero mask value and entries where Xm[i] == 0
        get a mask value of one.

        NOTE(review): the helper names suggest a log-probability rather than
        a negative log-likelihood -- confirm the sign against the helpers.
        """
        inverse_mask = 1.0 - Xm
        if (self.out_type != 'bernoulli'):
            return log_prob_gaussian2(Xc, self.output_mu, \
                    les_logvars=self.output_logvar, mask=inverse_mask)
        return log_prob_bernoulli(Xc, self.output, mask=inverse_mask)

    def shared_param_clone(self, rng=None, Xp=None):
        """
        Build another GenNet that reuses this network's parameters but is
        driven by a different symbolic input.

        The clone receives this network's prior_sigma, params and
        shared_param_dicts, together with the rng and Xp given here. Useful
        for "unrolling" a generate->infer->generate->infer... loop so that
        objectives can be backpropagated through the unrolled steps.
        """
        clone_kwargs = dict(rng=rng, Xp=Xp, \
                prior_sigma=self.prior_sigma, params=self.params, \
                shared_param_dicts=self.shared_param_dicts)
        return GenNet(**clone_kwargs)
class GPSImputer(object):
    """
    Controller for training a multi-step imputater via guided policy search.

    Parameters:
        rng: numpy.random.RandomState (for reproducibility)
        x_in: the initial state for imputation
        x_out: the goal state for imputation
        x_mask: mask for state dims to keep fixed during imputation
        p_zi_given_xi: HydraNet for stochastic part of step (2 outputs)
        p_sip1_given_zi: HydraNet for deterministic part of step (3 outputs)
        q_zi_given_xi: HydraNet for the guide policy (2 outputs)
        params: REQUIRED PARAMS SHOWN BELOW
                x_dim: dimension of inputs to reconstruct
                z_dim: dimension of latent space for policy wobble
                imp_steps: number of reconstruction steps to perform
                step_type: either "add", "jump", "lstm", or "layer"
                x_type: can be "bernoulli" or "gaussian"
    """
    def __init__(self, rng=None,
            x_in=None, x_mask=None, x_out=None, \
            p_zi_given_xi=None, \
            p_sip1_given_zi=None, \
            q_zi_given_xi=None, \
            params=None, \
            shared_param_dicts=None):
        """
        Build the symbolic imputation loop, the costs and parameter updates
        derived from it, and compile the theano functions this object exposes.

        Arguments (all are effectively required despite the None defaults):
            rng: object providing randint(); used only to seed a RandStream
            x_in, x_out, x_mask: symbolic inputs to the computation graph
            p_zi_given_xi, p_sip1_given_zi, q_zi_given_xi: networks that
                provide an apply() method and an mlp_params list
            params: dict with keys 'x_dim', 'z_dim', 'imp_steps',
                'step_type' and 'x_type'
            shared_param_dicts: None to create fresh shared variables, or a
                dict holding 'x_null', 'grad_null', 's0' and 'obs_logvar'
        """
        # setup a rng for this GPSImputer
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_dim = self.params['x_dim']
        self.z_dim = self.params['z_dim']
        self.imp_steps = self.params['imp_steps']
        self.step_type = self.params['step_type']
        self.x_type = self.params['x_type']
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        self.shared_param_dicts = shared_param_dicts

        # grab handles to the relevant networks
        self.p_zi_given_xi = p_zi_given_xi
        self.p_sip1_given_zi = p_sip1_given_zi
        self.q_zi_given_xi = q_zi_given_xi

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this model
        self.x_in = x_in
        self.x_out = x_out
        self.x_mask = x_mask
        # placeholder for the per-step noise; every compiled function below
        # substitutes the output of self._construct_zi_zmuv for it
        self.zi_zmuv = T.tensor3()

        # setup switching variable for changing between sampling/training
        zero_ary = to_fX( np.zeros((1,)) )
        self.train_switch = theano.shared(value=zero_ary, name='msm_train_switch')
        # start with the switch at 1.0, i.e. zi is taken from the guide
        # policy samples (see the definition of zi in imp_step_func)
        self.set_train_switch(1.0)

        if self.shared_param_dicts is None:
            # initialize parameters "owned" by this model
            s0_init = to_fX( np.zeros((self.x_dim,)) )
            init_ary = to_fX( np.zeros((self.x_dim,)) )
            # NOTE(review): init_ary and zero_ary are each passed to several
            # theano.shared calls; this presumably relies on theano.shared
            # copying its value by default -- confirm
            self.x_null = theano.shared(value=init_ary, name='gpis_xn')
            self.grad_null = theano.shared(value=init_ary, name='gpsi_gn')
            self.s0 = theano.shared(value=s0_init, name='gpsi_s0')
            self.obs_logvar = theano.shared(value=zero_ary, name='gpsi_obs_logvar')
            # squash the raw log-variance through a scaled tanh, which keeps
            # the value used by the gaussian likelihood within (-8, 8)
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar[0])
            self.shared_param_dicts = {}
            self.shared_param_dicts['x_null'] = self.x_null
            self.shared_param_dicts['grad_null'] = self.grad_null
            self.shared_param_dicts['s0'] = self.s0
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            # grab the parameters required by this model from a given dict
            self.x_null = self.shared_param_dicts['x_null']
            self.grad_null = self.shared_param_dicts['grad_null']
            self.s0 = self.shared_param_dicts['s0']
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar[0])

        ##################################################
        # Setup the iterative imputation loop using scan #
        ##################################################
        self.ones_mask = T.ones_like(self.x_mask)
        def imp_step_func(zi_zmuv, si):
            # One step of the loop: zi_zmuv is this step's slice of the noise
            # sequence and si is the state carried over from the previous step.
            si_as_x = self._si_as_x(si)
            xi_unmasked = self.x_out
            # keep x_out where the mask is 1, use the current guess elsewhere
            xi_masked = (self.x_mask * xi_unmasked) + \
                        ((1.0 - self.x_mask) * si_as_x)
            # NOTE: grad_unmasked/grad_masked are computed here but are not
            # used by anything else in this step function
            grad_unmasked = self.x_out - si_as_x
            grad_masked = self.x_mask * grad_unmasked
            # get samples of next zi, according to the global policy
            zi_p_mean, zi_p_logvar = self.p_zi_given_xi.apply(xi_masked)
            zi_p = zi_p_mean + (T.exp(0.5 * zi_p_logvar) * zi_zmuv)
            # get samples of next zi, according to the guide policy
            zi_q_mean, zi_q_logvar = self.q_zi_given_xi.apply(
                    T.concatenate([xi_masked, xi_unmasked], axis=1))
            # the same noise zi_zmuv is used for both the p and q samples
            zi_q = zi_q_mean + (T.exp(0.5 * zi_q_logvar) * zi_zmuv)

            # make zi samples that can be switched between zi_p and zi_q
            # (train_switch == 1 selects zi_q, train_switch == 0 selects zi_p)
            zi = ((self.train_switch[0] * zi_q) + \
                 ((1.0 - self.train_switch[0]) * zi_p))
            # compute relevant KLds for this step
            kldi_q2p = gaussian_kld(zi_q_mean, zi_q_logvar,
                                    zi_p_mean, zi_p_logvar) # KL(q || p)
            kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar,
                                    zi_q_mean, zi_q_logvar) # KL(p || q)
            kldi_p2g = gaussian_kld(zi_p_mean, zi_p_logvar,
                                    0.0, 0.0) # KL(p || global prior)

            # compute the next si, given the sampled zi
            hydra_out = self.p_sip1_given_zi.apply(zi)
            si_step = hydra_out[0]
            if (self.step_type == 'jump'):
                # jump steps always completely overwrite the current guesses
                sip1 = si_step
            elif (self.step_type == 'add'):
                # add steps just update the guesses additively
                sip1 = si + si_step
            elif (self.step_type == 'lstm'):
                # LSTM-style updates with write and erase gates
                # (reads the second and third outputs of p_sip1_given_zi)
                write_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[1])
                erase_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[2])
                sip1 = (erase_gate * si) + (write_gate * si_step)
            elif (self.step_type == 'layer'):
                # convex combination of old state and proposed step, gated
                # by the second output of p_sip1_given_zi
                alpha_gate = T.nnet.sigmoid(hydra_out[1])
                sip1 = (alpha_gate * si) + ((1.0 - alpha_gate) * si_step)
            else:
                # NOTE: asserts are stripped under "python -O"; in that case
                # an unknown step type would fail below with sip1 undefined
                assert False, "Unknown step type!"

            # compute NLL for the current imputation
            nlli = self._construct_nll_costs(sip1, self.x_out, self.x_mask)
            return sip1, nlli, kldi_q2p, kldi_p2q, kldi_p2g

        # apply scan op for the sequential imputation loop
        # (s0 is added to a zero array with one row per row of x_in)
        self.s0_full = T.alloc(0.0, self.x_in.shape[0], self.x_dim) + self.s0
        # only the first output (the state) is given an initial value, so it
        # is the only one fed back into the step function
        init_vals = [self.s0_full, None, None, None, None]
        self.scan_results, self.scan_updates = theano.scan(imp_step_func, \
                    outputs_info=init_vals, sequences=self.zi_zmuv)

        # per-step results, in the order returned by imp_step_func
        self.si = self.scan_results[0]
        self.nlli = self.scan_results[1]
        self.kldi_q2p = self.scan_results[2]
        self.kldi_p2q = self.scan_results[3]
        self.kldi_p2g = self.scan_results[4]

        # get the initial imputation state
        self.x0 = (self.x_mask * self.x_in) + \
                  ((1.0 - self.x_mask) * self._si_as_x(self.s0_full))

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX( np.zeros((1,)) )
        self.lr = theano.shared(value=zero_ary, name='gpsi_lr')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='gpsi_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='gpsi_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='gpsi_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_p = theano.shared(value=zero_ary, name='gpsi_lam_kld_p')
        self.lam_kld_q = theano.shared(value=zero_ary, name='gpsi_lam_kld_q')
        self.lam_kld_g = theano.shared(value=zero_ary, name='gpsi_lam_kld_g')
        self.set_lam_kld(lam_kld_p=0.05, lam_kld_q=0.95, lam_kld_g=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # Grab all of the "optimizable" parameters in "group 1"
        # (x_null and grad_null are not included in this list)
        self.joint_params = [self.s0, self.obs_logvar]
        self.joint_params.extend(self.p_zi_given_xi.mlp_params)
        self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
        self.joint_params.extend(self.q_zi_given_xi.mlp_params)

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_p, self.kld_q, self.kld_g = self._construct_kld_costs(p=1.0)
        self.kld_costs = (self.lam_kld_p[0] * self.kld_p) + \
                         (self.lam_kld_q[0] * self.kld_q) + \
                         (self.lam_kld_g[0] * self.kld_g)
        self.kld_cost = T.mean(self.kld_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        # only the NLL of the final imputation step enters the cost
        self.nll_costs = self.nlli[-1]
        self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
        # unweighted bound: final-step NLL plus the summed KL(q || p)
        self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
        self.nll_bound = T.mean(self.nll_bounds)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-TRIAL COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the generator and inferencer networks
        self.joint_updates = get_adam_updates(params=self.joint_params, \
                grads=self.joint_grads, alpha=self.lr, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)
        # fold the updates returned by theano.scan into the training updates
        for k, v in self.scan_updates.items():
            self.joint_updates[k] = v

        # Construct a function for jointly training the generator/inferencer
        print("Compiling cost computer...")
        self.compute_raw_costs = self._construct_raw_costs()
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling best step cost computer...")
        self.compute_per_step_cost = self._construct_compute_per_step_cost()
        print("Compiling data-guided imputer sampler...")
        self.sample_imputer = self._construct_sample_imputer()
        # make easy access points for some interesting parameters
        #self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W
        return

    def _si_as_x(self, si):
        """
        Map a "state" to an "observation" by applying an elementwise sigmoid.
        """
        return T.nnet.sigmoid(si)

    def set_sgd_params(self, lr=0.01, mom_1=0.9, mom_2=0.999):
        """
        Store the learning rate and the two momentum coefficients.

        Each value is written as a one-element array into the matching
        shared variable (self.lr, self.mom_1, self.mom_2).
        """
        base = np.zeros((1,))
        # learning rate
        self.lr.set_value(to_fX(base + lr))
        # first- and second-order "momentum" coefficients
        self.mom_1.set_value(to_fX(base + mom_1))
        self.mom_2.set_value(to_fX(base + mom_2))
        return

    def set_lam_nll(self, lam_nll=1.0):
        """
        Store the weight applied to the data-likelihood term of the cost.

        The value is written as a one-element array into self.lam_nll.
        """
        weight = np.zeros((1,)) + lam_nll
        self.lam_nll.set_value(to_fX(weight))
        return

    def set_lam_kld(self, lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0):
        """
        Store the weights applied to the three KL-divergence cost terms.

        Each value is written as a one-element array into the matching
        shared variable (self.lam_kld_p, self.lam_kld_q, self.lam_kld_g).
        """
        base = np.zeros((1,))
        self.lam_kld_p.set_value(to_fX(base + lam_kld_p))
        self.lam_kld_q.set_value(to_fX(base + lam_kld_q))
        self.lam_kld_g.set_value(to_fX(base + lam_kld_g))
        return

    def set_lam_l2w(self, lam_l2w=1e-3):
        """
        Store the strength of the l2 penalty on the network parameters.

        The value is written as a one-element array into self.lam_l2w.
        """
        weight = np.zeros((1,)) + lam_l2w
        self.lam_l2w.set_value(to_fX(weight))
        return

    def set_train_switch(self, switch_val=0.0):
        """
        Flip the switch between training and sampling behavior.

        switch_val is snapped to 0.0 when it is below 0.5 and to 1.0
        otherwise, then written as a one-element array into
        self.train_switch.
        """
        snapped = 0.0 if (switch_val < 0.5) else 1.0
        self.train_switch.set_value(to_fX(np.zeros((1,)) + snapped))
        return

    def _construct_zi_zmuv(self, xi, br):
        """
        Build the (symbolic) noise needed to run this GPSImputer on the
        (symbolic) input matrix xi.

        The result is avg=0.0 / std=1.0 noise from self.rng.normal with shape
        (self.imp_steps, xi.shape[0] * br, self.z_dim).
        """
        noise_shape = (self.imp_steps, xi.shape[0]*br, self.z_dim)
        return self.rng.normal(size=noise_shape, \
                avg=0.0, std=1.0, dtype=theano.config.floatX)

    def _construct_nll_costs(self, si, xo, xm):
        """
        Build the negative log-likelihood of xo given the state si.

        The state is converted to an observation with self._si_as_x, scored
        against xo with the log-pdf helper matching self.x_type (using the
        inverse mask 1 - xm), and the flattened result is negated.
        """
        x_hat = self._si_as_x(si)
        # the helpers are handed 1 - xm, so xm == 1 entries get mask value 0
        inverse_mask = 1.0 - xm
        if self.x_type != 'bernoulli':
            log_lik = log_prob_gaussian2(xo, x_hat, \
                    log_vars=self.bounded_logvar, mask=inverse_mask)
        else:
            log_lik = log_prob_bernoulli(xo, x_hat, mask=inverse_mask)
        return -log_lik.flatten()

    def _construct_kld_costs(self, p=1.0):
        """
        Build the policy KL-divergence parts of the cost to minimize.

        For each of self.kldi_p2q, self.kldi_q2p and self.kldi_p2g, every
        step's values are raised to the power p, summed over axis 1, and the
        per-step results are added up over self.imp_steps steps. Returns the
        three totals as [kld_pi, kld_qi, kld_gi], in that order.
        """
        def _total_over_steps(step_klds):
            # sum over axis 1 within each step, then accumulate over steps
            return sum([T.sum(step_klds[i]**p, axis=1) \
                        for i in range(self.imp_steps)])
        kld_pi = _total_over_steps(self.kldi_p2q)
        kld_qi = _total_over_steps(self.kldi_q2p)
        kld_gi = _total_over_steps(self.kldi_p2g)
        return [kld_pi, kld_qi, kld_gi]

    def _construct_reg_costs(self):
        """
        Build the basic parameter regularization cost: the sum of squared
        entries over every variable in self.joint_params.
        """
        squared_total = 0
        for param in self.joint_params:
            squared_total = squared_total + T.sum(param**2.0)
        return squared_total

    def _construct_compute_fe_terms(self):
        """
        Construct a function for computing terms in variational free energy.

        Returns:
            fe_term_estimator(XI, XO, XM, sample_count=20,
            use_guide_policy=True): averages sample_count one-sample
            estimates and returns [mean_nll, mean_kld], each with one entry
            per row of XI. When use_guide_policy is False the samples come
            from the model's own policy and mean_kld is returned as zeros.

        The estimator temporarily changes self.train_switch and always
        restores the previous setting, even if the compiled function raises.
        """
        # setup some symbolic variables for theano to deal with
        xi = T.matrix()
        xo = T.matrix()
        xm = T.matrix()
        zizmuv = self._construct_zi_zmuv(xi, 1)
        # construct values to output
        nll = self.nll_costs.flatten()
        kld = self.kld_q.flatten()
        # compile theano function for a one-sample free-energy estimate
        fe_term_sample = theano.function(inputs=[ xi, xo, xm ], \
                outputs=[nll, kld], \
                givens={self.x_in: xi, \
                        self.x_out: xo, \
                        self.x_mask: xm, \
                        self.zi_zmuv: zizmuv}, \
                updates=self.scan_updates, \
                on_unused_input='ignore')
        # construct a wrapper function for multi-sample free-energy estimate
        def fe_term_estimator(XI, XO, XM, sample_count=20, use_guide_policy=True):
            # remember the current mode so it can be restored afterwards
            old_switch = self.train_switch.get_value(borrow=False)
            if use_guide_policy:
                # take samples from guide policies (i.e. variational q)
                self.set_train_switch(switch_val=1.0)
            else:
                # take samples from model's imputation policy
                self.set_train_switch(switch_val=0.0)
            # compute a multi-sample estimate of variational free-energy
            nll_sum = np.zeros((XI.shape[0],))
            kld_sum = np.zeros((XI.shape[0],))
            try:
                for _ in range(sample_count):
                    result = fe_term_sample(XI, XO, XM)
                    nll_sum += result[0].ravel()
                    kld_sum += result[1].ravel()
            finally:
                # set model back to either training or generation mode, even
                # when sampling failed, so the model is not left in the
                # wrong mode for later training steps
                self.set_train_switch(switch_val=old_switch)
            mean_nll = nll_sum / float(sample_count)
            mean_kld = kld_sum / float(sample_count)
            if not use_guide_policy:
                # no KLd if samples are from the primary policy...
                mean_kld = 0.0 * mean_kld
            return [mean_nll, mean_kld]
        return fe_term_estimator

    def _construct_raw_costs(self):
        """
        Construct a function returning the raw costs, i.e. the costs before
        any of the lambda weights are applied.

        Returns:
            raw_cost_computer(XI, XO, XM), which evaluates one sample of the
            imputation loop and returns
            [step_nlls, step_klds, kld_q2p, kld_p2q, kld_p2g].
        """
        # symbolic stand-ins for the three data inputs and the step noise
        x_in_sym = T.matrix()
        x_out_sym = T.matrix()
        x_mask_sym = T.matrix()
        noise_sym = self._construct_zi_zmuv(x_in_sym, 1)
        # compile a function producing all of the per-step costs
        substitutions = { self.x_in: x_in_sym, \
                          self.x_out: x_out_sym, \
                          self.x_mask: x_mask_sym, \
                          self.zi_zmuv: noise_sym }
        cost_func = theano.function( \
                    inputs=[x_in_sym, x_out_sym, x_mask_sym], \
                    outputs=[self.nlli, self.kldi_q2p, \
                             self.kldi_p2q, self.kldi_p2g], \
                    givens=substitutions, \
                    updates=self.scan_updates, \
                    on_unused_input='ignore')

        def _mean_then_step_sum(kld):
            # average over axis 1 (dims kept), then add up over axis 0
            return np.sum(np.mean(kld, axis=1, keepdims=True), axis=0)

        def raw_cost_computer(XI, XO, XM):
            nll_steps, q2p_steps, p2q_steps, p2g_steps = \
                    cost_func(to_fX(XI), to_fX(XO), to_fX(XM))
            total_q2p = _mean_then_step_sum(q2p_steps)
            total_p2q = _mean_then_step_sum(p2q_steps)
            total_p2g = _mean_then_step_sum(p2g_steps)
            # per-step KL(q || p): sum over axis 2, then average over axis 1
            step_klds = np.mean(np.sum(q2p_steps, axis=2, keepdims=True), axis=1)
            step_klds = to_fX( np.asarray(list(step_klds)) )
            # per-step NLL, averaged over axis 1
            step_nlls = np.mean(nll_steps, axis=1)
            step_nlls = to_fX( np.asarray(list(step_nlls)) )
            return [step_nlls, step_klds, total_q2p, total_p2q, total_p2g]
        return raw_cost_computer

    def _construct_compute_per_step_cost(self):
        """
        Construct a function for computing the cost reached after each step
        of the sequential imputation.

        Returns:
            best_cost_computer(XI, XO, XM, sample_count=20), which averages
            sample_count evaluations and returns
            [mean_step_nll, mean_step_kld], each with self.imp_steps entries.
            The KL term is the cumulative sum of KL(q || p) over steps.
        """
        # symbolic stand-ins for the three data inputs and the step noise
        x_in_sym = T.matrix()
        x_out_sym = T.matrix()
        x_mask_sym = T.matrix()
        noise_sym = self._construct_zi_zmuv(x_in_sym, 1)
        # step-wise NLL, averaged over axis 1
        nll_per_step = T.mean(self.nlli, axis=1).flatten()
        # step-wise KL: sum over axis 2, accumulate over steps, then average
        kld_each_step = T.sum(self.kldi_q2p, axis=2)
        kld_running = T.extra_ops.cumsum(kld_each_step, axis=0)
        kld_per_step = T.mean(kld_running, axis=1).flatten()
        # compile the function evaluating both step-wise costs
        substitutions = { self.x_in: x_in_sym, \
                          self.x_out: x_out_sym, \
                          self.x_mask: x_mask_sym, \
                          self.zi_zmuv: noise_sym }
        step_cost_func = theano.function( \
                    inputs=[x_in_sym, x_out_sym, x_mask_sym], \
                    outputs=[nll_per_step, kld_per_step], \
                    givens=substitutions, \
                    updates=self.scan_updates, \
                    on_unused_input='ignore')
        def best_cost_computer(XI, XO, XM, sample_count=20):
            # accumulate the step-wise costs over several samples
            nll_total = np.zeros((self.imp_steps,))
            kld_total = np.zeros((self.imp_steps,))
            for _ in range(sample_count):
                step_nll, step_kld = step_cost_func(XI, XO, XM)
                nll_total += step_nll.ravel()
                kld_total += step_kld.ravel()
            n_samples = float(sample_count)
            return [nll_total / n_samples, kld_total / n_samples]
        return best_cost_computer

    def _construct_train_joint(self):
        """
        Compile the theano function that trains all networks jointly.

        The returned function takes (xi, xo, xm, br). Each of the three
        matrices is repeated br times along axis 0 before being fed to the
        graph, and self.joint_updates is applied on every call. Its outputs
        are [joint_cost, nll_bound, nll_cost, kld_cost, reg_cost, obs_costs].
        """
        # symbolic stand-ins for the data inputs and the repeat count
        x_in_sym = T.matrix()
        x_out_sym = T.matrix()
        x_mask_sym = T.matrix()
        repeats = T.lscalar()
        noise_sym = self._construct_zi_zmuv(x_in_sym, repeats)
        # values reported back to the caller after each update
        reported = [self.joint_cost, self.nll_bound, self.nll_cost, \
                    self.kld_cost, self.reg_cost, self.obs_costs]
        # replace the graph inputs with the (repeated) function inputs
        substitutions = { self.x_in: x_in_sym.repeat(repeats, axis=0), \
                          self.x_out: x_out_sym.repeat(repeats, axis=0), \
                          self.x_mask: x_mask_sym.repeat(repeats, axis=0), \
                          self.zi_zmuv: noise_sym }
        return theano.function( \
                inputs=[ x_in_sym, x_out_sym, x_mask_sym, repeats ], \
                outputs=reported, \
                givens=substitutions, \
                updates=self.joint_updates, \
                on_unused_input='ignore')

    def _construct_sample_imputer(self):
        """
        Construct a function for drawing samples from the distribution
        generated by running this imputer.

        Returns:
            imputer_sampler(XI, XO, XM, use_guide_policy=False), which
            returns (model_samps, masked_samps). model_samps holds the
            initial imputation self.x0 followed by the observation form of
            the state after each of the self.imp_steps steps. masked_samps
            holds the same samples with XI copied in wherever XM is 1.

        The sampler temporarily changes self.train_switch and always restores
        the previous setting, even if the compiled function raises.
        """
        xi = T.matrix()
        xo = T.matrix()
        xm = T.matrix()
        zizmuv = self._construct_zi_zmuv(xi, 1)
        oputs = [self.x0] + [self._si_as_x(self.si[i]) for i in range(self.imp_steps)]
        sample_func = theano.function(inputs=[xi, xo, xm], outputs=oputs, \
                givens={self.x_in: xi, \
                        self.x_out: xo, \
                        self.x_mask: xm, \
                        self.zi_zmuv: zizmuv}, \
                updates=self.scan_updates, \
                on_unused_input='ignore')
        def imputer_sampler(XI, XO, XM, use_guide_policy=False):
            XI = to_fX( XI )
            XO = to_fX( XO )
            XM = to_fX( XM )
            # remember the current mode so it can be restored afterwards
            old_switch = self.train_switch.get_value(borrow=False)
            if use_guide_policy:
                # take samples from guide policies (i.e. variational q)
                self.set_train_switch(switch_val=1.0)
            else:
                # take samples from model's imputation policy
                self.set_train_switch(switch_val=0.0)
            try:
                # draw guided/unguided conditional samples
                model_samps = sample_func(XI, XO, XM)
            finally:
                # set model back to either training or generation mode, even
                # when sampling failed, so the model is not left in the
                # wrong mode for later training steps
                self.set_train_switch(switch_val=old_switch)
            # reverse engineer the "masked" samples...
            masked_samps = [(XM * XI) + ((1.0 - XM) * xs) for xs in model_samps]
            return model_samps, masked_samps
        return imputer_sampler

    def save_to_file(self, f_name=None):
        """
        Dump important stuff to a Python pickle, so that we can reload this
        model later.

        Three objects are pickled to f_name, in this order:
            1. self.params (the dict of "simple" python values)
            2. a copy of self.shared_param_dicts with the current numpy
               values in place of the theano shared variables
            3. a dict with the save_to_dict() result of each child network,
               keyed 'p_zi_given_xi', 'p_sip1_given_zi', 'q_zi_given_xi'

        Everything to be saved is gathered before the file is opened, so a
        failure while collecting values does not truncate an existing file.
        The file is opened with open() (the file() builtin does not exist in
        Python 3) inside a with-block so the handle is closed on any error.
        """
        assert(not (f_name is None))
        # make a copy of self.shared_param_dicts, with numpy arrays in place
        # of the theano shared variables
        numpy_param_dicts = {}
        for key in self.shared_param_dicts:
            numpy_ary = self.shared_param_dicts[key].get_value(borrow=False)
            numpy_param_dicts[key] = numpy_ary
        # get numpy dicts for each of the "child" models that we must save
        child_model_dicts = {}
        child_model_dicts['p_zi_given_xi'] = self.p_zi_given_xi.save_to_dict()
        child_model_dicts['p_sip1_given_zi'] = self.p_sip1_given_zi.save_to_dict()
        child_model_dicts['q_zi_given_xi'] = self.q_zi_given_xi.save_to_dict()
        with open(f_name, 'wb') as f_handle:
            # dump the dict self.params, which just holds "simple" python values
            cPickle.dump(self.params, f_handle, protocol=-1)
            # dump the numpy version of self.shared_param_dicts to pickle file
            cPickle.dump(numpy_param_dicts, f_handle, protocol=-1)
            # dump the numpy child model dicts to the pickle file
            cPickle.dump(child_model_dicts, f_handle, protocol=-1)
        return
Esempio n. 15
0
class GPSImputer(object):
    """
    Controller for training a multi-step imputer via guided policy search.

    Parameters:
        rng: numpy.random.RandomState (for reproducibility)
        x_in: the initial state for imputation
        x_out: the goal state for imputation
        x_mask: mask for state dims to keep fixed during imputation
        p_zi_given_xi: HydraNet for stochastic part of step (2 outputs)
        p_sip1_given_zi: HydraNet for deterministic part of step (3 outputs)
        q_zi_given_xi: HydraNet for the guide policy (2 outputs)
        params: REQUIRED PARAMS SHOWN BELOW
                x_dim: dimension of inputs to reconstruct
                z_dim: dimension of latent space for policy wobble
                imp_steps: number of reconstruction steps to perform
                step_type: either "add", "jump", "lstm", or "layer"
                x_type: can be "bernoulli" or "gaussian"
    """
    def __init__(self, rng=None,
            x_in=None, x_mask=None, x_out=None, \
            p_zi_given_xi=None, \
            p_sip1_given_zi=None, \
            q_zi_given_xi=None, \
            params=None, \
            shared_param_dicts=None):
        """
        Build the symbolic imputation graph, its costs, gradients and
        updates, and compile the theano functions exposed by this object.

        All arguments default to None, but rng and params are dereferenced
        immediately, and x_in, x_out, x_mask and the three networks are used
        while building the graph, so in practice they must all be supplied.
        Only shared_param_dicts is optional: when it is None, fresh shared
        parameters are created; otherwise they are read from the given dict
        (keys 'x_null', 'grad_null', 's0', 'obs_logvar').
        """
        # setup a rng for this GPSImputer (seeded from the given numpy rng)
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_dim = self.params['x_dim']
        self.z_dim = self.params['z_dim']
        self.imp_steps = self.params['imp_steps']
        self.step_type = self.params['step_type']
        self.x_type = self.params['x_type']
        assert ((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        self.shared_param_dicts = shared_param_dicts

        # grab handles to the relevant InfNets
        self.p_zi_given_xi = p_zi_given_xi
        self.p_sip1_given_zi = p_sip1_given_zi
        self.q_zi_given_xi = q_zi_given_xi

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this GPSImputer
        self.x_in = x_in
        self.x_out = x_out
        self.x_mask = x_mask
        # placeholder for the per-step noise; scan iterates over its leading
        # axis, and the compiled functions substitute samples produced by
        # self._construct_zi_zmuv for it via "givens"
        self.zi_zmuv = T.tensor3()

        # setup switching variable for changing between sampling/training
        # (1.0 selects zi from the guide policy q, 0.0 from the policy p)
        zero_ary = to_fX(np.zeros((1, )))
        self.train_switch = theano.shared(value=zero_ary,
                                          name='msm_train_switch')
        self.set_train_switch(1.0)

        if self.shared_param_dicts is None:
            # initialize parameters "owned" by this model
            s0_init = to_fX(np.zeros((self.x_dim, )))
            init_ary = to_fX(np.zeros((self.x_dim, )))
            # NOTE(review): x_null and grad_null are created and registered
            # here, but nothing else in this method reads them.
            self.x_null = theano.shared(value=init_ary, name='gpis_xn')
            self.grad_null = theano.shared(value=init_ary, name='gpsi_gn')
            self.s0 = theano.shared(value=s0_init, name='gpsi_s0')
            self.obs_logvar = theano.shared(value=zero_ary,
                                            name='gpsi_obs_logvar')
            # squash the log-variance smoothly into the range (-8, 8)
            self.bounded_logvar = 8.0 * T.tanh(
                (1.0 / 8.0) * self.obs_logvar[0])
            self.shared_param_dicts = {}
            self.shared_param_dicts['x_null'] = self.x_null
            self.shared_param_dicts['grad_null'] = self.grad_null
            self.shared_param_dicts['s0'] = self.s0
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            # grab the parameters required by this model from a given dict
            self.x_null = self.shared_param_dicts['x_null']
            self.grad_null = self.shared_param_dicts['grad_null']
            self.s0 = self.shared_param_dicts['s0']
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
            # squash the log-variance smoothly into the range (-8, 8)
            self.bounded_logvar = 8.0 * T.tanh(
                (1.0 / 8.0) * self.obs_logvar[0])

        ##################################################
        # Setup the iterative imputation loop using scan #
        ##################################################
        self.ones_mask = T.ones_like(self.x_mask)

        def imp_step_func(zi_zmuv, si):
            """
            One imputation step: given this step's noise slice and the
            current state si, return the next state, the NLL of the next
            state, and the three per-step KL divergences.
            """
            si_as_x = self._si_as_x(si)
            xi_unmasked = self.x_out
            # keep x_out where the mask is 1, use the current guess elsewhere
            xi_masked = (self.x_mask * xi_unmasked) + \
                        ((1.0 - self.x_mask) * si_as_x)
            # NOTE(review): grad_unmasked / grad_masked are computed but not
            # used anywhere below in this step function.
            grad_unmasked = self.x_out - si_as_x
            grad_masked = self.x_mask * grad_unmasked
            # get samples of next zi, according to the global policy
            zi_p_mean, zi_p_logvar = self.p_zi_given_xi.apply(xi_masked)
            zi_p = zi_p_mean + (T.exp(0.5 * zi_p_logvar) * zi_zmuv)
            # get samples of next zi, according to the guide policy
            # (the guide sees both the masked and the unmasked observation)
            zi_q_mean, zi_q_logvar = self.q_zi_given_xi.apply(
                T.concatenate([xi_masked, xi_unmasked], axis=1))
            # both policies are reparametrized with the same noise zi_zmuv
            zi_q = zi_q_mean + (T.exp(0.5 * zi_q_logvar) * zi_zmuv)

            # make zi samples that can be switched between zi_p and zi_q
            zi = ((self.train_switch[0] * zi_q) + \
                 ((1.0 - self.train_switch[0]) * zi_p))
            # compute relevant KLds for this step
            kldi_q2p = gaussian_kld(zi_q_mean, zi_q_logvar, zi_p_mean,
                                    zi_p_logvar)  # KL(q || p)
            kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar, zi_q_mean,
                                    zi_q_logvar)  # KL(p || q)
            kldi_p2g = gaussian_kld(zi_p_mean, zi_p_logvar, 0.0,
                                    0.0)  # KL(p || global prior)

            # compute the next si, given the sampled zi
            hydra_out = self.p_sip1_given_zi.apply(zi)
            si_step = hydra_out[0]
            if (self.step_type == 'jump'):
                # jump steps always completely overwrite the current guesses
                sip1 = si_step
            elif (self.step_type == 'add'):
                # add steps just update the guesses additively
                sip1 = si + si_step
            elif (self.step_type == 'lstm'):
                # LSTM-style updates with write and erase gates
                # (gates use hydra outputs 1 and 2 and range over (0, 1.1))
                write_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[1])
                erase_gate = 1.1 * T.nnet.sigmoid(1.0 + hydra_out[2])
                sip1 = (erase_gate * si) + (write_gate * si_step)
            elif (self.step_type == 'layer'):
                # convex combination of old state and proposed step
                alpha_gate = T.nnet.sigmoid(hydra_out[1])
                sip1 = (alpha_gate * si) + ((1.0 - alpha_gate) * si_step)
            else:
                assert False, "Unknown step type!"

            # compute NLL for the current imputation
            nlli = self._construct_nll_costs(sip1, self.x_out, self.x_mask)
            return sip1, nlli, kldi_q2p, kldi_p2q, kldi_p2g

        # apply scan op for the sequential imputation loop
        # (s0 is broadcast to one row per row of x_in; only the first output
        # of the step function, the state, is fed back as a recurrence)
        self.s0_full = T.alloc(0.0, self.x_in.shape[0], self.x_dim) + self.s0
        init_vals = [self.s0_full, None, None, None, None]
        self.scan_results, self.scan_updates = theano.scan(imp_step_func, \
                    outputs_info=init_vals, sequences=self.zi_zmuv)

        # per-step results, in the order returned by imp_step_func
        self.si = self.scan_results[0]
        self.nlli = self.scan_results[1]
        self.kldi_q2p = self.scan_results[2]
        self.kldi_p2q = self.scan_results[3]
        self.kldi_p2g = self.scan_results[4]

        # get the initial imputation state
        self.x0 = (self.x_mask * self.x_in) + \
                  ((1.0 - self.x_mask) * self._si_as_x(self.s0_full))

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX(np.zeros((1, )))
        self.lr = theano.shared(value=zero_ary, name='gpsi_lr')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='gpsi_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='gpsi_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='gpsi_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_p = theano.shared(value=zero_ary, name='gpsi_lam_kld_p')
        self.lam_kld_q = theano.shared(value=zero_ary, name='gpsi_lam_kld_q')
        self.lam_kld_g = theano.shared(value=zero_ary, name='gpsi_lam_kld_g')
        self.set_lam_kld(lam_kld_p=0.05, lam_kld_q=0.95, lam_kld_g=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # Grab all of the "optimizable" parameters in "group 1"
        # (x_null and grad_null are deliberately or accidentally absent
        # from this list -- NOTE(review): confirm which)
        self.joint_params = [self.s0, self.obs_logvar]
        self.joint_params.extend(self.p_zi_given_xi.mlp_params)
        self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
        self.joint_params.extend(self.q_zi_given_xi.mlp_params)

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        # kld_p sums KL(p || q), kld_q sums KL(q || p), and kld_g sums
        # KL(p || global prior) over all imputation steps
        self.kld_p, self.kld_q, self.kld_g = self._construct_kld_costs(p=1.0)
        self.kld_costs = (self.lam_kld_p[0] * self.kld_p) + \
                         (self.lam_kld_q[0] * self.kld_q) + \
                         (self.lam_kld_g[0] * self.kld_g)
        self.kld_cost = T.mean(self.kld_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        # only the NLL after the final imputation step enters the cost
        self.nll_costs = self.nlli[-1]
        self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
        # unweighted bound: final-step NLL plus summed KL(q || p)
        self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
        self.nll_bound = T.mean(self.nll_bounds)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-TRIAL COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the generator and inferencer networks
        self.joint_updates = get_adam_updates(params=self.joint_params, \
                grads=self.joint_grads, alpha=self.lr, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)
        # fold in the updates produced by scan so that they are applied
        # together with the parameter updates
        for k, v in self.scan_updates.items():
            self.joint_updates[k] = v

        # Construct a function for jointly training the generator/inferencer
        print("Compiling cost computer...")
        self.compute_raw_costs = self._construct_raw_costs()
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling best step cost computer...")
        self.compute_per_step_cost = self._construct_compute_per_step_cost()
        print("Compiling data-guided imputer sampler...")
        self.sample_imputer = self._construct_sample_imputer()
        # make easy access points for some interesting parameters
        #self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W
        return

    def _si_as_x(self, si):
        """
        Map a "state" tensor into "observation" space.

        The mapping is an elementwise logistic sigmoid.
        """
        return T.nnet.sigmoid(si)

    def set_sgd_params(self, lr=0.01, mom_1=0.9, mom_2=0.999):
        """
        Update the shared variables holding the learning rate and the first
        and second order "momentum" coefficients used by all updates.
        """
        base = np.zeros((1, ))
        # each value is stored as a length-1 array passed through to_fX
        self.lr.set_value(to_fX(base + lr))
        self.mom_1.set_value(to_fX(base + mom_1))
        self.mom_2.set_value(to_fX(base + mom_2))
        return

    def set_lam_nll(self, lam_nll=1.0):
        """
        Update the shared weight that scales the data likelihood (NLL) term.
        """
        weight = np.zeros((1, )) + lam_nll
        self.lam_nll.set_value(to_fX(weight))
        return

    def set_lam_kld(self, lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0):
        """
        Update the three shared weights placed on the KL-divergence terms
        (lam_kld_p, lam_kld_q and lam_kld_g) relative to data likelihood.
        """
        base = np.zeros((1, ))
        # each weight is stored as a length-1 array passed through to_fX
        self.lam_kld_p.set_value(to_fX(base + lam_kld_p))
        self.lam_kld_q.set_value(to_fX(base + lam_kld_q))
        self.lam_kld_g.set_value(to_fX(base + lam_kld_g))
        return

    def set_lam_l2w(self, lam_l2w=1e-3):
        """
        Update the shared weight that scales l2 regularization on the
        network params.
        """
        weight = np.zeros((1, )) + lam_l2w
        self.lam_l2w.set_value(to_fX(weight))
        return

    def set_train_switch(self, switch_val=0.0):
        """
        Flip the switch that selects between training and sampling behavior.

        The given value is snapped to 0.0 when it is below 0.5 and to 1.0
        otherwise, then written into the shared train_switch variable.
        """
        snapped = 0.0 if (switch_val < 0.5) else 1.0
        self.train_switch.set_value(to_fX(np.zeros((1, )) + snapped))
        return

    def _construct_zi_zmuv(self, xi, br):
        """
        Build the (symbolic) zero-mean, unit-variance normal samples needed
        for computing through this GPSImputer for (symbolic) matrix xi.

        The samples have shape (imp_steps, xi.shape[0] * br, z_dim).
        """
        noise_shape = (self.imp_steps, xi.shape[0]*br, self.z_dim)
        return self.rng.normal(size=noise_shape, avg=0.0, std=1.0,
                               dtype=theano.config.floatX)

    def _construct_nll_costs(self, si, xo, xm):
        """
        Build the negative log-likelihood part of free energy for state si.

        The state is first converted to observation space, then scored
        against xo with a bernoulli or gaussian log-probability (chosen by
        self.x_type). The result is negated and flattened.
        """
        x_hat = self._si_as_x(si)
        # nll is measured only where the inverted mask is 1
        free_mask = 1.0 - xm
        if self.x_type == 'bernoulli':
            log_lik = log_prob_bernoulli(xo, x_hat, mask=free_mask)
        else:
            log_lik = log_prob_gaussian2(xo, x_hat,
                                         log_vars=self.bounded_logvar,
                                         mask=free_mask)
        return -log_lik.flatten()

    def _construct_kld_costs(self, p=1.0):
        """
        Build the policy KL-divergence parts of the cost to minimize.

        For each of kldi_p2q, kldi_q2p and kldi_p2g (in that order), every
        step's values are raised to the power p, summed over axis 1, and the
        per-step results are added up across all imp_steps steps.
        """
        def _sum_over_steps(step_klds):
            # builtin sum() starts from 0, exactly like summing a list
            return sum([T.sum(step_klds[i]**p, axis=1)
                        for i in range(self.imp_steps)])

        return [_sum_over_steps(self.kldi_p2q),
                _sum_over_steps(self.kldi_q2p),
                _sum_over_steps(self.kldi_p2g)]

    def _construct_reg_costs(self):
        """
        Build the basic l2 regularization cost: the sum of squared entries
        over every parameter in self.joint_params.
        """
        squared_norms = [T.sum(param**2.0) for param in self.joint_params]
        return sum(squared_norms)

    def _construct_compute_fe_terms(self):
        """
        Build and return a function that estimates the terms of the
        variational free energy (NLL and KLd) by averaging several samples.
        """
        # symbolic stand-ins for the model inputs
        x_in_sym = T.matrix()
        x_out_sym = T.matrix()
        x_mask_sym = T.matrix()
        noise = self._construct_zi_zmuv(x_in_sym, 1)
        substitutions = {self.x_in: x_in_sym,
                         self.x_out: x_out_sym,
                         self.x_mask: x_mask_sym,
                         self.zi_zmuv: noise}
        # compiled function giving a one-sample free-energy estimate
        fe_term_sample = theano.function(
            inputs=[x_in_sym, x_out_sym, x_mask_sym],
            outputs=[self.nll_costs.flatten(), self.kld_q.flatten()],
            givens=substitutions,
            updates=self.scan_updates,
            on_unused_input='ignore')

        def fe_term_estimator(XI, XO, XM, sample_count=20,
                              use_guide_policy=True):
            # remember the current mode, then switch to the requested one:
            # 1.0 samples from the guide policy (i.e. variational q),
            # 0.0 samples from the model's own imputation policy
            saved_switch = self.train_switch.get_value(borrow=False)
            self.set_train_switch(
                switch_val=(1.0 if use_guide_policy else 0.0))
            # accumulate per-example terms over the requested sample count
            n_obs = XI.shape[0]
            nll_total = np.zeros((n_obs, ))
            kld_total = np.zeros((n_obs, ))
            for _ in range(sample_count):
                sample_terms = fe_term_sample(XI, XO, XM)
                nll_total += sample_terms[0].ravel()
                kld_total += sample_terms[1].ravel()
            mean_nll = nll_total / float(sample_count)
            mean_kld = kld_total / float(sample_count)
            # put the model back into whichever mode it was in before
            self.set_train_switch(switch_val=saved_switch)
            if not use_guide_policy:
                # no KLd if samples are from the primary policy...
                mean_kld = 0.0 * mean_kld
            return [mean_nll, mean_kld]

        return fe_term_estimator

    def _construct_raw_costs(self):
        """
        Build and return a function computing all the raw costs, i.e. the
        per-step NLLs and KLds not weighted by any lambdas.
        """
        # symbolic stand-ins for the model inputs
        x_in_sym = T.matrix()
        x_out_sym = T.matrix()
        x_mask_sym = T.matrix()
        noise = self._construct_zi_zmuv(x_in_sym, 1)
        substitutions = {self.x_in: x_in_sym,
                         self.x_out: x_out_sym,
                         self.x_mask: x_mask_sym,
                         self.zi_zmuv: noise}
        # compiled function returning the per-step costs
        cost_func = theano.function(
            inputs=[x_in_sym, x_out_sym, x_mask_sym],
            outputs=[self.nlli, self.kldi_q2p, self.kldi_p2q, self.kldi_p2g],
            givens=substitutions,
            updates=self.scan_updates,
            on_unused_input='ignore')

        def _mean_then_total(step_klds):
            # average over axis 1 (kept as a singleton), then sum over axis 0
            return np.sum(np.mean(step_klds, axis=1, keepdims=True), axis=0)

        def _restack(per_step):
            # rebuild an array from its leading-axis slices and pass to to_fX
            return to_fX(np.asarray(list(per_step)))

        def raw_cost_computer(XI, XO, XM):
            step_costs = cost_func(to_fX(XI), to_fX(XO), to_fX(XM))
            nlls, q2p, p2q, p2g = (step_costs[0], step_costs[1],
                                   step_costs[2], step_costs[3])
            total_q2p = _mean_then_total(q2p)
            total_p2q = _mean_then_total(p2q)
            total_p2g = _mean_then_total(p2g)
            # per-step KL(q || p): sum over axis 2, then average over axis 1
            step_klds = _restack(
                np.mean(np.sum(q2p, axis=2, keepdims=True), axis=1))
            # per-step NLL: average over axis 1
            step_nlls = _restack(np.mean(nlls, axis=1))
            return [step_nlls, step_klds, total_q2p, total_p2q, total_p2g]

        return raw_cost_computer

    def _construct_compute_per_step_cost(self):
        """
        Build and return a function that estimates, for every imputation
        step, the mean NLL and the mean cumulative KL(q || p) up to that
        step, averaged over several samples.
        """
        # symbolic stand-ins for the model inputs
        x_in_sym = T.matrix()
        x_out_sym = T.matrix()
        x_mask_sym = T.matrix()
        noise = self._construct_zi_zmuv(x_in_sym, 1)
        # step-wise NLL, averaged over axis 1
        nll_per_step = T.mean(self.nlli, axis=1).flatten()
        # step-wise KLd: sum over axis 2, accumulate over steps (axis 0),
        # then average over axis 1
        kld_per_step = T.extra_ops.cumsum(T.sum(self.kldi_q2p, axis=2),
                                          axis=0)
        kld_per_step = T.mean(kld_per_step, axis=1).flatten()
        # compiled function returning the step-wise costs
        step_cost_func = theano.function(
            inputs=[x_in_sym, x_out_sym, x_mask_sym],
            outputs=[nll_per_step, kld_per_step],
            givens={self.x_in: x_in_sym,
                    self.x_out: x_out_sym,
                    self.x_mask: x_mask_sym,
                    self.zi_zmuv: noise},
            updates=self.scan_updates,
            on_unused_input='ignore')

        def best_cost_computer(XI, XO, XM, sample_count=20):
            # accumulate one entry per imputation step over all samples
            nll_total = np.zeros((self.imp_steps, ))
            kld_total = np.zeros((self.imp_steps, ))
            for _ in range(sample_count):
                step_costs = step_cost_func(XI, XO, XM)
                nll_total += step_costs[0].ravel()
                kld_total += step_costs[1].ravel()
            n_samples = float(sample_count)
            return [nll_total / n_samples, kld_total / n_samples]

        return best_cost_computer

    def _construct_train_joint(self):
        """
        Build the theano function that trains all networks jointly.

        The compiled function takes (xi, xo, xm, br), repeats each input
        row br times along axis 0, applies self.joint_updates, and returns
        [joint_cost, nll_bound, nll_cost, kld_cost, reg_cost, obs_costs].
        """
        # symbolic stand-ins for the model inputs and the repeat count
        x_in_sym = T.matrix()
        x_out_sym = T.matrix()
        x_mask_sym = T.matrix()
        repeats = T.lscalar()
        noise = self._construct_zi_zmuv(x_in_sym, repeats)
        # values reported back after each training call
        reported = [self.joint_cost, self.nll_bound, self.nll_cost,
                    self.kld_cost, self.reg_cost, self.obs_costs]
        substitutions = {
            self.x_in: x_in_sym.repeat(repeats, axis=0),
            self.x_out: x_out_sym.repeat(repeats, axis=0),
            self.x_mask: x_mask_sym.repeat(repeats, axis=0),
            self.zi_zmuv: noise}
        return theano.function(
            inputs=[x_in_sym, x_out_sym, x_mask_sym, repeats],
            outputs=reported,
            givens=substitutions,
            updates=self.joint_updates,
            on_unused_input='ignore')

    def _construct_sample_imputer(self):
        """
        Build and return a function that draws samples from the distribution
        generated by running this imputer.

        The returned function yields two lists: the raw model outputs (the
        initial state x0 followed by one observation-space state per step)
        and the same outputs with the known entries of XI copied back in.
        """
        x_in_sym = T.matrix()
        x_out_sym = T.matrix()
        x_mask_sym = T.matrix()
        noise = self._construct_zi_zmuv(x_in_sym, 1)
        # initial imputation state, then the state after each step
        trajectory = [self.x0]
        trajectory.extend(
            [self._si_as_x(self.si[i]) for i in range(self.imp_steps)])
        sample_func = theano.function(
            inputs=[x_in_sym, x_out_sym, x_mask_sym],
            outputs=trajectory,
            givens={self.x_in: x_in_sym,
                    self.x_out: x_out_sym,
                    self.x_mask: x_mask_sym,
                    self.zi_zmuv: noise},
            updates=self.scan_updates,
            on_unused_input='ignore')

        def imputer_sampler(XI, XO, XM, use_guide_policy=False):
            XI, XO, XM = to_fX(XI), to_fX(XO), to_fX(XM)
            # remember the current mode, then switch to the requested one:
            # 1.0 samples from the guide policy (i.e. variational q),
            # 0.0 samples from the model's own imputation policy
            saved_switch = self.train_switch.get_value(borrow=False)
            self.set_train_switch(
                switch_val=(1.0 if use_guide_policy else 0.0))
            # draw guided/unguided conditional samples
            model_samps = sample_func(XI, XO, XM)
            # put the model back into whichever mode it was in before
            self.set_train_switch(switch_val=saved_switch)
            # reverse engineer the "masked" samples: keep XI where XM is 1
            masked_samps = [(XM * XI) + ((1.0 - XM) * xs)
                            for xs in model_samps]
            return model_samps, masked_samps

        return imputer_sampler

    def save_to_file(self, f_name=None):
        """
        Dump important stuff to a Python pickle, so that we can reload this
        model later.
        """
        assert (not (f_name is None))
        f_handle = file(f_name, 'wb')
        # dump the dict self.params, which just holds "simple" python values
        cPickle.dump(self.params, f_handle, protocol=-1)
        # make a copy of self.shared_param_dicts, with numpy arrays in place
        # of the theano shared variables
        numpy_param_dicts = {}
        for key in self.shared_param_dicts:
            numpy_ary = self.shared_param_dicts[key].get_value(borrow=False)
            numpy_param_dicts[key] = numpy_ary
        # dump the numpy version of self.shared_param_dicts to pickle file
        cPickle.dump(numpy_param_dicts, f_handle, protocol=-1)
        # get numpy dicts for each of the "child" models that we must save
        child_model_dicts = {}
        child_model_dicts['p_zi_given_xi'] = self.p_zi_given_xi.save_to_dict()
        child_model_dicts[
            'p_sip1_given_zi'] = self.p_sip1_given_zi.save_to_dict()
        child_model_dicts['q_zi_given_xi'] = self.q_zi_given_xi.save_to_dict()
        # dump the numpy child model dicts to the pickle file
        cPickle.dump(child_model_dicts, f_handle, protocol=-1)
        f_handle.close()
        return
Esempio n. 16
0
class SimpleInfNet(object):
    """
    A single linear layer that maps an input to the mean and log-variance
    of a Gaussian, and can draw reparametrized samples from it.

    Parameters:
        rng: object with a randint method, used to seed the sample stream
        in_dim: dimension of the input
        out_dim: dimension of the mean / log-variance outputs
        W_mean, b_mean: optional existing weights/bias for the mean
        W_logvar, b_logvar: optional existing weights/bias for the logvar
        name: prefix used when naming freshly created shared variables
        W_scale: scale of the random initial weights (see notes in __init__)
    """
    def __init__(self, rng, in_dim, out_dim,
                 W_mean=None, b_mean=None,
                 W_logvar=None, b_logvar=None,
                 name="", W_scale=1.0):
        # setup a shared random generator for this network
        self.rng = RandStream(rng.randint(1000000))

        # set some basic layer properties
        self.in_dim = in_dim
        self.out_dim = out_dim

        def _random_weights(scale, var_name):
            # scaled standard-normal weights of shape (in_dim, out_dim)
            draws = scale * npr.normal(0.0, 1.0, (self.in_dim, self.out_dim))
            return theano.shared(value=draws.astype(theano.config.floatX),
                                 name=var_name)

        def _zero_bias(var_name):
            # all-zero bias of shape (out_dim,)
            zeros = np.zeros((self.out_dim,), dtype=theano.config.floatX)
            return theano.shared(value=zeros, name=var_name)

        # weights and bias for the mean estimate
        if W_mean is None:
            # scales above 0.1 are divided by sqrt(in_dim) before use
            if W_scale > 0.1:
                W_scale = W_scale * (1.0 / np.sqrt(self.in_dim))
            W_mean = _random_weights(W_scale, "{0:s}_W_mean".format(name))
        if b_mean is None:
            b_mean = _zero_bias("{0:s}_b_mean".format(name))
        self.W_mean = W_mean
        self.b_mean = b_mean

        # weights and bias for the log-variance estimate
        if W_logvar is None:
            # NOTE(review): W_scale is rescaled again here, on top of any
            # rescale applied for W_mean above -- confirm this compounding
            # is intended.
            W_scale = W_scale * (1.0 / np.sqrt(self.in_dim))
            W_logvar = _random_weights(W_scale,
                                       "{0:s}_W_logvar".format(name))
        if b_logvar is None:
            b_logvar = _zero_bias("{0:s}_b_logvar".format(name))
        self.W_logvar = W_logvar
        self.b_logvar = b_logvar

        # Conveniently package layer parameters
        self.mlp_params = [self.W_mean, self.b_mean,
                           self.W_logvar, self.b_logvar]
        return

    def get_bias(self):
        """
        Return the bias of the mean output (self.b_mean).
        """
        return self.b_mean

    def apply(self, x, do_samples=True):
        """
        Apply this SimpleInfNet to some input.

        Returns [z_mean, z_logvar], with z_samples appended when do_samples
        is true. The sampling expression is built in either case.
        """
        z_mean = T.dot(x, self.W_mean) + self.b_mean
        z_logvar = T.dot(x, self.W_logvar) + self.b_logvar
        # standard-normal noise wrapped in DCG
        noise = DCG(self.rng.normal(size=z_mean.shape, avg=0.0, std=1.0,
                                    dtype=theano.config.floatX))
        z_samples = z_mean + ((T.exp(0.5*z_logvar)) * noise)
        # wrap them up for easy returnage
        if do_samples:
            return [z_mean, z_logvar, z_samples]
        return [z_mean, z_logvar]
Esempio n. 17
0
class GenFCModule(object):
    """
    Generator module: random values go through one fully connected layer
    (optional batch norm, then relu) and then a linear transform (optional
    batch norm, optional final relu).
    """
    def __init__(self,
                 rand_dim,
                 out_dim,
                 fc_dim,
                 apply_bn_1=True,
                 apply_bn_2=True,
                 init_func=None,
                 rand_type='normal',
                 final_relu=True,
                 mod_name='dm_fc'):
        # layer sizes
        self.rand_dim = rand_dim
        self.out_dim = out_dim
        self.fc_dim = fc_dim
        # batch norm / nonlinearity options
        self.apply_bn_1 = apply_bn_1
        self.apply_bn_2 = apply_bn_2
        self.final_relu = final_relu
        # naming and noise options
        self.mod_name = mod_name
        self.rand_type = rand_type
        self.rng = RandStream(123)
        # fall back to a small-scale normal initializer when none is given
        self.init_func = (inits.Normal(scale=0.02)
                          if init_func is None else init_func)
        self._init_params()  # initialize parameters
        return

    def _init_params(self):
        """
        Create the weights of both layers, plus a gain and a bias for each
        transform that will get batch normed, and collect them in
        self.params.
        """
        def _gain_and_bias(dim, gain_name, bias_name):
            # gains start near 1, biases start at 0
            gain_ifn = inits.Normal(loc=1., scale=0.02)
            bias_ifn = inits.Constant(c=0.)
            return gain_ifn(dim, gain_name), bias_ifn(dim, bias_name)

        self.w1 = self.init_func((self.rand_dim, self.fc_dim),
                                 "{}_w1".format(self.mod_name))
        self.w2 = self.init_func((self.fc_dim, self.out_dim),
                                 "{}_w2".format(self.mod_name))
        self.params = [self.w1, self.w2]
        if self.apply_bn_1:
            self.g1, self.b1 = _gain_and_bias(
                self.fc_dim,
                "{}_g1".format(self.mod_name),
                "{}_b1".format(self.mod_name))
            self.params.extend([self.g1, self.b1])
        if self.apply_bn_2:
            self.g2, self.b2 = _gain_and_bias(
                self.out_dim,
                "{}_g2".format(self.mod_name),
                "{}_b2".format(self.mod_name))
            self.params.extend([self.g2, self.b2])
        return

    def apply(self, batch_size=None, rand_vals=None):
        """
        Apply this generator module. Pass _either_ batch_size or rand_vals.

        When rand_vals is None, noise of shape (batch_size, rand_dim) is
        drawn from self.rng: standard normal if rand_type is 'normal',
        uniform on [-1, 1] otherwise. Given rand_vals are reshaped to
        (rand_vals.shape[0], rand_dim).
        """
        assert (batch_size is not None) or \
               (rand_vals is not None), "need either batch_size or rand_vals"
        if rand_vals is not None:
            rand_shape = (rand_vals.shape[0], self.rand_dim)
        else:
            rand_shape = (batch_size, self.rand_dim)
            if self.rand_type == 'normal':
                rand_vals = self.rng.normal(size=rand_shape, avg=0.0,
                                            std=1.0,
                                            dtype=theano.config.floatX)
            else:
                rand_vals = self.rng.uniform(size=rand_shape, low=-1.0,
                                             high=1.0,
                                             dtype=theano.config.floatX)
        rand_vals = rand_vals.reshape(rand_shape)
        # first layer: linear -> (batch norm) -> relu
        hidden = T.dot(rand_vals, self.w1)
        if self.apply_bn_1:
            hidden = batchnorm(hidden, g=self.g1, b=self.b1)
        hidden = relu(hidden)
        # second layer: linear -> (batch norm) -> (relu)
        output = T.dot(hidden, self.w2)
        if self.apply_bn_2:
            output = batchnorm(output, g=self.g2, b=self.b2)
        if self.final_relu:
            output = relu(output)
        return output
Esempio n. 18
0
class GenUniModule(object):
    """
    Generator module made of a single linear transform of random values,
    optionally followed by batch normalization and a ReLU.

    Params:
        rand_dim: dimension of the random values fed to the module
        out_dim: dimension of the module's output
        apply_bn: flag for whether to batch normalize the linear transform
        init_func: function for initializing the weight matrix
        rand_type: 'normal' for Gaussian noise, anything else for uniform
                   noise on [-1, 1]
        final_relu: flag for whether to apply a ReLU to the output
        mod_name: text prefix used when naming this module's parameters
    """
    def __init__(self, rand_dim, out_dim, apply_bn=True, init_func=None,
                 rand_type='normal', final_relu=True, mod_name='dm_uni'):
        # remember the configuration
        self.rand_dim, self.out_dim = rand_dim, out_dim
        self.apply_bn = apply_bn
        self.final_relu = final_relu
        self.rand_type = rand_type
        self.mod_name = mod_name
        # noise source used when apply() has to sample its own random values
        self.rng = RandStream(123)
        # fall back on a small Gaussian initializer when none is supplied
        self.init_func = (inits.Normal(scale=0.02) if init_func is None
                          else init_func)
        self._init_params()

    def _init_params(self):
        """
        Create the weight matrix and, if batch norm is enabled, its gain
        and bias. All created parameters are collected in self.params.
        """
        prefix = self.mod_name
        self.w1 = self.init_func((self.rand_dim, self.out_dim),
                                 "{}_w1".format(prefix))
        self.params = [self.w1]
        if not self.apply_bn:
            # no batch norm --> no gain/bias parameters needed
            return
        make_gain = inits.Normal(loc=1., scale=0.02)
        make_bias = inits.Constant(c=0.)
        self.g1 = make_gain(self.out_dim, "{}_g1".format(prefix))
        self.b1 = make_bias(self.out_dim, "{}_b1".format(prefix))
        self.params.extend([self.g1, self.b1])

    def apply(self, batch_size=None, rand_vals=None):
        """
        Apply this generator module. Pass batch_size to sample fresh noise,
        or rand_vals to use existing noise (which takes precedence when both
        are given).
        """
        assert (batch_size is not None) or (rand_vals is not None), \
            "need either batch_size or rand_vals"
        if rand_vals is not None:
            # caller-provided noise: keep its leading dimension
            noise_shape = (rand_vals.shape[0], self.rand_dim)
        else:
            noise_shape = (batch_size, self.rand_dim)
            if self.rand_type == 'normal':
                rand_vals = self.rng.normal(size=noise_shape,
                                            avg=0.0,
                                            std=1.0,
                                            dtype=theano.config.floatX)
            else:
                rand_vals = self.rng.uniform(size=noise_shape,
                                             low=-1.0,
                                             high=1.0,
                                             dtype=theano.config.floatX)
        noise = rand_vals.reshape(noise_shape)
        # linear transform, then the optional batch norm and relu
        out = T.dot(noise, self.w1)
        if self.apply_bn:
            out = batchnorm(out, g=self.g1, b=self.b1)
        return relu(out) if self.final_relu else out


##############
# EYE BUFFER #
##############
Esempio n. 19
0
class GenConvModule(object):
    """
    Upsampling module: one upsampling convolution followed by one regular
    convolution. The input to the first convolution can optionally be
    augmented with extra channels of random values.

    Upsampling is done either by repeating rows/columns of the input and
    applying a stride-1 convolution (use_pooling=True), or by a
    fractionally strided convolution (use_pooling=False).

    Params:
        filt_shape: shape for convolution filters -- should be square and odd
        in_chans: number of channels in the inputs to module
        out_chans: number of channels in the outputs from module
        rand_chans: number of random channels to augment input
        use_rand: flag for whether or not to augment inputs
        apply_bn_1: flag for whether to batch normalize following first conv
        apply_bn_2: flag for whether to batch normalize following second conv
        us_stride: upsampling ratio in the first convolution
        use_pooling: whether to use unpooling or fractional striding
        init_func: function for initializing module parameters
        mod_name: text name for identifying module in theano graph
        rand_type: whether to use Gaussian or uniform randomness
    """
    def __init__(self, filt_shape, in_chans, out_chans, rand_chans,
                 use_rand=True, apply_bn_1=True, apply_bn_2=True,
                 us_stride=2, use_pooling=True, init_func=None,
                 mod_name='gm_conv', rand_type='normal'):
        assert filt_shape[0] % 2 > 0, "filter dim should be odd (not even)"
        # only the first entry of filt_shape is used (filters are square)
        self.filt_dim = filt_shape[0]
        self.in_chans, self.out_chans = in_chans, out_chans
        self.rand_chans = rand_chans
        self.use_rand = use_rand
        self.apply_bn_1, self.apply_bn_2 = apply_bn_1, apply_bn_2
        self.us_stride = us_stride
        self.use_pooling = use_pooling
        self.mod_name = mod_name
        self.rand_type = rand_type
        # noise source used when apply() has to sample its own random values
        self.rng = RandStream(123)
        # fall back on a small Gaussian initializer when none is supplied
        self.init_func = (inits.Normal(scale=0.02) if init_func is None
                          else init_func)
        self._init_params()

    def _init_params(self):
        """
        Create the filters for both convolutions plus any batch norm gains
        and biases. All created parameters are collected in self.params.
        """
        prefix = self.mod_name
        fd = self.filt_dim
        # the first conv sees extra channels when noise is stacked on input
        if self.use_rand:
            w1_in_chans = self.in_chans + self.rand_chans
        else:
            w1_in_chans = self.in_chans
        self.w1 = self.init_func((self.out_chans, w1_in_chans, fd, fd),
                                 "{}_w1".format(prefix))
        self.w2 = self.init_func((self.out_chans, self.out_chans, fd, fd),
                                 "{}_w2".format(prefix))
        self.params = [self.w1, self.w2]
        # gains and biases for whichever convolutions get batch normed
        if self.apply_bn_1:
            make_gain = inits.Normal(loc=1., scale=0.02)
            make_bias = inits.Constant(c=0.)
            self.g1 = make_gain(self.out_chans, "{}_g1".format(prefix))
            self.b1 = make_bias(self.out_chans, "{}_b1".format(prefix))
            self.params.extend([self.g1, self.b1])
        if self.apply_bn_2:
            make_gain = inits.Normal(loc=1., scale=0.02)
            make_bias = inits.Constant(c=0.)
            self.g2 = make_gain(self.out_chans, "{}_g2".format(prefix))
            self.b2 = make_bias(self.out_chans, "{}_b2".format(prefix))
            self.params.extend([self.g2, self.b2])

    def apply(self, input, rand_vals=None):
        """
        Apply this generator module to some input.

        When use_rand is set and rand_vals is None, the random channels are
        sampled from self.rng; otherwise the given rand_vals are used.
        """
        batch_size = input.shape[0]
        pad = int((self.filt_dim - 1) / 2)  # padding for "same" mode convs
        stride = self.us_stride  # upsampling ratio
        if self.use_pooling:
            # "unpool" by repeating every row and column of the input
            input = input.repeat(stride, axis=2).repeat(stride, axis=3)
        # shape of the random channels (matches the possibly unpooled input)
        rand_shape = (batch_size, self.rand_chans,
                      input.shape[2], input.shape[3])
        if not self.use_rand:
            # the input goes into the first conv unchanged
            full_input = input
        else:
            if rand_vals is None:
                if self.rand_type == 'normal':
                    rand_vals = self.rng.normal(size=rand_shape,
                                                avg=0.0,
                                                std=1.0,
                                                dtype=theano.config.floatX)
                else:
                    rand_vals = self.rng.uniform(size=rand_shape,
                                                 low=-1.0,
                                                 high=1.0,
                                                 dtype=theano.config.floatX)
            rand_vals = rand_vals.reshape(rand_shape)
            # random channels go in front of the input's own channels
            full_input = T.concatenate([rand_vals, input], axis=1)
        # first convolution: plain conv after unpooling, or a fractionally
        # strided conv that does the upsampling itself
        if self.use_pooling:
            hid = dnn_conv(full_input, self.w1,
                           subsample=(1, 1), border_mode=(pad, pad))
        else:
            hid = deconv(full_input, self.w1,
                         subsample=(stride, stride), border_mode=(pad, pad))
        if self.apply_bn_1:
            hid = batchnorm(hid, g=self.g1, b=self.b1)
        hid = relu(hid)
        # second convolution, always stride 1
        out = dnn_conv(hid, self.w2, subsample=(1, 1), border_mode=(pad, pad))
        if self.apply_bn_2:
            out = batchnorm(out, g=self.g2, b=self.b2)
        return relu(out)
Esempio n. 20
0
class ConvPoolLayer(object):
    """
    A simple convolution --> max-pooling layer.

    The (symbolic) input to this layer must be a 4D theano tensor shaped
    like (batch_size, chan_count, im_dim_1, im_dim_2).

    Parameters:
        rng: random generator providing randint() and normal(); it seeds the
             layer's noise stream and draws the initial filter weights
        input: symbolic 4D input tensor
        filt_def: 4-tuple like (filt_count, in_chans, filt_dim_1, filt_dim_2)
        pool_def: 2-tuple like (pool_dim, pool_stride)
        activation: callable applied to the layer's linear output; defaults
                    to relu_actfun when not given
        drop_rate: probability of dropping each input element (used only
                   when greater than 1e-4)
        input_noise: std of Gaussian noise added to the input (used only
                     when greater than 1e-4)
        bias_noise: std of Gaussian noise added to the convolution output
                    (used only when greater than 1e-4)
        W: optional shared variable to use as the filters; when None, new
           filters are created from rng
        b: optional shared variable to use as the biases; when None, new
           biases are created with every entry equal to 0.1
        name: text prefix for naming newly created parameters
        W_scale: multiplier applied to newly created filter weights
    """
    def __init__(self, rng, input=None, filt_def=None, pool_def=(2, 2),
                 activation=None, drop_rate=0., input_noise=0.,
                 bias_noise=0., W=None, b=None, name="", W_scale=1.0):

        # Setup a shared random generator for this layer
        self.rng = CURAND_RandomStreams(rng.randint(1000000))

        self.clean_input = input

        # Add gaussian noise to the input (if desired)
        if input_noise > 1e-4:
            self.fuzzy_input = input + self.rng.normal(
                size=input.shape, avg=0.0, std=input_noise,
                dtype=theano.config.floatX)
        else:
            self.fuzzy_input = input

        # Apply masking noise to the input (if desired)
        if drop_rate > 1e-4:
            self.noisy_input = self._drop_from_input(self.fuzzy_input,
                                                     drop_rate)
        else:
            self.noisy_input = self.fuzzy_input

        # Set the activation function for the conv filters
        self.activation = activation if activation else relu_actfun

        # Use the given filters, or initialize new ones with random weights.
        # Honoring W/b is what makes parameter sharing between layers work.
        if W is None:
            W_init = 0.01 * np.asarray(rng.normal(size=filt_def),
                                       dtype=theano.config.floatX)
            self.W = theano.shared(value=(W_scale * W_init),
                                   name="{0:s}_W".format(name))
        else:
            self.W = W

        # the bias is a 1D tensor -- one bias per output feature map
        if b is None:
            b_init = np.zeros((filt_def[0],),
                              dtype=theano.config.floatX) + 0.1
            self.b = theano.shared(value=b_init, name="{0:s}_b".format(name))
        else:
            self.b = b

        # convolve input feature maps with filters; the conv op works on
        # c01b-ordered, contiguous arrays, so shuffle into that layout first
        input_c01b = self.noisy_input.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        filters_c01b = self.W.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        conv_op = FilterActs(stride=1, partial_sum=1)
        contig_input = gpu_contiguous(input_c01b)
        contig_filters = gpu_contiguous(filters_c01b)
        conv_out_c01b = conv_op(contig_input, contig_filters)

        # Add gaussian noise to the convolution output (if desired)
        if bias_noise > 1e-4:
            noisy_conv_out_c01b = conv_out_c01b + self.rng.normal(
                size=conv_out_c01b.shape, avg=0.0, std=bias_noise,
                dtype=theano.config.floatX)
        else:
            noisy_conv_out_c01b = conv_out_c01b

        # downsample each feature map individually, using maxpooling
        pool_op = MaxPool(ds=pool_def[0], stride=pool_def[1])
        mp_out_c01b = pool_op(noisy_conv_out_c01b)
        mp_out_bc01 = mp_out_c01b.dimshuffle(3, 0, 1, 2)  # c01b to bc01

        # add the bias term. Since the bias is a vector (1D array), we first
        # reshape it to a tensor of shape (1,n_filters,1,1). Each bias will
        # thus be broadcasted across mini-batches and feature map
        # width & height
        self.noisy_linear_output = (mp_out_bc01 +
                                    self.b.dimshuffle('x', 0, 'x', 'x'))
        self.linear_output = self.noisy_linear_output
        self.output = self.activation(self.noisy_linear_output)

        # store parameters of this layer
        self.params = [self.W, self.b]

    def _drop_from_input(self, input, p):
        """p is the probability of dropping elements of input."""
        # get a drop mask that drops things with probability p
        drop_rnd = self.rng.uniform(size=input.shape, low=0.0, high=1.0,
                                    dtype=theano.config.floatX)
        drop_mask = drop_rnd > p
        # get a scaling factor to keep expectations fixed after droppage
        drop_scale = 1. / (1. - p)
        # apply dropout mask and rescaling factor to the input
        return drop_scale * input * drop_mask

    def _noisy_params(self, P, noise_lvl=0.):
        """Noisy weights, like convolving energy surface with a gaussian."""
        return P + self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl,
                                   dtype=theano.config.floatX)
Esempio n. 21
0
class GenNet(object):
    """
    A net that transforms a simple distribution so that it matches some
    more complicated distribution, for some definition of match....

    Parameters:
        rng: a numpy.random RandomState object
        Xp: symbolic matrix for inputting latent variable samples
        prior_sigma: standard deviation of isotropic Gaussian prior that this
                     generator will transform to match some other distribution
        params: a dict of parameters describing the desired network:
            lam_l2a: L2 regularization weight on neuron activations
            vis_drop: drop rate to use on the latent variable space
            hid_drop: drop rate to use on the hidden layer activations
                -- note: vis_drop/hid_drop are optional, with defaults 0.0/0.0
            bias_noise: standard dev for noise on the biases of hidden layers
            out_noise: standard dev for noise on the output of this net
                -- note: bias_noise/out_noise are optional, defaulting to 0.0
            mlp_config: list of "layer descriptions"
            activation: "function handle" for the desired non-linearity
                -- note: optional, defaults to relu_actfun
        mlp_param_dicts: parameters for the MLP controlled by this GenNet
    """
    def __init__(self,
                 rng=None,
                 Xp=None,
                 prior_sigma=None,
                 params=None,
                 mlp_param_dicts=None):
        # First, setup a shared random number generator for this network
        self.rng = CURAND_RandomStreams(rng.randint(1000000))
        # Grab the symbolic input matrix
        self.Xp = Xp
        self.prior_sigma = prior_sigma
        #####################################################
        # Process user-supplied parameters for this network #
        #####################################################
        assert params is not None
        self.params = params
        lam_l2a = self.params['lam_l2a']
        # Drop rate on the latent variables
        self.vis_drop = self.params.get('vis_drop', 0.0)
        # Drop rate on hidden layer activations
        self.hid_drop = self.params.get('hid_drop', 0.0)
        # Noise sigma for hidden layer biases
        self.bias_noise = self.params.get('bias_noise', 0.0)
        # Noise sigma for the output/observable layer
        self.out_noise = self.params.get('out_noise', 0.0)
        # Check if the params for this net were given a priori. This option
        # will be used for creating "clones" of a generative network, with all
        # of the network parameters shared between clones.
        if mlp_param_dicts is None:
            # This is not a clone, and we will need to make a dict for
            # referring to the parameters of each network layer
            self.mlp_param_dicts = []
            self.is_clone = False
        else:
            # This is a clone, and its layer parameters can be found by
            # referring to the given param dict (i.e. mlp_param_dicts).
            self.mlp_param_dicts = mlp_param_dicts
            self.is_clone = True
        # Get the configuration/prototype for this network. The config is a
        # list of layer descriptions, including a description for the input
        # layer, which is typically just the dimension of the inputs. So, the
        # depth of the mlp is one less than the number of layer configs.
        self.mlp_config = params['mlp_config']
        self.activation = params.get('activation', relu_actfun)
        self.mlp_depth = len(self.mlp_config) - 1
        self.latent_dim = self.mlp_config[0]
        self.data_dim = self.mlp_config[-1]
        ##########################
        # Initialize the network #
        ##########################
        self.clip_params = {}
        self.mlp_layers = []
        # Materialize the pairs: zip() is a one-shot iterator without len()
        # on Python 3, and the layer count is needed to spot the last layer.
        layer_def_pairs = list(zip(self.mlp_config[:-1],
                                   self.mlp_config[1:]))
        last_layer_num = len(layer_def_pairs) - 1
        next_input = self.Xp
        for layer_num, (in_def, out_def) in enumerate(layer_def_pairs):
            first_layer = (layer_num == 0)
            last_layer = (layer_num == last_layer_num)
            l_name = "gn_layer_{0:d}".format(layer_num)
            if isinstance(in_def, (list, tuple)):
                # Receiving input from a poolish layer...
                in_dim = in_def[0]
            else:
                # Receiving input from a normal layer...
                in_dim = in_def
            if isinstance(out_def, (list, tuple)):
                # Applying some sort of pooling in this layer...
                out_dim = out_def[0]
                pool_size = out_def[1]
            else:
                # Not applying any pooling in this layer...
                out_dim = out_def
                pool_size = 0
            # Select the appropriate noise to add to this layer
            d_rate = self.vis_drop if first_layer else self.hid_drop
            b_noise = self.out_noise if last_layer else self.bias_noise
            # Arguments common to fresh layers and parameter-sharing clones
            layer_args = dict(rng=rng, input=next_input,
                              activation=self.activation,
                              pool_size=pool_size, drop_rate=d_rate,
                              input_noise=0., bias_noise=b_noise,
                              in_dim=in_dim, out_dim=out_dim,
                              name=l_name, W_scale=1.0)
            if not self.is_clone:
                ##########################################
                # Initialize a layer with new parameters #
                ##########################################
                new_layer = HiddenLayer(**layer_args)
                self.mlp_param_dicts.append(
                    {'W': new_layer.W, 'b': new_layer.b})
            else:
                ##################################################
                # Initialize a layer with some shared parameters #
                ##################################################
                init_params = self.mlp_param_dicts[layer_num]
                new_layer = HiddenLayer(W=init_params['W'],
                                        b=init_params['b'], **layer_args)
            self.mlp_layers.append(new_layer)
            next_input = new_layer.output
            # Set the non-bias parameters of this layer to be clipped
            self.clip_params[new_layer.W] = 1

        # TODO: implement adjustable norm clipping
        self.clip_norms = {}

        # Mash all the parameters together, into a list.
        self.mlp_params = []
        for layer in self.mlp_layers:
            self.mlp_params.extend(layer.params)

        # The output of this generator network is given by the noisy output
        # of its final layer, squashed through a sigmoid. We will keep a
        # running estimate of the mean and covariance of the distribution
        # induced by combining this network's latent noise source with its
        # deep non-linear transform. These will be used to encourage the
        # induced distribution to match the first and second-order moments
        # of the distribution we are trying to match.
        self.output = T.nnet.sigmoid(self.mlp_layers[-1].noisy_linear)
        self.out_dim = self.mlp_layers[-1].out_dim
        C_init = np.zeros((self.out_dim, self.out_dim)).astype(
            theano.config.floatX)
        m_init = np.zeros((self.out_dim,)).astype(theano.config.floatX)
        self.dist_mean = theano.shared(m_init, name='gn_dist_mean')
        self.dist_cov = theano.shared(C_init, name='gn_dist_cov')
        # Get simple regularization penalty to moderate activation dynamics
        self.act_reg_cost = lam_l2a * self._act_reg_cost()
        # Construct a sampler for drawing independent samples from this model's
        # isotropic Gaussian prior, and a sampler for the model distribution.
        self.sample_from_prior = self._construct_prior_sampler()
        self.sample_from_model = self._construct_model_sampler()
        # Construct a function for passing points from the latent/prior space
        # through the transform induced by the current model parameters.
        self.transform_prior = self._construct_transform_prior()

    def _act_reg_cost(self):
        """
        Apply L2 regularization to the activations in this network.

        Returns the sum of the act_l2_sum terms of all layers.
        """
        act_sq_sums = [layer.act_l2_sum for layer in self.mlp_layers]
        return T.sum(act_sq_sums)

    def _construct_prior_sampler(self):
        """
        Draw independent samples from this model's isotropic Gaussian prior.

        Returns a compiled function mapping a sample count to a matrix of
        samples with one row per sample and latent_dim columns.
        """
        samp_count = T.lscalar()
        prior_samples = self.prior_sigma * self.rng.normal(
            size=(samp_count, self.latent_dim), avg=0.0, std=1.0,
            dtype=theano.config.floatX)
        return theano.function([samp_count], outputs=prior_samples)

    def _construct_model_sampler(self):
        """
        Draw independent samples from this model's distribution.

        Returns a compiled function mapping a sample count to the network
        output computed on that many fresh samples from the prior.
        """
        samp_count = T.lscalar()
        prior_samples = self.prior_sigma * self.rng.normal(
            size=(samp_count, self.latent_dim), avg=0.0, std=1.0,
            dtype=theano.config.floatX)
        # substitute the sampled latents for the symbolic input matrix
        return theano.function([samp_count], outputs=self.output,
                               givens={self.Xp: prior_samples})

    def _construct_transform_prior(self):
        """
        Apply the transform induced by the current model parameters to some
        set of points in the latent/prior space.
        """
        return theano.function([self.Xp], outputs=self.output)

    def _batch_moments(self):
        """
        Compute the mean and scatter matrix of the current sample outputs.

        Returns [mu, sigma], where mu is the mean over axis 0 (dims kept)
        and sigma is the product of the centered outputs with themselves.
        Note that sigma is not divided by the number of samples.
        """
        mu = T.mean(self.output, axis=0, keepdims=True)
        sigma = T.dot((self.output.T - mu.T), (self.output - mu))
        return [mu, sigma]

    def init_biases(self, b_init=0.0):
        """
        Initialize the biases in all hidden layers to some constant.

        Every layer except the last one has its bias overwritten with
        b_init; the last layer's bias is left untouched.
        """
        for layer in self.mlp_layers[:-1]:
            # Build the new bias from this layer's own bias so that shape
            # and dtype match. A separate name is used on purpose: rebinding
            # b_init would leak one layer's shape into the next iteration.
            layer_bias = (0.0 * layer.b.get_value(borrow=False)) + b_init
            layer.b.set_value(layer_bias)

    def init_moments(self, X_noise):
        """
        Initialize the running mean and covariance estimates.

        X_noise is a matrix of latent samples; the network outputs for
        these samples provide the initial mean and covariance.
        """
        X_noise_sym = T.matrix()
        out_func = theano.function(inputs=[X_noise_sym],
                                   outputs=[self.output],
                                   givens={self.Xp: X_noise_sym})
        # Compute outputs for the input latent noise matrix
        X_out = out_func(X_noise.astype(theano.config.floatX))[0]
        # Compute mean and covariance of the outputs
        mu = np.mean(X_out, axis=0)
        X_out_minus_mu = X_out - mu
        sigma = np.dot(X_out_minus_mu.T, X_out_minus_mu) / X_out.shape[0]
        # Initialize the network's running estimates
        self.dist_cov.set_value(sigma.astype(theano.config.floatX))
        self.dist_mean.set_value(mu.astype(theano.config.floatX))

    def shared_param_clone(self, rng=None, Xp=None):
        """
        Return a clone of this network, with shared parameters but with
        different symbolic input variables.

        This can be used for "unrolling" a generate->infer->generate->infer...
        loop. Then, we can do backprop through time for various objectives.
        """
        return GenNet(rng=rng, Xp=Xp,
                      prior_sigma=self.prior_sigma, params=self.params,
                      mlp_param_dicts=self.mlp_param_dicts)
Esempio n. 22
0
class WalkoutModel(object):
    """
    Controller for training a forwards-backwards chainy model.

    Parameters:
        rng: numpy.random.RandomState (for reproducibility)
        x_out: the goal state for forwards-backwards walking process
        p_z_given_x: InfNet for stochastic part of step
        p_x_given_z: HydraNet for deterministic part of step
        params: REQUIRED PARAMS SHOWN BELOW
                x_dim: dimension of observations to construct
                z_dim: dimension of latent space for policy wobble
                walkout_steps: number of steps to walk out
                x_type: can be "bernoulli" or "gaussian"
                x_transform: can be 'none' or 'sigmoid'
    """
    def __init__(self, rng=None,
            x_out=None, \
            p_z_given_x=None, \
            p_x_given_z=None, \
            params=None, \
            shared_param_dicts=None):
        """
        Build the symbolic walk-out graph, the costs, the parameter updates
        and the compiled helper functions for this model.

        Args:
            rng: object providing randint(); used to seed this model's
                RandStream. Despite the None default it is dereferenced
                unconditionally, so it is effectively required.
            x_out: symbolic variable used as the initial x state of the walk.
            p_z_given_x: network whose apply() returns (mean, logvar) for z.
            p_x_given_z: network whose apply() returns (mean, logvar) for x.
            params: dict that must contain 'x_dim', 'z_dim', 'walkout_steps'
                and 'x_type'; 'x_transform' is optional.
            shared_param_dicts: optional dict holding 'obs_logvar'. When None,
                a fresh shared variable is created and registered in a new
                dict.

        NOTE(review): several attributes read below (step_type, xi_zmuv, s0,
        step_scales, nlli, p_zi_given_xi, ...) are not assigned anywhere in
        this method. See the inline notes; confirm where they are meant to
        come from.
        """
        # setup a rng for this WalkoutModel
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_dim = self.params['x_dim']
        self.z_dim = self.params['z_dim']
        self.walkout_steps = self.params['walkout_steps']
        self.x_type = self.params['x_type']
        self.shared_param_dicts = shared_param_dicts
        # pick the output transform: explicit 'sigmoid'/'none' if given,
        # otherwise sigmoid by default
        if 'x_transform' in self.params:
            assert((self.params['x_transform'] == 'sigmoid') or \
                    (self.params['x_transform'] == 'none'))
            if self.params['x_transform'] == 'sigmoid':
                self.x_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.x_transform = lambda x: x
        else:
            self.x_transform = lambda x: T.nnet.sigmoid(x)
        # bernoulli observations always use the sigmoid transform, overriding
        # whatever params['x_transform'] requested
        if self.x_type == 'bernoulli':
            self.x_transform = lambda x: T.nnet.sigmoid(x)
        assert ((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        # NOTE(review): self.step_type is not assigned earlier in this method;
        # unless it is provided elsewhere this assert raises AttributeError.
        assert ((self.step_type == 'add') or (self.step_type == 'jump'))

        # grab handles to the relevant networks
        self.p_z_given_x = p_z_given_x
        self.p_x_given_z = p_x_given_z

        # record the symbolic variables that will provide inputs to the
        # computation graph created for this WalkoutModel
        self.x_out = x_out  # target output for generation
        self.zi_zmuv = T.tensor3()  # ZMUV gauss noise for walk-out wobble

        if self.shared_param_dicts is None:
            # initialize the parameters "owned" by this model
            zero_ary = to_fX(np.zeros((1, )))
            self.obs_logvar = theano.shared(value=zero_ary, name='obs_logvar')
            # squash the log-variance through a scaled tanh, which keeps it
            # within (-8, 8)
            self.bounded_logvar = 8.0 * T.tanh(
                (1.0 / 8.0) * self.obs_logvar[0])
            self.shared_param_dicts = {}
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            # grab the parameters required by this model from a given dict
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
            self.bounded_logvar = 8.0 * T.tanh(
                (1.0 / 8.0) * self.obs_logvar[0])

        ###############################################################
        # Setup the forwards (i.e. training) walk-out loop using scan #
        ###############################################################
        # Scan step function. The first two arguments come from the scan
        # sequences, the last two are the recurrent x/z states. Note that the
        # incoming zi_fw is overwritten before it is read.
        def forwards_loop(xi_zmuv, zi_zmuv, xi_fw, zi_fw):
            # get samples of next zi, according to the forwards model
            zi_fw_mean, zi_fw_logvar = self.p_z_given_x.apply(xi_fw, \
                                       do_samples=False)
            zi_fw = zi_fw_mean + (T.exp(0.5 * zi_fw_logvar) * zi_zmuv)

            # check reverse direction probability p(xi_fw | zi_fw)
            # NOTE(review): the value stored as nll_xi_bw is the un-negated
            # result of log_prob_gaussian2, whereas _construct_nll_costs
            # negates that helper's result to obtain an NLL -- confirm sign.
            xi_bw_mean, xi_bw_logvar = self.p_x_given_z.apply(zi_fw, \
                                       do_samples=False)
            xi_bw_mean = self.x_transform(xi_bw_mean)
            nll_xi_bw = log_prob_gaussian2(xi_fw, xi_bw_mean, \
                        log_vars=xi_bw_logvar, mask=None)
            nll_xi_bw = nll_xi_bw.flatten()

            # get samples of next xi, according to the forwards model
            xi_fw_mean, xi_fw_logvar = self.p_x_given_z.apply(zi_fw, \
                                       do_samples=False)
            xi_fw_mean = self.x_transform(xi_fw_mean)
            xi_fw = xi_fw_mean + (T.exp(0.5 * xi_fw_logvar) * xi_zmuv)

            # check reverse direction probability p(zi_fw | xi_fw)
            # NOTE(review): same sign question as for nll_xi_bw above.
            zi_bw_mean, zi_bw_logvar = self.p_z_given_x.apply(xi_fw, \
                                       do_samples=False)
            nll_zi_bw = log_prob_gaussian2(zi_fw, zi_bw_mean, \
                        log_vars=zi_bw_logvar, mask=None)
            nll_zi_bw = nll_zi_bw.flatten()

            # each loop iteration produces the following values:
            #   xi_fw: xi generated from zi by forwards walk
            #   zi_fw: zi generated from xi by forwards walk
            #   xi_fw_mean: mean used to sample xi_fw (after x_transform)
            #   xi_fw_logvar: log-variance used to sample xi_fw
            #   zi_fw_mean: mean used to sample zi_fw
            #   zi_fw_logvar: log-variance used to sample zi_fw
            #   nll_xi_bw: flattened log_prob_gaussian2 value for xi_fw given zi_fw
            #   nll_zi_bw: flattened log_prob_gaussian2 value for zi_fw given xi_fw
            return xi_fw, zi_fw, xi_fw_mean, xi_fw_logvar, zi_fw_mean, zi_fw_logvar, nll_xi_bw, nll_zi_bw

        # initialize states for x/z
        self.x0 = self.x_out
        self.z0 = T.alloc(0.0, self.x0.shape[0], self.z_dim)
        # setup initial values to pass to scan op; only the first two outputs
        # (xi_fw, zi_fw) are fed back as recurrent state
        outputs_init = [self.x0, self.z0, None, None, None, None, None, None]
        # NOTE(review): self.xi_zmuv is not assigned earlier in this method
        # (only self.zi_zmuv is) -- confirm where it is defined.
        sequences_init = [self.xi_zmuv, self.zi_zmuv]
        # apply scan op for the sequential walk-out loop
        self.scan_results, self.scan_updates = theano.scan(forwards_loop, \
                    outputs_info=outputs_init, \
                    sequences=sequences_init)

        # grab results of the scan op. all values are computed for each step
        self.xi = self.scan_results[0]
        self.zi = self.scan_results[1]
        self.xi_fw_mean = self.scan_results[2]
        self.xi_fw_logvar = self.scan_results[3]
        self.zi_fw_mean = self.scan_results[4]
        self.zi_fw_logvar = self.scan_results[5]
        self.nll_xi_bw = self.scan_results[6]
        self.nll_zi_bw = self.scan_results[7]

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX(np.zeros((1, )))
        self.lr = theano.shared(value=zero_ary, name='srr_lr')
        # shared var momentum parameters for ADAM optimization
        self.mom_1 = theano.shared(value=zero_ary, name='srr_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='srr_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared vars for weighting prior kld against reconstruction
        self.lam_kld_p = theano.shared(value=zero_ary, name='srr_lam_kld_p')
        self.lam_kld_q = theano.shared(value=zero_ary, name='srr_lam_kld_q')
        self.lam_kld_g = theano.shared(value=zero_ary, name='srr_lam_kld_g')
        self.lam_kld_s = theano.shared(value=zero_ary, name='srr_lam_kld_s')
        self.set_lam_kld(lam_kld_p=0.0,
                         lam_kld_q=1.0,
                         lam_kld_g=0.0,
                         lam_kld_s=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='srr_lam_l2w')
        self.set_lam_l2w(1e-5)

        # grab all of the "optimizable" parameters from the base networks
        # NOTE(review): self.s0, self.step_scales, self.p_zi_given_xi,
        # self.p_sip1_given_zi, self.p_x_given_si and self.q_zi_given_xi are
        # not assigned in this method (it stores p_z_given_x / p_x_given_z
        # instead) -- confirm these names.
        self.joint_params = [self.s0, self.obs_logvar, self.step_scales]
        self.joint_params.extend(self.p_zi_given_xi.mlp_params)
        self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
        self.joint_params.extend(self.p_x_given_si.mlp_params)
        self.joint_params.extend(self.q_zi_given_xi.mlp_params)

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        # NOTE(review): _construct_kld_costs reads self.si, self.kldi_* and
        # self.total_steps, none of which are assigned in this method.
        self.kld_p, self.kld_q, self.kld_g, self.kld_s = self._construct_kld_costs(
            p=1.0)
        self.kld_costs = (self.lam_kld_p[0] * self.kld_p) + \
                         (self.lam_kld_q[0] * self.kld_q) + \
                         (self.lam_kld_g[0] * self.kld_g) + \
                         (self.lam_kld_s[0] * self.kld_s)
        self.kld_cost = T.mean(self.kld_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        # NOTE(review): self.nlli is not assigned in this method.
        self.nll_costs = T.sum(self.nlli, axis=0)  # sum the per-step NLLs
        self.nll_cost = T.mean(self.nll_costs)
        self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
        self.nll_bound = T.mean(self.nll_bounds)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-TRIAL COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the generator and inferencer networks
        self.joint_updates = get_adam_updates(params=self.joint_params, \
                grads=self.joint_grads, alpha=self.lr, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        # fold the scan op's own updates into the training updates
        for k, v in self.scan_updates.items():
            self.joint_updates[k] = v

        # Construct theano functions for training and diagnostic computations
        print("Compiling cost computer...")
        self.compute_raw_costs = self._construct_raw_costs()
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling sequence sampler...")
        self.sequence_sampler = self._construct_sequence_sampler()
        # make easy access points for some interesting parameters
        #self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W
        return

    def set_sgd_params(self, lr=0.01, mom_1=0.9, mom_2=0.999):
        """
        Store the learning rate and both momentum coefficients.

        Each value is broadcast into a length-1 array, converted with
        to_fX, and written into the attribute of the same name (self.lr,
        self.mom_1, self.mom_2) through set_value().
        """
        settings = (('lr', lr), ('mom_1', mom_1), ('mom_2', mom_2))
        for attr_name, value in settings:
            # wrap the scalar in a one-element array before storing it
            as_array = np.zeros((1, )) + value
            getattr(self, attr_name).set_value(to_fX(as_array))
        return

    def set_lam_kld(self,
                    lam_kld_p=0.0,
                    lam_kld_q=1.0,
                    lam_kld_g=0.0,
                    lam_kld_s=0.0):
        """
        Store the four KL-divergence weights used in the joint cost.

        Each weight is broadcast into a length-1 array, converted with to_fX,
        and written into the attribute of the same name through set_value().
        """
        weights = (('lam_kld_p', lam_kld_p),
                   ('lam_kld_q', lam_kld_q),
                   ('lam_kld_g', lam_kld_g),
                   ('lam_kld_s', lam_kld_s))
        for attr_name, weight in weights:
            as_array = np.zeros((1, )) + weight
            getattr(self, attr_name).set_value(to_fX(as_array))
        return

    def set_lam_l2w(self, lam_l2w=1e-3):
        """
        Store the l2 regularization weight applied to the network params.

        The value is broadcast into a length-1 array, converted with to_fX,
        and written into self.lam_l2w through set_value().
        """
        weight = np.zeros((1, )) + lam_l2w
        self.lam_l2w.set_value(to_fX(weight))
        return

    def set_train_switch(self, switch_val=0.0):
        """
        Flip the switch that selects training vs. sampling behavior.

        switch_val is snapped to 0.0 when it is below 0.5 and to 1.0
        otherwise; the result is stored as a length-1 array in
        self.train_switch through set_value().
        """
        snapped = 0.0 if switch_val < 0.5 else 1.0
        as_array = np.zeros((1, )) + snapped
        self.train_switch.set_value(to_fX(as_array))
        return

    def _construct_zi_zmuv(self, xo):
        """
        Draw the zero-mean, unit-variance gaussian noise used to generate
        trajectories for the input matrix xo.

        The symbolic sample comes from self.rng, has shape
        (self.total_steps, xo.shape[0], self.z_dim) and dtype
        theano.config.floatX.
        """
        noise_shape = (self.total_steps, xo.shape[0], self.z_dim)
        return self.rng.normal(size=noise_shape, avg=0.0, std=1.0,
                               dtype=theano.config.floatX)

    def _construct_rev_masks(self, xo):
        """
        Build the per-step revelation mask sequences for the batch in xo.

        Two sequences are returned as [pmasks, qmasks]: one for the primary
        policy (p) and one for the guide policy (q). When self.use_rev_masks
        is set, the stored self.rev_masks_p / self.rev_masks_q are repeated
        along a new batch axis. Otherwise a fresh random mask is sampled for
        each block in self.rev_sched, where rb[0] is the number of steps in
        the block and rb[1] is the reveal rate.
        """
        batch_size = xo.shape[0]
        if self.use_rev_masks:
            # tile the fixed masks across the batch (inserted as axis 1)
            p_seq = self.rev_masks_p.dimshuffle(0, 'x', 1).repeat(
                batch_size, axis=1)
            q_seq = self.rev_masks_q.dimshuffle(0, 'x', 1).repeat(
                batch_size, axis=1)
            return [p_seq, q_seq]

        obs_dim = xo.shape[1]
        p_steps = []
        q_steps = []
        # an all-zero mask, i.e. one that reveals nothing
        blank_mask = T.alloc(0.0, 1, batch_size, obs_dim)
        for rb in self.rev_sched:
            # random binary mask with ones at rate rb[1]
            uniform_draw = self.rng.uniform(
                size=(1, batch_size, obs_dim),
                low=0.0, high=1.0, dtype=theano.config.floatX)
            block_mask = uniform_draw < rb[1]
            # Within a block the guide policy (q) sees the block's mask on
            # every step, so it can steer towards the values about to be
            # revealed. The primary policy (p) only receives the mask on the
            # block's final step and gets the blank mask before that.
            #
            # With a single block of one step and reveal rate 1.0 this
            # reduces to the setup of a standard variational auto-encoder.
            lead_steps = rb[0] - 1
            p_steps.extend([blank_mask] * lead_steps)
            q_steps.extend([block_mask] * lead_steps)
            p_steps.append(block_mask)
            q_steps.append(block_mask)
        # stack the per-step masks along axis 0 and cast to floatX
        p_seq = T.cast(T.concatenate(p_steps, axis=0), 'floatX')
        q_seq = T.cast(T.concatenate(q_steps, axis=0), 'floatX')
        return [p_seq, q_seq]

    def _construct_nll_costs(self, si, xo, nll_mask):
        """
        Build the negative log-likelihood term of the free energy.

        The state si is mapped to an observation estimate with
        self._from_si_to_x, scored against xo under either a bernoulli or a
        gaussian model (using self.bounded_logvar), with nll_mask passed as
        the mask. The flattened, negated log-likelihood is returned.
        """
        x_hat = self._from_si_to_x(si)
        if self.x_type != 'bernoulli':
            log_lik = log_prob_gaussian2(xo, x_hat,
                                         log_vars=self.bounded_logvar,
                                         mask=nll_mask)
        else:
            log_lik = log_prob_bernoulli(xo, x_hat, mask=nll_mask)
        return -log_lik.flatten()

    def _construct_kld_s(self, s_i, s_j):
        """
        Compute KL(s_i || s_j), treating the outputs as bernoulli means.

        Both states are first mapped through self._from_si_to_x. The
        elementwise bernoulli KL is then summed over axis 1.
        """
        p_i = self._from_si_to_x(s_i)
        p_j = self._from_si_to_x(s_j)
        # contribution of the "on" outcome and of the "off" outcome
        on_term = p_i * (T.log(p_i) - T.log(p_j))
        off_term = (1.0 - p_i) * (T.log(1.0 - p_i) - T.log(1.0 - p_j))
        return T.sum(on_term + off_term, axis=1)

    def _construct_kld_costs(self, p=1.0):
        """
        Build the policy KL-divergence parts of the cost to minimize.

        For every step the per-dimension KL terms in self.kldi_p2q,
        self.kldi_q2p and self.kldi_p2g are raised to the power p and summed
        over axis 1. A state-to-state KL between consecutive entries of
        self.si is also computed, with self.s0 as predecessor of the first.
        Per-step results are added up and returned as
        [kld_p, kld_q, kld_g, kld_s].
        """
        per_step_p = []
        per_step_q = []
        per_step_g = []
        per_step_s = []
        # self.s0 combined with a zeroed copy of the first state
        initial_state = 0.0 * self.si[0] + self.s0
        for step in range(self.total_steps):
            per_step_p.append(T.sum(self.kldi_p2q[step]**p, axis=1))
            per_step_q.append(T.sum(self.kldi_q2p[step]**p, axis=1))
            per_step_g.append(T.sum(self.kldi_p2g[step]**p, axis=1))
            current_state = self.si[step]
            if step == 0:
                previous_state = initial_state
            else:
                previous_state = self.si[step - 1]
            per_step_s.append(
                self._construct_kld_s(current_state, previous_state))
        # accumulate over steps to get the batch-wise costs
        return [sum(per_step_p), sum(per_step_q),
                sum(per_step_g), sum(per_step_s)]

    def _construct_reg_costs(self):
        """
        Build the basic l2 regularization cost: the sum of squared entries
        over every parameter in self.joint_params.
        """
        total = 0
        for param in self.joint_params:
            total = total + T.sum(param**2.0)
        return total

    def _construct_compute_fe_terms(self):
        """
        Construct a function for computing terms in variational free energy.

        Returns:
            fe_term_estimator(XO, sample_count=20, use_guide_policy=True),
            which averages sample_count one-sample estimates and returns
            [mean_nll, mean_kld], each with one entry per row of XO. When
            use_guide_policy is False the KLd term is zeroed.
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # construct values to output
        nll = self.nll_costs.flatten()
        kld = self.kld_q.flatten()
        # compile theano function for a one-sample free-energy estimate
        fe_term_sample = theano.function(inputs=[ xo ], \
                outputs=[nll, kld], \
                givens={self.x_out: xo, \
                        self.zi_zmuv: zizmuv, \
                        self.p_masks: pmasks, \
                        self.q_masks: qmasks}, \
                updates=self.scan_updates, \
                on_unused_input='ignore')

        # construct a wrapper function for multi-sample free-energy estimate
        def fe_term_estimator(XO, sample_count=20, use_guide_policy=True):
            # remember the current mode so it can be restored afterwards
            old_switch = self.train_switch.get_value(borrow=False)
            # convert the input like the other compiled-function wrappers in
            # this class do before calling into theano
            x_batch = to_fX(XO)
            nll_sum = np.zeros((XO.shape[0], ))
            kld_sum = np.zeros((XO.shape[0], ))
            try:
                # 1.0 -> sample from the guide policy,
                # 0.0 -> sample from the primary policy
                self.set_train_switch(
                    switch_val=(1.0 if use_guide_policy else 0.0))
                # compute a multi-sample estimate of variational free-energy
                for _ in range(sample_count):
                    result = fe_term_sample(x_batch)
                    nll_sum += result[0].ravel()
                    kld_sum += result[1].ravel()
            finally:
                # always put the model back into its previous mode, even if
                # sampling raised
                self.set_train_switch(switch_val=old_switch)
            mean_nll = nll_sum / float(sample_count)
            mean_kld = kld_sum / float(sample_count)
            if not use_guide_policy:
                # no KLd if samples are from the primary policy...
                mean_kld = 0.0 * mean_kld
            return [mean_nll, mean_kld]

        return fe_term_estimator

    def _construct_raw_costs(self):
        """
        Construct a function that reports the raw costs, i.e. the costs
        before any of the lambda weights are applied.

        Returns:
            raw_cost_computer(XO), which evaluates the per-step costs on XO
            and returns [step_nlls, step_klds, kld_q2p, kld_p2q, kld_p2g].
        """
        # symbolic input plus the noise/masks that get substituted in
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        substitutions = {self.x_out: xo,
                         self.zi_zmuv: zizmuv,
                         self.p_masks: pmasks,
                         self.q_masks: qmasks}
        step_cost_vars = [self.nlli, self.kldi_q2p,
                          self.kldi_p2q, self.kldi_p2g]
        # compile theano function for computing the costs
        cost_func = theano.function(inputs=[xo],
                                    outputs=step_cost_vars,
                                    givens=substitutions,
                                    updates=self.scan_updates,
                                    on_unused_input='ignore')

        def _sum_of_batch_means(kld):
            # mean over axis 1 (kept), then sum over axis 0
            return np.sum(np.mean(kld, axis=1, keepdims=True), axis=0)

        def _rows_as_fX(values):
            # rebuild the array row by row and convert with to_fX
            return to_fX(np.asarray(list(values)))

        # batch-based estimates of the costs:
        #   step_nlls: mean over axis 1 of the per-step NLL values
        #   step_klds: per-step q2p KL summed over axis 2, averaged over axis 1
        #   kld_q2p / kld_p2q / kld_p2g: each KL averaged over axis 1 and
        #       summed over axis 0
        def raw_cost_computer(XO):
            nll_raw, q2p_raw, p2q_raw, p2g_raw = cost_func(to_fX(XO))
            kld_q2p = _sum_of_batch_means(q2p_raw)
            kld_p2q = _sum_of_batch_means(p2q_raw)
            kld_p2g = _sum_of_batch_means(p2g_raw)
            step_klds = _rows_as_fX(
                np.mean(np.sum(q2p_raw, axis=2, keepdims=True), axis=1))
            step_nlls = _rows_as_fX(np.mean(nll_raw, axis=1))
            return [step_nlls, step_klds, kld_q2p, kld_p2q, kld_p2g]

        return raw_cost_computer

    def _construct_train_joint(self):
        """
        Compile the theano function that performs one joint training step.

        The function takes an input matrix, applies self.joint_updates, and
        returns [joint_cost, nll_bound, nll_cost, kld_cost, reg_cost,
        obs_costs].
        """
        # symbolic input plus the noise/masks that get substituted in
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        substitutions = {self.x_out: xo,
                         self.zi_zmuv: zizmuv,
                         self.p_masks: pmasks,
                         self.q_masks: qmasks}
        # values reported back to the caller after each update
        reported = [self.joint_cost, self.nll_bound, self.nll_cost,
                    self.kld_cost, self.reg_cost, self.obs_costs]
        return theano.function(inputs=[xo],
                               outputs=reported,
                               givens=substitutions,
                               updates=self.joint_updates,
                               on_unused_input='ignore')

    def _construct_sequence_sampler(self):
        """
        Construct a function for sampling state/mask trajectories.

        The compiled theano function applies only self.scan_updates, like the
        other diagnostic functions of this class. Using self.joint_updates
        here would also apply the optimizer's parameter updates on every
        sampling call.

        Returns:
            sample_func(XO, use_guide_policy=False), which returns
            [xm_seq, xi_seq, mi_seq]. Each is a floatX array of shape
            (self.total_steps + 1, XO.shape[0], XO.shape[1]): xi_seq holds
            the generated values, mi_seq the masks, and xm_seq the values of
            XO where the mask is 1 and the generated values elsewhere.
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # collect the outputs to return from this function: the initial
        # state/mask followed by one entry per step
        states = [self._from_si_to_x(self.s0_full)] + \
                 [self._from_si_to_x(self.si[i]) for i in range(self.total_steps)]
        masks = [self.m0_full] + \
                [self.mi_p[i] for i in range(self.total_steps)]
        outputs = states + masks
        # compile the theano function (sampling only: no parameter updates)
        func = theano.function(inputs=[ xo ], \
                outputs=outputs, \
                givens={self.x_out: xo, \
                        self.zi_zmuv: zizmuv, \
                        self.p_masks: pmasks, \
                        self.q_masks: qmasks}, \
                updates=self.scan_updates, \
                on_unused_input='ignore')

        # visualize trajectories generated by the model
        def sample_func(XO, use_guide_policy=False):
            # remember the current mode so it can be restored afterwards
            old_switch = self.train_switch.get_value(borrow=False)
            try:
                # 1.0 -> sample from the guide policy,
                # 0.0 -> sample from the primary policy
                self.set_train_switch(
                    switch_val=(1.0 if use_guide_policy else 0.0))
                # get belief states and masks generated by the scan loop
                scan_vals = func(to_fX(XO))
            finally:
                # always put the model back into its previous mode, even if
                # sampling raised
                self.set_train_switch(switch_val=old_switch)
            step_count = self.total_steps + 1
            seq_shape = (step_count, XO.shape[0], XO.shape[1])
            xm_seq = np.zeros(seq_shape).astype(theano.config.floatX)
            xi_seq = np.zeros(seq_shape).astype(theano.config.floatX)
            mi_seq = np.zeros(seq_shape).astype(theano.config.floatX)
            for i in range(step_count):
                # outputs are laid out as all states followed by all masks
                _xi = scan_vals[i]
                _mi = scan_vals[i + step_count]
                # keep the observed value where the mask is on, the
                # generated value elsewhere
                _xm = (_mi * XO) + ((1.0 - _mi) * _xi)
                xm_seq[i, :, :] = _xm
                xi_seq[i, :, :] = _xi
                mi_seq[i, :, :] = _mi
            return [xm_seq, xi_seq, mi_seq]

        return sample_func

    def save_to_file(self, f_name=None):
        """
        Dump important stuff to a Python pickle, so that we can reload this
        model later.
        """
        assert (not (f_name is None))
        f_handle = file(f_name, 'wb')
        # dump the dict self.params, which just holds "simple" python values
        cPickle.dump(self.params, f_handle, protocol=-1)
        # make a copy of self.shared_param_dicts, with numpy arrays in place
        # of the theano shared variables
        numpy_param_dicts = {}
        for key in self.shared_param_dicts:
            numpy_ary = self.shared_param_dicts[key].get_value(borrow=False)
            numpy_param_dicts[key] = numpy_ary
        # dump the numpy version of self.shared_param_dicts to pickle file
        cPickle.dump(numpy_param_dicts, f_handle, protocol=-1)
        # get numpy dicts for each of the "child" models that we must save
        child_model_dicts = {}
        child_model_dicts['p_zi_given_xi'] = self.p_zi_given_xi.save_to_dict()
        child_model_dicts[
            'p_sip1_given_zi'] = self.p_sip1_given_zi.save_to_dict()
        child_model_dicts['p_x_given_si'] = self.p_x_given_si.save_to_dict()
        child_model_dicts['q_zi_given_xi'] = self.q_zi_given_xi.save_to_dict()
        # dump the numpy child model dicts to the pickle file
        cPickle.dump(child_model_dicts, f_handle, protocol=-1)
        f_handle.close()
        return
Esempio n. 23
0
class InfNet(object):
    """
    A net that tries to infer an approximate posterior for some observation,
    given some deep, directed generative model. The output of this network
    comprises two constructs: an approximate mean vector and an approximate
    standard deviation vector (i.e. diagonal matrix) for a Gaussian posterior.

    Parameters:
        rng: a numpy.random RandomState object
        Xd: symbolic input matrix for inputs
        params: a dict of parameters describing the desired network:
            vis_drop: drop rate to use on observable variables
            hid_drop: drop rate to use on hidden layer activations
                -- note: vis_drop/hid_drop are optional, with defaults 0.0/0.0
            input_noise: standard dev for noise on the input of this net
            bias_noise: standard dev for noise on the biases of hidden layers
            shared_config: list of "layer descriptions" for shared part
            mu_config: list of "layer descriptions" for mu part
            sigma_config: list of "layer descriptions" for sigma part
            activation: "function handle" for the desired non-linearity
            init_scale: scaling factor for hidden layer weights (__ * 0.01)
        shared_param_dicts: parameters for the MLP controlled by this InfNet
    """
    def __init__(self,
            rng=None,
            Xd=None,
            params=None,
            shared_param_dicts=None):
        """
        Build the shared, mu and sigma layer stacks and wire up the outputs.

        Parameters:
            rng: a numpy.random RandomState object; it seeds this network's
                 random stream and is handed to every HiddenLayer
            Xd: symbolic input matrix for inputs
            params: dict describing the network; 'shared_config', 'mu_config'
                    and 'sigma_config' are required, every other key falls
                    back to a default when absent
            shared_param_dicts: None to create fresh layer parameters, or the
                    shared_param_dicts of another InfNet to build a clone
                    that reuses those parameters
        """
        # Setup a shared random generator for this network
        self.rng = RandStream(rng.randint(1000000))
        # Grab the symbolic input matrix
        self.Xd = Xd
        #####################################################
        # Process user-supplied parameters for this network #
        #####################################################
        self.params = params
        self.build_theano_funcs = params.get('build_theano_funcs', True)
        self.vis_drop = params.get('vis_drop', 0.0)
        self.hid_drop = params.get('hid_drop', 0.0)
        self.input_noise = params.get('input_noise', 0.0)
        self.bias_noise = params.get('bias_noise', 0.0)
        self.init_scale = params.get('init_scale', 1.0)
        self.sigma_init_scale = params.get('sigma_init_scale', 1.0)
        # Check if the params for this net were given a priori. This option
        # will be used for creating "clones" of an inference network, with all
        # of the network parameters shared between clones.
        if shared_param_dicts is None:
            # This is not a clone, and we will need to make a dict for
            # referring to the parameters of each network layer
            self.shared_param_dicts = {'shared': [], 'mu': [], 'sigma': []}
            self.is_clone = False
        else:
            # This is a clone, and its layer parameters can be found by
            # referring to the given param dict (i.e. shared_param_dicts).
            self.shared_param_dicts = shared_param_dicts
            self.is_clone = True
        # Get the configuration/prototype for this network. The config is a
        # list of layer descriptions, including a description for the input
        # layer, which is typically just the dimension of the inputs. So, the
        # depth of the mlp is one less than the number of layer configs.
        self.shared_config = params['shared_config']
        self.mu_config = params['mu_config']
        self.sigma_config = params['sigma_config']
        if 'activation' in params:
            self.activation = params['activation']
        else:
            self.activation = relu_actfun
        #########################################
        # Initialize the shared part of network #
        #########################################
        # only the shared stack sees the raw input, so only its first layer
        # gets the "visible" dropout / input noise settings
        self.shared_layers = self._build_layer_group(
                rng, 'shared', "share_layer_{0:d}", self.shared_config,
                self.Xd, noisy_first_layer=True)
        #####################################
        # Initialize the mu part of network #
        #####################################
        # Take input from the output of the shared network
        self.mu_layers = self._build_layer_group(
                rng, 'mu', "mu_layer_{0:d}", self.mu_config,
                self.shared_layers[-1].output)
        ########################################
        # Initialize the sigma part of network #
        ########################################
        # Take input from the output of the shared network; in-bound weights
        # for the final logvar predictions are scaled to 0
        self.sigma_layers = self._build_layer_group(
                rng, 'sigma', "sigma_layer_{0:d}", self.sigma_config,
                self.shared_layers[-1].output, zero_last_scale=True)

        # Create a shared parameter for rescaling posterior "sigmas" to allow
        # control over the velocity of the markov chain generated by repeated
        # cycling through the INF -> GEN loop.
        if 'sigma_scale' not in self.shared_param_dicts['sigma'][-1]:
            # we use a hack-ish check to remain compatible with loading models
            # that were saved before the addition of the sigma_scale param.
            zero_ary = to_fX(np.zeros((1,)))
            self.sigma_scale = theano.shared(value=zero_ary)
            new_dict = {'sigma_scale': self.sigma_scale}
            self.shared_param_dicts['sigma'].append(new_dict)
            self.set_sigma_scale(1.0)
        else:
            # this is a clone of some other InfNet, and that InfNet was made
            # after adding the sigma_scale param, so use its sigma_scale
            self.sigma_scale = \
                    self.shared_param_dicts['sigma'][-1]['sigma_scale']

        # Mash all the parameters together, into a list.
        self.mlp_params = []
        for layer_group in (self.shared_layers, self.mu_layers,
                            self.sigma_layers):
            for layer in layer_group:
                self.mlp_params.extend(layer.params)

        # The output of this inference network is given by the noisy output
        # of the final layers of its mu and sigma networks.
        self.output_mean, self.output_logvar, self.output_samples = \
                self.apply(Xd)
        self.output = self.output_samples
        self.out_dim = self.sigma_layers[-1].out_dim
        # Construct a theano function for sampling from the approximate
        # posteriors inferred by this model for some collection of points
        # in the "data space".
        if self.build_theano_funcs:
            self.sample_posterior = self._construct_sample_posterior()
            self.mean_posterior = theano.function([self.Xd],
                    outputs=self.output_mean)
        else:
            self.sample_posterior = None
            self.mean_posterior = None

        ########################################################
        # CONSTRUCT FUNCTIONS FOR RICA PRETRAINING INPUT LAYER #
        ########################################################
        # the compiled function is built lazily by train_rica()
        self.rica_func = None
        self.W_rica = self.shared_layers[0].W
        return

    def _build_layer_group(self, rng, group, name_fmt, config, group_input,
                           noisy_first_layer=False, zero_last_scale=False):
        """
        Build one chain of HiddenLayers (the shared, mu or sigma stack).

        Parameters:
            rng: random generator handed to each HiddenLayer
            group: key into self.shared_param_dicts ('shared'/'mu'/'sigma')
            name_fmt: format string producing a layer name from its index
            config: list of layer descriptions; consecutive entries give the
                    (input, output) description of each layer
            group_input: symbolic input fed to the first layer of the chain
            noisy_first_layer: if True, the first layer uses self.vis_drop
                    and self.input_noise with no bias noise
            zero_last_scale: if True, the last layer gets W_scale multiplied
                    by 0.0
        Returns:
            list of the constructed layers, in order
        """
        layers = []
        # zip() is a lazy iterator on Python 3 and has no len(), so
        # materialize the pairs before counting them
        layer_def_pairs = list(zip(config[:-1], config[1:]))
        last_idx = len(layer_def_pairs) - 1
        next_input = group_input
        for layer_num, (in_def, out_def) in enumerate(layer_def_pairs):
            l_name = name_fmt.format(layer_num)
            if type(in_def) in (list, tuple):
                # Receiving input from a poolish layer...
                in_dim = in_def[0]
            else:
                # Receiving input from a normal layer...
                in_dim = in_def
            if type(out_def) in (list, tuple):
                # Applying some sort of pooling in this layer...
                out_dim = out_def[0]
                pool_size = out_def[1]
            else:
                # Not applying any pooling in this layer...
                out_dim = out_def
                pool_size = 0
            # Select the appropriate noise to add to this layer
            if noisy_first_layer and (layer_num == 0):
                d_rate = self.vis_drop
                i_noise = self.input_noise
                b_noise = 0.0
            else:
                d_rate = self.hid_drop
                i_noise = 0.0
                b_noise = self.bias_noise
            # set in-bound weights to have norm self.init_scale
            i_scale = self.init_scale
            if zero_last_scale and (layer_num == last_idx):
                # set in-bound weights for logvar predictions to 0
                i_scale = 0.0 * i_scale
            layer_args = dict(rng=rng, input=next_input,
                    activation=self.activation, pool_size=pool_size,
                    drop_rate=d_rate, input_noise=i_noise,
                    bias_noise=b_noise, in_dim=in_dim, out_dim=out_dim,
                    name=l_name, W_scale=i_scale)
            if self.is_clone:
                # Initialize a layer with some shared parameters
                init_params = self.shared_param_dicts[group][layer_num]
                layer_args.update(W=init_params['W'], b=init_params['b'],
                        b_in=init_params['b_in'], s_in=init_params['s_in'])
            new_layer = HiddenLayer(**layer_args)
            layers.append(new_layer)
            if not self.is_clone:
                # Remember the freshly created parameters of this layer
                self.shared_param_dicts[group].append(
                        new_layer.shared_param_dicts)
            next_input = new_layer.output
        return layers

    def apply(self, X, do_samples=True):
        """
        Pass input X through this InfNet and get the resulting Gaussian
        conditional distribution.
        """
        # pass activations through the shared layers
        shared_acts = [X]
        for layer in self.shared_layers:
            r0, r1, layer_acts = layer.apply(shared_acts[-1])
            shared_acts.append(layer_acts)
        # pass activations through the mean estimating layers
        mu_acts = [shared_acts[-1]]
        for layer in self.mu_layers:
            r0, r1, layer_acts = layer.apply(mu_acts[-1])
            mu_acts.append(layer_acts)
        layer_acts, r0, r1 = self.mu_layers[-1].apply(mu_acts[-2])
        mu_acts[-1] = layer_acts # use linear output at last layer
        # pass activations through the logvar estimating layers
        sigma_acts = [shared_acts[-1]]
        for layer in self.sigma_layers:
            r0, r1, layer_acts = layer.apply(sigma_acts[-1])
            sigma_acts.append(layer_acts)
        layer_acts, r0, r1 = self.sigma_layers[-1].apply(sigma_acts[-2])
        sigma_acts[-1] = layer_acts # use linear output at last layer

        # construct the outputs we will want to access
        output_mean = mu_acts[-1]
        output_logvar = sigma_acts[-1]

        # wrap them up for easy returnage
        result = [output_mean, output_logvar]
        if do_samples:
            output_samples = output_mean + \
                    ( (self.sigma_scale[0] * T.exp(0.5*output_logvar)) * \
                    self.rng.normal(size=output_mean.shape, avg=0.0, std=1.0, \
                    dtype=theano.config.floatX) )
            result.append(output_samples)
        return result

    def apply_shared(self, X):
        """
        Pass input X through this InfNet's shared layers.
        """
        # pass activations through the shared layers
        shared_acts = [X]
        for layer in self.shared_layers:
            r0, r1, layer_acts = layer.apply(shared_acts[-1])
            shared_acts.append(layer_acts)
        result = shared_acts[-1]
        return result

    def train_rica(self, X, lr, lam):
        """
        CONSTRUCT FUNCTIONS FOR RICA PRETRAINING INPUT LAYER
        """
        if self.rica_func is None:
            l_rate = T.scalar()
            lam_l1 = T.scalar()
            X_in = T.matrix('in_X_in')
            W_in = self.W_rica + self.rng.normal(size=self.W_rica.shape, \
                avg=0.0, std=0.01, dtype=theano.config.floatX)
            X_enc = X_in
            H_rec = T.dot(X_enc, W_in)
            X_rec = T.dot(H_rec, W_in.T)
            recon_cost = T.sum((X_enc - X_rec)**2.0) / X_enc.shape[0]
            spars_cost = lam_l1 * (T.sum(soft_abs(H_rec)) / H_rec.shape[0])
            rica_cost = recon_cost + spars_cost
            dW = T.grad(rica_cost, self.W_rica)
            rica_updates = {self.W_rica: self.W_rica - (l_rate * dW)}
            rica_outputs = [rica_cost, recon_cost, spars_cost]
            self.rica_func = theano.function([X_in, l_rate, lam_l1], \
                    outputs=rica_outputs, \
                    updates=rica_updates)
        outputs = self.rica_func(X, lr, lam)
        return outputs

    def set_sigma_scale(self, sigma_scale=1.0):
        """
        Store a new value in the posterior sigma rescaling shared parameter.

        The value is packed into a length-1 array and converted with to_fX
        before being written to self.sigma_scale.
        """
        scale_ary = np.zeros((1,)) + sigma_scale
        self.sigma_scale.set_value(to_fX(scale_ary))
        return

    def set_bias_noise(self, bias_noise=0.0):
        """
        Write the given bias noise level into every layer of this network.

        The value is packed into a length-1 array, converted with to_fX, and
        passed to layer.bias_noise.set_value() for each shared, mu and sigma
        layer.
        """
        noise_val = to_fX(np.zeros((1,)) + bias_noise)
        layer_groups = (self.shared_layers, self.mu_layers, self.sigma_layers)
        for layer_group in layer_groups:
            for layer in layer_group:
                layer.bias_noise.set_value(noise_val)
        return

    def _construct_sample_posterior(self):
        """
        Compile and return a theano function that takes a value for self.Xd
        and evaluates self.output, i.e. draws a single sample from the
        inferred posterior for the given inputs.
        """
        return theano.function([self.Xd], outputs=self.output)

    def init_biases(self, b_init=0.0, b_std=1e-2):
        """
        Reset hidden layer biases to b_init plus Gaussian jitter.

        Every shared layer is touched, along with all but the last layer of
        the mu and sigma stacks. Each new bias vector is b_init plus
        b_std * npr.randn(...) of matching shape.
        """
        layer_groups = (self.shared_layers,
                        self.mu_layers[:-1],
                        self.sigma_layers[:-1])
        for layer_group in layer_groups:
            for layer in layer_group:
                # zero out the current biases, then shift to b_init
                base = (0.0 * layer.b.get_value(borrow=False)) + b_init
                jitter = b_std * npr.randn(*base.shape)
                layer.b.set_value(to_fX(base + jitter))
        return

    def shared_param_clone(self, rng=None, Xd=None):
        """
        Build a new InfNet on the symbolic input Xd that reuses this
        network's params and shared_param_dicts, so both networks refer to
        the same parameters.

        Useful for "unrolling" a generate->infer->generate->infer... loop,
        e.g. for backprop through time on various objectives.
        """
        return InfNet(rng=rng, Xd=Xd, params=self.params,
                shared_param_dicts=self.shared_param_dicts)

    def forked_param_clone(self, rng=None, Xd=None):
        """
        Build a new InfNet on the symbolic input Xd whose parameters start
        out as copies of this network's current parameter values, held in
        freshly created shared variables.
        """
        # shared param dicts is nested like: dict of list of dicts
        # i.e., spd[k] is a list and spd[k][i] is a dict
        forked_spds = {}
        for group_key, layer_dicts in self.shared_param_dicts.items():
            forked_spds[group_key] = [
                {p_name: theano.shared(value=p_var.get_value(borrow=False))
                 for p_name, p_var in layer_dict.items()}
                for layer_dict in layer_dicts]
        return InfNet(rng=rng, Xd=Xd, params=self.params,
                shared_param_dicts=forked_spds)

    def save_to_file(self, f_name=None):
        """
        Dump important stuff to a Python pickle, so that we can reload this
        model later. We'll pickle everything required to create a clone of
        this model given the pickle and the rng/Xd params to the cloning
        function: "InfNet.shared_param_clone()".
        """
        assert(not (f_name is None))
        f_handle = file(f_name, 'wb')
        # dump the dict self.params, which just holds "simple" python values
        cPickle.dump(self.params, f_handle, protocol=-1)
        # make a copy of self.shared_param_dicts, with numpy arrays in place
        # of the theano shared variables
        numpy_param_dicts = {'shared': [], 'mu': [], 'sigma': []}
        for layer_group in ['shared', 'mu', 'sigma']:
            for shared_dict in self.shared_param_dicts[layer_group]:
                numpy_dict = {}
                for key in shared_dict:
                    numpy_dict[key] = shared_dict[key].get_value(borrow=False)
                numpy_param_dicts[layer_group].append(numpy_dict)
        # dump the numpy version of self.shared_param_dicts
        cPickle.dump(numpy_param_dicts, f_handle, protocol=-1)
        f_handle.close()
        return

    def save_to_dict(self):
        """
        Dump important stuff to a dict that can reboot the model.
        """
        model_dict = {}
        # dump the dict self.params, which just holds "simple" python values
        model_dict['params'] = self.params
        # make a copy of self.shared_param_dicts, with numpy arrays in place
        # of the theano shared variables
        numpy_param_dicts = {'shared': [], 'mu': [], 'sigma': []}
        for layer_group in ['shared', 'mu', 'sigma']:
            for shared_dict in self.shared_param_dicts[layer_group]:
                numpy_dict = {}
                for key in shared_dict:
                    numpy_dict[key] = shared_dict[key].get_value(borrow=False)
                numpy_param_dicts[layer_group].append(numpy_dict)
        # dump the numpy version of self.shared_param_dicts
        model_dict['numpy_param_dicts'] = numpy_param_dicts
        return model_dict
Esempio n. 24
0
class HiddenLayer(object):
    """
    A single fully-connected layer with optional input noise, input dropout,
    pre-activation noise and pooling over groups of filters.

    Parameters:
        rng: numpy random generator; seeds this layer's random stream and
             draws the initial weights
        input: symbolic input to this layer
        in_dim: input dimension (number of rows of W)
        out_dim: number of output units; when pool_size > 1 the layer
                 allocates out_dim * pool_size filters
        activation: non-linearity applied to the noisy pre-activation; when
                    not given, relu_actfun is used for pool_size <= 1 and
                    maxout_actfun otherwise
        pool_size: number of filters per pooling group (<= 1: no pooling)
        drop_rate: probability of dropping each input element; only applied
                   when > 1e-4
        input_noise: multiplier on the random normal noise added to the
                     input; only applied when > 1e-4
        bias_noise: multiplier on the random normal noise added to the
                    pre-activations (always added, even when 0)
        W: shared weight variable to use instead of a fresh random one
        b: shared bias variable to use instead of a fresh zero one
        use_bias: whether b enters the pre-activation and self.params
        name: prefix for the names of newly created shared variables
    """
    def __init__(self, rng, input, in_dim, out_dim, \
                 activation=None, pool_size=0, \
                 drop_rate=0., input_noise=0., bias_noise=0., \
                 W=None, b=None, \
                 use_bias=True, name=""):

        # Setup a shared random generator for this layer
        self.srng = CURAND_RandomStreams(rng.randint(1000000))

        self.clean_input = input

        # Add gaussian noise to the input (if desired)
        if (input_noise > 1e-4):
            self.fuzzy_input = input + \
                    (input_noise * self.srng.normal(size=input.shape, \
                    dtype=theano.config.floatX))
        else:
            self.fuzzy_input = input

        # Apply masking noise to the input (if desired)
        if (drop_rate > 1e-4):
            self.noisy_input = self._drop_from_input(self.fuzzy_input, drop_rate)
        else:
            self.noisy_input = self.fuzzy_input

        # Set some basic layer properties
        self.pool_size = pool_size
        self.in_dim = in_dim
        self.out_dim = out_dim
        if self.pool_size <= 1:
            self.filt_count = self.out_dim
        else:
            self.filt_count = self.out_dim * self.pool_size
        # Floor division keeps pool_count an int on Python 3 as well; with
        # true division it would become a float and break range() below.
        self.pool_count = self.filt_count // max(self.pool_size, 1)
        if activation:
            self.activation = activation
        else:
            if self.pool_size <= 1:
                self.activation = lambda x: relu_actfun(x)
            else:
                self.activation = lambda x: \
                        maxout_actfun(x, self.pool_size, self.filt_count)

        # Get some random initial weights and biases, if not given
        if W is None:
            if self.pool_size <= 1:
                # Generate random initial filters in a typical way
                W_init = np.asarray(0.04 * rng.standard_normal( \
                          size=(self.in_dim, self.filt_count)), \
                          dtype=theano.config.floatX)
            else:
                # Generate groups of random filters to pool over such that
                # intra-group correlations are stronger than inter-group
                # correlations, to encourage pooling over similar filters...
                filters = []
                for g_num in range(self.pool_count):
                    # one base filter per group...
                    g_filt = 0.01 * rng.standard_normal(size=(self.in_dim,1))
                    for f_num in range(self.pool_size):
                        # ...and each member is the base plus a perturbation
                        f_filt = g_filt + (0.005 * rng.standard_normal( \
                                size=(self.in_dim,1)))
                        filters.append(f_filt)
                W_init = np.hstack(filters).astype(theano.config.floatX)

            W = theano.shared(value=W_init, name="{0:s}_W".format(name))
        if b is None:
            b_init = np.zeros((self.filt_count,), dtype=theano.config.floatX)
            b = theano.shared(value=b_init, name="{0:s}_b".format(name))

        # Set layer weights and biases
        self.W = W
        self.b = b

        # Compute linear "pre-activation" for this layer
        if use_bias:
            self.linear_output = T.dot(self.noisy_input, self.W) + self.b
        else:
            self.linear_output = T.dot(self.noisy_input, self.W)

        # Add noise to the pre-activation features (if desired)
        self.noisy_linear = self.linear_output  + \
                (bias_noise * self.srng.normal(size=self.linear_output.shape, \
                dtype=theano.config.floatX))

        # Apply activation function
        self.output = self.activation(self.noisy_linear)

        # Compute some properties of the activations, probably to regularize
        self.act_l2_sum = T.sum(self.output**2.) / self.output.size
        self.row_l1_sum = T.sum(abs(row_normalize(self.output))) / \
                self.output.shape[0]
        self.col_l1_sum = T.sum(abs(col_normalize(self.output))) / \
                self.output.shape[1]

        # Conveniently package layer parameters
        if use_bias:
            self.params = [self.W, self.b]
        else:
            self.params = [self.W]
        # Layer construction complete...
        return

    def _drop_from_input(self, input, p):
        """
        Return input with elements dropped with probability p and the rest
        rescaled by 1 / (1 - p).
        """
        # build the drop mask by thresholding uniform noise, so that an
        # element is kept when its noise value exceeds p
        noise_rnd = self.srng.uniform(input.shape, low=0.0, high=1.0, \
            dtype=theano.config.floatX)
        drop_mask = noise_rnd > p
        # get a scaling factor to keep expectations fixed after droppage
        drop_scale = 1. / (1. - p)
        # apply dropout mask and rescaling factor to the input
        droppy_input = drop_scale * input * drop_mask
        return droppy_input

    def _noisy_params(self, P, noise_lvl=0.):
        """
        Noisy weights, like convolving energy surface with a gaussian.

        Returns P plus random normal noise of the same shape, drawn with
        avg=0.0 and std=noise_lvl.
        """
        P_nz = P + self.srng.normal(size=P.shape, avg=0.0, std=noise_lvl, \
                dtype=theano.config.floatX)
        return P_nz
Esempio n. 25
0
class InfNet(object):
    """
    A net that tries to infer an approximate posterior for some observation,
    given some deep, directed generative model. The output of this network
    comprises two constructs: an approximate mean vector and an approximate
    standard deviation vector (i.e. diagonal matrix) for a Gaussian posterior.

    The network is made of three stacks of HiddenLayer objects: a "shared"
    stack that reads the (optionally encoded) input, and a "mu" stack and a
    "sigma" stack that both read the output of the shared stack.

    Parameters:
        rng: a numpy.random RandomState object
        Xd: symbolic input matrix for inputs
        prior_sigma: standard deviation of isotropic Gaussian prior that our
                     inferred posteriors will be penalized for deviating from.
        params: a dict of parameters describing the desired network:
            lam_l2a: L2 regularization weight on neuron activations (required)
            vis_drop: drop rate to use on observable variables
            hid_drop: drop rate to use on hidden layer activations
                -- note: vis_drop/hid_drop are optional, with defaults 0.0/0.0
            input_noise: standard dev for noise on the input of this net
            bias_noise: standard dev for noise on the biases of hidden layers
                -- note: input_noise/bias_noise are optional, default 0.0
            shared_config: list of "layer descriptions" for shared part
            mu_config: list of "layer descriptions" for mu part
            sigma_config: list of "layer descriptions" for sigma part
                -- note: a layer description is either a dimension, or a
                   list/tuple whose first entry is the dimension and whose
                   second entry is a pool size.
            activation: "function handle" for the desired non-linearity
                        (optional, defaults to relu_actfun)
            init_scale: scaling factor for hidden layer weights (__ * 0.01),
                        passed to each HiddenLayer as W_scale (default 1.0)
            sigma_init_scale: constant multiplier applied to the posterior
                              sigmas in self.output_sigma (default 1.0)
            kld2_scale: stored as self.kld2_scale (default 0.0); it is not
                        used anywhere inside this class
            build_theano_funcs: whether to compile the sample_posterior and
                                mean_posterior functions (default True)
            encoder: a function that will be applied to inputs prior to
                     passing them through the network. this can be used for
                     in-lining, e.g., PCA preprocessing on training data
            decoder: must be present whenever encoder is present; it is
                     stored as self.decoder
        shared_param_dicts: parameters for the MLP controlled by this InfNet.
                            When given, this net is built as a "clone" that
                            reuses those parameters.
    """
    def __init__(self, \
            rng=None, \
            Xd=None, \
            prior_sigma=None, \
            params=None, \
            shared_param_dicts=None):
        # Setup a shared random generator for this network
        self.rng = RandStream(rng.randint(1000000))
        # Grab the symbolic input matrix
        self.Xd = Xd
        self.prior_sigma = prior_sigma
        #####################################################
        # Process user-supplied parameters for this network #
        #####################################################
        self.params = params
        self.lam_l2a = params['lam_l2a']
        self.build_theano_funcs = params.get('build_theano_funcs', True)
        self.vis_drop = params.get('vis_drop', 0.0)
        self.hid_drop = params.get('hid_drop', 0.0)
        self.input_noise = params.get('input_noise', 0.0)
        self.bias_noise = params.get('bias_noise', 0.0)
        self.init_scale = params.get('init_scale', 1.0)
        if 'encoder' in params:
            self.encoder = params['encoder']
            self.decoder = params['decoder']
            self.use_encoder = True
        else:
            # no encoder given, so fall back to identity transforms
            self.encoder = lambda x: x
            self.decoder = lambda x: x
            self.use_encoder = False
        self.Xd_encoded = self.encoder(self.Xd)
        self.kld2_scale = params.get('kld2_scale', 0.0)
        self.sigma_init_scale = params.get('sigma_init_scale', 1.0)
        # Check if the params for this net were given a priori. This option
        # will be used for creating "clones" of an inference network, with all
        # of the network parameters shared between clones.
        if shared_param_dicts is None:
            # This is not a clone, and we will need to make a dict for
            # referring to the parameters of each network layer
            self.shared_param_dicts = {'shared': [], 'mu': [], 'sigma': []}
            self.is_clone = False
        else:
            # This is a clone, and its layer parameters can be found by
            # referring to the given param dict (i.e. shared_param_dicts).
            self.shared_param_dicts = shared_param_dicts
            self.is_clone = True
        # Get the configuration/prototype for this network. The config is a
        # list of layer descriptions, including a description for the input
        # layer, which is typically just the dimension of the inputs. So, the
        # depth of the mlp is one less than the number of layer configs.
        self.shared_config = params['shared_config']
        self.mu_config = params['mu_config']
        self.sigma_config = params['sigma_config']
        # (kept as if/else so that relu_actfun is only looked up when needed)
        if 'activation' in params:
            self.activation = params['activation']
        else:
            self.activation = relu_actfun
        #########################################
        # Initialize the shared part of network #
        #########################################
        if self.use_encoder:
            net_input = self.encoder(self.Xd)
        else:
            net_input = self.Xd
        self.shared_layers = self._build_layer_group(rng, 'shared', \
                self.shared_config, net_input, "share_layer_{0:d}", \
                is_input_group=True)
        #####################################
        # Initialize the mu part of network #
        #####################################
        # Take input from the output of the shared network
        self.mu_layers = self._build_layer_group(rng, 'mu', \
                self.mu_config, self.shared_layers[-1].output, \
                "mu_layer_{0:d}")
        ########################################
        # Initialize the sigma part of network #
        ########################################
        # Take input from the output of the shared network. In-bound weights
        # of the final layer (the logvar predictions) are scaled to 0.
        self.sigma_layers = self._build_layer_group(rng, 'sigma', \
                self.sigma_config, self.shared_layers[-1].output, \
                "sigma_layer_{0:d}", zero_last_scale=True)

        # Create a shared parameter for rescaling posterior "sigmas" to allow
        # control over the velocity of the markov chain generated by repeated
        # cycling through the INF -> GEN loop.
        last_sigma_dict = self.shared_param_dicts['sigma'][-1]
        if 'sigma_scale' not in last_sigma_dict:
            # we use a hack-ish check to remain compatible with loading models
            # that were saved before the addition of the sigma_scale param.
            zero_ary = np.zeros((1,)).astype(theano.config.floatX)
            self.sigma_scale = theano.shared(value=zero_ary)
            self.shared_param_dicts['sigma'].append( \
                    {'sigma_scale': self.sigma_scale})
            self.set_sigma_scale(1.0)
        else:
            # this is a clone of some other InfNet, and that InfNet was made
            # after adding the sigma_scale param, so use its sigma_scale
            self.sigma_scale = last_sigma_dict['sigma_scale']

        # Create a shared parameter for maintaining an exponentially decaying
        # estimate of the population mean of posterior KL divergence.
        # (re-read the last dict, since one may have just been appended)
        last_sigma_dict = self.shared_param_dicts['sigma'][-1]
        if 'kld_mean' not in last_sigma_dict:
            # add a kld_mean if none was already present (initial value 100)
            init_ary = np.zeros((1,)).astype(theano.config.floatX) + 100.0
            self.kld_mean = theano.shared(value=init_ary)
            last_sigma_dict['kld_mean'] = self.kld_mean
        else:
            # use a kld_mean that's already present
            self.kld_mean = last_sigma_dict['kld_mean']

        # Mash all the parameters together, into a list.
        self.mlp_params = []
        for layer_group in (self.shared_layers, self.mu_layers, \
                self.sigma_layers):
            for layer in layer_group:
                self.mlp_params.extend(layer.params)

        # The output of this inference network is given by the linear output
        # of the final layers of its mu and sigma networks.
        self.output_mean = self.mu_layers[-1].linear_output
        self.output_logvar = self.sigma_layers[-1].linear_output
        self.output_sigma = self.sigma_init_scale * self.sigma_scale[0] * \
                T.exp(0.5 * self.output_logvar)

        # We'll also construct an output containing a single sample from each
        # of the distributions represented by the rows of self.output_mean and
        # self.output_sigma.
        self.output = self._construct_post_samples()
        self.out_dim = self.sigma_layers[-1].out_dim
        # Get simple regularization penalty to moderate activation dynamics
        self.act_reg_cost = self.lam_l2a * self._act_reg_cost()
        # Construct a function for penalizing KL divergence between the
        # approximate posteriors produced by this model and some isotropic
        # Gaussian distribution.
        self.kld_cost = self._construct_kld_cost()
        self.kld_mean_update = T.cast((0.98 * self.kld_mean) + \
                (0.02 * T.mean(self.kld_cost)), 'floatX')
        # Construct a theano function for sampling from the approximate
        # posteriors inferred by this model for some collection of points
        # in the "data space".
        if self.build_theano_funcs:
            self.sample_posterior = self._construct_sample_posterior()
            self.mean_posterior = theano.function([self.Xd], \
                    outputs=self.output_mean)
        else:
            self.sample_posterior = None
            self.mean_posterior = None
        return

    def _build_layer_group(self, rng, group, config, group_input, \
            name_fmt, is_input_group=False, zero_last_scale=False):
        """
        Build one stack of HiddenLayer objects and return it as a list.

        Parameters:
            rng: the numpy.random RandomState handed on to each HiddenLayer
            group: key into self.shared_param_dicts ('shared'/'mu'/'sigma')
            config: list of layer descriptions for this stack, where the
                    first entry describes the input to the stack
            group_input: symbolic input fed to the first layer of the stack
            name_fmt: format string producing a layer name from its index
            is_input_group: if True, the first layer of the stack uses
                            self.vis_drop and self.input_noise (and no bias
                            noise); every other layer uses self.hid_drop and
                            self.bias_noise.
            zero_last_scale: if True, in-bound weights of the final layer in
                             the stack are given an init scale of 0.

        When this net is not a clone, a dict of each new layer's parameters
        is appended to self.shared_param_dicts[group]. When it is a clone,
        the layer parameters are read from that list instead.
        """
        layers = []
        # materialize the pairs, since we need their count (py3 zip is lazy)
        layer_def_pairs = list(zip(config[:-1], config[1:]))
        next_input = group_input
        for layer_num, (in_def, out_def) in enumerate(layer_def_pairs):
            first_layer = (layer_num == 0)
            last_layer = (layer_num == (len(layer_def_pairs) - 1))
            l_name = name_fmt.format(layer_num)
            if isinstance(in_def, (list, tuple)):
                # Receiving input from a poolish layer...
                in_dim = in_def[0]
            else:
                # Receiving input from a normal layer...
                in_dim = in_def
            if isinstance(out_def, (list, tuple)):
                # Applying some sort of pooling in this layer...
                out_dim = out_def[0]
                pool_size = out_def[1]
            else:
                # Not applying any pooling in this layer...
                out_dim = out_def
                pool_size = 0
            # Select the appropriate noise to add to this layer
            if is_input_group and first_layer:
                d_rate = self.vis_drop
                i_noise = self.input_noise
                b_noise = 0.0
            else:
                d_rate = self.hid_drop
                i_noise = 0.0
                b_noise = self.bias_noise
            # set in-bound weights to have norm self.init_scale
            i_scale = self.init_scale
            if zero_last_scale and last_layer:
                i_scale = 0.0 * i_scale
            # arguments common to both fresh and cloned layers
            layer_args = dict(rng=rng, input=next_input, \
                    activation=self.activation, pool_size=pool_size, \
                    drop_rate=d_rate, input_noise=i_noise, \
                    bias_noise=b_noise, in_dim=in_dim, out_dim=out_dim, \
                    name=l_name, W_scale=i_scale)
            if not self.is_clone:
                ##########################################
                # Initialize a layer with new parameters #
                ##########################################
                new_layer = HiddenLayer(**layer_args)
                layers.append(new_layer)
                self.shared_param_dicts[group].append( \
                        {'W': new_layer.W, 'b': new_layer.b, \
                         'b_in': new_layer.b_in, 's_in': new_layer.s_in})
            else:
                ##################################################
                # Initialize a layer with some shared parameters #
                ##################################################
                init_params = self.shared_param_dicts[group][layer_num]
                if not (('b_in' in init_params) and ('s_in' in init_params)):
                    # param dict lacks b_in/s_in, so let the layer make them
                    init_params['b_in'] = None
                    init_params['s_in'] = None
                new_layer = HiddenLayer(W=init_params['W'], \
                        b=init_params['b'], b_in=init_params['b_in'], \
                        s_in=init_params['s_in'], **layer_args)
                layers.append(new_layer)
                if ((init_params['b_in'] is None) or \
                        (init_params['s_in'] is None)):
                    # record the layer's b_in/s_in, for use by later clones
                    init_params['b_in'] = new_layer.b_in
                    init_params['s_in'] = new_layer.s_in
            next_input = new_layer.output
        return layers

    def set_sigma_scale(self, sigma_scale=1.0):
        """
        Set the posterior sigma rescaling shared parameter to some value.
        """
        new_scale = np.zeros((1,)) + sigma_scale
        self.sigma_scale.set_value(new_scale.astype(theano.config.floatX))
        return

    def _act_reg_cost(self):
        """
        Apply L2 regularization to the activations in each net.

        Returns the sum of layer.act_l2_sum over all shared, mu, and sigma
        layers (in that order).
        """
        act_sq_sums = [layer.act_l2_sum for layer in self.shared_layers]
        act_sq_sums.extend(layer.act_l2_sum for layer in self.mu_layers)
        act_sq_sums.extend(layer.act_l2_sum for layer in self.sigma_layers)
        return T.sum(act_sq_sums)

    def _construct_post_samples(self):
        """
        Draw a single sample from each of the approximate posteriors encoded
        in self.output_mean and self.output_sigma.
        """
        # standard normal noise, one value per entry of self.output_sigma
        zmuv_noise = self.rng.normal(size=self.output_sigma.shape, \
                avg=0.0, std=1.0, dtype=theano.config.floatX)
        post_samples = self.output_mean + (self.output_sigma * zmuv_noise)
        return post_samples

    def _construct_kld_cost(self):
        """
        Compute (analytically) the KL divergence between each approximate
        posterior encoded by self.output_mean/self.output_logvar and the
        isotropic Gaussian distribution with mean 0 and standard deviation
        self.prior_sigma.

        The element-wise values given by gaussian_kld are summed over axis 1
        with keepdims=True.
        """
        prior_mu = 0.0
        prior_logvar = np.log(self.prior_sigma**2.0)
        post_klds = gaussian_kld(self.output_mean, self.output_logvar, \
                prior_mu, prior_logvar)
        kld_cost = T.sum(post_klds, axis=1, keepdims=True)
        return kld_cost

    def _construct_sample_posterior(self):
        """
        Construct a sampler that draws a single sample from the inferred
        posterior for some set of inputs.

        The returned theano function takes values for self.Xd and returns
        the corresponding values of self.output.
        """
        psample = theano.function([self.Xd], \
                outputs=self.output)
        return psample

    def init_biases(self, b_init=0.0, b_std=1e-2):
        """
        Re-initialize the biases of hidden layers to b_init plus noise.

        Each bias vector is set to b_init plus Gaussian noise with standard
        deviation b_std. This is applied to every shared layer, and to all
        but the final layer of the mu and sigma stacks.
        """
        target_layers = list(self.shared_layers) + \
                list(self.mu_layers[:-1]) + list(self.sigma_layers[:-1])
        for layer in target_layers:
            b_vec = (0.0 * layer.b.get_value(borrow=False)) + b_init
            b_vec = b_vec + (b_std * npr.randn(*b_vec.shape))
            layer.b.set_value(b_vec.astype(theano.config.floatX))
        return

    def shared_param_clone(self, rng=None, Xd=None, build_funcs=True):
        """
        Return a clone of this network, with shared parameters but with
        different symbolic input variables.

        This can be used for "unrolling" a generate->infer->generate->infer...
        loop. Then, we can do backprop through time for various objectives.

        Parameters:
            rng: a numpy.random RandomState object for the clone
            Xd: symbolic input matrix for the clone
            build_funcs: whether the clone should compile its theano funcs
        """
        # Work on a shallow copy, so that setting 'build_theano_funcs' for
        # the clone doesn't silently modify the params dict of this network.
        new_params = dict(self.params)
        new_params['build_theano_funcs'] = build_funcs
        clone_net = InfNet(rng=rng, Xd=Xd, \
                prior_sigma=self.prior_sigma, params=new_params, \
                shared_param_dicts=self.shared_param_dicts)
        return clone_net

    def save_to_file(self, f_name=None):
        """
        Dump important stuff to a Python pickle, so that we can reload this
        model later. We'll pickle everything required to create a clone of
        this model given the pickle and the rng/Xd params to the cloning
        function: "InfNet.shared_param_clone()".

        Three objects are dumped to f_name, in this order: self.prior_sigma,
        self.params, and a copy of self.shared_param_dicts in which every
        shared variable is replaced by the value from its get_value().
        """
        assert(not (f_name is None))
        # make a copy of self.shared_param_dicts, with numpy arrays in place
        # of the theano shared variables. This is done before opening the
        # file, so a failure here doesn't leave a truncated file behind.
        numpy_param_dicts = {'shared': [], 'mu': [], 'sigma': []}
        for layer_group in ['shared', 'mu', 'sigma']:
            for shared_dict in self.shared_param_dicts[layer_group]:
                numpy_dict = {}
                for key in shared_dict:
                    numpy_dict[key] = shared_dict[key].get_value(borrow=False)
                numpy_param_dicts[layer_group].append(numpy_dict)
        # the "with" block closes the file even if one of the dumps fails
        with open(f_name, 'wb') as f_handle:
            # dump the "simple" python value in self.prior_sigma
            cPickle.dump(self.prior_sigma, f_handle, protocol=-1)
            # dump the dict self.params, which should hold "simple" values
            cPickle.dump(self.params, f_handle, protocol=-1)
            # dump the numpy version of self.shared_param_dicts
            cPickle.dump(numpy_param_dicts, f_handle, protocol=-1)
        return
class TwoStageModel2(object):
    """
    Controller for training a two-step hierarchical generative model.
      x: the "observation" variables
      z: the "prior" latent variables
      h: the "hidden" latent variables

    Generative model is: p(x) = \sum_{z,h} p(x|h) p(h|z) p(z)
    Variational model is: q(h,z|x) = q(h|x) q(z|h)

    Parameters:
        rng: numpy.random.RandomState (for reproducibility)
        x_in: the input data to encode
        x_out: the target output to decode
        p_h_given_z: InfNet for h given z
        p_x_given_h: InfNet for x given h
        q_h_given_x: InfNet for h given x
        q_z_given_h: InfNet for z given h
        x_dim: dimension of the "observation" space
        z_dim: dimension of the "prior" latent space
        h_dim: dimension of the "hidden" latent space
        params: REQUIRED PARAMS SHOWN BELOW
                x_type: can be "bernoulli" or "gaussian"
                obs_transform: can be 'none' or 'sigmoid'
    """
    def __init__(self, rng=None, \
            x_in=None, x_out=None, \
            p_h_given_z=None, \
            p_x_given_h=None, \
            q_h_given_x=None, \
            q_z_given_h=None, \
            x_dim=None, \
            z_dim=None, \
            h_dim=None, \
            params=None, \
            shared_param_dicts=None):
        # setup a rng for this GIPair
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_type = self.params['x_type']
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        if 'obs_transform' in self.params:
            assert((self.params['obs_transform'] == 'sigmoid') or \
                    (self.params['obs_transform'] == 'none'))
            if self.params['obs_transform'] == 'sigmoid':
                self.obs_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.obs_transform = lambda x: x
        else:
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        if self.x_type == 'bernoulli':
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        self.shared_param_dicts = shared_param_dicts

        # record the dimensions of various spaces relevant to this model
        self.x_dim = x_dim
        self.z_dim = z_dim
        self.h_dim = h_dim

        # grab handles to the relevant InfNets
        self.q_h_given_x = q_h_given_x
        self.q_z_given_h = q_z_given_h
        self.p_h_given_z = p_h_given_z
        self.p_x_given_h = p_x_given_h

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this MultiStageModel
        self.x_in = x_in
        self.x_out = x_out

        # setup switching variable for changing between sampling/training
        zero_ary = to_fX( np.zeros((1,)) )
        self.train_switch = theano.shared(value=zero_ary, name='tsm_train_switch')
        self.set_train_switch(1.0)

        if self.shared_param_dicts is None:
            # initialize "optimizable" parameters specific to this TSM
            init_vec = to_fX( np.zeros((1,self.z_dim)) )
            self.p_z_mean = theano.shared(value=init_vec, name='tsm_p_z_mean')
            self.p_z_logvar = theano.shared(value=init_vec, name='tsm_p_z_logvar')
            self.obs_logvar = theano.shared(value=zero_ary, name='tsm_obs_logvar')
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)
            self.shared_param_dicts = {}
            self.shared_param_dicts['p_z_mean'] = self.p_z_mean
            self.shared_param_dicts['p_z_logvar'] = self.p_z_logvar
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            self.p_z_mean = self.shared_param_dicts['p_z_mean']
            self.p_z_logvar = self.shared_param_dicts['p_z_logvar']
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)

        ##############################################
        # Setup the TwoStageModels main computation. #
        ##############################################
        print("Building TSM...")
        # samples of "hidden" latent state (from q)
        h_q_mean, h_q_logvar, h_q = \
                self.q_h_given_x.apply(self.x_in, do_samples=True)
        # samples of "prior" latent state (from q)
        z_q_mean, z_q_logvar, z_q = \
                self.q_z_given_h.apply(h_q, do_samples=True)
        # samples of "prior" latent state (from p)
        z_p_mean = self.p_z_mean.repeat(z_q.shape[0], axis=0)
        z_p_logvar = self.p_z_logvar.repeat(z_q.shape[0], axis=0)
        zmuv = self.rng.normal(size=z_q.shape, avg=0.0, std=1.0, \
                               dtype=theano.config.floatX)
        z_p = (T.exp(0.5*z_p_logvar) * zmuv) + z_p_mean
        # samples from z -- switched between q/p
        self.z = (self.train_switch[0] * z_q) + \
                 ((1.0 - self.train_switch[0]) * z_p)
        # samples of "hidden" latent state (from p)
        h_p_mean, h_p_logvar, h_p = \
                self.p_h_given_z.apply(self.z, do_samples=True)
        # samples from h -- switched between q/p
        self.h = (self.train_switch[0] * h_q) + \
                 ((1.0 - self.train_switch[0]) * h_p)
        # compute KLds for "prior" and "hidden" latent distributions
        self.kld_z_q2p = gaussian_kld(z_q_mean, z_q_logvar, \
                                      z_p_mean, z_p_logvar)
        self.kld_z_p2q = gaussian_kld(z_p_mean, z_p_logvar, \
                                      z_q_mean, z_q_logvar)
        self.kld_h_q2p = gaussian_kld(h_q_mean, h_q_logvar, \
                                      h_p_mean, h_p_logvar)
        self.kld_h_p2q = gaussian_kld(h_p_mean, h_p_logvar, \
                                      h_q_mean, h_q_logvar)

        # p_x_given_h generates an observation x conditioned on the "hidden"
        # latent variables h.
        self.x_gen, _ = self.p_x_given_h.apply(self.h, do_samples=False)


        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX( np.zeros((1,)) )
        self.lr = theano.shared(value=zero_ary, name='tsm_lr')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='tsm_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='tsm_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='tsm_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_q2p = theano.shared(value=zero_ary, name='tsm_lam_kld_q2p')
        self.lam_kld_p2q = theano.shared(value=zero_ary, name='tsm_lam_kld_p2q')
        self.set_lam_kld(lam_kld_q2p=1.0, lam_kld_p2q=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='tsm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # get optimizable parameters belonging to the TwoStageModel
        self_params = [self.obs_logvar] #+ [self.p_z_mean, self.p_z_logvar]
        # get optimizable parameters belonging to the underlying networks
        child_params = []
        child_params.extend(self.q_h_given_x.mlp_params)
        child_params.extend(self.q_z_given_h.mlp_params)
        child_params.extend(self.p_h_given_z.mlp_params)
        child_params.extend(self.p_x_given_h.mlp_params)
        # make a joint list of all optimizable parameters
        self.joint_params = self_params + child_params


        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_z = (self.lam_kld_q2p[0] * self.kld_z_q2p) + \
                     (self.lam_kld_p2q[0] * self.kld_z_p2q)
        self.kld_h = (self.lam_kld_q2p[0] * self.kld_h_q2p) + \
                     (self.lam_kld_p2q[0] * self.kld_h_p2q)
        self.kld_costs = T.sum(self.kld_z, axis=1) + \
                         T.sum(self.kld_h, axis=1)
        # compute "mean" (rather than per-input) costs
        self.kld_cost = T.mean(self.kld_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        self.nll_costs = self._construct_nll_costs(self.x_out)
        self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-INPUT COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # construct the updates for the generator and inferencer networks
        all_updates = get_adam_updates(params=self.joint_params, \
                grads=self.joint_grads, alpha=self.lr, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-4, max_grad_norm=5.0)
        self.joint_updates = OrderedDict()
        for k in all_updates:
            self.joint_updates[k] = all_updates[k]

        # Construct a function for jointly training the generator/inferencer
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling open-loop model sampler...")
        self.sample_from_prior = self._construct_sample_from_prior()
        return

    def set_sgd_params(self, lr=0.01, mom_1=0.9, mom_2=0.999):
        """
        Update the shared variables that hold the optimizer settings.

        lr is written into self.lr, and mom_1 / mom_2 are written into
        self.mom_1 / self.mom_2. Each value is broadcast into a one-element
        array and passed through to_fX before being stored.
        """
        def as_shared_value(setting):
            # one-element array carrying the requested setting
            return to_fX(np.zeros((1,)) + setting)

        # learning rate first, then the two momentum terms
        self.lr.set_value(as_shared_value(lr))
        self.mom_1.set_value(as_shared_value(mom_1))
        self.mom_2.set_value(as_shared_value(mom_2))

    def set_lam_nll(self, lam_nll=1.0):
        """
        Store a new weight for the data likelihood term.

        The value is broadcast into a one-element array, passed through
        to_fX, and written into the shared variable self.lam_nll.
        """
        nll_weight = np.zeros((1,)) + lam_nll
        self.lam_nll.set_value(to_fX(nll_weight))

    def set_lam_kld(self, lam_kld_q2p=1.0, lam_kld_p2q=1.0):
        """
        Store new weights for the two directions of KL-divergence.

        lam_kld_q2p is written into self.lam_kld_q2p and lam_kld_p2q is
        written into self.lam_kld_p2q, each as a one-element array passed
        through to_fX.
        """
        # weight on the q -> p direction
        q2p_weight = to_fX(np.zeros((1,)) + lam_kld_q2p)
        self.lam_kld_q2p.set_value(q2p_weight)
        # weight on the p -> q direction
        p2q_weight = to_fX(np.zeros((1,)) + lam_kld_p2q)
        self.lam_kld_p2q.set_value(p2q_weight)

    def set_lam_l2w(self, lam_l2w=1e-3):
        """
        Store a new strength for l2 regularization of network params.

        The value is broadcast into a one-element array, passed through
        to_fX, and written into the shared variable self.lam_l2w.
        """
        l2_weight = np.zeros((1,)) + lam_l2w
        self.lam_l2w.set_value(to_fX(l2_weight))

    def set_train_switch(self, switch_val=0.0):
        """
        Flip the switch that selects between training and sampling behavior.

        switch_val is snapped to 0.0 when it is below 0.5 and to 1.0
        otherwise; the snapped value is stored in self.train_switch as a
        one-element array passed through to_fX.
        """
        # the switch is binary, so round the request to one of its two states
        snapped = 0.0 if (switch_val < 0.5) else 1.0
        self.train_switch.set_value(to_fX(np.zeros((1,)) + snapped))

    def _construct_nll_costs(self, xo):
        """
        Build the negative log-likelihood of xo under the generated output.

        self.x_gen is passed through self.obs_transform and then scored
        against xo: with log_prob_bernoulli when self.x_type is
        'bernoulli', and with log_prob_gaussian2 (using
        self.bounded_logvar as log_vars) for any other x_type.
        """
        # transformed generator output that xo gets scored against
        x_hat = self.obs_transform(self.x_gen)
        if self.x_type != 'bernoulli':
            log_lik = log_prob_gaussian2(xo, x_hat,
                                         log_vars=self.bounded_logvar)
        else:
            log_lik = log_prob_bernoulli(xo, x_hat)
        # the cost is the negated log-likelihood
        return -log_lik

    def _construct_kld_costs(self, p=1.0):
        """
        Build the summed KL-divergence terms, with each element raised to p.

        Returns a list ordered as [z q2p, z p2q, h q2p, h p2q]. Each entry
        is summed over axis 1 with that axis kept (keepdims=True).
        """
        kld_terms = (self.kld_z_q2p, self.kld_z_p2q,
                     self.kld_h_q2p, self.kld_h_p2q)
        return [T.sum(kld**p, axis=1, keepdims=True) for kld in kld_terms]

    def _construct_reg_costs(self):
        """
        Build the basic parameter regularization cost, i.e. the total of
        the squared entries of every variable in self.joint_params.
        """
        # accumulate starting from 0, one squared-sum per parameter
        reg_total = 0
        for param in self.joint_params:
            reg_total = reg_total + T.sum(param**2.0)
        return reg_total

    def _construct_train_joint(self):
        """
        Compile the theano function that trains all networks jointly.

        The compiled function takes two matrices and a long scalar. Each
        matrix is repeated that many times along axis 0 before being
        substituted for self.x_in and self.x_out respectively. Calling it
        applies self.joint_updates and returns, in order, the joint cost,
        nll cost, kld cost, reg cost and the per-input costs.
        """
        # symbolic placeholders: inputs, targets and the repeat count
        in_batch = T.matrix()
        out_batch = T.matrix()
        repeats = T.lscalar()
        # swap the (repeated) batches in for the model's own input variables
        replacements = {
            self.x_in: in_batch.repeat(repeats, axis=0),
            self.x_out: out_batch.repeat(repeats, axis=0),
        }
        # values reported back to the caller after each update
        monitored = [self.joint_cost, self.nll_cost, self.kld_cost,
                     self.reg_cost, self.obs_costs]
        return theano.function(inputs=[in_batch, out_batch, repeats],
                               outputs=monitored,
                               givens=replacements,
                               updates=self.joint_updates)

    def _construct_compute_fe_terms(self):
        """
        Build a callable that estimates the terms of variational free energy.

        The returned function has signature (XI, XO, sample_count). It
        evaluates a compiled one-sample estimator sample_count times and
        averages the results, returning [mean_nll, mean_kld]. mean_kld
        combines the q2p KL terms for z and h, each summed over axis 1.
        """
        # one-sample estimator: nll plus the q2p KL terms for z and h
        sample_outputs = [self._construct_nll_costs(self.x_out),
                          self.kld_z_q2p, self.kld_h_q2p]
        one_sample = theano.function(inputs=[self.x_in, self.x_out],
                                     outputs=sample_outputs)

        def fe_term_estimator(XI, XO, sample_count):
            # running totals, one entry per row of XI
            row_count = XI.shape[0]
            nll_total = np.zeros((row_count,))
            kld_z_total = np.zeros((row_count,))
            kld_h_total = np.zeros((row_count,))
            for _ in range(sample_count):
                nll, kld_z, kld_h = one_sample(XI, XO)
                nll_total += nll.ravel()
                kld_z_total += np.sum(kld_z, axis=1).ravel()
                kld_h_total += np.sum(kld_h, axis=1).ravel()
            # turn the totals into per-sample means
            mean_nll = nll_total / float(sample_count)
            mean_kld = (kld_z_total + kld_h_total) / float(sample_count)
            return [mean_nll, mean_kld]

        return fe_term_estimator

    def _construct_sample_from_prior(self):
        """
        Construct a function for drawing independent samples from the
        distribution generated by this TwoStageModel.

        Returns a callable prior_sampler(samp_count). The sampler sets the
        train switch to 0.0, evaluates obs_transform(x_gen) with all-zero
        stand-ins for x_in and x_out built from a (samp_count, self.x_dim)
        array, and then puts the train switch back to its previous value.
        The switch is restored even if sampling raises.
        """
        x_sym = T.matrix()
        # x_in and x_out are both replaced by zeros shaped like the input
        sample_func = theano.function(inputs=[x_sym], \
                outputs=self.obs_transform(self.x_gen), \
                givens={self.x_in: T.zeros_like(x_sym), \
                        self.x_out: T.zeros_like(x_sym)})
        def prior_sampler(samp_count):
            x_samps = to_fX( np.zeros((samp_count, self.x_dim)) )
            # copy the current switch value so it can be restored afterwards
            old_switch = self.train_switch.get_value(borrow=False)
            # set model to generation mode
            self.set_train_switch(switch_val=0.0)
            try:
                # generate samples from model
                model_samps = sample_func(x_samps)
            finally:
                # set model back to previous mode, even when sampling fails,
                # so an error here cannot leave the model in generation mode
                self.set_train_switch(switch_val=old_switch)
            return model_samps
        return prior_sampler
class HiddenLayer(object):
    """
    Fully-connected layer with optional input noise, input dropout,
    pre-activation noise and maxout pooling.

    Parameters:
        rng: random generator; only rng.randint is used here, to seed the
             RandStream owned by this layer
        input: symbolic input the layer is applied to at construction time
        in_dim: dimension of the input
        out_dim: dimension of the output
        activation: activation function (relu_actfun when None); when
                    pool_size > 1 it is replaced by maxout_actfun
        pool_size: filters per maxout pool (<= 1 means no pooling)
        drop_rate: probability of dropping each input element
        input_noise: scale of the gaussian noise added to the input
        bias_noise: scale of the gaussian noise added to the pre-activation
        W, b: optional shared variables to use as weights and biases
        b_in, s_in: optional shared variables for input bias and scale
        name: prefix for the names of shared variables made by this layer
        W_scale: gain passed to ortho_matrix when W is initialized here
    """
    def __init__(self, rng, input, in_dim, out_dim, \
                 activation=None, pool_size=0, \
                 drop_rate=0., input_noise=0., bias_noise=0., \
                 W=None, b=None, b_in=None, s_in=None, \
                 name="", W_scale=1.0):

        # Setup a shared random generator for this layer
        self.rng = RandStream(rng.randint(1000000))

        # setup shared variables holding the noise levels and drop rate
        zero_ary = np.zeros((1,)).astype(theano.config.floatX)
        self.input_noise = theano.shared(value=(zero_ary+input_noise), \
                name="{0:s}_input_noise".format(name))
        self.bias_noise = theano.shared(value=(zero_ary+bias_noise), \
                name="{0:s}_bias_noise".format(name))
        self.drop_rate = theano.shared(value=(zero_ary+drop_rate), \
                name="{0:s}_drop_rate".format(name))

        # setup scale and bias params for the input
        if b_in is None:
            # input biases are always initialized to zero
            ary = np.zeros((in_dim,), dtype=theano.config.floatX)
            b_in = theano.shared(value=ary, name="{0:s}_b_in".format(name))
        if s_in is None:
            # input scales are always initialized to 0.541325, which is the
            # value whose softplus is approximately one
            ary = 0.541325 * np.ones((in_dim,), dtype=theano.config.floatX)
            s_in = theano.shared(value=ary, name="{0:s}_s_in".format(name))
        self.b_in = b_in
        self.s_in = s_in

        # Set some basic layer properties
        self.pool_size = pool_size
        self.in_dim = in_dim
        self.out_dim = out_dim
        if self.pool_size <= 1:
            self.filt_count = self.out_dim
        else:
            self.filt_count = self.out_dim * self.pool_size
        # floor division keeps the pool count an integer on Python 3 too
        self.pool_count = self.filt_count // max(self.pool_size, 1)
        if activation is None:
            activation = relu_actfun
        if self.pool_size <= 1:
            self.activation = activation
        else:
            # with pooling, maxout replaces whatever activation was given
            self.activation = lambda x: \
                    maxout_actfun(x, self.pool_size, self.filt_count)

        # Get some random initial weights and biases, if not given
        if W is None:
            # Generate initial filters using orthogonal random trick
            W_init = ortho_matrix(shape=(self.in_dim, self.filt_count), \
                    gain=W_scale)
            W_init = W_init.astype(theano.config.floatX)
            W = theano.shared(value=W_init, name="{0:s}_W".format(name))
        if b is None:
            b_init = np.zeros((self.filt_count,), dtype=theano.config.floatX)
            b = theano.shared(value=b_init, name="{0:s}_b".format(name))

        # Set layer weights and biases
        self.W = W
        self.b = b

        # Feedforward through the layer; a noise source is only wired into
        # the graph when its constructor argument exceeds 0.001
        use_in = input_noise > 0.001
        use_bn = bias_noise > 0.001
        use_drop = drop_rate > 0.001
        self.linear_output, self.noisy_linear, self.output = \
                self.apply(input, use_in=use_in, use_bn=use_bn, \
                use_drop=use_drop)

        # Compute some properties of the activations, probably to regularize
        self.act_l2_sum = T.sum(self.noisy_linear**2.) / self.output.size

        # Conveniently package layer parameters
        self.params = [self.W, self.b, self.b_in, self.s_in]
        self.shared_param_dicts = { \
                'W': self.W, \
                'b': self.b, \
                'b_in': self.b_in, \
                's_in': self.s_in }
        # Layer construction complete...
        return

    def apply(self, input, use_in=False, use_bn=False, use_drop=False):
        """
        Apply feedforward to this input, returning several partial results.

        Parameters:
            input: symbolic input to push through the layer
            use_in: whether to add gaussian noise to the input
            use_bn: whether to add gaussian noise to the pre-activation
            use_drop: whether to apply dropout to the input

        Returns [linear_output, noisy_linear, final_output]. As a side
        effect, self.noisy_input is overwritten with the (possibly
        perturbed) input used by this call.
        """
        # the input is used as-is; self.b_in and self.s_in are not applied
        fancy_input = input
        # Add gaussian noise to the input (if desired)
        if use_in:
            fuzzy_input = fancy_input + self.input_noise[0] * \
                    self.rng.normal(size=fancy_input.shape, avg=0.0, std=1.0, \
                    dtype=theano.config.floatX)
        else:
            fuzzy_input = fancy_input
        # Apply masking noise to the input (if desired)
        if use_drop:
            noisy_input = self._drop_from_input(fuzzy_input, self.drop_rate[0])
        else:
            noisy_input = fuzzy_input
        self.noisy_input = noisy_input
        # Compute linear "pre-activation" for this layer
        linear_output = T.dot(noisy_input, self.W) + self.b
        # Add noise to the pre-activation features (if desired)
        if use_bn:
            noisy_linear = linear_output + self.bias_noise[0] * \
                    self.rng.normal(size=linear_output.shape, avg=0.0, \
                    std=1.0, dtype=theano.config.floatX)
        else:
            noisy_linear = linear_output
        # Apply activation function
        final_output = self.activation(noisy_linear)
        # package partial results for easy return
        results = [linear_output, noisy_linear, final_output]
        return results

    def _drop_from_input(self, input, p):
        """p is the probability of dropping elements of input."""
        # get a drop mask that drops things with probability p
        drop_rnd = self.rng.uniform(size=input.shape, low=0.0, high=1.0, \
                dtype=theano.config.floatX)
        drop_mask = drop_rnd > p
        # get a scaling factor to keep expectations fixed after droppage
        drop_scale = 1. / (1. - p)
        # apply dropout mask and rescaling factor to the input
        droppy_input = drop_scale * input * drop_mask
        return droppy_input

    def _noisy_params(self, P, noise_lvl=0.):
        """Noisy weights, like convolving energy surface with a gaussian."""
        # add zero-mean gaussian noise with std noise_lvl, shaped like P
        P_nz = P + self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl, \
                dtype=theano.config.floatX)
        return P_nz
class GenConvModule(object):
    """
    Generator module made of an upsampling convolution followed by a
    regular convolution. The input to the first convolution can optionally
    be augmented with channels of random values.

    Params:
        filt_shape: shape for convolution filters -- should be square and odd
        in_chans: number of channels in the inputs to module
        out_chans: number of channels in the outputs from module
        rand_chans: number of random channels to augment input
        use_rand: flag for whether or not to augment inputs
        apply_bn_1: flag for whether to batch normalize following first conv
        apply_bn_2: flag for whether to batch normalize following second conv
        us_stride: upsampling ratio for the first convolution
        use_pooling: True to upsample by repeating pixels ("unpooling"),
                     False to upsample with a fractionally strided conv
        init_func: function for initializing module parameters
        mod_name: text name for identifying module in theano graph
        rand_type: 'normal' for Gaussian randomness, anything else for
                   uniform randomness on [-1, 1]
    """
    def __init__(self, filt_shape, in_chans, out_chans, rand_chans,
                 use_rand=True, apply_bn_1=True, apply_bn_2=True,
                 us_stride=2, use_pooling=True,
                 init_func=None, mod_name='gm_conv',
                 rand_type='normal'):
        assert ((filt_shape[0] % 2) > 0), "filter dim should be odd (not even)"
        # only the first entry of filt_shape is used (filters are square)
        self.filt_dim = filt_shape[0]
        # channel bookkeeping
        self.in_chans = in_chans
        self.out_chans = out_chans
        self.rand_chans = rand_chans
        # behavior flags
        self.use_rand = use_rand
        self.apply_bn_1 = apply_bn_1
        self.apply_bn_2 = apply_bn_2
        self.us_stride = us_stride
        self.use_pooling = use_pooling
        self.mod_name = mod_name
        self.rand_type = rand_type
        # random stream for the optional random channels (fixed seed)
        self.rng = RandStream(123)
        # fall back on a small-scale normal initializer
        self.init_func = inits.Normal(scale=0.02) if init_func is None \
            else init_func
        self._init_params()

    def _init_params(self):
        """
        Create the filters (and batch norm gains/biases, when enabled) for
        both convolutions, and collect them in self.params.
        """
        fd = self.filt_dim
        # the first conv sees extra channels when random values get stacked
        # on top of the exogenous input
        if self.use_rand:
            conv1_in_chans = self.in_chans + self.rand_chans
        else:
            conv1_in_chans = self.in_chans
        self.w1 = self.init_func((self.out_chans, conv1_in_chans, fd, fd),
                                 "{}_w1".format(self.mod_name))
        self.w2 = self.init_func((self.out_chans, self.out_chans, fd, fd),
                                 "{}_w2".format(self.mod_name))
        self.params = [self.w1, self.w2]
        # gains and biases for whichever transforms get batch normed
        if self.apply_bn_1:
            make_gain = inits.Normal(loc=1., scale=0.02)
            make_bias = inits.Constant(c=0.)
            self.g1 = make_gain((self.out_chans), "{}_g1".format(self.mod_name))
            self.b1 = make_bias((self.out_chans), "{}_b1".format(self.mod_name))
            self.params += [self.g1, self.b1]
        if self.apply_bn_2:
            make_gain = inits.Normal(loc=1., scale=0.02)
            make_bias = inits.Constant(c=0.)
            self.g2 = make_gain((self.out_chans), "{}_g2".format(self.mod_name))
            self.b2 = make_bias((self.out_chans), "{}_b2".format(self.mod_name))
            self.params += [self.g2, self.b2]

    def apply(self, input, rand_vals=None):
        """
        Run this module on some input and return the result of the second
        convolution (after optional batch norm and relu).

        rand_vals, when given, is used in place of freshly drawn random
        channels; it is ignored when use_rand is False.
        """
        n_rows = input.shape[0]
        pad = int((self.filt_dim - 1) / 2)  # border for "same" convolutions
        stride = self.us_stride             # upsampling ratio
        if self.use_pooling:
            # "unpool" by repeating along axes 2 and 3
            input = input.repeat(stride, axis=2).repeat(stride, axis=3)
        if not self.use_rand:
            # nothing gets stacked on the input
            conv_input = input
        else:
            # random channels match the (possibly unpooled) input on axes 2, 3
            noise_shape = (n_rows, self.rand_chans,
                           input.shape[2], input.shape[3])
            if rand_vals is None:
                if self.rand_type == 'normal':
                    rand_vals = self.rng.normal(size=noise_shape, avg=0.0,
                                                std=1.0,
                                                dtype=theano.config.floatX)
                else:
                    rand_vals = self.rng.uniform(size=noise_shape, low=-1.0,
                                                 high=1.0,
                                                 dtype=theano.config.floatX)
            rand_vals = rand_vals.reshape(noise_shape)
            # random channels go first along the channel axis
            conv_input = T.concatenate([rand_vals, input], axis=1)
        # first convolution: plain conv after unpooling, otherwise a
        # fractionally strided conv that does the upsampling itself
        if self.use_pooling:
            first = dnn_conv(conv_input, self.w1, subsample=(1, 1),
                             border_mode=(pad, pad))
        else:
            first = deconv(conv_input, self.w1, subsample=(stride, stride),
                           border_mode=(pad, pad))
        if self.apply_bn_1:
            first = batchnorm(first, g=self.g1, b=self.b1)
        first = relu(first)
        # second convolution
        second = dnn_conv(first, self.w2, subsample=(1, 1),
                          border_mode=(pad, pad))
        if self.apply_bn_2:
            second = batchnorm(second, g=self.g2, b=self.b2)
        return relu(second)
class DAELayer(object):
    """
    Denoising autoencoder layer whose decoder reuses the transposed encoder
    weights.

    Parameters:
        rng: random generator; used to seed this layer's RandStream and to
             draw the initial weights
        clean_input: symbolic target that reconstructions are scored against
        fuzzy_input: symbolic input that gets masked and then encoded
        in_dim: dimension of the input
        out_dim: dimension of the hidden activations
        activation: activation function for the hidden layer
        input_noise: probability of masking each element of fuzzy_input
        W, b_h, b_v: optional shared variables for the weights, hidden
                     biases and visible biases
        W_scale: multiplier applied to the freshly drawn initial weights
    """
    def __init__(self, rng, clean_input=None, fuzzy_input=None, \
            in_dim=0, out_dim=0, activation=None, input_noise=0., \
            W=None, b_h=None, b_v=None, W_scale=1.0):
        # random stream owned by this layer
        self.rng = RandStream(rng.randint(1000000))

        # keep the clean target and a masked copy of the fuzzy input
        self.clean_input = clean_input
        self.noisy_input = self._get_noisy_input(fuzzy_input, input_noise)

        # basic layer properties
        self.activation = activation
        self.in_dim = in_dim
        self.out_dim = out_dim

        # make whichever parameters were not handed in
        if W is None:
            # NOTE(review): dtype is passed to DCG rather than to np.asarray
            # here -- confirm this is intended
            W_init = np.asarray(1.0 * DCG(rng.standard_normal( \
                      size=(in_dim, out_dim)), dtype=theano.config.floatX))
            W = theano.shared(value=(W_scale*W_init), name='W')
        if b_h is None:
            hid_bias = np.zeros((out_dim,), dtype=theano.config.floatX)
            b_h = theano.shared(value=hid_bias, name='b_h')
        if b_v is None:
            vis_bias = np.zeros((in_dim,), dtype=theano.config.floatX)
            b_v = theano.shared(value=vis_bias, name='b_v')

        self.W = W
        self.b_h = b_h
        self.b_v = b_v

        # the learnable/optimizable parameters, in a list
        self.params = [self.W, self.b_h, self.b_v]

    def compute_costs(self, lam_l1=None):
        """
        Compute reconstruction and activation sparsity costs.

        lam_l1 must be indexable, since lam_l1[0] weights the sparsity
        cost; the None default cannot be used as-is.
        Returns [recon_cost, sparse_cost].
        """
        # only the weights get perturbed; the hidden biases are used as-is
        W_nz = self._noisy_params(self.W, 0.01)
        A_v, A_h = self._compute_activations(self.noisy_input, \
                W_nz, self.b_h, self.b_v)
        # squared reconstruction error, divided by the size of axis 0
        sq_err = (self.clean_input - A_v)**2.0
        recon_cost = T.sum(sq_err) / self.clean_input.shape[0]
        # sparsity penalty over both row- and column-normalized activations
        row_l1_sum = T.sum(abs(row_normalize(A_h))) / A_h.shape[0]
        col_l1_sum = T.sum(abs(col_normalize(A_h))) / A_h.shape[1]
        sparse_cost = lam_l1[0] * (row_l1_sum + col_l1_sum)
        return [recon_cost, sparse_cost]

    def _compute_hidden_acts(self, X, W, b_h):
        """Compute activations of encoder (at hidden layer)."""
        pre_act = T.dot(X, W) + b_h
        return self.activation(pre_act)

    def _compute_activations(self, X, W, b_h, b_v):
        """Compute visible and hidden activations; returns [A_v, A_h]."""
        hidden = self._compute_hidden_acts(X, W, b_h)
        # decode with the transposed encoder weights
        visible = T.dot(hidden, W.T) + b_v
        return [visible, hidden]

    def _noisy_params(self, P, noise_lvl=0.):
        """Noisy weights, like convolving energy surface with a gaussian."""
        # noise levels at or below 1e-3 leave the parameters untouched
        if not (noise_lvl > 1e-3):
            return P
        return P + DCG(self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl, \
                dtype=theano.config.floatX))

    def _get_noisy_input(self, input, p):
        """
        Mask elements of input; p is the probability of dropping each one.
        The surviving elements are not rescaled.
        """
        uniform_draws = self.rng.uniform(input.shape, low=0.0, high=1.0, \
            dtype=theano.config.floatX)
        keep_mask = uniform_draws > p
        # wrap the mask with DCG before multiplying (per the original note,
        # this casts the int mask to float32 to keep things on GPU)
        return input * DCG(keep_mask)
Esempio n. 30
0
class HiddenLayer(object):
    """
    Fully-connected layer with optional input noise, input dropout,
    pre-activation noise and maxout pooling. The whole feedforward graph
    is built at construction time.

    Parameters:
        rng: random generator; used to seed this layer's
             CURAND_RandomStreams and to draw the initial weights
        input: symbolic input to the layer
        in_dim: dimension of the input
        out_dim: dimension of the output
        activation: activation function; when not given, relu_actfun is
                    used for pool_size <= 1 and maxout_actfun otherwise
        pool_size: filters per maxout pool (<= 1 means no pooling)
        drop_rate: probability of dropping each input element
        input_noise: std of the gaussian noise added to the input
        bias_noise: std of the gaussian noise added to the pre-activation
        W, b: optional shared variables to use as weights and biases
        name: prefix for the names of shared variables made by this layer
        W_scale: multiplier applied to the freshly drawn initial weights
    """
    def __init__(self, rng, input, in_dim, out_dim, \
                 activation=None, pool_size=0, \
                 drop_rate=0., input_noise=0., bias_noise=0., \
                 W=None, b=None, name="", W_scale=1.0):

        # Setup a shared random generator for this layer
        self.rng = CURAND_RandomStreams(rng.randint(1000000))

        self.clean_input = input

        # Add gaussian noise to the input (if desired)
        if (input_noise > 1e-4):
            self.fuzzy_input = input + self.rng.normal(size=input.shape, \
                    avg=0.0, std=input_noise, dtype=theano.config.floatX)
        else:
            self.fuzzy_input = input

        # Apply masking noise to the input (if desired)
        if (drop_rate > 1e-4):
            self.noisy_input = self._drop_from_input(self.fuzzy_input,
                                                     drop_rate)
        else:
            self.noisy_input = self.fuzzy_input

        # Set some basic layer properties
        self.pool_size = pool_size
        self.in_dim = in_dim
        self.out_dim = out_dim
        if self.pool_size <= 1:
            self.filt_count = self.out_dim
        else:
            self.filt_count = self.out_dim * self.pool_size
        # floor division: pool_count feeds range() below, which needs an
        # int, and true division would give a float on Python 3
        self.pool_count = self.filt_count // max(self.pool_size, 1)
        if activation:
            self.activation = activation
        else:
            if self.pool_size <= 1:
                self.activation = lambda x: relu_actfun(x)
            else:
                self.activation = lambda x: \
                        maxout_actfun(x, self.pool_size, self.filt_count)

        # Get some random initial weights and biases, if not given
        if W is None:
            if self.pool_size <= 1:
                # Generate random initial filters in a typical way
                W_init = 0.01 * np.asarray(rng.normal( \
                          size=(self.in_dim, self.filt_count)), \
                          dtype=theano.config.floatX)
            else:
                # Generate groups of random filters to pool over such that
                # intra-group correlations are stronger than inter-group
                # correlations, to encourage pooling over similar filters...
                filters = []
                f_size = (self.in_dim, 1)
                for _ in range(self.pool_count):
                    # base filter shared by every member of this group
                    g_filt = 0.01 * rng.normal(size=f_size)
                    for _ in range(self.pool_size):
                        # each member is the base plus a smaller perturbation
                        f_filt = g_filt + 0.003 * rng.normal(size=f_size)
                        filters.append(f_filt)
                W_init = np.hstack(filters).astype(theano.config.floatX)
            W = theano.shared(value=(W_scale * W_init),
                              name="{0:s}_W".format(name))
        if b is None:
            b_init = np.zeros((self.filt_count, ), dtype=theano.config.floatX)
            b = theano.shared(value=b_init, name="{0:s}_b".format(name))

        # Set layer weights and biases
        self.W = W
        self.b = b

        # Compute linear "pre-activation" for this layer
        self.linear_output = T.dot(self.noisy_input, self.W) + self.b

        # Add noise to the pre-activation features (if desired)
        if bias_noise > 1e-3:
            self.noisy_linear = self.linear_output  + \
                    self.rng.normal(size=self.linear_output.shape, \
                    avg=0.0, std=bias_noise, dtype=theano.config.floatX)
        else:
            self.noisy_linear = self.linear_output

        # Apply activation function
        self.output = self.activation(self.noisy_linear)

        # Compute some properties of the activations, probably to regularize
        self.act_l2_sum = T.sum(self.output**2.) / self.output.size
        self.row_l1_sum = T.sum(abs(row_normalize(self.output))) / \
                self.output.shape[0]
        self.col_l1_sum = T.sum(abs(col_normalize(self.output))) / \
                self.output.shape[1]

        # Conveniently package layer parameters
        self.params = [self.W, self.b]
        # Layer construction complete...
        return

    def _drop_from_input(self, input, p):
        """p is the probability of dropping elements of input."""
        # get a drop mask that drops things with probability p
        drop_rnd = self.rng.uniform(size=input.shape, low=0.0, high=1.0, \
                dtype=theano.config.floatX)
        drop_mask = drop_rnd > p
        # get a scaling factor to keep expectations fixed after droppage
        drop_scale = 1. / (1. - p)
        # apply dropout mask and rescaling factor to the input
        droppy_input = drop_scale * input * drop_mask
        return droppy_input

    def _noisy_params(self, P, noise_lvl=0.):
        """Noisy weights, like convolving energy surface with a gaussian."""
        # add zero-mean gaussian noise with std noise_lvl, shaped like P
        P_nz = P + self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl, \
                dtype=theano.config.floatX)
        return P_nz
class GenUniModule(object):
    """
    Generator module that maps random values through a linear transform,
    optionally followed by batch normalization and a relu.

    Params:
        rand_dim: dimension of the random values fed to the module
        out_dim: dimension of the module output
        apply_bn: flag for whether to batch normalize the linear output
        init_func: function for initializing module parameters
        rand_type: 'normal' for Gaussian randomness, anything else for
                   uniform randomness on [-1, 1]
        final_relu: flag for whether to finish with a relu
        mod_name: text name for identifying module in theano graph
    """
    def __init__(self, rand_dim, out_dim,
                 apply_bn=True, init_func=None,
                 rand_type='normal', final_relu=True,
                 mod_name='dm_uni'):
        # dimensions
        self.rand_dim = rand_dim
        self.out_dim = out_dim
        # behavior flags and naming
        self.apply_bn = apply_bn
        self.mod_name = mod_name
        self.rand_type = rand_type
        self.final_relu = final_relu
        # random stream for drawing the module's inputs (fixed seed)
        self.rng = RandStream(123)
        # fall back on a small-scale normal initializer
        self.init_func = inits.Normal(scale=0.02) if init_func is None \
            else init_func
        self._init_params()

    def _init_params(self):
        """
        Create the weight matrix (and batch norm gain/bias, when enabled)
        and collect them in self.params.
        """
        self.w1 = self.init_func((self.rand_dim, self.out_dim),
                                 "{}_w1".format(self.mod_name))
        self.params = [ self.w1 ]
        if not self.apply_bn:
            return
        # gain and bias for the batch normed transform
        make_gain = inits.Normal(loc=1., scale=0.02)
        make_bias = inits.Constant(c=0.)
        self.g1 = make_gain((self.out_dim), "{}_g1".format(self.mod_name))
        self.b1 = make_bias((self.out_dim), "{}_b1".format(self.mod_name))
        self.params += [self.g1, self.b1]

    def apply(self, batch_size=None, rand_vals=None):
        """
        Run this module. Pass _either_ batch_size (to draw fresh random
        values) or rand_vals (to use the given ones); when both are passed,
        rand_vals is used.
        """
        assert (batch_size is not None) or (rand_vals is not None), "need either batch_size or rand_vals"
        if rand_vals is not None:
            # use the caller's values; the row count comes from them
            noise_shape = (rand_vals.shape[0], self.rand_dim)
        else:
            noise_shape = (batch_size, self.rand_dim)
            if self.rand_type == 'normal':
                rand_vals = self.rng.normal(size=noise_shape, avg=0.0,
                                            std=1.0,
                                            dtype=theano.config.floatX)
            else:
                rand_vals = self.rng.uniform(size=noise_shape, low=-1.0,
                                             high=1.0,
                                             dtype=theano.config.floatX)
        rand_vals = rand_vals.reshape(noise_shape)
        # linear transform of the random values
        out = T.dot(rand_vals, self.w1)
        if self.apply_bn:
            out = batchnorm(out, g=self.g1, b=self.b1)
        if self.final_relu:
            out = relu(out)
        return out















##############
# EYE BUFFER #
##############
Esempio n. 32
0
class MultiStageModel(object):
    """
    Controller for training a multi-step iterative refinement model.

    Parameters:
        rng: numpy.random.RandomState (for reproducibility)
        x_in: the input data to encode
        x_out: the target output to decode
        p_s0_given_z: InfNet for initializing "canvas" state
        p_hi_given_si: InfNet for hi given si
        p_sip1_given_si_hi: HydraNet for sip1 given si and hi
        q_z_given_x: InfNet for z given x
        q_hi_given_x_si: InfNet for hi given x and si
        obs_dim: dimension of the observations to generate
        z_dim: dimension of the "initial" latent space
        h_dim: dimension of the "primary" latent space
        ir_steps: number of "iterative refinement" steps to perform
        params: REQUIRED PARAMS SHOWN BELOW
                x_type: can be "bernoulli" or "gaussian"
                obs_transform: can be 'none' or 'sigmoid'
    """
    def __init__(self, rng=None, \
            x_in=None, x_out=None, \
            p_s0_given_z=None, \
            p_hi_given_si=None, \
            p_sip1_given_si_hi=None, \
            q_z_given_x=None, \
            q_hi_given_x_si=None, \
            obs_dim=None, \
            z_dim=None, h_dim=None, \
            ir_steps=4, params=None, \
            shared_param_dicts=None):
        """
        Build the symbolic model, its costs and parameter updates, and
        compile the training / evaluation / sampling functions.

        Argument meanings are given in the class docstring. Although they
        default to None, rng, params (with an 'x_type' entry), x_in, x_out
        and the five networks are all dereferenced here and so must be
        provided. If shared_param_dicts is given it must hold the keys
        'p_z_mean', 'p_z_logvar' and 'obs_logvar'; those shared variables
        are reused instead of creating fresh ones.
        """
        # setup a rng for this MultiStageModel
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_type = self.params['x_type']
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        # "soft-clipped" sigmoid: the tanh keeps the value fed to the sigmoid
        # inside (-20, 20)
        squash = lambda x: T.nnet.sigmoid(20.0 * T.tanh(0.05 * x))
        self.obs_transform = squash
        if 'obs_transform' in self.params:
            assert((self.params['obs_transform'] == 'sigmoid') or \
                    (self.params['obs_transform'] == 'none'))
            if self.params['obs_transform'] == 'none':
                self.obs_transform = lambda x: x
        # bernoulli observations always use the squashing transform,
        # regardless of the requested obs_transform
        if self.x_type == 'bernoulli':
            self.obs_transform = squash
        self.shared_param_dicts = shared_param_dicts

        # record the dimensions of various spaces relevant to this model
        self.obs_dim = obs_dim
        self.z_dim = z_dim
        self.h_dim = h_dim
        self.ir_steps = ir_steps

        # grab handles to the relevant InfNets
        self.q_z_given_x = q_z_given_x
        self.q_hi_given_x_si = q_hi_given_x_si
        self.p_s0_given_z = p_s0_given_z
        self.p_hi_given_si = p_hi_given_si
        self.p_sip1_given_si_hi = p_sip1_given_si_hi

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this MultiStageModel
        self.x_in = x_in
        self.x_out = x_out
        self.hi_zmuv = T.tensor3() # for ZMUV Gaussian samples to use in scan

        # setup switching variable for changing between sampling/training
        zero_ary = to_fX( np.zeros((1,)) )
        self.train_switch = theano.shared(value=zero_ary, name='msm_train_switch')
        self.set_train_switch(1.0)
        # setup a variable for controlling dropout noise
        self.drop_rate = theano.shared(value=zero_ary, name='msm_drop_rate')
        self.set_drop_rate(0.0)
        # this weight balances l1 vs. l2 penalty on posterior KLds
        self.lam_kld_l1l2 = theano.shared(value=zero_ary, name='msm_lam_kld_l1l2')
        self.set_lam_kld_l1l2(1.0)

        if self.shared_param_dicts is None:
            # initialize "optimizable" parameters specific to this MSM
            init_vec = to_fX( np.zeros((self.z_dim,)) )
            self.p_z_mean = theano.shared(value=init_vec, name='msm_p_z_mean')
            self.p_z_logvar = theano.shared(value=init_vec, name='msm_p_z_logvar')
            # a single (shape (1,)) log-variance is used for the observations
            self.obs_logvar = theano.shared(value=zero_ary, name='msm_obs_logvar')
            self.shared_param_dicts = {}
            self.shared_param_dicts['p_z_mean'] = self.p_z_mean
            self.shared_param_dicts['p_z_logvar'] = self.p_z_logvar
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            # reuse the parameters of another model
            self.p_z_mean = self.shared_param_dicts['p_z_mean']
            self.p_z_logvar = self.shared_param_dicts['p_z_logvar']
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
        # keep the observation log-variance inside (-8, 8)
        self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)

        # setup a function for computing reconstruction cost. NOTE: despite
        # its name, log_prob_func returns the NEGATIVE log-likelihood.
        if self.x_type == 'bernoulli':
            self.log_prob_func = lambda xo, xh: \
                    (-1.0 * log_prob_bernoulli(xo, xh))
        else:
            self.log_prob_func = lambda xo, xh: \
                    (-1.0 * log_prob_gaussian2(xo, xh, \
                     log_vars=self.bounded_logvar))

        # get a drop mask that drops things with probability p. The mask is
        # applied to x_in below, so it is drawn with x_in's shape.
        drop_scale = 1. / (1. - self.drop_rate[0])
        drop_rnd = self.rng.uniform(size=self.x_in.shape, \
                low=0.0, high=1.0, dtype=theano.config.floatX)
        drop_mask = drop_scale * (drop_rnd > self.drop_rate[0])

        #############################
        # Setup self.z and self.s0. #
        #############################
        print("Building MSM step 0...")
        drop_x = drop_mask * self.x_in
        self.q_z_mean, self.q_z_logvar, self.z = \
                self.q_z_given_x.apply(drop_x, do_samples=True)
        # get initial observation state
        self.s0, _ = self.p_s0_given_z.apply(self.z, do_samples=False)

        # gather KLd and NLL for the initialization step
        self.init_klds = gaussian_kld(self.q_z_mean, self.q_z_logvar, \
                                      self.p_z_mean, self.p_z_logvar)
        # log_prob_func already yields an NLL; negating it again (as was done
        # previously) would store a log-likelihood under the name init_nlls
        self.init_nlls = \
                self.log_prob_func(self.x_out, self.obs_transform(self.s0))

        ##################################################
        # Setup the iterative generation loop using scan #
        ##################################################
        def ir_step_func(hi_zmuv, sim1):
            """
            One refinement step for scan: hi_zmuv is this step's slice of
            ZMUV noise and sim1 is the state from the previous step.
            Returns the new state, its NLL and both directions of KLd.
            """
            # get variables used throughout this refinement step
            sim1_obs = self.obs_transform(sim1) # transform state -> obs
            grad_ll = self.x_out - sim1_obs

            # get samples of next hi, conditioned on current si
            hi_p_mean, hi_p_logvar = self.p_hi_given_si.apply( \
                    sim1_obs, do_samples=False)
            # now we build the model for variational hi given si
            hi_q_mean, hi_q_logvar = self.q_hi_given_x_si.apply( \
                    T.horizontal_stack(grad_ll, sim1_obs), \
                    do_samples=False)
            # reparametrized samples: the same noise drives both q and p
            hi_q = (T.exp(0.5 * hi_q_logvar) * hi_zmuv) + hi_q_mean
            hi_p = (T.exp(0.5 * hi_p_logvar) * hi_zmuv) + hi_p_mean

            # make hi samples that can be switched between hi_p and hi_q
            hi = ( ((self.train_switch[0] * hi_q) + \
                    ((1.0 - self.train_switch[0]) * hi_p)) )

            # p_sip1_given_si_hi is applied to hi only
            ig_vals, fg_vals, in_vals = self.p_sip1_given_si_hi.apply(hi)

            # get the transformed values (for an LSTM style update)
            i_gate = 1.0 * T.nnet.sigmoid(ig_vals + 2.0)
            f_gate = 1.0 * T.nnet.sigmoid(fg_vals + 2.0)
            # perform an LSTM-like update of the state sim1 -> si
            si = (in_vals * i_gate) + (sim1 * f_gate)

            # compute generator NLL for this step
            nlli = self.log_prob_func(self.x_out, self.obs_transform(si))
            # compute relevant KLds for this step
            kldi_q2p = gaussian_kld(hi_q_mean, hi_q_logvar, \
                                    hi_p_mean, hi_p_logvar)
            kldi_p2q = gaussian_kld(hi_p_mean, hi_p_logvar, \
                                    hi_q_mean, hi_q_logvar)
            return si, nlli, kldi_q2p, kldi_p2q

        # only the state is recurrent; the three costs are plain outputs
        init_values = [self.s0, None, None, None]

        self.scan_results, self.scan_updates = theano.scan(ir_step_func, \
                outputs_info=init_values, sequences=self.hi_zmuv)

        self.si = self.scan_results[0]
        self.nlli = self.scan_results[1]
        self.kldi_q2p = self.scan_results[2]
        self.kldi_p2q = self.scan_results[3]

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        self.lr_1 = theano.shared(value=zero_ary, name='msm_lr_1')
        self.lr_2 = theano.shared(value=zero_ary, name='msm_lr_2')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='msm_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='msm_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='msm_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_z = theano.shared(value=zero_ary, name='msm_lam_kld_z')
        self.lam_kld_q2p = theano.shared(value=zero_ary, name='msm_lam_kld_q2p')
        self.lam_kld_p2q = theano.shared(value=zero_ary, name='msm_lam_kld_p2q')
        self.set_lam_kld(lam_kld_z=1.0, lam_kld_q2p=0.7, lam_kld_p2q=0.3)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # Grab all of the "optimizable" parameters in "group 1"
        self.q_params = []
        self.q_params.extend(self.q_z_given_x.mlp_params)
        self.q_params.extend(self.q_hi_given_x_si.mlp_params)
        # Grab all of the "optimizable" parameters in "group 2"
        self.p_params = [self.p_z_mean, self.p_z_logvar]
        self.p_params.extend(self.p_hi_given_si.mlp_params)
        self.p_params.extend(self.p_sip1_given_si_hi.mlp_params)
        self.p_params.extend(self.p_s0_given_z.mlp_params)

        # Make a joint list of parameters group 1/2
        self.joint_params = self.q_params + self.p_params

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_z_q2p, self.kld_z_p2q, self.kld_hi_q2p, self.kld_hi_p2q = \
                self._construct_kld_costs(p=1.0)
        self.kld_z = (self.lam_kld_q2p[0] * self.kld_z_q2p) + \
                     (self.lam_kld_p2q[0] * self.kld_z_p2q)
        self.kld_hi = (self.lam_kld_q2p[0] * self.kld_hi_q2p) + \
                      (self.lam_kld_p2q[0] * self.kld_hi_p2q)
        self.kld_costs = (self.lam_kld_z[0] * self.kld_z) + self.kld_hi
        # now do l2 KLd costs
        self.kl2_z_q2p, self.kl2_z_p2q, self.kl2_hi_q2p, self.kl2_hi_p2q = \
                self._construct_kld_costs(p=2.0)
        self.kl2_z = (self.lam_kld_q2p[0] * self.kl2_z_q2p) + \
                     (self.lam_kld_p2q[0] * self.kl2_z_p2q)
        self.kl2_hi = (self.lam_kld_q2p[0] * self.kl2_hi_q2p) + \
                      (self.lam_kld_p2q[0] * self.kl2_hi_p2q)
        self.kl2_costs = (self.lam_kld_z[0] * self.kl2_z) + self.kl2_hi
        # compute joint l1/l2 KLd cost
        self.kld_l1l2_costs = (self.lam_kld_l1l2[0] * self.kld_costs) + \
                ((1.0 - self.lam_kld_l1l2[0]) * self.kl2_costs)
        # compute "mean" (rather than per-input) costs
        self.kld_cost = T.mean(self.kld_costs)
        self.kl2_cost = T.mean(self.kl2_costs)
        self.kld_l1l2_cost = T.mean(self.kld_l1l2_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        # only the NLL after the final refinement step is penalized
        self.nll_costs = self.nlli[-1]
        self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_l1l2_cost + \
                          self.reg_cost
        ##############################
        # CONSTRUCT A PER-INPUT COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_l1l2_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the generator and inferencer networks
        self.q_updates = get_adam_updates(params=self.q_params, \
                grads=self.joint_grads, alpha=self.lr_1, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        self.p_updates = get_adam_updates(params=self.p_params, \
                grads=self.joint_grads, alpha=self.lr_2, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        # merge q updates, p updates and the scan updates (which seem to be
        # required), in that order
        self.joint_updates = OrderedDict()
        for updates in (self.q_updates, self.p_updates, self.scan_updates):
            for k in updates:
                self.joint_updates[k] = updates[k]

        # Construct a function for jointly training the generator/inferencer
        print("Compiling cost computer...")
        self.compute_raw_klds = self._construct_raw_klds()
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling open-loop model sampler...")
        self.sample_from_prior = self._construct_sample_from_prior()
        print("Compiling data-guided model sampler...")
        self.sample_from_input = self._construct_sample_from_input()
        return

    def set_sgd_params(self, lr_1=0.01, lr_2=0.01, \
                mom_1=0.9, mom_2=0.999):
        """
        Store the optimizer settings in their shared variables.

        lr_1 / lr_2 are the step sizes handed to the Adam updates of the
        q-parameters / p-parameters; mom_1 / mom_2 are handed to both as
        beta1 / beta2.
        """
        def as_shared_value(val):
            # broadcast the scalar into a length-1 array of dtype floatX
            return to_fX(np.zeros((1,)) + val)
        # learning rates
        self.lr_1.set_value(as_shared_value(lr_1))
        self.lr_2.set_value(as_shared_value(lr_2))
        # momentum terms
        self.mom_1.set_value(as_shared_value(mom_1))
        self.mom_2.set_value(as_shared_value(mom_2))
        return

    def set_lam_nll(self, lam_nll=1.0):
        """
        Set the weight that scales the mean NLL term in the joint cost.
        """
        self.lam_nll.set_value(to_fX(np.zeros((1,)) + lam_nll))
        return

    def set_lam_kld(self, lam_kld_z=1.0, lam_kld_q2p=1.0, lam_kld_p2q=1.0):
        """
        Set the weights on the KL-divergence terms of the cost.

        lam_kld_z scales the KLd over z relative to the KLd over the hi;
        lam_kld_q2p and lam_kld_p2q weight the two directions of KLd
        (q-to-p and p-to-q respectively).
        """
        def as_shared_value(val):
            # broadcast the scalar into a length-1 array of dtype floatX
            return to_fX(np.zeros((1,)) + val)
        self.lam_kld_z.set_value(as_shared_value(lam_kld_z))
        self.lam_kld_q2p.set_value(as_shared_value(lam_kld_q2p))
        self.lam_kld_p2q.set_value(as_shared_value(lam_kld_p2q))
        return

    def set_lam_l2w(self, lam_l2w=1e-3):
        """
        Set the weight of the l2 penalty on the network parameters.
        """
        self.lam_l2w.set_value(to_fX(np.zeros((1,)) + lam_l2w))
        return

    def set_train_switch(self, switch_val=0.0):
        """
        Flip the model between training (1.0) and sampling (0.0) behavior.

        The given value is snapped to 0.0 when it is below 0.5 and to 1.0
        otherwise before being stored.
        """
        snapped = 0.0 if (switch_val < 0.5) else 1.0
        self.train_switch.set_value(to_fX(np.zeros((1,)) + snapped))
        return

    def set_lam_kld_l1l2(self, lam_kld_l1l2=1.0):
        """
        Set the mixing weight between the plain (l1) and squared (l2) KLd
        costs: the l1 cost gets weight lam_kld_l1l2 and the l2 cost gets
        weight (1 - lam_kld_l1l2).
        """
        self.lam_kld_l1l2.set_value(to_fX(np.zeros((1,)) + lam_kld_l1l2))
        return

    def set_drop_rate(self, drop_rate=0.0):
        """
        Set the probability with which the drop mask zeroes input entries.
        """
        self.drop_rate.set_value(to_fX(np.zeros((1,)) + drop_rate))
        return

    def _construct_zmuv_samples(self, xi, br):
        """
        Build the (symbolic) zero-mean, unit-variance Gaussian samples needed
        to run this MultiStageModel on the (symbolic) input matrix xi, with
        each row of xi repeated br times.

        Returns (z_zmuv, hi_zmuv), shaped (xi.shape[0]*br, z_dim) and
        (ir_steps, xi.shape[0]*br, h_dim) respectively.
        """
        row_count = xi.shape[0] * br
        float_type = theano.config.floatX
        # noise for the initial latent z
        z_zmuv = self.rng.normal(size=(row_count, self.z_dim),
                                 avg=0.0, std=1.0, dtype=float_type)
        # noise for every refinement step
        hi_shape = (self.ir_steps, row_count, self.h_dim)
        hi_zmuv = self.rng.normal(size=hi_shape,
                                  avg=0.0, std=1.0, dtype=float_type)
        return z_zmuv, hi_zmuv

    def _construct_nll_costs(self, si, xo):
        """
        Build the negative log-likelihood of xo under the observation
        obtained by transforming the state si.
        """
        xh = self.obs_transform(si)
        if self.x_type == 'bernoulli':
            return -log_prob_bernoulli(xo, xh)
        # gaussian observations use the bounded, learned log-variance
        return -log_prob_gaussian2(xo, xh, log_vars=self.bounded_logvar)

    def _construct_kld_costs(self, p=1.0):
        """
        Build the posterior KL-divergence parts of the cost to minimize.

        Each per-dimension KLd is raised to the power p and then summed over
        axis 1 (keeping that axis). Returns the list
        [kld_z_q2p, kld_z_p2q, kld_hi_q2p, kld_hi_p2q], where the "hi"
        entries are additionally summed over all refinement steps.
        """
        def row_cost(klds):
            # power-p KLd, summed across axis 1
            return T.sum(klds**p, axis=1, keepdims=True)

        # accumulate the step-wise costs over the refinement sequence
        steps = range(self.ir_steps)
        kld_hi_q2p = sum([row_cost(self.kldi_q2p[i]) for i in steps])
        kld_hi_p2q = sum([row_cost(self.kldi_p2q[i]) for i in steps])
        # KLd between the distributions over z, in both directions
        z_q2p = gaussian_kld(self.q_z_mean, self.q_z_logvar,
                             self.p_z_mean, self.p_z_logvar)
        z_p2q = gaussian_kld(self.p_z_mean, self.p_z_logvar,
                             self.q_z_mean, self.q_z_logvar)
        return [row_cost(z_q2p), row_cost(z_p2q), kld_hi_q2p, kld_hi_p2q]

    def _construct_reg_costs(self):
        """
        Construct the cost for low-level basic regularization. E.g. for
        applying l2 regularization to the network activations and parameters.
        """
        param_reg_cost = sum([T.sum(p**2.0) for p in self.joint_params])
        return param_reg_cost

    def _construct_train_joint(self):
        """
        Compile the theano function that trains all networks jointly.

        The compiled function takes (xi, xo, br): input matrix, target
        matrix and an integer repeat count. Each row of xi/xo is repeated br
        times, self.joint_updates is applied, and the function returns
        [joint_cost, nll_cost, kld_cost, reg_cost, obs_costs].
        """
        # symbolic inputs of the compiled function
        xi = T.matrix()
        xo = T.matrix()
        br = T.lscalar()
        # fresh noise for the refinement steps, sized for the repeated batch
        _, hi_zmuv = self._construct_zmuv_samples(xi, br)
        substitutions = {
            self.x_in: xi.repeat(br, axis=0),
            self.x_out: xo.repeat(br, axis=0),
            self.hi_zmuv: hi_zmuv,
        }
        reported = [self.joint_cost, self.nll_cost, self.kld_cost,
                    self.reg_cost, self.obs_costs]
        return theano.function(inputs=[xi, xo, br],
                               outputs=reported,
                               givens=substitutions,
                               updates=self.joint_updates)

    def _construct_raw_klds(self):
        """
        Build a function for computing KLd per latent dimension.

        The returned callable takes numeric (XI, XO) and returns
        [init_klds, kld_q2p, kld_p2q]: the KLds of the initialization step
        as computed, followed by the step-wise KLds in each direction
        averaged over axis 1 and summed over the refinement steps.
        """
        # initialization KLds come first, then the two step-wise directions
        step_costs = [self.init_klds, self.kldi_q2p, self.kldi_p2q]
        cost_func = theano.function(
                inputs=[self.x_in, self.x_out, self.hi_zmuv],
                outputs=step_costs,
                updates=self.scan_updates)
        def raw_kld_computer(XI, XO):
            # the refinement noise is drawn with numpy for this function
            noise = to_fX( npr.randn(self.ir_steps, XI.shape[0], self.h_dim) )
            init_klds, step_q2p, step_p2q = cost_func(XI, XO, noise)
            kld_q2p = np.sum(np.mean(step_q2p, axis=1, keepdims=True), axis=0)
            kld_p2q = np.sum(np.mean(step_p2q, axis=1, keepdims=True), axis=0)
            return [init_klds, kld_q2p, kld_p2q]
        return raw_kld_computer

    def _construct_compute_fe_terms(self):
        """
        Build a function for estimating the terms of variational free energy.

        The returned callable takes (XI, XO, sample_count) and returns
        [mean_nll, mean_kld]: per-input NLL (after the last refinement step)
        and KLd, each averaged over sample_count one-sample estimates.
        """
        # symbolic inputs and the noise for a single pass over them
        xi = T.matrix()
        xo = T.matrix()
        _, hi_zmuv = self._construct_zmuv_samples(xi, 1)
        # the two free-energy terms to report
        nll_term = self.nlli[-1]
        kld_term = self.kld_z.flatten() + self.kld_hi_q2p.flatten()
        # one-sample free-energy estimate
        fe_term_sample = theano.function(
                inputs=[xi, xo],
                outputs=[nll_term, kld_term],
                givens={self.x_in: xi,
                        self.x_out: xo,
                        self.hi_zmuv: hi_zmuv},
                updates=self.scan_updates)
        def fe_term_estimator(XI, XO, sample_count):
            # accumulate one-sample estimates, then average them
            obs_count = XI.shape[0]
            nll_sum = np.zeros((obs_count,))
            kld_sum = np.zeros((obs_count,))
            for _ in range(sample_count):
                nll_i, kld_i = fe_term_sample(XI, XO)
                nll_sum += nll_i.ravel()
                kld_sum += kld_i.ravel()
            denom = float(sample_count)
            return [nll_sum / denom, kld_sum / denom]
        return fe_term_estimator

    def _construct_sample_from_prior(self):
        """
        Construct a function for drawing independent samples from the
        distribution generated by this MultiStageModel.

        The returned callable takes samp_count and returns the full sequence
        of "partially completed" examples: the transformed initial state
        followed by the transformed state after each refinement step. While
        sampling, the train switch is set to generation mode (0.0); its
        previous value is restored afterwards, even if sampling raises.
        """
        z_sym = T.matrix()
        x_sym = T.matrix()
        irs = self.ir_steps
        oputs = [self.obs_transform(self.s0)]
        oputs.extend([self.obs_transform(self.si[i]) for i in range(irs)])
        _, hi_zmuv = self._construct_zmuv_samples(x_sym, 1)
        # z is supplied directly; x_in/x_out are replaced by zeros shaped
        # like the x_sym input
        sample_func = theano.function(inputs=[z_sym, x_sym], outputs=oputs, \
                givens={ self.z: z_sym, \
                         self.x_in: T.zeros_like(x_sym), \
                         self.x_out: T.zeros_like(x_sym), \
                         self.hi_zmuv: hi_zmuv }, \
                updates=self.scan_updates)
        def prior_sampler(samp_count):
            x_samps = to_fX( np.zeros((samp_count, self.obs_dim)) )
            old_switch = self.train_switch.get_value(borrow=False)
            # set model to generation mode
            self.set_train_switch(switch_val=0.0)
            try:
                z_samps = to_fX( npr.randn(samp_count, self.z_dim) )
                model_samps = sample_func(z_samps, x_samps)
            finally:
                # always put the model back into its previous mode, so a
                # failed sampling call cannot leave it stuck in generation
                # mode
                self.set_train_switch(switch_val=old_switch)
            return model_samps
        return prior_sampler

    def _construct_sample_from_input(self):
        """
        Construct a function for drawing samples from the distribution
        generated by this MultiStageModel, conditioned on some inputs to the
        initial encoder stage (i.e. self.q_z_given_x).

        The returned callable takes (XI, XO=None, guided_decoding=False) and
        returns the full sequence of "partially completed" examples. XO
        defaults to XI. With guided_decoding the train switch is set to 1.0
        (samples come from the variational q), otherwise to 0.0 (samples
        come from the generative p). The previous switch value is restored
        afterwards, even if sampling raises.
        """
        xi = T.matrix()
        xo = T.matrix()
        irs = self.ir_steps
        oputs = [self.obs_transform(self.s0)]
        oputs.extend([self.obs_transform(self.si[i]) for i in range(irs)])
        _, hi_zmuv = self._construct_zmuv_samples(xi, 1)
        sample_func = theano.function(inputs=[xi, xo], outputs=oputs, \
                givens={ self.x_in: xi, \
                         self.x_out: xo, \
                         self.hi_zmuv: hi_zmuv }, \
                updates=self.scan_updates)
        def conditional_sampler(XI, XO=None, guided_decoding=False):
            XI = to_fX( XI )
            if XO is None:
                XO = XI
            XO = to_fX( XO )
            # set model to desired generation mode
            old_switch = self.train_switch.get_value(borrow=False)
            if guided_decoding:
                # take samples from guide policies (i.e. variational q)
                self.set_train_switch(switch_val=1.0)
            else:
                # take samples from model's generative policy
                self.set_train_switch(switch_val=0.0)
            try:
                # draw guided/unguided conditional samples
                model_samps = sample_func(XI, XO)
            finally:
                # always put the model back into its previous mode, so a
                # failed sampling call cannot leave the switch flipped
                self.set_train_switch(switch_val=old_switch)
            return model_samps
        return conditional_sampler
Esempio n. 33
0
class ConvPoolLayer(object):
    """
    A simple convolution --> max-pooling layer.

    The (symbolic) input to this layer must be a 4D theano tensor shaped
    like (batch_size, chan_count, im_dim_1, im_dim_2).

    filt_def should be a 4-tuple like (filt_count, in_chans, filt_def_1, filt_def_2)

    pool_def should be a 2-tuple like (pool_dim, pool_stride)

    W and b may be given as existing shared variables (e.g. to share
    parameters with another layer); when left as None, fresh ones are
    created: W from scaled Gaussian noise and b as a constant 0.1.
    """
    def __init__(self, rng, input=None, filt_def=None, pool_def=(2, 2), \
      activation=None, drop_rate=0., input_noise=0., bias_noise=0., \
      W=None, b=None, name="", W_scale=1.0):
        """
        Build the symbolic conv --> (noise) --> max-pool --> bias -->
        activation graph for the given input.

        rng is a numpy RandomState-like object; it seeds this layer's
        random stream and draws the initial filters. input_noise and
        bias_noise are standard deviations of Gaussian noise added to the
        input and to the convolution output; drop_rate is the probability
        of masking input entries. Each kind of noise is only applied when
        its level exceeds 1e-4. activation defaults to relu_actfun.
        """
        # Setup a shared random generator for this layer
        self.rng = CURAND_RandomStreams(rng.randint(1000000))

        self.clean_input = input

        # Add gaussian noise to the input (if desired)
        if (input_noise > 1e-4):
            self.fuzzy_input = input + self.rng.normal(size=input.shape, \
                    avg=0.0, std=input_noise, dtype=theano.config.floatX)
        else:
            self.fuzzy_input = input

        # Apply masking noise to the input (if desired)
        if (drop_rate > 1e-4):
            self.noisy_input = self._drop_from_input(self.fuzzy_input,
                                                     drop_rate)
        else:
            self.noisy_input = self.fuzzy_input

        # Set the activation function for the conv filters
        if activation:
            self.activation = activation
        else:
            self.activation = lambda x: relu_actfun(x)

        # use the given filters, or initialize them with random weights
        if W is None:
            W_init = 0.01 * np.asarray(rng.normal( \
              size=filt_def), dtype=theano.config.floatX)
            W = theano.shared(value=(W_scale*W_init), \
              name="{0:s}_W".format(name))
        self.W = W

        # the bias is a 1D tensor -- one bias per output feature map
        if b is None:
            b_init = np.zeros((filt_def[0], ), dtype=theano.config.floatX) + 0.1
            b = theano.shared(value=b_init, name="{0:s}_b".format(name))
        self.b = b

        # convolve input feature maps with filters
        input_c01b = self.noisy_input.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        filters_c01b = self.W.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        conv_op = FilterActs(stride=1, partial_sum=1)
        contig_input = gpu_contiguous(input_c01b)
        contig_filters = gpu_contiguous(filters_c01b)
        conv_out_c01b = conv_op(contig_input, contig_filters)

        # Add gaussian noise to the convolution output (if desired)
        if (bias_noise > 1e-4):
            noisy_conv_out_c01b = conv_out_c01b + self.rng.normal( \
              size=conv_out_c01b.shape, avg=0.0, std=bias_noise, \
              dtype=theano.config.floatX)
        else:
            noisy_conv_out_c01b = conv_out_c01b

        # downsample each feature map individually, using maxpooling
        pool_op = MaxPool(ds=pool_def[0], stride=pool_def[1])
        mp_out_c01b = pool_op(noisy_conv_out_c01b)
        mp_out_bc01 = mp_out_c01b.dimshuffle(3, 0, 1, 2)  # c01b to bc01

        # add the bias term. Since the bias is a vector (1D array), we first
        # reshape it to a tensor of shape (1,n_filters,1,1). Each bias will
        # thus be broadcasted across mini-batches and feature map
        # width & height
        self.noisy_linear_output = mp_out_bc01 + self.b.dimshuffle(
            'x', 0, 'x', 'x')
        self.linear_output = self.noisy_linear_output
        self.output = self.activation(self.noisy_linear_output)

        # store parameters of this layer
        self.params = [self.W, self.b]

        return

    def _drop_from_input(self, input, p):
        """p is the probability of dropping elements of input."""
        # get a drop mask that drops things with probability p
        drop_rnd = self.rng.uniform(size=input.shape, low=0.0, high=1.0, \
                dtype=theano.config.floatX)
        drop_mask = drop_rnd > p
        # get a scaling factor to keep expectations fixed after droppage
        drop_scale = 1. / (1. - p)
        # apply dropout mask and rescaling factor to the input
        droppy_input = drop_scale * input * drop_mask
        return droppy_input

    def _noisy_params(self, P, noise_lvl=0.):
        """Noisy weights, like convolving energy surface with a gaussian."""
        P_nz = P + self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl, \
                dtype=theano.config.floatX)
        return P_nz
Esempio n. 34
0
class DAELayer(object):
    """Denoising autoencoder layer with tied encoder/decoder weights.

    The encoder computes activation(X.W + b_h); the decoder is the affine map
    A_h.W^T + b_v. The encoder reads a randomly masked copy of fuzzy_input and
    the reconstruction is scored against clean_input.
    """

    def __init__(self, rng, clean_input=None, fuzzy_input=None,
                 in_dim=0, out_dim=0, activation=None, input_noise=0.,
                 W=None, b_h=None, b_v=None):
        """Create the layer's random stream, inputs and parameters.

        rng must provide randint() and standard_normal(). It seeds the
        layer-local random stream and draws the initial weights when W is
        not given. input_noise is the probability of masking each element
        of fuzzy_input. W, b_h and b_v may be passed in to reuse existing
        shared variables; any that are None are freshly initialized.
        """
        float_type = theano.config.floatX

        # Layer-local GPU random stream, seeded from the caller's generator.
        # (theano.tensor.shared_randomstreams.RandomStreams is a possible
        # alternative stream implementation.)
        self.rng = CURAND_RandomStreams(rng.randint(1000000))

        # The clean input is the reconstruction target; the encoder only
        # ever sees the masked version of the fuzzy input.
        self.clean_input = clean_input
        self.noisy_input = self._get_noisy_input(fuzzy_input, input_noise)

        # Basic layer properties
        self.activation = activation
        self.in_dim = in_dim
        self.out_dim = out_dim

        # Small random weights and zero biases, unless supplied by the caller
        if W is None:
            initial_weights = np.asarray(
                0.01 * rng.standard_normal(size=(in_dim, out_dim)),
                dtype=float_type,
            )
            W = theano.shared(value=initial_weights, name='W')
        if b_h is None:
            hidden_bias = np.zeros((out_dim, ), dtype=float_type)
            b_h = theano.shared(value=hidden_bias, name='b_h')
        if b_v is None:
            visible_bias = np.zeros((in_dim, ), dtype=float_type)
            b_v = theano.shared(value=visible_bias, name='b_v')

        # Keep handles to the (now definitely initialized) parameters
        self.W = W
        self.b_h = b_h
        self.b_v = b_v

        # Everything an optimizer is allowed to touch
        self.params = [self.W, self.b_h, self.b_v]

    def compute_costs(self, lam_l1=None):
        """Return [reconstruction cost, activation sparsity cost].

        lam_l1 is indexed as lam_l1[0] to obtain the sparsity weight, so it
        must be an indexable value despite the None default.
        """
        # Only the weights are perturbed; the hidden bias is used as-is
        noisy_weights = self._noisy_params(self.W, 0.01)
        hidden_bias = self.b_h
        # Decode the encoded noisy input
        visible_acts, hidden_acts = self._compute_activations(
            self.noisy_input, noisy_weights, hidden_bias, self.b_v)
        # Squared reconstruction error, averaged over rows of the input
        squared_error = T.sum((self.clean_input - visible_acts)**2.0)
        recon_cost = squared_error / self.clean_input.shape[0]
        # L1 penalty on row- and column-normalized hidden activations
        # (i.e. over both population and lifetime)
        row_l1_sum = T.sum(abs(row_normalize(hidden_acts))) / \
            hidden_acts.shape[0]
        col_l1_sum = T.sum(abs(col_normalize(hidden_acts))) / \
            hidden_acts.shape[1]
        sparse_cost = lam_l1[0] * (row_l1_sum + col_l1_sum)
        return [recon_cost, sparse_cost]

    def _compute_hidden_acts(self, X, W, b_h):
        """Encoder: activation applied to the affine map X.W + b_h."""
        return self.activation(T.dot(X, W) + b_h)

    def _compute_activations(self, X, W, b_h, b_v):
        """Run encoder then decoder; return [visible acts, hidden acts]."""
        hidden = self._compute_hidden_acts(X, W, b_h)
        # the decoder reuses the encoder weights, transposed
        visible = T.dot(hidden, W.T) + b_v
        return [visible, hidden]

    def _noisy_params(self, P, noise_lvl=0.):
        """Return P plus gaussian noise, or P itself for tiny noise levels.

        Perturbing the weights acts like convolving the energy surface with
        a gaussian. Noise levels of 1e-3 or less skip the sampling entirely.
        """
        if noise_lvl > 1e-3:
            return P + self.rng.normal(
                size=P.shape, avg=0.0, std=noise_lvl,
                dtype=theano.config.floatX)
        return P

    def _get_noisy_input(self, input, p):
        """Mask input, zeroing each element with probability p.

        Unlike dropout, the surviving elements are not rescaled.
        """
        uniform_draws = self.rng.uniform(
            input.shape, low=0.0, high=1.0, dtype=theano.config.floatX)
        keep_mask = uniform_draws > p
        # the comparison result is multiplied in directly, without a cast
        return input * keep_mask
class SRRModel(object):
    """
    Controller for training a sequential revelation and refinement model.

    Parameters:
        rng: numpy.random.RandomState (for reproducibility)
        x_out: the goal state for iterative refinement
        p_zi_given_xi: InfNet for stochastic part of step
        p_sip1_given_zi: HydraNet for deterministic part of step
        p_x_given_si: HydraNet for transform from s-space to x-space
        q_zi_given_xi: InfNet for the guide policy
        params: REQUIRED PARAMS SHOWN BELOW
                x_dim: dimension of observations to construct
                z_dim: dimension of latent space for policy wobble
                s_dim: dimension of space in which to perform construction
                use_p_x_given_si: boolean for whether to use p_x_given_si
                rev_sched: list of "revelation" blocks. each block is described
                           by the number of steps prior to revelation, and the
                           percentage of remaining pixels to reveal.
                rev_masks: matrix of revelation masks. the row i provides the
                           mask for iteration i of the srr loop. when this
                           argument is passed, rev_sched is ignored and the
                           revelation schedule is determined by rev_masks.
                step_type: either "add" or "jump"
                x_type: can be "bernoulli" or "gaussian"
                obs_transform: can be 'none' or 'sigmoid'
    """

    def __init__(
        self,
        rng=None,
        x_out=None,
        p_zi_given_xi=None,
        p_sip1_given_zi=None,
        p_x_given_si=None,
        q_zi_given_xi=None,
        params=None,
        shared_param_dicts=None,
    ):
        """
        Build the symbolic graph, costs, gradients and compiled functions.

        Construction proceeds in this order: read and validate params, set
        up the revelation schedule, create (or reuse) the shared variables
        owned by the model, unroll the refinement loop with theano.scan,
        assemble the NLL/KLd/regularization costs, derive ADAM updates, and
        finally compile the theano functions exposed on the instance.

        Parameters:
            rng: source of the seed for this model's random stream; only
                 rng.randint is called on it here
            x_out: symbolic variable for the target observations
            p_zi_given_xi, q_zi_given_xi: primary / guide policy networks;
                 must provide apply(..., do_samples=False) and mlp_params
            p_sip1_given_zi: network for the belief state update; must
                 provide apply(zi) and mlp_params
            p_x_given_si: network from s-space to x-space. Its mlp_params
                 attribute is read unconditionally below, so it must be a
                 real network even when use_p_x_given_si is False.
            params: dict with required keys "x_dim", "z_dim", "s_dim",
                 "use_p_x_given_si", "step_type" and "x_type". The keys
                 "obs_transform" and "rev_masks" are optional; "rev_sched"
                 is required whenever "rev_masks" is absent or None.
            shared_param_dicts: None to create fresh parameters, or a dict
                 with keys "s0", "obs_logvar" and "step_scales" to reuse
                 existing shared variables.
        """
        # setup a rng for this SRRModel
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_dim = self.params["x_dim"]
        self.z_dim = self.params["z_dim"]
        self.s_dim = self.params["s_dim"]
        self.use_p_x_given_si = self.params["use_p_x_given_si"]
        self.step_type = self.params["step_type"]
        self.x_type = self.params["x_type"]
        if self.use_p_x_given_si:
            print("Constructing hypotheses indirectly in s-space...")
        else:
            print("Constructing hypotheses directly in x-space...")
            # without p_x_given_si the belief state itself is fed to the
            # observation transform (see _from_si_to_x), so dims must agree
            assert self.s_dim == self.x_dim
        if "obs_transform" in self.params:
            assert (self.params["obs_transform"] == "sigmoid") or (self.params["obs_transform"] == "none")
            if self.params["obs_transform"] == "sigmoid":
                self.obs_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.obs_transform = lambda x: x
        else:
            # sigmoid is the default when no transform is requested
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        # bernoulli observations always go through a sigmoid; this overrides
        # whatever "obs_transform" selected above
        if self.x_type == "bernoulli":
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        self.shared_param_dicts = shared_param_dicts
        # Deal with revelation scheduling
        if ("rev_masks" in self.params) and (self.params["rev_masks"] is not None):
            # rev_masks is indexed as [0] -> masks for p, [1] -> masks for q
            rmp = self.params["rev_masks"][0].astype(theano.config.floatX)
            rmq = self.params["rev_masks"][1].astype(theano.config.floatX)
            self.rev_masks_p = theano.shared(value=rmp, name="srrm_rev_masks_p")
            self.rev_masks_q = theano.shared(value=rmq, name="srrm_rev_masks_q")
            self.rev_sched = None
            self.use_rev_masks = True
        else:
            self.rev_sched = self.params["rev_sched"]
            self.rev_masks_p = None
            self.rev_masks_q = None
            self.use_rev_masks = False
            # permitted step counts per revelation block (1 through 16)
            nice_nums = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
            # "validate" the set of revelation block descriptions
            for rev_block in self.rev_sched:
                assert rev_block[0] in nice_nums
                # reveal rate; the upper bound leaves slack slightly above 1.0
                assert (rev_block[1] >= 0.0) and (rev_block[1] <= 1.01)
        assert (self.x_type == "bernoulli") or (self.x_type == "gaussian")
        assert (self.step_type == "add") or (self.step_type == "jump")

        # grab handles to the relevant networks
        self.p_zi_given_xi = p_zi_given_xi
        self.p_sip1_given_zi = p_sip1_given_zi
        self.p_x_given_si = p_x_given_si
        self.q_zi_given_xi = q_zi_given_xi

        # record the symbolic variables that will provide inputs to the
        # computation graph created for this SRRModel
        self.x_out = x_out  # target output for generation
        self.zi_zmuv = T.tensor3()  # ZMUV gauss noise for policy wobble
        self.p_masks = T.tensor3()  # revelation masks for primary policy
        self.q_masks = T.tensor3()  # revelation masks for guide policy
        if self.use_rev_masks:
            # one scan step per row of the mask matrix
            self.total_steps = self.params["rev_masks"][0].shape[0]
        else:
            # one scan step per refinement step, summed over all blocks
            self.total_steps = sum([rb[0] for rb in self.rev_sched])

        # setup switching variable for changing between sampling/training
        zero_ary = to_fX(np.zeros((1,)))
        self.train_switch = theano.shared(value=zero_ary, name="srrm_train_switch")
        # a switch value of 1 selects guide-policy samples in srr_step_func
        self.set_train_switch(1.0)

        if self.shared_param_dicts is None:
            # initialize the parameters "owned" by this model
            s0_init = to_fX(np.zeros((self.s_dim,)))
            ss_init = to_fX(0.5 * np.ones((self.total_steps,)))
            self.s0 = theano.shared(value=s0_init, name="srrm_s0")
            self.obs_logvar = theano.shared(value=zero_ary, name="srrm_obs_logvar")
            # the tanh squashing keeps the effective log-variance in (-8, 8)
            self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.obs_logvar[0])
            self.step_scales = theano.shared(value=ss_init, name="srrm_step_scales")
            self.shared_param_dicts = {}
            self.shared_param_dicts["s0"] = self.s0
            self.shared_param_dicts["obs_logvar"] = self.obs_logvar
            self.shared_param_dicts["step_scales"] = self.step_scales
        else:
            # grab the parameters required by this model from a given dict
            self.s0 = self.shared_param_dicts["s0"]
            self.obs_logvar = self.shared_param_dicts["obs_logvar"]
            self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.obs_logvar[0])
            self.step_scales = self.shared_param_dicts["step_scales"]

        ##################################################################
        # Setup the sequential revelation and refinement loop using scan #
        ##################################################################
        # ss: This is a sequence of scalars that will be used to rescale the
        #     "gradient" input to the primary and guide policies.
        #
        # zi_zmuv: This is a sequence of ZMUV gaussian samples that will be
        #          reparametrized to sample actions from the policies.
        #
        # p_masks: This is a sequence of "unmasking" masks. When one of these
        #          masking variables is 1, the corresponding value in self.x_out
        #          will be "revealed" to the primary policy. Prediction error
        #          is measured for a value only the first time it is revealed.
        #          Once revealed, a value remains "visible" to the policy.
        #          The final step should reveal all values.
        #
        # q_masks: This is a sequence of "unmasking" masks. These are similar
        #          to p_masks, but control which values are revealed to the
        #          guide policy. The guide policy masking sequence should be
        #          constructed to stay "ahead of" the primary policy's masking
        #          sequence. The guide policy needs to know which values will
        #          be revealed to the primary policy so that it can focus its
        #          reconstruction efforts on those values. Otherwise, the guide
        #          policy will immediately reconstruct the entire target.
        #
        # si: This is the current "belief state" for each trial in the training
        #     batch. The belief state is updated in each iteration, and passed
        #     forward through the recurrence.
        #
        # mi_p: This is the current revelation mask for the primary policy.
        #
        # mi_q: This is the current revelation mask for the guide policy.
        #
        def srr_step_func(ss, zi_zmuv, p_masks, q_masks, si, mi_p, mi_q):
            # transform the current belief state into an observation
            si_as_x = self._from_si_to_x(si)
            # log(1 + exp(ss)) is a softplus, so the applied scale is positive
            full_grad = T.log(1.0 + T.exp(ss)) * (self.x_out - si_as_x)

            # get the masked belief state and gradient for primary policy
            xi_for_p = (mi_p * self.x_out) + ((1.0 - mi_p) * si_as_x)
            grad_for_p = mi_p * full_grad

            # update the guide policy's revelation mask
            new_to_q = (1.0 - mi_q) * q_masks
            mip1_q = mi_q + new_to_q
            # get the masked belief state and gradient for guide policy
            # xi_for_q = (mip1_q * self.x_out) + ((1.0 - mip1_q) * si_as_x)
            # the guide sees the same masked state as the primary policy;
            # only its gradient input uses the more revealing mask
            xi_for_q = xi_for_p
            grad_for_q = mip1_q * full_grad

            # get samples of next zi, according to the primary policy
            zi_p_mean, zi_p_logvar = self.p_zi_given_xi.apply(
                T.horizontal_stack(xi_for_p, grad_for_p), do_samples=False
            )
            zi_p = zi_p_mean + (T.exp(0.5 * zi_p_logvar) * zi_zmuv)
            # get samples of next zi, according to the guide policy
            # (both policies reparametrize the same zi_zmuv noise)
            zi_q_mean, zi_q_logvar = self.q_zi_given_xi.apply(
                T.horizontal_stack(xi_for_q, grad_for_q), do_samples=False
            )
            zi_q = zi_q_mean + (T.exp(0.5 * zi_q_logvar) * zi_zmuv)
            # make zi samples that can be switched between zi_p and zi_q
            zi = (self.train_switch[0] * zi_q) + ((1.0 - self.train_switch[0]) * zi_p)

            # compute relevant KLds for this step
            kldi_q2p = gaussian_kld(zi_q_mean, zi_q_logvar, zi_p_mean, zi_p_logvar)  # KL(q || p)
            kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar, zi_q_mean, zi_q_logvar)  # KL(p || q)
            kldi_p2g = gaussian_kld(zi_p_mean, zi_p_logvar, 0.0, 0.0)  # KL(p || N(0, I))

            # compute next si, given sampled zi (i.e. update the belief state)
            hydra_out = self.p_sip1_given_zi.apply(zi)
            si_step = hydra_out[0]
            if self.step_type == "jump":
                # jump steps always do a full swap of belief state
                sip1 = si_step
            else:
                # additive steps adjust the belief state like an LSTM
                # (the +2.0 shifts both gates toward 1 when the raw gate
                # outputs are near zero)
                write_gate = T.nnet.sigmoid(2.0 + hydra_out[1])
                erase_gate = T.nnet.sigmoid(2.0 + hydra_out[2])
                sip1 = (erase_gate * si) + (write_gate * si_step)
            # update the primary policy's revelation mask
            new_to_p = (1.0 - mi_p) * p_masks
            mip1_p = mi_p + new_to_p
            # compute NLL only for the newly revealed values
            nlli = self._construct_nll_costs(sip1, self.x_out, new_to_p)
            # each loop iteration produces the following values:
            #   sip1: belief state at end of current step
            #   mip1_p: revealed values mask to use in next step (primary)
            #   mip1_q: revealed values mask to use in next step (guide)
            #   nlli: NLL for values revealed at end of current step
            #   kldi_q2p: KL(q || p) for the current step
            #   kldi_p2q: KL(p || q) for the current step
            #   kldi_p2g: KL(p || N(0,I)) for the current step
            return sip1, mip1_p, mip1_q, nlli, kldi_q2p, kldi_p2q, kldi_p2g

        # initialize belief state to self.s0
        self.s0_full = T.alloc(0.0, self.x_out.shape[0], self.s_dim) + self.s0
        # initialize revelation masks to 0 for all values in all trials
        self.m0_full = T.zeros_like(self.x_out)
        # setup initial values to pass to scan op
        # (the first three outputs are fed back as si, mi_p and mi_q; the
        # None entries mark outputs that are collected but not recurrent)
        outputs_init = [self.s0_full, self.m0_full, self.m0_full, None, None, None, None]
        sequences_init = [self.step_scales, self.zi_zmuv, self.p_masks, self.q_masks]
        # apply scan op for the sequential imputation loop
        self.scan_results, self.scan_updates = theano.scan(
            srr_step_func, outputs_info=outputs_init, sequences=sequences_init
        )

        # grab results of the scan op. all values are computed for each step
        self.si = self.scan_results[0]  # belief states
        self.mi_p = self.scan_results[1]  # primary revelation masks
        self.mi_q = self.scan_results[2]  # guide revelation masks
        self.nlli = self.scan_results[3]  # NLL on newly revealed values
        self.kldi_q2p = self.scan_results[4]  # KL(q || p)
        self.kldi_p2q = self.scan_results[5]  # KL(p || q)
        self.kldi_p2g = self.scan_results[6]  # KL(p || N(0,I))

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX(np.zeros((1,)))
        self.lr = theano.shared(value=zero_ary, name="srr_lr")
        # shared var momentum parameters for ADAM optimization
        self.mom_1 = theano.shared(value=zero_ary, name="srr_mom_1")
        self.mom_2 = theano.shared(value=zero_ary, name="srr_mom_2")
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared vars for weighting prior kld against reconstruction
        self.lam_kld_p = theano.shared(value=zero_ary, name="srr_lam_kld_p")
        self.lam_kld_q = theano.shared(value=zero_ary, name="srr_lam_kld_q")
        self.lam_kld_g = theano.shared(value=zero_ary, name="srr_lam_kld_g")
        self.lam_kld_s = theano.shared(value=zero_ary, name="srr_lam_kld_s")
        self.set_lam_kld(lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0, lam_kld_s=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name="srr_lam_l2w")
        self.set_lam_l2w(1e-5)

        # grab all of the "optimizable" parameters from the base networks
        self.joint_params = [self.s0, self.obs_logvar, self.step_scales]
        self.joint_params.extend(self.p_zi_given_xi.mlp_params)
        self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
        # NOTE(review): read even when use_p_x_given_si is False, so
        # p_x_given_si must not be None in that configuration -- confirm
        self.joint_params.extend(self.p_x_given_si.mlp_params)
        self.joint_params.extend(self.q_zi_given_xi.mlp_params)

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_p, self.kld_q, self.kld_g, self.kld_s = self._construct_kld_costs(p=1.0)
        self.kld_costs = (
            (self.lam_kld_p[0] * self.kld_p)
            + (self.lam_kld_q[0] * self.kld_q)
            + (self.lam_kld_g[0] * self.kld_g)
            + (self.lam_kld_s[0] * self.kld_s)
        )
        self.kld_cost = T.mean(self.kld_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        self.nll_costs = T.sum(self.nlli, axis=0)  # sum the per-step NLLs
        self.nll_cost = T.mean(self.nll_costs)
        # the bound uses the unweighted KL(q || p) term, not self.kld_costs
        self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
        self.nll_bound = T.mean(self.nll_bounds)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-TRIAL COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the generator and inferencer networks
        self.joint_updates = get_adam_updates(
            params=self.joint_params,
            grads=self.joint_grads,
            alpha=self.lr,
            beta1=self.mom_1,
            beta2=self.mom_2,
            mom2_init=1e-3,
            smoothing=1e-5,
            max_grad_norm=10.0,
        )
        # fold the updates returned by scan into the training updates so
        # both are applied by the same compiled function
        for k, v in self.scan_updates.items():
            self.joint_updates[k] = v

        # Construct theano functions for training and diagnostic computations
        print("Compiling cost computer...")
        self.compute_raw_costs = self._construct_raw_costs()
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling sequence sampler...")
        self.sequence_sampler = self._construct_sequence_sampler()
        # make easy access points for some interesting parameters
        # self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W
        return

    def _from_si_to_x(self, si):
        """
        Convert the given si from s-space to x-space.
        """
        if self.use_p_x_given_si:
            x_pre_trans, _ = self.p_x_given_si.apply(si)
        else:
            x_pre_trans = si
        x_post_trans = self.obs_transform(x_pre_trans)
        return x_post_trans

    def set_sgd_params(self, lr=0.01, mom_1=0.9, mom_2=0.999):
        """
        Store the learning rate and both momentum coefficients.

        Each value is written into the matching one-element shared variable
        (self.lr, self.mom_1, self.mom_2), in that order.
        """
        settings = (("lr", lr), ("mom_1", mom_1), ("mom_2", mom_2))
        for attr_name, setting in settings:
            # wrap the value in a length-1 array, as the shared vars expect
            boxed = np.zeros((1,)) + setting
            getattr(self, attr_name).set_value(to_fX(boxed))
        return

    def set_lam_kld(self, lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0, lam_kld_s=0.0):
        """
        Set the weights applied to each KL-divergence term in the cost.

        The four values are written, in order, into the one-element shared
        variables self.lam_kld_p, self.lam_kld_q, self.lam_kld_g and
        self.lam_kld_s.
        """
        weights = (
            ("lam_kld_p", lam_kld_p),
            ("lam_kld_q", lam_kld_q),
            ("lam_kld_g", lam_kld_g),
            ("lam_kld_s", lam_kld_s),
        )
        for attr_name, weight in weights:
            boxed = np.zeros((1,)) + weight
            getattr(self, attr_name).set_value(to_fX(boxed))
        return

    def set_lam_l2w(self, lam_l2w=1e-3):
        """
        Set the weight of the l2 penalty on the model parameters.

        The value is written into the one-element shared variable
        self.lam_l2w.
        """
        boxed = np.zeros((1,)) + lam_l2w
        self.lam_l2w.set_value(to_fX(boxed))
        return

    def set_train_switch(self, switch_val=0.0):
        """
        Flip the model between training and sampling behavior.

        The given value is snapped to 0.0 (below 0.5) or 1.0 (otherwise)
        before being written into the shared variable self.train_switch.
        """
        # binarize the requested value
        switch_val = 0.0 if switch_val < 0.5 else 1.0
        boxed = np.zeros((1,)) + switch_val
        self.train_switch.set_value(to_fX(boxed))
        return

    def _construct_zi_zmuv(self, xo):
        """
        Build the zero-mean, unit-variance gaussian samples that drive one
        trajectory per row of xo.

        The result has one slice per step: its shape is
        (self.total_steps, xo.shape[0], self.z_dim).
        """
        noise_shape = (self.total_steps, xo.shape[0], self.z_dim)
        return self.rng.normal(
            size=noise_shape,
            avg=0.0,
            std=1.0,
            dtype=theano.config.floatX,
        )

    def _construct_rev_masks(self, xo):
        """
        Build the per-step revelation masks for the batch in xo.

        Returns [pmasks, qmasks]: the mask sequence for the primary policy
        (p) and for the guide policy (q). With fixed masks, the stored mask
        matrices are repeated across the batch. Otherwise one random binary
        mask is sampled per revelation block in self.rev_sched.
        """
        if self.use_rev_masks:
            # insert a broadcastable batch axis, then tile it to batch size
            p_with_batch_axis = self.rev_masks_p.dimshuffle(0, "x", 1)
            pmasks = p_with_batch_axis.repeat(xo.shape[0], axis=1)
            q_with_batch_axis = self.rev_masks_q.dimshuffle(0, "x", 1)
            qmasks = q_with_batch_axis.repeat(xo.shape[0], axis=1)
            return [pmasks, qmasks]

        p_seq = []
        q_seq = []
        # a mask that reveals nothing
        blank = T.alloc(0.0, 1, xo.shape[0], xo.shape[1])
        for block in self.rev_sched:
            # block[0] is the number of steps, block[1] the reveal rate
            draws = self.rng.uniform(
                size=(1, xo.shape[0], xo.shape[1]),
                low=0.0,
                high=1.0,
                dtype=theano.config.floatX,
            )
            reveal = draws < block[1]
            # Within a block, the guide policy (q) gets the block's mask on
            # every step, so it can steer toward the values about to be
            # revealed. The primary policy (p) gets an empty mask on every
            # step except the last one of the block.
            #
            # With a single block of one step and a reveal rate of 1.0,
            # this reduces to a standard variational auto-encoder.
            p_seq.extend(blank for _ in range(block[0] - 1))
            q_seq.extend(reveal for _ in range(block[0] - 1))
            p_seq.append(reveal)
            q_seq.append(reveal)
        # stack the per-step masks along the leading (step) axis
        pmasks = T.cast(T.concatenate(p_seq, axis=0), "floatX")
        qmasks = T.cast(T.concatenate(q_seq, axis=0), "floatX")
        return [pmasks, qmasks]

    def _construct_nll_costs(self, si, xo, nll_mask):
        """
        Build the negative log-likelihood of xo under belief state si.

        nll_mask is forwarded to the log-probability helper as its mask
        argument, so the likelihood is only scored where the mask selects.
        The result is flattened before being returned.
        """
        x_hat = self._from_si_to_x(si)
        if self.x_type != "bernoulli":
            log_lik = log_prob_gaussian2(xo, x_hat, log_vars=self.bounded_logvar, mask=nll_mask)
        else:
            log_lik = log_prob_bernoulli(xo, x_hat, mask=nll_mask)
        return -log_lik.flatten()

    def _construct_kld_s(self, s_i, s_j):
        """
        Compute KL(s_i || s_j) in x-space, treating each output as the mean
        of a bernoulli variable. The element-wise divergences are summed
        over axis 1.
        """
        x_i = self._from_si_to_x(s_i)
        x_j = self._from_si_to_x(s_j)
        # contribution of the "on" outcome
        on_term = x_i * (T.log(x_i) - T.log(x_j))
        # contribution of the "off" outcome
        off_term = (1.0 - x_i) * (T.log(1.0 - x_i) - T.log(1.0 - x_j))
        return T.sum(on_term + off_term, axis=1)

    def _construct_kld_costs(self, p=1.0):
        """
        Build the KL-divergence costs accumulated over all steps.

        Returns [kld_p, kld_q, kld_g, kld_s], where the first three sum
        KL(p || q), KL(q || p) and KL(p || N(0, I)) (each raised to the
        power p and summed over axis 1) across steps, and the last sums the
        divergence between consecutive belief states.
        """
        p2q_terms = []
        q2p_terms = []
        p2g_terms = []
        state_terms = []
        # self.s0 expanded to the shape of a single step's belief state
        initial_state = 0.0 * self.si[0] + self.s0
        for step in range(self.total_steps):
            p2q_terms.append(T.sum(self.kldi_p2q[step] ** p, axis=1))
            q2p_terms.append(T.sum(self.kldi_q2p[step] ** p, axis=1))
            p2g_terms.append(T.sum(self.kldi_p2g[step] ** p, axis=1))
            # compare each belief state against the one that preceded it
            state_terms.append(
                self._construct_kld_s(
                    self.si[step],
                    initial_state if step == 0 else self.si[step - 1],
                )
            )
        # add up the per-step costs
        return [sum(p2q_terms), sum(q2p_terms), sum(p2g_terms), sum(state_terms)]

    def _construct_reg_costs(self):
        """
        Build the basic l2 regularization cost: the sum of squared entries
        over every parameter in self.joint_params.
        """
        squared_norms = [T.sum(param ** 2.0) for param in self.joint_params]
        return sum(squared_norms)

    def _construct_compute_fe_terms(self):
        """
        Build an estimator for the terms of the variational free energy.

        Compiles a theano function producing one sample of the per-trial NLL
        and KL(q || p) costs, and returns a wrapper that averages several
        such samples.
        """
        # symbolic inputs that stand in for the model's graph inputs
        batch = T.matrix()
        noise = self._construct_zi_zmuv(batch)
        p_seq, q_seq = self._construct_rev_masks(batch)
        # values to output
        per_trial_nll = self.nll_costs.flatten()
        per_trial_kld = self.kld_q.flatten()
        # one-sample free-energy estimate
        substitutions = {
            self.x_out: batch,
            self.zi_zmuv: noise,
            self.p_masks: p_seq,
            self.q_masks: q_seq,
        }
        fe_term_sample = theano.function(
            inputs=[batch],
            outputs=[per_trial_nll, per_trial_kld],
            givens=substitutions,
            updates=self.scan_updates,
            on_unused_input="ignore",
        )

        def fe_term_estimator(XO, sample_count=20, use_guide_policy=True):
            """Average sample_count one-sample estimates for the rows of XO."""
            # remember the current mode so it can be restored afterwards
            saved_switch = self.train_switch.get_value(borrow=False)
            # 1.0 samples from the guide policy, 0.0 from the primary policy
            self.set_train_switch(switch_val=1.0 if use_guide_policy else 0.0)
            nll_total = np.zeros((XO.shape[0],))
            kld_total = np.zeros((XO.shape[0],))
            for _ in range(sample_count):
                sample = fe_term_sample(XO)
                nll_total += sample[0].ravel()
                kld_total += sample[1].ravel()
            mean_nll = nll_total / float(sample_count)
            mean_kld = kld_total / float(sample_count)
            # put the model back into whichever mode it was in
            self.set_train_switch(switch_val=saved_switch)
            if not use_guide_policy:
                # no KLd if samples are from the primary policy...
                mean_kld = 0.0 * mean_kld
            return [mean_nll, mean_kld]

        return fe_term_estimator

    def _construct_raw_costs(self):
        """
        Build a function computing the raw costs, i.e. the costs before any
        lambda weighting is applied.
        """
        # symbolic inputs that stand in for the model's graph inputs
        batch = T.matrix()
        noise = self._construct_zi_zmuv(batch)
        p_seq, q_seq = self._construct_rev_masks(batch)
        # per-step costs, straight from the scan outputs
        substitutions = {
            self.x_out: batch,
            self.zi_zmuv: noise,
            self.p_masks: p_seq,
            self.q_masks: q_seq,
        }
        cost_func = theano.function(
            inputs=[batch],
            outputs=[self.nlli, self.kldi_q2p, self.kldi_p2q, self.kldi_p2g],
            givens=substitutions,
            updates=self.scan_updates,
            on_unused_input="ignore",
        )

        def _per_latent_dim(step_costs):
            # average over axis 1 (kept as a singleton), then sum over axis 0
            return np.sum(np.mean(step_costs, axis=1, keepdims=True), axis=0)

        def raw_cost_computer(XO):
            """Return batch-based estimates of the raw costs for XO.

            The result is [step_nlls, step_klds, kld_q2p, kld_p2q, kld_p2g]:
              step_nlls: the expected NLL cost for each step
              step_klds: the expected KL(q||p) cost for each step
              kld_q2p: the expected KL(q||p) cost for each latent dim
              kld_p2q: the expected KL(p||q) cost for each latent dim
              kld_p2g: the expected KL(p||N(0,I)) cost for each latent dim
            """
            step_costs = cost_func(to_fX(XO))
            kld_q2p = _per_latent_dim(step_costs[1])
            kld_p2q = _per_latent_dim(step_costs[2])
            kld_p2g = _per_latent_dim(step_costs[3])
            # sum KL(q||p) over axis 2, then average over axis 1
            step_klds = np.mean(np.sum(step_costs[1], axis=2, keepdims=True), axis=1)
            step_klds = to_fX(np.asarray(list(step_klds)))
            step_nlls = np.mean(step_costs[0], axis=1)
            step_nlls = to_fX(np.asarray(list(step_nlls)))
            return [step_nlls, step_klds, kld_q2p, kld_p2q, kld_p2g]

        return raw_cost_computer

    def _construct_train_joint(self):
        """
        Compile the theano function used to train all networks jointly.

        The compiled function takes one input matrix, substitutes it (and the
        noise/masks derived from it) into the model's graph, applies
        self.joint_updates on every call, and returns, in order: joint_cost,
        nll_bound, nll_cost, kld_cost, reg_cost and obs_costs.
        """
        # symbolic input, plus the noise and masks derived from it
        x_sym = T.matrix()
        zmuv_sym = self._construct_zi_zmuv(x_sym)
        p_mask_sym, q_mask_sym = self._construct_rev_masks(x_sym)
        # values reported back to the caller after each update
        monitored = [
            self.joint_cost,
            self.nll_bound,
            self.nll_cost,
            self.kld_cost,
            self.reg_cost,
            self.obs_costs,
        ]
        substitutions = {
            self.x_out: x_sym,
            self.zi_zmuv: zmuv_sym,
            self.p_masks: p_mask_sym,
            self.q_masks: q_mask_sym,
        }
        return theano.function(
            inputs=[x_sym],
            outputs=monitored,
            givens=substitutions,
            updates=self.joint_updates,
            on_unused_input="ignore",
        )

    def _construct_sequence_sampler(self):
        """
        Construct a function for sampling state/mask trajectories.

        A theano function is compiled that, for one input matrix, returns the
        observation-space view (via self._from_si_to_x) of the initial state
        and of each of the self.total_steps subsequent states, followed by the
        initial mask and the mask for each step.

        Returns:
            sample_func(XO, use_guide_policy=False) -> [xm_seq, xi_seq, mi_seq]
            Each array has shape (total_steps + 1, XO.shape[0], XO.shape[1])
            and dtype theano.config.floatX:
              xi_seq: the state (in observation space) at each step
              mi_seq: the mask at each step
              xm_seq: mask-weighted blend, mi * XO + (1 - mi) * xi
            XO must expose a two-dimensional .shape (e.g. a numpy array).
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # collect the outputs to return from this function
        step_range = range(self.total_steps)
        states = [self._from_si_to_x(self.s0_full)]
        states += [self._from_si_to_x(self.si[i]) for i in step_range]
        masks = [self.m0_full] + [self.mi_p[i] for i in step_range]
        outputs = states + masks
        # compile the theano function
        # NOTE(review): this uses self.joint_updates, whereas the raw-cost
        # function uses self.scan_updates. If joint_updates contains parameter
        # updates, every sampling call also modifies the model -- confirm that
        # this is intended before relying on the sampler during evaluation.
        func = theano.function(
            inputs=[xo],
            outputs=outputs,
            givens={
                self.x_out: xo,
                self.zi_zmuv: zizmuv,
                self.p_masks: pmasks,
                self.q_masks: qmasks,
            },
            updates=self.joint_updates,
            on_unused_input="ignore",
        )

        # visualize trajectories generated by the model
        def sample_func(XO, use_guide_policy=False):
            # remember the current mode so that it can be restored afterwards
            old_switch = self.train_switch.get_value(borrow=False)
            # 1.0 -> sample from the guide policy, 0.0 -> primary policy
            self.set_train_switch(switch_val=1.0 if use_guide_policy else 0.0)
            try:
                # get belief states and masks generated by the scan loop
                scan_vals = func(to_fX(XO))
            finally:
                # always put the model back into its previous mode, even if
                # the compiled function raised (e.g. on a bad input shape)
                self.set_train_switch(switch_val=old_switch)
            step_count = self.total_steps + 1
            seq_shape = (step_count, XO.shape[0], XO.shape[1])
            xm_seq = np.zeros(seq_shape, dtype=theano.config.floatX)
            xi_seq = np.zeros(seq_shape, dtype=theano.config.floatX)
            mi_seq = np.zeros(seq_shape, dtype=theano.config.floatX)
            for i in range(step_count):
                # states occupy the first step_count outputs, masks the rest
                _xi = scan_vals[i]
                _mi = scan_vals[i + step_count]
                xm_seq[i, :, :] = (_mi * XO) + ((1.0 - _mi) * _xi)
                xi_seq[i, :, :] = _xi
                mi_seq[i, :, :] = _mi
            return [xm_seq, xi_seq, mi_seq]

        return sample_func

    def save_to_file(self, f_name=None):
        """
        Dump important stuff to a Python pickle, so that we can reload this
        model later.

        Three objects are pickled to the file, in this order:
          1. self.params, which just holds "simple" python values
          2. a dict mapping each key of self.shared_param_dicts to a numpy
             copy of the corresponding theano shared variable's value
          3. a dict holding the save_to_dict() output of each "child" model
             (p_zi_given_xi, p_sip1_given_zi, p_x_given_si, q_zi_given_xi)

        Args:
            f_name: path of the file to write; required despite the default.

        Raises:
            AssertionError: if f_name is None.
        """
        assert f_name is not None, "f_name must be provided"
        # Gather everything before touching the file system, so that a failure
        # while collecting values does not leave a truncated file behind.
        # Copy shared variables out as numpy arrays (borrow=False).
        numpy_param_dicts = {
            key: shared_var.get_value(borrow=False)
            for key, shared_var in self.shared_param_dicts.items()
        }
        # get numpy dicts for each of the "child" models that we must save
        child_model_dicts = {
            "p_zi_given_xi": self.p_zi_given_xi.save_to_dict(),
            "p_sip1_given_zi": self.p_sip1_given_zi.save_to_dict(),
            "p_x_given_si": self.p_x_given_si.save_to_dict(),
            "q_zi_given_xi": self.q_zi_given_xi.save_to_dict(),
        }
        # The context manager closes the handle even if a dump fails.
        with open(f_name, "wb") as f_handle:
            for payload in (self.params, numpy_param_dicts, child_model_dicts):
                # protocol=-1 selects the highest available pickle protocol
                cPickle.dump(payload, f_handle, protocol=-1)