コード例 #1
0
def compare_speed():
    """Profile MRG against CURAND streams for uniform and normal sampling.

    Four profiled Theano functions are compiled; each one takes no inputs,
    returns nothing, and overwrites a single shared vector of ``N`` values
    with a fresh draw.  The compiled graph of every function is printed and
    then every function is called 100 times.
    """
    # To run this speed comparison
    # cd <directory of this file>
    # THEANO_FLAGS=device=gpu \
    #   python -c 'import test_rng_curand; test_rng_curand.compare_speed()'

    mrg = MRG_RandomStreams()
    crn = CURAND_RandomStreams(234)

    N = 1000 * 100

    # Single destination buffer shared by all four functions.
    dest = theano.shared(numpy.zeros(N, dtype=theano.config.floatX))

    def build(sample, label):
        # Compile a function whose only effect is storing `sample` in `dest`.
        return theano.function([], [], updates={dest: sample},
                               profile=label)

    mrg_u = build(mrg.uniform((N,)), 'mrg uniform')
    crn_u = build(crn.uniform((N,)), 'crn uniform')
    mrg_n = build(mrg.normal((N,)), 'mrg normal')
    crn_n = build(crn.normal((N,)), 'crn normal')
    samplers = (mrg_u, crn_u, mrg_n, crn_n)

    # Show the compiled graph of each function before running anything.
    for fn in samplers:
        print('DEBUGPRINT')
        print('----------')
        theano.printing.debugprint(fn)

    for call_idx in range(100):
        # don't time the first call, it has some startup cost
        timed = (call_idx > 0)
        for fn in samplers:
            fn.fn.time_thunks = timed
            fn()
コード例 #2
0
ファイル: test_rng_curand.py プロジェクト: yyq90/grammarVAE
def compare_speed():
    """Compare the speed of MRG and CURAND random streams.

    Compiles one profiled Theano function per (generator, distribution)
    pair, each writing ``N`` samples into a common shared vector, prints
    the debug graph of each function, and finally runs all of them for
    100 rounds.
    """
    # To run this speed comparison
    # cd <directory of this file>
    # THEANO_FLAGS=device=gpu \
    #   python -c 'import test_rng_curand; test_rng_curand.compare_speed()'

    mrg = MRG_RandomStreams()
    crn = CURAND_RandomStreams(234)

    N = 1000 * 100

    dest = theano.shared(numpy.zeros(N, dtype=theano.config.floatX))

    # (sampling method, profile label), in the order they are compiled.
    specs = (
        (mrg.uniform, 'mrg uniform'),
        (crn.uniform, 'crn uniform'),
        (mrg.normal, 'mrg normal'),
        (crn.normal, 'crn normal'),
    )
    compiled = [
        theano.function([], [], updates={dest: sampler((N,))},
                        profile=label)
        for sampler, label in specs
    ]

    # Dump the graph of every compiled function.
    for func in compiled:
        print('DEBUGPRINT')
        print('----------')
        theano.printing.debugprint(func)

    for round_no in range(100):
        for func in compiled:
            # don't time the first call, it has some startup cost
            func.fn.time_thunks = (round_no > 0)
            func()
コード例 #3
0
def check_uniform_basic(shape_as_symbolic, dim_as_symbolic=False):
    """
    check_uniform_basic(shape_as_symbolic, dim_as_symbolic=False)

    Basic sanity check for the `uniform` method of a
    `CURAND_RandomStreams` object.

    Verifies that the variates

     * lie in the range [0, 1]
     * have a mean in the right neighbourhood (between 0.25 and 0.75)
     * have the requested shape (10, 10)
     * differ between successive calls and between the two streams

    Parameters
    ----------
    shape_as_symbolic : boolean
        If `True`, test the case that the shape tuple is a symbolic
        variable rather than known at compile-time.

    dim_as_symbolic : boolean
        If `True`, test the case that an element of the shape
        tuple is a Theano symbolic. Irrelevant if `shape_as_symbolic`
        is `True`.
    """
    rng = CURAND_RandomStreams(234)
    if shape_as_symbolic:
        # the whole shape is a TensorConstant with the value (10, 10)
        shape = constant((10, 10))
    elif dim_as_symbolic:
        # only the second dimension is symbolic
        shape = (10, constant(10))
    else:
        shape = (10, 10)
    first_draw = rng.uniform(shape)
    second_draw = rng.uniform(shape)

    sample_first = theano.function([], first_draw, mode=mode_with_gpu)
    sample_second = theano.function([], second_draw, mode=mode_with_gpu)

    first_values = [sample_first() for _ in range(3)]
    second_values = [sample_second() for _ in range(3)]

    # successive calls, and the two independent draws, must all differ
    assert numpy.all(first_values[0] != first_values[1])
    assert numpy.all(second_values[0] != second_values[1])
    assert numpy.all(first_values[0] != second_values[0])

    for values in first_values:
        assert values.shape == (10, 10)
        lowest = values.min()
        assert lowest >= 0
        highest = values.max()
        assert highest <= 1
        assert lowest < highest
        assert .25 <= values.mean() <= .75
コード例 #4
0
def check_uniform_basic(shape_as_symbolic, dim_as_symbolic=False):
    """
    check_uniform_basic(shape_as_symbolic, dim_as_symbolic=False)

    Sanity-check `CURAND_RandomStreams.uniform` on a 10x10 request.

    The check asserts that variates

     * are in the range [0, 1]
     * have a mean near 0.5 (within [0.25, 0.75])
     * are of the specified shape
     * change from one call to the next, and differ between two
       separately requested streams

    Parameters
    ----------
    shape_as_symbolic : boolean
        If `True`, test the case that the shape tuple is a symbolic
        variable rather than known at compile-time.

    dim_as_symbolic : boolean
        If `True`, test the case that an element of the shape
        tuple is a Theano symbolic. Irrelevant if `shape_as_symbolic`
        is `True`.
    """
    rng = CURAND_RandomStreams(234)

    def pick_shape():
        # Select how the (10, 10) shape is presented to `uniform`.
        if shape_as_symbolic:
            # a TensorConstant holding the whole shape
            return constant((10, 10))
        if dim_as_symbolic:
            # one symbolic dimension, the other known
            return (10, constant(10))
        return (10, 10)

    shape = pick_shape()
    u0 = rng.uniform(shape)
    u1 = rng.uniform(shape)

    f0 = theano.function([], u0, mode=mode_with_gpu)
    f1 = theano.function([], u1, mode=mode_with_gpu)

    draws0 = [f0() for _ in range(3)]
    draws1 = [f1() for _ in range(3)]

    # elements must differ across calls and across the two streams
    for left, right in ((draws0[0], draws0[1]),
                        (draws1[0], draws1[1]),
                        (draws0[0], draws1[0])):
        assert numpy.all(left != right)

    for draw in draws0:
        assert draw.shape == (10, 10)
        assert draw.min() >= 0
        assert draw.max() <= 1
        assert draw.min() < draw.max()
        assert .25 <= draw.mean() <= .75
コード例 #5
0
class Training(Layer):
    """Layer owning an embedding matrix ``W`` plus margin-based ranking costs.

    ``output_func`` must be called first: it stores the score matrix in
    ``self.f``, which every cost method reads.
    """

    def __init__(self,rng, W=None,m=1.0, n_samples=50,shape=None,batch_size=1000):
        """
        Parameters
        ----------
        rng : object with a numpy-style ``uniform(low, high, size)`` method
            Only used when `W` is None.
        W : array or None
            Initial embedding matrix. When None, a matrix of size
            ``(shape[0], shape[1])`` is drawn uniformly from
            ``[-sqrt(6 / (shape[0] + shape[1])), +sqrt(6 / (shape[0] + shape[1]))]``.
        m : float
            Margin used by the hinge costs.
        n_samples : int
            Number of rows of the random mask sampled by ``self.rfun``.
        shape : pair of ints or None
            Shape of `W`; required when `W` is None.
        batch_size : int
            Stored on the instance; not otherwise used in this class.
        """
        if W is None:
            bound = numpy.sqrt(6. / (shape[0] + shape[1]))
            W = numpy.asarray(rng.uniform(
                low=-bound,
                high=bound,
                size=(shape[0], shape[1])), dtype=theano.config.floatX)

        self.W = theano.shared(value=W, name='Hashtag_emb', borrow=True)
        self.batch_size = batch_size
        self.n_ht = W.shape[0]
        self.m = m
        self.n_samples = n_samples
        self.csrng = CURAND_RandomStreams(123)
        # rfun() returns the argsort (along axis 0) of a fresh uniform
        # (n_samples, 1) draw.
        mask = self.csrng.uniform(size=(self.n_samples,1),low=0.0,high=1.0,dtype=theano.config.floatX)
        self.rfun = theano.function([],mask.argsort(axis=0))

        # alpha[k] = 1 / (k + 1) for k in 0 .. n_ht - 1
        self.alpha = T.constant(1.0/numpy.arange(start=1,stop=self.n_ht + 1,step=1))

        self.weights = [self.W]
        self.biases = []

    def __repr__(self):
        return "{}: W_shape: {}, m={}, n_samples={}, n_ht={}".format(self.__class__.__name__, self.W.shape.eval(),self.m,self.n_samples,self.n_ht)

    def output_func(self, input):
        """Store the score matrix in ``self.f`` and return its argmax over axis 0."""
        # Contracts the feature axis of `input` with the feature axis of W.
        # No normalisation is applied here, so this is a cosine similarity
        # only if both operands are already unit-norm -- TODO confirm.
        self.f = T.tensordot(input.dimshuffle(0,'x',1),self.W.dimshuffle('x',0,1),axes=[[1,2],[0,2]])
        # NOTE(review): the cost methods index self.f as f[row, label], which
        # suggests a per-row prediction would need axis=1 -- confirm intent.
        self.y_pred = T.argmax(self.f,axis=0)
        return self.y_pred

    def get_tag_neg(self,f,f_y):
        """Return mean of ``alpha[:rnk]`` where ``rnk`` counts margin violators.

        ``rnk`` is the number of entries of `f` greater than ``f_y - m``,
        minus one. The result is 0 when ``rnk`` is 0.
        """
        cand = f[(f > f_y - self.m).nonzero()]
        rnk = cand.shape[0] - 1  # due to i != y
        # Concrete (non-symbolic) count: keep the plain early return.  For a
        # Theano variable this Python comparison is always False, so the
        # symbolic guard below is what actually protects the division.
        if rnk == 0:
            return 0
        weight_sum = T.sum(self.alpha[T.arange(rnk)])
        return T.switch(T.eq(rnk, 0), 0.0, weight_sum / rnk)

    def _warp_loss_cost(self, y,i):
        """Hinge ``max(0, m - f[row, y] + f[row, i])`` for given indices `i`."""
        f_y = self.f[T.arange(y.shape[0]), y]
        s = self.m - f_y + self.f[T.arange(i.shape[0]),i]
        return T.maximum(0.0,s)

    def warp_loss_cost(self, y, idx):
        """Hinge cost against an internally selected column per row.

        The `idx` argument is ignored: it is overwritten with, for every
        row, the column whose ``max(0, f_y - f + m)`` is smallest.
        """
        f_y = self.f[T.arange(y.shape[0]), y]
        f_yy = T.repeat(f_y.dimshuffle(0,'x'),self.f.shape[1],axis=1)

        f_idx = T.maximum(0.0,f_yy - self.f + self.m)
        # first column of the ascending argsort == position of the minimum
        idx = f_idx.argsort(axis=1)[:,0]

        s = self.m - f_y + self.f[T.arange(idx.shape[0]),idx]
        return T.maximum(0.0,s)

    def training_cost(self, y,i):
        """Mean of ``warp_loss_cost(y, i)``."""
        return T.mean(self.warp_loss_cost(y,i))
コード例 #6
0
class Training(Layer):
    """Embedding layer with margin-based (WARP-style hinge) training costs.

    Call ``output_func`` before any cost method: it populates ``self.f``,
    the score matrix that the costs index into.
    """

    def __init__(self,
                 rng,
                 W=None,
                 m=1.0,
                 n_samples=50,
                 shape=None,
                 batch_size=1000):
        """
        Parameters
        ----------
        rng : object with a numpy-style ``uniform(low, high, size)`` method
            Used only to initialise `W` when it is not supplied.
        W : array or None
            Initial embedding matrix; when None it is sampled uniformly in
            ``+/- sqrt(6 / (shape[0] + shape[1]))`` with size
            ``(shape[0], shape[1])``.
        m : float
            Hinge margin.
        n_samples : int
            Row count of the random mask sampled by ``self.rfun``.
        shape : pair of ints or None
            Needed only when `W` is None.
        batch_size : int
            Stored as an attribute; unused elsewhere in this class.
        """
        if W is None:
            limit = numpy.sqrt(6. / (shape[0] + shape[1]))
            W = numpy.asarray(rng.uniform(low=-limit,
                                          high=limit,
                                          size=(shape[0], shape[1])),
                              dtype=theano.config.floatX)

        self.W = theano.shared(value=W, name='Hashtag_emb', borrow=True)
        self.batch_size = batch_size
        self.n_ht = W.shape[0]
        self.m = m
        self.n_samples = n_samples
        self.csrng = CURAND_RandomStreams(123)
        # rfun() argsorts (axis 0) a fresh uniform (n_samples, 1) sample.
        mask = self.csrng.uniform(size=(self.n_samples, 1),
                                  low=0.0,
                                  high=1.0,
                                  dtype=theano.config.floatX)
        self.rfun = theano.function([], mask.argsort(axis=0))

        # alpha[k] = 1 / (k + 1), k = 0 .. n_ht - 1
        self.alpha = T.constant(
            1.0 / numpy.arange(start=1, stop=self.n_ht + 1, step=1))

        self.weights = [self.W]
        self.biases = []

    def __repr__(self):
        return "{}: W_shape: {}, m={}, n_samples={}, n_ht={}".format(
            self.__class__.__name__, self.W.shape.eval(), self.m,
            self.n_samples, self.n_ht)

    def output_func(self, input):
        """Compute ``self.f`` and return ``argmax(self.f, axis=0)``."""
        # Dot product over the feature axis of `input` and of W; it is a
        # cosine similarity only if both are pre-normalised -- TODO confirm.
        self.f = T.tensordot(input.dimshuffle(0, 'x', 1),
                             self.W.dimshuffle('x', 0, 1),
                             axes=[[1, 2], [0, 2]])
        # NOTE(review): costs below index self.f as f[row, label]; a per-row
        # prediction would use axis=1 -- confirm that axis=0 is intended.
        self.y_pred = T.argmax(self.f, axis=0)
        return self.y_pred

    def get_tag_neg(self, f, f_y):
        """Average the first ``rnk`` entries of ``self.alpha``.

        ``rnk`` is the count of entries of `f` exceeding ``f_y - m``, minus
        one. Returns 0 when ``rnk`` is 0.
        """
        cand = f[(f > f_y - self.m).nonzero()]
        rnk = cand.shape[0] - 1  # due to i != y
        # This Python-level test only fires for concrete counts; on a Theano
        # variable `==` is never True, hence the symbolic switch below.
        if rnk == 0:
            return 0
        alpha_total = T.sum(self.alpha[T.arange(rnk)])
        return T.switch(T.eq(rnk, 0), 0.0, alpha_total / rnk)

    def _warp_loss_cost(self, y, i):
        """Hinge ``max(0, m - f[row, y] + f[row, i])`` for caller-given `i`."""
        f_y = self.f[T.arange(y.shape[0]), y]
        s = self.m - f_y + self.f[T.arange(i.shape[0]), i]
        return T.maximum(0.0, s)

    def warp_loss_cost(self, y, idx):
        """Hinge cost against a column chosen internally for every row.

        `idx` is accepted for interface compatibility but ignored; it is
        replaced by the column minimising ``max(0, f_y - f + m)`` per row.
        """
        f_y = self.f[T.arange(y.shape[0]), y]
        f_yy = T.repeat(f_y.dimshuffle(0, 'x'), self.f.shape[1], axis=1)

        f_idx = T.maximum(0.0, f_yy - self.f + self.m)
        # column 0 of the ascending argsort is the arg-min of each row
        idx = f_idx.argsort(axis=1)[:, 0]

        s = self.m - f_y + self.f[T.arange(idx.shape[0]), idx]
        return T.maximum(0.0, s)

    def training_cost(self, y, i):
        """Mean of ``warp_loss_cost(y, i)``."""
        return T.mean(self.warp_loss_cost(y, i))
コード例 #7
0
class WalkoutModel(object):
    """
    Controller for training a forwards-backwards chainy model.

    Parameters:
        rng: numpy.random.RandomState (for reproducibility)
        x_out: the goal state for forwards-backwards walking process
        p_z_given_x: InfNet for stochastic part of step
        p_x_given_z: HydraNet for deterministic part of step
        params: REQUIRED PARAMS SHOWN BELOW
                x_dim: dimension of observations to construct
                z_dim: dimension of latent space for policy wobble
                walkout_steps: number of steps to walk out
                x_type: can be "bernoulli" or "gaussian"
                x_transform: can be 'none' or 'sigmoid'
    """
    def __init__(self, rng=None,
            x_out=None, \
            p_z_given_x=None, \
            p_x_given_z=None, \
            params=None, \
            shared_param_dicts=None):
        """
        Build the symbolic walk-out graph, costs, updates and compiled
        functions for this model.

        Parameters:
            rng: object with a `randint` method; used once to seed the
                 RandStream stored in self.rng. Despite the None default,
                 it is dereferenced unconditionally.
            x_out: symbolic variable used as the initial x state of the scan
            p_z_given_x: network whose `apply(x, do_samples=False)` returns
                         a (mean, logvar) pair for z
            p_x_given_z: network whose `apply(z, do_samples=False)` returns
                         a (mean, logvar) pair for x
            params: dict; keys 'x_dim', 'z_dim', 'walkout_steps' and
                    'x_type' are read, 'x_transform' is optional
            shared_param_dicts: None to create 'obs_logvar' afresh, or a
                                dict providing it

        NOTE(review): this method reads several attributes that are not
        assigned anywhere in it before use (see the inline notes). Unless
        they are provided elsewhere (e.g. by a subclass), construction will
        raise AttributeError -- confirm before relying on this class.
        """
        # setup a rng for this WalkoutModel
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_dim = self.params['x_dim']
        self.z_dim = self.params['z_dim']
        self.walkout_steps = self.params['walkout_steps']
        self.x_type = self.params['x_type']
        self.shared_param_dicts = shared_param_dicts
        # x_transform defaults to sigmoid when not given, and is forced to
        # sigmoid for bernoulli observations regardless of the setting.
        if 'x_transform' in self.params:
            assert((self.params['x_transform'] == 'sigmoid') or \
                    (self.params['x_transform'] == 'none'))
            if self.params['x_transform'] == 'sigmoid':
                self.x_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.x_transform = lambda x: x
        else:
            self.x_transform = lambda x: T.nnet.sigmoid(x)
        if self.x_type == 'bernoulli':
            self.x_transform = lambda x: T.nnet.sigmoid(x)
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        # NOTE(review): self.step_type is not assigned earlier in this method.
        assert((self.step_type == 'add') or (self.step_type == 'jump'))

        # grab handles to the relevant networks
        self.p_z_given_x = p_z_given_x
        self.p_x_given_z = p_x_given_z

        # record the symbolic variables that will provide inputs to the
        # computation graph created for this WalkoutModel
        self.x_out = x_out           # target output for generation
        self.zi_zmuv = T.tensor3()   # ZMUV gauss noise for walk-out wobble

        if self.shared_param_dicts is None:
            # initialize the parameters "owned" by this model
            zero_ary = to_fX( np.zeros((1,)) )
            self.obs_logvar = theano.shared(value=zero_ary, name='obs_logvar')
            # tanh squashing keeps the effective log-variance inside (-8, 8)
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar[0])
            self.shared_param_dicts = {}
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            # grab the parameters required by this model from a given dict
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar[0])

        ###############################################################
        # Setup the forwards (i.e. training) walk-out loop using scan #
        ###############################################################
        def forwards_loop(xi_zmuv, zi_zmuv, xi_fw, zi_fw):
            """
            One scan step. The first two arguments are slices of the scan
            sequences (noise for x and for z); the last two are the x and z
            states carried over from the previous step.
            """
            # get samples of next zi, according to the forwards model
            zi_fw_mean, zi_fw_logvar = self.p_z_given_x.apply(xi_fw, \
                                       do_samples=False)
            zi_fw = zi_fw_mean + (T.exp(0.5 * zi_fw_logvar) * zi_zmuv)

            # check reverse direction probability p(xi_fw | zi_fw)
            xi_bw_mean, xi_bw_logvar = self.p_x_given_z.apply(zi_fw, \
                                       do_samples=False)
            xi_bw_mean = self.x_transform(xi_bw_mean)
            # NOTE(review): the value is named "nll" but is not negated here,
            # whereas _construct_nll_costs negates the result of the same
            # helper -- confirm the intended sign.
            nll_xi_bw = log_prob_gaussian2(xi_fw, xi_bw_mean, \
                        log_vars=xi_bw_logvar, mask=None)
            nll_xi_bw = nll_xi_bw.flatten()

            # get samples of next xi, according to the forwards model
            xi_fw_mean, xi_fw_logvar = self.p_x_given_z.apply(zi_fw, \
                                       do_samples=False)
            xi_fw_mean = self.x_transform(xi_fw_mean)
            xi_fw = xi_fw_mean + (T.exp(0.5 * xi_fw_logvar) * xi_zmuv)

            # check reverse direction probability p(zi_fw | xi_fw)
            zi_bw_mean, zi_bw_logvar = self.p_z_given_x.apply(xi_fw, \
                                       do_samples=False)
            nll_zi_bw = log_prob_gaussian2(zi_fw, zi_bw_mean, \
                        log_vars=zi_bw_logvar, mask=None)
            nll_zi_bw = nll_zi_bw.flatten()

            # each loop iteration produces the following values:
            #   xi_fw: xi generated from zi by forwards walk
            #   zi_fw: zi generated from xi by forwards walk
            #   xi_fw_mean: ----
            #   xi_fw_logvar: ----
            #   zi_fw_mean: ----
            #   zi_fw_logvar: ----
            #   nll_xi_bw: NLL for reverse step zi_fw -> xi_fw
            #   nll_zi_bw: NLL for reverse step xi_fw -> zi_fw
            return xi_fw, zi_fw, xi_fw_mean, xi_fw_logvar, zi_fw_mean, zi_fw_logvar, nll_xi_bw, nll_zi_bw

        # initialize states for x/z
        self.x0 = self.x_out
        self.z0 = T.alloc(0.0, self.x0.shape[0], self.z_dim)
        # setup initial values to pass to scan op
        # (only the first two outputs are recurrent; None marks the rest as
        # plain per-step outputs)
        outputs_init = [self.x0, self.z0, None, None, None, None, None, None]
        # NOTE(review): self.xi_zmuv is not assigned earlier in this method
        # (only self.zi_zmuv is).
        sequences_init = [self.xi_zmuv, self.zi_zmuv]
        # apply scan op for the sequential imputation loop
        self.scan_results, self.scan_updates = theano.scan(forwards_loop, \
                    outputs_info=outputs_init, \
                    sequences=sequences_init)

        # grab results of the scan op. all values are computed for each step
        self.xi = self.scan_results[0]
        self.zi = self.scan_results[1]
        self.xi_fw_mean = self.scan_results[2]
        self.xi_fw_logvar = self.scan_results[3]
        self.zi_fw_mean = self.scan_results[4]
        self.zi_fw_logvar = self.scan_results[5]
        self.nll_xi_bw = self.scan_results[6]
        self.nll_zi_bw = self.scan_results[7]

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX( np.zeros((1,)) )
        self.lr = theano.shared(value=zero_ary, name='srr_lr')
        # shared var momentum parameters for ADAM optimization
        self.mom_1 = theano.shared(value=zero_ary, name='srr_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='srr_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared vars for weighting prior kld against reconstruction
        self.lam_kld_p = theano.shared(value=zero_ary, name='srr_lam_kld_p')
        self.lam_kld_q = theano.shared(value=zero_ary, name='srr_lam_kld_q')
        self.lam_kld_g = theano.shared(value=zero_ary, name='srr_lam_kld_g')
        self.lam_kld_s = theano.shared(value=zero_ary, name='srr_lam_kld_s')
        self.set_lam_kld(lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0, lam_kld_s=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='srr_lam_l2w')
        self.set_lam_l2w(1e-5)

        # grab all of the "optimizable" parameters from the base networks
        # NOTE(review): self.s0, self.step_scales, self.p_zi_given_xi,
        # self.p_sip1_given_zi, self.p_x_given_si and self.q_zi_given_xi are
        # not assigned in this method; the networks stored above are named
        # p_z_given_x / p_x_given_z.
        self.joint_params = [self.s0, self.obs_logvar, self.step_scales]
        self.joint_params.extend(self.p_zi_given_xi.mlp_params)
        self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
        self.joint_params.extend(self.p_x_given_si.mlp_params)
        self.joint_params.extend(self.q_zi_given_xi.mlp_params)

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        # NOTE(review): _construct_kld_costs reads self.si, self.total_steps
        # and self.kldi_*, none of which are assigned in this method.
        self.kld_p, self.kld_q, self.kld_g, self.kld_s = self._construct_kld_costs(p=1.0)
        self.kld_costs = (self.lam_kld_p[0] * self.kld_p) + \
                         (self.lam_kld_q[0] * self.kld_q) + \
                         (self.lam_kld_g[0] * self.kld_g) + \
                         (self.lam_kld_s[0] * self.kld_s)
        self.kld_cost = T.mean(self.kld_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        # NOTE(review): self.nlli is not assigned in this method.
        self.nll_costs = T.sum(self.nlli, axis=0) # sum the per-step NLLs
        self.nll_cost = T.mean(self.nll_costs)
        self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
        self.nll_bound = T.mean(self.nll_bounds)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-TRIAL COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the generator and inferencer networks
        self.joint_updates = get_adam_updates(params=self.joint_params, \
                grads=self.joint_grads, alpha=self.lr, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        # merge in the updates produced by scan so they are applied together
        # with the parameter updates
        for k, v in self.scan_updates.items():
            self.joint_updates[k] = v

        # Construct theano functions for training and diagnostic computations
        print("Compiling cost computer...")
        self.compute_raw_costs = self._construct_raw_costs()
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling sequence sampler...")
        self.sequence_sampler = self._construct_sequence_sampler()
        # make easy access points for some interesting parameters
        #self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W
        return

    def set_sgd_params(self, lr=0.01, mom_1=0.9, mom_2=0.999):
        """
        Store new values in the shared learning-rate and momentum variables.

        Each value is written as a one-element array (converted with to_fX)
        into self.lr, self.mom_1 and self.mom_2 respectively.
        """
        settings = (('lr', lr), ('mom_1', mom_1), ('mom_2', mom_2))
        for attr_name, value in settings:
            # broadcast the scalar into a length-1 array before storing it
            as_array = np.zeros((1,)) + value
            getattr(self, attr_name).set_value(to_fX(as_array))
        return

    def set_lam_kld(self, lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0, lam_kld_s=0.0):
        """
        Store the four KL-divergence weights in their shared variables.

        Each weight is written as a one-element array (converted with to_fX)
        into self.lam_kld_p, self.lam_kld_q, self.lam_kld_g and
        self.lam_kld_s, in that order.
        """
        weights = (('lam_kld_p', lam_kld_p),
                   ('lam_kld_q', lam_kld_q),
                   ('lam_kld_g', lam_kld_g),
                   ('lam_kld_s', lam_kld_s))
        for attr_name, weight in weights:
            new_lam = np.zeros((1,)) + weight
            getattr(self, attr_name).set_value(to_fX(new_lam))
        return

    def set_lam_l2w(self, lam_l2w=1e-3):
        """
        Store the l2 regularization weight in the shared var self.lam_l2w.

        The value is written as a one-element array converted with to_fX.
        """
        weight_ary = np.zeros((1,)) + lam_l2w
        self.lam_l2w.set_value(to_fX(weight_ary))
        return

    def set_train_switch(self, switch_val=0.0):
        """
        Snap switch_val to 0.0 (below 0.5) or 1.0 (otherwise) and store it,
        as a one-element array converted with to_fX, in self.train_switch.
        """
        snapped = 0.0 if (switch_val < 0.5) else 1.0
        switch_ary = np.zeros((1,)) + snapped
        self.train_switch.set_value(to_fX(switch_ary))
        return

    def _construct_zi_zmuv(self, xo):
        """
        Draw symbolic zero-mean, unit-variance gaussian noise from self.rng
        with shape (self.total_steps, xo.shape[0], self.z_dim), for use with
        the input matrix xo.
        """
        noise_shape = (self.total_steps, xo.shape[0], self.z_dim)
        return self.rng.normal(size=noise_shape, avg=0.0, std=1.0,
                               dtype=theano.config.floatX)

    def _construct_rev_masks(self, xo):
        """
        Build the per-step revelation masks for the batch in xo.

        Returns [pmasks, qmasks], one mask sequence for p and one for q.
        When self.use_rev_masks is set, the stored self.rev_masks_p and
        self.rev_masks_q are repeated along a new batch axis; otherwise
        fresh random masks are sampled according to self.rev_sched.
        """
        if self.use_rev_masks:
            # make batch copies of self.rev_masks_p and self.rev_masks_q
            batch_p = self.rev_masks_p.dimshuffle(0,'x',1)
            batch_q = self.rev_masks_q.dimshuffle(0,'x',1)
            pmasks = batch_p.repeat(xo.shape[0], axis=1)
            qmasks = batch_q.repeat(xo.shape[0], axis=1)
            return [pmasks, qmasks]

        p_steps = []
        q_steps = []
        # mask of zeros, i.e. one that reveals nothing
        hidden = T.alloc(0.0, 1, xo.shape[0], xo.shape[1])
        # each entry rb of the schedule is (number of steps, reveal rate)
        for rb in self.rev_sched:
            # random binary mask with ones at rate rb[1]
            noise = self.rng.uniform(
                size=(1, xo.shape[0], xo.shape[1]),
                low=0.0, high=1.0, dtype=theano.config.floatX)
            revealed = noise < rb[1]
            # Within a block, the guide policy (q) sees the block's mask at
            # every step, while the primary policy (p) sees nothing until
            # the final step of the block, where it receives the same mask.
            #
            # So a block with a single step gives q the values at the start
            # of the step (letting it guide the step) and p the values at
            # the end. A standard variational auto-encoder corresponds to a
            # schedule with one block, one step and a reveal rate of 1.0.
            lead_in = rb[0] - 1
            p_steps.extend([hidden] * lead_in)
            q_steps.extend([revealed] * lead_in)
            p_steps.append(revealed)
            q_steps.append(revealed)
        # stack the per-step masks along axis 0 into a 3-tensor
        pmasks = T.cast(T.concatenate(p_steps, axis=0), 'floatX')
        qmasks = T.cast(T.concatenate(q_steps, axis=0), 'floatX')
        return [pmasks, qmasks]

    def _construct_nll_costs(self, si, xo, nll_mask):
        """
        Return the flattened negative log-likelihood of xo under the
        reconstruction obtained from si, with nll_mask passed to the
        log-probability helper as its mask.

        Bernoulli log-probability is used when self.x_type is 'bernoulli';
        otherwise a gaussian one with log-variance self.bounded_logvar.
        """
        xh = self._from_si_to_x( si )
        if self.x_type == 'bernoulli':
            return -log_prob_bernoulli(xo, xh, mask=nll_mask).flatten()
        log_lik = log_prob_gaussian2(xo, xh,
                                     log_vars=self.bounded_logvar,
                                     mask=nll_mask)
        return -log_lik.flatten()

    def _construct_kld_s(self, s_i, s_j):
        """
        Compute KL(s_i || s_j), treating the outputs of _from_si_to_x as
        bernoulli means, summed over axis 1.
        """
        p_i = self._from_si_to_x( s_i )
        p_j = self._from_si_to_x( s_j )
        # contribution of the "on" outcome, then of the "off" outcome
        on_term = p_i * (T.log(p_i) - T.log(p_j))
        off_term = (1.0 - p_i) * (T.log(1.0-p_i) - T.log(1.0-p_j))
        return T.sum(on_term + off_term, axis=1)

    def _construct_kld_costs(self, p=1.0):
        """
        Accumulate the KL-divergence costs over all self.total_steps steps.

        Returns [kld_p, kld_q, kld_g, kld_s]: the sums over steps of the
        p-th powers of self.kldi_p2q, self.kldi_q2p and self.kldi_p2g
        (each summed over axis 1), and of the state-to-state KL between
        consecutive entries of self.si (the first one against self.s0).
        """
        # self.s0 combined with a zeroed copy of the first state
        s0 = 0.0*self.si[0] + self.s0
        kld_pis, kld_qis, kld_gis, kld_sis = [], [], [], []
        for step in range(self.total_steps):
            kld_pis.append(T.sum(self.kldi_p2q[step]**p, axis=1))
            kld_qis.append(T.sum(self.kldi_q2p[step]**p, axis=1))
            kld_gis.append(T.sum(self.kldi_p2g[step]**p, axis=1))
            current = self.si[step]
            previous = s0 if step == 0 else self.si[step-1]
            kld_sis.append(self._construct_kld_s(current, previous))
        # add up the per-step terms to get the batch-wise costs
        return [sum(kld_pis), sum(kld_qis), sum(kld_gis), sum(kld_sis)]

    def _construct_reg_costs(self):
        """
        Return the sum of squared entries over every parameter in
        self.joint_params (the basic l2 regularization cost).
        """
        param_reg_cost = 0
        for param in self.joint_params:
            param_reg_cost = param_reg_cost + T.sum(param**2.0)
        return param_reg_cost

    def _construct_compute_fe_terms(self):
        """
        Build and return a function estimating the free-energy terms.

        The returned callable has signature
        (XO, sample_count=20, use_guide_policy=True) and returns
        [mean_nll, mean_kld], averaged over sample_count evaluations.
        """
        # symbolic input and the noise/masks derived from it
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # values produced by one evaluation
        nll = self.nll_costs.flatten()
        kld = self.kld_q.flatten()
        substitutions = {self.x_out: xo,
                         self.zi_zmuv: zizmuv,
                         self.p_masks: pmasks,
                         self.q_masks: qmasks}
        # one-sample free-energy estimate
        fe_term_sample = theano.function(inputs=[ xo ],
                outputs=[nll, kld],
                givens=substitutions,
                updates=self.scan_updates,
                on_unused_input='ignore')

        def fe_term_estimator(XO, sample_count=20, use_guide_policy=True):
            # remember the current switch, then select the sampling policy:
            # 1.0 for the guide policy, 0.0 for the primary policy
            old_switch = self.train_switch.get_value(borrow=False)
            self.set_train_switch(
                switch_val=(1.0 if use_guide_policy else 0.0))
            # accumulate the per-observation terms over repeated samples
            nll_sum = np.zeros((XO.shape[0],))
            kld_sum = np.zeros((XO.shape[0],))
            for _ in range(sample_count):
                sample_nll, sample_kld = fe_term_sample(XO)
                nll_sum += sample_nll.ravel()
                kld_sum += sample_kld.ravel()
            denom = float(sample_count)
            mean_nll = nll_sum / denom
            mean_kld = kld_sum / denom
            # put the switch back the way it was found
            self.set_train_switch(switch_val=old_switch)
            if not use_guide_policy:
                # no KLd if samples are from the primary policy...
                mean_kld = 0.0 * mean_kld
            return [mean_nll, mean_kld]
        return fe_term_estimator

    def _construct_raw_costs(self):
        """
        Build and return a function computing the raw costs, i.e. the ones
        not weighted by any lambdas, for a batch of observations.
        """
        # symbolic input and the noise/masks derived from it
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # compiled function returning the per-step cost tensors
        all_step_costs = [self.nlli, self.kldi_q2p, self.kldi_p2q, self.kldi_p2g]
        substitutions = {self.x_out: xo,
                         self.zi_zmuv: zizmuv,
                         self.p_masks: pmasks,
                         self.q_masks: qmasks}
        cost_func = theano.function(inputs=[ xo ],
                    outputs=all_step_costs,
                    givens=substitutions,
                    updates=self.scan_updates,
                    on_unused_input='ignore')

        def raw_cost_computer(XO):
            # Returns, in order:
            #   _step_nlls: the expected NLL cost for each step
            #   _step_klds: the expected KL(q||p) cost for each step
            #   _kld_q2p: the expected KL(q||p) cost for each latent dim
            #   _kld_p2q: the expected KL(p||q) cost for each latent dim
            #   _kld_p2g: the expected KL(p||N(0,I)) cost for each latent dim
            nlli, q2p, p2q, p2g = cost_func(to_fX(XO))
            # per-dim costs: mean over axis 1, then sum over axis 0
            _kld_q2p = np.sum(np.mean(q2p, axis=1, keepdims=True), axis=0)
            _kld_p2q = np.sum(np.mean(p2q, axis=1, keepdims=True), axis=0)
            _kld_p2g = np.sum(np.mean(p2g, axis=1, keepdims=True), axis=0)
            # per-step costs: sum over axis 2, then mean over axis 1
            step_kld_means = np.mean(np.sum(q2p, axis=2, keepdims=True), axis=1)
            _step_klds = to_fX( np.asarray(list(step_kld_means)) )
            step_nll_means = np.mean(nlli, axis=1)
            _step_nlls = to_fX( np.asarray(list(step_nll_means)) )
            return [_step_nlls, _step_klds, _kld_q2p, _kld_p2q, _kld_p2g]
        return raw_cost_computer

    def _construct_train_joint(self):
        """
        Compile the theano function that performs one joint training step.

        The compiled function takes a single matrix, substitutes it for
        self.x_out (and substitutes the values built from it by
        self._construct_zi_zmuv and self._construct_rev_masks for
        self.zi_zmuv, self.p_masks and self.q_masks), applies
        self.joint_updates, and returns, in order:
            joint_cost, nll_bound, nll_cost, kld_cost, reg_cost, obs_costs
        """
        # symbolic input, plus the noise and masks derived from it
        x_sym = T.matrix()
        zmuv_sym = self._construct_zi_zmuv(x_sym)
        p_mask_sym, q_mask_sym = self._construct_rev_masks(x_sym)
        # values reported back to the caller on every training step
        monitored = [
            self.joint_cost,
            self.nll_bound,
            self.nll_cost,
            self.kld_cost,
            self.reg_cost,
            self.obs_costs,
        ]
        replacements = {
            self.x_out: x_sym,
            self.zi_zmuv: zmuv_sym,
            self.p_masks: p_mask_sym,
            self.q_masks: q_mask_sym,
        }
        return theano.function(
            inputs=[x_sym],
            outputs=monitored,
            givens=replacements,
            updates=self.joint_updates,
            on_unused_input='ignore')

    def _construct_sequence_sampler(self):
        """
        Construct a function for sampling state/mask trajectories.

        The compiled theano function outputs, for an initial step (built from
        self.s0_full and self.m0_full) and for each of the self.total_steps
        following steps, the state mapped through self._from_si_to_x and the
        matching mask from self.mi_p.

        Returns:
            sample_func(XO, use_guide_policy=False), which returns the list
            [xm_seq, xi_seq, mi_seq]. Each entry is a floatX array of shape
            (self.total_steps + 1, XO.shape[0], XO.shape[1]):
                xi_seq: the per-step values produced by the model
                mi_seq: the per-step masks
                xm_seq: (mask * XO) + ((1 - mask) * xi) for each step
            With use_guide_policy=True the train switch is set to 1.0 while
            sampling, otherwise to 0.0; its previous value is restored before
            sample_func returns, even if sampling raises.
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # collect the outputs to return from this function
        states = [self._from_si_to_x(self.s0_full)] + \
                 [self._from_si_to_x(self.si[i]) for i in range(self.total_steps)]
        masks = [self.m0_full] + [self.mi_p[i] for i in range(self.total_steps)]
        outputs = states + masks
        # compile the theano function. Only the scan updates are applied:
        # sampling must not apply the parameter updates in self.joint_updates,
        # otherwise every call to the sampler would also train the model.
        func = theano.function(inputs=[ xo ], \
                outputs=outputs, \
                givens={self.x_out: xo, \
                        self.zi_zmuv: zizmuv, \
                        self.p_masks: pmasks, \
                        self.q_masks: qmasks}, \
                updates=self.scan_updates, \
                on_unused_input='ignore')
        # visualize trajectories generated by the model
        def sample_func(XO, use_guide_policy=False):
            # remember the current mode so it can be restored afterwards
            old_switch = self.train_switch.get_value(borrow=False)
            # 1.0 -> sample from the guide policy, 0.0 -> primary policy
            self.set_train_switch(switch_val=(1.0 if use_guide_policy else 0.0))
            try:
                # get belief states and masks generated by the scan loop
                scan_vals = func(to_fX(XO))
            finally:
                # set model back to either training or generation mode
                self.set_train_switch(switch_val=old_switch)
            step_count = self.total_steps + 1
            seq_shape = (step_count, XO.shape[0], XO.shape[1])
            xm_seq = np.zeros(seq_shape, dtype=theano.config.floatX)
            xi_seq = np.zeros(seq_shape, dtype=theano.config.floatX)
            mi_seq = np.zeros(seq_shape, dtype=theano.config.floatX)
            # func returns all the states first, followed by all the masks
            for i in range(step_count):
                _xi = scan_vals[i]
                _mi = scan_vals[i + step_count]
                # keep XO where the mask is on, the model's value elsewhere
                xm_seq[i,:,:] = (_mi * XO) + ((1.0 - _mi) * _xi)
                xi_seq[i,:,:] = _xi
                mi_seq[i,:,:] = _mi
            return [xm_seq, xi_seq, mi_seq]
        return sample_func

    def save_to_file(self, f_name=None):
        """
        Dump important stuff to a Python pickle, so that we can reload this
        model later.

        Three objects are pickled to f_name, in this order:
            1. self.params
            2. a dict mapping each key of self.shared_param_dicts to a copy
               of the value held by that shared variable
            3. a dict with the save_to_dict() output of each child model,
               keyed by 'p_zi_given_xi', 'p_sip1_given_zi', 'p_x_given_si'
               and 'q_zi_given_xi'

        Parameters:
            f_name: path of the file to write (required; must not be None)
        """
        assert f_name is not None, "save_to_file requires a file name"
        # make a copy of self.shared_param_dicts, with plain values in place
        # of the theano shared variables
        numpy_param_dicts = {}
        for key in self.shared_param_dicts:
            numpy_param_dicts[key] = \
                    self.shared_param_dicts[key].get_value(borrow=False)
        # get dicts for each of the "child" models that we must save
        child_model_dicts = {
            'p_zi_given_xi': self.p_zi_given_xi.save_to_dict(),
            'p_sip1_given_zi': self.p_sip1_given_zi.save_to_dict(),
            'p_x_given_si': self.p_x_given_si.save_to_dict(),
            'q_zi_given_xi': self.q_zi_given_xi.save_to_dict(),
        }
        # everything to save is gathered before the file is opened, so a
        # failure above does not leave a truncated file behind; open() (not
        # the Python-2-only file()) inside "with" guarantees the handle is
        # closed even if a dump raises
        with open(f_name, 'wb') as f_handle:
            # the dict self.params just holds "simple" python values
            cPickle.dump(self.params, f_handle, protocol=-1)
            cPickle.dump(numpy_param_dicts, f_handle, protocol=-1)
            cPickle.dump(child_model_dicts, f_handle, protocol=-1)
        return
コード例 #8
0
class GenConvModule(object):
    """
    Generator module: an upsampling convolution followed by a regular one.

    The first convolution either works on an "unpooled" (repeated) copy of
    the input or is a fractionally strided convolution. Its input can
    optionally be augmented with channels of random values.

    Params:
        filt_shape: shape for convolution filters -- should be square and odd
        in_chans: number of channels in the inputs to module
        out_chans: number of channels in the outputs from module
        rand_chans: number of random channels to augment input
        use_rand: flag for whether or not to augment inputs
        apply_bn_1: flag for whether to batch normalize following first conv
        apply_bn_2: flag for whether to batch normalize following second conv
        us_stride: upsampling ratio in the fractionally strided convolution
        use_pooling: whether to use unpooling or fractional striding
        init_func: function for initializing module parameters
        mod_name: text name for identifying module in theano graph
        rand_type: whether to use Gaussian or uniform randomness
    """
    def __init__(self, filt_shape, in_chans, out_chans, rand_chans,
                 use_rand=True, apply_bn_1=True, apply_bn_2=True,
                 us_stride=2, use_pooling=True,
                 init_func=None, mod_name='gm_conv',
                 rand_type='normal'):
        assert ((filt_shape[0] % 2) > 0), "filter dim should be odd (not even)"
        # only the first entry of filt_shape is used (filters are square)
        self.filt_dim = filt_shape[0]
        # channel counts
        self.in_chans, self.out_chans = in_chans, out_chans
        self.rand_chans = rand_chans
        # behavioural switches
        self.use_rand = use_rand
        self.apply_bn_1, self.apply_bn_2 = apply_bn_1, apply_bn_2
        self.us_stride = us_stride
        self.use_pooling = use_pooling
        self.mod_name = mod_name
        self.rand_type = rand_type
        # random stream with a fixed seed
        self.rng = RandStream(123)
        # fall back to a small-scale Normal initializer
        self.init_func = inits.Normal(scale=0.02) if init_func is None \
                else init_func
        self._init_params()
        return

    def _init_params(self):
        """
        Create the filters, and the batch norm gains/biases if enabled.

        self.params collects, in creation order: w1, w2, then (g1, b1) if
        apply_bn_1 and (g2, b2) if apply_bn_2.
        """
        fd = self.filt_dim
        # the first conv also sees the random channels when they are stacked
        # onto the exogenous input
        if self.use_rand:
            conv1_in = self.in_chans + self.rand_chans
        else:
            conv1_in = self.in_chans
        self.w1 = self.init_func((self.out_chans, conv1_in, fd, fd),
                                 "{}_w1".format(self.mod_name))
        self.w2 = self.init_func((self.out_chans, self.out_chans, fd, fd),
                                 "{}_w2".format(self.mod_name))
        self.params = [self.w1, self.w2]
        # gains and biases for the transforms that will get batch normed
        for enabled, idx in ((self.apply_bn_1, 1), (self.apply_bn_2, 2)):
            if not enabled:
                continue
            gain_ifn = inits.Normal(loc=1., scale=0.02)
            bias_ifn = inits.Constant(c=0.)
            gain = gain_ifn((self.out_chans),
                            "{}_g{}".format(self.mod_name, idx))
            bias = bias_ifn((self.out_chans),
                            "{}_b{}".format(self.mod_name, idx))
            setattr(self, "g{}".format(idx), gain)
            setattr(self, "b{}".format(idx), bias)
            self.params.extend([gain, bias])
        return

    def apply(self, input, rand_vals=None):
        """
        Apply this generator module to some input.

        Params:
            input: symbolic input, indexed here along axes 0, 2 and 3
            rand_vals: optional values for the random channels; sampled from
                       self.rng when None (only used when self.use_rand)
        Returns:
            output of the second convolution, after optional batch norm and
            relu
        """
        n_rows = input.shape[0]
        pad = int((self.filt_dim - 1) / 2) # use "same" mode convolutions
        stride = self.us_stride            # stride for "learned upsampling"
        if self.use_pooling:
            # "unpool" by repeating entries along axes 2 and 3
            input = input.repeat(stride, axis=2).repeat(stride, axis=3)
        # shape for the random values that may augment the input
        noise_shape = (n_rows, self.rand_chans, input.shape[2], input.shape[3])
        if not self.use_rand:
            # don't augment input with random channels
            conv_input = input
        else:
            if rand_vals is None:
                # no values supplied, so draw them from this module's stream
                if self.rand_type == 'normal':
                    rand_vals = self.rng.normal(
                        size=noise_shape, avg=0.0, std=1.0,
                        dtype=theano.config.floatX)
                else:
                    rand_vals = self.rng.uniform(
                        size=noise_shape, low=-1.0, high=1.0,
                        dtype=theano.config.floatX)
            rand_vals = rand_vals.reshape(noise_shape)
            # stack random values on top of input, along the channel axis
            conv_input = T.concatenate([rand_vals, input], axis=1)
        # first convolution: a plain conv when the input was unpooled above,
        # otherwise a fractionally strided conv that does the upsampling
        if self.use_pooling:
            conv_op, step = dnn_conv, 1
        else:
            conv_op, step = deconv, stride
        hid = conv_op(conv_input, self.w1, subsample=(step, step),
                      border_mode=(pad, pad))
        if self.apply_bn_1:
            hid = batchnorm(hid, g=self.g1, b=self.b1)
        hid = relu(hid)
        # second convolution, always unit stride
        out = dnn_conv(hid, self.w2, subsample=(1, 1), border_mode=(pad, pad))
        if self.apply_bn_2:
            out = batchnorm(out, g=self.g2, b=self.b2)
        return relu(out)
コード例 #9
0
class DAELayer(object):
    """
    Denoising autoencoder layer with tied encoder/decoder weights.

    The encoder computes activation(X.W + b_h) and the decoder reuses the
    transposed weights: A_h.W^T + b_v.

    Parameters:
        rng: object providing randint() and standard_normal()
        clean_input: symbolic target for the reconstruction
        fuzzy_input: symbolic input that gets masked before encoding
        in_dim: dimension of the input
        out_dim: dimension of the hidden layer
        activation: function applied to the hidden pre-activations
        input_noise: probability of dropping each element of fuzzy_input
        W: optional shared weight matrix; randomly initialized when None
        b_h: optional shared hidden bias; zeros when None
        b_v: optional shared visible bias; zeros when None
        W_scale: multiplier applied to the random initial weights
    """
    def __init__(self, rng, clean_input=None, fuzzy_input=None, \
            in_dim=0, out_dim=0, activation=None, input_noise=0., \
            W=None, b_h=None, b_v=None, W_scale=1.0):

        # random stream for this layer, seeded from the given rng
        self.rng = RandStream(rng.randint(1000000))

        # keep the clean target, and build the masked encoder input
        self.clean_input = clean_input
        self.noisy_input = self._get_noisy_input(fuzzy_input, input_noise)

        # basic layer properties
        self.activation = activation
        self.in_dim, self.out_dim = in_dim, out_dim

        # create whichever parameters the caller did not supply
        if W is None:
            gauss = rng.standard_normal(size=(in_dim, out_dim))
            # NOTE(review): dtype is handed to DCG rather than to np.asarray
            # -- confirm this is intended
            W_init = np.asarray(1.0 * DCG(gauss, dtype=theano.config.floatX))
            W = theano.shared(value=(W_scale*W_init), name='W')
        if b_h is None:
            b_h = theano.shared(
                value=np.zeros((out_dim,), dtype=theano.config.floatX),
                name='b_h')
        if b_v is None:
            b_v = theano.shared(
                value=np.zeros((in_dim,), dtype=theano.config.floatX),
                name='b_v')
        self.W, self.b_h, self.b_v = W, b_h, b_v

        # the learnable/optimizable parameters
        self.params = [self.W, self.b_h, self.b_v]
        return

    def compute_costs(self, lam_l1=None):
        """
        Compute reconstruction and activation sparsity costs.

        Parameters:
            lam_l1: indexable whose first entry weights the sparsity cost
                    (NOTE(review): the default None cannot be indexed --
                    callers presumably always pass a value)
        Returns:
            [recon_cost, sparse_cost]
        """
        # only the weights are perturbed; the hidden bias is used as-is
        noisy_W = self._noisy_params(self.W, 0.01)
        hid_bias = self.b_h
        vis_acts, hid_acts = self._compute_activations(
            self.noisy_input, noisy_W, hid_bias, self.b_v)
        # squared reconstruction error, divided by the number of rows
        recon_cost = T.sum((self.clean_input - vis_acts)**2.0) / \
                self.clean_input.shape[0]
        # sparsity penalty on row- and column-normalized hidden activations
        row_l1_sum = T.sum(abs(row_normalize(hid_acts))) / hid_acts.shape[0]
        col_l1_sum = T.sum(abs(col_normalize(hid_acts))) / hid_acts.shape[1]
        sparse_cost = lam_l1[0] * (row_l1_sum + col_l1_sum)
        return [recon_cost, sparse_cost]

    def _compute_hidden_acts(self, X, W, b_h):
        """Return the encoder (hidden layer) activations for X."""
        return self.activation(T.dot(X, W) + b_h)

    def _compute_activations(self, X, W, b_h, b_v):
        """Return [visible, hidden] activations; the decoder uses W.T."""
        hid_acts = self._compute_hidden_acts(X, W, b_h)
        vis_acts = T.dot(hid_acts, W.T) + b_v
        return [vis_acts, hid_acts]

    def _noisy_params(self, P, noise_lvl=0.):
        """
        Return P plus gaussian noise with standard deviation noise_lvl.

        P is returned unchanged unless noise_lvl is greater than 1e-3.
        """
        if not (noise_lvl > 1e-3):
            return P
        jitter = self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl, \
                dtype=theano.config.floatX)
        return P + DCG(jitter)

    def _get_noisy_input(self, input, p):
        """
        Mask input, where p is the probability of dropping each element.

        The kept elements are not rescaled.
        """
        uniform_draws = self.rng.uniform(input.shape, low=0.0, high=1.0, \
            dtype=theano.config.floatX)
        # an element is kept when its uniform draw exceeds p
        keep_mask = uniform_draws > p
        # the original comment says DCG casts the mask to float32 to keep
        # things on the GPU (DCG is defined outside this class)
        return input * DCG(keep_mask)
コード例 #10
0
class HiddenLayer(object):
    """
    Fully-connected layer with optional noise, dropout and maxout pooling.

    Parameters:
        rng: object providing randint(); used to seed this layer's RandStream
        input: symbolic input to the layer
        in_dim: dimension of the input
        out_dim: number of output units; with pool_size > 1 the layer holds
                 out_dim * pool_size linear filters
        activation: activation function, relu_actfun when None. Ignored when
                    pool_size > 1, in which case maxout_actfun is used
        pool_size: filters per output unit (values <= 1 disable pooling)
        drop_rate: probability of dropping each input element
        input_noise: scale of the gaussian noise added to the input
        bias_noise: scale of the gaussian noise added to the pre-activations
        W: optional shared weight matrix; freshly initialized when None
        b: optional shared bias vector; zeros when None
        b_in: optional shared input bias; zeros when None
        s_in: optional shared input scale; 0.541325 * ones when None
        name: prefix for the names of the shared variables
        W_scale: gain passed to ortho_matrix when initializing W

    Each noise/dropout branch is only built into the graph when the matching
    constructor argument exceeds 0.001; the shared variables input_noise,
    bias_noise and drop_rate then control the strength of enabled branches.
    """
    def __init__(self, rng, input, in_dim, out_dim, \
                 activation=None, pool_size=0, \
                 drop_rate=0., input_noise=0., bias_noise=0., \
                 W=None, b=None, b_in=None, s_in=None, \
                 name="", W_scale=1.0):

        # Setup a shared random generator for this layer
        self.rng = RandStream(rng.randint(1000000))

        # setup shared variables for controlling the noise levels and the
        # dropout rate
        zero_ary = np.zeros((1,)).astype(theano.config.floatX)
        self.input_noise = theano.shared(value=(zero_ary+input_noise), \
                name="{0:s}_input_noise".format(name))
        self.bias_noise = theano.shared(value=(zero_ary+bias_noise), \
                name="{0:s}_bias_noise".format(name))
        self.drop_rate = theano.shared(value=(zero_ary+drop_rate), \
                name="{0:s}_drop_rate".format(name))

        # setup scale and bias params for the input
        if b_in is None:
            # input biases are always initialized to zero
            ary = np.zeros((in_dim,), dtype=theano.config.floatX)
            b_in = theano.shared(value=ary, name="{0:s}_b_in".format(name))
        if s_in is None:
            # input scales start at 0.541325, for which softplus(s_in) is
            # approximately one
            ary = 0.541325 * np.ones((in_dim,), dtype=theano.config.floatX)
            s_in = theano.shared(value=ary, name="{0:s}_s_in".format(name))
        self.b_in = b_in
        self.s_in = s_in

        # Set some basic layer properties
        self.pool_size = pool_size
        self.in_dim = in_dim
        self.out_dim = out_dim
        if self.pool_size <= 1:
            self.filt_count = self.out_dim
        else:
            self.filt_count = self.out_dim * self.pool_size
        # floor division keeps this an integer count under Python 3 as well
        # (filt_count is an exact multiple of the pool size)
        self.pool_count = self.filt_count // max(self.pool_size, 1)
        if activation is None:
            activation = relu_actfun
        if self.pool_size <= 1:
            self.activation = activation
        else:
            # pooling replaces whatever activation was requested
            self.activation = lambda x: \
                    maxout_actfun(x, self.pool_size, self.filt_count)

        # Get some random initial weights and biases, if not given
        if W is None:
            # Generate initial filters using orthogonal random trick
            W_shape = (self.in_dim, self.filt_count)
            W_init = ortho_matrix(shape=W_shape, gain=W_scale)
            W_init = W_init.astype(theano.config.floatX)
            W = theano.shared(value=W_init, name="{0:s}_W".format(name))
        if b is None:
            b_init = np.zeros((self.filt_count,), dtype=theano.config.floatX)
            b = theano.shared(value=b_init, name="{0:s}_b".format(name))

        # Set layer weights and biases
        self.W = W
        self.b = b

        # Feedforward through the layer. Which noise branches exist in the
        # graph is decided here, from the python-level constructor arguments.
        use_in = input_noise > 0.001
        use_bn = bias_noise > 0.001
        use_drop = drop_rate > 0.001
        self.linear_output, self.noisy_linear, self.output = \
                self.apply(input, use_in=use_in, use_bn=use_bn, \
                use_drop=use_drop)

        # Compute some properties of the activations, probably to regularize
        self.act_l2_sum = T.sum(self.noisy_linear**2.) / self.output.size

        # Conveniently package layer parameters
        self.params = [self.W, self.b, self.b_in, self.s_in]
        self.shared_param_dicts = { \
                'W': self.W, \
                'b': self.b, \
                'b_in': self.b_in, \
                's_in': self.s_in }
        # Layer construction complete...
        return

    def apply(self, input, use_in=False, use_bn=False, use_drop=False):
        """
        Apply feedforward to this input, returning several partial results.

        Parameters:
            input: symbolic input to the layer
            use_in: add gaussian noise, scaled by self.input_noise, to input
            use_bn: add gaussian noise, scaled by self.bias_noise, to the
                    linear pre-activation
            use_drop: apply dropout with rate self.drop_rate to the input
        Returns:
            [linear_output, noisy_linear, final_output]

        As a side effect, self.noisy_input is set to the (possibly noised and
        masked) input that was fed through the weights.
        """
        # NOTE: the input scale/bias (self.s_in, self.b_in) are not applied
        # here; the input is used as given.
        fancy_input = input
        # Add gaussian noise to the input (if desired)
        if use_in:
            fuzzy_input = fancy_input + self.input_noise[0] * \
                    self.rng.normal(size=fancy_input.shape, avg=0.0, std=1.0, \
                    dtype=theano.config.floatX)
        else:
            fuzzy_input = fancy_input
        # Apply masking noise to the input (if desired)
        if use_drop:
            noisy_input = self._drop_from_input(fuzzy_input, self.drop_rate[0])
        else:
            noisy_input = fuzzy_input
        self.noisy_input = noisy_input
        # Compute linear "pre-activation" for this layer
        linear_output = T.dot(noisy_input, self.W) + self.b
        # Add noise to the pre-activation features (if desired)
        if use_bn:
            noisy_linear = linear_output + self.bias_noise[0] * \
                    self.rng.normal(size=linear_output.shape, avg=0.0, \
                    std=1.0, dtype=theano.config.floatX)
        else:
            noisy_linear = linear_output
        # Apply activation function
        final_output = self.activation(noisy_linear)
        # package partial results for easy return
        results = [linear_output, noisy_linear, final_output]
        return results

    def _drop_from_input(self, input, p):
        """
        Apply dropout to input, where p is the probability of dropping each
        element. Kept elements are scaled by 1 / (1 - p).
        """
        # get a drop mask that drops things with probability p
        drop_rnd = self.rng.uniform(size=input.shape, low=0.0, high=1.0, \
                dtype=theano.config.floatX)
        drop_mask = drop_rnd > p
        # get a scaling factor to keep expectations fixed after droppage
        drop_scale = 1. / (1. - p)
        # apply dropout mask and rescaling factor to the input
        droppy_input = drop_scale * input * drop_mask
        return droppy_input

    def _noisy_params(self, P, noise_lvl=0.):
        """
        Noisy weights, like convolving energy surface with a gaussian.

        Returns P plus gaussian noise with standard deviation noise_lvl.
        """
        P_nz = P + self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl, \
                dtype=theano.config.floatX)
        return P_nz
コード例 #11
0
class ClassModel(object):
    """
    Controller for training a fancy pseudo-bayesian classifier.

    A linear softmax classifier is trained on top of the (dropout-masked)
    output of q_z_given_x.apply_shared, jointly with KL terms between the
    gaussian given by q_z_given_x and a learned gaussian prior
    (p_z_mean, p_z_logvar).

    Parameters:
        rng: numpy.random.RandomState (for reproducibility)
        x_in: the input data to encode
        y_in: int labels >= 1 for x_in when available, otherwise 0.
        q_z_given_x: InfNet for z given x
        class_count: number of classes to classify into
        z_dim: dimension of the "initial" latent space
        use_samples: whether to use z samples or just z mean
                     (stored, but not consulted when self.z is built)
    """
    def __init__(self, rng=None, \
            x_in=None, y_in=None, \
            q_z_given_x=None, \
            class_count=None, \
            z_dim=None, \
            use_samples=None):
        # setup a random stream for this ClassModel, seeded from rng
        # (rng must be provided, despite the None default)
        self.rng = RandStream(rng.randint(100000))

        # record the dimensions of various spaces relevant to this model
        self.class_count = class_count
        self.z_dim = z_dim
        self.shared_dim = q_z_given_x.shared_layers[-1].out_dim
        self.use_samples = use_samples

        # grab handles to the relevant InfNets
        self.q_z_given_x = q_z_given_x

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this ClassModel
        self.x_in = x_in
        self.y_in = y_in

        # initial value for the single-element shared variables
        zero_ary = to_fX( np.zeros((1,)) )
        # setup a variable for controlling dropout noise
        self.drop_rate = theano.shared(value=zero_ary, name='cm_drop_rate')
        self.set_drop_rate(0.0)

        # initialize classification layer parameters
        init_mat = to_fX(0.01 * npr.randn(self.shared_dim, self.class_count))
        init_vec = to_fX( np.zeros((self.class_count,)) )
        self.W_class = theano.shared(value=init_mat, name='cm_W_class')
        self.b_class = theano.shared(value=init_vec, name='cm_b_class')
        # initialize "optimizable" parameters specific to this CM
        init_vec = to_fX( np.zeros((self.z_dim,)) )
        self.p_z_mean = theano.shared(value=init_vec, name='cm_p_z_mean')
        self.p_z_logvar = theano.shared(value=init_vec, name='cm_p_z_logvar')

        #################
        # Setup self.z. #
        #################
        self.q_z_mean, self.q_z_logvar, self.q_z_samples = \
                self.q_z_given_x.apply(self.x_in, do_samples=True)
        # NOTE: the samples returned above are replaced by the output of
        # apply_shared, which is what the classifier consumes
        self.q_z_samples = self.q_z_given_x.apply_shared(self.x_in)

        # get a drop mask that drops things with probability p, already
        # multiplied by the 1 / (1 - p) rescaling factor
        drop_scale = 1. / (1. - self.drop_rate[0])
        drop_rnd = self.rng.uniform(size=self.q_z_samples.shape, \
                low=0.0, high=1.0, dtype=theano.config.floatX)
        drop_mask = drop_scale * (drop_rnd > self.drop_rate[0])

        # get a droppy version of either z mean or z samples
        # (the use_samples switch below is disabled; q_z_samples is always used)
        # if self.use_samples:
        #     self.z = self.q_z_samples * drop_mask
        # else:
        #     self.z = self.q_z_mean * drop_mask
        self.z = self.q_z_samples * drop_mask

        # compute class predictions (pre-softmax scores)
        self.y_out = T.dot(self.z, self.W_class) + self.b_class

        # compute KLds for training via variational free-energy
        self.kld_z_q2ps = gaussian_kld(self.q_z_mean, self.q_z_logvar, \
                                       self.p_z_mean, self.p_z_logvar)
        self.kld_z_p2qs = gaussian_kld(self.p_z_mean, self.p_z_logvar, \
                                       self.q_z_mean, self.q_z_logvar)

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX( np.zeros((1,)) )
        self.lr_1 = theano.shared(value=zero_ary, name='cm_lr_1')
        self.lr_2 = theano.shared(value=zero_ary, name='cm_lr_2')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='cm_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='cm_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='cm_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_q2p = theano.shared(value=zero_ary, name='cm_lam_kld_q2p')
        self.lam_kld_p2q = theano.shared(value=zero_ary, name='cm_lam_kld_p2q')
        self.set_lam_kld(lam_kld_q2p=0.9, lam_kld_p2q=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='cm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # Grab all of the "optimizable" parameters
        self.joint_params = [self.p_z_mean, self.p_z_logvar, \
                             self.W_class, self.b_class]
        self.joint_params.extend(self.q_z_given_x.mlp_params)

        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        self.nll_costs = self.lam_nll[0] * self._construct_nll_costs(self.y_in)
        self.nll_cost = T.mean(self.nll_costs)
        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_z_q2p, self.kld_z_p2q = self._construct_kld_costs(p=1.0)
        self.kld_costs = (self.lam_kld_q2p[0] * self.kld_z_q2p) + \
                         (self.lam_kld_p2q[0] * self.kld_z_p2q)
        self.kld_cost = T.mean(self.kld_costs)
        ##################################
        # CONSTRUCT THE FINAL JOINT COST #
        ##################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-INPUT COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the model parameters
        # (only lr_1 is passed as the step size; lr_2 is not used here)
        self.joint_updates = get_adam_updates(params=self.joint_params, \
                grads=self.joint_grads, alpha=self.lr_1, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)

        # Compile the training function and the evaluation helpers
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling class error estimator...")
        self.class_error = self._construct_class_error()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        # make easy access points for some interesting parameters
        self.inf_weights = self.q_z_given_x.shared_layers[0].W
        return

    def set_sgd_params(self, lr_1=0.01, lr_2=0.01, \
                mom_1=0.9, mom_2=0.999):
        """
        Set learning rate and momentum parameter for all updates.

        Each value is stored in the matching single-element shared variable.
        """
        zero_ary = np.zeros((1,))
        # set learning rates
        new_lr_1 = zero_ary + lr_1
        self.lr_1.set_value(to_fX(new_lr_1))
        new_lr_2 = zero_ary + lr_2
        self.lr_2.set_value(to_fX(new_lr_2))
        # set momentums
        new_mom_1 = zero_ary + mom_1
        self.mom_1.set_value(to_fX(new_mom_1))
        new_mom_2 = zero_ary + mom_2
        self.mom_2.set_value(to_fX(new_mom_2))
        return

    def set_lam_nll(self, lam_nll=1.0):
        """
        Set weight for controlling the influence of the data likelihood.
        """
        zero_ary = np.zeros((1,))
        new_lam = zero_ary + lam_nll
        self.lam_nll.set_value(to_fX(new_lam))
        return

    def set_lam_kld(self, lam_kld_q2p=1.0, lam_kld_p2q=1.0):
        """
        Set the relative weight of various KL-divergences.

        lam_kld_q2p weights KL(q||p) and lam_kld_p2q weights KL(p||q).
        """
        zero_ary = np.zeros((1,))
        new_lam = zero_ary + lam_kld_q2p
        self.lam_kld_q2p.set_value(to_fX(new_lam))
        new_lam = zero_ary + lam_kld_p2q
        self.lam_kld_p2q.set_value(to_fX(new_lam))
        return

    def set_lam_l2w(self, lam_l2w=1e-3):
        """
        Set the relative strength of l2 regularization on network params.
        """
        zero_ary = np.zeros((1,))
        new_lam = zero_ary + lam_l2w
        self.lam_l2w.set_value(to_fX(new_lam))
        return

    def set_drop_rate(self, drop_rate=0.0):
        """
        Set the dropout rate applied to the classifier's input features,
        i.e. the probability of dropping each element.
        """
        zero_ary = np.zeros((1,))
        new_val = zero_ary + drop_rate
        self.drop_rate.set_value(to_fX(new_val))
        return

    def _construct_nll_costs(self, yi):
        """
        Construct the categorical log-likelihood part of cost.

        Labels are 1-based; a label of 0 marks an unlabeled row, which gets
        a cost of exactly zero. Returns one cost per row, as a column.
        """
        y_prob = safe_softmax(self.y_out)
        row_idx = T.arange(yi.shape[0])
        # shift labels to 0-based column indices (unlabeled rows become -1)
        col_idx = yi.flatten() - 1
        # 1 for labeled rows, 0 for unlabeled rows
        row_mask = T.neq(yi, 0).reshape((yi.shape[0], 1))
        # labeled rows keep their class probabilities, while unlabeled rows
        # become all ones, so that -log(.) is 0 whatever column is picked
        wacky_mat = (y_prob * row_mask) + (1. - row_mask)
        flat_nlls = -T.log(wacky_mat[row_idx,col_idx])
        class_nlls = flat_nlls.reshape((yi.shape[0], 1))
        return class_nlls

    def _construct_kld_costs(self, p=1.0):
        """
        Construct the z KLd part of cost.

        The element-wise KLds are raised to the power p and summed over
        axis 1. Returns (KL(q||p), KL(p||q)).
        """
        kld_z_q2p = T.sum(self.kld_z_q2ps**p, axis=1, keepdims=True)
        kld_z_p2q = T.sum(self.kld_z_p2qs**p, axis=1, keepdims=True)
        return kld_z_q2p, kld_z_p2q

    def _construct_reg_costs(self):
        """
        Construct the cost for low-level basic regularization: the sum of
        squared values over all parameters in self.joint_params.
        """
        param_reg_cost = sum([T.sum(p**2.0) for p in self.joint_params])
        return param_reg_cost

    def _construct_train_joint(self):
        """
        Construct theano function to train all networks jointly.

        The function takes (x_in, y_in), applies self.joint_updates and
        returns [joint_cost, nll_cost, kld_cost, reg_cost, obs_costs].
        """
        inputs = [self.x_in, self.y_in]
        # collect the outputs to return from this function
        outputs = [self.joint_cost, self.nll_cost, self.kld_cost, \
                   self.reg_cost, self.obs_costs]
        # compile the theano function
        func = theano.function(inputs=inputs, outputs=outputs, \
                               updates=self.joint_updates)
        return func

    def _construct_compute_fe_terms(self):
        """
        Construct a function for computing terms in variational free energy.

        Returns fe_term_estimator(XI, XO, sample_count), which averages the
        per-row NLL and KL(q||p) over sample_count evaluations and returns
        [mean_nll, mean_kld].
        """
        # construct values to output (the nll here is not scaled by lam_nll)
        nll = self._construct_nll_costs(self.y_in)
        kld = self.kld_z_q2p
        # compile theano function for a one-sample free-energy estimate
        fe_term_sample = theano.function(inputs=[self.x_in, self.y_in], \
                                         outputs=[nll, kld])
        # construct a wrapper function for multi-sample free-energy estimate
        def fe_term_estimator(XI, XO, sample_count):
            # XI is fed to x_in and XO to y_in (i.e. XO holds the labels)
            # compute a multi-sample estimate of variational free-energy
            nll_sum = np.zeros((XI.shape[0],))
            kld_sum = np.zeros((XI.shape[0],))
            for i in range(sample_count):
                result = fe_term_sample(XI, XO)
                nll_sum += result[0].ravel()
                kld_sum += result[1].ravel()
            mean_nll = nll_sum / float(sample_count)
            mean_kld = kld_sum / float(sample_count)
            return [mean_nll, mean_kld]
        return fe_term_estimator

    def _construct_class_error(self):
        """
        Compute classification error for a set of observations xi with known
        labels yi, based on multiple passes through noisy initial model.

        Returns multi_sample_error(xi, yi, samples=20), which returns
        (err_rate, yp): the error rate over rows with a non-zero label, and
        the class scores averaged over the passes.
        """
        # make a function for computing self.y_out
        y_func = theano.function([self.x_in], outputs=self.y_out)
        def multi_sample_error(xi, yi, samples=20):
            # compute self.y_out for the observations in xi, averaged over
            # `samples` passes
            xi = to_fX(xi)
            yp = y_func(xi)
            for i in range(samples-1):
                yp += y_func(xi)
            yp = yp / float(samples)
            # get the implied class labels
            yc = np.argmax(yp, axis=1).flatten()
            yi = yi.flatten()
            # rows with label 0 are unlabeled and excluded from the error
            mask = 1.0 * (yi != 0)
            # labels are 1-based, predictions are 0-based
            yi = yi - 1
            # compute the classification error for points with valid labels
            err_rate = np.sum(((yi != yc) * mask)) / np.sum(mask)
            return err_rate, yp
        return multi_sample_error
コード例 #12
0
class GenFCModule(object):
    """
    Generator module made of one fully connected layer followed by a linear
    transform. Each stage can be batch normalized, and the relu after the
    second stage is optional.
    """
    def __init__(self, rand_dim, out_dim, fc_dim,
                 apply_bn_1=True, apply_bn_2=True,
                 init_func=None, rand_type='normal',
                 final_relu=True, mod_name='dm_fc'):
        # layer sizes
        self.rand_dim, self.fc_dim, self.out_dim = rand_dim, fc_dim, out_dim
        # behaviour flags
        self.apply_bn_1, self.apply_bn_2 = apply_bn_1, apply_bn_2
        self.final_relu = final_relu
        self.rand_type = rand_type
        self.mod_name = mod_name
        # source of random inputs when the caller does not supply them
        self.rng = RandStream(123)
        # fall back to a small-scale Normal initializer for the weights
        self.init_func = inits.Normal(scale=0.02) if init_func is None \
            else init_func
        self._init_params()

    def _init_params(self):
        """
        Create the weights (and, where batch norm is enabled, the gains and
        biases) and collect them in self.params.
        """
        prefix = self.mod_name
        self.w1 = self.init_func((self.rand_dim, self.fc_dim),
                                 "{}_w1".format(prefix))
        self.w2 = self.init_func((self.fc_dim, self.out_dim),
                                 "{}_w2".format(prefix))
        self.params = [self.w1, self.w2]
        # gain/bias pair for the batch normed hidden layer
        if self.apply_bn_1:
            gain_init = inits.Normal(loc=1., scale=0.02)
            bias_init = inits.Constant(c=0.)
            self.g1 = gain_init(self.fc_dim, "{}_g1".format(prefix))
            self.b1 = bias_init(self.fc_dim, "{}_b1".format(prefix))
            self.params += [self.g1, self.b1]
        # gain/bias pair for the batch normed output layer
        if self.apply_bn_2:
            gain_init = inits.Normal(loc=1., scale=0.02)
            bias_init = inits.Constant(c=0.)
            self.g2 = gain_init(self.out_dim, "{}_g2".format(prefix))
            self.b2 = bias_init(self.out_dim, "{}_b2".format(prefix))
            self.params += [self.g2, self.b2]

    def apply(self, batch_size=None, rand_vals=None):
        """
        Run the module and return its output. Supply _either_ batch_size (to
        draw fresh random values from self.rng) or rand_vals (to use given
        values); rand_vals takes precedence when both are passed.
        """
        assert (batch_size is not None) or (rand_vals is not None), \
            "need either batch_size or rand_vals"
        if rand_vals is not None:
            noise_shape = (rand_vals.shape[0], self.rand_dim)
        elif self.rand_type == 'normal':
            noise_shape = (batch_size, self.rand_dim)
            rand_vals = self.rng.normal(size=noise_shape, avg=0.0, std=1.0,
                                        dtype=theano.config.floatX)
        else:
            noise_shape = (batch_size, self.rand_dim)
            rand_vals = self.rng.uniform(size=noise_shape, low=-1.0, high=1.0,
                                         dtype=theano.config.floatX)
        noise = rand_vals.reshape(noise_shape)
        # random values -> hidden fc layer
        hidden = T.dot(noise, self.w1)
        if self.apply_bn_1:
            hidden = batchnorm(hidden, g=self.g1, b=self.b1)
        hidden = relu(hidden)
        # hidden fc layer -> output
        out = T.dot(hidden, self.w2)
        if self.apply_bn_2:
            out = batchnorm(out, g=self.g2, b=self.b2)
        return relu(out) if self.final_relu else out
コード例 #13
0
class GenUniModule(object):
    """
    Generator module that applies a single linear transform to the random
    values, optionally followed by batch normalization and a relu.
    """
    def __init__(self,
                 rand_dim,
                 out_dim,
                 apply_bn=True,
                 init_func=None,
                 rand_type='normal',
                 final_relu=True,
                 mod_name='dm_uni'):
        self.rand_dim, self.out_dim = rand_dim, out_dim
        self.apply_bn = apply_bn
        self.final_relu = final_relu
        self.rand_type = rand_type
        self.mod_name = mod_name
        # source of random inputs when the caller does not supply them
        self.rng = RandStream(123)
        # fall back to a small-scale Normal initializer for the weights
        if init_func is not None:
            self.init_func = init_func
        else:
            self.init_func = inits.Normal(scale=0.02)
        self._init_params()

    def _init_params(self):
        """
        Create the weight matrix (plus gain and bias when batch norm is
        enabled) and collect them in self.params.
        """
        prefix = self.mod_name
        self.w1 = self.init_func((self.rand_dim, self.out_dim),
                                 "{}_w1".format(prefix))
        self.params = [self.w1]
        if not self.apply_bn:
            return
        # gain/bias pair used by the batch norm after the linear transform
        gain_init = inits.Normal(loc=1., scale=0.02)
        bias_init = inits.Constant(c=0.)
        self.g1 = gain_init(self.out_dim, "{}_g1".format(prefix))
        self.b1 = bias_init(self.out_dim, "{}_b1".format(prefix))
        self.params += [self.g1, self.b1]

    def apply(self, batch_size=None, rand_vals=None):
        """
        Run the module and return its output. Supply _either_ batch_size (to
        draw fresh random values from self.rng) or rand_vals (to use given
        values); rand_vals takes precedence when both are passed.
        """
        assert (batch_size is not None) or (rand_vals is not None), \
            "need either batch_size or rand_vals"
        if rand_vals is not None:
            n_rows = rand_vals.shape[0]
        else:
            n_rows = batch_size
            if self.rand_type == 'normal':
                rand_vals = self.rng.normal(size=(n_rows, self.rand_dim),
                                            avg=0.0, std=1.0,
                                            dtype=theano.config.floatX)
            else:
                rand_vals = self.rng.uniform(size=(n_rows, self.rand_dim),
                                             low=-1.0, high=1.0,
                                             dtype=theano.config.floatX)
        noise = rand_vals.reshape((n_rows, self.rand_dim))
        # linear transform of the random values
        out = T.dot(noise, self.w1)
        if self.apply_bn:
            out = batchnorm(out, g=self.g1, b=self.b1)
        return relu(out) if self.final_relu else out


##############
# EYE BUFFER #
##############
コード例 #14
0
class TwoStageModel(object):
    """
    Controller for training a two-stage latent variable model.

    The constructor builds the whole symbolic graph (encoder q(z|x), the
    z -> s -> h -> x generator, the KLd / NLL / parameter-regularization
    costs and the updates returned by get_adam_updates) and compiles the
    training and sampling functions.

    Parameters:
        rng: numpy.random.RandomState (for reproducibility). Required even
             though the default is None: it seeds the internal RandStream.
        x_in: the input data to encode
        x_out: the target output to decode
        p_s_given_z: InfNet for initializing "canvas" state
        p_h_given_s: InfNet for h given s
        p_x_given_s_h: InfNet for x given s and h
        q_z_given_x: InfNet for z given x
        q_h_given_x_s: InfNet for h given x and s
        x_dim: dimension of the observations to generate
        z_dim: dimension of the "first" latent space
        s_dim: dimension of the "second" latent space
        h_dim: dimension of the "third" latent space
        params: REQUIRED PARAMS SHOWN BELOW
                x_type: can be "bernoulli" or "gaussian"
                obs_transform: can be 'none' or 'sigmoid'. Optional: when
                               missing 'sigmoid' is used, and 'sigmoid' is
                               always used when x_type is "bernoulli".
        shared_param_dicts: optional dict with entries 'p_z_mean',
                            'p_z_logvar' and 'obs_logvar' to reuse. When
                            None, fresh shared variables are created and
                            stored in self.shared_param_dicts.
    """
    def __init__(self, rng=None,
            x_in=None, x_out=None,
            p_s_given_z=None,
            p_h_given_s=None,
            p_x_given_s_h=None,
            q_z_given_x=None,
            q_h_given_x_s=None,
            x_dim=None,
            z_dim=None,
            s_dim=None,
            h_dim=None,
            params=None,
            shared_param_dicts=None):
        # seed this model's random stream from the user-provided rng
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_type = self.params['x_type']
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        if 'obs_transform' in self.params:
            assert((self.params['obs_transform'] == 'sigmoid') or
                    (self.params['obs_transform'] == 'none'))
            use_sigmoid = (self.params['obs_transform'] == 'sigmoid')
        else:
            use_sigmoid = True
        # bernoulli observations always go through a sigmoid, whatever the
        # requested obs_transform is
        if use_sigmoid or (self.x_type == 'bernoulli'):
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        else:
            self.obs_transform = lambda x: x
        self.shared_param_dicts = shared_param_dicts

        # record the dimensions of various spaces relevant to this model
        self.x_dim = x_dim
        self.z_dim = z_dim
        self.s_dim = s_dim
        self.h_dim = h_dim

        # grab handles to the relevant InfNets
        self.q_z_given_x = q_z_given_x
        self.q_h_given_x_s = q_h_given_x_s
        self.p_s_given_z = p_s_given_z
        self.p_h_given_s = p_h_given_s
        self.p_x_given_s_h = p_x_given_s_h

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this TwoStageModel
        self.x_in = x_in
        self.x_out = x_out
        self.batch_reps = T.lscalar()

        # length-1 template used to initialize every scalar shared variable
        zero_ary = to_fX( np.zeros((1,)) )
        # setup switching variable for changing between sampling/training
        self.train_switch = theano.shared(value=zero_ary, name='msm_train_switch')
        self.set_train_switch(1.0)
        # setup a variable for controlling dropout noise
        self.drop_rate = theano.shared(value=zero_ary, name='msm_drop_rate')
        self.set_drop_rate(0.0)
        # this weight balances l1 vs. l2 penalty on posterior KLds
        self.lam_kld_l1l2 = theano.shared(value=zero_ary, name='msm_lam_kld_l1l2')
        self.set_lam_kld_l1l2(1.0)

        if self.shared_param_dicts is None:
            # initialize "optimizable" parameters specific to this model
            init_vec = to_fX( np.zeros((self.z_dim,)) )
            self.p_z_mean = theano.shared(value=init_vec, name='msm_p_z_mean')
            self.p_z_logvar = theano.shared(value=init_vec, name='msm_p_z_logvar')
            self.obs_logvar = theano.shared(value=zero_ary, name='msm_obs_logvar')
            self.shared_param_dicts = {}
            self.shared_param_dicts['p_z_mean'] = self.p_z_mean
            self.shared_param_dicts['p_z_logvar'] = self.p_z_logvar
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            # reuse the shared variables of an existing model
            self.p_z_mean = self.shared_param_dicts['p_z_mean']
            self.p_z_logvar = self.shared_param_dicts['p_z_logvar']
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
        # tanh squashing keeps the observation log-variance inside (-8, 8)
        self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)

        # get a drop mask that drops things with probability p; kept entries
        # are rescaled by 1 / (1 - p)
        drop_scale = 1. / (1. - self.drop_rate[0])
        drop_rnd = self.rng.uniform(size=self.x_out.shape,
                low=0.0, high=1.0, dtype=theano.config.floatX)
        drop_mask = drop_scale * (drop_rnd > self.drop_rate[0])

        ##############################################
        # Setup the TwoStageModels main computation. #
        ##############################################
        print("Building TSM...")
        # samples of "first" latent state
        drop_x = drop_mask * self.x_in
        z_q_mean, z_q_logvar, self.z = \
                self.q_z_given_x.apply(drop_x, do_samples=True)
        # compute relevant KLds for this step
        self.kld_z_q2ps = gaussian_kld(z_q_mean, z_q_logvar,
                                       self.p_z_mean, self.p_z_logvar)
        self.kld_z_p2qs = gaussian_kld(self.p_z_mean, self.p_z_logvar,
                                       z_q_mean, z_q_logvar)
        # transform "first" latent state into "second" latent state
        self.s, _ = self.p_s_given_z.apply(self.z, do_samples=False)

        # get samples of h, conditioned on current s
        h_p_mean, h_p_logvar, h_p = self.p_h_given_s.apply(
                self.s, do_samples=True)
        # get variational samples of h, given s and x_out
        h_q_mean, h_q_logvar, h_q = self.q_h_given_x_s.apply(
                T.horizontal_stack(self.x_out, self.s),
                do_samples=True)

        # make h samples that can be switched between h_p and h_q
        self.h = (self.train_switch[0] * h_q) + \
                 ((1.0 - self.train_switch[0]) * h_p)

        # compute relevant KLds for this step
        self.kld_h_q2ps = gaussian_kld(h_q_mean, h_q_logvar,
                                       h_p_mean, h_p_logvar)
        self.kld_h_p2qs = gaussian_kld(h_p_mean, h_p_logvar,
                                       h_q_mean, h_q_logvar)

        # p_x_given_s_h is conditioned on s and h.
        self.x_gen, _ = self.p_x_given_s_h.apply(
                T.horizontal_stack(self.s, self.h),
                do_samples=False)

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        self.lr_1 = theano.shared(value=zero_ary, name='msm_lr_1')
        self.lr_2 = theano.shared(value=zero_ary, name='msm_lr_2')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='msm_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='msm_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='msm_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_z = theano.shared(value=zero_ary, name='msm_lam_kld_z')
        self.lam_kld_q2p = theano.shared(value=zero_ary, name='msm_lam_kld_q2p')
        self.lam_kld_p2q = theano.shared(value=zero_ary, name='msm_lam_kld_p2q')
        self.set_lam_kld(lam_kld_z=1.0, lam_kld_q2p=0.7, lam_kld_p2q=0.3)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # Grab all of the "optimizable" parameters in "group 1"
        self.group_1_params = []
        self.group_1_params.extend(self.q_z_given_x.mlp_params)
        self.group_1_params.extend(self.q_h_given_x_s.mlp_params)
        # Grab all of the "optimizable" parameters in "group 2"
        self.group_2_params = [self.p_z_mean, self.p_z_logvar]
        self.group_2_params.extend(self.p_s_given_z.mlp_params)
        self.group_2_params.extend(self.p_h_given_s.mlp_params)
        self.group_2_params.extend(self.p_x_given_s_h.mlp_params)

        # Make a joint list of parameters group 1/2
        self.joint_params = self.group_1_params + self.group_2_params

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_z_q2p, self.kld_z_p2q, self.kld_h_q2p, self.kld_h_p2q = \
                self._construct_kld_costs(p=1.0)
        self.kld_z = (self.lam_kld_q2p[0] * self.kld_z_q2p) + \
                     (self.lam_kld_p2q[0] * self.kld_z_p2q)
        self.kld_h = (self.lam_kld_q2p[0] * self.kld_h_q2p) + \
                     (self.lam_kld_p2q[0] * self.kld_h_p2q)
        self.kld_costs = (self.lam_kld_z[0] * self.kld_z) + self.kld_h
        # now do l2 KLd costs
        self.kl2_z_q2p, self.kl2_z_p2q, self.kl2_h_q2p, self.kl2_h_p2q = \
                self._construct_kld_costs(p=2.0)
        self.kl2_z = (self.lam_kld_q2p[0] * self.kl2_z_q2p) + \
                     (self.lam_kld_p2q[0] * self.kl2_z_p2q)
        self.kl2_h = (self.lam_kld_q2p[0] * self.kl2_h_q2p) + \
                     (self.lam_kld_p2q[0] * self.kl2_h_p2q)
        self.kl2_costs = (self.lam_kld_z[0] * self.kl2_z) + self.kl2_h
        # compute joint l1/l2 KLd cost
        self.kld_l1l2_costs = (self.lam_kld_l1l2[0] * self.kld_costs) + \
                ((1.0 - self.lam_kld_l1l2[0]) * self.kl2_costs)
        # compute "mean" (rather than per-input) costs
        self.kld_cost = T.mean(self.kld_costs)
        self.kl2_cost = T.mean(self.kl2_costs)
        self.kld_l1l2_cost = T.mean(self.kld_l1l2_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        self.nll_costs = self._construct_nll_costs(self.x_out)
        self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_l1l2_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-INPUT COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_l1l2_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        grad_list = T.grad(self.joint_cost, self.joint_params)
        self.joint_grads = OrderedDict(zip(self.joint_params, grad_list))

        # Construct the updates for the generator and inferencer networks
        self.group_1_updates = get_adam_updates(params=self.group_1_params,
                grads=self.joint_grads, alpha=self.lr_1,
                beta1=self.mom_1, beta2=self.mom_2,
                mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        self.group_2_updates = get_adam_updates(params=self.group_2_params,
                grads=self.joint_grads, alpha=self.lr_2,
                beta1=self.mom_1, beta2=self.mom_2,
                mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        # merge both groups; on a key clash the group 2 entry wins
        self.joint_updates = OrderedDict()
        for k in self.group_1_updates:
            self.joint_updates[k] = self.group_1_updates[k]
        for k in self.group_2_updates:
            self.joint_updates[k] = self.group_2_updates[k]

        # Construct a function for jointly training the generator/inferencer
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling open-loop model sampler...")
        self.sample_from_prior = self._construct_sample_from_prior()
        print("Compiling data-guided model sampler...")
        self.sample_from_input = self._construct_sample_from_input()
        # make easy access points for some interesting parameters
        self.gen_gen_weights = self.p_x_given_s_h.mu_layers[-1].W
        return

    @staticmethod
    def _set_shared_scalar(shared_var, value):
        """
        Overwrite a length-1 shared variable with value (cast via to_fX).
        """
        shared_var.set_value(to_fX(np.zeros((1,)) + value))
        return

    def set_sgd_params(self, lr_1=0.01, lr_2=0.01,
                mom_1=0.9, mom_2=0.999):
        """
        Set learning rate and momentum parameter for all updates.

        lr_1 / lr_2 are the step sizes of parameter groups 1 / 2; mom_1 and
        mom_2 are passed to get_adam_updates as beta1 and beta2.
        """
        # set learning rates
        self._set_shared_scalar(self.lr_1, lr_1)
        self._set_shared_scalar(self.lr_2, lr_2)
        # set momentums
        self._set_shared_scalar(self.mom_1, mom_1)
        self._set_shared_scalar(self.mom_2, mom_2)
        return

    def set_lam_nll(self, lam_nll=1.0):
        """
        Set weight for controlling the influence of the data likelihood.
        """
        self._set_shared_scalar(self.lam_nll, lam_nll)
        return

    def set_lam_kld(self, lam_kld_z=1.0, lam_kld_q2p=1.0, lam_kld_p2q=1.0):
        """
        Set the relative weight of various KL-divergences.

        lam_kld_z scales the z-level KLd relative to the h-level one;
        lam_kld_q2p / lam_kld_p2q weight the KL(q||p) / KL(p||q) directions.
        """
        self._set_shared_scalar(self.lam_kld_z, lam_kld_z)
        self._set_shared_scalar(self.lam_kld_q2p, lam_kld_q2p)
        self._set_shared_scalar(self.lam_kld_p2q, lam_kld_p2q)
        return

    def set_lam_l2w(self, lam_l2w=1e-3):
        """
        Set the relative strength of l2 regularization on network params.
        """
        self._set_shared_scalar(self.lam_l2w, lam_l2w)
        return

    def set_train_switch(self, switch_val=0.0):
        """
        Set the switch for changing between training and sampling behavior.

        Values below 0.5 are stored as 0.0 (h is taken from p_h_given_s),
        anything else as 1.0 (h is taken from q_h_given_x_s).
        """
        switch_val = 0.0 if (switch_val < 0.5) else 1.0
        self._set_shared_scalar(self.train_switch, switch_val)
        return

    def set_lam_kld_l1l2(self, lam_kld_l1l2=1.0):
        """
        Set the mixing weight between the l1 (p=1) and l2 (p=2) KLd costs.

        The joint KLd cost is lam * l1_cost + (1 - lam) * l2_cost.
        """
        self._set_shared_scalar(self.lam_kld_l1l2, lam_kld_l1l2)
        return

    def set_drop_rate(self, drop_rate=0.0):
        """
        Set the probability with which entries of x_in are dropped (zeroed)
        before being passed to the encoder q_z_given_x.
        """
        self._set_shared_scalar(self.drop_rate, drop_rate)
        return

    def _construct_nll_costs(self, xo):
        """
        Construct the negative log-likelihood part of free energy.

        xo is the symbolic target; the result is the negated log-probability
        of xo under the transformed generator output.
        """
        xh = self.obs_transform(self.x_gen)
        if self.x_type == 'bernoulli':
            ll_costs = log_prob_bernoulli(xo, xh)
        else:
            ll_costs = log_prob_gaussian2(xo, xh,
                    log_vars=self.bounded_logvar)
        nll_costs = -ll_costs
        return nll_costs

    def _construct_kld_costs(self, p=1.0):
        """
        Construct the posterior KL-divergence part of cost to minimize.

        Each element-wise KLd is raised to the power p and summed over
        axis 1 (keepdims=True). Returns the list
        [kld_z_q2p, kld_z_p2q, kld_h_q2p, kld_h_p2q].
        """
        kld_z_q2p = T.sum(self.kld_z_q2ps**p, axis=1, keepdims=True)
        kld_z_p2q = T.sum(self.kld_z_p2qs**p, axis=1, keepdims=True)
        kld_h_q2p = T.sum(self.kld_h_q2ps**p, axis=1, keepdims=True)
        kld_h_p2q = T.sum(self.kld_h_p2qs**p, axis=1, keepdims=True)
        return [kld_z_q2p, kld_z_p2q, kld_h_q2p, kld_h_p2q]

    def _construct_reg_costs(self):
        """
        Construct the cost for low-level basic regularization: the sum of
        squared entries over every parameter in self.joint_params.
        """
        param_reg_cost = sum([T.sum(p**2.0) for p in self.joint_params])
        return param_reg_cost

    def _construct_train_joint(self):
        """
        Construct theano function to train all networks jointly.

        The compiled function takes (xi, xo, batch_reps), repeats each row of
        xi / xo batch_reps times along axis 0, applies self.joint_updates and
        returns [joint_cost, nll_cost, kld_cost, reg_cost, obs_costs].
        """
        # setup some symbolic variables for theano to deal with
        xi = T.matrix()
        xo = T.matrix()
        # collect the outputs to return from this function
        outputs = [self.joint_cost, self.nll_cost, self.kld_cost,
                   self.reg_cost, self.obs_costs]
        # compile the theano function
        func = theano.function(inputs=[ xi, xo, self.batch_reps ],
                outputs=outputs,
                givens={ self.x_in: xi.repeat(self.batch_reps, axis=0),
                         self.x_out: xo.repeat(self.batch_reps, axis=0) },
                updates=self.joint_updates)
        return func

    def _construct_compute_fe_terms(self):
        """
        Construct a function for computing terms in variational free energy.

        The returned callable has signature (XI, XO, sample_count) and
        returns [mean_nll, mean_kld]: per-row averages over sample_count
        evaluations of the one-sample estimator.
        """
        # construct values to output
        nll = self._construct_nll_costs(self.x_out)
        kld = self.kld_z_q2p + self.kld_h_q2p
        # compile theano function for a one-sample free-energy estimate
        fe_term_sample = theano.function(inputs=[self.x_in, self.x_out],
                                         outputs=[nll, kld])
        # construct a wrapper function for multi-sample free-energy estimate
        def fe_term_estimator(XI, XO, sample_count):
            # compute a multi-sample estimate of variational free-energy
            nll_sum = np.zeros((XI.shape[0],))
            kld_sum = np.zeros((XI.shape[0],))
            for _ in range(sample_count):
                result = fe_term_sample(XI, XO)
                nll_sum += result[0].ravel()
                kld_sum += result[1].ravel()
            mean_nll = nll_sum / float(sample_count)
            mean_kld = kld_sum / float(sample_count)
            return [mean_nll, mean_kld]
        return fe_term_estimator

    def _construct_sample_from_prior(self):
        """
        Construct a function for drawing independent samples from the
        distribution generated by this TwoStageModel.

        The returned callable takes samp_count and returns the transformed
        generator output for samp_count values of z drawn with npr.randn,
        with x_in and x_out replaced by zeros.
        """
        z_sym = T.matrix()
        x_sym = T.matrix()
        sample_func = theano.function(inputs=[z_sym, x_sym],
                outputs=self.obs_transform(self.x_gen),
                givens={self.z: z_sym,
                        self.x_in: T.zeros_like(x_sym),
                        self.x_out: T.zeros_like(x_sym)})
        def prior_sampler(samp_count):
            x_samps = to_fX( np.zeros((samp_count, self.x_dim)) )
            old_switch = self.train_switch.get_value(borrow=False)
            # set model to generation mode
            self.set_train_switch(switch_val=0.0)
            try:
                z_samps = to_fX( npr.randn(samp_count, self.z_dim) )
                model_samps = sample_func(z_samps, x_samps)
            finally:
                # always put the model back into its previous mode, even if
                # sampling raised
                self.set_train_switch(switch_val=old_switch)
            return model_samps
        return prior_sampler

    def _construct_sample_from_input(self):
        """
        Construct a function for drawing samples from the distribution
        generated by this TwoStageModel, conditioned on some inputs to the
        initial encoder stage (i.e. self.q_z_given_x).

        The returned callable has signature
        (XI, XO=None, guided_decoding=False). XO defaults to XI. With
        guided_decoding=True h is sampled from the variational q, otherwise
        from the generative p. It returns the transformed generator output.
        """
        sample_func = theano.function(inputs=[self.x_in, self.x_out],
                outputs=self.obs_transform(self.x_gen))
        def conditional_sampler(XI, XO=None, guided_decoding=False):
            XI = to_fX( XI )
            if XO is None:
                XO = XI
            XO = to_fX( XO )
            # set model to desired generation mode
            old_switch = self.train_switch.get_value(borrow=False)
            if guided_decoding:
                # take samples from guide policies (i.e. variational q)
                self.set_train_switch(switch_val=1.0)
            else:
                # take samples from model's generative policy
                self.set_train_switch(switch_val=0.0)
            try:
                # draw guided/unguided conditional samples
                model_samps = sample_func(XI, XO)
            finally:
                # always put the model back into its previous mode, even if
                # sampling raised
                self.set_train_switch(switch_val=old_switch)
            return model_samps
        return conditional_sampler
コード例 #15
0
class GenFCModule(object):
    """
    Two-stage generator module: random values pass through a fully connected
    layer (with relu), then through a linear transform whose relu is
    optional. Batch normalization can be toggled for each stage.
    """
    def __init__(self,
                 rand_dim,
                 out_dim,
                 fc_dim,
                 apply_bn_1=True,
                 apply_bn_2=True,
                 init_func=None,
                 rand_type='normal',
                 final_relu=True,
                 mod_name='dm_fc'):
        # default weight initializer is a small-scale Normal
        if init_func is None:
            init_func = inits.Normal(scale=0.02)
        self.init_func = init_func
        # stream used to draw inputs when apply() is given only a batch size
        self.rng = RandStream(123)
        self.mod_name = mod_name
        self.rand_type = rand_type
        self.final_relu = final_relu
        self.apply_bn_1 = apply_bn_1
        self.apply_bn_2 = apply_bn_2
        self.rand_dim = rand_dim
        self.fc_dim = fc_dim
        self.out_dim = out_dim
        self._init_params()

    def _init_params(self):
        """
        Build the parameters of both stages and gather them in self.params.
        """
        name = self.mod_name
        in_dim, mid_dim, end_dim = self.rand_dim, self.fc_dim, self.out_dim
        self.w1 = self.init_func((in_dim, mid_dim), "{}_w1".format(name))
        self.w2 = self.init_func((mid_dim, end_dim), "{}_w2".format(name))
        self.params = [self.w1, self.w2]
        # batch normed stages each get a gain and a bias
        if self.apply_bn_1:
            make_gain = inits.Normal(loc=1., scale=0.02)
            make_bias = inits.Constant(c=0.)
            self.g1 = make_gain(mid_dim, "{}_g1".format(name))
            self.b1 = make_bias(mid_dim, "{}_b1".format(name))
            self.params.extend([self.g1, self.b1])
        if self.apply_bn_2:
            make_gain = inits.Normal(loc=1., scale=0.02)
            make_bias = inits.Constant(c=0.)
            self.g2 = make_gain(end_dim, "{}_g2".format(name))
            self.b2 = make_bias(end_dim, "{}_b2".format(name))
            self.params.extend([self.g2, self.b2])

    def apply(self, batch_size=None, rand_vals=None):
        """
        Compute the module output. Pass _either_ batch_size (random values
        are drawn from self.rng) or rand_vals (used as given); when both are
        passed, rand_vals is used.
        """
        have_input = (batch_size is not None) or (rand_vals is not None)
        assert have_input, "need either batch_size or rand_vals"
        if rand_vals is None:
            shape = (batch_size, self.rand_dim)
            if self.rand_type == 'normal':
                rand_vals = self.rng.normal(
                    size=shape, avg=0.0, std=1.0,
                    dtype=theano.config.floatX)
            else:
                rand_vals = self.rng.uniform(
                    size=shape, low=-1.0, high=1.0,
                    dtype=theano.config.floatX)
        else:
            shape = (rand_vals.shape[0], self.rand_dim)
        rand_vals = rand_vals.reshape(shape)
        # stage 1: fully connected layer with relu
        fc_act = T.dot(rand_vals, self.w1)
        if self.apply_bn_1:
            fc_act = batchnorm(fc_act, g=self.g1, b=self.b1)
        fc_act = relu(fc_act)
        # stage 2: linear transform, relu only when requested
        result = T.dot(fc_act, self.w2)
        if self.apply_bn_2:
            result = batchnorm(result, g=self.g2, b=self.b2)
        if not self.final_relu:
            return result
        return relu(result)
コード例 #16
0
class GenConvModule(object):
    """
    Module of one "fractionally strided" convolution layer followed by one
    regular convolution layer. Inputs to the fractionally strided convolution
    can optionally be augmented with some random values.

    Params:
        filt_shape: shape for convolution filters -- should be square and odd
                    (only filt_shape[0] is read; it is used for both dims)
        in_chans: number of channels in the inputs to module
        out_chans: number of channels in the outputs from module
        rand_chans: number of random channels to augment input
        use_rand: flag for whether or not to augment inputs
        apply_bn_1: flag for whether to batch normalize following first conv
        apply_bn_2: flag for whether to batch normalize following second conv
        us_stride: upsampling ratio in the fractionally strided convolution
        use_pooling: whether to use unpooling or fractional striding
        init_func: function for initializing module parameters
        mod_name: text name for identifying module in theano graph
        rand_type: whether to use Gaussian or uniform randomness
                   ('normal' gives Gaussian; any other value gives uniform)
        rand_seed: seed for this module's random stream. The default (123)
                   is the value that used to be hard-coded; pass distinct
                   seeds to give different modules different streams.
    """
    def __init__(self,
                 filt_shape,
                 in_chans,
                 out_chans,
                 rand_chans,
                 use_rand=True,
                 apply_bn_1=True,
                 apply_bn_2=True,
                 us_stride=2,
                 use_pooling=True,
                 init_func=None,
                 mod_name='gm_conv',
                 rand_type='normal',
                 rand_seed=123):
        assert ((filt_shape[0] % 2) > 0), "filter dim should be odd (not even)"
        self.filt_dim = filt_shape[0]
        self.in_chans = in_chans
        self.out_chans = out_chans
        self.rand_chans = rand_chans
        self.use_rand = use_rand
        self.apply_bn_1 = apply_bn_1
        self.apply_bn_2 = apply_bn_2
        self.us_stride = us_stride
        self.use_pooling = use_pooling
        self.mod_name = mod_name
        self.rand_type = rand_type
        self.rand_seed = rand_seed
        self.rng = RandStream(rand_seed)
        if init_func is None:
            self.init_func = inits.Normal(scale=0.02)
        else:
            self.init_func = init_func
        self._init_params()  # initialize parameters

    def _init_params(self):
        """
        Initialize parameters for the layers in this generator module.

        Creates filters self.w1 and self.w2, plus a gain/bias pair for each
        conv whose batch norm is enabled, and collects them in self.params.
        """
        # when noise is used, the random channels are stacked on the
        # exogenous input, so the first conv sees extra input channels
        if self.use_rand:
            w1_in_chans = self.in_chans + self.rand_chans
        else:
            w1_in_chans = self.in_chans
        self.w1 = self.init_func(
            (self.out_chans, w1_in_chans, self.filt_dim, self.filt_dim),
            "{}_w1".format(self.mod_name))
        self.w2 = self.init_func(
            (self.out_chans, self.out_chans, self.filt_dim, self.filt_dim),
            "{}_w2".format(self.mod_name))
        self.params = [self.w1, self.w2]
        # make gains and biases for transforms that will get batch normed;
        # the shape passed to the initializers is a plain int (out_chans)
        if self.apply_bn_1:
            gain_ifn = inits.Normal(loc=1., scale=0.02)
            bias_ifn = inits.Constant(c=0.)
            self.g1 = gain_ifn(self.out_chans, "{}_g1".format(self.mod_name))
            self.b1 = bias_ifn(self.out_chans, "{}_b1".format(self.mod_name))
            self.params.extend([self.g1, self.b1])
        if self.apply_bn_2:
            gain_ifn = inits.Normal(loc=1., scale=0.02)
            bias_ifn = inits.Constant(c=0.)
            self.g2 = gain_ifn(self.out_chans, "{}_g2".format(self.mod_name))
            self.b2 = bias_ifn(self.out_chans, "{}_b2".format(self.mod_name))
            self.params.extend([self.g2, self.b2])

    def apply(self, input, rand_vals=None):
        """
        Apply this generator module to some input.

        input: symbolic 4D input; axis 0 is treated as the batch axis,
               axis 1 as channels and axes 2/3 as the spatial dims.
        rand_vals: optional noise to stack on the input channels. Only used
                   when self.use_rand is set; sampled from self.rng when
                   omitted. It is reshaped to
                   (batch, rand_chans, rows, cols) of the (unpooled) input.

        Returns the relu output of the second convolution.
        """
        batch_size = input.shape[0]
        bm = int((self.filt_dim - 1) / 2)  # use "same" mode convolutions
        ss = self.us_stride  # stride for "learned upsampling"
        if self.use_pooling:
            # "unpool" the input: repeat every value ss times along both
            # spatial axes
            input = input.repeat(ss, axis=2).repeat(ss, axis=3)
        if self.use_rand:
            # shape for the random channels that will augment the input
            rand_shape = (batch_size, self.rand_chans, input.shape[2],
                          input.shape[3])
            if rand_vals is None:
                if self.rand_type == 'normal':
                    rand_vals = self.rng.normal(
                        size=rand_shape, avg=0.0, std=1.0,
                        dtype=theano.config.floatX)
                else:
                    rand_vals = self.rng.uniform(
                        size=rand_shape, low=-1.0, high=1.0,
                        dtype=theano.config.floatX)
            rand_vals = rand_vals.reshape(rand_shape)
            # stack random values on top of input (along the channel axis)
            full_input = T.concatenate([rand_vals, input], axis=1)
        else:
            # don't augment input with random channels
            full_input = input
        # apply first convolution, perhaps with fractional striding
        if self.use_pooling:
            # upsampling already happened via unpooling, so use stride 1
            h1 = dnn_conv(full_input,
                          self.w1,
                          subsample=(1, 1),
                          border_mode=(bm, bm))
        else:
            # apply first conv layer (with fractional stride for upsampling)
            h1 = deconv(full_input,
                        self.w1,
                        subsample=(ss, ss),
                        border_mode=(bm, bm))
        if self.apply_bn_1:
            h1 = batchnorm(h1, g=self.g1, b=self.b1)
        h1 = relu(h1)
        # apply second conv layer
        h2 = dnn_conv(h1, self.w2, subsample=(1, 1), border_mode=(bm, bm))
        if self.apply_bn_2:
            h2 = batchnorm(h2, g=self.g2, b=self.b2)
        h2 = relu(h2)
        return h2
コード例 #17
0
ファイル: NetLayers.py プロジェクト: darcy0511/NN-Python
class ConvPoolLayer(object):
    """
    A simple convolution --> max-pooling layer.

    The (symbolic) input to this layer must be a theano.tensor.dtensor4 shaped
    like (batch_size, chan_count, im_dim_1, im_dim_2).

    filt_def should be a 4-tuple like (filt_count, in_chans, filt_def_1, filt_def_2)

    pool_def should be a 2-tuple like (pool_dim, pool_stride)

    Other constructor arguments:
        rng: source of randomness; rng.randint seeds the layer's random
             stream and rng.normal draws the initial filters.
        activation: callable applied to the biased, pooled output; defaults
                    to relu_actfun when not given.
        drop_rate: probability of masking input elements (used if > 1e-4).
        input_noise: std of Gaussian noise added to the input (if > 1e-4).
        bias_noise: std of Gaussian noise added to the convolution output
                    before pooling (if > 1e-4).
        W, b: optional pre-existing filter / bias variables to use instead
              of creating new ones, e.g. for sharing parameters between
              layers. They are expected to be theano shared variables with
              the shapes implied by filt_def (dimshuffle is called on them).
        name: prefix for the names of newly created shared variables.
        W_scale: multiplier applied to the freshly initialized filters.
    """
    def __init__(self, rng, input=None, filt_def=None, pool_def=(2, 2),
                 activation=None, drop_rate=0., input_noise=0.,
                 bias_noise=0., W=None, b=None, name="", W_scale=1.0):

        # Setup a shared random generator for this layer
        self.rng = CURAND_RandomStreams(rng.randint(1000000))

        self.clean_input = input

        # Add gaussian noise to the input (if desired)
        if (input_noise > 1e-4):
            self.fuzzy_input = input + self.rng.normal(
                size=input.shape, avg=0.0, std=input_noise,
                dtype=theano.config.floatX)
        else:
            self.fuzzy_input = input

        # Apply masking noise to the input (if desired)
        if (drop_rate > 1e-4):
            self.noisy_input = self._drop_from_input(self.fuzzy_input,
                                                     drop_rate)
        else:
            self.noisy_input = self.fuzzy_input

        # Set the activation function for the conv filters
        self.activation = activation or relu_actfun

        # Use the provided filters, or initialize small random ones.
        if W is None:
            W_init = 0.01 * np.asarray(rng.normal(size=filt_def),
                                       dtype=theano.config.floatX)
            W = theano.shared(value=(W_scale * W_init),
                              name="{0:s}_W".format(name))
        self.W = W

        # the bias is a 1D tensor -- one bias per output feature map
        if b is None:
            b_init = np.zeros((filt_def[0],), dtype=theano.config.floatX) + 0.1
            b = theano.shared(value=b_init, name="{0:s}_b".format(name))
        self.b = b

        # convolve input feature maps with filters
        input_c01b = self.noisy_input.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        filters_c01b = self.W.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        conv_op = FilterActs(stride=1, partial_sum=1)
        contig_input = gpu_contiguous(input_c01b)
        contig_filters = gpu_contiguous(filters_c01b)
        conv_out_c01b = conv_op(contig_input, contig_filters)

        # optionally perturb the convolution output before pooling
        if (bias_noise > 1e-4):
            noisy_conv_out_c01b = conv_out_c01b + self.rng.normal(
                size=conv_out_c01b.shape, avg=0.0, std=bias_noise,
                dtype=theano.config.floatX)
        else:
            noisy_conv_out_c01b = conv_out_c01b

        # downsample each feature map individually, using maxpooling
        pool_op = MaxPool(ds=pool_def[0], stride=pool_def[1])
        mp_out_c01b = pool_op(noisy_conv_out_c01b)
        mp_out_bc01 = mp_out_c01b.dimshuffle(3, 0, 1, 2)  # c01b to bc01

        # add the bias term. Since the bias is a vector (1D array), we first
        # reshape it to a tensor of shape (1,n_filters,1,1). Each bias will
        # thus be broadcasted across mini-batches and feature map
        # width & height
        self.noisy_linear_output = (mp_out_bc01 +
                                    self.b.dimshuffle('x', 0, 'x', 'x'))
        self.linear_output = self.noisy_linear_output
        self.output = self.activation(self.noisy_linear_output)

        # store parameters of this layer
        self.params = [self.W, self.b]

    def _drop_from_input(self, input, p):
        """p is the probability of dropping elements of input."""
        # get a drop mask that drops things with probability p
        drop_rnd = self.rng.uniform(size=input.shape, low=0.0, high=1.0,
                                    dtype=theano.config.floatX)
        drop_mask = drop_rnd > p
        # get a scaling factor to keep expectations fixed after droppage
        drop_scale = 1. / (1. - p)
        # apply dropout mask and rescaling factor to the input
        droppy_input = drop_scale * input * drop_mask
        return droppy_input

    def _noisy_params(self, P, noise_lvl=0.):
        """Noisy weights, like convolving energy surface with a gaussian."""
        P_nz = P + self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl,
                                   dtype=theano.config.floatX)
        return P_nz
コード例 #18
0
class WalkoutModel(object):
    """
    Controller for training a forwards-backwards chainy model.

    Parameters:
        rng: numpy.random.RandomState (for reproducibility)
        x_out: the goal state for forwards-backwards walking process
        p_z_given_x: InfNet for stochastic part of step
        p_x_given_z: HydraNet for deterministic part of step
        params: REQUIRED PARAMS SHOWN BELOW
                x_dim: dimension of observations to construct
                z_dim: dimension of latent space for policy wobble
                walkout_steps: number of steps to walk out
                x_type: can be "bernoulli" or "gaussian"
                x_transform: can be 'none' or 'sigmoid'
    """
    def __init__(self, rng=None,
            x_out=None, \
            p_z_given_x=None, \
            p_x_given_z=None, \
            params=None, \
            shared_param_dicts=None):
        """
        Build the symbolic walk-out graph, the training costs, their
        gradients and ADAM updates, and compile the helper functions.

        rng: used only to seed this model's random stream, via
            rng.randint(100000). Despite the None default it is required,
            because randint is called unconditionally.
        x_out: symbolic variable used as the initial state of the walk.
        p_z_given_x, p_x_given_z: networks whose .apply(..., do_samples=False)
            is unpacked into a (mean, logvar) pair inside the scan loop.
        params: dict with keys 'x_dim', 'z_dim', 'walkout_steps', 'x_type'
            and, optionally, 'x_transform'.
        shared_param_dicts: optional dict providing 'obs_logvar'; when None
            a fresh shared variable is created and stored under that key.

        NOTE(review): this constructor reads several attributes that are not
        assigned anywhere in it (flagged inline below). Unless they are
        provided elsewhere, construction will raise AttributeError.
        """
        # setup a rng for this WalkoutModel
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_dim = self.params['x_dim']
        self.z_dim = self.params['z_dim']
        self.walkout_steps = self.params['walkout_steps']
        self.x_type = self.params['x_type']
        self.shared_param_dicts = shared_param_dicts
        # choose the transform applied to the means produced for x:
        # 'sigmoid' or identity when requested, sigmoid when unspecified
        if 'x_transform' in self.params:
            assert((self.params['x_transform'] == 'sigmoid') or \
                    (self.params['x_transform'] == 'none'))
            if self.params['x_transform'] == 'sigmoid':
                self.x_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.x_transform = lambda x: x
        else:
            self.x_transform = lambda x: T.nnet.sigmoid(x)
        # bernoulli observations always use the sigmoid transform, overriding
        # whatever params['x_transform'] selected above
        if self.x_type == 'bernoulli':
            self.x_transform = lambda x: T.nnet.sigmoid(x)
        assert ((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        # NOTE(review): self.step_type is not assigned anywhere above in this
        # constructor -- confirm where it is supposed to come from
        assert ((self.step_type == 'add') or (self.step_type == 'jump'))

        # grab handles to the relevant networks
        self.p_z_given_x = p_z_given_x
        self.p_x_given_z = p_x_given_z

        # record the symbolic variables that will provide inputs to the
        # computation graph created for this WalkoutModel
        self.x_out = x_out  # target output for generation
        self.zi_zmuv = T.tensor3()  # ZMUV gauss noise for walk-out wobble

        if self.shared_param_dicts is None:
            # initialize the parameters "owned" by this model
            zero_ary = to_fX(np.zeros((1, )))
            self.obs_logvar = theano.shared(value=zero_ary, name='obs_logvar')
            # tanh squashing keeps the effective log-variance within (-8, 8)
            self.bounded_logvar = 8.0 * T.tanh(
                (1.0 / 8.0) * self.obs_logvar[0])
            self.shared_param_dicts = {}
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            # grab the parameters required by this model from a given dict
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
            self.bounded_logvar = 8.0 * T.tanh(
                (1.0 / 8.0) * self.obs_logvar[0])

        ###############################################################
        # Setup the forwards (i.e. training) walk-out loop using scan #
        ###############################################################
        # Step function for theano.scan. Per scan's calling convention the
        # first two arguments are the current slices of the two noise
        # sequences and the last two are the previous values of the two
        # recurrent outputs (x state, z state).
        def forwards_loop(xi_zmuv, zi_zmuv, xi_fw, zi_fw):
            # get samples of next zi, according to the forwards model
            zi_fw_mean, zi_fw_logvar = self.p_z_given_x.apply(xi_fw, \
                                       do_samples=False)
            zi_fw = zi_fw_mean + (T.exp(0.5 * zi_fw_logvar) * zi_zmuv)

            # check reverse direction probability p(xi_fw | zi_fw)
            xi_bw_mean, xi_bw_logvar = self.p_x_given_z.apply(zi_fw, \
                                       do_samples=False)
            xi_bw_mean = self.x_transform(xi_bw_mean)
            # NOTE(review): the value of log_prob_gaussian2 is stored under
            # an "nll" name without negation, whereas _construct_nll_costs
            # negates the same helper's result -- confirm the intended sign
            nll_xi_bw = log_prob_gaussian2(xi_fw, xi_bw_mean, \
                        log_vars=xi_bw_logvar, mask=None)
            nll_xi_bw = nll_xi_bw.flatten()

            # get samples of next xi, according to the forwards model
            # NOTE(review): this is the same call, on the same zi_fw, as the
            # one that produced xi_bw_mean / xi_bw_logvar above
            xi_fw_mean, xi_fw_logvar = self.p_x_given_z.apply(zi_fw, \
                                       do_samples=False)
            xi_fw_mean = self.x_transform(xi_fw_mean)
            xi_fw = xi_fw_mean + (T.exp(0.5 * xi_fw_logvar) * xi_zmuv)

            # check reverse direction probability p(zi_fw | xi_fw)
            zi_bw_mean, zi_bw_logvar = self.p_z_given_x.apply(xi_fw, \
                                       do_samples=False)
            nll_zi_bw = log_prob_gaussian2(zi_fw, zi_bw_mean, \
                        log_vars=zi_bw_logvar, mask=None)
            nll_zi_bw = nll_zi_bw.flatten()

            # each loop iteration produces the following values:
            #   xi_fw: xi generated from zi by forwards walk
            #   zi_fw: zi generated from xi by forwards walk
            #   xi_fw_mean: ----
            #   xi_fw_logvar: ----
            #   zi_fw_mean: ----
            #   zi_fw_logvar: ----
            #   nll_xi_bw: NLL for reverse step zi_fw -> xi_fw
            #   nll_zi_bw: NLL for reverse step xi_fw -> zi_fw
            return xi_fw, zi_fw, xi_fw_mean, xi_fw_logvar, zi_fw_mean, zi_fw_logvar, nll_xi_bw, nll_zi_bw

        # initialize states for x/z
        self.x0 = self.x_out
        self.z0 = T.alloc(0.0, self.x0.shape[0], self.z_dim)
        # setup initial values to pass to scan op: only the first two outputs
        # (x and z states) are fed back into the next step
        outputs_init = [self.x0, self.z0, None, None, None, None, None, None]
        # NOTE(review): self.xi_zmuv is not assigned in this constructor
        # (only self.zi_zmuv is created above) -- confirm
        sequences_init = [self.xi_zmuv, self.zi_zmuv]
        # apply scan op for the forwards walk-out loop
        self.scan_results, self.scan_updates = theano.scan(forwards_loop, \
                    outputs_info=outputs_init, \
                    sequences=sequences_init)

        # grab results of the scan op. all values are computed for each step
        self.xi = self.scan_results[0]
        self.zi = self.scan_results[1]
        self.xi_fw_mean = self.scan_results[2]
        self.xi_fw_logvar = self.scan_results[3]
        self.zi_fw_mean = self.scan_results[4]
        self.zi_fw_logvar = self.scan_results[5]
        self.nll_xi_bw = self.scan_results[6]
        self.nll_zi_bw = self.scan_results[7]

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX(np.zeros((1, )))
        self.lr = theano.shared(value=zero_ary, name='srr_lr')
        # shared var momentum parameters for ADAM optimization
        self.mom_1 = theano.shared(value=zero_ary, name='srr_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='srr_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared vars for weighting prior kld against reconstruction
        self.lam_kld_p = theano.shared(value=zero_ary, name='srr_lam_kld_p')
        self.lam_kld_q = theano.shared(value=zero_ary, name='srr_lam_kld_q')
        self.lam_kld_g = theano.shared(value=zero_ary, name='srr_lam_kld_g')
        self.lam_kld_s = theano.shared(value=zero_ary, name='srr_lam_kld_s')
        self.set_lam_kld(lam_kld_p=0.0,
                         lam_kld_q=1.0,
                         lam_kld_g=0.0,
                         lam_kld_s=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='srr_lam_l2w')
        self.set_lam_l2w(1e-5)

        # grab all of the "optimizable" parameters from the base networks
        # NOTE(review): s0, step_scales, p_zi_given_xi, p_sip1_given_zi,
        # p_x_given_si and q_zi_given_xi are not assigned in this constructor
        # (the networks stored above are p_z_given_x / p_x_given_z) -- confirm
        self.joint_params = [self.s0, self.obs_logvar, self.step_scales]
        self.joint_params.extend(self.p_zi_given_xi.mlp_params)
        self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
        self.joint_params.extend(self.p_x_given_si.mlp_params)
        self.joint_params.extend(self.q_zi_given_xi.mlp_params)

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        # NOTE(review): _construct_kld_costs reads self.si, self.kldi_*,
        # and self.total_steps, none of which are assigned above -- confirm
        self.kld_p, self.kld_q, self.kld_g, self.kld_s = self._construct_kld_costs(
            p=1.0)
        # per-observation KL cost: lambda-weighted sum of the four KL terms
        self.kld_costs = (self.lam_kld_p[0] * self.kld_p) + \
                         (self.lam_kld_q[0] * self.kld_q) + \
                         (self.lam_kld_g[0] * self.kld_g) + \
                         (self.lam_kld_s[0] * self.kld_s)
        self.kld_cost = T.mean(self.kld_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        # NOTE(review): self.nlli is not assigned in this constructor; the
        # scan results stored above are nll_xi_bw / nll_zi_bw -- confirm
        self.nll_costs = T.sum(self.nlli, axis=0)  # sum the per-step NLLs
        self.nll_cost = T.mean(self.nll_costs)
        self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
        self.nll_bound = T.mean(self.nll_bounds)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-TRIAL COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the generator and inferencer networks
        self.joint_updates = get_adam_updates(params=self.joint_params, \
                grads=self.joint_grads, alpha=self.lr, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        # fold the updates returned by scan into the training updates
        for k, v in self.scan_updates.items():
            self.joint_updates[k] = v

        # Construct theano functions for training and diagnostic computations
        print("Compiling cost computer...")
        self.compute_raw_costs = self._construct_raw_costs()
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling sequence sampler...")
        self.sequence_sampler = self._construct_sequence_sampler()
        # make easy access points for some interesting parameters
        #self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W
        return

    def set_sgd_params(self, lr=0.01, mom_1=0.9, mom_2=0.999):
        """
        Update the shared optimizer settings used by all updates.

        lr is written to self.lr, and the first/second order "momentum"
        values mom_1 and mom_2 are written to self.mom_1 and self.mom_2.
        Each value is stored as a length-1 array passed through to_fX.
        """
        settings = (('lr', lr), ('mom_1', mom_1), ('mom_2', mom_2))
        for attr_name, setting in settings:
            as_array = np.zeros((1, )) + setting
            getattr(self, attr_name).set_value(to_fX(as_array))
        return

    def set_lam_kld(self,
                    lam_kld_p=0.0,
                    lam_kld_q=1.0,
                    lam_kld_g=0.0,
                    lam_kld_s=0.0):
        """
        Update the weights of the KL-divergence terms relative to the data
        likelihood.

        Each argument is written, as a length-1 array passed through to_fX,
        to the shared variable of the same name on this model.
        """
        weights = (('lam_kld_p', lam_kld_p), ('lam_kld_q', lam_kld_q),
                   ('lam_kld_g', lam_kld_g), ('lam_kld_s', lam_kld_s))
        for attr_name, weight in weights:
            as_array = np.zeros((1, )) + weight
            getattr(self, attr_name).set_value(to_fX(as_array))
        return

    def set_lam_l2w(self, lam_l2w=1e-3):
        """
        Update the strength of the l2 penalty on network params.

        The value is stored in self.lam_l2w as a length-1 array passed
        through to_fX.
        """
        self.lam_l2w.set_value(to_fX(np.zeros((1, )) + lam_l2w))
        return

    def set_train_switch(self, switch_val=0.0):
        """
        Flip the switch that selects between training and sampling behavior.

        switch_val is binarized before being stored in self.train_switch:
        anything below 0.5 becomes 0.0 and everything else becomes 1.0.
        """
        binary_val = 0.0 if (switch_val < 0.5) else 1.0
        as_array = np.zeros((1, )) + binary_val
        self.train_switch.set_value(to_fX(as_array))
        return

    def _construct_zi_zmuv(self, xo):
        """
        Build the symbolic zero-mean, unit-variance Gaussian noise used to
        generate trajectories for the input matrix xo.

        The noise has shape (self.total_steps, xo.shape[0], self.z_dim).
        """
        noise_shape = (self.total_steps, xo.shape[0], self.z_dim)
        return self.rng.normal(size=noise_shape, avg=0.0, std=1.0,
                               dtype=theano.config.floatX)

    def _construct_rev_masks(self, xo):
        """
        Build the sequential revelation masks for the input batch xo.

        Two mask sequences are produced, one for the primary policy (p) and
        one for the guide policy (q), and returned as [pmasks, qmasks].

        With self.use_rev_masks set, the stored self.rev_masks_p and
        self.rev_masks_q are copied across the batch. Otherwise fresh masks
        are sampled for each revelation block rb in self.rev_sched, where
        rb[0] is the number of steps in the block and rb[1] is the rate at
        which the random binary mask contains ones.
        """
        if self.use_rev_masks:
            # make batch copies of the stored masks along a new middle axis
            batch_count = xo.shape[0]
            pmasks = self.rev_masks_p.dimshuffle(0, 'x', 1).repeat(
                batch_count, axis=1)
            qmasks = self.rev_masks_q.dimshuffle(0, 'x', 1).repeat(
                batch_count, axis=1)
            return [pmasks, qmasks]

        obs_count, obs_dim = xo.shape[0], xo.shape[1]
        # a mask of zeros that does nothing
        zero_mask = T.alloc(0.0, 1, obs_count, obs_dim)
        p_steps = []
        q_steps = []
        # sample an independent mask for each revelation block
        for rb in self.rev_sched:
            rand_vals = self.rng.uniform(
                size=(1, obs_count, obs_dim), low=0.0, high=1.0,
                dtype=theano.config.floatX)
            rand_mask = rand_vals < rb[1]
            # Within a block, the guide policy (q) gets the block's mask on
            # every step, so it can peek at the values that will be revealed
            # and guide each step. The primary policy (p) gets the zero mask
            # on every step but the last, and only gets the block's mask on
            # the final step.
            #
            # E.g. a single block with one step and a reveal rate of 1.0
            # makes this equivalent to a standard variational auto-encoder.
            early_steps = rb[0] - 1
            p_steps.extend([zero_mask] * early_steps)
            q_steps.extend([rand_mask] * early_steps)
            p_steps.append(rand_mask)
            q_steps.append(rand_mask)
        # concatenate each list of per-step masks into a 3-tensor
        pmasks = T.cast(T.concatenate(p_steps, axis=0), 'floatX')
        qmasks = T.cast(T.concatenate(q_steps, axis=0), 'floatX')
        return [pmasks, qmasks]

    def _construct_nll_costs(self, si, xo, nll_mask):
        """
        Build the negative log-likelihood part of the free energy.

        si is mapped to a reconstruction with self._from_si_to_x, which is
        scored against xo under a Bernoulli model (x_type 'bernoulli') or a
        Gaussian model using self.bounded_logvar (any other x_type).
        nll_mask is forwarded to the log-probability helper so that NLL is
        only checked where nll_mask == 1. Returns the flattened, negated
        log-likelihoods.
        """
        xh = self._from_si_to_x(si)
        if self.x_type != 'bernoulli':
            log_lik = log_prob_gaussian2(xo, xh,
                                         log_vars=self.bounded_logvar,
                                         mask=nll_mask)
        else:
            log_lik = log_prob_bernoulli(xo, xh, mask=nll_mask)
        return -log_lik.flatten()

    def _construct_kld_s(self, s_i, s_j):
        """
        Compute KL(s_i || s_j), treating the outputs as bernoullish.

        Both states are first mapped through self._from_si_to_x; the
        elementwise Bernoulli KL terms are then summed over axis 1.
        """
        x_i = self._from_si_to_x(s_i)
        x_j = self._from_si_to_x(s_j)
        # contribution of the "on" outcome and of the "off" outcome
        on_term = x_i * (T.log(x_i) - T.log(x_j))
        off_term = (1.0 - x_i) * (T.log(1.0 - x_i) - T.log(1.0 - x_j))
        return T.sum(on_term + off_term, axis=1)

    def _construct_kld_costs(self, p=1.0):
        """
        Build the policy KL-divergence parts of the cost to minimize.

        For every step, the per-dimension KL values in self.kldi_p2q,
        self.kldi_q2p and self.kldi_p2g are raised to the power p and summed
        over axis 1. A fourth term measures the KL between each state in
        self.si and its predecessor (self.s0, broadcast to the shape of
        self.si[0], for the first step). The per-step values are summed over
        steps and returned as [kld_p, kld_q, kld_g, kld_s].
        """
        per_step_p = []
        per_step_q = []
        per_step_g = []
        per_step_s = []
        # predecessor of the first state: s0 broadcast to a batch-shaped value
        prev_state = 0.0 * self.si[0] + self.s0
        for step in range(self.total_steps):
            per_step_p.append(T.sum(self.kldi_p2q[step]**p, axis=1))
            per_step_q.append(T.sum(self.kldi_q2p[step]**p, axis=1))
            per_step_g.append(T.sum(self.kldi_p2g[step]**p, axis=1))
            per_step_s.append(
                self._construct_kld_s(self.si[step], prev_state))
            prev_state = self.si[step]
        # accumulate the per-step costs into batch-wise costs
        return [sum(per_step_p), sum(per_step_q),
                sum(per_step_g), sum(per_step_s)]

    def _construct_reg_costs(self):
        """
        Build the basic l2 regularization cost: the sum of squared entries
        over every parameter in self.joint_params.
        """
        squared_norms = [T.sum(param**2.0) for param in self.joint_params]
        return sum(squared_norms)

    def _construct_compute_fe_terms(self):
        """
        Construct a function for computing terms in variational free energy.

        Compiles a theano function producing a one-sample estimate of the
        flattened self.nll_costs and self.kld_q for an input matrix, and
        returns a wrapper fe_term_estimator(XO, sample_count=20,
        use_guide_policy=True) that averages sample_count such estimates and
        returns [mean_nll, mean_kld].

        The wrapper temporarily sets the train switch (1.0 for the guide
        policy, 0.0 for the primary policy). The previous switch value is
        restored even when sampling raises, so a failed call cannot leave
        the model in the wrong mode.
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # construct values to output
        nll = self.nll_costs.flatten()
        kld = self.kld_q.flatten()
        # compile theano function for a one-sample free-energy estimate
        fe_term_sample = theano.function(
            inputs=[xo],
            outputs=[nll, kld],
            givens={self.x_out: xo,
                    self.zi_zmuv: zizmuv,
                    self.p_masks: pmasks,
                    self.q_masks: qmasks},
            updates=self.scan_updates,
            on_unused_input='ignore')

        # construct a wrapper function for multi-sample free-energy estimate
        def fe_term_estimator(XO, sample_count=20, use_guide_policy=True):
            # remember the current mode so it can be restored afterwards
            old_switch = self.train_switch.get_value(borrow=False)
            nll_sum = np.zeros((XO.shape[0], ))
            kld_sum = np.zeros((XO.shape[0], ))
            try:
                # sample from the guide policy (1.0) or primary policy (0.0)
                self.set_train_switch(
                    switch_val=(1.0 if use_guide_policy else 0.0))
                # compute a multi-sample estimate of variational free-energy
                for _ in range(sample_count):
                    result = fe_term_sample(XO)
                    nll_sum += result[0].ravel()
                    kld_sum += result[1].ravel()
            finally:
                # set model back to either training or generation mode
                self.set_train_switch(switch_val=old_switch)
            mean_nll = nll_sum / float(sample_count)
            mean_kld = kld_sum / float(sample_count)
            if not use_guide_policy:
                # no KLd if samples are from the primary policy...
                mean_kld = 0.0 * mean_kld
            return [mean_nll, mean_kld]

        return fe_term_estimator

    def _construct_raw_costs(self):
        """
        Construct a function computing all the raw costs, i.e. the costs
        before weighting by any lambdas.

        Returns raw_cost_computer(XO), which evaluates the per-step NLL and
        KL values for the batch XO and reduces them to
        [step_nlls, step_klds, kld_q2p, kld_p2q, kld_p2g].
        """
        # symbolic input plus the noise and masks derived from it
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        replacements = {self.x_out: xo,
                        self.zi_zmuv: zizmuv,
                        self.p_masks: pmasks,
                        self.q_masks: qmasks}
        # compile theano function for computing the costs
        cost_func = theano.function(
            inputs=[xo],
            outputs=[self.nlli, self.kldi_q2p, self.kldi_p2q,
                     self.kldi_p2g],
            givens=replacements,
            updates=self.scan_updates,
            on_unused_input='ignore')

        def _per_latent_dim(kld):
            # average over axis 1, then total over axis 0
            return np.sum(np.mean(kld, axis=1, keepdims=True), axis=0)

        # Batch-based estimates of the costs:
        #   step_nlls: the expected NLL cost for each step
        #   step_klds: the expected KL(q||p) cost for each step
        #   kld_q2p: the expected KL(q||p) cost for each latent dim
        #   kld_p2q: the expected KL(p||q) cost for each latent dim
        #   kld_p2g: the expected KL(p||N(0,I)) cost for each latent dim
        def raw_cost_computer(XO):
            nlli, q2p, p2q, p2g = cost_func(to_fX(XO))
            kld_q2p = _per_latent_dim(q2p)
            kld_p2q = _per_latent_dim(p2q)
            kld_p2g = _per_latent_dim(p2g)
            step_klds = np.mean(np.sum(q2p, axis=2, keepdims=True), axis=1)
            step_klds = to_fX(np.asarray(list(step_klds)))
            step_nlls = np.mean(nlli, axis=1)
            step_nlls = to_fX(np.asarray(list(step_nlls)))
            return [step_nlls, step_klds, kld_q2p, kld_p2q, kld_p2g]

        return raw_cost_computer

    def _construct_train_joint(self):
        """
        Compile the theano function that jointly trains all networks.

        The compiled function takes one matrix of observations, applies
        self.joint_updates, and returns [joint_cost, nll_bound, nll_cost,
        kld_cost, reg_cost, obs_costs].
        """
        # symbolic stand-ins for the observations, noise, and reveal masks
        obs = T.matrix()
        noise = self._construct_zi_zmuv(obs)
        p_masks, q_masks = self._construct_rev_masks(obs)
        # values reported back to the caller after each training step
        reported = [
            self.joint_cost, self.nll_bound, self.nll_cost,
            self.kld_cost, self.reg_cost, self.obs_costs,
        ]
        substitutions = {
            self.x_out: obs,
            self.zi_zmuv: noise,
            self.p_masks: p_masks,
            self.q_masks: q_masks,
        }
        return theano.function(
            inputs=[obs],
            outputs=reported,
            givens=substitutions,
            updates=self.joint_updates,
            on_unused_input='ignore')

    def _construct_sequence_sampler(self):
        """
        Construct a function for sampling state/mask trajectories.

        Returns a callable sample_func(XO, use_guide_policy=False) that runs
        the model on the batch XO and returns [xm_seq, xi_seq, mi_seq], each
        a floatX array of shape (self.total_steps + 1, XO.shape[0],
        XO.shape[1]):
            xi_seq: the per-step outputs of self._from_si_to_x
            mi_seq: the per-step masks (self.m0_full, then self.mi_p[i])
            xm_seq: mi * XO + (1 - mi) * xi, i.e. XO where the mask is 1
                    and the model output elsewhere

        With use_guide_policy=True the train switch is set to 1.0 while
        sampling, otherwise to 0.0; the previous switch value is restored
        afterwards, even if the compiled function raises.
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # collect the outputs to return from this function
        states = [self._from_si_to_x(self.s0_full)] + \
                 [self._from_si_to_x(self.si[i]) for i in range(self.total_steps)]
        masks = [self.m0_full] + \
                [self.mi_p[i] for i in range(self.total_steps)]
        outputs = states + masks
        # compile the theano function. Only self.scan_updates are applied:
        # self.joint_updates is what the training function applies, and
        # drawing samples must not perform a training step as a side effect.
        func = theano.function(inputs=[ xo ], \
                outputs=outputs, \
                givens={self.x_out: xo, \
                        self.zi_zmuv: zizmuv, \
                        self.p_masks: pmasks, \
                        self.q_masks: qmasks}, \
                updates=self.scan_updates, \
                on_unused_input='ignore')

        # visualize trajectories generated by the model
        def sample_func(XO, use_guide_policy=False):
            # set model to desired generation mode
            old_switch = self.train_switch.get_value(borrow=False)
            if use_guide_policy:
                # take samples from the guide policy
                self.set_train_switch(switch_val=1.0)
            else:
                # take samples from the primary policy
                self.set_train_switch(switch_val=0.0)
            try:
                # get belief states and masks generated by the scan loop
                scan_vals = func(to_fX(XO))
            finally:
                # set model back to either training or generation mode
                self.set_train_switch(switch_val=old_switch)
            step_count = self.total_steps + 1
            seq_shape = (step_count, XO.shape[0], XO.shape[1])
            xm_seq = np.zeros(seq_shape).astype(theano.config.floatX)
            xi_seq = np.zeros(seq_shape).astype(theano.config.floatX)
            mi_seq = np.zeros(seq_shape).astype(theano.config.floatX)
            # scan_vals holds step_count states followed by step_count masks
            for i in range(step_count):
                _xi = scan_vals[i]
                _mi = scan_vals[i + step_count]
                _xm = (_mi * XO) + ((1.0 - _mi) * _xi)
                xm_seq[i, :, :] = _xm
                xi_seq[i, :, :] = _xi
                mi_seq[i, :, :] = _mi
            return [xm_seq, xi_seq, mi_seq]

        return sample_func

    def save_to_file(self, f_name=None):
        """
        Dump important stuff to a Python pickle, so that we can reload this
        model later.

        Three objects are pickled into the file f_name, in this order:
            1. self.params (the dict of "simple" python values)
            2. a dict mapping each key of self.shared_param_dicts to the
               value returned by that shared variable's get_value()
            3. a dict holding the save_to_dict() result of each child model
               ('p_zi_given_xi', 'p_sip1_given_zi', 'p_x_given_si',
               'q_zi_given_xi')

        f_name must not be None.
        """
        assert (not (f_name is None))
        # make a copy of self.shared_param_dicts, with numpy arrays in place
        # of the theano shared variables
        numpy_param_dicts = {
            key: shared_var.get_value(borrow=False)
            for key, shared_var in self.shared_param_dicts.items()
        }
        # get numpy dicts for each of the "child" models that we must save
        child_model_dicts = {
            'p_zi_given_xi': self.p_zi_given_xi.save_to_dict(),
            'p_sip1_given_zi': self.p_sip1_given_zi.save_to_dict(),
            'p_x_given_si': self.p_x_given_si.save_to_dict(),
            'q_zi_given_xi': self.q_zi_given_xi.save_to_dict(),
        }
        # open() replaces the Python 2-only file() builtin, and the with
        # block closes the handle even if one of the dumps raises
        with open(f_name, 'wb') as f_handle:
            cPickle.dump(self.params, f_handle, protocol=-1)
            cPickle.dump(numpy_param_dicts, f_handle, protocol=-1)
            cPickle.dump(child_model_dicts, f_handle, protocol=-1)
        return
コード例 #19
0
class MultiStageModel(object):
    """
    Controller for training a multi-step iterative refinement model.

    Parameters:
        rng: numpy.random.RandomState (for reproducibility)
        x_in: the input data to encode
        x_out: the target output to decode
        p_s0_given_z: InfNet for initializing "canvas" state
        p_hi_given_si: InfNet for hi given si
        p_sip1_given_si_hi: HydraNet for sip1 given si and hi
        q_z_given_x: InfNet for z given x
        q_hi_given_x_si: InfNet for hi given x and si
        obs_dim: dimension of the observations to generate
        z_dim: dimension of the "initial" latent space
        h_dim: dimension of the "primary" latent space
        ir_steps: number of "iterative refinement" steps to perform
        params: REQUIRED PARAMS SHOWN BELOW
                x_type: can be "bernoulli" or "gaussian"
                obs_transform: can be 'none' or 'sigmoid'
    """
    def __init__(self, rng=None, \
            x_in=None, x_out=None, \
            p_s0_given_z=None, \
            p_hi_given_si=None, \
            p_sip1_given_si_hi=None, \
            q_z_given_x=None, \
            q_hi_given_x_si=None, \
            obs_dim=None, \
            z_dim=None, h_dim=None, \
            ir_steps=4, params=None, \
            shared_param_dicts=None):
        """
        Build the full symbolic graph for this model and compile its
        helper functions.

        In order, the constructor: stores the user-provided settings and
        networks; creates the shared control variables (train switch, drop
        rate, cost weights, learning rates, momentums); builds the initial
        step (z from q_z_given_x, s0 from p_s0_given_z); builds the
        refinement loop with theano.scan over self.hi_zmuv; assembles the
        KLd, NLL and l2 costs into self.joint_cost; takes gradients of the
        joint cost and wraps them in Adam updates; and finally compiles the
        cost/training/sampling functions stored on the instance.

        Despite the None defaults, rng and params are required: rng.randint
        is called unconditionally and params['x_type'] is read
        unconditionally. If shared_param_dicts is given, p_z_mean,
        p_z_logvar and obs_logvar are taken from it instead of being
        freshly initialized.
        """
        # setup a rng for this MultiStageModel, seeded from the provided rng
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_type = self.params['x_type']
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        # choose the state -> observation transform; the default (when
        # 'obs_transform' is not given) is the bounded sigmoid
        if 'obs_transform' in self.params:
            assert((self.params['obs_transform'] == 'sigmoid') or \
                    (self.params['obs_transform'] == 'none'))
            if self.params['obs_transform'] == 'sigmoid':
                self.obs_transform = lambda x: T.nnet.sigmoid(20.0 * T.tanh(0.05 * x))
            else:
                self.obs_transform = lambda x: x
        else:
            self.obs_transform = lambda x: T.nnet.sigmoid(20.0 * T.tanh(0.05 * x))
        # bernoulli observations always use the sigmoid transform, which
        # overrides any 'obs_transform' setting chosen above
        if self.x_type == 'bernoulli':
            self.obs_transform = lambda x: T.nnet.sigmoid(20.0 * T.tanh(0.05 * x))
        self.shared_param_dicts = shared_param_dicts

        # record the dimensions of various spaces relevant to this model
        self.obs_dim = obs_dim
        self.z_dim = z_dim
        self.h_dim = h_dim
        self.ir_steps = ir_steps

        # grab handles to the relevant InfNets
        self.q_z_given_x = q_z_given_x
        self.q_hi_given_x_si = q_hi_given_x_si
        self.p_s0_given_z = p_s0_given_z
        self.p_hi_given_si = p_hi_given_si
        self.p_sip1_given_si_hi = p_sip1_given_si_hi

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this MultiStageModel
        self.x_in = x_in
        self.x_out = x_out
        self.hi_zmuv = T.tensor3() # for ZMUV Gaussian samples to use in scan

        # setup switching variable for changing between sampling/training
        zero_ary = to_fX( np.zeros((1,)) )
        self.train_switch = theano.shared(value=zero_ary, name='msm_train_switch')
        self.set_train_switch(1.0)
        # setup a variable for controlling dropout noise
        self.drop_rate = theano.shared(value=zero_ary, name='msm_drop_rate')
        self.set_drop_rate(0.0)
        # this weight balances l1 vs. l2 penalty on posterior KLds
        self.lam_kld_l1l2 = theano.shared(value=zero_ary, name='msm_lam_kld_l1l2')
        self.set_lam_kld_l1l2(1.0)

        if self.shared_param_dicts is None:
            # initialize "optimizable" parameters specific to this MSM
            init_vec = to_fX( np.zeros((self.z_dim,)) )
            self.p_z_mean = theano.shared(value=init_vec, name='msm_p_z_mean')
            self.p_z_logvar = theano.shared(value=init_vec, name='msm_p_z_logvar')
            # NOTE(review): this obs_dim-sized init_vec is never used; the
            # obs_logvar below is initialized from the length-1 zero_ary
            init_vec = to_fX( np.zeros((self.obs_dim,)) )
            self.obs_logvar = theano.shared(value=zero_ary, name='msm_obs_logvar')
            # tanh squashing keeps the effective log-variance in (-8, 8)
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)
            self.shared_param_dicts = {}
            self.shared_param_dicts['p_z_mean'] = self.p_z_mean
            self.shared_param_dicts['p_z_logvar'] = self.p_z_logvar
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            # reuse the shared variables that were handed in
            self.p_z_mean = self.shared_param_dicts['p_z_mean']
            self.p_z_logvar = self.shared_param_dicts['p_z_logvar']
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)

        # setup a function for computing reconstruction log likelihood
        # (note: both variants return the NEGATED log-probability)
        if self.x_type == 'bernoulli':
            self.log_prob_func = lambda xo, xh: \
                    (-1.0 * log_prob_bernoulli(xo, xh))
        else:
            self.log_prob_func = lambda xo, xh: \
                    (-1.0 * log_prob_gaussian2(xo, xh, \
                     log_vars=self.bounded_logvar))

        # get a drop mask that drops things with probability p
        # (kept entries are rescaled by 1 / (1 - drop_rate))
        drop_scale = 1. / (1. - self.drop_rate[0])
        drop_rnd = self.rng.uniform(size=self.x_out.shape, \
                low=0.0, high=1.0, dtype=theano.config.floatX)
        drop_mask = drop_scale * (drop_rnd > self.drop_rate[0])

        #############################
        # Setup self.z and self.s0. #
        #############################
        print("Building MSM step 0...")
        drop_x = drop_mask * self.x_in
        self.q_z_mean, self.q_z_logvar, self.z = \
                self.q_z_given_x.apply(drop_x, do_samples=True)
        # get initial observation state
        self.s0, _ = self.p_s0_given_z.apply(self.z, do_samples=False)

        # gather KLd and NLL for the initialization step
        self.init_klds = gaussian_kld(self.q_z_mean, self.q_z_logvar, \
                                      self.p_z_mean, self.p_z_logvar)
        # NOTE(review): log_prob_func already negates the log-probability,
        # so the extra -1.0 here makes init_nlls the (positive) log-prob,
        # unlike nlli in the scan step below -- confirm this is intended
        self.init_nlls =  -1.0 * \
                self.log_prob_func(self.x_out, self.obs_transform(self.s0))

        ##################################################
        # Setup the iterative generation loop using scan #
        ##################################################
        def ir_step_func(hi_zmuv, sim1):
            # One refinement step: hi_zmuv is this step's slice of
            # self.hi_zmuv and sim1 is the state from the previous step.
            # get variables used throughout this refinement step
            sim1_obs = self.obs_transform(sim1) # transform state -> obs
            grad_ll = self.x_out - sim1_obs

            # get samples of next hi, conditioned on current si
            hi_p_mean, hi_p_logvar = self.p_hi_given_si.apply( \
                    sim1_obs, do_samples=False)
            # now we build the model for variational hi given si
            hi_q_mean, hi_q_logvar = self.q_hi_given_x_si.apply( \
                    T.horizontal_stack(grad_ll, sim1_obs), \
                    do_samples=False)
            # reparametrized samples; both use the same noise hi_zmuv
            hi_q = (T.exp(0.5 * hi_q_logvar) * hi_zmuv) + hi_q_mean
            hi_p = (T.exp(0.5 * hi_p_logvar) * hi_zmuv) + hi_p_mean

            # make hi samples that can be switched between hi_p and hi_q
            # (train_switch == 1 selects hi_q, train_switch == 0 selects hi_p)
            hi = ( ((self.train_switch[0] * hi_q) + \
                    ((1.0 - self.train_switch[0]) * hi_p)) )

            # p_sip1_given_si_hi is conditioned on si and  hi.
            # NOTE(review): only hi is actually passed to apply() here
            ig_vals, fg_vals, in_vals = self.p_sip1_given_si_hi.apply(hi)
                    
            # get the transformed values (for an LSTM style update)
            i_gate = 1.0 * T.nnet.sigmoid(ig_vals + 2.0)
            f_gate = 1.0 * T.nnet.sigmoid(fg_vals + 2.0)
            # perform an LSTM-like update of the state sim1 -> si
            si = (in_vals * i_gate) + (sim1 * f_gate)

            # compute generator NLL for this step
            nlli = self.log_prob_func(self.x_out, self.obs_transform(si))
            # compute relevant KLds for this step
            kldi_q2p = gaussian_kld(hi_q_mean, hi_q_logvar, \
                                    hi_p_mean, hi_p_logvar)
            kldi_p2q = gaussian_kld(hi_p_mean, hi_p_logvar, \
                                    hi_q_mean, hi_q_logvar)
            return si, nlli, kldi_q2p, kldi_p2q

        # only the state si is fed back into the next step; the three cost
        # outputs are collected but not recurrent
        init_values = [self.s0, None, None, None]

        self.scan_results, self.scan_updates = theano.scan(ir_step_func, \
                outputs_info=init_values, sequences=self.hi_zmuv)

        self.si = self.scan_results[0]
        self.nlli = self.scan_results[1]
        self.kldi_q2p = self.scan_results[2]
        self.kldi_p2q = self.scan_results[3]

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX( np.zeros((1,)) )
        self.lr_1 = theano.shared(value=zero_ary, name='msm_lr_1')
        self.lr_2 = theano.shared(value=zero_ary, name='msm_lr_2')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='msm_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='msm_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='msm_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_z = theano.shared(value=zero_ary, name='msm_lam_kld_z')
        self.lam_kld_q2p = theano.shared(value=zero_ary, name='msm_lam_kld_q2p')
        self.lam_kld_p2q = theano.shared(value=zero_ary, name='msm_lam_kld_p2q')
        self.set_lam_kld(lam_kld_z=1.0, lam_kld_q2p=0.7, lam_kld_p2q=0.3)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # Grab all of the "optimizable" parameters in "group 1"
        self.q_params = []
        self.q_params.extend(self.q_z_given_x.mlp_params)
        self.q_params.extend(self.q_hi_given_x_si.mlp_params)
        # Grab all of the "optimizable" parameters in "group 2"
        # (obs_logvar is not part of either group)
        self.p_params = [self.p_z_mean, self.p_z_logvar]
        self.p_params.extend(self.p_hi_given_si.mlp_params)
        self.p_params.extend(self.p_sip1_given_si_hi.mlp_params)
        self.p_params.extend(self.p_s0_given_z.mlp_params)

        # Make a joint list of parameters group 1/2
        self.joint_params = self.q_params + self.p_params

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        # "l1" KLd costs: per-dimension KLds raised to the power 1
        self.kld_z_q2p, self.kld_z_p2q, self.kld_hi_q2p, self.kld_hi_p2q = \
                self._construct_kld_costs(p=1.0)
        self.kld_z = (self.lam_kld_q2p[0] * self.kld_z_q2p) + \
                     (self.lam_kld_p2q[0] * self.kld_z_p2q)
        self.kld_hi = (self.lam_kld_q2p[0] * self.kld_hi_q2p) + \
                      (self.lam_kld_p2q[0] * self.kld_hi_p2q)
        self.kld_costs = (self.lam_kld_z[0] * self.kld_z) + self.kld_hi
        # now do l2 KLd costs
        self.kl2_z_q2p, self.kl2_z_p2q, self.kl2_hi_q2p, self.kl2_hi_p2q = \
                self._construct_kld_costs(p=2.0)
        self.kl2_z = (self.lam_kld_q2p[0] * self.kl2_z_q2p) + \
                     (self.lam_kld_p2q[0] * self.kl2_z_p2q)
        self.kl2_hi = (self.lam_kld_q2p[0] * self.kl2_hi_q2p) + \
                      (self.lam_kld_p2q[0] * self.kl2_hi_p2q)
        self.kl2_costs = (self.lam_kld_z[0] * self.kl2_z) + self.kl2_hi
        # compute joint l1/l2 KLd cost
        self.kld_l1l2_costs = (self.lam_kld_l1l2[0] * self.kld_costs) + \
                ((1.0 - self.lam_kld_l1l2[0]) * self.kl2_costs)
        # compute "mean" (rather than per-input) costs
        self.kld_cost = T.mean(self.kld_costs)
        self.kl2_cost = T.mean(self.kl2_costs)
        self.kld_l1l2_cost = T.mean(self.kld_l1l2_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        # only the NLL after the final refinement step enters the objective
        self.nll_costs = self.nlli[-1]
        self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_l1l2_cost + \
                          self.reg_cost
        ##############################
        # CONSTRUCT A PER-INPUT COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_l1l2_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the generator and inferencer networks
        # (q params use learning rate lr_1, p params use lr_2; both share
        # the same momentum settings)
        self.q_updates = get_adam_updates(params=self.q_params, \
                grads=self.joint_grads, alpha=self.lr_1, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        self.p_updates = get_adam_updates(params=self.p_params, \
                grads=self.joint_grads, alpha=self.lr_2, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        # merge q, p and scan updates into one ordered dict
        self.joint_updates = OrderedDict()
        for k in self.q_updates:
            self.joint_updates[k] = self.q_updates[k]
        for k in self.p_updates:
            self.joint_updates[k] = self.p_updates[k]
        # add scan updates, which seem to be required
        for k in self.scan_updates:
            self.joint_updates[k] = self.scan_updates[k]

        # Construct a function for jointly training the generator/inferencer
        print("Compiling cost computer...")
        self.compute_raw_klds = self._construct_raw_klds()
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling open-loop model sampler...")
        self.sample_from_prior = self._construct_sample_from_prior()
        print("Compiling data-guided model sampler...")
        self.sample_from_input = self._construct_sample_from_input()
        return

    def set_sgd_params(self, lr_1=0.01, lr_2=0.01,
                       mom_1=0.9, mom_2=0.999):
        """
        Store new learning rates and momentum values in the shared variables
        self.lr_1, self.lr_2, self.mom_1 and self.mom_2 (in that order).
        """
        base = np.zeros((1,))
        settings = (
            ('lr_1', lr_1),
            ('lr_2', lr_2),
            ('mom_1', mom_1),
            ('mom_2', mom_2),
        )
        # each value is written as a length-1 array cast by to_fX
        for attr_name, value in settings:
            getattr(self, attr_name).set_value(to_fX(base + value))
        return

    def set_lam_nll(self, lam_nll=1.0):
        """
        Store lam_nll, the weight on the data likelihood term, in the shared
        variable self.lam_nll.
        """
        # written as a length-1 array cast by to_fX
        self.lam_nll.set_value(to_fX(np.zeros((1,)) + lam_nll))
        return

    def set_lam_kld(self, lam_kld_z=1.0, lam_kld_q2p=1.0, lam_kld_p2q=1.0):
        """
        Store the relative weights of the KL-divergence terms in the shared
        variables self.lam_kld_z, self.lam_kld_q2p and self.lam_kld_p2q.
        """
        base = np.zeros((1,))
        weights = (
            ('lam_kld_z', lam_kld_z),
            ('lam_kld_q2p', lam_kld_q2p),
            ('lam_kld_p2q', lam_kld_p2q),
        )
        # each weight is written as a length-1 array cast by to_fX
        for attr_name, weight in weights:
            getattr(self, attr_name).set_value(to_fX(base + weight))
        return

    def set_lam_l2w(self, lam_l2w=1e-3):
        """
        Store lam_l2w, the strength of l2 regularization on the network
        parameters, in the shared variable self.lam_l2w.
        """
        # written as a length-1 array cast by to_fX
        self.lam_l2w.set_value(to_fX(np.zeros((1,)) + lam_l2w))
        return

    def set_train_switch(self, switch_val=0.0):
        """
        Flip the switch between training and sampling behavior.

        The requested value is snapped to 0.0 when it is below 0.5 and to
        1.0 otherwise, then stored in the shared variable self.train_switch.
        """
        snapped = 0.0 if (switch_val < 0.5) else 1.0
        # written as a length-1 array cast by to_fX
        self.train_switch.set_value(to_fX(np.zeros((1,)) + snapped))
        return

    def set_lam_kld_l1l2(self, lam_kld_l1l2=1.0):
        """
        Store lam_kld_l1l2 in the shared variable self.lam_kld_l1l2.

        In the joint cost this value weights the p=1 KLd costs, and
        (1 - lam_kld_l1l2) weights the p=2 KLd costs.
        """
        # written as a length-1 array cast by to_fX
        self.lam_kld_l1l2.set_value(to_fX(np.zeros((1,)) + lam_kld_l1l2))
        return

    def set_drop_rate(self, drop_rate=0.0):
        """
        Store drop_rate, the probability used when building the dropout mask
        applied to the encoder input, in the shared variable self.drop_rate.
        """
        # written as a length-1 array cast by to_fX
        self.drop_rate.set_value(to_fX(np.zeros((1,)) + drop_rate))
        return

    def _construct_zmuv_samples(self, xi, br):
        """
        Build the symbolic zero-mean, unit-variance Gaussian samples needed
        to run this MultiStageModel on the symbolic input matrix xi.

        Returns (z_zmuv, hi_zmuv), sized (xi.shape[0]*br, z_dim) and
        (ir_steps, xi.shape[0]*br, h_dim) respectively.
        """
        row_count = xi.shape[0]*br
        gauss_kwargs = dict(avg=0.0, std=1.0, dtype=theano.config.floatX)
        # draw for z first, then for the refinement steps
        z_zmuv = self.rng.normal(
            size=(row_count, self.z_dim), **gauss_kwargs)
        hi_zmuv = self.rng.normal(
            size=(self.ir_steps, row_count, self.h_dim), **gauss_kwargs)
        return z_zmuv, hi_zmuv

    def _construct_nll_costs(self, si, xo):
        """
        Construct the negative log-likelihood of xo under the observation
        obtained by applying self.obs_transform to the state si.

        Uses log_prob_bernoulli when self.x_type is 'bernoulli', and
        log_prob_gaussian2 with self.bounded_logvar otherwise.
        """
        x_hat = self.obs_transform(si)
        if self.x_type != 'bernoulli':
            log_lik = log_prob_gaussian2(xo, x_hat,
                                         log_vars=self.bounded_logvar)
        else:
            log_lik = log_prob_bernoulli(xo, x_hat)
        return -log_lik

    def _construct_kld_costs(self, p=1.0):
        """
        Construct the KL-divergence parts of the cost to minimize.

        Each KLd is raised to the power p and summed over axis 1 (the axis
        is kept). Returns [kld_z_q2p, kld_z_p2q, kld_hi_q2p, kld_hi_p2q],
        where the "hi" entries are additionally summed over the ir_steps
        refinement steps.
        """
        def _row_total(kld):
            return T.sum(kld**p, axis=1, keepdims=True)

        # step-wise KLds between the distributions over hi
        q2p_terms = []
        p2q_terms = []
        for step in range(self.ir_steps):
            q2p_terms.append(_row_total(self.kldi_q2p[step]))
            p2q_terms.append(_row_total(self.kldi_p2q[step]))
        kld_hi_q2p = sum(q2p_terms)
        kld_hi_p2q = sum(p2q_terms)
        # KLds between the distributions over z, in both directions
        kld_z_q2p = _row_total(
            gaussian_kld(self.q_z_mean, self.q_z_logvar,
                         self.p_z_mean, self.p_z_logvar))
        kld_z_p2q = _row_total(
            gaussian_kld(self.p_z_mean, self.p_z_logvar,
                         self.q_z_mean, self.q_z_logvar))
        return [kld_z_q2p, kld_z_p2q, kld_hi_q2p, kld_hi_p2q]

    def _construct_reg_costs(self):
        """
        Construct the basic l2 regularization cost: the sum of squared
        entries over every parameter in self.joint_params.
        """
        total = 0
        for param in self.joint_params:
            total = total + T.sum(param**2.0)
        return total

    def _construct_train_joint(self):
        """
        Compile the theano function that jointly trains all networks.

        The compiled function takes (xi, xo, br): an input matrix, a target
        matrix, and an integer repeat count. Each row of xi and xo is
        repeated br times before being fed to the model. It applies
        self.joint_updates and returns [joint_cost, nll_cost, kld_cost,
        reg_cost, obs_costs].
        """
        inputs_sym = T.matrix()
        targets_sym = T.matrix()
        repeats_sym = T.lscalar()
        reported = [
            self.joint_cost, self.nll_cost, self.kld_cost,
            self.reg_cost, self.obs_costs,
        ]
        # fresh Gaussian noise for the refinement steps, sized for the
        # repeated batch
        _, step_noise = self._construct_zmuv_samples(inputs_sym, repeats_sym)
        substitutions = {
            self.x_in: inputs_sym.repeat(repeats_sym, axis=0),
            self.x_out: targets_sym.repeat(repeats_sym, axis=0),
            self.hi_zmuv: step_noise,
        }
        return theano.function(
            inputs=[inputs_sym, targets_sym, repeats_sym],
            outputs=reported,
            givens=substitutions,
            updates=self.joint_updates)

    def _construct_raw_klds(self):
        """
        Construct a function for computing raw KLd values.

        The returned callable raw_kld_computer(XI, XO) draws fresh Gaussian
        noise for the refinement steps and returns [init_klds, kld_q2p,
        kld_p2q]: the initial-step KLds as computed, and the step-wise
        KLds averaged over axis 1 and then summed over axis 0.
        """
        # the initial-step KLds come first, followed by the step-wise KLds
        kld_fn = theano.function(
            inputs=[self.x_in, self.x_out, self.hi_zmuv],
            outputs=[self.init_klds, self.kldi_q2p, self.kldi_p2q],
            updates=self.scan_updates)

        def _dimwise_total(kld):
            return np.sum(np.mean(kld, axis=1, keepdims=True), axis=0)

        def raw_kld_computer(XI, XO):
            noise = to_fX(npr.randn(self.ir_steps, XI.shape[0], self.h_dim))
            init_klds, q2p, p2q = kld_fn(XI, XO, noise)
            return [init_klds, _dimwise_total(q2p), _dimwise_total(p2q)]

        return raw_kld_computer

    def _construct_compute_fe_terms(self):
        """
        Construct a function for estimating terms of the variational free
        energy.

        The returned callable fe_term_estimator(XI, XO, sample_count) runs a
        one-sample estimate sample_count times and returns [mean_nll,
        mean_kld], two vectors with one entry per row of XI.
        """
        inputs_sym = T.matrix()
        targets_sym = T.matrix()
        _, step_noise = self._construct_zmuv_samples(inputs_sym, 1)
        # NLL after the final refinement step, plus the KLd terms
        final_nll = self.nlli[-1]
        total_kld = self.kld_z.flatten() + self.kld_hi_q2p.flatten()
        # one-sample estimate of both terms
        one_sample = theano.function(
            inputs=[inputs_sym, targets_sym],
            outputs=[final_nll, total_kld],
            givens={self.x_in: inputs_sym,
                    self.x_out: targets_sym,
                    self.hi_zmuv: step_noise},
            updates=self.scan_updates)

        def fe_term_estimator(XI, XO, sample_count):
            # accumulate the per-row terms over repeated samples
            row_count = XI.shape[0]
            nll_total = np.zeros((row_count,))
            kld_total = np.zeros((row_count,))
            for _ in range(sample_count):
                nll_part, kld_part = one_sample(XI, XO)
                nll_total += nll_part.ravel()
                kld_total += kld_part.ravel()
            divisor = float(sample_count)
            return [nll_total / divisor, kld_total / divisor]

        return fe_term_estimator

    def _construct_sample_from_prior(self):
        """
        Construct a function for drawing independent samples from the
        distribution generated by this MultiStageModel. This function returns
        the full sequence of "partially completed" examples.

        The returned callable prior_sampler(samp_count) draws samp_count
        standard-normal z vectors, substitutes them for self.z, feeds
        all-zero matrices for x_in and x_out, and returns a list with the
        transformed initial state followed by the transformed state after
        each of the ir_steps refinement steps. The train switch is set to
        0.0 while sampling and restored afterwards, even if the compiled
        function raises.
        """
        z_sym = T.matrix()
        x_sym = T.matrix()
        irs = self.ir_steps
        oputs = [self.obs_transform(self.s0)]
        oputs.extend([self.obs_transform(self.si[i]) for i in range(irs)])
        _, hi_zmuv = self._construct_zmuv_samples(x_sym, 1)
        sample_func = theano.function(inputs=[z_sym, x_sym], outputs=oputs, \
                givens={ self.z: z_sym, \
                         self.x_in: T.zeros_like(x_sym), \
                         self.x_out: T.zeros_like(x_sym), \
                         self.hi_zmuv: hi_zmuv }, \
                updates=self.scan_updates)
        def prior_sampler(samp_count):
            # x_samps only provides the batch size; zeros are fed for x
            x_samps = to_fX( np.zeros((samp_count, self.obs_dim)) )
            z_samps = to_fX( npr.randn(samp_count, self.z_dim) )
            old_switch = self.train_switch.get_value(borrow=False)
            # set model to generation mode
            self.set_train_switch(switch_val=0.0)
            try:
                model_samps = sample_func(z_samps, x_samps)
            finally:
                # set model back to either training or generation mode, so
                # a failed call cannot leave the model stuck in sampling mode
                self.set_train_switch(switch_val=old_switch)
            return model_samps
        return prior_sampler

    def _construct_sample_from_input(self):
        """
        Construct a function for drawing samples from the distribution
        generated by this MultiStageModel, conditioned on some inputs to the
        initial encoder stage (i.e. self.q_z_given_x). This returns the full
        sequence of "partially completed" examples.

        The returned callable conditional_sampler(XI, XO=None,
        guided_decoding=False) uses XI as XO when XO is None. With
        guided_decoding=True the train switch is set to 1.0 while sampling,
        otherwise to 0.0; the previous switch value is restored afterwards,
        even if the compiled function raises.
        """
        xi = T.matrix()
        xo = T.matrix()
        irs = self.ir_steps
        oputs = [self.obs_transform(self.s0)]
        oputs.extend([self.obs_transform(self.si[i]) for i in range(irs)])
        _, hi_zmuv = self._construct_zmuv_samples(xi, 1)
        sample_func = theano.function(inputs=[xi, xo], outputs=oputs, \
                givens={ self.x_in: xi, \
                         self.x_out: xo, \
                         self.hi_zmuv: hi_zmuv }, \
                updates=self.scan_updates)
        def conditional_sampler(XI, XO=None, guided_decoding=False):
            XI = to_fX( XI )
            if XO is None:
                XO = XI
            XO = to_fX( XO )
            # set model to desired generation mode
            old_switch = self.train_switch.get_value(borrow=False)
            if guided_decoding:
                # take samples from guide policies (i.e. variational q)
                self.set_train_switch(switch_val=1.0)
            else:
                # take samples from model's generative policy
                self.set_train_switch(switch_val=0.0)
            try:
                # draw guided/unguided conditional samples
                model_samps = sample_func(XI, XO)
            finally:
                # set model back to either training or generation mode, so
                # a failed call cannot leave the model stuck in sampling mode
                self.set_train_switch(switch_val=old_switch)
            return model_samps
        return conditional_sampler
コード例 #20
0
ファイル: DexNet.py プロジェクト: Philip-Bachman/NN-Python
class HiddenLayer(object):
    """
    Fully-connected layer with optional noise, dropout and maxout pooling.

    The constructor builds the symbolic graph for the layer and keeps the
    intermediate results as attributes (clean_input, fuzzy_input,
    noisy_input, linear_output, noisy_linear, output).

    Parameters:
        rng: random generator providing randint() and standard_normal()
             (presumably a numpy.random.RandomState -- confirm with caller)
        input: symbolic input to the layer
        in_dim: number of input features (rows of W)
        out_dim: number of outputs after pooling
        activation: optional callable applied to the noisy pre-activation;
                    when falsy, relu_actfun (pool_size <= 1) or
                    maxout_actfun (pool_size > 1) is used
        pool_size: filters per pooling group; values <= 1 disable pooling
        drop_rate: dropout probability, applied only when > 1e-4
        input_noise: scale of gaussian input noise, applied only when > 1e-4
        bias_noise: scale of gaussian noise added to the pre-activation
        W, b: optional pre-built shared weights / biases
        use_bias: whether b is added to the pre-activation and listed in
                  self.params
        name: prefix for the names of newly created shared variables
    """
    def __init__(self, rng, input, in_dim, out_dim, \
                 activation=None, pool_size=0, \
                 drop_rate=0., input_noise=0., bias_noise=0., \
                 W=None, b=None, \
                 use_bias=True, name=""):

        # Setup a shared random generator for this layer
        self.srng = CURAND_RandomStreams(rng.randint(1000000))

        self.clean_input = input

        # Add gaussian noise to the input (if desired)
        if (input_noise > 1e-4):
            self.fuzzy_input = input + \
                    (input_noise * self.srng.normal(size=input.shape, \
                    dtype=theano.config.floatX))
        else:
            self.fuzzy_input = input

        # Apply masking noise to the input (if desired)
        if (drop_rate > 1e-4):
            self.noisy_input = self._drop_from_input(self.fuzzy_input, drop_rate)
        else:
            self.noisy_input = self.fuzzy_input

        # Set some basic layer properties
        self.pool_size = pool_size
        self.in_dim = in_dim
        self.out_dim = out_dim
        if self.pool_size <= 1:
            self.filt_count = self.out_dim
        else:
            self.filt_count = self.out_dim * self.pool_size
        # Floor division keeps pool_count an int on Python 3; it is used as
        # the argument of range() below, which rejects floats.
        self.pool_count = self.filt_count // max(self.pool_size, 1)
        if activation:
            self.activation = activation
        else:
            if self.pool_size <= 1:
                self.activation = lambda x: relu_actfun(x)
            else:
                self.activation = lambda x: \
                        maxout_actfun(x, self.pool_size, self.filt_count)

        # Get some random initial weights and biases, if not given
        if W is None:
            if self.pool_size <= 1:
                # Generate random initial filters in a typical way
                W_init = np.asarray(0.04 * rng.standard_normal( \
                          size=(self.in_dim, self.filt_count)), \
                          dtype=theano.config.floatX)
            else:
                # Generate groups of random filters to pool over such that
                # intra-group correlations are stronger than inter-group
                # correlations, to encourage pooling over similar filters...
                filters = []
                for g_num in range(self.pool_count):
                    # shared component for every filter in this group
                    g_filt = 0.01 * rng.standard_normal(size=(self.in_dim,1))
                    for f_num in range(self.pool_size):
                        # group component plus a smaller per-filter wobble
                        f_filt = g_filt + (0.005 * rng.standard_normal( \
                                size=(self.in_dim,1)))
                        filters.append(f_filt)
                W_init = np.hstack(filters).astype(theano.config.floatX)

            W = theano.shared(value=W_init, name="{0:s}_W".format(name))
        if b is None:
            b_init = np.zeros((self.filt_count,), dtype=theano.config.floatX)
            b = theano.shared(value=b_init, name="{0:s}_b".format(name))

        # Set layer weights and biases
        self.W = W
        self.b = b

        # Compute linear "pre-activation" for this layer
        if use_bias:
            self.linear_output = T.dot(self.noisy_input, self.W) + self.b
        else:
            self.linear_output = T.dot(self.noisy_input, self.W)

        # Add noise to the pre-activation features. The noise term is always
        # part of the graph; bias_noise == 0 simply scales it to zero.
        self.noisy_linear = self.linear_output  + \
                (bias_noise * self.srng.normal(size=self.linear_output.shape, \
                dtype=theano.config.floatX))

        # Apply activation function
        self.output = self.activation(self.noisy_linear)

        # Compute some properties of the activations, probably to regularize
        self.act_l2_sum = T.sum(self.output**2.) / self.output.size
        self.row_l1_sum = T.sum(abs(row_normalize(self.output))) / \
                self.output.shape[0]
        self.col_l1_sum = T.sum(abs(col_normalize(self.output))) / \
                self.output.shape[1]

        # Conveniently package layer parameters
        if use_bias:
            self.params = [self.W, self.b]
        else:
            self.params = [self.W]
        # Layer construction complete...
        return

    def _drop_from_input(self, input, p):
        """
        Apply dropout to input: elements are zeroed where a uniform(0, 1)
        draw is <= p, and the result is rescaled by 1 / (1 - p).

        p is the probability of dropping elements of input.
        """
        # get a drop mask that drops things with probability p
        noise_rnd = self.srng.uniform(input.shape, low=0.0, high=1.0, \
            dtype=theano.config.floatX)
        drop_mask = noise_rnd > p
        # get a scaling factor to keep expectations fixed after droppage
        drop_scale = 1. / (1. - p)
        # apply dropout mask and rescaling factor to the input
        droppy_input = drop_scale * input * drop_mask
        return droppy_input

    def _noisy_params(self, P, noise_lvl=0.):
        """
        Return P plus gaussian noise with mean 0 and std noise_lvl.

        Noisy weights, like convolving energy surface with a gaussian.
        """
        P_nz = P + self.srng.normal(size=P.shape, avg=0.0, std=noise_lvl, \
                dtype=theano.config.floatX)
        return P_nz
コード例 #21
0
class HiddenLayer(object):
    """
    Fully-connected layer whose noise levels live in shared variables.

    The constructor builds the symbolic graph for the layer and keeps the
    intermediate results as attributes (clean_input, fuzzy_input,
    noisy_input, linear_output, noisy_linear, output).

    Parameters:
        rng: random generator providing randint()
             (presumably a numpy.random.RandomState -- confirm with caller)
        input: symbolic input to the layer
        in_dim: number of input features (rows of W)
        out_dim: number of outputs after pooling
        activation: callable applied to the noisy pre-activation; defaults
                    to relu_actfun. NOTE: it is only used when
                    pool_size <= 1; otherwise maxout_actfun is used.
        pool_size: filters per pooling group; values <= 1 disable pooling
        drop_rate: initial dropout probability
        input_noise: initial scale of gaussian noise added to the input
        bias_noise: initial scale of gaussian noise added to the
                    pre-activation
        W, b: optional pre-built shared weights / biases
        b_in, s_in: optional pre-built shared input shift / scale params
        name: prefix for the names of newly created shared variables
        W_scale: gain passed to ortho_matrix when W is created here
    """
    def __init__(self, rng, input, in_dim, out_dim, \
                 activation=None, pool_size=0, \
                 drop_rate=0., input_noise=0., bias_noise=0., \
                 W=None, b=None, b_in=None, s_in=None,
                 name="", W_scale=1.0):

        # Setup a shared random generator for this layer
        self.rng = RandStream(rng.randint(1000000))

        # setup scale and bias params for the input
        if b_in is None:
            # input biases are always initialized to zero
            ary = np.zeros((in_dim, ), dtype=theano.config.floatX)
            b_in = theano.shared(value=ary, name="{0:s}_b_in".format(name))
        if s_in is None:
            # raw input scales start at 0.541325, so that softplus(s_in),
            # i.e. log(1 + exp(0.541325)), is approximately one
            ary = 0.541325 * np.ones((in_dim, ), dtype=theano.config.floatX)
            s_in = theano.shared(value=ary, name="{0:s}_s_in".format(name))
        self.b_in = b_in
        self.s_in = s_in

        # The early shift and rescale of the input, which would have been
        #   softplus(self.s_in) * (input + self.b_in),
        # is disabled: the input is used directly, so b_in and s_in do not
        # influence the graph built here.
        self.clean_input = input

        # Noise levels are stored as 1-element shared arrays (presumably so
        # they can be changed after the graph is built -- confirm).
        zero_ary = np.zeros((1, )).astype(theano.config.floatX)
        self.input_noise = theano.shared(value=(zero_ary+input_noise), \
                name="{0:s}_input_noise".format(name))
        self.bias_noise = theano.shared(value=(zero_ary+bias_noise), \
                name="{0:s}_bias_noise".format(name))
        # this variable previously reused the "_bias_noise" name
        self.drop_rate = theano.shared(value=(zero_ary+drop_rate), \
                name="{0:s}_drop_rate".format(name))

        # Add gaussian noise to the input (scaled by self.input_noise)
        self.fuzzy_input = self.clean_input + (self.input_noise[0] * \
                self.rng.normal(size=self.clean_input.shape, avg=0.0, std=1.0, \
                dtype=theano.config.floatX))

        # Apply masking noise to the input (rate given by self.drop_rate)
        self.noisy_input = self._drop_from_input(self.fuzzy_input, \
                self.drop_rate[0])

        # Set some basic layer properties
        self.pool_size = pool_size
        self.in_dim = in_dim
        self.out_dim = out_dim
        if self.pool_size <= 1:
            self.filt_count = self.out_dim
        else:
            self.filt_count = self.out_dim * self.pool_size
        # floor division keeps the group count an int on Python 3
        self.pool_count = self.filt_count // max(self.pool_size, 1)
        if activation is None:
            activation = relu_actfun
        if self.pool_size <= 1:
            self.activation = activation
        else:
            self.activation = lambda x: \
                    maxout_actfun(x, self.pool_size, self.filt_count)

        # Get some random initial weights and biases, if not given
        if W is None:
            # Generate initial filters using orthogonal random trick
            W_init = ortho_matrix(shape=(self.in_dim, self.filt_count), \
                    gain=W_scale)
            W_init = W_init.astype(theano.config.floatX)
            W = theano.shared(value=W_init, name="{0:s}_W".format(name))
        if b is None:
            b_init = np.zeros((self.filt_count, ), dtype=theano.config.floatX)
            b = theano.shared(value=b_init, name="{0:s}_b".format(name))

        # Set layer weights and biases
        self.W = W
        self.b = b

        # Compute linear "pre-activation" for this layer
        self.linear_output = T.dot(self.noisy_input, self.W) + self.b

        # Add noise to the pre-activation features (scaled by self.bias_noise)
        self.noisy_linear = self.linear_output + (self.bias_noise[0] * \
                self.rng.normal(size=self.linear_output.shape, avg=0.0, \
                std=1.0, dtype=theano.config.floatX))

        # Apply activation function
        self.output = self.activation(self.noisy_linear)

        # Squared magnitude of the noisy pre-activations (not of the
        # activations), divided by the number of output elements
        self.act_l2_sum = T.sum(self.noisy_linear**2.) / self.output.size

        # Conveniently package layer parameters
        self.params = [self.W, self.b, self.b_in, self.s_in]
        # Layer construction complete...
        return

    def _drop_from_input(self, input, p):
        """
        Apply dropout to input: elements are zeroed where a uniform(0, 1)
        draw is <= p, and the result is rescaled by 1 / (1 - p).

        p is the probability of dropping elements of input.
        """
        # get a drop mask that drops things with probability p
        drop_rnd = self.rng.uniform(size=input.shape, low=0.0, high=1.0, \
                dtype=theano.config.floatX)
        drop_mask = drop_rnd > p
        # get a scaling factor to keep expectations fixed after droppage
        drop_scale = 1. / (1. - p)
        # apply dropout mask and rescaling factor to the input
        droppy_input = drop_scale * input * drop_mask
        return droppy_input

    def _noisy_params(self, P, noise_lvl=0.):
        """
        Return P plus gaussian noise with mean 0 and std noise_lvl.

        Noisy weights, like convolving energy surface with a gaussian.
        """
        P_nz = P + self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl, \
                dtype=theano.config.floatX)
        return P_nz
コード例 #22
0
class HiddenLayer(object):
    """
    Fully-connected layer with optional noise, dropout and maxout pooling.

    The constructor builds the symbolic graph for the layer and keeps the
    intermediate results as attributes (clean_input, fuzzy_input,
    noisy_input, linear_output, noisy_linear, output).

    Parameters:
        rng: random generator providing randint() and normal()
             (presumably a numpy.random.RandomState -- confirm with caller)
        input: symbolic input to the layer
        in_dim: number of input features (rows of W)
        out_dim: number of outputs after pooling
        activation: optional callable applied to the noisy pre-activation;
                    when falsy, relu_actfun (pool_size <= 1) or
                    maxout_actfun (pool_size > 1) is used
        pool_size: filters per pooling group; values <= 1 disable pooling
        drop_rate: dropout probability, applied only when > 1e-4
        input_noise: std of gaussian input noise, applied only when > 1e-4
        bias_noise: std of gaussian pre-activation noise, applied only
                    when > 1e-3
        W, b: optional pre-built shared weights / biases
        name: prefix for the names of newly created shared variables
        W_scale: multiplier applied to the random initial weights
    """
    def __init__(self, rng, input, in_dim, out_dim, \
                 activation=None, pool_size=0, \
                 drop_rate=0., input_noise=0., bias_noise=0., \
                 W=None, b=None, name="", W_scale=1.0):

        # Setup a shared random generator for this layer
        self.rng = CURAND_RandomStreams(rng.randint(1000000))

        self.clean_input = input

        # Add gaussian noise to the input (if desired)
        if (input_noise > 1e-4):
            self.fuzzy_input = input + self.rng.normal(size=input.shape, \
                    avg=0.0, std=input_noise, dtype=theano.config.floatX)
        else:
            self.fuzzy_input = input

        # Apply masking noise to the input (if desired)
        if (drop_rate > 1e-4):
            self.noisy_input = self._drop_from_input(self.fuzzy_input,
                                                     drop_rate)
        else:
            self.noisy_input = self.fuzzy_input

        # Set some basic layer properties
        self.pool_size = pool_size
        self.in_dim = in_dim
        self.out_dim = out_dim
        if self.pool_size <= 1:
            self.filt_count = self.out_dim
        else:
            self.filt_count = self.out_dim * self.pool_size
        # Floor division keeps pool_count an int on Python 3; it is used as
        # the argument of range() below, which rejects floats.
        self.pool_count = self.filt_count // max(self.pool_size, 1)
        if activation:
            self.activation = activation
        else:
            if self.pool_size <= 1:
                self.activation = lambda x: relu_actfun(x)
            else:
                self.activation = lambda x: \
                        maxout_actfun(x, self.pool_size, self.filt_count)

        # Get some random initial weights and biases, if not given
        if W is None:
            if self.pool_size <= 1:
                # Generate random initial filters in a typical way
                W_init = 0.01 * np.asarray(rng.normal( \
                          size=(self.in_dim, self.filt_count)), \
                          dtype=theano.config.floatX)
            else:
                # Generate groups of random filters to pool over such that
                # intra-group correlations are stronger than inter-group
                # correlations, to encourage pooling over similar filters...
                filters = []
                f_size = (self.in_dim, 1)
                for g_num in range(self.pool_count):
                    # shared component for every filter in this group
                    g_filt = 0.01 * rng.normal(size=f_size)
                    for f_num in range(self.pool_size):
                        # group component plus a smaller per-filter wobble
                        f_filt = g_filt + 0.003 * rng.normal(size=f_size)
                        filters.append(f_filt)
                W_init = np.hstack(filters).astype(theano.config.floatX)
            W = theano.shared(value=(W_scale * W_init),
                              name="{0:s}_W".format(name))
        if b is None:
            b_init = np.zeros((self.filt_count, ), dtype=theano.config.floatX)
            b = theano.shared(value=b_init, name="{0:s}_b".format(name))

        # Set layer weights and biases
        self.W = W
        self.b = b

        # Compute linear "pre-activation" for this layer
        self.linear_output = T.dot(self.noisy_input, self.W) + self.b

        # Add noise to the pre-activation features (if desired)
        if bias_noise > 1e-3:
            self.noisy_linear = self.linear_output  + \
                    self.rng.normal(size=self.linear_output.shape, \
                    avg=0.0, std=bias_noise, dtype=theano.config.floatX)
        else:
            self.noisy_linear = self.linear_output

        # Apply activation function
        self.output = self.activation(self.noisy_linear)

        # Compute some properties of the activations, probably to regularize
        self.act_l2_sum = T.sum(self.output**2.) / self.output.size
        self.row_l1_sum = T.sum(abs(row_normalize(self.output))) / \
                self.output.shape[0]
        self.col_l1_sum = T.sum(abs(col_normalize(self.output))) / \
                self.output.shape[1]

        # Conveniently package layer parameters
        self.params = [self.W, self.b]
        # Layer construction complete...
        return

    def _drop_from_input(self, input, p):
        """
        Apply dropout to input: elements are zeroed where a uniform(0, 1)
        draw is <= p, and the result is rescaled by 1 / (1 - p).

        p is the probability of dropping elements of input.
        """
        # get a drop mask that drops things with probability p
        drop_rnd = self.rng.uniform(size=input.shape, low=0.0, high=1.0, \
                dtype=theano.config.floatX)
        drop_mask = drop_rnd > p
        # get a scaling factor to keep expectations fixed after droppage
        drop_scale = 1. / (1. - p)
        # apply dropout mask and rescaling factor to the input
        droppy_input = drop_scale * input * drop_mask
        return droppy_input

    def _noisy_params(self, P, noise_lvl=0.):
        """
        Return P plus gaussian noise with mean 0 and std noise_lvl.

        Noisy weights, like convolving energy surface with a gaussian.
        """
        P_nz = P + self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl, \
                dtype=theano.config.floatX)
        return P_nz
コード例 #23
0
class GenUniModule(object):
    """
    Generator module: a linear map of random values, optionally followed by
    batch normalization and a ReLU.

    Parameters:
        rand_dim: width of the random input (rows of w1)
        out_dim: width of the output (columns of w1)
        apply_bn: whether to batch-normalize the linear output
        init_func: initializer called as init_func(shape, name); defaults
                   to inits.Normal(scale=0.02)
        rand_type: 'normal' draws gaussian inputs; any other value draws
                   uniform inputs in [-1, 1]
        final_relu: whether to apply relu to the result
        mod_name: prefix for parameter names
    """
    def __init__(self, rand_dim, out_dim,
                 apply_bn=True, init_func=None,
                 rand_type='normal', final_relu=True, 
                 mod_name='dm_uni'):
        # plain configuration values
        self.rand_dim = rand_dim
        self.out_dim = out_dim
        self.apply_bn = apply_bn
        self.mod_name = mod_name
        self.rand_type = rand_type
        self.final_relu = final_relu
        # random stream used when apply() has to draw its own inputs
        self.rng = RandStream(123)
        # fall back to a small gaussian initializer when none is given
        self.init_func = inits.Normal(scale=0.02) if init_func is None \
                else init_func
        self._init_params()
        return

    def _init_params(self):
        """
        Create the weight matrix and, when batch norm is enabled, the
        matching gain and bias vectors. All of them end up in self.params.
        """
        weight_name = "{}_w1".format(self.mod_name)
        self.w1 = self.init_func((self.rand_dim, self.out_dim), weight_name)
        self.params = [ self.w1 ]
        if not self.apply_bn:
            # no batch norm -> no gain/bias parameters
            return
        # gains start near one, biases start at zero
        make_gain = inits.Normal(loc=1., scale=0.02)
        make_bias = inits.Constant(c=0.)
        self.g1 = make_gain(self.out_dim, "{}_g1".format(self.mod_name))
        self.b1 = make_bias(self.out_dim, "{}_b1".format(self.mod_name))
        self.params += [self.g1, self.b1]
        return

    def apply(self, batch_size=None, rand_vals=None):
        """
        Run the module. Supply _either_ batch_size (inputs are drawn from
        self.rng) or rand_vals (used as the inputs after reshaping to
        (rows, rand_dim)).
        """
        have_source = (batch_size is not None) or (rand_vals is not None)
        assert have_source, "need either batch_size or rand_vals"
        if rand_vals is not None:
            # keep the caller's row count, force the column count
            rand_shape = (rand_vals.shape[0], self.rand_dim)
        else:
            rand_shape = (batch_size, self.rand_dim)
            if self.rand_type == 'normal':
                rand_vals = self.rng.normal(
                        size=rand_shape, avg=0.0, std=1.0,
                        dtype=theano.config.floatX)
            else:
                rand_vals = self.rng.uniform(
                        size=rand_shape, low=-1.0, high=1.0,
                        dtype=theano.config.floatX)
        rand_vals = rand_vals.reshape(rand_shape)
        # linear transform, then the optional post-processing steps
        result = T.dot(rand_vals, self.w1)
        if self.apply_bn:
            result = batchnorm(result, g=self.g1, b=self.b1)
        if self.final_relu:
            result = relu(result)
        return result















##############
# EYE BUFFER #
##############
コード例 #24
0
class ConvPoolLayer(object):
    """
    A simple convolution --> max-pooling layer.

    The (symbolic) input to this layer must be a 4D tensor shaped
    like (batch_size, chan_count, im_dim_1, im_dim_2).

    filt_def should be a 4-tuple like (filt_count, in_chans, filt_def_1, filt_def_2)

    pool_def should be a 2-tuple like (pool_dim, pool_stride)

    Other parameters:
        rng: random generator providing randint() and normal()
             (presumably a numpy.random.RandomState -- confirm with caller)
        activation: optional callable applied to the layer's linear output;
                    relu_actfun is used when falsy
        drop_rate: dropout probability, applied only when > 1e-4
        input_noise: std of gaussian input noise, applied only when > 1e-4
        bias_noise: std of gaussian noise added to the convolution output
                    (before pooling), applied only when > 1e-4
        W, b: optional pre-built shared filters / biases; fresh ones are
              created only when these are None
        name: prefix for the names of newly created shared variables
        W_scale: multiplier applied to the random initial filters
    """
    def __init__(self, rng, input=None, filt_def=None, pool_def=(2, 2), \
      activation=None, drop_rate=0., input_noise=0., bias_noise=0., \
      W=None, b=None, name="", W_scale=1.0):

        # Setup a shared random generator for this layer
        self.rng = CURAND_RandomStreams(rng.randint(1000000))

        self.clean_input = input

        # Add gaussian noise to the input (if desired)
        if (input_noise > 1e-4):
            self.fuzzy_input = input + self.rng.normal(size=input.shape, \
                    avg=0.0, std=input_noise, dtype=theano.config.floatX)
        else:
            self.fuzzy_input = input

        # Apply masking noise to the input (if desired)
        if (drop_rate > 1e-4):
            self.noisy_input = self._drop_from_input(self.fuzzy_input,
                                                     drop_rate)
        else:
            self.noisy_input = self.fuzzy_input

        # Set the activation function for the conv filters
        if activation:
            self.activation = activation
        else:
            self.activation = lambda x: relu_actfun(x)

        # Use the provided filters, or initialize random ones. Previously
        # the W argument was accepted but silently ignored.
        if W is None:
            W_init = 0.01 * np.asarray(rng.normal( \
              size=filt_def), dtype=theano.config.floatX)
            W = theano.shared(value=(W_scale*W_init), \
              name="{0:s}_W".format(name))
        self.W = W

        # the bias is a 1D tensor -- one bias per output feature map.
        # As with W, a provided b is now honored instead of being ignored.
        if b is None:
            b_init = np.zeros((filt_def[0], ), dtype=theano.config.floatX) + 0.1
            b = theano.shared(value=b_init, name="{0:s}_b".format(name))
        self.b = b

        # convolve input feature maps with filters
        input_c01b = self.noisy_input.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        filters_c01b = self.W.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        conv_op = FilterActs(stride=1, partial_sum=1)
        contig_input = gpu_contiguous(input_c01b)
        contig_filters = gpu_contiguous(filters_c01b)
        conv_out_c01b = conv_op(contig_input, contig_filters)

        # Add noise to the convolution output (if desired)
        if (bias_noise > 1e-4):
            noisy_conv_out_c01b = conv_out_c01b + self.rng.normal( \
              size=conv_out_c01b.shape, avg=0.0, std=bias_noise, \
              dtype=theano.config.floatX)
        else:
            noisy_conv_out_c01b = conv_out_c01b

        # downsample each feature map individually, using maxpooling
        pool_op = MaxPool(ds=pool_def[0], stride=pool_def[1])
        mp_out_c01b = pool_op(noisy_conv_out_c01b)
        mp_out_bc01 = mp_out_c01b.dimshuffle(3, 0, 1, 2)  # c01b to bc01

        # add the bias term. Since the bias is a vector (1D array), we first
        # reshape it to a tensor of shape (1,n_filters,1,1). Each bias will
        # thus be broadcasted across mini-batches and feature map
        # width & height
        self.noisy_linear_output = mp_out_bc01 + self.b.dimshuffle(
            'x', 0, 'x', 'x')
        # linear_output is the same expression as noisy_linear_output
        self.linear_output = self.noisy_linear_output
        self.output = self.activation(self.noisy_linear_output)

        # store parameters of this layer
        self.params = [self.W, self.b]

        return

    def _drop_from_input(self, input, p):
        """
        Apply dropout to input: elements are zeroed where a uniform(0, 1)
        draw is <= p, and the result is rescaled by 1 / (1 - p).

        p is the probability of dropping elements of input.
        """
        # get a drop mask that drops things with probability p
        drop_rnd = self.rng.uniform(size=input.shape, low=0.0, high=1.0, \
                dtype=theano.config.floatX)
        drop_mask = drop_rnd > p
        # get a scaling factor to keep expectations fixed after droppage
        drop_scale = 1. / (1. - p)
        # apply dropout mask and rescaling factor to the input
        droppy_input = drop_scale * input * drop_mask
        return droppy_input

    def _noisy_params(self, P, noise_lvl=0.):
        """
        Return P plus gaussian noise with mean 0 and std noise_lvl.

        Noisy weights, like convolving energy surface with a gaussian.
        """
        P_nz = P + self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl, \
                dtype=theano.config.floatX)
        return P_nz
コード例 #25
0
class DAELayer(object):
    """
    Denoising autoencoder layer with tied weights: the encoder uses W and
    the decoder uses W.T.

    Parameters:
        rng: random generator providing randint() and standard_normal()
             (presumably a numpy.random.RandomState -- confirm with caller)
        clean_input: symbolic reconstruction target
        fuzzy_input: symbolic input that gets masked to form noisy_input
        in_dim, out_dim: visible / hidden layer sizes
        activation: callable applied to the hidden pre-activation
        input_noise: probability of zeroing each element of fuzzy_input
        W, b_h, b_v: optional pre-built shared weights, hidden biases and
                     visible biases
    """
    def __init__(self, rng, clean_input=None, fuzzy_input=None, \
            in_dim=0, out_dim=0, activation=None, input_noise=0., \
            W=None, b_h=None, b_v=None):

        # Per-layer random stream, seeded from the caller's generator
        self.rng = CURAND_RandomStreams(rng.randint(1000000))

        # Keep the target as-is and corrupt the other input; this is a
        # _denoising_ autoencoder, after all
        self.clean_input = clean_input
        self.noisy_input = self._get_noisy_input(fuzzy_input, input_noise)

        # Basic layer properties
        self.activation = activation
        self.in_dim = in_dim
        self.out_dim = out_dim

        # Create whichever parameters the caller did not supply
        if W is None:
            W_vals = 0.01 * rng.standard_normal(size=(in_dim, out_dim))
            W = theano.shared(
                    value=np.asarray(W_vals, dtype=theano.config.floatX),
                    name='W')
        if b_h is None:
            hid_zeros = np.zeros((out_dim, ), dtype=theano.config.floatX)
            b_h = theano.shared(value=hid_zeros, name='b_h')
        if b_v is None:
            vis_zeros = np.zeros((in_dim, ), dtype=theano.config.floatX)
            b_v = theano.shared(value=vis_zeros, name='b_v')

        # Hold on to the (now definitely present) parameters
        self.W = W
        self.b_h = b_h
        self.b_v = b_v

        # Learnable parameters, in one list
        self.params = [self.W, self.b_h, self.b_v]
        return

    def compute_costs(self, lam_l1=None):
        """
        Return [recon_cost, sparse_cost]: the squared reconstruction error
        divided by clean_input.shape[0], and lam_l1[0] times the sum of the
        row-wise and column-wise L1 activation penalties.
        """
        # Only the weights get perturbed; the hidden bias is used as-is
        noisy_W = self._noisy_params(self.W, 0.01)
        hid_bias = self.b_h
        # Decoder (visible) and encoder (hidden) activations
        vis_acts, hid_acts = self._compute_activations(
                self.noisy_input, noisy_W, hid_bias, self.b_v)
        # Reconstruction error
        sq_err = (self.clean_input - vis_acts)**2.0
        recon_cost = T.sum(sq_err) / self.clean_input.shape[0]
        # Sparsity penalty (over both population and lifetime)
        row_l1_sum = T.sum(abs(row_normalize(hid_acts))) / hid_acts.shape[0]
        col_l1_sum = T.sum(abs(col_normalize(hid_acts))) / hid_acts.shape[1]
        sparse_cost = lam_l1[0] * (row_l1_sum + col_l1_sum)
        return [recon_cost, sparse_cost]

    def _compute_hidden_acts(self, X, W, b_h):
        """Encoder: activation(X . W + b_h)."""
        pre_act = T.dot(X, W) + b_h
        return self.activation(pre_act)

    def _compute_activations(self, X, W, b_h, b_v):
        """Return [A_v, A_h]: decoder output (A_h . W.T + b_v) and hidden acts."""
        hid = self._compute_hidden_acts(X, W, b_h)
        vis = T.dot(hid, W.T) + b_v
        return [vis, hid]

    def _noisy_params(self, P, noise_lvl=0.):
        """
        Return P plus gaussian noise of std noise_lvl, or P itself when
        noise_lvl is not above 1e-3.
        """
        if not (noise_lvl > 1e-3):
            # noise level too small to bother; hand P back untouched
            return P
        return P + self.rng.normal(size=P.shape, avg=0.0, std=noise_lvl, \
                dtype=theano.config.floatX)

    def _get_noisy_input(self, input, p):
        """
        Zero elements of input where a uniform(0, 1) draw is <= p; the
        surviving elements are not rescaled.

        p is the probability of dropping elements of input.
        """
        uniform_draw = self.rng.uniform(input.shape, low=0.0, high=1.0, \
            dtype=theano.config.floatX)
        keep_mask = uniform_draw > p
        return input * keep_mask
コード例 #26
0
class HiddenLayer(object):
    """
    Hidden layer configured from a description dict; either fully-connected
    ('fc') or convolutional ('conv').

    Parameters:
        rng: random generator providing randint()
             (presumably a numpy.random.RandomState -- confirm with caller)
        layer_description: dict with required keys 'layer_type',
            'in_chans', 'out_chans', 'activation' and optional keys
            'filt_dim', 'conv_stride', 'apply_bn', 'drop_rate',
            'shape_func_in', 'shape_func_out'
        W, b, b_in, s_in: optional pre-built shared parameters
        name: prefix for the names of newly created shared variables
        W_scale: weight init scale; for 'fc' layers the string 'xg' selects
                 glorot_matrix instead of ortho_matrix
    """
    def __init__(self, rng, layer_description,
                 W=None, b=None, b_in=None, s_in=None,
                 name="", W_scale=1.0):
        # validate the description before reading anything from it
        assert 'layer_type' in layer_description, \
                "layer_description must provide layer_type"
        assert layer_description['layer_type'] in ('fc', 'conv'), \
                "layer_type must be fc or conv"

        desc = layer_description
        self.layer_description = desc
        # required entries
        self.layer_type = desc['layer_type']
        self.in_chans = desc['in_chans']
        self.out_chans = desc['out_chans']
        self.activation = desc['activation']
        # optional entries, with their defaults
        self.filt_dim = desc.get('filt_dim', None)
        self.conv_stride = desc.get('conv_stride', None)
        self.apply_bn = desc.get('apply_bn', False)
        self.drop_rate = desc.get('drop_rate', 0.0)
        self.shape_func_in = desc.get('shape_func_in', None)
        self.shape_func_out = desc.get('shape_func_out', None)

        # remaining configuration
        self.rng = RandStream(rng.randint(1000000))
        self.W_scale = W_scale
        self.name = name

        # pick the initializer that matches the layer type
        if self.layer_type == 'fc':
            make_params = self._init_fc_params
        else:
            make_params = self._init_conv_params
        self.W, self.b, self.b_in, self.s_in = \
                make_params(W=W, b=b, b_in=b_in, s_in=s_in)

        # expose the parameters both as a list and by name
        self.params = [self.W, self.b, self.b_in, self.s_in]
        self.shared_param_dicts = {
                'W': self.W,
                'b': self.b,
                'b_in': self.b_in,
                's_in': self.s_in }
        return

    def _init_fc_params(self, W=None, b=None, b_in=None, s_in=None):
        """
        Return (W, b, b_in, s_in) for a fully-connected layer, creating a
        shared variable for each argument that is None and passing the
        others through unchanged.
        """
        if W is None:
            # random weights: glorot when W_scale == 'xg', else orthogonal
            W_shape = (self.in_chans, self.out_chans)
            if self.W_scale == 'xg':
                W_vals = glorot_matrix(W_shape)
            else:
                W_vals = ortho_matrix(shape=W_shape, gain=self.W_scale)
            W = theano.shared(value=W_vals.astype(theano.config.floatX),
                              name="{0:s}_W".format(self.name))
        if b is None:
            # biases start at zero
            b_vals = np.zeros((self.out_chans,), dtype=theano.config.floatX)
            b = theano.shared(value=b_vals, name="{0:s}_b".format(self.name))
        if b_in is None:
            # batch normalization reshifts start at zero
            shift_vals = np.zeros((self.out_chans,),
                                  dtype=theano.config.floatX)
            b_in = theano.shared(value=shift_vals,
                                 name="{0:s}_b_in".format(self.name))
        if s_in is None:
            # batch normalization rescales start at zero
            scale_vals = np.zeros((self.out_chans,),
                                  dtype=theano.config.floatX)
            s_in = theano.shared(value=scale_vals,
                                 name="{0:s}_s_in".format(self.name))
        return W, b, b_in, s_in

    def _init_conv_params(self, W=None, b=None, b_in=None, s_in=None):
        """
        Return (W, b, b_in, s_in) for a convolutional layer, creating a
        shared variable for each argument that is None and passing the
        others through unchanged.
        """
        if W is None:
            # gaussian filters with std W_scale * 0.02
            filt_shape = (self.out_chans, self.in_chans,
                          self.filt_dim, self.filt_dim)
            filt_vals = npr.normal(0.0, self.W_scale*0.02, filt_shape)
            W = theano.shared(value=filt_vals.astype(theano.config.floatX),
                              name="{0:s}_W".format(self.name))
        if b is None:
            # small gaussian biases, one per output channel
            bias_vals = npr.normal(0.0, 0.01, (self.out_chans,))
            b = theano.shared(value=bias_vals.astype(theano.config.floatX),
                              name="{0:s}_b".format(self.name))
        if b_in is None:
            # batch normalization reshifts start at zero
            shift_vals = np.zeros((self.out_chans,),
                                  dtype=theano.config.floatX)
            b_in = theano.shared(value=shift_vals,
                                 name="{0:s}_b_in".format(self.name))
        if s_in is None:
            # batch normalization rescales start at zero
            scale_vals = np.zeros((self.out_chans,),
                                  dtype=theano.config.floatX)
            s_in = theano.shared(value=scale_vals,
                                 name="{0:s}_s_in".format(self.name))
        return W, b, b_in, s_in

    def apply(self, input, use_drop=False):
        """
        Feed input through the layer and return (final_output,
        linear_output): the activated result and the value the activation
        was applied to, both passed through shape_func_out when it is set.
        """
        # optional reshape on the way in
        if self.shape_func_in is not None:
            input = self.shape_func_in(input)
        # optional masking noise
        if use_drop:
            input = self._drop_from_input(input, self.drop_rate)
        if self.layer_type == 'fc':
            # fully-connected: affine transform
            pre_act = T.dot(input, self.W) + self.b
        elif self.layer_type == 'conv':
            # convolutional: "same"-style border, stride chosen by name
            pad = int((self.filt_dim - 1) / 2)
            if self.conv_stride == 'double':
                pre_act = dnn_conv(input, self.W, subsample=(2, 2),
                                   border_mode=(pad, pad))
            elif self.conv_stride == 'single':
                pre_act = dnn_conv(input, self.W, subsample=(1, 1),
                                   border_mode=(pad, pad))
            elif self.conv_stride == 'half':
                pre_act = deconv(input, self.W, subsample=(2, 2),
                                 border_mode=(pad, pad))
            else:
                assert False, "Unknown stride type!"
            # one bias per channel, broadcast over batch and spatial dims
            pre_act = pre_act + self.b.dimshuffle('x',0,'x','x')
        else:
            assert False, "Unknown layer type!"
        # optional batch normalization
        if self.apply_bn:
            pre_act = batchnorm(pre_act, rescale=self.s_in,
                                reshift=self.b_in, u=None, s=None)
        post_act = self.activation(pre_act)
        # optional reshape on the way out
        if self.shape_func_out is not None:
            pre_act = self.shape_func_out(pre_act)
            post_act = self.shape_func_out(post_act)
        return post_act, pre_act

    def _drop_from_input(self, input, p):
        """
        Apply dropout to input: elements are zeroed where a uniform(0, 1)
        draw is <= p, and the result is rescaled by 1 / (1 - p).

        p is the probability of dropping elements of input.
        """
        uniform_draw = self.rng.uniform(size=input.shape, low=0.0, high=1.0, \
                dtype=theano.config.floatX)
        keep_mask = uniform_draw > p
        # rescale so the expected value is unchanged by the masking
        rescale = 1. / (1. - p)
        return rescale * input * keep_mask
コード例 #27
0
class SRRModel(object):
    """
    Controller for training a sequential revelation and refinement model.

    Parameters:
        rng: numpy.random.RandomState (for reproducibility)
        x_out: the goal state for iterative refinement
        p_zi_given_xi: InfNet for stochastic part of step
        p_sip1_given_zi: HydraNet for deterministic part of step
        p_x_given_si: HydraNet for transform from s-space to x-space
        q_zi_given_xi: InfNet for the guide policy
        params: REQUIRED PARAMS SHOWN BELOW
                x_dim: dimension of observations to construct
                z_dim: dimension of latent space for policy wobble
                s_dim: dimension of space in which to perform construction
                use_p_x_given_si: boolean for whether to use p_x_given_si
                rev_sched: list of "revelation" blocks. each block is described
                           by the number of steps prior to revelation, and the
                           percentage of remaining pixels to reveal.
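                rev_masks: pair of revelation mask matrices. rev_masks[0] holds
                           the masks for the primary policy and rev_masks[1]
                           those for the guide policy; row i of each provides
                           the mask for iteration i of the srr loop. when this
                           argument is passed, rev_sched is ignored and the
                           revelation schedule is determined by rev_masks.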
                step_type: either "add" or "jump"
                x_type: can be "bernoulli" or "gaussian"
                obs_transform: can be 'none' or 'sigmoid'
    """

    def __init__(
        self,
        rng=None,
        x_out=None,
        p_zi_given_xi=None,
        p_sip1_given_zi=None,
        p_x_given_si=None,
        q_zi_given_xi=None,
        params=None,
        shared_param_dicts=None,
    ):
        """
        Build the symbolic graph for the model and compile its functions.

        The constructor reads the configuration in params, creates the
        shared variables owned by the model (or reuses those passed in
        shared_param_dicts), unrolls the revelation/refinement loop with
        theano.scan, assembles the NLL, KLd and l2 costs, derives ADAM
        updates for all parameters, and finally compiles the training,
        cost, free-energy and sampling functions.

        Parameters:
            rng: source of the seed for this model's random stream. Must
                 provide randint(); the None default is not usable.
            x_out: symbolic variable standing for the target observations.
                   It is substituted via "givens" in every compiled function.
            p_zi_given_xi: network for the primary policy. Must provide
                           apply(..., do_samples=False) and mlp_params.
            p_sip1_given_zi: network mapping zi to a belief state update.
                             Must provide apply() and mlp_params.
            p_x_given_si: network mapping s-space to x-space. Its mlp_params
                          are read even when use_p_x_given_si is False.
            q_zi_given_xi: network for the guide policy. Must provide
                           apply(..., do_samples=False) and mlp_params.
            params: dict of settings. "x_dim", "z_dim", "s_dim",
                    "use_p_x_given_si", "step_type" and "x_type" are always
                    read; "rev_sched" is read unless "rev_masks" is present
                    and not None; "obs_transform" is optional.
            shared_param_dicts: optional dict with keys "s0", "obs_logvar"
                                and "step_scales" holding shared variables
                                to reuse. When None, fresh ones are created.
        """
        # setup a rng for this SRRModel
        # NOTE: rng is dereferenced unconditionally, so it must be supplied
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_dim = self.params["x_dim"]
        self.z_dim = self.params["z_dim"]
        self.s_dim = self.params["s_dim"]
        self.use_p_x_given_si = self.params["use_p_x_given_si"]
        self.step_type = self.params["step_type"]
        self.x_type = self.params["x_type"]
        if self.use_p_x_given_si:
            print("Constructing hypotheses indirectly in s-space...")
        else:
            print("Constructing hypotheses directly in x-space...")
            # without a transform network, belief states are used as x directly
            assert self.s_dim == self.x_dim
        # choose the elementwise transform applied when moving into x-space;
        # it defaults to a sigmoid when "obs_transform" is not given
        if "obs_transform" in self.params:
            assert (self.params["obs_transform"] == "sigmoid") or (self.params["obs_transform"] == "none")
            if self.params["obs_transform"] == "sigmoid":
                self.obs_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.obs_transform = lambda x: x
        else:
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        # bernoulli observations always use a sigmoid, which overrides any
        # "obs_transform" setting handled above
        if self.x_type == "bernoulli":
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        self.shared_param_dicts = shared_param_dicts
        # Deal with revelation scheduling
        if ("rev_masks" in self.params) and (self.params["rev_masks"] is not None):
            # rev_masks is indexed as a pair: [0] is used for the primary
            # policy (p) and [1] for the guide policy (q)
            rmp = self.params["rev_masks"][0].astype(theano.config.floatX)
            rmq = self.params["rev_masks"][1].astype(theano.config.floatX)
            self.rev_masks_p = theano.shared(value=rmp, name="srrm_rev_masks_p")
            self.rev_masks_q = theano.shared(value=rmq, name="srrm_rev_masks_q")
            self.rev_sched = None
            self.use_rev_masks = True
        else:
            self.rev_sched = self.params["rev_sched"]
            self.rev_masks_p = None
            self.rev_masks_q = None
            self.use_rev_masks = False
            nice_nums = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
            # "validate" the set of revelation block descriptions
            # (rev_block[0] is a step count, rev_block[1] a reveal rate)
            for rev_block in self.rev_sched:
                assert rev_block[0] in nice_nums
                assert (rev_block[1] >= 0.0) and (rev_block[1] <= 1.01)
        assert (self.x_type == "bernoulli") or (self.x_type == "gaussian")
        assert (self.step_type == "add") or (self.step_type == "jump")

        # grab handles to the relevant networks
        self.p_zi_given_xi = p_zi_given_xi
        self.p_sip1_given_zi = p_sip1_given_zi
        self.p_x_given_si = p_x_given_si
        self.q_zi_given_xi = q_zi_given_xi

        # record the symbolic variables that will provide inputs to the
        # computation graph created for this SRRModel
        self.x_out = x_out  # target output for generation
        self.zi_zmuv = T.tensor3()  # ZMUV gauss noise for policy wobble
        self.p_masks = T.tensor3()  # revelation masks for primary policy
        self.q_masks = T.tensor3()  # revelation masks for guide policy
        # the number of loop iterations is fixed at construction time
        if self.use_rev_masks:
            self.total_steps = self.params["rev_masks"][0].shape[0]
        else:
            self.total_steps = sum([rb[0] for rb in self.rev_sched])

        # setup switching variable for changing between sampling/training
        zero_ary = to_fX(np.zeros((1,)))
        self.train_switch = theano.shared(value=zero_ary, name="srrm_train_switch")
        # start with the switch at 1, i.e. zi is taken from the guide policy
        self.set_train_switch(1.0)

        if self.shared_param_dicts is None:
            # initialize the parameters "owned" by this model
            s0_init = to_fX(np.zeros((self.s_dim,)))
            ss_init = to_fX(0.5 * np.ones((self.total_steps,)))
            self.s0 = theano.shared(value=s0_init, name="srrm_s0")
            self.obs_logvar = theano.shared(value=zero_ary, name="srrm_obs_logvar")
            # tanh squashing keeps the effective log-variance within (-8, 8)
            self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.obs_logvar[0])
            self.step_scales = theano.shared(value=ss_init, name="srrm_step_scales")
            self.shared_param_dicts = {}
            self.shared_param_dicts["s0"] = self.s0
            self.shared_param_dicts["obs_logvar"] = self.obs_logvar
            self.shared_param_dicts["step_scales"] = self.step_scales
        else:
            # grab the parameters required by this model from a given dict
            self.s0 = self.shared_param_dicts["s0"]
            self.obs_logvar = self.shared_param_dicts["obs_logvar"]
            self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.obs_logvar[0])
            self.step_scales = self.shared_param_dicts["step_scales"]

        ##################################################################
        # Setup the sequential revelation and refinement loop using scan #
        ##################################################################
        # ss: This is a sequence of scalars that will be used to rescale the
        #     "gradient" input to the primary and guide policies.
        #
        # zi_zmuv: This is a sequence of ZMUV gaussian samples that will be
        #          reparametrized to sample actions from the policies.
        #
        # p_masks: This is a sequence of "unmasking" masks. When one of these
        #          masking variables is 1, the corresponding value in self.x_out
        #          will be "revealed" to the primary policy. Prediction error
        #          is measured for a value only the first time it is revealed.
        #          Once revealed, a value remains "visible" to the policy.
        #          The final step should reveal all values.
        #
        # q_masks: This is a sequence of "unmasking" masks. These are similar
        #          to p_masks, but control which values are revealed to the
        #          guide policy. The guide policy masking sequence should be
        #          constructed to stay "ahead of" the primary policy's masking
        #          sequence. The guide policy needs to know which values will
        #          be revealed to the primary policy so that it can focus its
        #          reconstruction efforts on those values. Otherwise, the guide
        #          policy will immediately reconstruct the entire target.
        #
        # si: This is the current "belief state" for each trial in the training
        #     batch. The belief state is updated in each iteration, and passed
        #     forward through the recurrence.
        #
        # mi_p: This is the current revelation mask for the primary policy.
        #
        # mi_q: This is the current revelation mask for the guide policy.
        #
        def srr_step_func(ss, zi_zmuv, p_masks, q_masks, si, mi_p, mi_q):
            # transform the current belief state into an observation
            si_as_x = self._from_si_to_x(si)
            # log(1 + exp(ss)) is a softplus, so the applied scale is positive
            full_grad = T.log(1.0 + T.exp(ss)) * (self.x_out - si_as_x)

            # get the masked belief state and gradient for primary policy
            xi_for_p = (mi_p * self.x_out) + ((1.0 - mi_p) * si_as_x)
            grad_for_p = mi_p * full_grad

            # update the guide policy's revelation mask
            new_to_q = (1.0 - mi_q) * q_masks
            mip1_q = mi_q + new_to_q
            # get the masked belief state and gradient for guide policy
            # NOTE: the guide policy is fed the same masked state as the
            # primary policy; only its gradient input uses the guide's mask
            # xi_for_q = (mip1_q * self.x_out) + ((1.0 - mip1_q) * si_as_x)
            xi_for_q = xi_for_p
            grad_for_q = mip1_q * full_grad

            # get samples of next zi, according to the primary policy
            zi_p_mean, zi_p_logvar = self.p_zi_given_xi.apply(
                T.horizontal_stack(xi_for_p, grad_for_p), do_samples=False
            )
            zi_p = zi_p_mean + (T.exp(0.5 * zi_p_logvar) * zi_zmuv)
            # get samples of next zi, according to the guide policy
            # (both policies are reparametrized with the same noise zi_zmuv)
            zi_q_mean, zi_q_logvar = self.q_zi_given_xi.apply(
                T.horizontal_stack(xi_for_q, grad_for_q), do_samples=False
            )
            zi_q = zi_q_mean + (T.exp(0.5 * zi_q_logvar) * zi_zmuv)
            # make zi samples that can be switched between zi_p and zi_q
            zi = (self.train_switch[0] * zi_q) + ((1.0 - self.train_switch[0]) * zi_p)

            # compute relevant KLds for this step
            kldi_q2p = gaussian_kld(zi_q_mean, zi_q_logvar, zi_p_mean, zi_p_logvar)  # KL(q || p)
            kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar, zi_q_mean, zi_q_logvar)  # KL(p || q)
            kldi_p2g = gaussian_kld(zi_p_mean, zi_p_logvar, 0.0, 0.0)  # KL(p || N(0, I))

            # compute next si, given sampled zi (i.e. update the belief state)
            hydra_out = self.p_sip1_given_zi.apply(zi)
            si_step = hydra_out[0]
            if self.step_type == "jump":
                # jump steps always do a full swap of belief state
                sip1 = si_step
            else:
                # additive steps adjust the belief state like an LSTM
                # (hydra_out[1] and hydra_out[2] drive the write/erase gates)
                write_gate = T.nnet.sigmoid(2.0 + hydra_out[1])
                erase_gate = T.nnet.sigmoid(2.0 + hydra_out[2])
                sip1 = (erase_gate * si) + (write_gate * si_step)
            # update the primary policy's revelation mask
            new_to_p = (1.0 - mi_p) * p_masks
            mip1_p = mi_p + new_to_p
            # compute NLL only for the newly revealed values
            nlli = self._construct_nll_costs(sip1, self.x_out, new_to_p)
            # each loop iteration produces the following values:
            #   sip1: belief state at end of current step
            #   mip1_p: revealed values mask to use in next step (primary)
            #   mip1_q: revealed values mask to use in next step (guide)
            #   nlli: NLL for values revealed at end of current step
            #   kldi_q2p: KL(q || p) for the current step
            #   kldi_p2q: KL(p || q) for the current step
            #   kldi_p2g: KL(p || N(0,I)) for the current step
            return sip1, mip1_p, mip1_q, nlli, kldi_q2p, kldi_p2q, kldi_p2g

        # initialize belief state to self.s0
        self.s0_full = T.alloc(0.0, self.x_out.shape[0], self.s_dim) + self.s0
        # initialize revelation masks to 0 for all values in all trials
        self.m0_full = T.zeros_like(self.x_out)
        # setup initial values to pass to scan op
        # (the three non-None entries seed si, mi_p and mi_q; the None entries
        # correspond to the cost outputs, which srr_step_func does not take
        # back as arguments)
        outputs_init = [self.s0_full, self.m0_full, self.m0_full, None, None, None, None]
        sequences_init = [self.step_scales, self.zi_zmuv, self.p_masks, self.q_masks]
        # apply scan op for the sequential imputation loop
        self.scan_results, self.scan_updates = theano.scan(
            srr_step_func, outputs_info=outputs_init, sequences=sequences_init
        )

        # grab results of the scan op. all values are computed for each step
        self.si = self.scan_results[0]  # belief states
        self.mi_p = self.scan_results[1]  # primary revelation masks
        self.mi_q = self.scan_results[2]  # guide revelation masks
        self.nlli = self.scan_results[3]  # NLL on newly revealed values
        self.kldi_q2p = self.scan_results[4]  # KL(q || p)
        self.kldi_p2q = self.scan_results[5]  # KL(p || q)
        self.kldi_p2g = self.scan_results[6]  # KL(p || N(0,I))

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX(np.zeros((1,)))
        self.lr = theano.shared(value=zero_ary, name="srr_lr")
        # shared var momentum parameters for ADAM optimization
        self.mom_1 = theano.shared(value=zero_ary, name="srr_mom_1")
        self.mom_2 = theano.shared(value=zero_ary, name="srr_mom_2")
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared vars for weighting prior kld against reconstruction
        self.lam_kld_p = theano.shared(value=zero_ary, name="srr_lam_kld_p")
        self.lam_kld_q = theano.shared(value=zero_ary, name="srr_lam_kld_q")
        self.lam_kld_g = theano.shared(value=zero_ary, name="srr_lam_kld_g")
        self.lam_kld_s = theano.shared(value=zero_ary, name="srr_lam_kld_s")
        self.set_lam_kld(lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0, lam_kld_s=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name="srr_lam_l2w")
        self.set_lam_l2w(1e-5)

        # grab all of the "optimizable" parameters from the base networks
        # NOTE: p_x_given_si.mlp_params is read here regardless of the value
        # of use_p_x_given_si, so p_x_given_si must always be provided
        self.joint_params = [self.s0, self.obs_logvar, self.step_scales]
        self.joint_params.extend(self.p_zi_given_xi.mlp_params)
        self.joint_params.extend(self.p_sip1_given_zi.mlp_params)
        self.joint_params.extend(self.p_x_given_si.mlp_params)
        self.joint_params.extend(self.q_zi_given_xi.mlp_params)

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_p, self.kld_q, self.kld_g, self.kld_s = self._construct_kld_costs(p=1.0)
        # weighted sum of the four per-trial KLd terms
        self.kld_costs = (
            (self.lam_kld_p[0] * self.kld_p)
            + (self.lam_kld_q[0] * self.kld_q)
            + (self.lam_kld_g[0] * self.kld_g)
            + (self.lam_kld_s[0] * self.kld_s)
        )
        self.kld_cost = T.mean(self.kld_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        self.nll_costs = T.sum(self.nlli, axis=0)  # sum the per-step NLLs
        self.nll_cost = T.mean(self.nll_costs)
        # the bound uses the unweighted KL(q || p), not the weighted kld_costs
        self.nll_bounds = self.nll_costs.ravel() + self.kld_q.ravel()
        self.nll_bound = T.mean(self.nll_bounds)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-TRIAL COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the generator and inferencer networks
        self.joint_updates = get_adam_updates(
            params=self.joint_params,
            grads=self.joint_grads,
            alpha=self.lr,
            beta1=self.mom_1,
            beta2=self.mom_2,
            mom2_init=1e-3,
            smoothing=1e-5,
            max_grad_norm=10.0,
        )
        # fold the updates returned by scan into the training updates
        for k, v in self.scan_updates.items():
            self.joint_updates[k] = v

        # Construct theano functions for training and diagnostic computations
        print("Compiling cost computer...")
        self.compute_raw_costs = self._construct_raw_costs()
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling sequence sampler...")
        self.sequence_sampler = self._construct_sequence_sampler()
        # make easy access points for some interesting parameters
        # self.gen_inf_weights = self.p_zi_given_xi.shared_layers[0].W
        return

    def _from_si_to_x(self, si):
        """
        Convert the given si from s-space to x-space.
        """
        if self.use_p_x_given_si:
            x_pre_trans, _ = self.p_x_given_si.apply(si)
        else:
            x_pre_trans = si
        x_post_trans = self.obs_transform(x_pre_trans)
        return x_post_trans

    def set_sgd_params(self, lr=0.01, mom_1=0.9, mom_2=0.999):
        """
        Store new optimizer settings in the model's shared variables.

        lr goes into self.lr, while mom_1 and mom_2 go into self.mom_1 and
        self.mom_2 (the first and second order "momentum" terms).
        """
        def _as_shared_value(scalar):
            # shared variables hold one-element arrays, not bare scalars
            return to_fX(np.zeros((1,)) + scalar)

        self.lr.set_value(_as_shared_value(lr))
        self.mom_1.set_value(_as_shared_value(mom_1))
        self.mom_2.set_value(_as_shared_value(mom_2))
        return

    def set_lam_kld(self, lam_kld_p=0.0, lam_kld_q=1.0, lam_kld_g=0.0, lam_kld_s=0.0):
        """
        Update the weights of the KL-divergence terms relative to the
        data likelihood.

        Each argument is written to the shared variable of the same name
        on this model (self.lam_kld_p, self.lam_kld_q, ...).
        """
        def _as_shared_value(scalar):
            # shared variables hold one-element arrays, not bare scalars
            return to_fX(np.zeros((1,)) + scalar)

        self.lam_kld_p.set_value(_as_shared_value(lam_kld_p))
        self.lam_kld_q.set_value(_as_shared_value(lam_kld_q))
        self.lam_kld_g.set_value(_as_shared_value(lam_kld_g))
        self.lam_kld_s.set_value(_as_shared_value(lam_kld_s))
        return

    def set_lam_l2w(self, lam_l2w=1e-3):
        """
        Update the weight of the l2 penalty on network params.

        The value is written to the shared variable self.lam_l2w.
        """
        # shared variable holds a one-element array, not a bare scalar
        self.lam_l2w.set_value(to_fX(np.zeros((1,)) + lam_l2w))
        return

    def set_train_switch(self, switch_val=0.0):
        """
        Flip the model between training and sampling behavior.

        switch_val is snapped to 0.0 when it is below 0.5 and to 1.0
        otherwise, then written to the shared variable self.train_switch.
        """
        snapped = 0.0 if switch_val < 0.5 else 1.0
        # shared variable holds a one-element array, not a bare scalar
        self.train_switch.set_value(to_fX(np.zeros((1,)) + snapped))
        return

    def _construct_zi_zmuv(self, xo):
        """
        Draw the zero-mean, unit-variance gaussian noise consumed by the
        loop of this SRRModel for the input matrix xo.

        The result has one slice per step, one row per row of xo, and
        self.z_dim columns.
        """
        noise_shape = (self.total_steps, xo.shape[0], self.z_dim)
        return self.rng.normal(
            size=noise_shape,
            avg=0.0,
            std=1.0,
            dtype=theano.config.floatX,
        )

    def _construct_rev_masks(self, xo):
        """
        Build the per-step revelation masks for the input batch in xo.

        Returns [pmasks, qmasks]: the mask sequence for the primary policy
        (p) and the mask sequence for the guide policy (q).
        """
        if self.use_rev_masks:
            # fixed masks were provided: copy them across the batch axis
            batch_size = xo.shape[0]
            pmasks = self.rev_masks_p.dimshuffle(0, "x", 1).repeat(batch_size, axis=1)
            qmasks = self.rev_masks_q.dimshuffle(0, "x", 1).repeat(batch_size, axis=1)
            return [pmasks, qmasks]

        # otherwise, sample one random mask per revelation block
        p_steps = []
        q_steps = []
        # a mask of zeros reveals nothing
        no_reveal = T.alloc(0.0, 1, xo.shape[0], xo.shape[1])
        for block in self.rev_sched:
            # random binary mask with ones at rate block[1]
            noise = self.rng.uniform(
                size=(1, xo.shape[0], xo.shape[1]),
                low=0.0,
                high=1.0,
                dtype=theano.config.floatX,
            )
            block_mask = noise < block[1]
            # The guide policy (q) is handed the block's mask on every step
            # of the block, so it can steer toward the values about to be
            # revealed. The primary policy (p) is handed it only on the last
            # step of the block and gets the all-zero mask before that.
            #
            # With a single block of one step and a reveal rate of 1.0, this
            # schedule corresponds to a standard variational auto-encoder.
            lead_in = block[0] - 1
            p_steps.extend([no_reveal] * lead_in)
            q_steps.extend([block_mask] * lead_in)
            p_steps.append(block_mask)
            q_steps.append(block_mask)
        # stack the per-step masks into 3-tensors
        pmasks = T.cast(T.concatenate(p_steps, axis=0), "floatX")
        qmasks = T.cast(T.concatenate(q_steps, axis=0), "floatX")
        return [pmasks, qmasks]

    def _construct_nll_costs(self, si, xo, nll_mask):
        """
        Build the negative log-likelihood of xo under belief state si.

        si is first mapped into x-space. nll_mask is forwarded to the
        log-probability helper, which is selected by self.x_type. The
        result is flattened and negated.
        """
        x_hat = self._from_si_to_x(si)
        if self.x_type == "bernoulli":
            log_lik = log_prob_bernoulli(xo, x_hat, mask=nll_mask)
            return -log_lik.flatten()
        # gaussian observations use the bounded log-variance
        log_lik = log_prob_gaussian2(xo, x_hat, log_vars=self.bounded_logvar, mask=nll_mask)
        return -log_lik.flatten()

    def _construct_kld_s(self, s_i, s_j):
        """
        Build KL(s_i || s_j), treating the x-space images of both belief
        states as elementwise bernoulli means. The elementwise terms are
        summed over axis 1.
        """
        x_i = self._from_si_to_x(s_i)
        x_j = self._from_si_to_x(s_j)
        # contribution of the "on" outcome
        on_term = x_i * (T.log(x_i) - T.log(x_j))
        # contribution of the "off" outcome
        off_term = (1.0 - x_i) * (T.log(1.0 - x_i) - T.log(1.0 - x_j))
        return T.sum(on_term + off_term, axis=1)

    def _construct_kld_costs(self, p=1.0):
        """
        Build the KL-divergence parts of the cost to minimize.

        Each per-step KLd is raised elementwise to the power p, summed
        over axis 1, and then accumulated over all steps.

        Returns [kld_pi, kld_qi, kld_gi, kld_si], built from KL(p || q),
        KL(q || p), KL(p || N(0, I)) and the KLd between consecutive
        belief states, respectively.
        """
        steps = range(self.total_steps)
        # batch-shaped copy of the initial belief state
        init_state = 0.0 * self.si[0] + self.s0

        def _prev_state(t):
            # the state preceding step 0 is the initial belief state
            return init_state if t == 0 else self.si[t - 1]

        kld_pi = sum([T.sum(self.kldi_p2q[t] ** p, axis=1) for t in steps])
        kld_qi = sum([T.sum(self.kldi_q2p[t] ** p, axis=1) for t in steps])
        kld_gi = sum([T.sum(self.kldi_p2g[t] ** p, axis=1) for t in steps])
        kld_si = sum([self._construct_kld_s(self.si[t], _prev_state(t)) for t in steps])
        return [kld_pi, kld_qi, kld_gi, kld_si]

    def _construct_reg_costs(self):
        """
        Build the basic l2 regularization cost: the sum of the squared
        entries of every variable in self.joint_params.
        """
        return sum(T.sum(param ** 2.0) for param in self.joint_params)

    def _construct_compute_fe_terms(self):
        """
        Construct a function for computing terms in variational free energy.

        Returns a callable fe_term_estimator(XO, sample_count=20,
        use_guide_policy=True) that averages sample_count one-sample
        estimates and returns [mean_nll, mean_kld], each with one entry
        per row of XO. mean_kld is zeroed when use_guide_policy is False.
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # construct values to output
        nll = self.nll_costs.flatten()
        kld = self.kld_q.flatten()
        # compile theano function for a one-sample free-energy estimate
        fe_term_sample = theano.function(
            inputs=[xo],
            outputs=[nll, kld],
            givens={self.x_out: xo, self.zi_zmuv: zizmuv, self.p_masks: pmasks, self.q_masks: qmasks},
            updates=self.scan_updates,
            on_unused_input="ignore",
        )

        # construct a wrapper function for multi-sample free-energy estimate
        def fe_term_estimator(XO, sample_count=20, use_guide_policy=True):
            # cast once up front, as the other compiled-function wrappers do
            XO_fX = to_fX(XO)
            nll_sum = np.zeros((XO.shape[0],))
            kld_sum = np.zeros((XO.shape[0],))
            # set model to desired generation mode: 1.0 takes samples from
            # the guide policy, 0.0 takes samples from the primary policy
            old_switch = self.train_switch.get_value(borrow=False)
            self.set_train_switch(switch_val=(1.0 if use_guide_policy else 0.0))
            try:
                # compute a multi-sample estimate of variational free-energy
                for _ in range(sample_count):
                    result = fe_term_sample(XO_fX)
                    nll_sum += result[0].ravel()
                    kld_sum += result[1].ravel()
            finally:
                # set model back to either training or generation mode, even
                # when the compiled function raised
                self.set_train_switch(switch_val=old_switch)
            mean_nll = nll_sum / float(sample_count)
            mean_kld = kld_sum / float(sample_count)
            if not use_guide_policy:
                # no KLd if samples are from the primary policy...
                mean_kld = 0.0 * mean_kld
            return [mean_nll, mean_kld]

        return fe_term_estimator

    def _construct_raw_costs(self):
        """
        Build a function returning the raw costs, i.e. the costs before
        any lambda weighting is applied.

        The returned callable takes a batch XO and returns
        [step_nlls, step_klds, kld_q2p, kld_p2q, kld_p2g].
        """
        # symbolic input and the random inputs derived from it
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        givens = {self.x_out: xo, self.zi_zmuv: zizmuv, self.p_masks: pmasks, self.q_masks: qmasks}
        # compile theano function for computing the per-step costs
        cost_func = theano.function(
            inputs=[xo],
            outputs=[self.nlli, self.kldi_q2p, self.kldi_p2q, self.kldi_p2g],
            givens=givens,
            updates=self.scan_updates,
            on_unused_input="ignore",
        )

        def _per_latent_dim(step_klds):
            # average over the batch, then total over the steps
            return np.sum(np.mean(step_klds, axis=1, keepdims=True), axis=0)

        # batch-based estimates of the costs:
        #   step_nlls: the expected NLL cost for each step
        #   step_klds: the expected KL(q||p) cost for each step
        #   kld_q2p: the expected KL(q||p) cost for each latent dim
        #   kld_p2q: the expected KL(p||q) cost for each latent dim
        #   kld_p2g: the expected KL(p||N(0,I)) cost for each latent dim
        def raw_cost_computer(XO):
            raw = cost_func(to_fX(XO))
            kld_q2p = _per_latent_dim(raw[1])
            kld_p2q = _per_latent_dim(raw[2])
            kld_p2g = _per_latent_dim(raw[3])
            # total over latent dims, then average over the batch
            step_klds = np.mean(np.sum(raw[1], axis=2, keepdims=True), axis=1)
            step_klds = to_fX(np.asarray(list(step_klds)))
            step_nlls = np.mean(raw[0], axis=1)
            step_nlls = to_fX(np.asarray(list(step_nlls)))
            return [step_nlls, step_klds, kld_q2p, kld_p2q, kld_p2g]

        return raw_cost_computer

    def _construct_train_joint(self):
        """
        Compile the theano function that trains all networks jointly.

        The compiled function takes a batch, applies self.joint_updates,
        and returns [joint_cost, nll_bound, nll_cost, kld_cost, reg_cost,
        obs_costs].
        """
        # symbolic input and the random inputs derived from it
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        givens = {self.x_out: xo, self.zi_zmuv: zizmuv, self.p_masks: pmasks, self.q_masks: qmasks}
        return theano.function(
            inputs=[xo],
            outputs=[
                self.joint_cost,
                self.nll_bound,
                self.nll_cost,
                self.kld_cost,
                self.reg_cost,
                self.obs_costs,
            ],
            givens=givens,
            updates=self.joint_updates,
            on_unused_input="ignore",
        )

    def _construct_sequence_sampler(self):
        """
        Construct a function for sampling trajectories from the model.

        Returns a callable sample_func(XO, use_guide_policy=False) that
        returns [xm_seq, xi_seq, mi_seq]. Each is an array with
        self.total_steps + 1 slices (the initial state followed by one
        slice per step):
            xi_seq: the belief states, mapped into x-space
            mi_seq: the primary policy's revelation masks
            xm_seq: xi_seq with the revealed entries replaced by XO
        """
        # setup some symbolic variables for theano to deal with
        xo = T.matrix()
        zizmuv = self._construct_zi_zmuv(xo)
        pmasks, qmasks = self._construct_rev_masks(xo)
        # collect the outputs to return from this function
        states = [self._from_si_to_x(self.s0_full)] + [self._from_si_to_x(self.si[i]) for i in range(self.total_steps)]
        masks = [self.m0_full] + [self.mi_p[i] for i in range(self.total_steps)]
        outputs = states + masks
        # compile the theano function. Only the scan updates are applied:
        # sampling must not run the parameter updates used for training.
        func = theano.function(
            inputs=[xo],
            outputs=outputs,
            givens={self.x_out: xo, self.zi_zmuv: zizmuv, self.p_masks: pmasks, self.q_masks: qmasks},
            updates=self.scan_updates,
            on_unused_input="ignore",
        )

        # visualize trajectories generated by the model
        def sample_func(XO, use_guide_policy=False):
            # set model to desired generation mode: 1.0 takes samples from
            # the guide policy, 0.0 takes samples from the primary policy
            old_switch = self.train_switch.get_value(borrow=False)
            self.set_train_switch(switch_val=(1.0 if use_guide_policy else 0.0))
            try:
                # get belief states and masks generated by the scan loop
                scan_vals = func(to_fX(XO))
            finally:
                # set model back to either training or generation mode, even
                # when the compiled function raised
                self.set_train_switch(switch_val=old_switch)
            step_count = self.total_steps + 1
            seq_shape = (step_count, XO.shape[0], XO.shape[1])
            xm_seq = np.zeros(seq_shape).astype(theano.config.floatX)
            xi_seq = np.zeros(seq_shape).astype(theano.config.floatX)
            mi_seq = np.zeros(seq_shape).astype(theano.config.floatX)
            for i in range(step_count):
                # outputs are laid out as all states followed by all masks
                _xi = scan_vals[i]
                _mi = scan_vals[i + step_count]
                _xm = (_mi * XO) + ((1.0 - _mi) * _xi)
                xm_seq[i, :, :] = _xm
                xi_seq[i, :, :] = _xi
                mi_seq[i, :, :] = _mi
            return [xm_seq, xi_seq, mi_seq]

        return sample_func

    def save_to_file(self, f_name=None):
        """
        Dump important stuff to a Python pickle, so that we can reload this
        model later.

        Three objects are pickled to f_name, in this order:
            1. self.params, which just holds "simple" python values
            2. a copy of self.shared_param_dicts, with numpy arrays in
               place of the theano shared variables
            3. a dict of the save_to_dict() results of the four "child"
               networks, keyed by their attribute names

        Parameters:
            f_name: path of the file to write. Required, despite the
                    None default.
        """
        assert f_name is not None
        # Collect everything before touching the file, so that a failure
        # here does not truncate an existing file.
        numpy_param_dicts = {}
        for key, shared_var in self.shared_param_dicts.items():
            numpy_param_dicts[key] = shared_var.get_value(borrow=False)
        # get numpy dicts for each of the "child" models that we must save
        child_model_dicts = {
            "p_zi_given_xi": self.p_zi_given_xi.save_to_dict(),
            "p_sip1_given_zi": self.p_sip1_given_zi.save_to_dict(),
            "p_x_given_si": self.p_x_given_si.save_to_dict(),
            "q_zi_given_xi": self.q_zi_given_xi.save_to_dict(),
        }
        # open() replaces the Python-2-only file() builtin, and the with
        # block closes the handle even if a dump fails
        with open(f_name, "wb") as f_handle:
            cPickle.dump(self.params, f_handle, protocol=-1)
            cPickle.dump(numpy_param_dicts, f_handle, protocol=-1)
            cPickle.dump(child_model_dicts, f_handle, protocol=-1)
        return