Ejemplo n.º 1
0
    def cov_interface_gradients(self):
        """
        Build the cross-covariance between the gradients (dips) and the
        surface points, taken as the difference between the "rest" and the
        "reference" layer points.

        Returns:
            theano.tensor.matrix: covariance of the gradients. Shape number of points in rest x number of
              points in dip_pos
        """
        dips = self.dips_position

        def _axis_offsets(points):
            # For each of the three coordinate axes, subtract the point
            # coordinate (as a column) from the dip coordinates and stack the
            # transposed per-axis blocks vertically.
            per_axis = []
            for axis in range(3):
                coord = points[:, axis]
                per_axis.append(
                    (dips[:, axis] - coord.reshape((coord.shape[0], 1))).T)
            return T.vertical_stack(*per_axis)

        def _gradient_kernel(offsets, dist):
            # Offsets times the first-derivative polynomial, zeroed wherever
            # the distance is not below self.a_T.
            return offsets * (dist < self.a_T) * (
                -self.c_o_T *
                ((-14 / self.a_T**2) + 105 / 4 * dist / self.a_T**3 -
                 35 / 2 * dist**3 / self.a_T**5 +
                 21 / 4 * dist**5 / self.a_T**7))

        # Distances between the tiled dip positions and both point sets
        dist_rest = self.squared_euclidean_distances(
            self.dips_position_tiled, self.rest_layer_points)
        dist_ref = self.squared_euclidean_distances(
            self.dips_position_tiled, self.ref_layer_points)

        # Cartesian offsets between dips and interface points
        offsets_rest = _axis_offsets(self.rest_layer_points)
        offsets_ref = _axis_offsets(self.ref_layer_points)

        # Cross-covariance gradients / surface points: rest minus reference
        rest_term = _gradient_kernel(offsets_rest, dist_rest)
        ref_term = _gradient_kernel(offsets_ref, dist_ref)
        C_GI = self.gi_reescale * (rest_term - ref_term).T

        # Label the node so it can be recognised in the graph
        C_GI.name = 'Covariance gradient interface'

        # Optionally dump the graph when "<method name>_g" is in self.verbose
        method_name = sys._getframe().f_code.co_name
        if str(method_name) + '_g' in self.verbose:
            theano.printing.pydotprint(
                C_GI,
                outfile="graphs/" + sys._getframe().f_code.co_name + ".png",
                var_with_name_simple=True)
        return C_GI
Ejemplo n.º 2
0
    def reconstruct(self, x_in, x_out):
        """
        Encode ``x_in`` into a mixture latent, use it to initialise the
        encoder/decoder states, run ``self.iterate`` guided by ``x_out`` and
        return the final reconstruction together with the KL terms.

        Returns:
            tuple: ``(x_recons, kl_q2p, kl_p2q)`` where ``x_recons`` is the
            sigmoid of the last step of the canvas ``c`` and each KL output
            stacks the mixture-init row (shape ``(1, batch_size)``) on top of
            the rows produced by the iterative process.
        """
        n_batch = x_in.shape[0]
        dim_of = self.get_dim
        z_mix_dim = dim_of('z_mix')
        z_gen_dim = dim_of('z_gen')
        ce_dim = dim_of('c_enc')
        cd_dim = dim_of('c_dec')
        he_dim = dim_of('h_enc')
        hd_dim = dim_of('h_dec')

        # zero-mean, unit-std Gaussian noise for the mixture initialisation
        mix_noise = self.theano_rng.normal(
            size=(n_batch, z_mix_dim), avg=0., std=1.)
        # push the noise through q(z_mix | x_in)
        z_mix_mean, z_mix_logvar, z_mix = self.mix_enc_mlp.apply(
            x_in, mix_noise)

        # decode z_mix into one wide row per example and carve it into the
        # initial states: [c_dec | h_dec | c_enc | h_enc | s_mix]
        init_state = self.mix_dec_mlp.apply(z_mix)
        hd_end = cd_dim + hd_dim
        ce_end = hd_end + ce_dim
        he_end = ce_end + he_dim
        cd0 = init_state[:, :cd_dim]
        hd0 = init_state[:, cd_dim:hd_end]
        ce0 = init_state[:, hd_end:ce_end]
        he0 = init_state[:, ce_end:he_end]
        sm0 = init_state[:, he_end:]
        c0 = tensor.zeros_like(x_out) + self.c_0

        # KL terms (both directions) for the mixture init step, summed over
        # the latent axis and laid out as a single row
        q2p_elementwise = gaussian_kld(
            z_mix_mean, z_mix_logvar, self.zm_mean, self.zm_logvar)
        p2q_elementwise = gaussian_kld(
            self.zm_mean, self.zm_logvar, z_mix_mean, z_mix_logvar)
        kl_q2p_mix = tensor.sum(q2p_elementwise, axis=1).reshape((1, n_batch))
        kl_p2q_mix = tensor.sum(p2q_elementwise, axis=1).reshape((1, n_batch))

        # Gaussian noise consumed by the scan op, one slab per iteration
        gen_noise = self.theano_rng.normal(
            size=(self.n_iter, n_batch, z_gen_dim), avg=0., std=1.)

        # run the multi-stage guided generative process; only the canvas and
        # the two KL sequences are needed here
        c, _, _, _, kl_q2p_gen, kl_p2q_gen, _, _ = self.iterate(
            u=gen_noise, c=c0, h_enc=he0, c_enc=ce0,
            h_dec=hd0, c_dec=cd0, x=x_out, s_mix=sm0)

        # the reconstruction is the squashed final canvas
        x_recons = tensor.nnet.sigmoid(c[-1, :, :])
        x_recons.name = "reconstruction"

        # mixture-init KL first, then the per-iteration KLs
        kl_q2p = tensor.vertical_stack(kl_q2p_mix, kl_q2p_gen)
        kl_q2p.name = "kl_q2p"
        kl_p2q = tensor.vertical_stack(kl_p2q_mix, kl_p2q_gen)
        kl_p2q.name = "kl_p2q"
        return x_recons, kl_q2p, kl_p2q
Ejemplo n.º 3
0
def gSat(m, v=None, i=None, e=None):
    ''' Reimplementation from the PILCO matlab code. Saturates the input
    signal to -1 to 1 through the function sat(x) = (9*sin(x) +sin(3*x))/8.
    If v is not None, this function returns the output mean, covariance and
    input-output covariance for computing the joint distribution
    p(input,output) as a multivariate Gaussian.

    Args:
        m: input mean; its first dimension ``D`` sets the problem size.
        v: input covariance. When None the deterministic saturation of ``m``
            is returned.
        i: indices forwarded to ``gSin``; defaults to ``arange(D)``.
        e: per-dimension scale applied to the output. May be None (all
            ones), a list/tuple/numpy array (converted to a flat tensor) or
            an object that already provides ``astype``.

    Returns:
        The scaled saturated signal when ``v`` is None, otherwise the list
        ``[M, V, C]``.
    '''
    D = m.shape[0]

    if i is None:
        i = tt.arange(D)
    if e is None:
        e = tt.ones((D, ))
    elif isinstance(e, (list, tuple, np.ndarray)):
        # np.array is a factory function, not a type, so the array case must
        # be detected with isinstance against np.ndarray; all plain
        # containers are converted and flattened the same way.
        e = tt.as_tensor_variable(np.asarray(e)).flatten()
    e = e.astype(m.dtype)

    # if no input variance, return deterministic
    if v is None:
        return e * (9 * tt.sin(m) + tt.sin(3 * m)) / 8

    # construct joint distribution of x and 3*x
    Q = tt.vertical_stack(tt.eye(D), 3 * tt.eye(D))
    ma = Q.dot(m)
    va = Q.dot(v).dot(Q.T)

    # compute the joint distribution of 9*sin(x)/8 and sin(3*x)/8
    i1 = tt.concatenate([i, i + D])
    e1 = tt.concatenate([9.0 * e, e]) / 8.0
    M2, V2, C2 = gSin(ma, va, i1, e1)

    # P sums the two halves, giving the distribution of
    # (9*sin(x) + sin(3*x))/8
    P = tt.vertical_stack(tt.eye(D), tt.eye(D))
    # mean
    M = M2.dot(P)
    # variance
    V = P.T.dot(V2).dot(P)

    # inv input covariance dot input output covariance
    C = Q.T.dot(C2).dot(P)

    return [M, V, C]
Ejemplo n.º 4
0
def create_discriminator_func(layers, apply_updates=False):
    """Compile a Theano function that evaluates the discriminator loss.

    Args:
        layers: mapping that provides the 'l_discriminator_out',
            'l_prior_in' and 'l_encoder_in' layers.
        apply_updates: when True, the compiled function also applies
            Nesterov-momentum updates to the discriminator parameters.

    Returns:
        A compiled function taking ``(X_batch, pz_batch)`` and returning the
        mean binary cross-entropy of the discriminator.
    """
    # Graph placeholders and the actual function inputs that are swapped in
    # for them through ``givens``.
    X, pz = T.fmatrix('X'), T.fmatrix('pz')
    X_batch, pz_batch = T.fmatrix('X_batch'), T.fmatrix('pz_batch')

    # The discriminator sees samples from q(z|x) and p(z) and has to tell
    # which distribution each one came from.
    network_inputs = {
        layers['l_prior_in']: pz,
        layers['l_encoder_in']: X,
    }
    predictions = get_output(
        layers['l_discriminator_out'],
        inputs=network_inputs,
        deterministic=False,
    )

    # Targets: 1 for samples from q(z|x), 0 for samples from p(z).
    encoder_labels = T.ones((X_batch.shape[0], 1))
    prior_labels = T.zeros((pz_batch.shape[0], 1))
    targets = T.vertical_stack(encoder_labels, prior_labels)

    loss = T.mean(T.nnet.binary_crossentropy(predictions, targets))

    updates = None
    if apply_updates:
        # Restrict the update to parameters tagged as discriminator ones.
        trainable = get_all_params(
            layers['l_discriminator_out'], trainable=True, discriminator=True)
        updates = nesterov_momentum(loss, trainable, 0.1, 0.0)

    return theano.function(
        inputs=[theano.In(X_batch), theano.In(pz_batch)],
        outputs=loss,
        updates=updates,
        givens={
            X: X_batch,
            pz: pz_batch,
        },
    )
Ejemplo n.º 5
0
 def __init__(self,u_size,y_size,reservoir_size,alpha, num_max_W, memory, target_spectral):
     """Create the reservoir weights and compile ``predict`` / ``train``.

     Args:
         u_size: number of input features per timestep.
         y_size: number of outputs per timestep.
         reservoir_size: number of reservoir units; when None it defaults
             to ``u_size * memory``.
         alpha: leak rate blending the previous state with the new
             activation.
         num_max_W: how many column indices are drawn per row when weights
             are sampled (draws may repeat, so it is an upper bound on the
             number of entries written).
         memory: multiplier used only for the default reservoir size.
         target_spectral: spectral radius the reservoir matrix is rescaled
             to (skipped when its largest eigenvalue magnitude is 0).

     After construction:
         self.predict(u, timesteps) returns W_out . [1; u.T; x.T].
         self.train(u, y0, lr, timesteps) returns the summed squared error
         and applies one gradient step of size ``lr`` to ``self.W_out``.
         ``u`` holds one timestep per row and ``timesteps`` must equal its
         number of rows so the stacked blocks line up.
     """
     if reservoir_size is None:
         reservoir_size = u_size * memory

     self.alpha = alpha

     def initWeights(M, numEntries):
         # Overwrite randomly chosen columns of every row with Gaussian
         # values. numpy's randint excludes its upper bound, so the bound is
         # the column count itself; with ``shape[1] - 1`` the last column
         # could never be picked.
         for i in range(M.shape[0]):
             indices = random.randint(0, M.shape[1], numEntries)
             M[i, indices] = random.randn(1, numEntries)
         return M

     # Input weights: reservoir_size x (u_size + 1); the extra column is for
     # the constant bias input.
     self.W_in = initWeights(random.rand(reservoir_size,u_size+1),num_max_W).astype(theano.config.floatX)

     # Recurrent weights, rescaled to the requested spectral radius.
     initM = initWeights(np.zeros((reservoir_size,reservoir_size)), num_max_W)
     max_eig = np.absolute(linalg.eigvals(initM)).max()
     if max_eig != 0:
         initM = initM*target_spectral/max_eig
     self.W = initM.astype(theano.config.floatX)

     # Readout weights: the only parameters that are trained.
     self.W_out = theano.shared(np.zeros((y_size,reservoir_size + u_size +1)).astype(theano.config.floatX))

     def recurrence(u_t,prevX):
         # Prepend the constant 1 to the input vector, then apply the leaky
         # tanh state update.
         bias = T.as_tensor_variable(np.ones((1,1)).astype(theano.config.floatX))
         biased_input = T.vertical_stack(bias,u_t[:,np.newaxis])[:,0]
         drive = T.dot(self.W_in, biased_input) + T.dot(self.W,prevX)
         return (1-self.alpha)*prevX + self.alpha*T.tanh(drive)

     u = T.fmatrix()
     # Reservoir states for the whole sequence, starting from a zero state.
     x,_ = theano.scan(fn = recurrence, sequences=u, outputs_info=[T.zeros((reservoir_size,)).astype(theano.config.floatX)])
     timesteps = T.iscalar()
     y = T.dot(self.W_out, T.vertical_stack(T.ones((1,timesteps)).astype(theano.config.floatX),u.T,x.T))
     self.predict = theano.function(inputs=[u,timesteps],outputs=y)

     # Targets, one row per timestep.
     y0 = T.fmatrix()
     cost = T.sum((y.T - y0)**2)
     g = T.grad(cost,self.W_out)
     lr = T.scalar()
     updates = OrderedDict([(self.W_out, self.W_out - lr*g)])
     self.train = theano.function(inputs=[u,y0,lr,timesteps],outputs=cost,updates=updates,on_unused_input='warn')
Ejemplo n.º 6
0
    def atData(input, left, right):
        """Build an attention-pooled sentence representation.

        The first element of ``input`` is taken as the sentence. The rows at
        the two entity positions (``left`` / ``right``, offset by
        ``_N_PAD_HEAD``) are removed from it, and their concatenation seeds
        an attention loop that runs ``NUMBER_DATA`` times. The output of the
        final iteration is returned.
        """
        sentence = input[0]

        # Order the two positions so that ``first`` <= ``second``.
        first = T.switch(T.lt(left, right), left, right)
        second = T.switch(T.lt(left, right), right, left)
        first_row = first + _N_PAD_HEAD
        second_row = second + _N_PAD_HEAD

        # Sentence with both entity rows removed
        # (original author's shape note: 86x60).
        newSentence = T.vertical_stack(sentence[:first_row],
                                       sentence[(first_row + 1):second_row],
                                       sentence[(second_row + 1):])

        # Concatenated entity vectors: the initial attention context.
        LRConnect = T.concatenate([sentence[first_row], sentence[second_row]])

        def AtLayerData(context, sentence_rows):
            # Append the current context to every word row
            # (original author's shape note: 86x180).
            extended, _ = theano.scan(
                lambda word: T.concatenate([word, context]),
                sequences=sentence_rows)

            # Score every word and normalise the scores with a softmax.
            scores = T.dot(extended, WForATData)
            attention = T.nnet.softmax(scores)[0]

            # Scale each word row by its attention weight
            # (original author's shape note: 86x60).
            weighted, _ = theano.scan(
                lambda word, weight: word * weight,
                sequences=[newSentence, attention])

            # Sum over words (original author's shape note: 1x60) and
            # project with linearW to obtain the next context.
            pooled = T.sum(weighted, axis=0)
            return T.dot(pooled, linearW)

        history, _ = theano.scan(AtLayerData,
                                 outputs_info=LRConnect,
                                 non_sequences=newSentence,
                                 n_steps=NUMBER_DATA)

        return history[-1]
Ejemplo n.º 7
0
    def __init__(self,
                 input=None,
                 n_visible=16,
                 n_hidden=20,
                 W=None,
                 hbias=None,
                 vbias=None,
                 numpy_rng = None,
                 theano_rng=None,
                 batch_size=0, t_batch_size=1, 
                 n_beta=10, beta_lbound=0., tau=None):
        """Set up the model parameters and all tempering bookkeeping.

        Args:
            input: symbolic input; a fresh ``T.matrix('input')`` when None.
            n_visible: number of visible units (rows of W).
            n_hidden: number of hidden units (columns of W).
            W, hbias, vbias: optional pre-built shared parameters; created
                here when None.
            numpy_rng: numpy random state; ``RandomState(1234)`` when None.
            theano_rng: symbolic random stream; seeded from ``numpy_rng``
                when None.
            batch_size: size of the T=1 minibatch.
            t_batch_size: size of the tempered minibatch.
            n_beta: number of temperatures.
            beta_lbound: last value of the linearly spaced inverse
                temperatures, which start at 1.
            tau: time constant for the moving averages; when falsy,
                ``1 / avg_rtime`` is used instead.
        """
        self.n_visible, self.n_hidden = n_visible, n_hidden
        self.t_batch_size = t_batch_size  # size of tempered minibatch
        self.batch_size = batch_size      # size of T=1 minibatch

        # --- random number sources -------------------------------------
        if numpy_rng is None:
            numpy_rng = numpy.random.RandomState(1234)
        if theano_rng is None:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))
        self.rng, self.theano_rng = numpy_rng, theano_rng

        # --- model parameters ------------------------------------------
        if W is None:
            W = theano.shared(
                value=numpy.asarray(
                    0.01 * numpy_rng.randn(n_visible, n_hidden),
                    dtype=theano.config.floatX),
                name='W',
                borrow=True)
        self.W = W
        self.hbias = (sharedX(numpy.zeros(n_hidden), 'hbias')
                      if hbias is None else hbias)
        self.vbias = (sharedX(numpy.zeros(n_visible), 'vbias')
                      if vbias is None else vbias)
        self.input = T.matrix('input') if input is None else input

        # Indexing conventions used below:
        #   by batch_size + mixstat : buffer, E
        #   by mixstat              : beta, labels, rtime
        #   by temperature index    : mixstat, fup_target, nup, ndown,
        #                             swapstat

        # --- tempering state -------------------------------------------
        n_chain = t_batch_size * n_beta
        n_slots = 2 * n_chain  # per-chain arrays are allocated at twice n_chain
        # number of active chains in the buffer array
        self.n_chain = theano.shared(n_chain, name='n_chain')
        # number of temperatures in the system
        self.n_beta = theano.shared(n_beta, name='n_beta')
        self.n_chain_total = batch_size + self.n_chain

        # negative particles and the matching mean-field activations
        particles = self.rng.randint(
            0, 2, size=(batch_size + n_slots, n_visible))
        self._buffer = sharedX(particles, name='buffer')
        self.buffer = self._buffer[:self.n_chain_total]
        self.mf_buffer = sharedX(numpy.zeros_like(particles),
                                 name='mf_buffer')

        # energy of the current negative particles (at T=1)
        self._E = sharedX(numpy.zeros(batch_size + n_slots), name='E')
        self.E = self._E[:self.n_chain_total]

        # inverse temperatures: the same linear ramp from 1 to beta_lbound,
        # repeated once per tempered-minibatch element
        inv_temps = numpy.zeros(n_slots)
        inv_temps[:n_chain] = numpy.tile(
            numpy.linspace(1, beta_lbound, n_beta), t_batch_size)
        self._beta = sharedX(inv_temps, name='beta')
        self.beta = self._beta[:self.n_chain]

        # multiplies the rows of "W x + b": ones for the T=1 minibatch rows,
        # then one beta per chain
        self.beta_matrix = T.vertical_stack(
            T.alloc(1.0, batch_size, 1),
            self.beta.dimshuffle([0, 'x']))

        # map from (tempered example, temperature) to a chain row
        chain_map = numpy.zeros((t_batch_size, 2 * n_beta), dtype='int32')
        chain_map[:, :n_beta] = numpy.arange(n_chain).reshape(
            t_batch_size, n_beta)
        self._mixstat = theano.shared(chain_map, name='mixstat')
        self.mixstat = self._mixstat[:, :self.n_beta]

        # --- particle properties ---------------------------------------
        # labels: 1 means going up in temperature, 0 going down; the chains
        # listed in the first mixstat column start as "up"
        direction = LBL_NONE * numpy.ones(n_slots, dtype='int32')
        direction[chain_map[:, 0]] = LBL_UP
        self.labels = theano.shared(direction, name='labels')

        # return time
        self.rtime = theano.shared(numpy.zeros(n_slots, dtype='int32'),
                                   name='rtime')
        self.avg_rtime = sharedX(rtime_deo(0.4, n_beta), name='avg_rtime')

        # --- temperature properties ------------------------------------
        def _padded_ramp(start, stop):
            # length-2*n_beta array: linear ramp in the first n_beta
            # entries, zeros afterwards
            ramp = numpy.zeros(2 * n_beta)
            ramp[:n_beta] = numpy.linspace(start, stop, n_beta)
            return ramp

        # fup target for each chain (this shouldn't change very often)
        self._fup_target = sharedX(_padded_ramp(1, 0), name='fup_target')
        self.fup_target = self._fup_target[:self.n_beta]

        # histogram of up-moving particles
        self._nup = sharedX(_padded_ramp(1, 0), name='nup')
        self.nup = self._nup[:self.n_beta]

        # histogram of down-moving particles
        self._ndown = sharedX(_padded_ramp(0, 1), name='ndown')
        self.ndown = self._ndown[:self.n_beta]

        # the return time is the time constant of all moving averages unless
        # an explicit tau was given
        self.tau = T.as_tensor(tau) if tau else 1. / self.avg_rtime
        self.get_tau = theano.function([], self.tau)

        # --- parallel-tempering ops ------------------------------------
        self._swapstat = sharedX(numpy.zeros(2 * n_beta), name='swapstat')
        self.swapstat = self._swapstat[:self.n_beta]

        self.pt_swaps = PT_Swaps(rng=self.rng)
        self.pt_swap_t1_sample = PT_SwapT1Sample(
            rng=self.rng, batch_size=self.batch_size)
Ejemplo n.º 8
0
 def recurrence(u_t, prevX):
     """Leaky reservoir update for one input vector ``u_t``."""
     # Prepend a constant 1 (bias input) to the input vector.
     bias = T.as_tensor_variable(np.ones((1,1)).astype(theano.config.floatX))
     biased_input = T.vertical_stack(bias, u_t[:,np.newaxis])[:,0]
     # Input drive plus recurrent drive, squashed and blended with the old
     # state according to the leak rate.
     drive = T.dot(self.W_in, biased_input) + T.dot(self.W,prevX)
     return (1-self.alpha)*prevX + self.alpha*T.tanh(drive)
Ejemplo n.º 9
0
    def __init__(self, u_size, y_size, reservoir_size, alpha, num_max_W,
                 memory, target_spectral):
        """Create the reservoir weights and compile ``predict`` / ``train``.

        Args:
            u_size: number of input features per timestep.
            y_size: number of outputs per timestep.
            reservoir_size: number of reservoir units; when None it defaults
                to ``u_size * memory``.
            alpha: leak rate blending the previous state with the new
                activation.
            num_max_W: how many column indices are drawn per row when
                weights are sampled (draws may repeat, so it is an upper
                bound on the number of entries written).
            memory: multiplier used only for the default reservoir size.
            target_spectral: spectral radius the reservoir matrix is
                rescaled to (skipped when its largest eigenvalue magnitude
                is 0).

        After construction:
            self.predict(u, timesteps) returns W_out . [1; u.T; x.T].
            self.train(u, y0, lr, timesteps) returns the summed squared
            error and applies one gradient step of size ``lr`` to
            ``self.W_out``. ``u`` holds one timestep per row and
            ``timesteps`` must equal its number of rows so the stacked
            blocks line up.
        """
        if reservoir_size is None:
            reservoir_size = u_size * memory

        self.alpha = alpha

        def initWeights(M, numEntries):
            # Overwrite randomly chosen columns of every row with Gaussian
            # values. numpy's randint excludes its upper bound, so the bound
            # is the column count itself; with ``shape[1] - 1`` the last
            # column could never be picked.
            n_cols = M.shape[1]
            for row in range(M.shape[0]):
                indices = random.randint(0, n_cols, numEntries)
                M[row, indices] = random.randn(1, numEntries)
            return M

        # Input weights: reservoir_size x (u_size + 1); the extra column is
        # for the constant bias input.
        self.W_in = initWeights(random.rand(reservoir_size, u_size + 1),
                                num_max_W).astype(theano.config.floatX)

        # Recurrent weights, rescaled to the requested spectral radius.
        initM = initWeights(np.zeros((reservoir_size, reservoir_size)),
                            num_max_W)
        max_eig = np.absolute(linalg.eigvals(initM)).max()
        if max_eig != 0:
            initM = initM * target_spectral / max_eig
        self.W = initM.astype(theano.config.floatX)

        # Readout weights: the only parameters that are trained.
        self.W_out = theano.shared(
            np.zeros(
                (y_size,
                 reservoir_size + u_size + 1)).astype(theano.config.floatX))

        def recurrence(u_t, prevX):
            # Prepend the constant 1 to the input vector, then apply the
            # leaky tanh state update.
            bias = T.as_tensor_variable(
                np.ones((1, 1)).astype(theano.config.floatX))
            biased_input = T.vertical_stack(bias, u_t[:, np.newaxis])[:, 0]
            drive = T.dot(self.W_in, biased_input) + T.dot(self.W, prevX)
            return (1 - self.alpha) * prevX + self.alpha * T.tanh(drive)

        u = T.fmatrix()
        # Reservoir states for the whole sequence, starting from zeros.
        x, _ = theano.scan(fn=recurrence,
                           sequences=u,
                           outputs_info=[
                               T.zeros((reservoir_size, )).astype(
                                   theano.config.floatX)
                           ])
        timesteps = T.iscalar()
        y = T.dot(
            self.W_out,
            T.vertical_stack(
                T.ones((1, timesteps)).astype(theano.config.floatX), u.T, x.T))
        self.predict = theano.function(inputs=[u, timesteps], outputs=y)

        # Targets, one row per timestep.
        y0 = T.fmatrix()
        cost = T.sum((y.T - y0)**2)
        g = T.grad(cost, self.W_out)
        lr = T.scalar()
        updates = OrderedDict([(self.W_out, self.W_out - lr * g)])
        self.train = theano.function(inputs=[u, y0, lr, timesteps],
                                     outputs=cost,
                                     updates=updates,
                                     on_unused_input='warn')
Ejemplo n.º 10
0
    def __init__(self, rng=None, Xd=None, Xc=None, Xm=None, Xt=None, \
                 i_net=None, g_net=None, d_net=None, chain_len=None, \
                 data_dim=None, prior_dim=None, params=None):
        """Unroll a chain of inferencer/generator pairs, attach a
        discriminator, and build costs, gradients, updates and the joint
        training function.

        Args:
            rng: random source providing ``randint``; it seeds the symbolic
                random stream and is passed on to every cloned network.
                Despite the ``None`` default it is used unconditionally.
            Xd: symbolic var for inputting samples that initialize the chain.
            Xc: symbolic var for controlling subsets of the state variables.
            Xm: symbolic var for masking subsets of the state variables.
            Xt: symbolic var for inputting samples from the target
                distribution.
            i_net: network handed to OneStageModel as ``q_z_given_x``.
            g_net: network handed to OneStageModel as ``p_x_given_z``.
            d_net: discriminator network; a shared-parameter clone of it is
                fed the target samples stacked on top of all chain outputs.
            chain_len: integer number of times to cycle the VAE loop.
            data_dim: passed to OneStageModel as ``x_dim``.
            prior_dim: passed to OneStageModel as ``z_dim``.
            params: dict of options. Optional keys read here: 'cost_decay'
                (default 0.1), 'chain_type' ('walkback' or 'walkout',
                default 'walkout'), 'xt_transform' ('sigmoid' or 'none',
                default sigmoid), 'logvar_bound' (default 10). Required
                keys: 'x_type' ('bernoulli' or 'gaussian') and 'lam_l2d'.
        """
        # Seed the symbolic random stream and record the basic sizes and
        # prior settings.
        self.rng = RandStream(rng.randint(100000))
        self.data_dim = data_dim
        self.prior_dim = prior_dim
        self.prior_mean = 0.0
        self.prior_logvar = 0.0
        if params is None:
            self.params = {}
        else:
            self.params = params
        if 'cost_decay' in self.params:
            self.cost_decay = self.params['cost_decay']
        else:
            self.cost_decay = 0.1
        if 'chain_type' in self.params:
            assert((self.params['chain_type'] == 'walkback') or \
                (self.params['chain_type'] == 'walkout'))
            self.chain_type = self.params['chain_type']
        else:
            self.chain_type = 'walkout'
        if 'xt_transform' in self.params:
            assert((self.params['xt_transform'] == 'sigmoid') or \
                    (self.params['xt_transform'] == 'none'))
            if self.params['xt_transform'] == 'sigmoid':
                self.xt_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.xt_transform = lambda x: x
        else:
            self.xt_transform = lambda x: T.nnet.sigmoid(x)
        if 'logvar_bound' in self.params:
            self.logvar_bound = self.params['logvar_bound']
        else:
            self.logvar_bound = 10
        #
        # x_type: this tells if we're using bernoulli or gaussian model for
        #         the observations
        #
        # NOTE: 'x_type' has no default, so a missing key raises KeyError.
        self.x_type = self.params['x_type']
        assert ((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))

        # symbolic var for inputting samples for initializing the VAE chain
        self.Xd = Xd
        # symbolic var for masking subsets of the state variables
        self.Xm = Xm
        # symbolic var for controlling subsets of the state variables
        self.Xc = Xc
        # symbolic var for inputting samples from the target distribution
        self.Xt = Xt
        # integer number of times to cycle the VAE loop
        self.chain_len = chain_len

        # symbolic matrix of indices for data inputs
        self.It = T.arange(self.Xt.shape[0])
        # symbolic matrix of indices for noise/generated inputs
        # (offset by the number of data rows, which come first)
        self.Id = T.arange(
            self.chain_len * self.Xd.shape[0]) + self.Xt.shape[0]

        # get a clone of the desired VAE, for easy access
        self.OSM = OneStageModel(rng=rng, Xd=self.Xd, Xc=self.Xc, Xm=self.Xm, \
                p_x_given_z=g_net, q_z_given_x=i_net, x_dim=self.data_dim, \
                z_dim=self.prior_dim, params=self.params)
        self.IN = self.OSM.q_z_given_x
        self.GN = self.OSM.p_x_given_z
        self.transform_x_to_z = self.OSM.transform_x_to_z
        self.transform_z_to_x = self.OSM.transform_z_to_x
        self.bounded_logvar = self.OSM.bounded_logvar
        # self-loop some clones of the main VAE into a chain.
        # ** All VAEs in the chain share the same Xc and Xm, which are the
        #    symbolic inputs for providing the observed portion of the input
        #    and a mask indicating which part of the input is "observed".
        #    These inputs are used for training "reconstruction" policies.
        self.IN_chain = []
        self.GN_chain = []
        self.Xg_chain = []
        _Xd = self.Xd
        print("Unrolling chain...")
        for i in range(self.chain_len):
            # create a VAE infer/generate pair with _Xd as input and with
            # masking variables shared by all VAEs in this chain
            _IN = self.IN.shared_param_clone(rng=rng, \
                    Xd=apply_mask(Xd=_Xd, Xc=self.Xc, Xm=self.Xm), \
                    build_funcs=False)
            _GN = self.GN.shared_param_clone(rng=rng, Xd=_IN.output, \
                    build_funcs=False)
            # the transformed generator mean becomes the next step's input
            _Xd = self.xt_transform(_GN.output_mean)
            self.IN_chain.append(_IN)
            self.GN_chain.append(_GN)
            self.Xg_chain.append(_Xd)
            print("    step {}...".format(i))

        # make a clone of the desired discriminator network, which will try
        # to discriminate between samples from the training data and samples
        # generated by the self-looped VAE chain.
        self.DN = d_net.shared_param_clone(rng=rng, \
                Xd=T.vertical_stack(self.Xt, *self.Xg_chain))

        # one-element zero array used as the initial value of every scalar
        # hyperparameter shared variable below
        zero_ary = np.zeros((1, )).astype(theano.config.floatX)
        # init shared var for weighting nll of data given posterior sample
        self.lam_chain_nll = theano.shared(value=zero_ary,
                                           name='vcg_lam_chain_nll')
        self.set_lam_chain_nll(lam_chain_nll=1.0)
        # init shared var for weighting posterior KL-div from prior
        self.lam_chain_kld = theano.shared(value=zero_ary,
                                           name='vcg_lam_chain_kld')
        self.set_lam_chain_kld(lam_chain_kld=1.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='vcg_lam_l2w')
        self.set_lam_l2w(lam_l2w=1e-4)
        # shared var learning rates for all networks
        self.lr_dn = theano.shared(value=zero_ary, name='vcg_lr_dn')
        self.lr_gn = theano.shared(value=zero_ary, name='vcg_lr_gn')
        self.lr_in = theano.shared(value=zero_ary, name='vcg_lr_in')
        # shared var momentum parameters for all networks
        self.mom_1 = theano.shared(value=zero_ary, name='vcg_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='vcg_mom_2')
        self.it_count = theano.shared(value=zero_ary, name='vcg_it_count')
        # shared var weights for adversarial classification objective
        self.dw_dn = theano.shared(value=zero_ary, name='vcg_dw_dn')
        self.dw_gn = theano.shared(value=zero_ary, name='vcg_dw_gn')
        # init parameters for controlling learning dynamics
        self.set_all_sgd_params()

        self.set_disc_weights()  # init adversarial cost weights for GN/DN
        # set a shared var for regularizing the output of the discriminator
        # NOTE(review): this reads the raw ``params`` argument rather than
        # self.params, so the caller must supply a dict with 'lam_l2d'.
        self.lam_l2d = theano.shared(value=(zero_ary + params['lam_l2d']), \
                name='vcg_lam_l2d')

        # Grab the full set of "optimizable" parameters from the generator
        # and discriminator networks that we'll be working with. We need to
        # ignore parameters in the final layers of the proto-networks in the
        # discriminator network (a generalized pseudo-ensemble). We ignore them
        # because the VCGair requires that they be "bypassed" in favor of some
        # binary classification layers that will be managed by this VCGair.
        self.dn_params = []
        for pn in self.DN.proto_nets:
            for pnl in pn[0:-1]:
                self.dn_params.extend(pnl.params)
        self.in_params = [p for p in self.IN.mlp_params]
        self.in_params.append(self.OSM.output_logvar)
        self.gn_params = [p for p in self.GN.mlp_params]
        self.joint_params = self.in_params + self.gn_params + self.dn_params

        # Now construct a binary discriminator layer for each proto-net in the
        # discriminator network. And, add their params to optimization list.
        self._construct_disc_layers(rng)
        self.disc_reg_cost = self.lam_l2d[0] * \
                T.sum([dl.act_l2_sum for dl in self.disc_layers])

        # Construct costs for the generator and discriminator networks based
        # on adversarial binary classification
        self.disc_cost_dn, self.disc_cost_gn = self._construct_disc_costs()

        # first, build the cost to be optimized by the discriminator network,
        # in general this will be treated somewhat independently of the
        # optimization of the generator and inferencer networks.
        self.dn_cost = self.disc_cost_dn + self.DN.act_reg_cost + \
                self.disc_reg_cost

        # construct costs relevant to the optimization of the generator and
        # discriminator networks
        self.chain_nll_cost = self.lam_chain_nll[0] * \
                self._construct_chain_nll_cost(cost_decay=self.cost_decay)
        self.chain_kld_cost = self.lam_chain_kld[0] * \
                self._construct_chain_kld_cost(cost_decay=self.cost_decay)
        self.other_reg_cost = self._construct_other_reg_cost()
        self.osm_cost = self.disc_cost_gn + self.chain_nll_cost + \
                self.chain_kld_cost + self.other_reg_cost
        # compute total cost on the discriminator and VB generator/inferencer
        self.joint_cost = self.dn_cost + self.osm_cost

        # Get the gradient of the joint cost for all optimizable parameters
        # (discriminator params are differentiated against dn_cost, the
        # inferencer and generator params against osm_cost)
        self.joint_grads = OrderedDict()
        print("Computing VCGLoop DN cost gradients...")
        grad_list = T.grad(self.dn_cost,
                           self.dn_params,
                           disconnected_inputs='warn')
        for i, p in enumerate(self.dn_params):
            self.joint_grads[p] = grad_list[i]
        print("Computing VCGLoop IN cost gradients...")
        grad_list = T.grad(self.osm_cost,
                           self.in_params,
                           disconnected_inputs='warn')
        for i, p in enumerate(self.in_params):
            self.joint_grads[p] = grad_list[i]
        print("Computing VCGLoop GN cost gradients...")
        grad_list = T.grad(self.osm_cost,
                           self.gn_params,
                           disconnected_inputs='warn')
        for i, p in enumerate(self.gn_params):
            self.joint_grads[p] = grad_list[i]

        # construct the updates for the discriminator, generator and
        # inferencer networks. all networks share the same first/second
        # moment momentum and iteration count. the networks each have their
        # own learning rates, which lets you turn their learning on/off.
        self.dn_updates = get_param_updates(params=self.dn_params, \
                grads=self.joint_grads, alpha=self.lr_dn, \
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
                mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)
        self.gn_updates = get_param_updates(params=self.gn_params, \
                grads=self.joint_grads, alpha=self.lr_gn, \
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
                mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)
        self.in_updates = get_param_updates(params=self.in_params, \
                grads=self.joint_grads, alpha=self.lr_in, \
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
                mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)

        # bag up all the updates required for training
        # (later groups overwrite earlier ones on any shared key)
        self.joint_updates = OrderedDict()
        for k in self.dn_updates:
            self.joint_updates[k] = self.dn_updates[k]
        for k in self.gn_updates:
            self.joint_updates[k] = self.gn_updates[k]
        for k in self.in_updates:
            self.joint_updates[k] = self.in_updates[k]
        # construct an update for tracking the mean KL divergence of
        # approximate posteriors for this chain
        # (moving average: 0.98 of the old value plus 0.02 of the chain mean)
        new_kld_mean = (0.98 * self.IN.kld_mean) + ((0.02 / self.chain_len) * \
            sum([T.mean(I_N.kld_cost) for I_N in self.IN_chain]))
        self.joint_updates[self.IN.kld_mean] = T.cast(new_kld_mean, 'floatX')

        # construct the function for training on training data
        print("Compiling VCGLoop theano functions....")
        self.train_joint = self._construct_train_joint()
        return
Ejemplo n.º 11
0
 def recurrence(u_t,prevX):
     """
     Compute the next state from the current input and the previous state.

     The input vector ``u_t`` is extended with a leading constant 1 before
     being multiplied by ``self.W_in``; the previous state is multiplied by
     ``self.W``.  The tanh of their sum is blended with the previous state
     using ``self.alpha`` as the mixing weight.

     Args:
         u_t: symbolic input vector for the current step.
         prevX: symbolic state vector from the previous step.

     Returns:
         Symbolic state vector for the current step.
     """
     # (1, 1) block holding the constant that is stacked on top of the input
     unit_block = T.as_tensor_variable(
         np.ones((1, 1)).astype(theano.config.floatX))
     # Stack [1; u_t] as a column, then flatten back to a vector
     extended_input = T.vertical_stack(unit_block, u_t[:, np.newaxis])[:, 0]
     drive = T.dot(self.W_in, extended_input) + T.dot(self.W, prevX)
     x_t = (1 - self.alpha) * prevX + self.alpha * T.tanh(drive)
     return x_t
Ejemplo n.º 12
0
    def cov_gradients(self, verbose=0):
        """
        Build the symbolic covariance matrix of the gradients (dips).

        Args:
            verbose (int): when greater than 1 the graph of the result is
                drawn with ``theano.printing.pydotprint`` into
                ``graphs/cov_gradients.png``.  Independently of this argument,
                ``self.verbose`` is checked for the entries 'sed_dips_dips'
                and the name of this method to attach Print ops.

        Returns:
            theano.tensor.matrix: covariance of the gradients.  It has the
            shape of the distance matrix computed between
            ``self.dips_position_tiled`` and itself.
        """
        this_function = sys._getframe().f_code.co_name

        # Distances between every pair of (tiled) dip positions
        sed_dips_dips = self.squared_euclidean_distances(
            self.dips_position_tiled, self.dips_position_tiled)

        if 'sed_dips_dips' in self.verbose:
            sed_dips_dips = theano.printing.Print('sed_dips_dips')(
                sed_dips_dips)

        # Cartesian distances between dips positions: one block per axis
        # (x, y, z), each tiled n_dimensions times, stacked vertically
        axis_blocks = []
        for axis in (0, 1, 2):
            coords = self.dips_position[:, axis]
            pairwise_diff = coords - coords.reshape((coords.shape[0], 1))
            axis_blocks.append(T.tile(pairwise_diff, self.n_dimensions))
        h_u = T.vertical_stack(*axis_blocks)

        # Transpose
        h_v = h_u.T

        # Perpendicularity matrix: ones on the three diagonal blocks
        # (x-x, y-y, z-z), zeros elsewhere.  It separates the covariance of
        # each gradient direction from the cross-covariances.
        n_dips = self.dips_position.shape[0]
        block_bounds = (
            (0, n_dips),
            (n_dips, n_dips * 2),
            (n_dips * 2, n_dips * 3),
        )
        perpendicularity_matrix = T.zeros_like(sed_dips_dips)
        for start, stop in block_bounds:
            perpendicularity_matrix = T.set_subtensor(
                perpendicularity_matrix[start:stop, start:stop], 1)

        # Mask selecting the pairs closer than the range a_T
        within_range = sed_dips_dips < self.a_T

        # Polynomial shared by both "first derivative" terms
        first_deriv_poly = (
            (-14 / self.a_T**2) + 105 / 4 * sed_dips_dips / self.a_T**3
            - 35 / 2 * sed_dips_dips**3 / self.a_T**5 +
            21 / 4 * sed_dips_dips**5 / self.a_T**7)

        # Polynomial of the "second derivative" term
        second_deriv_poly = (
            9 * sed_dips_dips**5 - 20 * self.a_T**2 * sed_dips_dips**3 +
            15 * self.a_T**4 * sed_dips_dips - 4 * self.a_T**5)

        first_deriv_term = within_range * (-self.c_o_T * first_deriv_poly)
        second_deriv_term = (within_range * self.c_o_T * 7 *
                             second_deriv_poly / (2 * self.a_T**7))
        same_axis_term = (perpendicularity_matrix * within_range *
                          self.c_o_T * first_deriv_poly)

        # Covariance matrix for gradients at every xyz direction and their
        # cross-covariances (following Chiles book).  Entries whose distance
        # is exactly 0 are set to 0 instead of evaluating the expression.
        C_G = T.switch(
            T.eq(sed_dips_dips, 0),
            0,
            (h_u * h_v / sed_dips_dips**2) *
            (first_deriv_term + second_deriv_term) - same_axis_term)

        # Setting nugget effect of the gradients
        # TODO: This function can be substituted by simply adding the nugget
        # effect to the diag if the condition above is removed
        C_G += T.eye(C_G.shape[0]) * self.nugget_effect_grad_T

        # Add name to the theano node
        C_G.name = 'Covariance Gradient'

        if verbose > 1:
            theano.printing.pydotprint(
                C_G,
                outfile="graphs/" + this_function + ".png",
                var_with_name_simple=True)

        if str(this_function) in self.verbose:
            C_G = theano.printing.Print('Cov Gradients')(C_G)

        return C_G
Ejemplo n.º 13
0
    def __init__(self, input=None, n_visible=784, n_hidden=500, \
                 W=None, hbias=None, vbias=None, 
                 seed = None, theano_rng=None,
                 batch_size=0, t_batch_size=1, 
                 n_beta=10, beta_lbound=0., tau=None):
        """
        RBM constructor. Defines the parameters of the model along with
        the shared buffers and bookkeeping arrays used for tempering.

        :param input: None for standalone RBMs or symbolic variable if RBM is
         part of a larger graph.
        :param n_visible: number of visible units
        :param n_hidden: number of hidden units
        :param W: None for standalone RBMs or symbolic variable pointing to a
         shared weight matrix in case RBM is part of a DBN network; in a DBN,
         the weights are shared between RBMs and layers of a MLP
        :param hbias: None for standalone RBMs or symbolic variable pointing 
         to a shared hidden units bias vector in case RBM is part of a 
         different network
        :param vbias: None for standalone RBMs or a symbolic variable 
         pointing to a shared visible units bias
        :param seed: seed for the numpy RandomState; 123 is used when None
        :param theano_rng: random stream to use; when None a RandomStreams
         is created and seeded from the numpy RandomState
        :param batch_size: size of the T=1 minibatch
        :param t_batch_size: size of the tempered minibatch; when positive it
         must divide batch_size
        :param n_beta: number of temperatures in the system
        :param beta_lbound: last value of the inverse temperatures, which are
         spaced linearly starting from 1
        :param tau: optional fixed time constant (overrides return time)
        """
        # Either tempering is on (several temperatures and a non-empty
        # tempered batch) or it is fully off (one temperature, empty batch).
        assert (n_beta > 1 and t_batch_size > 0) or (n_beta==1 and t_batch_size==0)
        if t_batch_size > 0: assert batch_size%t_batch_size==0

        self.n_visible = n_visible
        self.n_hidden  = n_hidden
        self.t_batch_size = t_batch_size  # size of tempered minibatch
        self.batch_size = batch_size # size of T=1 minibatch
  
        # deal with random number generation
        if seed is None:
            rng = numpy.random.RandomState(123)
        else:
            rng = numpy.random.RandomState(seed)
        if theano_rng is None:
            # NOTE: this draw consumes one value from rng, so the values
            # drawn below depend on whether theano_rng was supplied.
            theano_rng = RandomStreams(rng.randint(2**30))
        self.rng = rng
        self.theano_rng = theano_rng

        if W is None : 
           # W is initialized with `initial_W`, drawn from a standard normal
           # distribution (rng.randn) and scaled by 0.01
           initial_W = 0.01 * self.rng.randn(n_visible, n_hidden)
           # theano shared variables for weights and biases
           W = sharedX(initial_W, 'W')
        self.W = W

        if hbias is None :
           # create shared variable for hidden units bias
           hbias = sharedX(numpy.zeros(n_hidden), 'hbias')
        self.hbias = hbias

        if vbias is None :
           # create shared variable for visible units bias
           vbias = sharedX(numpy.zeros(n_visible), 'vbias')
        self.vbias = vbias

        # initialize input layer for standalone RBM or layer0 of DBN
        if input is None:
            input = T.matrix('input')
        self.input = input 

        #########################################################################
        # Fields indexed by batch_size + mixstat:    buffer, E
        # Fields indexed by mixstat:    beta, labels, rtime
        # Fields indexed by temp index: mixstat, fup_target, nup, ndown, swapstat
        #########################################################################

        ### initialize tempering stuff ###
        n_chain = t_batch_size * n_beta
        self.n_chain = theano.shared(n_chain, name='n_chain') # number of active chains in buffer array
        self.n_beta  = theano.shared(n_beta, name='n_beta')   # number of temperatures in system
        self.n_chain_total = batch_size + self.n_chain

        # configure buffers for negative particles
        # The underlying arrays are allocated with 2*n_chain tempered rows;
        # only the first n_chain_total rows are exposed through self.buffer.
        # NOTE(review): the spare rows presumably leave room for n_chain /
        # n_beta to grow at runtime -- confirm against the code that
        # updates these shared variables.
        _buffer = self.rng.randint(0,2,size=(batch_size + 2*n_chain, n_visible))
        self._buffer   = sharedX(_buffer, name='buffer')
        self.buffer    = self._buffer[:self.n_chain_total]
        # buffer used to store mean-field activation
        self.mf_buffer = sharedX(numpy.zeros_like(_buffer), name='mf_buffer')

        # vectors containing energy of current negative particles (at T=1)
        self._E = sharedX(numpy.zeros(batch_size + 2*n_chain), name='E')
        self.E  = self._E[:self.n_chain_total]

        # Space out inverse temperature parameters linearly in [1,beta_lbound] range .
        # The same n_beta values are repeated for each of the t_batch_size
        # groups of chains.
        beta = numpy.zeros(2*n_chain)
        for bi in range(t_batch_size):
            base_idx = n_beta*bi
            beta[base_idx:base_idx+n_beta] = numpy.linspace(1, beta_lbound, n_beta)
        self._beta = sharedX(beta, name='beta')
        self.beta = self._beta[:self.n_chain]

        # Used to multiply the rows of "W x + b"
        # The first batch_size rows get a factor of 1.0, the following
        # n_chain rows get their own beta.
        self.beta_matrix = T.vertical_stack(
                T.alloc(1.0, batch_size, 1),
                self.beta.dimshuffle([0,'x']))

        # initialize data structure to map nhid/nvis rows to a given temperature
        # mixstat stores pointers to self.nvis array
        mixstat = numpy.zeros((t_batch_size, 2*n_beta), dtype='int32')
        mixstat[:, :n_beta] = numpy.arange(n_chain).reshape(t_batch_size, n_beta)
        self._mixstat = theano.shared(mixstat, name='mixstat')
        self.mixstat = self._mixstat[:, :self.n_beta]

        ### Initialize particle properties ###

        # labels: 1 means going up in temperature, 0 going down in temperature
        # All particles start as LBL_NONE, except the ones in the first
        # temperature slot of each group, which start as LBL_UP.
        labels = LBL_NONE * numpy.ones(2*n_chain, dtype='int32')
        labels[mixstat[:,0]] = LBL_UP
        self.labels = theano.shared(labels, name='labels') 

        # return time
        rtime = numpy.zeros(2*n_chain, dtype='int32')
        self.rtime = theano.shared(rtime, name='rtime') 
        # NOTE(review): rtime_deo is defined elsewhere; presumably it gives an
        # initial estimate of the average return time -- confirm.
        self.avg_rtime = sharedX(rtime_deo(0.4,n_beta), name='avg_rtime')

        ### Initialize temperature properties ###

        # configure fup target for each chain (this shouldn't change very often)
        _fup_target = numpy.zeros(2*n_beta)
        _fup_target[:n_beta] = numpy.linspace(1,0,n_beta)
        self._fup_target = sharedX(_fup_target, name='fup_target')
        self.fup_target = self._fup_target[:self.n_beta]

        # configure histogram of up moving particles
        _nup = numpy.zeros(2*n_beta)
        _nup[:n_beta] = numpy.linspace(1,0,n_beta)
        self._nup = sharedX(_nup, name='nup')
        self.nup = self._nup[:self.n_beta]
        
        # configure histogram of down moving particles
        _ndown = numpy.zeros(2*n_beta)
        _ndown[:n_beta] = numpy.linspace(0,1,n_beta)
        self._ndown = sharedX(_ndown, name='ndown')
        self.ndown = self._ndown[:self.n_beta]

        # use return time as the time constant for all moving averages
        # Any falsy tau (None, but also 0) selects 1/avg_rtime.
        if not tau:
            self.tau = 1./self.avg_rtime
        else:
            self.tau = T.as_tensor(tau)
        self.get_tau = theano.function([], self.tau)

        # create PT Op
        self._swapstat = sharedX(numpy.zeros(2*n_beta), name='swapstat')
        self.swapstat = self._swapstat[:self.n_beta]

        self.pt_swaps = PT_Swaps(rng=self.rng)
        self.pt_swap_t1_sample = PT_SwapT1Sample(rng=self.rng, batch_size=self.batch_size)
Ejemplo n.º 14
0
    def __init__(self, rng=None, Xd=None, Xc=None, Xm=None, Xt=None,
                 i_net=None, g_net=None, d_net=None, chain_len=None,
                 data_dim=None, prior_dim=None, params=None):
        """
        Build the self-looped VAE chain, its discriminator, all costs,
        gradients and parameter updates, then compile the training function.

        Args:
            rng: numpy-style random generator; ``rng.randint`` is called, so
                it must be provided. It is also handed to every sub-network.
            Xd: symbolic var for inputting samples that initialize the chain.
            Xc: symbolic var for controlling subsets of the state variables.
            Xm: symbolic var for masking subsets of the state variables.
            Xt: symbolic var for inputting samples from the target
                distribution.
            i_net: inferencer network, used as q(z|x) of the OneStageModel.
            g_net: generator network, used as p(x|z) of the OneStageModel.
            d_net: discriminator network; a shared-parameter clone of it is
                fed the stacked target and generated samples.
            chain_len: integer number of times to cycle the VAE loop.
            data_dim: dimension of the observations.
            prior_dim: dimension of the latent prior.
            params: dict of options. 'x_type' ('bernoulli' or 'gaussian') and
                'lam_l2d' are required. Optional keys: 'cost_decay'
                (default 0.1), 'chain_type' ('walkback' or 'walkout',
                default 'walkout'), 'xt_transform' ('sigmoid' or 'none',
                default 'sigmoid'), 'logvar_bound' (default 10).
        """
        self.rng = RandStream(rng.randint(100000))
        self.data_dim = data_dim
        self.prior_dim = prior_dim
        self.prior_mean = 0.0
        self.prior_logvar = 0.0
        self.params = {} if params is None else params

        # optional settings, each with its fallback value
        self.cost_decay = self.params.get('cost_decay', 0.1)

        chain_type = self.params.get('chain_type', 'walkout')
        assert chain_type in ('walkback', 'walkout')
        self.chain_type = chain_type

        xt_kind = self.params.get('xt_transform', 'sigmoid')
        assert xt_kind in ('sigmoid', 'none')
        if xt_kind == 'sigmoid':
            self.xt_transform = lambda x: T.nnet.sigmoid(x)
        else:
            self.xt_transform = lambda x: x

        self.logvar_bound = self.params.get('logvar_bound', 10)

        # x_type tells whether a bernoulli or a gaussian model is used for
        # the observations (required key)
        self.x_type = self.params['x_type']
        assert self.x_type in ('bernoulli', 'gaussian')

        # symbolic inputs (see the docstring for their roles)
        self.Xd = Xd
        self.Xm = Xm
        self.Xc = Xc
        self.Xt = Xt
        self.chain_len = chain_len

        # symbolic indices of the data inputs, followed by the indices of
        # the noise/generated inputs
        self.It = T.arange(self.Xt.shape[0])
        self.Id = T.arange(self.chain_len * self.Xd.shape[0]) + self.Xt.shape[0]

        # get a clone of the desired VAE, for easy access
        self.OSM = OneStageModel(rng=rng, Xd=self.Xd, Xc=self.Xc, Xm=self.Xm,
                                 p_x_given_z=g_net, q_z_given_x=i_net,
                                 x_dim=self.data_dim, z_dim=self.prior_dim,
                                 params=self.params)
        self.IN = self.OSM.q_z_given_x
        self.GN = self.OSM.p_x_given_z
        self.transform_x_to_z = self.OSM.transform_x_to_z
        self.transform_z_to_x = self.OSM.transform_z_to_x
        self.bounded_logvar = self.OSM.bounded_logvar

        # Self-loop some clones of the main VAE into a chain. Every link
        # shares the same Xc and Xm (observed part of the input and the mask
        # saying which part is observed); the output of one link, passed
        # through xt_transform, is the input of the next.
        self.IN_chain = []
        self.GN_chain = []
        self.Xg_chain = []
        chain_input = self.Xd
        print("Unrolling chain...")
        for step in range(self.chain_len):
            masked_input = apply_mask(Xd=chain_input, Xc=self.Xc, Xm=self.Xm)
            inferencer = self.IN.shared_param_clone(
                rng=rng, Xd=masked_input, build_funcs=False)
            generator = self.GN.shared_param_clone(
                rng=rng, Xd=inferencer.output, build_funcs=False)
            chain_input = self.xt_transform(generator.output_mean)
            self.IN_chain.append(inferencer)
            self.GN_chain.append(generator)
            self.Xg_chain.append(chain_input)
            print("    step {}...".format(step))

        # Clone of the discriminator network, which tries to tell samples
        # from the training data apart from samples generated by the chain.
        disc_input = T.vertical_stack(self.Xt, *self.Xg_chain)
        self.DN = d_net.shared_param_clone(rng=rng, Xd=disc_input)

        zero_ary = np.zeros((1,)).astype(theano.config.floatX)

        def _zero_shared(name):
            # one-element shared variable initialised from zero_ary
            return theano.shared(value=zero_ary, name=name)

        # weight of the nll of data given posterior sample
        self.lam_chain_nll = _zero_shared('vcg_lam_chain_nll')
        self.set_lam_chain_nll(lam_chain_nll=1.0)
        # weight of the posterior KL-div from prior
        self.lam_chain_kld = _zero_shared('vcg_lam_chain_kld')
        self.set_lam_chain_kld(lam_chain_kld=1.0)
        # l2 regularization on params
        self.lam_l2w = _zero_shared('vcg_lam_l2w')
        self.set_lam_l2w(lam_l2w=1e-4)
        # learning rates for all networks
        self.lr_dn = _zero_shared('vcg_lr_dn')
        self.lr_gn = _zero_shared('vcg_lr_gn')
        self.lr_in = _zero_shared('vcg_lr_in')
        # momentum parameters and iteration count shared by all networks
        self.mom_1 = _zero_shared('vcg_mom_1')
        self.mom_2 = _zero_shared('vcg_mom_2')
        self.it_count = _zero_shared('vcg_it_count')
        # weights for the adversarial classification objective
        self.dw_dn = _zero_shared('vcg_dw_dn')
        self.dw_gn = _zero_shared('vcg_dw_gn')
        # init parameters for controlling learning dynamics
        self.set_all_sgd_params()

        self.set_disc_weights()  # init adversarial cost weights for GN/DN
        # shared var for regularizing the output of the discriminator
        self.lam_l2d = theano.shared(value=(zero_ary + params['lam_l2d']),
                                     name='vcg_lam_l2d')

        # Collect the optimizable parameters. The final layer of every
        # proto-network of the discriminator is left out: those layers are
        # bypassed in favor of binary classification layers managed here.
        self.dn_params = [
            p
            for proto_net in self.DN.proto_nets
            for layer in proto_net[0:-1]
            for p in layer.params
        ]
        self.in_params = list(self.IN.mlp_params)
        self.in_params.append(self.OSM.output_logvar)
        self.gn_params = list(self.GN.mlp_params)
        self.joint_params = self.in_params + self.gn_params + self.dn_params

        # Construct a binary discriminator layer for each proto-net in the
        # discriminator network, and add their params to optimization list.
        self._construct_disc_layers(rng)
        self.disc_reg_cost = self.lam_l2d[0] * \
                T.sum([dl.act_l2_sum for dl in self.disc_layers])

        # Costs for the generator and discriminator networks based on
        # adversarial binary classification
        self.disc_cost_dn, self.disc_cost_gn = self._construct_disc_costs()

        # Cost optimized by the discriminator network; it is treated
        # somewhat independently of the generator/inferencer optimization.
        self.dn_cost = (self.disc_cost_dn + self.DN.act_reg_cost +
                        self.disc_reg_cost)

        # Costs optimized by the generator and inferencer networks
        self.chain_nll_cost = self.lam_chain_nll[0] * \
                self._construct_chain_nll_cost(cost_decay=self.cost_decay)
        self.chain_kld_cost = self.lam_chain_kld[0] * \
                self._construct_chain_kld_cost(cost_decay=self.cost_decay)
        self.other_reg_cost = self._construct_other_reg_cost()
        self.osm_cost = (self.disc_cost_gn + self.chain_nll_cost +
                         self.chain_kld_cost + self.other_reg_cost)
        # total cost on the discriminator and VB generator/inferencer
        self.joint_cost = self.dn_cost + self.osm_cost

        # Gradients: DN params against dn_cost, IN and GN params against
        # osm_cost, gathered in one ordered mapping param -> gradient.
        self.joint_grads = OrderedDict()
        grad_jobs = (
            ("Computing VCGLoop DN cost gradients...",
             self.dn_cost, self.dn_params),
            ("Computing VCGLoop IN cost gradients...",
             self.osm_cost, self.in_params),
            ("Computing VCGLoop GN cost gradients...",
             self.osm_cost, self.gn_params),
        )
        for message, cost, wrt_params in grad_jobs:
            print(message)
            grad_list = T.grad(cost, wrt_params, disconnected_inputs='warn')
            for p, g in zip(wrt_params, grad_list):
                self.joint_grads[p] = g

        # Updates for the discriminator, generator and inferencer networks.
        # All networks share the same first/second moment momentum and
        # iteration count; each has its own learning rate, which lets you
        # turn their learning on/off.
        common_update_args = dict(
            grads=self.joint_grads, beta1=self.mom_1, beta2=self.mom_2,
            it_count=self.it_count, mom2_init=1e-3, smoothing=1e-8,
            max_grad_norm=10.0)
        self.dn_updates = get_param_updates(
            params=self.dn_params, alpha=self.lr_dn, **common_update_args)
        self.gn_updates = get_param_updates(
            params=self.gn_params, alpha=self.lr_gn, **common_update_args)
        self.in_updates = get_param_updates(
            params=self.in_params, alpha=self.lr_in, **common_update_args)

        # bag up all the updates required for training (DN, then GN, then IN)
        self.joint_updates = OrderedDict()
        for update_group in (self.dn_updates, self.gn_updates,
                             self.in_updates):
            for k in update_group:
                self.joint_updates[k] = update_group[k]

        # Update tracking the mean KL divergence of the approximate
        # posteriors for this chain: 0.98 of the old value plus 0.02 of the
        # average over the chain links.
        chain_kld_sum = sum([T.mean(I_N.kld_cost) for I_N in self.IN_chain])
        new_kld_mean = (0.98 * self.IN.kld_mean) + \
            ((0.02 / self.chain_len) * chain_kld_sum)
        self.joint_updates[self.IN.kld_mean] = T.cast(new_kld_mean, 'floatX')

        # construct the function for training on training data
        print("Compiling VCGLoop theano functions....")
        self.train_joint = self._construct_train_joint()
        return
Ejemplo n.º 15
0
 def raw_activation_fast(sl, e1, e2):
     """
     Compute the raw activation for slice ``sl`` from the two inputs.

     The result is the sum of two parts, both of which are also stored on
     the enclosing object:

     * ``self.Wact``: ``e1`` multiplied by slice ``sl`` of ``self.W``, then
       combined with ``e2`` through a batched dot product.
     * ``self.Vact``: row ``sl`` of ``self.V`` applied to ``e1.T`` stacked
       on top of ``e2.T``, scaled by 1e-4, plus the bias ``self.b[sl]``.

     Args:
         sl: index of the slice of ``self.W`` / ``self.V`` / ``self.b``.
         e1: first symbolic input matrix.
         e2: second symbolic input matrix.

     Returns:
         Symbolic expression ``self.Wact + self.Vact``.
     """
     self.Wact = T.batched_dot(theano.dot(e1, self.W[:,:,sl]), e2)
     # The expression is parenthesised instead of using a backslash
     # continuation: nothing may follow the backslash on a continued line,
     # so the original trailing note was a syntax error.
     self.Vact = (
         # 1e-4 reflects change of scale
         1e-4 * theano.dot(T.reshape(self.V[:,sl],(1,-1)),
                           T.vertical_stack(e1.T, e2.T))
         + self.b[sl]  # Bias part
     )
     return self.Wact + self.Vact
Ejemplo n.º 16
0
    def __init__(self, rng=None, Xd=None, Xc=None, Xm=None, Xt=None, \
                 i_net=None, g_net=None, d_net=None, chain_len=None, \
                 data_dim=None, prior_dim=None, params=None):
        # Do some stuff!
        self.rng = RandStream(rng.randint(100000))
        self.data_dim = data_dim
        self.prior_dim = prior_dim

        # symbolic var for inputting samples for initializing the VAE chain
        self.Xd = Xd
        # symbolic var for masking subsets of the state variables
        self.Xm = Xm
        # symbolic var for controlling subsets of the state variables
        self.Xc = Xc
        # symbolic var for inputting samples from the target distribution
        self.Xt = Xt
        # integer number of times to cycle the VAE loop
        self.chain_len = chain_len
        # symbolic matrix of indices for data inputs
        self.It = T.arange(self.Xt.shape[0])
        # symbolic matrix of indices for noise inputs
        self.Id = T.arange(self.chain_len * self.Xd.shape[0]) + self.Xt.shape[0]

        # get a clone of the desired VAE, for easy access
        self.GIP = GIPair(rng=rng, Xd=self.Xd, Xc=self.Xc, Xm=self.Xm, \
                g_net=g_net, i_net=i_net, data_dim=self.data_dim, \
                prior_dim=self.prior_dim, params=None, shared_param_dicts=None)
        self.IN = self.GIP.IN
        self.GN = self.GIP.GN
        # self-loop some clones of the main VAE into a chain
        self.IN_chain = []
        self.GN_chain = []
        self.Xg_chain = []
        _Xd = self.Xd
        for i in range(self.chain_len):
            if (i == 0):
                # start the chain with data provided by user
                _IN = self.IN.shared_param_clone(rng=rng, Xd=_Xd, \
                        Xc=self.Xc, Xm=self.Xm)
                _GN = self.GN.shared_param_clone(rng=rng, Xp=_IN.output)
            else:
                # continue the chain with samples from previous VAE
                _IN = self.IN.shared_param_clone(rng=rng, Xd=_Xd, \
                        Xc=self.Xc, Xm=self.Xm)
                _GN = self.GN.shared_param_clone(rng=rng, Xp=_IN.output)
            _Xd = _GN.output
            self.IN_chain.append(_IN)
            self.GN_chain.append(_GN)
            self.Xg_chain.append(_Xd)
        #Xg_stack = T.vertical_stack(*self.Xg_chain)
        #self.Xg = Xg_stack + (0.1 * self.rng.normal(size=Xg_stack.shape, avg=0.0, \
        #        std=1.0, dtype=theano.config.floatX))

        # make a clone of the desired discriminator network, which will try
        # to discriminate between samples from the training data and samples
        # generated by the self-looped VAE chain.
        self.DN = d_net.shared_param_clone(rng=rng, \
                Xd=T.vertical_stack(self.Xt, *self.Xg_chain))

        zero_ary = np.zeros((1,)).astype(theano.config.floatX)
        # init shared var for weighting nll of data given posterior sample
        self.lam_chain_nll = theano.shared(value=zero_ary, name='vcg_lam_chain_nll')
        self.set_lam_chain_nll(lam_chain_nll=1.0)
        # init shared var for weighting posterior KL-div from prior
        self.lam_chain_kld = theano.shared(value=zero_ary, name='vcg_lam_chain_kld')
        self.set_lam_chain_kld(lam_chain_kld=1.0)
        # init shared var for weighting chain diffusion rate (a.k.a. velocity)
        self.lam_chain_vel = theano.shared(value=zero_ary, name='vcg_lam_chain_vel')
        self.set_lam_chain_vel(lam_chain_vel=1.0)
        # init shared var for weighting nll of data given posterior sample
        self.lam_mask_nll = theano.shared(value=zero_ary, name='vcg_lam_mask_nll')
        self.set_lam_mask_nll(lam_mask_nll=0.0)
        # init shared var for weighting posterior KL-div from prior
        self.lam_mask_kld = theano.shared(value=zero_ary, name='vcg_lam_mask_kld')
        self.set_lam_mask_kld(lam_mask_kld=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='vcg_lam_l2w')
        self.set_lam_l2w(lam_l2w=1e-4)
        # shared var learning rate for generator and discriminator
        self.lr_dn = theano.shared(value=zero_ary, name='vcg_lr_dn')
        self.lr_gn = theano.shared(value=zero_ary, name='vcg_lr_gn')
        self.lr_in = theano.shared(value=zero_ary, name='vcg_lr_in')
        # shared var momentum parameters for generator and discriminator
        self.mo_dn = theano.shared(value=zero_ary, name='vcg_mo_dn')
        self.mo_gn = theano.shared(value=zero_ary, name='vcg_mo_gn')
        self.mo_in = theano.shared(value=zero_ary, name='vcg_mo_in')
        # shared var weights for adversarial classification objective
        self.dw_dn = theano.shared(value=zero_ary, name='vcg_dw_dn')
        self.dw_gn = theano.shared(value=zero_ary, name='vcg_dw_gn')
        # init parameters for controlling learning dynamics
        self.set_dn_sgd_params() # init SGD rate/momentum for DN
        self.set_gn_sgd_params() # init SGD rate/momentum for GN
        self.set_in_sgd_params() # init SGD rate/momentum for IN
        
        self.set_disc_weights()  # init adversarial cost weights for GN/DN
        self.lam_l2d = theano.shared(value=(zero_ary + params['lam_l2d']), \
                name='vcg_lam_l2d')

        nll_weights = np.linspace(0.0, 5.0, num=self.chain_len)
        nll_weights = nll_weights / np.sum(nll_weights)
        nll_weights = nll_weights.astype(theano.config.floatX)
        self.mask_nll_weights = theano.shared(value=nll_weights, \
                name='vcg_mask_nll_weights')

        # Grab the full set of "optimizable" parameters from the generator
        # and discriminator networks that we'll be working with. We need to
        # ignore parameters in the final layers of the proto-networks in the
        # discriminator network (a generalized pseudo-ensemble). We ignore them
        # because the VCGair requires that they be "bypassed" in favor of some
        # binary classification layers that will be managed by this VCGair.
        self.dn_params = []
        for pn in self.DN.proto_nets:
            for pnl in pn[0:-1]:
                self.dn_params.extend(pnl.params)
        self.in_params = [p for p in self.IN.mlp_params]
        self.gn_params = [p for p in self.GN.mlp_params]

        # Now construct a binary discriminator layer for each proto-net in the
        # discriminator network. And, add their params to optimization list.
        self._construct_disc_layers(rng)
        self.disc_reg_cost = self.lam_l2d[0] * \
                T.sum([dl.act_l2_sum for dl in self.disc_layers])

        # Construct costs for the generator and discriminator networks based 
        # on adversarial binary classification
        self.disc_cost_dn, self.disc_cost_gn = self._construct_disc_costs()

        # first, build the cost to be optimized by the discriminator network,
        # in general this will be treated somewhat indepedently of the
        # optimization of the generator and inferencer networks.
        self.dn_cost = self.disc_cost_dn + self.DN.act_reg_cost + \
                self.disc_reg_cost
        # construct costs relevant to the optimization of the generator and
        # discriminator networks
        self.chain_nll_cost = self.lam_chain_nll[0] * \
                self._construct_chain_nll_cost(data_weight=0.9)
        self.chain_kld_cost = self.lam_chain_kld[0] * \
                self._construct_chain_kld_cost(data_weight=0.9)
        self.chain_vel_cost = self.lam_chain_vel[0] * \
                self._construct_chain_vel_cost()
        self.mask_nll_cost = self.lam_mask_nll[0] * \
                self._construct_mask_nll_cost()
        self.mask_kld_cost = self.lam_mask_kld[0] * \
                self._construct_mask_kld_cost()
        self.other_reg_cost = self._construct_other_reg_cost()
        self.gip_cost = self.disc_cost_gn + self.chain_nll_cost + \
                self.chain_kld_cost + self.chain_vel_cost + \
                self.mask_nll_cost + self.mask_kld_cost + \
                self.other_reg_cost
        # compute total cost on the discriminator and VB generator/inferencer
        self.joint_cost = self.dn_cost + self.gip_cost

        # Initialize momentums for mini-batch SGD updates. All parameters need
        # to be safely nestled in their lists by now.
        self.joint_moms = OrderedDict()
        self.dn_moms = OrderedDict()
        self.in_moms = OrderedDict()
        self.gn_moms = OrderedDict()
        for p in self.dn_params:
            p_mo = np.zeros(p.get_value(borrow=True).shape) + 5.0
            self.dn_moms[p] = theano.shared(value=p_mo.astype(theano.config.floatX))
            self.joint_moms[p] = self.dn_moms[p]
        for p in self.in_params:
            p_mo = np.zeros(p.get_value(borrow=True).shape) + 5.0
            self.in_moms[p] = theano.shared(value=p_mo.astype(theano.config.floatX))
            self.joint_moms[p] = self.in_moms[p]
        for p in self.gn_params:
            p_mo = np.zeros(p.get_value(borrow=True).shape) + 5.0
            self.gn_moms[p] = theano.shared(value=p_mo.astype(theano.config.floatX))
            self.joint_moms[p] = self.gn_moms[p]

        # Construct the updates for the generator and discriminator network
        self.joint_updates = OrderedDict()
        self.dn_updates = OrderedDict()
        self.in_updates = OrderedDict()
        self.gn_updates = OrderedDict()

        ###########################################
        # Construct updates for the discriminator #
        ###########################################
        for var in self.dn_params:
            # these updates are for trainable params in the inferencer net...
            # first, get gradient of cost w.r.t. var
            var_grad = T.grad(self.dn_cost, var, \
                    consider_constant=[self.GN.dist_mean, self.GN.dist_cov]).clip(-0.1,0.1)
            # get the momentum for this var
            var_mom = self.dn_moms[var]
            # update the momentum for this var using its grad
            self.dn_updates[var_mom] = (self.mo_dn[0] * var_mom) + \
                    ((1.0 - self.mo_dn[0]) * (var_grad**2.0))
            self.joint_updates[var_mom] = self.dn_updates[var_mom]
            # make basic update to the var
            var_new = var - (self.lr_dn[0] * (var_grad / T.sqrt(var_mom + 1e-3)))
            self.dn_updates[var] = var_new
            # add this var's update to the joint updates too
            self.joint_updates[var] = self.dn_updates[var]
        ########################################
        # Construct updates for the inferencer #
        ########################################
        for var in self.in_params:
            # these updates are for trainable params in the generator net...
            # first, get gradient of cost w.r.t. var
            var_grad = T.grad(self.gip_cost, var, \
                    consider_constant=[self.GN.dist_mean, self.GN.dist_cov]).clip(-0.1,0.1)
            # get the momentum for this var
            var_mom = self.in_moms[var]
            # update the momentum for this var using its grad
            self.in_updates[var_mom] = (self.mo_in[0] * var_mom) + \
                    ((1.0 - self.mo_in[0]) * (var_grad**2.0))
            self.joint_updates[var_mom] = self.in_updates[var_mom]
            # make basic update to the var
            var_new = var - (self.lr_in[0] * (var_grad / T.sqrt(var_mom + 1e-3)))
            self.in_updates[var] = var_new
            # add this var's update to the joint updates too
            self.joint_updates[var] = self.in_updates[var]
        #######################################
        # Construct updates for the generator #
        #######################################
        for var in self.gn_params:
            # these updates are for trainable params in the generator net...
            # first, get gradient of cost w.r.t. var
            var_grad = T.grad(self.gip_cost, var, \
                    consider_constant=[self.GN.dist_mean, self.GN.dist_cov]).clip(-0.1,0.1)
            # get the momentum for this var
            var_mom = self.gn_moms[var]
            # update the momentum for this var using its grad
            self.gn_updates[var_mom] = (self.mo_gn[0] * var_mom) + \
                    ((1.0 - self.mo_gn[0]) * (var_grad**2.0))
            self.joint_updates[var_mom] = self.gn_updates[var_mom]
            # make basic update to the var
            var_new = var - (self.lr_gn[0] * (var_grad / T.sqrt(var_mom + 1e-3)))
            self.gn_updates[var] = var_new
            # add this var's update to the joint updates too
            self.joint_updates[var] = self.gn_updates[var]

        # Construct the function for training on training data
        self.train_joint = self._construct_train_joint()

        # Construct a function for computing the ouputs of the generator
        # network for a batch of noise. Presumably, the noise will be drawn
        # from the same distribution that was used in training....
        self.sample_chain_from_data = self.GIP.sample_gil_from_data
        return
Ejemplo n.º 17
0
    def __init__(self, rng=None, x_d=None, x_t=None, \
                 i_net=None, g_net=None, d_net=None, \
                 chain_len=None, data_dim=None, z_dim=None, \
                 params=None):
        """
        Build the symbolic training graph for the chain model (the log
        messages below call it VCGLoop).

        The constructor wraps i_net/g_net in a OneStageModel, unrolls that
        model into a chain with theano.scan, feeds the target samples and the
        chain samples to a clone of d_net, and then builds the costs, the
        gradients, the Adam updates and the joint training function.

        Although every argument defaults to None, the body dereferences rng,
        x_d, x_t, d_net and params unconditionally, so they are effectively
        required.

        Args:
            rng: object providing randint(); used to seed RandStream and also
                passed on to OneStageModel, d_net.shared_param_clone and
                _construct_disc_layers.
            x_d: symbolic input used as the initial state of the chain.
            x_t: symbolic input holding samples from the target distribution.
            i_net: network passed to OneStageModel as q_z_given_x.
            g_net: network passed to OneStageModel as p_x_given_z.
            d_net: discriminator network; must provide shared_param_clone().
            chain_len: number of chain steps; used with range(), so it must
                be a Python int.
            data_dim: passed to OneStageModel as x_dim.
            z_dim: passed to OneStageModel as z_dim.
            params: dict of options. 'x_type' ('bernoulli' or 'gaussian') and
                'lam_l2d' are read without a fallback. Optional keys are
                'cost_decay' (default 0.1), 'chain_type' ('walkback' or
                'walkout', default 'walkout'), 'xt_transform' ('sigmoid' or
                'none', default sigmoid) and 'logvar_bound' (default 10).
        """
        # seed this model's random stream from the supplied rng
        self.rng = RandStream(rng.randint(100000))
        self.data_dim = data_dim
        self.z_dim = z_dim
        # prior mean / log-variance handed to gaussian_kld in the chain step
        self.p_z_mean = 0.0
        self.p_z_logvar = 0.0
        if params is None:
            self.params = {}
        else:
            self.params = params
        # decay factor forwarded to the chain NLL/KLd cost constructors
        if 'cost_decay' in self.params:
            self.cost_decay = self.params['cost_decay']
        else:
            self.cost_decay = 0.1
        # chain_type selects the reconstruction target used in each chain step
        if 'chain_type' in self.params:
            assert((self.params['chain_type'] == 'walkback') or \
                (self.params['chain_type'] == 'walkout'))
            self.chain_type = self.params['chain_type']
        else:
            self.chain_type = 'walkout'
        # xt_transform maps the generator's raw output to the observation
        # "mean"; it is either a sigmoid or the identity
        if 'xt_transform' in self.params:
            assert((self.params['xt_transform'] == 'sigmoid') or \
                    (self.params['xt_transform'] == 'none'))
            if self.params['xt_transform'] == 'sigmoid':
                self.xt_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.xt_transform = lambda x: x
        else:
            self.xt_transform = lambda x: T.nnet.sigmoid(x)
        if 'logvar_bound' in self.params:
            self.logvar_bound = self.params['logvar_bound']
        else:
            self.logvar_bound = 10
        #
        # x_type: this tells if we're using bernoulli or gaussian model for
        #         the observations
        # NOTE: 'x_type' has no default, so a missing key (including the
        #       params=None case) raises KeyError here.
        #
        self.x_type = self.params['x_type']
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))

        # grab symbolic input variables
        self.x_d = x_d             # initial input for starting the chain
        self.x_t = x_t             # samples from target distribution
        self.z_zmuv = T.tensor3()  # ZMUV gaussian samples for use in scan

        # get the number of steps for chain unrolling
        self.chain_len = chain_len

        # The discriminator input built below stacks x_t first and then the
        # chain samples, so these index ranges pick out the two groups of rows.
        # symbolic vector of row indices for inputs from target distribution
        self.It = T.arange(self.x_t.shape[0])
        # symbolic vector of row indices for the chain-generated inputs
        self.Id = T.arange(self.chain_len * self.x_d.shape[0]) + self.x_t.shape[0]

        # get a clone of the desired VAE, for easy access
        self.OSM = OneStageModel(rng=rng, x_in=self.x_d, \
                                 p_x_given_z=g_net, q_z_given_x=i_net, \
                                 x_dim=self.data_dim, z_dim=self.z_dim, \
                                 params=self.params)
        self.IN = self.OSM.q_z_given_x
        self.GN = self.OSM.p_x_given_z
        self.transform_x_to_z = self.OSM.transform_x_to_z
        self.transform_z_to_x = self.OSM.transform_z_to_x
        self.bounded_logvar = self.OSM.bounded_logvar

        ##################################################
        # Self-loop the VAE into a multi-step Markov chain:
        # each step encodes the previous step's output with IN, samples z
        # using the supplied ZMUV noise, and decodes it with GN.
        ##################################################
        # Setup the iterative generation loop using scan #
        ##################################################
        def chain_step_func(zi_zmuv, xim1):
            # zi_zmuv: this step's slice of self.z_zmuv
            # xim1: the xgi returned by the previous step (self.x_d at step 0)
            # get mean and logvar of z samples for this step
            zi_mean, zi_logvar = self.IN.apply(xim1, do_samples=False)
            # transform ZMUV samples to get desired samples
            zi = (T.exp(0.5 * zi_logvar) * zi_zmuv) + zi_mean
            # get the next generated xi (pre-transformation)
            outputs = self.GN.apply(zi)
            xti = outputs[-1]
            # apply the observation "mean" transform
            xgi = self.xt_transform(xti)
            # pick the reconstruction target for this step: 'walkout' always
            # compares against the chain's initial input, otherwise the
            # previous step's output is used
            if self.chain_type == 'walkout':
                x_true = self.x_d
            else:
                x_true = xim1
            # NOTE(review): the result is named "nll" but comes from
            # _log_prob; the sign convention is not visible here -- confirm.
            nlli = self._log_prob(x_true, xgi).flatten()
            # KL divergence from the prior, summed over axis 1
            kldi = T.sum(gaussian_kld(zi_mean, zi_logvar, \
                         self.p_z_mean, self.p_z_logvar), axis=1)
            return xgi, nlli, kldi

        # apply the scan op; only the first output (xgi) is fed back into the
        # next step, the other two are plain per-step outputs
        init_values = [self.x_d, None, None]
        self.scan_results, self.scan_updates = \
                theano.scan(chain_step_func, outputs_info=init_values, \
                            sequences=self.z_zmuv)
        # get the outputs of the scan op
        self.xgi = self.scan_results[0]
        self.nlli = self.scan_results[1]
        self.kldi = self.scan_results[2]
        # one entry per chain step
        # (presumably z_zmuv must supply exactly chain_len steps -- TODO confirm)
        self.xgi_list = [self.xgi[i] for i in range(self.chain_len)]

        # make a clone of the desired discriminator network, which will try
        # to discriminate between samples from the training data and samples
        # generated by the self-looped VAE chain.
        self.DN = d_net.shared_param_clone(rng=rng, \
                          Xd=T.vertical_stack(self.x_t, *self.xgi_list))

        # NOTE: every shared variable below is created from zero_ary and then
        # given its working value by the setter call that follows it.
        zero_ary = np.zeros((1,)).astype(theano.config.floatX)
        # init shared var for weighting nll of data given posterior sample
        self.lam_chain_nll = theano.shared(value=zero_ary, name='vcg_lam_chain_nll')
        self.set_lam_chain_nll(lam_chain_nll=1.0)
        # init shared var for weighting posterior KL-div from prior
        self.lam_chain_kld = theano.shared(value=zero_ary, name='vcg_lam_chain_kld')
        self.set_lam_chain_kld(lam_chain_kld=1.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='vcg_lam_l2w')
        self.set_lam_l2w(lam_l2w=1e-4)
        # shared var learning rates for all networks
        self.lr_dn = theano.shared(value=zero_ary, name='vcg_lr_dn')
        self.lr_gn = theano.shared(value=zero_ary, name='vcg_lr_gn')
        self.lr_in = theano.shared(value=zero_ary, name='vcg_lr_in')
        # shared var momentum parameters for all networks
        self.mom_1 = theano.shared(value=zero_ary, name='vcg_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='vcg_mom_2')
        # shared var weights for adversarial classification objective
        self.dw_dn = theano.shared(value=zero_ary, name='vcg_dw_dn')
        self.dw_gn = theano.shared(value=zero_ary, name='vcg_dw_gn')
        # init parameters for controlling learning dynamics
        self.set_all_sgd_params()
        # init adversarial cost weights for GN/DN
        self.set_disc_weights()
        # set a shared var for regularizing the output of the discriminator
        # NOTE(review): this reads the raw `params` argument rather than
        # self.params, and 'lam_l2d' has no default.
        self.lam_l2d = theano.shared(value=(zero_ary + params['lam_l2d']), \
                                     name='vcg_lam_l2d')

        # Grab the full set of "optimizable" parameters from the generator,
        # inferencer and discriminator networks that we'll be working with.
        # The final layer of each proto-network in the discriminator is
        # skipped (pn[0:-1]): those layers are bypassed in favor of the binary
        # classification layers built by _construct_disc_layers below.
        self.dn_params = []
        for pn in self.DN.proto_nets:
            for pnl in pn[0:-1]:
                self.dn_params.extend(pnl.params)
        self.in_params = [p for p in self.IN.mlp_params]
        self.gn_params = [p for p in self.GN.mlp_params]
        self.joint_params = self.in_params + self.gn_params + self.dn_params

        # Now construct a binary discriminator layer for each proto-net in the
        # discriminator network. And, add their params to optimization list.
        self._construct_disc_layers(rng)
        self.disc_reg_cost = self.lam_l2d[0] * \
                T.sum([dl.act_l2_sum for dl in self.disc_layers])

        # Construct costs for the generator and discriminator networks based 
        # on adversarial binary classification
        self.disc_cost_dn, self.disc_cost_gn = self._construct_disc_costs()

        # first, build the cost to be optimized by the discriminator network,
        # in general this will be treated somewhat independently of the
        # optimization of the generator and inferencer networks.
        self.dn_cost = self.disc_cost_dn + self.disc_reg_cost

        # construct costs relevant to the optimization of the generator and
        # inferencer networks (osm_cost is what in_params and gn_params are
        # differentiated against below)
        self.chain_nll_cost = self.lam_chain_nll[0] * \
                self._construct_chain_nll_cost(cost_decay=self.cost_decay)
        self.chain_kld_cost = self.lam_chain_kld[0] * \
                self._construct_chain_kld_cost(cost_decay=self.cost_decay)
        self.other_reg_cost = self._construct_other_reg_cost()
        self.osm_cost = self.disc_cost_gn + self.chain_nll_cost + \
                        self.chain_kld_cost + self.other_reg_cost
        # compute total cost on the discriminator and VB generator/inferencer
        self.joint_cost = self.dn_cost + self.osm_cost

        print("Computing VCGLoop joint_grad...")
        # grab the gradients for all parameters to optimize: discriminator
        # params follow dn_cost, inferencer/generator params follow osm_cost
        self.joint_grads = OrderedDict()
        for p in self.dn_params:
            self.joint_grads[p] = T.grad(self.dn_cost, p)
        for p in self.in_params:
            self.joint_grads[p] = T.grad(self.osm_cost, p)
        for p in self.gn_params:
            self.joint_grads[p] = T.grad(self.osm_cost, p)

        # construct the updates for the discriminator, generator and 
        # inferencer networks. all networks share the same first/second
        # moment momentum shared vars (mom_1, mom_2). the networks each have
        # their own learning rates, which lets you turn their learning on/off.
        # NOTE(review): no it_count is passed to get_adam_updates here; how
        # the helper tracks iterations is not visible from this block.
        self.dn_updates = get_adam_updates(params=self.dn_params, \
                grads=self.joint_grads, alpha=self.lr_dn, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)
        self.in_updates = get_adam_updates(params=self.in_params, \
                grads=self.joint_grads, alpha=self.lr_in, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)
        self.gn_updates = get_adam_updates(params=self.gn_params, \
                grads=self.joint_grads, alpha=self.lr_gn, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-4, max_grad_norm=10.0)

        # bag up all the updates required for training
        self.joint_updates = OrderedDict()
        for k in self.dn_updates:
            self.joint_updates[k] = self.dn_updates[k]
        for k in self.in_updates:
            self.joint_updates[k] = self.in_updates[k]
        for k in self.gn_updates:
            self.joint_updates[k] = self.gn_updates[k]

        print("Compiling VCGLoop train_joint...")
        # construct the function for training on training data
        self.train_joint = self._construct_train_joint()
        return
def atData(input, left, right, Slen):
    """Remove the two entity rows from a padded sentence and return a window.

    The rows at positions ``left`` and ``right`` (each offset by the
    module-level ``_N_PAD_HEAD``) are cut out of ``input[0]``; the remaining
    rows are stacked back together and the slice ``[4:Slen + 2]`` of that
    stack is returned.

    Args:
        input: indexable object whose first element is the sentence tensor
            (presumably one row per token -- TODO confirm). The name shadows
            the builtin but is kept so keyword callers continue to work.
        left: position of one entity, before the head-padding offset.
        right: position of the other entity; it may be smaller or larger
            than ``left``.
        Slen: sentence length used as the upper bound of the returned window.

    Returns:
        The symbolic slice ``[4:Slen + 2]`` of the sentence with both entity
        rows removed. The original author's notes give its shape as
        (Slen - 2) x 60 and the stacked sentence as 86 x 60 (unverified).
    """
    sentence = input[0]

    # Order the two entity positions so the slicing below does not depend on
    # which of left/right comes first, and apply the head-padding offset once.
    left_is_first = T.lt(left, right)
    first_pos = T.switch(left_is_first, left, right) + _N_PAD_HEAD
    second_pos = T.switch(left_is_first, right, left) + _N_PAD_HEAD

    # Everything except the two entity rows, in the original order.
    before_first = sentence[:first_pos]
    between = sentence[(first_pos + 1):second_pos]
    after_second = sentence[(second_pos + 1):]
    without_entities = T.vertical_stack(before_first, between, after_second)

    # NOTE(review): the bounds 4 and Slen + 2 presumably skip the remaining
    # head padding and stop at the last real token -- confirm against the
    # value of _N_PAD_HEAD.
    #
    # A disabled draft of an attention layer used to follow here. It appended
    # the concatenated entity-pair rows (sentence[first_pos] and
    # sentence[second_pos]) to every word, scored the words with a
    # tanh/softmax layer and returned the weighted sum. Only the plain window
    # is returned now, so the unused entity vectors are no longer built.
    return without_entities[4:Slen + 2]
Ejemplo n.º 19
0
def vstack(tensors):
    """Stack an iterable of tensors vertically with ``T.vertical_stack``.

    ``T.vertical_stack`` takes the tensors as separate positional arguments;
    this wrapper accepts them as a single iterable instead.
    """
    stacked = T.vertical_stack(*tensors)
    return stacked
Ejemplo n.º 20
0
    def __init__(self, rng=None, \
            Xd=None, Yd=None, Xc=None, Xm=None, \
            g_net=None, i_net=None, p_net=None, \
            data_dim=None, prior_dim=None, label_dim=None, \
            params=None):
        """
        Build the symbolic training graph for this GIStack.

        The inputs are stacked on themselves row-wise, pushed through
        shared-parameter clones of the inferencer (i_net), the observation
        generator (g_net) and the label network (p_net), and then the costs,
        clipped gradients, Adam updates and joint training function are built.

        Although every argument defaults to None, rng and the three networks
        are dereferenced unconditionally, so they are effectively required.

        Args:
            rng: object providing randint(); seeds RandStream and is passed
                to each shared_param_clone call.
            Xd, Yd, Xc, Xm: symbolic inputs. Xd/Xc/Xm are fed to the
                inferencer clone; Yd is only stacked and stored here.
            g_net: observation generator; must not have use_decoder set.
            i_net: latent inferencer; must not have use_encoder set.
            p_net: label network; must have exactly one proto-net and one
                spawn-net.
            data_dim: stored as self.data_dim.
            prior_dim: latent dimension; asserted against the layer sizes of
                all three network clones.
            label_dim: asserted against the output size of p_net's proto-net.
            params: not read anywhere in this constructor.
        """
        # TODO: refactor for use with "encoded" inferencer/generator
        assert(not (i_net.use_encoder or g_net.use_decoder))

        # setup a rng for this GIStack
        self.rng = RandStream(rng.randint(100000))
        # record the symbolic variables that will provide inputs to the
        # computation graph created for this GIStack
        self.Xd = Xd
        self.Yd = Yd
        self.Xc = Xc
        self.Xm = Xm
        # each input stacked on itself, so every row appears twice
        # (presumably to get two noisy passes per observation -- TODO confirm)
        self.Xd2 = T.vertical_stack(self.Xd, self.Xd)
        self.Yd2 = T.vertical_stack(self.Yd, self.Yd)
        self.Xc2 = T.vertical_stack(self.Xc, self.Xc)
        self.Xm2 = T.vertical_stack(self.Xm, self.Xm)
        # row count of the doubled input, cast to floatX
        self.obs_count = T.cast(self.Xd2.shape[0], 'floatX')
        # record the dimensionality of the data handled by this GIStack
        self.data_dim = data_dim
        self.label_dim = label_dim
        self.prior_dim = prior_dim
        # create a "shared-parameter" clone of the latent inferencer
        self.IN2 = i_net.shared_param_clone(rng=rng, \
                Xd=self.Xd2, Xc=self.Xc2, Xm=self.Xm2)
        # capture a handle for latent samples from the inferencer
        self.Xp2 = self.IN2.output
        # feed it into a shared-parameter clone of the generator
        self.GN2 = g_net.shared_param_clone(rng=rng, Xp=self.Xp2)
        # capture a handle for outputs from the observation generator
        self.Xg2 = self.GN2.output
        # feed the latent samples (Xp2, not the generator output Xg2) into a
        # shared-parameter clone of the label generator
        self.PN2 = p_net.shared_param_clone(rng=rng, Xd=self.Xp2)
        # capture handles for noisy/clean outputs of the label generator
        self.Yp2 = self.PN2.output_spawn[0] # noisy predictions
        self.Yp2_proto = self.PN2.output_proto # noise-free predictions

        # we require the PeaNet to have one proto-net and one spawn net
        assert(len(self.PN2.proto_nets) == 1)
        assert(len(self.PN2.spawn_nets) == 1)
        # check that all networks agree on the latent variable dimension
        assert(self.prior_dim == self.IN2.mu_layers[-1].out_dim)
        assert(self.prior_dim == self.IN2.sigma_layers[-1].out_dim)
        assert(self.prior_dim == self.GN2.mlp_layers[0].in_dim)
        assert(self.prior_dim == self.PN2.proto_nets[0][0].in_dim)
        # check that we've been told the correct cardinality for the
        # categorical variable we will be "decoding"
        assert(self.label_dim == self.PN2.proto_nets[0][-1].out_dim)

        # NOTE: every shared variable below is created from zero_ary and then
        # given its working value by the setter call that follows it.
        zero_ary = np.zeros((1,)).astype(theano.config.floatX)
        # shared var learning rates for all networks
        self.lr_gn = theano.shared(value=zero_ary, name='gis_lr_gn')
        self.lr_in = theano.shared(value=zero_ary, name='gis_lr_in')
        self.lr_pn = theano.shared(value=zero_ary, name='gis_lr_pn')
        # shared var momentum parameters for all networks
        self.mom_1 = theano.shared(value=zero_ary, name='gis_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='gis_mom_2')
        # iteration counter shared by the three get_adam_updates calls below
        self.it_count = theano.shared(value=zero_ary, name='gis_it_count')
        # init parameters for controlling learning dynamics
        self.set_all_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='gis_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting posterior KL-div from prior
        self.lam_kld = theano.shared(value=zero_ary, name='gis_lam_kld')
        self.set_lam_kld(lam_kld=1.0)
        # init shared var for weighting semi-supervised classification
        self.lam_cat = theano.shared(value=zero_ary, name='gis_lam_cat')
        self.set_lam_cat(lam_cat=0.0)
        # init shared var for weighting PEA cost on (un)supervised inputs
        self.lam_pea_su = theano.shared(value=zero_ary, name='gis_lam_pea_su')
        self.lam_pea_un = theano.shared(value=zero_ary, name='gis_lam_pea_un')
        self.set_lam_pea(lam_pea_su=1.0, lam_pea_un=1.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='gis_lam_l2w')
        self.set_lam_l2w(lam_l2w=1e-3)

        # grab the full set of "optimizable" parameters from the generator,
        # inferencer and label networks that we'll be working with.
        self.gn_params = [p for p in self.GN2.mlp_params]
        self.in_params = [p for p in self.IN2.mlp_params]
        self.pn_params = [p for p in self.PN2.proto_params]
        self.joint_params = self.pn_params + self.in_params + self.gn_params

        ###################################
        # CONSTRUCT THE COSTS TO OPTIMIZE #
        ###################################
        # each cost term is scaled by its lam_* shared variable, so a term
        # can be re-weighted after construction by changing that variable
        pea_cost_su, pea_cost_un = self._construct_post_pea_costs()
        self.data_nll_cost = self.lam_nll[0] * self._construct_data_nll_cost()
        self.post_kld_cost = self.lam_kld[0] * self._construct_post_kld_cost()
        self.post_cat_cost = self.lam_cat[0] * self._construct_post_cat_cost()
        self.post_pea_cost = (self.lam_pea_su[0] * pea_cost_su) + \
                (self.lam_pea_un[0] * pea_cost_un)
        self.other_reg_cost = self._construct_other_reg_cost()
        self.joint_cost = self.data_nll_cost + self.post_kld_cost + self.post_cat_cost + \
                self.post_pea_cost + self.other_reg_cost

        # grab the gradients for all parameters to optimize; every gradient
        # is clipped element-wise to [-0.1, 0.1]
        self.joint_grads = OrderedDict()
        for p in self.joint_params:
            self.joint_grads[p] = T.grad(self.joint_cost, p).clip(-0.1, 0.1)

        # construct the updates for all parameters to optimize; the three
        # networks share mom_1/mom_2/it_count but have separate learning rates
        self.gn_updates = get_adam_updates(params=self.gn_params, \
                grads=self.joint_grads, alpha=self.lr_gn, \
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
                mom2_init=1e-3, smoothing=1e-8)
        self.in_updates = get_adam_updates(params=self.in_params, \
                grads=self.joint_grads, alpha=self.lr_in, \
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
                mom2_init=1e-3, smoothing=1e-8)
        self.pn_updates = get_adam_updates(params=self.pn_params, \
                grads=self.joint_grads, alpha=self.lr_pn, \
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
                mom2_init=1e-3, smoothing=1e-8)
        # Disabled adadelta alternative, kept for reference.
        # NOTE(review): its last call uses self.lr_dn, which this constructor
        # never defines (the label net's rate is self.lr_pn).
        #self.gn_updates = get_adadelta_updates(params=self.gn_params, \
        #        grads=self.joint_grads, alpha=self.lr_gn, beta1=0.98)
        #self.in_updates = get_adadelta_updates(params=self.in_params, \
        #        grads=self.joint_grads, alpha=self.lr_in, beta1=0.98)
        #self.pn_updates = get_adadelta_updates(params=self.pn_params, \
        #        grads=self.joint_grads, alpha=self.lr_dn, beta1=0.98)

        # bag up all the updates required for training
        self.joint_updates = OrderedDict()
        for k in self.gn_updates:
            self.joint_updates[k] = self.gn_updates[k]
        for k in self.in_updates:
            self.joint_updates[k] = self.in_updates[k]
        for k in self.pn_updates:
            self.joint_updates[k] = self.pn_updates[k]

        # construct a training function for all parameters. training for the
        # various networks can be switched on and off via learning rates
        self.train_joint = self._construct_train_joint()
        return
Ejemplo n.º 21
0
 def raw_activation_fast(sl, e1, e2):
     """Return the raw (pre-nonlinearity) activation for slice ``sl``.

     The result is the sum of three terms: a bilinear term in ``e1`` and
     ``e2`` through ``self.W[:, :, sl]``, a linear term applying
     ``self.V[:, sl]`` to ``e1.T`` stacked on top of ``e2.T``, and the bias
     ``self.b[sl]``. ``self`` is taken from the enclosing scope.
     (Presumably e1 and e2 hold one embedding per row -- TODO confirm.)
     """
     # Bilinear part: project e1 through the sl-th slice of W, then take the
     # batched dot product with e2.
     projected_e1 = theano.dot(e1, self.W[:,:,sl])
     bilinear_term = T.batched_dot(projected_e1, e2)

     # Linear part: the sl-th column of V as a single row, applied to the
     # vertically stacked transposes of e1 and e2.
     v_row = T.reshape(self.V[:,sl],(1,-1))
     stacked_inputs = T.vertical_stack(e1.T, e2.T)
     linear_term = theano.dot(v_row, stacked_inputs)

     # Bias part
     bias_term = self.b[sl]

     return bilinear_term + linear_term + bias_term
Ejemplo n.º 22
0
    def __init__(self,
                 d,
                 V,
                 r,
                 nc,
                 nf,
                 pairwise_constraint=False,
                 embeddings=None,
                 fix_embeddings=False):
        #d = dimensionality of embeddings
        #V = size of vocabulary
        #r = number of dependency relations
        #nc = number of classes for classification

        #|V| x d embedding matrix
        if embeddings is None:
            self.We = theano.shared(
                name='embeddings',
                value=0.2 * np.random.uniform(-1.0, 1.0, (V, d))).astype(
                    theano.config.floatX)
        else:
            self.We = theano.shared(name='embeddings',
                                    value=embeddings).astype(
                                        theano.config.floatX)

        #r x d x d tensor (matrix for each dependency relation)
        self.Wr = theano.shared(
            name='dependencies',
            value=0.2 * np.random.uniform(-1.0, 1.0, (r, d, d))).astype(
                theano.config.floatX)

        #d x d map from embedding to hidden vector
        self.Wv = theano.shared(
            name='Wv',
            value=0.2 * np.random.uniform(-1.0, 1.0,
                                          (d, d))).astype(theano.config.floatX)

        #d long bias vector
        self.b = theano.shared(name='b',
                               value=np.zeros(d, dtype=theano.config.floatX))

        #weights for fine grained features plus bias
        #self.beta = theano.shared(name='beta',
        #                          value=0.2 * np.random.uniform(-1.0, 1.0, (nc, nf))
        #                          ).astype(theano.config.floatX)

        #low dimension approximation to classification parameters
        self.a = []
        for i in range(nc):
            a = []
            for j in range(3):
                a.append(
                    theano.shared(name='a_{}_{}'.format(i, j),
                                  value=0.2 *
                                  np.random.uniform(-1.0, 1.0, d)).astype(
                                      theano.config.floatX))
                #value=np.zeros(d, dtype=theano.config.floatX)))
            self.a.append(a)

        self.pairwise_constraint = pairwise_constraint

        if fix_embeddings:
            self.params = [self.Wr, self.Wv, self.b
                           ] + [j for i in self.a for j in i]  # + [self.beta]
        else:
            self.params = [self.We, self.Wr, self.Wv, self.b
                           ] + [j for i in self.a for j in i]  # + [self.beta]

        self.descender = Adagrad(self.params)

        #self.f = T.tanh
        self.f = normalized_tanh

        def recurrence(n, hidden_states, hidden_sums, x, r, p):
            #at each node n in the tree, calculate Wr(p,n) \dot f(W_v \dot We_word(n) + b + sum_n) and add to sum_p
            h_n = self.f(T.dot(self.Wv, x[n]) + self.b + hidden_sums[n])
            sum_n = T.dot(r[n], h_n)

            return T.set_subtensor(hidden_states[n], h_n), T.inc_subtensor(
                hidden_sums[p[n]], sum_n)

        idxs = []
        x = []
        rel_idxs = []
        r = []
        p = []
        hidden_sums = []
        hidden_states = []
        h = []
        s = []
        if pairwise_constraint:
            num_events = 4
        else:
            num_events = 2

        for i in range(num_events):
            idxs.append(T.ivector('idxs'))
            x.append(self.We[idxs[i]])

            rel_idxs.append(T.ivector('rel_idxs'))
            r.append(self.Wr[rel_idxs[i]])

            p.append(T.ivector('parents'))

            hidden_states.append(
                T.zeros((idxs[i].shape[0], d), dtype=theano.config.floatX))
            #needs to be sent_length + 1 to store final sum
            hidden_sums.append(
                T.zeros((idxs[i].shape[0] + 1, d), dtype=theano.config.floatX))

            h.append(None)
            s.append(None)
            [h[i], s[i]], updates = theano.scan(
                fn=recurrence,
                sequences=T.arange(x[i].shape[0]),
                outputs_info=[hidden_states[i], hidden_sums[i]],
                non_sequences=[x[i], r[i], p[i]])

        #A = T.dot(self.a_1, self.a_2.reshape((1, d))) + T.nlinalg.diag(self.a_3)
        #cost = T.dot(T.dot(h[0][-1, -1], A), h[1][-1, -1])
        #cost = T.dot(h[0][-1, -1], h[1][-1, -1])
        #grad = T.grad(cost, self.params)
        #self.cost_and_grad = theano.function(inputs=[idxs[0], rel_idxs[0], p[0], idxs[1], rel_idxs[1], p[1]],
        #                                     outputs=[cost] + grad)

        A_stack = []
        for i in range(len(self.a)):
            A_stack.append(
                T.dot(self.a[i][0].reshape((d, 1)), self.a[i][1].reshape(
                    (1, d))) + T.nlinalg.diag(self.a[i][2]))
        A = T.vertical_stack(*A_stack).reshape((d, d, nc))

        self.states = theano.function(
            inputs=[idxs[0], rel_idxs[0], p[0], idxs[1], rel_idxs[1], p[1]],
            outputs=[h[0], h[1]])

        #add fine-grained features
        #phi = T.vector('phi')

        p_y_given_x = T.nnet.softmax(
            T.dot(h[0][-1, -1], A).T.dot(h[1][-1,
                                              -1]))  # + T.dot(self.beta, phi))
        y_pred = T.argmax(p_y_given_x, axis=1)

        self.classify = theano.function(
            inputs=[idxs[0], rel_idxs[0], p[0], idxs[1], rel_idxs[1],
                    p[1]],  # , phi],
            outputs=y_pred)

        y = T.iscalar('y')

        if not pairwise_constraint:
            sentence_nll = -(T.log(p_y_given_x)[0, y])

            grad = T.grad(sentence_nll, self.params)

            self.cost_and_grad = theano.function(
                inputs=[
                    idxs[0], rel_idxs[0], p[0], idxs[1], rel_idxs[1], p[1], y
                ],  #, phi, y],
                outputs=[sentence_nll] + grad)
        else:
            lambda_e = T.scalar('lambda_e')

            phi2 = T.vector('phi2')
            p_y_given_x1 = T.nnet.softmax(
                T.dot(h[0][-1, -1], A).T.dot(h[1][-1, -1]) +
                T.dot(self.beta, phi))
            p_y_given_x2 = T.nnet.softmax(
                T.dot(h[2][-1, -1], A).T.dot(h[3][-1, -1]) +
                T.dot(self.beta, phi2))

            sentence_nll = -(T.log(p_y_given_x1)[0, y]) - (
                T.log(p_y_given_x2)[0, y])

            #add constraint that events should be maximally similar
            cost = sentence_nll - lambda_e * T.dot(h[0][-1, -1], h[2][
                -1, -1]) - lambda_e * T.dot(h[1][-1, -1], h[3][-1, -1])

            #grad = T.grad(sentence_nll, self.params[:4] + [A])
            grad = T.grad(cost, self.params)

            self.cost_and_grad = theano.function(inputs=[
                idxs[0], rel_idxs[0], p[0], idxs[1], rel_idxs[1], p[1], phi,
                idxs[2], rel_idxs[2], p[2], idxs[3], rel_idxs[3], p[3], phi2,
                y,
                theano.In(lambda_e, value=1)
            ],
                                                 outputs=[cost] + grad)
Ejemplo n.º 23
0
    def __init__(self, rng=None, Xd=None, Xp=None, d_net=None, g_net=None, \
                data_dim=None, params=None):
        """
        Wire a generator and a discriminator network together and build the
        symbolic costs, parameter updates and training functions for the pair.

        Although every argument defaults to None, rng, d_net, g_net and params
        are all dereferenced unconditionally below, so they must be supplied.

        Args:
            rng: random source providing randint(); used to seed the Theano
                random stream and forwarded to the network clones and to
                self._construct_disc_layers.
            Xd: symbolic variable for samples from the data distribution.
            Xp: symbolic variable for samples from the generator's prior.
            d_net: discriminator network providing shared_param_clone().
            g_net: generator network providing shared_param_clone().
            data_dim: required number of rows of the moment matching
                projection matrix.
            params: mapping with the keys 'lam_l2d', 'mom_mix_rate',
                'mom_match_weight', 'target_mean' and 'target_cov', and
                optionally 'mom_match_proj'. The target/projection entries
                must provide .astype/.shape (presumably numpy arrays).
        """
        # Seed a Theano random stream from the caller's random source.
        self.rng = theano.tensor.shared_randomstreams.RandomStreams( \
                rng.randint(100000))
        self.data_dim = data_dim

        # symbolic var for inputting samples from the data distribution
        self.Xd = Xd
        # symbolic var for inputting samples from the generator's prior
        self.Xp = Xp
        # symbolic (int64) vector of indices for data inputs
        self.Id = T.lvector(name='gcp_Id')
        # symbolic (int64) vector of indices for noise inputs
        self.In = T.lvector(name='gcp_In')

        # create clones of the given generator and discriminator, after
        # rewiring their computation graphs to take the right inputs; the
        # discriminator sees the data samples stacked on top of the
        # generator's output
        self.GN = g_net.shared_param_clone(rng=rng, Xp=self.Xp)
        self.DN = d_net.shared_param_clone(rng=rng, \
                Xd=T.vertical_stack(Xd, self.GN.output))

        # shared var learning rate for generator and discriminator
        zero_ary = np.zeros((1,)).astype(theano.config.floatX)
        self.lr_gn = theano.shared(value=zero_ary, name='gcp_lr_gn')
        self.lr_dn = theano.shared(value=zero_ary, name='gcp_lr_dn')
        # shared var momentum parameters for generator and discriminator
        self.mo_gn = theano.shared(value=zero_ary, name='gcp_mo_gn')
        self.mo_dn = theano.shared(value=zero_ary, name='gcp_mo_dn')
        # shared var weights for collaborative classification objective
        self.dw_gn = theano.shared(value=zero_ary, name='gcp_dw_gn')
        self.dw_dn = theano.shared(value=zero_ary, name='gcp_dw_dn')
        # init parameters for controlling learning dynamics (these helpers
        # are called after the shared vars above have been created)
        self.set_gn_sgd_params() # init SGD rate/momentum for GN
        self.set_dn_sgd_params() # init SGD rate/momentum for DN
        self.set_disc_weights()  # init collaborative cost weights for GN/DN
        self.lam_l2d = theano.shared(value=(zero_ary + params['lam_l2d']), \
                name='gcp_lam_l2d')

        #######################################################
        # Welcome to: Moment Matching Cost Information Center #
        #######################################################
        #
        # Get parameters for managing the moment matching cost. The moment
        # matching is based on exponentially-decaying estimates of the mean
        # and covariance of the distribution induced by the generator network
        # and the (latent) noise being fed to it.
        #
        # We provide the option of performing moment matching with either the
        # raw generator output, or with linearly-transformed generator output.
        # Either way, the given target mean and covariance should have the
        # appropriate dimension for the space in which we'll be matching the
        # generator's 1st/2nd moments with the target's 1st/2nd moments. For
        # clarity, the computation we'll perform looks like:
        #
        #   Xm = X - np.mean(X, axis=0)
        #   XmP = np.dot(Xm, P)
        #   C = np.dot(XmP.T, XmP)
        #
        # where Xm is the mean-centered samples from the generator and P is
        # the matrix for the linear transform to apply prior to computing
        # the moment matching cost. For simplicity, the above code ignores the
        # use of an exponentially decaying average to track the estimated mean
        # and covariance of the generator's output distribution.
        #
        # The relative contribution of the current batch to these running
        # estimates is determined by self.mom_mix_rate. The mean estimate is
        # first updated based on the current batch, then the current batch
        # is centered with the updated mean, then the covariance estimate is
        # updated with the mean-centered samples in the current batch.
        #
        # The weight of the moment matching cost is stored in
        # self.mom_match_weight (presumably applied inside
        # self._construct_mom_stuff, which is not visible here).
        # Target mean/covariance are given by self.target_mean/self.target_cov.
        # If a linear transform is to be applied prior to matching, it is given
        # by self.mom_match_proj.
        #
        # NOTE: zero_ary is rebound here without the floatX cast; the values
        # derived from it are cast explicitly before being shared.
        zero_ary = np.zeros((1,))
        mmr = zero_ary + params['mom_mix_rate']
        self.mom_mix_rate = theano.shared(name='gcp_mom_mix_rate', \
            value=mmr.astype(theano.config.floatX))
        mmw = zero_ary + params['mom_match_weight']
        self.mom_match_weight = theano.shared(name='gcp_mom_match_weight', \
            value=mmw.astype(theano.config.floatX))
        targ_mean = params['target_mean'].astype(theano.config.floatX)
        targ_cov = params['target_cov'].astype(theano.config.floatX)
        assert(targ_mean.size == targ_cov.shape[0]) # mean and cov use same dim
        assert(targ_cov.shape[0] == targ_cov.shape[1]) # cov must be square
        self.target_mean = theano.shared(value=targ_mean, name='gcp_target_mean')
        self.target_cov = theano.shared(value=targ_cov, name='gcp_target_cov')
        mmp = np.identity(targ_cov.shape[0]) # default to identity transform
        if 'mom_match_proj' in params:
            mmp = params['mom_match_proj'] # use a user-specified transform
        assert(mmp.shape[0] == self.data_dim) # transform matches data dim
        assert(mmp.shape[1] == targ_cov.shape[0]) # and matches mean/cov dims
        mmp = mmp.astype(theano.config.floatX)
        self.mom_match_proj = theano.shared(value=mmp, name='gcp_mom_map_proj')
        # finally, we can construct the moment matching cost! and the updates
        # for the running mean/covariance estimates too!
        self.mom_match_cost, self.mom_updates = self._construct_mom_stuff()
        #########################################
        # Thank you for visiting the M.M.C.I.C. #
        #########################################

        # Grab the full set of "optimizable" parameters from the generator
        # and discriminator networks that we'll be working with. We need to
        # ignore parameters in the final layers of the proto-networks in the
        # discriminator network (a generalized pseudo-ensemble). We ignore them
        # because the GCPair requires that they be "bypassed" in favor of some
        # binary classification layers that will be managed by this GCPair.
        self.dn_params = []
        for pn in self.DN.proto_nets:
            # every layer except the last one of each proto-net
            for pnl in pn[0:-1]:
                self.dn_params.extend(pnl.params)
        self.gn_params = [p for p in self.GN.mlp_params]
        # Now construct a binary discriminator layer for each proto-net in the
        # discriminator network. And, add their params to optimization list.
        self._construct_disc_layers(rng)
        self.disc_reg_cost = self.lam_l2d[0] * \
                T.sum([dl.act_l2_sum for dl in self.disc_layers])

        # Construct costs for the generator and discriminator networks based
        # on collaborative binary classification
        self.disc_cost_dn, self.disc_cost_gn = self._construct_disc_costs()

        # The discriminator cost is its collaborative binary classification
        # cost plus DN.act_reg_cost and the discriminator-layer penalty. The
        # generator cost is its collaborative binary classification cost plus
        # the moment matching cost and GN.act_reg_cost.
        self.dn_cost = self.disc_cost_dn + self.DN.act_reg_cost + self.disc_reg_cost
        self.gn_cost = self.disc_cost_gn + self.mom_match_cost + self.GN.act_reg_cost
        self.joint_cost = self.dn_cost + self.gn_cost

        # Initialize the per-parameter accumulators used by the mini-batch
        # updates (named "momentums", but they track a running average of the
        # squared gradient; see the update loops below). Each one starts at
        # 2.0 everywhere. All parameters need to be safely nestled in their
        # lists by now.
        self.joint_moms = OrderedDict()
        self.dn_moms = OrderedDict()
        self.gn_moms = OrderedDict()
        for p in self.gn_params:
            p_mo = np.zeros(p.get_value(borrow=True).shape) + 2.0
            self.gn_moms[p] = theano.shared(value=p_mo.astype(theano.config.floatX))
            self.joint_moms[p] = self.gn_moms[p]
        for p in self.dn_params:
            p_mo = np.zeros(p.get_value(borrow=True).shape) + 2.0
            self.dn_moms[p] = theano.shared(value=p_mo.astype(theano.config.floatX))
            self.joint_moms[p] = self.dn_moms[p]

        # Construct the updates for the generator and discriminator network
        self.joint_updates = OrderedDict()
        self.dn_updates = OrderedDict()
        self.gn_updates = OrderedDict()
        ###########################################
        # Construct updates for the discriminator #
        ###########################################
        for var in self.dn_params:
            # these updates are for trainable params in the discriminator net...
            # first, get gradient of cost w.r.t. var, treating the generator's
            # running moment estimates as constants
            var_grad = T.grad(self.dn_cost, var, \
                    consider_constant=[self.GN.dist_mean, self.GN.dist_cov])
            # get the accumulator for this var
            var_mom = self.dn_moms[var]
            # update the accumulator: decaying average of the squared gradient
            self.dn_updates[var_mom] = (self.mo_dn[0] * var_mom) + \
                    ((1.0 - self.mo_dn[0]) * (var_grad**2.0))
            self.joint_updates[var_mom] = self.dn_updates[var_mom]
            # step along the gradient, scaled by the root of the stored
            # accumulator (var_mom here is the shared variable itself, not the
            # updated expression built just above)
            var_new = var - (self.lr_dn[0] * (var_grad / T.sqrt(var_mom + 1e-2)))
            self.dn_updates[var] = var_new
            # add this var's update to the joint updates too
            self.joint_updates[var] = self.dn_updates[var]
        ########################################################
        # Construct updates for the moment tracking parameters #
        ########################################################
        for var in self.mom_updates:
            # these updates are for the generator distribution's running first
            # and second-order moment estimates
            self.gn_updates[var] = self.mom_updates[var]
            self.joint_updates[var] = self.gn_updates[var]
        #######################################
        # Construct updates for the generator #
        #######################################
        for var in self.gn_params:
            # these updates are for trainable params in the generator net...
            # first, get gradient of cost w.r.t. var, treating the generator's
            # running moment estimates as constants
            var_grad = T.grad(self.gn_cost, var, \
                    consider_constant=[self.GN.dist_mean, self.GN.dist_cov])
            # get the accumulator for this var
            var_mom = self.gn_moms[var]
            # update the accumulator: decaying average of the squared gradient
            self.gn_updates[var_mom] = (self.mo_gn[0] * var_mom) + \
                    ((1.0 - self.mo_gn[0]) * (var_grad**2.0))
            self.joint_updates[var_mom] = self.gn_updates[var_mom]
            # step along the gradient, scaled by the root of the stored
            # accumulator (same rule as for the discriminator above)
            var_new = var - (self.lr_gn[0] * (var_grad / T.sqrt(var_mom + 1e-2)))
            self.gn_updates[var] = var_new
            # add this var's update to the joint updates too
            self.joint_updates[var] = self.gn_updates[var]

        # Construct batch-based training functions for the generator and
        # discriminator networks, as well as a joint training function.
        self.train_gn = self._construct_train_gn()
        self.train_dn = self._construct_train_dn()
        self.train_joint = self._construct_train_joint()

        # Expose the generator clone's sampler for computing the outputs of
        # the generator network for a batch of noise. Presumably, the noise
        # will be drawn from the same distribution that was used in training....
        self.sample_from_gn = self.GN.sample_from_model
        return
Ejemplo n.º 24
0
    def __init__(self, rng=None, Xd=None, Xp=None, d_net=None, g_net=None, \
                 obs_dim=None, z_dim=None, params=None):
        """
        Wire a generator and a discriminator network together and build the
        symbolic costs, Adam parameter updates and training functions for the
        pair.

        Although every argument defaults to None, rng, d_net, g_net and params
        are all dereferenced unconditionally below, so they must be supplied.

        Args:
            rng: random source providing randint(); used to seed the random
                stream and forwarded to the network clones and to
                self._construct_disc_layers.
            Xd: symbolic variable for samples from the data distribution.
            Xp: symbolic variable for samples from the generator's prior.
            d_net: discriminator network providing shared_param_clone().
            g_net: generator network providing shared_layers and
                shared_param_clone().
            obs_dim: size of the running mean/covariance estimates and the
                required number of rows of the moment matching projection.
            z_dim: must equal g_net.shared_layers[0].in_dim.
            params: mapping with the keys 'lam_l2d', 'mom_mix_rate',
                'mom_match_weight', 'target_mean' and 'target_cov', and
                optionally 'mom_match_proj' and 'obs_transform' (either
                'sigmoid' or 'none'; sigmoid is used when the key is absent).
        """
        self.rng = RandStream(rng.randint(100000))
        self.obs_dim = obs_dim
        self.z_dim = z_dim
        self.params = params
        # check that z_dim agrees with input dim for g_net
        assert(self.z_dim == g_net.shared_layers[0].in_dim)
        # set the transform on generator's raw output
        if 'obs_transform' in self.params:
            assert((self.params['obs_transform'] == 'sigmoid') or
                    (self.params['obs_transform'] == 'none'))
            if self.params['obs_transform'] == 'sigmoid':
                self.obs_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.obs_transform = lambda x: x
        else:
            self.obs_transform = lambda x: T.nnet.sigmoid(x)

        # symbolic var for inputting samples from the data distribution
        self.Xd = Xd
        # symbolic var for inputting samples from the generator's prior
        self.Xp = Xp
        # symbolic (int64) vector of indices for data inputs
        self.Id = T.lvector(name='gcp_Id')
        # symbolic (int64) vector of indices for noise inputs
        self.In = T.lvector(name='gcp_In')

        # create clones of the given generator and discriminator, after
        # rewiring their computation graphs to take the right inputs; the
        # discriminator sees the data samples stacked on top of the
        # (transformed) generator samples
        self.GN = g_net.shared_param_clone(rng=rng, Xd=self.Xp)
        self.out_mean, self.out_logvar, self.out_samples = \
                self.GN.apply(self.Xp, do_samples=True)
        self.Xg = self.obs_transform(self.out_samples)
        self.DN = d_net.shared_param_clone(rng=rng,
                Xd=T.vertical_stack(self.Xd, self.Xg))

        # shared var learning rate for generator and discriminator
        zero_ary = to_fX( np.zeros((1,)) )
        self.lr_gn = theano.shared(value=zero_ary, name='gcp_lr_gn')
        self.lr_dn = theano.shared(value=zero_ary, name='gcp_lr_dn')
        # shared var Adam parameters, used for both generator and
        # discriminator updates
        self.mom_1 = theano.shared(value=zero_ary, name='msm_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='msm_mom_2')
        self.it_count = theano.shared(value=zero_ary, name='msm_it_count')
        # shared var weights for collaborative classification objective
        self.dw_gn = theano.shared(value=zero_ary, name='gcp_dw_gn')
        self.dw_dn = theano.shared(value=zero_ary, name='gcp_dw_dn')
        # init parameters for controlling learning dynamics (these helpers
        # are called after the shared vars above have been created)
        self.set_sgd_params()    # init SGD rate/momentum
        self.set_disc_weights()  # init collaborative cost weights for GN/DN
        self.lam_l2d = theano.shared(
                value=(zero_ary + self.params['lam_l2d']),
                name='gcp_lam_l2d')

        #######################################################
        # Welcome to: Moment Matching Cost Information Center #
        #######################################################
        #
        # Get parameters for managing the moment matching cost. The moment
        # matching is based on exponentially-decaying estimates of the mean
        # and covariance of the distribution induced by the generator network
        # and the (latent) noise being fed to it.
        #
        # We provide the option of performing moment matching with either the
        # raw generator output, or with linearly-transformed generator output.
        # Either way, the given target mean and covariance should have the
        # appropriate dimension for the space in which we'll be matching the
        # generator's 1st/2nd moments with the target's 1st/2nd moments. For
        # clarity, the computation we'll perform looks like:
        #
        #   Xm = X - np.mean(X, axis=0)
        #   XmP = np.dot(Xm, P)
        #   C = np.dot(XmP.T, XmP)
        #
        # where Xm is the mean-centered samples from the generator and P is
        # the matrix for the linear transform to apply prior to computing
        # the moment matching cost. For simplicity, the above code ignores the
        # use of an exponentially decaying average to track the estimated mean
        # and covariance of the generator's output distribution.
        #
        # The relative contribution of the current batch to these running
        # estimates is determined by self.mom_mix_rate. The mean estimate is
        # first updated based on the current batch, then the current batch
        # is centered with the updated mean, then the covariance estimate is
        # updated with the mean-centered samples in the current batch.
        #
        # The weight of the moment matching cost is stored in
        # self.mom_match_weight (presumably applied inside
        # self._construct_mom_stuff, which is not visible here).
        # Target mean/covariance are given by self.target_mean/self.target_cov.
        # If a linear transform is to be applied prior to matching, it is given
        # by self.mom_match_proj.
        #
        # running estimates of the generator distribution's covariance/mean,
        # both initialised to zeros
        C_init = to_fX( np.zeros((self.obs_dim, self.obs_dim)) )
        m_init = to_fX( np.zeros((self.obs_dim,)) )
        self.dist_cov = theano.shared(C_init, name='gcp_dist_cov')
        self.dist_mean = theano.shared(m_init, name='gcp_dist_mean')

        zero_ary = np.zeros((1,))
        mmr = zero_ary + self.params['mom_mix_rate']
        self.mom_mix_rate = theano.shared(name='gcp_mom_mix_rate',
            value=to_fX(mmr))
        mmw = zero_ary + self.params['mom_match_weight']
        self.mom_match_weight = theano.shared(name='gcp_mom_match_weight',
            value=to_fX(mmw))
        targ_mean = to_fX( self.params['target_mean'] )
        targ_cov = to_fX( self.params['target_cov'] )
        assert(targ_mean.size == targ_cov.shape[0]) # mean and cov use same dim
        assert(targ_cov.shape[0] == targ_cov.shape[1]) # cov must be square
        self.target_mean = theano.shared(value=targ_mean, name='gcp_target_mean')
        self.target_cov = theano.shared(value=targ_cov, name='gcp_target_cov')
        mmp = np.identity(targ_cov.shape[0]) # default to identity transform
        if 'mom_match_proj' in self.params:
            mmp = self.params['mom_match_proj'] # use a user-specified transform
        assert(mmp.shape[0] == self.obs_dim) # transform matches data dim
        assert(mmp.shape[1] == targ_cov.shape[0]) # and matches mean/cov dims
        mmp = to_fX( mmp )
        self.mom_match_proj = theano.shared(value=mmp, name='gcp_mom_map_proj')
        # finally, we can construct the moment matching cost! and the updates
        # for the running mean/covariance estimates too!
        self.mom_match_cost, self.mom_updates = self._construct_mom_stuff()
        #########################################
        # Thank you for visiting the M.M.C.I.C. #
        #########################################

        # Grab the full set of "optimizable" parameters from the generator
        # and discriminator networks that we'll be working with. We need to
        # ignore parameters in the final layers of the proto-networks in the
        # discriminator network (a generalized pseudo-ensemble). We ignore them
        # because the GCPair requires that they be "bypassed" in favor of some
        # binary classification layers that will be managed by this GCPair.
        self.dn_params = []
        for pn in self.DN.proto_nets:
            # every layer except the last one of each proto-net
            for pnl in pn[0:-1]:
                self.dn_params.extend(pnl.params)
        self.gn_params = list(self.GN.mlp_params)
        self.joint_params = self.dn_params + self.gn_params
        # Now construct a binary discriminator layer for each proto-net in the
        # discriminator network. And, add their params to optimization list.
        self._construct_disc_layers(rng)
        self.disc_reg_cost = self.lam_l2d[0] * \
                T.sum([dl.act_l2_sum for dl in self.disc_layers])

        # Construct costs for the generator and discriminator networks based
        # on collaborative binary classification
        self.disc_cost_dn, self.disc_cost_gn = self._construct_disc_costs()

        # compute small l2 penalty on params
        self.dn_l2_cost = constFX(1e-4) * T.sum([T.sum(p**2.0) for p in self.dn_params])
        self.gn_l2_cost = constFX(1e-4) * T.sum([T.sum(p**2.0) for p in self.gn_params])

        # The discriminator cost is its collaborative binary classification
        # cost plus the discriminator-layer penalty and the l2 penalty. The
        # generator cost is its collaborative binary classification cost plus
        # the moment matching cost and the l2 penalty.
        self.dn_cost = self.disc_cost_dn + self.disc_reg_cost + self.dn_l2_cost
        self.gn_cost = self.disc_cost_gn + self.mom_match_cost + self.gn_l2_cost
        self.joint_cost = self.dn_cost + self.gn_cost

        # Compute gradients on generator and discriminator parameters, keyed
        # by parameter and kept in parameter order
        print("Computing gradients on generator...")
        self.gn_grads = OrderedDict(
                zip(self.gn_params, T.grad(self.gn_cost, self.gn_params)))
        print("Computing gradients on discriminator...")
        self.dn_grads = OrderedDict(
                zip(self.dn_params, T.grad(self.dn_cost, self.dn_params)))

        # Construct the updates for the generator and discriminator network
        self.joint_updates = OrderedDict()
        self.gn_updates = OrderedDict()
        for var in self.mom_updates:
            # these updates are for the generator distribution's running first
            # and second-order moment estimates
            self.gn_updates[var] = self.mom_updates[var]
            self.joint_updates[var] = self.gn_updates[var]
        # Adam updates for the discriminator parameters
        self.dn_updates = get_adam_updates(params=self.dn_params,
                grads=self.dn_grads, alpha=self.lr_dn,
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count,
                mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)
        # Adam updates for the generator parameters. These are merged INTO
        # gn_updates rather than assigned over it: overwriting the dict would
        # drop the moment-estimate updates registered above, leaving them
        # only in joint_updates.
        gn_adam_updates = get_adam_updates(params=self.gn_params,
                grads=self.gn_grads, alpha=self.lr_gn,
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count,
                mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)
        for k in gn_adam_updates:
            self.gn_updates[k] = gn_adam_updates[k]
        # joint updates: discriminator first, then generator, so a key present
        # in both ends up with the generator's update expression
        for k in self.dn_updates:
            self.joint_updates[k] = self.dn_updates[k]
        for k in self.gn_updates:
            self.joint_updates[k] = self.gn_updates[k]

        # Construct batch-based training functions for the generator and
        # discriminator networks, as well as a joint training function.
        print("Compiling generator training function...")
        self.train_gn = self._construct_train_gn()
        print("Compiling discriminator training function...")
        self.train_dn = self._construct_train_dn()
        print("Compiling joint training function...")
        self.train_joint = self._construct_train_joint()

        # Construct a function for computing the outputs of the generator
        # network for a batch of noise. Presumably, the noise will be drawn
        # from the same distribution that was used in training....
        self.sample_from_gn = self._construct_model_sampler()
        return