def srr_step_func(ss, zi_zmuv, p_masks, q_masks, si, mi_p, mi_q):
            # transform the current belief state into an observation
            si_as_x = self._from_si_to_x(si)
            full_grad = T.log(1.0 + T.exp(ss)) * (self.x_out - si_as_x)

            # get the masked belief state and gradient for primary policy
            xi_for_p = (mi_p * self.x_out) + ((1.0 - mi_p) * si_as_x)
            grad_for_p = mi_p * full_grad

            # update the guide policy's revelation mask
            new_to_q = (1.0 - mi_q) * q_masks
            mip1_q = mi_q + new_to_q
            # get the masked belief state and gradient for guide policy
            # xi_for_q = (mip1_q * self.x_out) + ((1.0 - mip1_q) * si_as_x)
            xi_for_q = xi_for_p
            grad_for_q = mip1_q * full_grad

            # get samples of next zi, according to the primary policy
            zi_p_mean, zi_p_logvar = self.p_zi_given_xi.apply(
                T.horizontal_stack(xi_for_p, grad_for_p), do_samples=False
            )
            zi_p = zi_p_mean + (T.exp(0.5 * zi_p_logvar) * zi_zmuv)
            # get samples of next zi, according to the guide policy
            zi_q_mean, zi_q_logvar = self.q_zi_given_xi.apply(
                T.horizontal_stack(xi_for_q, grad_for_q), do_samples=False
            )
            zi_q = zi_q_mean + (T.exp(0.5 * zi_q_logvar) * zi_zmuv)
            # make zi samples that can be switched between zi_p and zi_q
            zi = (self.train_switch[0] * zi_q) + ((1.0 - self.train_switch[0]) * zi_p)

            # compute relevant KLds for this step
            kldi_q2p = gaussian_kld(zi_q_mean, zi_q_logvar, zi_p_mean, zi_p_logvar)  # KL(q || p)
            kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar, zi_q_mean, zi_q_logvar)  # KL(p || q)
            kldi_p2g = gaussian_kld(zi_p_mean, zi_p_logvar, 0.0, 0.0)  # KL(p || N(0, I))

            # compute next si, given sampled zi (i.e. update the belief state)
            hydra_out = self.p_sip1_given_zi.apply(zi)
            si_step = hydra_out[0]
            if self.step_type == "jump":
                # jump steps always do a full swap of belief state
                sip1 = si_step
            else:
                # additive steps adjust the belief state like an LSTM
                write_gate = T.nnet.sigmoid(2.0 + hydra_out[1])
                erase_gate = T.nnet.sigmoid(2.0 + hydra_out[2])
                sip1 = (erase_gate * si) + (write_gate * si_step)
            # update the primary policy's revelation mask
            new_to_p = (1.0 - mi_p) * p_masks
            mip1_p = mi_p + new_to_p
            # compute NLL only for the newly revealed values
            nlli = self._construct_nll_costs(sip1, self.x_out, new_to_p)
            # each loop iteration produces the following values:
            #   sip1: belief state at end of current step
            #   mip1_p: revealed values mask to use in next step (primary)
            #   mip1_q: revealed values mask to use in next step (guide)
            #   nlli: NLL for values revealed at end of current step
            #   kldi_q2p: KL(q || p) for the current step
            #   kldi_p2q: KL(p || q) for the current step
            #   kldi_p2g: KL(p || N(0,I)) for the current step
            return sip1, mip1_p, mip1_q, nlli, kldi_q2p, kldi_p2q, kldi_p2g
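Both policies in the step above receive their conditioning information as one matrix built with T.horizontal_stack, which concatenates 2-D tensors along axis 1. A minimal, self-contained sketch of that stacking pattern (variable names and shapes here are illustrative, not taken from the model above):

import numpy as np
import theano
import theano.tensor as T

xi = T.matrix('xi')    # e.g. a masked observation, shape (batch, x_dim)
gi = T.matrix('gi')    # e.g. a masked gradient, shape (batch, x_dim)
stacked = T.horizontal_stack(xi, gi)        # shape (batch, 2 * x_dim)
f = theano.function([xi, gi], stacked)

a = np.ones((2, 3), dtype=theano.config.floatX)
b = np.zeros((2, 3), dtype=theano.config.floatX)
print(f(a, b).shape)                        # (2, 6)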
Example #2
    def __call__(self, v, output_type='h'):

        if hasattr(self, 'layer1_model'):
            h1 = self.l1model.h_given_v(v)
            s1 = self.l1model.s_given_hv(h1, v)

        if hasattr(self, 'layer2_model'):
            # preprocessor for input data
            h2 = self.l2model.h_given_vx(h1, s1)
            s2 = self.l2model.s_given_vxh(h1, s1, h2)
            h3 = self.h_given_vx(h2, s2)
            return T.horizontal_stack(h1, h2, h3)
        else:
            h2 = self.h_given_vx(h1, s1)
            return T.horizontal_stack(h1, h2)
Example #3
    def __call__(self, v, output_type='h'):

        if hasattr(self, 'layer1_model'):
            h1 = self.l1model.h_given_v(v)
            s1 = self.l1model.s_given_hv(h1, v)

        if hasattr(self, 'layer2_model'):
            # preprocessor for input data
            h2 = self.l2model.h_given_vx(h1, s1)
            s2 = self.l2model.s_given_vxh(h1, s1, h2)
            h3 = self.h_given_vx(h2, s2)
            return T.horizontal_stack(h1, h2, h3)
        else:
            h2 = self.h_given_vx(h1, s1)
            return T.horizontal_stack(h1, h2)
Example #4
    def __call__(self, v, output_type='fg+fh'):
        print 'Building representation with %s' % output_type
        [g, h, s] = self.e_step(v, n_steps=self.pos_mf_steps)

        atoms = {
                'g_s' : T.dot(g, self.Wg),  # g in s-space
                'h_s' : T.dot(h, self.Wh),  # h in s-space
                's_g' : T.sqrt(T.dot(s**2, self.Wg.T)),
                's_h' : T.sqrt(T.dot(s**2, self.Wh.T)),
                's_g__h' : T.sqrt(T.dot(s**2 * T.dot(h, self.Wh), self.Wg.T)),
                's_h__g' : T.sqrt(T.dot(s**2 * T.dot(g, self.Wg), self.Wh.T))
                }

        output_prods = {
                ## factored representations
                'g' : g,
                'h' : h,
                'gs': g * atoms['s_g'],
                'hs': h * atoms['s_h'],
                's_g': atoms['s_g'],
                's_h': atoms['s_h'],
                ## unfactored representations
                'sg_s' : atoms['g_s'] * s,
                'sh_s' : atoms['h_s'] * s,
                }

        toks = output_type.split('+')
        output = output_prods[toks[0]]
        for tok in toks[1:]:
            output = T.horizontal_stack(output, output_prods[tok])

        return output
Example #5
def get_eem_predict_function(metric_name):
    W = T.dmatrix('W')
    X = T.dmatrix('X')

    beta = T.dmatrix('beta')
    m_plus = T.dmatrix('m_plus')
    m_minus = T.dmatrix('m_minus')
    sigma_plus = T.dmatrix('sigma_plus')
    sigma_minus = T.dmatrix('sigma_minus')

    H = metric_theano[metric_name](X, W)

    def gaussian(x, mu, sigma):
        return T.exp(T.power((x - mu[0]), 2) / (-2 * sigma)[0]) / (sigma * T.sqrt(2 * np.pi))[0]

    x = T.dot(H, beta)
    r_plus = gaussian(x, T.dot(beta.T, m_plus),
                      T.dot(T.dot(beta.T, sigma_plus), beta))
    r_minus = gaussian(x, T.dot(beta.T, m_minus),
                       T.dot(T.dot(beta.T, sigma_minus), beta))

    result = T.argmax(T.horizontal_stack(r_minus, r_plus), axis=1)

    eem_predict_function = theano.function(
        [X, W, beta, m_plus, m_minus, sigma_plus, sigma_minus], result)
    return eem_predict_function
Example #6
        def imp_step_func(zi_zmuv, si):
            si_as_x = self._from_si_to_x(si)
            xi_unmasked = self.x_out
            xi_masked = (self.x_mask * xi_unmasked) + \
                        ((1.0 - self.x_mask) * si_as_x)
            grad_unmasked = self.x_out - si_as_x
            grad_masked = (self.x_mask * grad_unmasked) + \
                          ((1.0 - self.x_mask) * self.grad_null)
            # get samples of next zi, according to the global policy
            zi_p_mean, zi_p_logvar = self.p_zi_given_xi.apply( \
                    T.horizontal_stack(xi_masked, grad_masked), \
                    do_samples=False)
            zi_p = zi_p_mean + (T.exp(0.5 * zi_p_logvar) * zi_zmuv)
            # get samples of next zi, according to the guide policy
            zi_q_mean, zi_q_logvar = self.q_zi_given_xi.apply( \
                    T.horizontal_stack(xi_masked, grad_unmasked), \
                    do_samples=False)
            zi_q = zi_q_mean + (T.exp(0.5 * zi_q_logvar) * zi_zmuv)

            # make zi samples that can be switched between zi_p and zi_q
            zi = ((self.train_switch[0] * zi_q) + \
                 ((1.0 - self.train_switch[0]) * zi_p))
            # compute relevant KLds for this step
            kldi_q2p = gaussian_kld(zi_q_mean, zi_q_logvar, \
                                    zi_p_mean, zi_p_logvar) # KL(q || p)
            kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar, \
                                    zi_q_mean, zi_q_logvar) # KL(p || q)
            kldi_p2g = gaussian_kld(zi_p_mean, zi_p_logvar, \
                                    0.0, 0.0) # KL(p || global prior)

            # compute the next si, given the sampled zi
            hydra_out = self.p_sip1_given_zi.apply(zi)
            si_step = hydra_out[0]
            if (self.step_type == 'jump'):
                # jump steps always completely overwrite the current guesses
                sip1 = si_step
            else:
                # additive steps update the current guesses like an LSTM
                write_gate = T.nnet.sigmoid(3.0 + hydra_out[1])
                erase_gate = T.nnet.sigmoid(3.0 + hydra_out[2])
                sip1 = (erase_gate * si) + (write_gate * si_step)
            # compute NLL for the current imputation
            nlli = self._construct_nll_costs(sip1, self.x_out, self.x_mask)
            return sip1, nlli, kldi_q2p, kldi_p2q, kldi_p2g
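gaussian_kld above is assumed to be the closed-form KL divergence between diagonal Gaussians given as means and log-variances; passing 0.0, 0.0 for the second pair gives the KL against a standard normal prior. A hedged sketch of such a helper, using the standard formula (not necessarily the exact implementation imported by these examples):

import theano.tensor as T

def gaussian_kld(mu_q, logvar_q, mu_p, logvar_p):
    # elementwise KL( N(mu_q, exp(logvar_q)) || N(mu_p, exp(logvar_p)) )
    return 0.5 * (logvar_p - logvar_q
                  + (T.exp(logvar_q) + (mu_q - mu_p)**2) / T.exp(logvar_p)
                  - 1.0)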
Example #7
    def fit_cross_cov(self, n_exp=2, n_gauss=2, range_mu=None):
        """
        Fit an analytical covariance to the experimental data.
        Args:
            n_exp (int): number of exponential basis functions
            n_gauss (int): number of Gaussian basis functions
            range_mu: prior mean of the range. Defaults to the mean of the lags.

        Returns:
            pymc.Model: PyMC3 model to be sampled using MCMC
        """
        self.n_exp = n_exp
        self.n_gauss = n_gauss
        n_var = self.n_properties
        df = self.exp_var
        lags = self.lags

        # Prior standard deviation for the error of the regression
        prior_std_reg = df.std(0).max() * 10

        # Prior value for the mean of the ranges
        if not range_mu:
            range_mu = lags.mean()

        # pymc3 Model
        with pm.Model() as model:  # model specifications in PyMC3 are wrapped in a with-statement
            # Define priors
            sigma = pm.HalfCauchy('sigma', beta=prior_std_reg, testval=1., shape=n_var)

            psill = pm.Normal('sill', prior_std_reg, sd=.5 * prior_std_reg, shape=(n_exp + n_gauss))
            range_ = pm.Normal('range', range_mu, sd=range_mu * .3, shape=(n_exp + n_gauss))

            lambda_ = pm.Uniform('weights', 0, 1, shape=(n_var * (n_exp + n_gauss)))

            # Exponential covariance
            exp = pm.Deterministic('exp',
                                   # (lambda_[:n_exp*n_var]*
                                   psill[:n_exp] *
                                   (1. - T.exp(T.dot(-lags.as_matrix().reshape((len(lags), 1)),
                                                     (range_[:n_exp].reshape((1, n_exp)) / 3.) ** -1))))

            gauss = pm.Deterministic('gaus',
                                     psill[n_exp:] *
                                     (1. - T.exp(T.dot(-lags.as_matrix().reshape((len(lags), 1)) ** 2,
                                                       (range_[n_exp:].reshape((1, n_gauss)) * 4 / 7.) ** -2))))

            # We stack the basis functions in the same matrix and tile it to match the number of properties we have
            func = pm.Deterministic('func', T.tile(T.horizontal_stack(exp, gauss), (n_var, 1, 1)))

            # We weight each basis function and sum them
            func_w = pm.Deterministic("func_w", T.sum(func * lambda_.reshape((n_var, 1, (n_exp + n_gauss))), axis=2))

            for e, cross in enumerate(df.columns):
                # Likelihoods
                pm.Normal(cross + "_like", mu=func_w[e], sd=sigma[e], observed=df[cross].as_matrix())
        return model
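fit_cross_cov returns a PyMC3 model rather than sampling it; under the usual PyMC3 workflow the result would then be sampled with MCMC. A hedged usage sketch (`variogram` is a hypothetical instance of whatever class defines this method):

import pymc3 as pm

model = variogram.fit_cross_cov(n_exp=2, n_gauss=2)   # `variogram` is a placeholder name
with model:
    trace = pm.sample(1000, tune=1000)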
Example #8
 def __init__(self, inpt, in_sz, n_classes, tied=False):
     if tied:
         b = share(init_wts(n_classes-1))
         w = share(init_wts(in_sz, n_classes-1))
         w1 = tt.horizontal_stack(w, tt.zeros((in_sz, 1)))
         b1 = tt.concatenate((b, tt.zeros(1)))
         self.output = tt.dot(inpt, w1) + b1
     else:
         b = share(init_wts(n_classes))
         w = share(init_wts(in_sz, n_classes))
         self.output = tt.dot(inpt, w) + b
     self.params = [w, b]
Example #9
 def __init__(self, inpt, in_sz, n_classes, tied=False):
     if tied:
         b = share(init_wts(n_classes-1))
         w = share(init_wts(in_sz, n_classes-1))
         w1 = TT.horizontal_stack(w, TT.zeros((in_sz, 1)))
         b1 = TT.concatenate((b, TT.zeros(1)))
         self.output = TT.dot(inpt, w1) + b1
     else:
         b = share(init_wts(n_classes))
         w = share(init_wts(in_sz, n_classes))
         self.output = TT.dot(inpt, w) + b
     self.params = [w, b]
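In the tied case above, the last class acts as a fixed reference: a zero column is appended to the learned weights and a zero to the bias, so only n_classes - 1 columns are trained. A small shape check of that construction, with share/init_wts replaced by plain shared variables for illustration:

import numpy as np
import theano
import theano.tensor as tt

in_sz, n_classes = 4, 3
w = theano.shared(np.zeros((in_sz, n_classes - 1), dtype=theano.config.floatX))
b = theano.shared(np.zeros((n_classes - 1,), dtype=theano.config.floatX))
w1 = tt.horizontal_stack(w, tt.zeros((in_sz, 1)))   # shape (in_sz, n_classes)
b1 = tt.concatenate((b, tt.zeros(1)))               # shape (n_classes,)
print(w1.eval().shape)                              # (4, 3)
print(b1.eval().shape)                              # (3,)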
Example #10
 def __init__(self, theta):
     self.x = T.dmatrix('x')
     self.y = T.dmatrix('y')
     self.n = self.x.shape[0]
     self.theta = theano.shared(theta)
     self.a = T.horizontal_stack((self.x.dot(self.theta)).reshape([self.n, 1]), T.zeros([self.n, 1]))
     self.prob = T.nnet.softmax(self.a)# T.exp(self.log_prob)
     self.l = T.dscalar('l')
     self.cost = -(T.sum(T.log(T.nnet.softmax(self.a))*self.y) / self.n) + self.l*T.sum(self.theta[2:]**2)/self.n
     self.grad = T.grad(self.cost, wrt=self.theta)
     self.hessian = theano.gradient.hessian(self.cost, self.theta)
     self.pred = self.prob > .5
     self.predict = theano.function(inputs=[self.x], outputs=[self.pred])
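Stacking the single score column with a column of zeros and applying softmax, as in self.a/self.prob above, reproduces binary logistic regression: row-wise, softmax([a, 0]) equals [sigmoid(a), 1 - sigmoid(a)]. A quick numerical check of that equivalence:

import numpy as np
import theano
import theano.tensor as T

a = T.dmatrix('a')                                  # one score per row, shape (n, 1)
two_col = T.horizontal_stack(a, T.zeros_like(a))    # shape (n, 2)
f = theano.function([a], [T.nnet.softmax(two_col), T.nnet.sigmoid(a)])
sm, sg = f(np.array([[0.3], [-1.2]]))
print(np.allclose(sm[:, 0:1], sg))                  # True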
Example #11
    def get_output_for(self, inputs, **kwargs):
        """Compute diffusion convolutional activation of inputs."""

        Apow = T.horizontal_stack(*inputs[:-1])

        X = inputs[-1]

        Apow_dot_X = T.dot(Apow, X)

        Apow_dot_X_times_W = Apow_dot_X * self.W

        out = self.nonlinearity(Apow_dot_X_times_W)

        return out
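T.horizontal_stack accepts any number of 2-D arguments, which is why the layer above can splat a whole list of hop matrices into one wide matrix with T.horizontal_stack(*inputs[:-1]). A small standalone illustration of the variadic call (the matrices here are placeholders, not real transition powers):

import numpy as np
import theano
import theano.tensor as T

hops = [T.matrix('hop%d' % k) for k in range(3)]
wide = T.horizontal_stack(*hops)                    # concatenate along axis 1
f = theano.function(hops, wide)
mats = [k * np.ones((2, 2), dtype=theano.config.floatX) for k in range(3)]
print(f(*mats).shape)                               # (2, 6)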
Example #12
    def __call__(self, v, output_type='g+h'):
        print 'Building representation with %s' % output_type
        init_state = OrderedDict()
        init_state['g'] = T.ones(
            (v.shape[0], self.n_g)) * T.nnet.sigmoid(self.gbias)
        init_state['h'] = T.ones(
            (v.shape[0], self.n_h)) * T.nnet.sigmoid(self.hbias)
        [g, h, s2_1, s2_0, v, pos_counter] = self.pos_phase(
            v, init_state, n_steps=self.pos_steps)
        s = s2_1

        atoms = {
            'g_s': self.from_g(g),  # g in s-space
            'h_s': self.from_h(h),  # h in s-space
            's_g': T.sqrt(self.to_g(s**2)),
            's_h': T.sqrt(self.to_h(s**2)),
            's_g__h': T.sqrt(self.to_g(s**2 * self.from_h(h))),
            's_h__g': T.sqrt(self.to_h(s**2 * self.from_g(g))),
        }

        output_prods = {
            ## factored representations
            'g':
            g,
            'h':
            h,
            'gh': (g.dimshuffle(0, 1, 'x') * h.dimshuffle(0, 'x', 1)).flatten(
                ndim=2),
            'gs':
            g * atoms['s_g'],
            'hs':
            h * atoms['s_h'],
            's_g':
            atoms['s_g'],
            's_h':
            atoms['s_h'],
            ## unfactored representations
            'sg_s':
            atoms['g_s'] * s,
            'sh_s':
            atoms['h_s'] * s,
        }

        toks = output_type.split('+')
        output = output_prods[toks[0]]
        for tok in toks[1:]:
            output = T.horizontal_stack(output, output_prods[tok])

        return output
Example #13
        def imp_step_func(zi_zmuv, si):
            si_as_x = self.obs_transform(si)
            xi_masked = (self.x_mask * self.x_out) + \
                        ((1.0 - self.x_mask) * si_as_x)
            #grad_ll = self.x_out - xi_masked
            # get samples of next zi, according to the global policy
            zi_p_mean, zi_p_logvar = self.p_zi_given_xi.apply( \
                    xi_masked, do_samples=False)
            zi_p = zi_p_mean + (T.exp(0.5 * zi_p_logvar) * zi_zmuv)
            # get samples of next zi, according to the guide policy
            zi_q_mean, zi_q_logvar = self.q_zi_given_x_xi.apply( \
                    T.horizontal_stack(xi_masked, self.x_out), \
                    do_samples=False)
            zi_q = zi_q_mean + (T.exp(0.5 * zi_q_logvar) * zi_zmuv)

            if self.use_osm_mode:
                zi = zi_p
                # compute relevant KLds for this step
                kldi_q2p = gaussian_kld(zi_p_mean, zi_p_logvar, 0.0, 0.0)
                kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar, 0.0, 0.0)
            else:
                # make zi samples that can be switched between zi_p and zi_q
                zi = ((self.train_switch[0] * zi_q) + \
                     ((1.0 - self.train_switch[0]) * zi_p))
                # compute relevant KLds for this step
                kldi_q2p = gaussian_kld(zi_q_mean, zi_q_logvar, \
                                        zi_p_mean, zi_p_logvar)
                kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar, \
                                        zi_q_mean, zi_q_logvar)

            # compute the next si, given the sampled zi
            hydra_out = self.p_xip1_given_zi.apply(zi)
            si_step = hydra_out[0]
            if (self.step_type == 'jump'):
                # jump steps always do a full swap (like standard VAE)
                sip1 = si_step
            else:
                # additive steps adjust the current guesses incrementally
                write_gate = T.nnet.sigmoid(2.0 + hydra_out[1])
                erase_gate = T.nnet.sigmoid(2.0 + hydra_out[2])
                # LSTM-style update
                sip1 = (erase_gate * si) + (write_gate * si_step)
                # normal update (this was used in workshop papers)
                #sip1 = si + si_step
            # compute NLL for the current imputation
            nlli = self._construct_nll_costs(sip1, self.x_out, 0.0*self.x_mask)
            return sip1, nlli, kldi_q2p, kldi_p2q
Example #14
    def get_output_for(self, input, **kwargs):
        distances = conv_pairwise_distance(input, self.V)
        similarities = T.exp(-distances / T.abs_(self.gamma))
        norm = T.sum(similarities, 1).reshape((similarities.shape[0], 1, similarities.shape[2], similarities.shape[3]))
        membership = similarities / (norm + self.eps)

        histogram = T.mean(membership, axis=(2, 3))
        if self.spatial_level == 1:
            pivot1, pivot2 = membership.shape[2] / 2, membership.shape[3] / 2
            h1 = T.mean(membership[:, :, :pivot1, :pivot2], axis=(2, 3))
            h2 = T.mean(membership[:, :, :pivot1, pivot2:], axis=(2, 3))
            h3 = T.mean(membership[:, :, pivot1:, :pivot2], axis=(2, 3))
            h4 = T.mean(membership[:, :, pivot1:, pivot2:], axis=(2, 3))
            # Pyramid is not used in the paper
            # histogram = T.horizontal_stack(h1, h2, h3, h4)
            histogram = T.horizontal_stack(histogram, h1, h2, h3, h4)
        return histogram
Example #15
    def get_output_for(self, inputs, deterministic=False, **kwargs):

        # extract inputs
        H1, H2 = inputs

        # running average projection matrix update
        if not deterministic:

            # compute batch mean
            mean1 = T.mean(H1, axis=0)
            mean2 = T.mean(H2, axis=0)

            # running average updates of means
            mean1 = (floatX(1.0 - self.alpha) * self.mean1 +
                     self.alpha * mean1)
            running_mean1 = theano.clone(self.mean1, share_inputs=False)
            running_mean1.default_update = mean1
            mean1 += 0 * running_mean1

            mean2 = (floatX(1.0 - self.alpha) * self.mean2 +
                     self.alpha * mean2)
            running_mean2 = theano.clone(self.mean2, share_inputs=False)
            running_mean2.default_update = mean2
            mean2 += 0 * running_mean2

            # hidden representations
            H1bar = H1 - mean1
            H2bar = H2 - mean2

        # use means of layer
        else:

            # hidden representations
            H1bar = H1 - self.mean1
            H2bar = H2 - self.mean2

        # re-project data
        lv1_cca = H1bar.dot(self.U)
        lv2_cca = H2bar.dot(self.V)

        output = T.horizontal_stack(lv1_cca, lv2_cca)

        return output
Example #16
def fit_cross_cov(df, lags, n_exp=2, n_gaus=2, range_mu=None):
    n_var = df.columns.shape[0]
    n_basis_f = n_var * (n_exp + n_gaus)
    prior_std_reg = df.std(0).max() * 10
    #
    if not range_mu:
        range_mu = lags.mean()

    # Because this is an experimental variogram, we do not expect outliers
    nugget_max = df.values.max()
    # print(n_basis_f, n_var*n_exp, nugget_max, range_mu, prior_std_reg)
    # pymc3 Model
    with pm.Model() as model:  # model specifications in PyMC3 are wrapped in a with-statement
        # Define priors
        sigma = pm.HalfCauchy('sigma', beta=prior_std_reg, testval=1., shape=n_var)

        psill = pm.Normal('sill', prior_std_reg, sd=.5 * prior_std_reg, shape=(n_exp + n_gaus))
        range_ = pm.Normal('range', range_mu, sd=range_mu * .3, shape=(n_exp + n_gaus))
        #  nugget = pm.Uniform('nugget', 0, nugget_max, shape=n_var)

        lambda_ = pm.Uniform('weights', 0, 1, shape=(n_var * (n_exp + n_gaus)))

        # Exponential covariance
        exp = pm.Deterministic('exp',
                               # (lambda_[:n_exp*n_var]*
                               psill[:n_exp] *
                               (1. - T.exp(T.dot(-lags.as_matrix().reshape((len(lags), 1)),
                                                 (range_[:n_exp].reshape((1, n_exp)) / 3.) ** -1))))

        gaus = pm.Deterministic('gaus',
                                psill[n_exp:] *
                                (1. - T.exp(T.dot(-lags.as_matrix().reshape((len(lags), 1)) ** 2,
                                                  (range_[n_exp:].reshape((1, n_gaus)) * 4 / 7.) ** -2))))

        func = pm.Deterministic('func', T.tile(T.horizontal_stack(exp, gaus), (n_var, 1, 1)))

        func_w = pm.Deterministic("func_w", T.sum(func * lambda_.reshape((n_var, 1, (n_exp + n_gaus))), axis=2))
        #           nugget.reshape((n_var,1)))

        for e, cross in enumerate(df.columns):
            # Likelihoods
            pm.Normal(cross + "_like", mu=func_w[e], sd=sigma[e], observed=df[cross].as_matrix())
    return model
Example #17
    def __call__(self, v, output_type='g+h', mean_field=True):
        print 'Building representation with %s' % output_type
        init_state = OrderedDict()
        init_state['g'] = T.ones((v.shape[0],self.n_g)) * T.nnet.sigmoid(self.gbias)
        init_state['h'] = T.ones((v.shape[0],self.n_h)) * T.nnet.sigmoid(self.hbias)
        init_state['l'] = T.ones((v.shape[0],self.n_l)) * T.nnet.softmax(self.lbias)
        [g, h, l] = self.pos_phase(v, init_state, n_steps=self.pos_steps, mean_field=mean_field)
        s = self.s_given_ghv(g, h, v)

        atoms = {
            'g_s' : T.dot(g, self.Wg),  # g in s-space
            'h_s' : T.dot(h, self.Wh),  # h in s-space
            's_g' : T.sqrt(T.dot(s**2, self.Wg.T)),
            's_h' : T.sqrt(T.dot(s**2, self.Wh.T)),
            's_g__h' : T.sqrt(T.dot(s**2 * T.dot(h, self.Wh), self.Wg.T)),
            's_h__g' : T.sqrt(T.dot(s**2 * T.dot(g, self.Wg), self.Wh.T))
        }

        output_prods = {
            ## factored representations
            'g' : g,
            'h' : h,
            'gh' : (g.dimshuffle(0,1,'x') * h.dimshuffle(0,'x',1)).flatten(ndim=2),
            'gs': g * atoms['s_g'],
            'hs': h * atoms['s_h'],
            's_g': atoms['s_g'],
            's_h': atoms['s_h'],
            ## unfactored representations
            'sg_s' : atoms['g_s'] * s,
            'sh_s' : atoms['h_s'] * s,
        }

        toks = output_type.split('+')
        output = output_prods[toks[0]]
        for tok in toks[1:]:
            output = T.horizontal_stack(output, output_prods[tok])

        return output
Example #18
    def __call__(self, v, output_type='h'):
        print 'Building representation with %s' % output_type
        init_state = OrderedDict()
        h = self.h_given_v(v)
        s = self.s_given_hv(h, v)

        atoms = {
            'h_s': self.from_h(h),  # h in s-space
            's_h': T.sqrt(self.to_h(s**2)),
        }

        output_prods = {
            'h': h,
            's': s,
            'hs': h * atoms['s_h'],
        }

        toks = output_type.split('+')
        output = output_prods[toks[0]]
        for tok in toks[1:]:
            output = T.horizontal_stack(output, output_prods[tok])

        return output
Example #19
    def __call__(self, v, output_type='h'):
        print 'Building representation with %s' % output_type
        init_state = OrderedDict()
        h = self.h_given_v(v)
        s = self.s_given_hv(h, v)

        atoms = {
            'h_s' : self.from_h(h),  # h in s-space
            's_h' : T.sqrt(self.to_h(s**2)),
        }

        output_prods = {
            'h' : h,
            's' : s,
            'hs': h * atoms['s_h'],
        }

        toks = output_type.split('+')
        output = output_prods[toks[0]]
        for tok in toks[1:]:
            output = T.horizontal_stack(output, output_prods[tok])

        return output
Example #20
 def apply(self, x_t, h_tm1, c_tm1):
     """
     Apply propagate the current input x_t and the previous exposed state
     and memory state h_tm1/c_tm1 through this LSTM layer.
     """
     hd = self.hid_dim
     # merge exogenous (i.e. x_t) and endogenous (i.e. h_tm1) inputs
     joint_input = T.horizontal_stack(x_t, h_tm1)
     joint_output = T.dot(joint_input, self.W_all) + self.b_all
     # compute transformed input to the layer
     g_t = T.tanh( joint_output[:,0:(1*hd)] )
     # compute input gate
     i_t = T.nnet.sigmoid( joint_output[:,(1*hd):(2*hd)] )
     # compute forget gate
     f_t = T.nnet.sigmoid( joint_output[:,(2*hd):(3*hd)] )
     # compute output gate
     o_t = T.nnet.sigmoid( joint_output[:,(3*hd):(4*hd)] )
     # compute updated memory state
     c_t = (f_t * c_tm1) + (i_t * g_t)
     # compute updated exposed state
     h_t = (o_t * T.tanh(c_t))
     return h_t, c_t
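The apply method above computes a single LSTM step; in practice such a step is driven by theano.scan over a sequence. A self-contained sketch of the same gating pattern wired into scan (the weights, dimensions, and initial states here are illustrative, not the class's own parameters):

import numpy as np
import theano
import theano.tensor as T

in_dim, hid_dim, batch = 5, 4, 2
rng = np.random.RandomState(0)
W_all = theano.shared(rng.normal(size=(in_dim + hid_dim, 4 * hid_dim))
                      .astype(theano.config.floatX))
b_all = theano.shared(np.zeros((4 * hid_dim,), dtype=theano.config.floatX))

def lstm_step(x_t, h_tm1, c_tm1):
    # merge exogenous and endogenous inputs, then slice out the four gates
    joint_input = T.horizontal_stack(x_t, h_tm1)
    joint_output = T.dot(joint_input, W_all) + b_all
    g_t = T.tanh(joint_output[:, 0:hid_dim])                        # candidate input
    i_t = T.nnet.sigmoid(joint_output[:, hid_dim:2 * hid_dim])      # input gate
    f_t = T.nnet.sigmoid(joint_output[:, 2 * hid_dim:3 * hid_dim])  # forget gate
    o_t = T.nnet.sigmoid(joint_output[:, 3 * hid_dim:4 * hid_dim])  # output gate
    c_t = (f_t * c_tm1) + (i_t * g_t)
    h_t = o_t * T.tanh(c_t)
    return h_t, c_t

X = T.tensor3('X')   # (seq_len, batch, in_dim)
h0 = T.zeros((X.shape[1], hid_dim))
c0 = T.zeros((X.shape[1], hid_dim))
(h_seq, c_seq), _ = theano.scan(lstm_step, sequences=[X], outputs_info=[h0, c0])
f = theano.function([X], h_seq)
x_np = rng.normal(size=(7, batch, in_dim)).astype(theano.config.floatX)
print(f(x_np).shape)   # (7, 2, 4)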
Example #21
    def __call__(self, v, output_type='g+h', mean_field=True):
        print 'Building representation with %s' % output_type
        init_state = OrderedDict()
        init_state['g'] = T.ones((v.shape[0],self.n_g)) * T.nnet.sigmoid(self.gbias)
        init_state['h'] = T.ones((v.shape[0],self.n_h)) * T.nnet.sigmoid(self.hbias)
        [g, h, pos_counter] = self.pos_phase(v, init_state, n_steps=self.pos_steps, mean_field=mean_field)

        atoms = {
                'g_s' : T.dot(g, self.Wg),  # g in s-space
                'h_s' : T.dot(h, self.Wh),  # h in s-space
                }

        output_prods = {
                'g' : g,
                'h' : h,
                'gh' : (g.dimshuffle(0,1,'x') * h.dimshuffle(0,'x',1)).flatten(ndim=2),
                }

        toks = output_type.split('+')
        output = output_prods[toks[0]]
        for tok in toks[1:]:
            output = T.horizontal_stack(output, output_prods[tok])

        return output
Example #22
        def ir_step_func(hi_zmuv, sim1):
            # get variables used throughout this refinement step
            sim1_obs = self.obs_transform(sim1) # transform state -> obs
            grad_ll = self.x_out - sim1_obs

            # get samples of next hi, conditioned on current si
            hi_p_mean, hi_p_logvar = self.p_hi_given_si.apply( \
                    sim1_obs, do_samples=False)
            # now we build the model for variational hi given si
            hi_q_mean, hi_q_logvar = self.q_hi_given_x_si.apply( \
                    T.horizontal_stack(grad_ll, sim1_obs), \
                    do_samples=False)
            hi_q = (T.exp(0.5 * hi_q_logvar) * hi_zmuv) + hi_q_mean
            hi_p = (T.exp(0.5 * hi_p_logvar) * hi_zmuv) + hi_p_mean

            # make hi samples that can be switched between hi_p and hi_q
            hi = ( ((self.train_switch[0] * hi_q) + \
                    ((1.0 - self.train_switch[0]) * hi_p)) )

            # p_sip1_given_si_hi is conditioned on si and hi.
            ig_vals, fg_vals, in_vals = self.p_sip1_given_si_hi.apply(hi)

            # get the transformed values (for an LSTM style update)
            i_gate = 1.0 * T.nnet.sigmoid(ig_vals + 2.0)
            f_gate = 1.0 * T.nnet.sigmoid(fg_vals + 2.0)
            # perform an LSTM-like update of the state sim1 -> si
            si = (in_vals * i_gate) + (sim1 * f_gate)

            # compute generator NLL for this step
            nlli = self.log_prob_func(self.x_out, self.obs_transform(si))
            # compute relevant KLds for this step
            kldi_q2p = gaussian_kld(hi_q_mean, hi_q_logvar, \
                                    hi_p_mean, hi_p_logvar)
            kldi_p2q = gaussian_kld(hi_p_mean, hi_p_logvar, \
                                    hi_q_mean, hi_q_logvar)
            return si, nlli, kldi_q2p, kldi_p2q
Example #23
    def __init__(self, rng=None, \
            x_in=None, x_out=None, \
            p_h_given_z=None, \
            p_x_given_h=None, \
            q_z_given_x=None, \
            q_h_given_z_x=None, \
            x_dim=None, \
            z_dim=None, \
            h_dim=None, \
            params=None, \
            shared_param_dicts=None):
        # setup a rng for this GIPair
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_type = self.params['x_type']
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        if 'obs_transform' in self.params:
            assert((self.params['obs_transform'] == 'sigmoid') or \
                    (self.params['obs_transform'] == 'none'))
            if self.params['obs_transform'] == 'sigmoid':
                self.obs_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.obs_transform = lambda x: x
        else:
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        if self.x_type == 'bernoulli':
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        self.shared_param_dicts = shared_param_dicts

        # record the dimensions of various spaces relevant to this model
        self.x_dim = x_dim
        self.z_dim = z_dim
        self.h_dim = h_dim

        # grab handles to the relevant InfNets
        self.q_z_given_x = q_z_given_x
        self.q_h_given_z_x = q_h_given_z_x
        self.p_h_given_z = p_h_given_z
        self.p_x_given_h = p_x_given_h

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this MultiStageModel
        self.x_in = x_in
        self.x_out = x_out

        # setup switching variable for changing between sampling/training
        zero_ary = to_fX( np.zeros((1,)) )
        self.train_switch = theano.shared(value=zero_ary, name='tsm_train_switch')
        self.set_train_switch(1.0)

        if self.shared_param_dicts is None:
            # initialize "optimizable" parameters specific to this MSM
            init_vec = to_fX( np.zeros((1,self.z_dim)) )
            self.p_z_mean = theano.shared(value=init_vec, name='tsm_p_z_mean')
            self.p_z_logvar = theano.shared(value=init_vec, name='tsm_p_z_logvar')
            self.obs_logvar = theano.shared(value=zero_ary, name='tsm_obs_logvar')
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)
            self.shared_param_dicts = {}
            self.shared_param_dicts['p_z_mean'] = self.p_z_mean
            self.shared_param_dicts['p_z_logvar'] = self.p_z_logvar
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            self.p_z_mean = self.shared_param_dicts['p_z_mean']
            self.p_z_logvar = self.shared_param_dicts['p_z_logvar']
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)

        ##############################################
        # Setup the TwoStageModel's main computation. #
        ##############################################
        print("Building TSM...")
        # samples of "hidden" latent state (from both p and q)
        z_q_mean, z_q_logvar, z_q = \
                self.q_z_given_x.apply(self.x_in, do_samples=True)
        z_p_mean = self.p_z_mean.repeat(z_q.shape[0], axis=0)
        z_p_logvar = self.p_z_logvar.repeat(z_q.shape[0], axis=0)
        zmuv = self.rng.normal(size=z_q.shape, avg=0.0, std=1.0, \
                               dtype=theano.config.floatX)
        z_p = (T.exp(0.5*z_p_logvar) * zmuv) + z_p_mean
        self.z = (self.train_switch[0] * z_q) + \
                 ((1.0 - self.train_switch[0]) * z_p)
        # compute relevant KLds for this step
        self.kld_z_q2p = gaussian_kld(z_q_mean, z_q_logvar, \
                                      z_p_mean, z_p_logvar)
        self.kld_z_p2q = gaussian_kld(z_p_mean, z_p_logvar, \
                                      z_q_mean, z_q_logvar)
        # samples of "hidden" latent state (from both p and q)
        h_p_mean, h_p_logvar, h_p = self.p_h_given_z.apply(self.z)
        h_q_mean, h_q_logvar, h_q = self.q_h_given_z_x.apply( \
                T.horizontal_stack(h_p_mean, h_p_logvar, self.x_out))
        self.h = (self.train_switch[0] * h_q) + \
                 ((1.0 - self.train_switch[0]) * h_p)
        # compute relevant KLds for this step
        self.kld_h_q2p = gaussian_kld(h_q_mean, h_q_logvar, \
                                      h_p_mean, h_p_logvar)
        self.kld_h_p2q = gaussian_kld(h_p_mean, h_p_logvar, \
                                      h_q_mean, h_q_logvar)

        # p_x_given_h generates an observation x conditioned on the "hidden"
        # latent variables h.
        self.x_gen, _ = self.p_x_given_h.apply(self.h, do_samples=False)

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX( np.zeros((1,)) )
        self.lr = theano.shared(value=zero_ary, name='tsm_lr')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='tsm_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='tsm_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='tsm_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_q2p = theano.shared(value=zero_ary, name='tsm_lam_kld_q2p')
        self.lam_kld_p2q = theano.shared(value=zero_ary, name='tsm_lam_kld_p2q')
        self.set_lam_kld(lam_kld_q2p=1.0, lam_kld_p2q=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='tsm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # get optimizable parameters belonging to the TwoStageModel
        self_params = [self.obs_logvar] #+ [self.p_z_mean, self.p_z_logvar]
        # get optimizable parameters belonging to the underlying networks
        child_params = []
        child_params.extend(self.q_z_given_x.mlp_params)
        child_params.extend(self.q_h_given_z_x.mlp_params)
        child_params.extend(self.p_h_given_z.mlp_params)
        child_params.extend(self.p_x_given_h.mlp_params)
        # make a joint list of all optimizable parameters
        self.joint_params = self_params + child_params

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_z = (self.lam_kld_q2p[0] * self.kld_z_q2p) + \
                     (self.lam_kld_p2q[0] * self.kld_z_p2q)
        self.kld_h = (self.lam_kld_q2p[0] * self.kld_h_q2p) + \
                     (self.lam_kld_p2q[0] * self.kld_h_p2q)
        self.kld_costs = T.sum(self.kld_z, axis=1) + \
                         T.sum(self.kld_h, axis=1)
        # compute "mean" (rather than per-input) costs
        self.kld_cost = T.mean(self.kld_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        self.nll_costs = self._construct_nll_costs(self.x_out)
        self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-INPUT COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_costs

        # get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # construct the updates for the generator and inferencer networks
        all_updates = get_adam_updates(params=self.joint_params, \
                grads=self.joint_grads, alpha=self.lr, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-4, max_grad_norm=5.0)
        self.joint_updates = OrderedDict()
        for k in all_updates:
            self.joint_updates[k] = all_updates[k]

        # Construct a function for jointly training the generator/inferencer
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling open-loop model sampler...")
        self.sample_from_prior = self._construct_sample_from_prior()
        return
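Throughout these models, train_switch interpolates between guide (q) and primary/prior (p) samples, so the same graph serves both training (switch = 1) and open-loop sampling (switch = 0). A minimal sketch of that switching mechanism with placeholder sample tensors:

import numpy as np
import theano
import theano.tensor as T

train_switch = theano.shared(np.ones((1,), dtype=theano.config.floatX))
z_q = T.matrix('z_q')    # guide / posterior samples
z_p = T.matrix('z_p')    # primary / prior samples
z = (train_switch[0] * z_q) + ((1.0 - train_switch[0]) * z_p)
f = theano.function([z_q, z_p], z)

# flip to open-loop sampling by zeroing the shared switch
train_switch.set_value(np.zeros((1,), dtype=theano.config.floatX))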
Example #24
def learnAndPredict(Ti, C, TOList):
 
    rng = np.random.RandomState(SEED)
    learning_rate = learning_rate0
    print np.mean(Ti[1000,:])
    aminW = np.amin(Ti[:1000,:])
    amaxW = np.amax(Ti[:1000,:]) 
    Ti[:1000,:] = (Ti[:1000,:] - aminW) / (amaxW - aminW)
    astdW = np.std(Ti[:1000,:])
    ameanW = np.mean(Ti[:1000,:])
    Ti[:1000,:] = (Ti[:1000,:] - ameanW) / astdW
    aminacW = np.amin(Ti[1000,:])
    amaxacW = np.amax(Ti[1000,:])
    print aminW, amaxW, aminacW, amaxacW
    Ti[1000,:] =  (Ti[1000,:] - aminacW) / (amaxacW - aminacW)
    astdacW = np.std(Ti[1000,:])
    ameanacW = np.mean(Ti[1000,:])
    Ti[1000,:] =  (Ti[1000,:] - ameanacW) / astdacW
    
    ile__ = len(TOList)
    ileList = np.zeros(ile__)
    for titer in range(len(TOList)):
        print np.mean(TOList[titer][1000,:])
        TOList[titer][:1000,:] = (TOList[titer][:1000,:] - aminW)/(amaxW - aminW)
        TOList[titer][:1000,:] = (TOList[titer][:1000,:] - ameanW)/astdW
        TOList[titer][1000,:] =  (TOList[titer][1000,:] - aminacW)/(amaxacW - aminacW)
        TOList[titer][1000,:] =  (TOList[titer][1000,:] - ameanacW)/astdacW
        _, ileList[titer] = TOList[titer].shape
        
    _, ile = Ti.shape
    N = NN
  
    data = []; yyy = []; need = 1; BYL = {}; j= 0; dwa = 0; ONES = []; ZEROS = []
    for i in range(NN):
        for j in range(NN):
            if i!= j:
                if C[i][j]==1:
                    ONES.append((i,j))
                else:
                    ZEROS.append((i,j))
    Nones = len(ONES)
    rng.shuffle(ONES)
    Nzeros = len(ZEROS)
    print Nones
    print Nzeros
    Needed = NUM_TRAIN/2
    onesPerPair = Needed / Nones + 1
    onesIter = 0
    jj = 0
    while jj < NUM_TRAIN:
        if jj%300000 == 0:
            print jj/300000,
        need = 1 - need
        if need == 1:
            pairNo = onesIter % Nones
            ppp = onesIter / Nones
            s,t = ONES[pairNo]
            shift = rng.randint(0, ile - L)
            onesIter += 1
        if need == 0:
            zer = rng.randint(Nzeros)
            s,t = ZEROS[zer]
            del ZEROS[zer]
            Nzeros -= 1
            shift = rng.randint(0, ile - L)
        x = np.hstack(( Ti[s][shift:shift+L], Ti[t][shift:shift+L], Ti[1000][shift:shift+L]))
        y = C[s][t]
        data.append(x); yyy.append(y)
        jj+=1

    data = np.array(data, dtype=theano.config.floatX)  
    is_train = np.array(  ([0]*96 + [1,1,2,2]) * (NUM_TRAIN / 100))
    yyy = np.array(yyy)
    
    train_set_x0, train_set_y0 = np.array(data[is_train==0]), yyy[is_train==0]
    test_set_x,   test_set_y = np.array(data[is_train==1]), yyy[is_train==1]
    valid_set_x, valid_set_y = np.array(data[is_train==2]), yyy[is_train==2]
    n_train_batches = len(train_set_y0) / batch_size
    n_valid_batches = len(valid_set_y)  / batch_size
    n_test_batches  = len(test_set_y)  / batch_size  
    epoch = T.scalar() 
    index = T.lscalar() 
    x = T.matrix('x')   
    inone2 = T.matrix('inone2') 
    y = T.ivector('y') 
    print '... building the model'
#-------- my layers -------------------
    
    #---------------------
    layer0_input = x.reshape((batch_size, 1, 3, L))
    Cx = 5
    layer0 = ConvolutionalLayer(rng, input=layer0_input,
            image_shape=(batch_size, 1, 3, L),
            filter_shape=(nkerns[0], 1, 2, Cx), poolsize=(1, 1), fac = 0)
    ONE = (3 - 2 + 1) / 1
    L2 = (L - Cx + 1) / 1
    #---------------------
    Cx2 = 5
    layer1 = ConvolutionalLayer(rng, input=layer0.output,
            image_shape=(batch_size, nkerns[0], ONE, L2),
            filter_shape=(nkerns[1], nkerns[0], 2, Cx2), poolsize=(1, 1), activation=ReLU, fac = 0)
    ONE = (ONE - 2 + 1) /1
    L3 = (L2 - Cx2 + 1) /1
    #---------------------
    Cx3 = 1
    layer1b = ConvolutionalLayer(rng, input=layer1.output,
            image_shape=(batch_size, nkerns[1], ONE, L3),
            filter_shape=(nkerns[2], nkerns[1], 1, Cx3), poolsize=(1, POOL), activation=ReLU, fac = 0)
    ONE = (ONE - 1 + 1) /1
    L4 = (L3 - Cx3 + 1) /POOL
    
    REGx = 100
    #---------------------    
    layer2_input = layer1b.output.flatten(2) 
    print layer2_input.shape
    use_b = False
    layer2 =         HiddenLayer(rng, input=layer2_input, n_in=nkerns[2]*L4 , n_out=REGx, activation=T.tanh,
                                 use_bias = use_b)
    layer3 =  LogisticRegression(input=layer2.output, n_in=REGx, n_out=2)
 
    
    cost = layer3.negative_log_likelihood(y)
    out_x2 = theano.shared(np.asarray(np.zeros((N,L)), dtype=theano.config.floatX))
    inone2 = theano.shared(np.asarray(np.zeros((1,L)), dtype=theano.config.floatX))
    inone3 = theano.shared(np.asarray(np.zeros((1,L)), dtype=theano.config.floatX))
    inone4 = theano.shared(np.asarray(np.zeros((1,L)), dtype=theano.config.floatX))
    test_set_x = theano.shared(np.asarray(test_set_x, dtype=theano.config.floatX))
    train_set_x = theano.shared(np.asarray(train_set_x0, dtype=theano.config.floatX))
    train_set_y = T.cast(theano.shared(np.asarray(train_set_y0, dtype=theano.config.floatX)), 'int32')
    test_set_y = T.cast(theano.shared(np.asarray(test_set_y, dtype=theano.config.floatX)), 'int32')
    valid_set_y =  T.cast(theano.shared(np.asarray(valid_set_y, dtype=theano.config.floatX)), 'int32')
    valid_set_x = theano.shared(np.asarray(valid_set_x, dtype=theano.config.floatX))   
    
    test_model = theano.function([index], layer3.errors(y),
             givens={
                x: test_set_x[index * batch_size: (index + 1) * batch_size],
                y: test_set_y[index * batch_size: (index + 1) * batch_size]})

    validate_model = theano.function([index], layer3.errors(y),
            givens={
                x: valid_set_x[index * batch_size: (index + 1) * batch_size],
                y: valid_set_y[index * batch_size: (index + 1) * batch_size]})

       
    mom_start = 0.5; mom_end = 0.98;  mom_epoch_interval = n_epochs * 1.0
    #### @@@@@@@@@@@
    class_params0  =  [layer3, layer2, layer1, layer1b, layer0]  
    class_params = [ param for layer in class_params0 for param in layer.params ]

    gparams = []
    for param in class_params:
        gparam = T.grad(cost, param)
        gparams.append(gparam)
    gparams_mom = []
    for param in class_params:
        gparam_mom = theano.shared(np.zeros(param.get_value(borrow=True).shape,
            dtype=theano.config.floatX))
        gparams_mom.append(gparam_mom)
    mom = ifelse(epoch < mom_epoch_interval,
            mom_start*(1.0 - epoch/mom_epoch_interval) + mom_end*(epoch/mom_epoch_interval),
            mom_end)
    updates = OrderedDict()
    for gparam_mom, gparam in zip(gparams_mom, gparams):
        updates[gparam_mom] = mom * gparam_mom - (1. - mom) * learning_rate * gparam
    for param, gparam_mom in zip(class_params, gparams_mom):
        stepped_param = param + updates[gparam_mom]
        squared_filter_length_limit = 15.0
        if param.get_value(borrow=True).ndim == 2:
            col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
            desired_norms = T.clip(col_norms, 0, T.sqrt(squared_filter_length_limit))
            scale = desired_norms / (1e-7 + col_norms)
            updates[param] = stepped_param * scale
        else:
            updates[param] = stepped_param

    output = cost
    train_model = theano.function(inputs=[epoch, index], outputs=output,
            updates=updates,
            givens={
                x: train_set_x[index * batch_size:(index + 1) * batch_size],
                y: train_set_y[index * batch_size:(index + 1) * batch_size]})
    
    keep = theano.function([index], layer3.errorsFull(y),
            givens={
                x: train_set_x[index * batch_size:(index + 1) * batch_size],
                y: train_set_y[index * batch_size:(index + 1) * batch_size]}, on_unused_input='warn')

    timer = time.clock()
    print "finished reading", (timer - start_time0) /60. , "minutes "
             
    # TRAIN MODEL # 
    print '... training'
    validation_frequency = n_train_batches; best_params = None; best_validation_loss = np.inf
    best_iter = 0; test_score = 0.;  epochc = 0;
    
    while (epochc < n_epochs):
        epochc = epochc + 1            
        learning_rate = learning_rate0 * (1.2 - ((1.0 * epochc)/n_epochs))
        for minibatch_index in xrange(n_train_batches):      
            iter = (epochc - 1) * n_train_batches + minibatch_index
            cost_ij = train_model(epochc, minibatch_index)  
            if (iter + 1) % validation_frequency == 0:
                validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
                this_validation_loss = np.mean(validation_losses)
                print(' %i) err %.2f ' %  (epochc, this_validation_loss/10)), L, nkerns, REGx, "|", Cx, Cx2, Cx3, batch_size
                if this_validation_loss < best_validation_loss or epochc % 30 == 0:
                    best_validation_loss = this_validation_loss
                    best_iter = iter
                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
                    test_score = np.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') % (epochc, minibatch_index + 1, n_train_batches, test_score/10))
    ############        
    timel = time.clock()
    print "finished learning", (timel - timer) /60. , "minutes "
    ppm = theano.function([index], layer3.pred_proba_mine(),
        givens={
            x: T.horizontal_stack(T.tile(inone2, (batch_size ,1)), 
               out_x2[index * batch_size: (index + 1) * batch_size], T.tile(inone3, (batch_size ,1))),
            y: train_set_y[0 * (batch_size): (0 + 1) * (batch_size)]
            }, on_unused_input='warn')

    NONZERO = (N*N-N)
    gc.collect()
    RESList = [np.zeros((N,N)) for it in range(ile__)]
    for __net in range(ile__):
        TO = TOList[__net]
        ileO = ileList[__net]
        RES  = RESList[__net]
        shift = 0.1 
        DELTAshift = (ileO-L) / (Q-1)
        print "DELTAshift:", DELTAshift
        for q in range (Q):
            dataO = [];  print (q+1),"/", Q , "  ",
            out_x2.set_value(np.asarray(np.array(TO[:,shift:shift+L]), dtype=theano.config.floatX)) 
            PARTIAL = np.zeros((N,N))
            inone3.set_value(np.asarray(np.array(TO[1000][shift:shift+L]).reshape(1,L), dtype=theano.config.floatX))
            for i in range(N):
                inone2.set_value(np.asarray(np.array(TO[i][shift:shift+L]).reshape(1,L), dtype=theano.config.floatX))
                p = [ppm(ii) for ii in xrange( N / batch_size)]
                for pos in range(N):
                    if pos != i:
                        PARTIAL[i][pos] += p[pos / batch_size][pos % batch_size][1]
            for i in range(N):
                for j in range(N):
                    RES[i][j] += PARTIAL[i][j]
            shift += DELTAshift
        print "Finished", __net
        RESList[__net] = RES/np.max(RES)            
        gc.collect()
        
    end_time = time.clock()
    print "finished predicting", (end_time - timel) /60. , "minutes ", str(nkerns), "using SEED = ", SEED
    print('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time0) / 60.))
    return RESList
Example #25
def train_conv_net(
        datasets,
        rel_tr,
        rel_te,
        rel_de,
        hlen,
        U,  # yluo: embedding matrix
        fnres,
        img_w=300,
        filter_hs=[3, 4, 5],
        hidden_units=[100, 2],  # hidden_units[1] is number of classes
        dropout_rate=[0.5],
        shuffle_batch=True,
        n_epochs=25,
        batch_size=50,  # yluo: how many sentences to extract to compute gradient
        lr_decay=0.95,
        conv_non_linear="relu",
        activations=[Iden],
        sqr_norm_lim=9,
        non_static=True,
        relname=None):
    """
    Train a simple conv net
    img_h = sentence length (padded where necessary)
    img_w = word vector length (300 for word2vec)
    filter_hs = filter window sizes    
    hidden_units = [x,y] x is the number of feature maps (per filter window), and y is the penultimate layer
    sqr_norm_lim = s^2 in the paper
    lr_decay = adadelta decay parameter
    """
    hrel_tr = make_rel_hash(rel_tr)
    hrel_te = make_rel_hash(rel_te)
    hrel_de = make_rel_hash(rel_de)
    rng = np.random.RandomState()
    img_h_tot = len(
        datasets[0][0]
    ) - 2  # SS: exclude 2 dimensions: (iid, y). compa1 and compa2 are included
    pad = max(filter_hs) - 1
    filter_w = img_w
    # yluo: what does different feature maps correspond to?
    feature_maps = hidden_units[0]
    filter_shapes = []

    for filter_h in filter_hs:
        # yluo: what does 1 in the filter shape mean?
        # (number of filters, num input feature maps, filter height, filter width)
        # how to interpet different filters?
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))

        parameters = [("image shape", img_h_tot, img_w),
                      ("filter shape", filter_shapes),
                      ("hidden_units", hidden_units),
                      ("dropout", dropout_rate), ("batch_size", batch_size),
                      ("non_static", non_static), ("learn_decay", lr_decay),
                      ("conv_non_linear", conv_non_linear),
                      ("non_static", non_static),
                      ("sqr_norm_lim", sqr_norm_lim),
                      ("shuffle_batch", shuffle_batch)]
    print parameters

    #define model architecture
    index = T.lscalar()
    # x = T.matrix('x')
    c1 = T.matrix('c1')
    c2 = T.matrix('c2')
    prec = T.matrix('prec')
    mid = T.matrix('mid')
    succ = T.matrix('succ')
    y = T.ivector('y')
    iid = T.vector('iid')
    compa1 = T.vector('compa1')  # compatibility1 of c1/c2
    compa2 = T.vector('compa2')  # compatibility2 of c1/c2
    semclass1 = T.vector('semclass1')  # semclass of a "predicate"
    semclass2 = T.vector('semclass2')  # semclass of a "predicate"
    semclass3 = T.vector('semclass3')  # semclass of a "predicate"
    semclass4 = T.vector('semclass4')  # semclass of a "predicate"
    semclass5 = T.vector('semclass5')  # semclass of a "predicate"
    #pr = theano.printing.Print("COMPA")(compa)
    Words = theano.shared(value=U, name="Words")
    zero_vec_tensor = T.vector()
    zero_vec = np.zeros(img_w)
    set_zero = theano.function([zero_vec_tensor],
                               updates=[
                                   (Words,
                                    T.set_subtensor(Words[0, :],
                                                    zero_vec_tensor))
                               ],
                               allow_input_downcast=True)
    c1_input = Words[T.cast(c1.flatten(), dtype="int32")].reshape(
        (c1.shape[0], 1, c1.shape[1], Words.shape[1]))  # reshape to 3d array
    # Words[T.cast(c1.flatten(),dtype="int32")] >>> len c1 flattened*emb_dim
    # c1_input >>> n_insts * 1 * n_ws_per_inst * emb_dim
    c2_input = Words[T.cast(c2.flatten(), dtype="int32")].reshape(
        (c2.shape[0], 1, c2.shape[1], Words.shape[1]))  # reshape to 3d array
    prec_input = Words[T.cast(prec.flatten(), dtype="int32")].reshape(
        (prec.shape[0], 1, prec.shape[1],
         Words.shape[1]))  # reshape to 3d array
    mid_input = Words[T.cast(mid.flatten(), dtype="int32")].reshape(
        (mid.shape[0], 1, mid.shape[1], Words.shape[1]))  # reshape to 3d array
    succ_input = Words[T.cast(succ.flatten(), dtype="int32")].reshape(
        (succ.shape[0], 1, succ.shape[1],
         Words.shape[1]))  # reshape to 3d array
    layer0_input = {
        'c1': c1_input,
        'c2': c2_input,
        'prec': prec_input,
        'mid': mid_input,
        'succ': succ_input
    }
    conv_layers = []
    layer1_inputs = []

    for i in xrange(len(filter_hs)):
        for seg in hlen.keys():  # used hlen as a global var, to fix
            filter_shape = filter_shapes[i]
            img_h = hlen[seg] + 2 * pad
            pool_size = (img_h - filter_h + 1, img_w - filter_w + 1)
            conv_layer = LeNetConvPoolLayer(rng,
                                            input=layer0_input[seg],
                                            image_shape=(batch_size, 1, img_h,
                                                         img_w),
                                            filter_shape=filter_shape,
                                            poolsize=pool_size,
                                            non_linear=conv_non_linear)
            layer1_input = conv_layer.output.flatten(
                2)  # yluo: 2 dimensions >>>
            conv_layers.append(conv_layer)  # yluo: layer 0
            layer1_inputs.append(layer1_input)  # yluo: 3 dimensions
    layer1_input = T.concatenate(
        layer1_inputs, 1)  # yluo: 2 dimensions >>> n_insts * concat_dim?
    layer1_input = T.horizontal_stack(
        layer1_input, compa1.reshape((compa1.shape[0], 1)),
        compa2.reshape((compa2.shape[0], 1)),
        semclass1.reshape((semclass1.shape[0], 1)),
        semclass2.reshape((semclass2.shape[0], 1)),
        semclass3.reshape((semclass3.shape[0], 1)),
        semclass4.reshape((semclass4.shape[0], 1)),
        semclass5.reshape((semclass5.shape[0], 1)))
    hidden_units[0] = feature_maps * len(filter_hs) * len(
        hlen
    ) + 2 + 5  # compa: plus 2 (we have two compa feats); semclass: plus 5
    classifier = MLPDropout(rng,
                            input=layer1_input,
                            layer_sizes=hidden_units,
                            activations=activations,
                            dropout_rates=dropout_rate)

    #define parameters of the model and update functions using adadelta
    params = classifier.params
    for conv_layer in conv_layers:
        params += conv_layer.params
    if non_static:
        #if word vectors are allowed to change, add them as model parameters
        params += [Words]
    cost = classifier.negative_log_likelihood(y)
    dropout_cost = classifier.dropout_negative_log_likelihood(y)
    grad_updates = sgd_updates_adadelta(params, dropout_cost, lr_decay, 1e-6,
                                        sqr_norm_lim)

    #shuffle the dataset and assign it to mini-batches. if the dataset size is not
    #a multiple of the mini-batch size, replicate extra data (chosen at random)
    #for stochastic gradient descent

    tr_size = datasets[0].shape[0]
    de_size = datasets[2].shape[0]
    hi_seg = datasets[3]
    print(hi_seg)
    c1s, c1e = hi_seg['c1']
    c2s, c2e = hi_seg['c2']
    mids, mide = hi_seg['mid']
    precs, prece = hi_seg['prec']
    succs, succe = hi_seg['succ']
    yi = hi_seg['y']
    idi = hi_seg['iid']
    compa1i = hi_seg['compa1']
    compa2i = hi_seg['compa2']
    semclass1i = hi_seg['semclass1']
    semclass2i = hi_seg['semclass2']
    semclass3i = hi_seg['semclass3']
    semclass4i = hi_seg['semclass4']
    semclass5i = hi_seg['semclass5']

    if tr_size % batch_size > 0:
        extra_data_num = batch_size - tr_size % batch_size
        train_set = rng.permutation(datasets[0])
        extra_data = train_set[:extra_data_num]
        new_data = np.append(datasets[0], extra_data, axis=0)
    else:
        new_data = datasets[0]
    new_data = rng.permutation(new_data)
    n_batches = new_data.shape[0] / batch_size
    #n_train_batches = int(np.round(n_batches*0.9))
    n_train_batches = n_batches

    if de_size % batch_size > 0:
        extra_data_num = batch_size - de_size % batch_size
        dev_set = rng.permutation(datasets[2])
        extra_data = dev_set[:extra_data_num]
        new_data_de = np.append(datasets[2], extra_data, axis=0)
    else:
        new_data_de = datasets[2]
    new_data_de = rng.permutation(new_data_de)
    n_dev_batches = new_data_de.shape[0] / batch_size

    #extract the test-set input columns; the shuffled train/dev splits are built below
    c1_te = datasets[1][:, c1s:c1e]
    c2_te = datasets[1][:, c2s:c2e]
    prec_te = datasets[1][:, precs:prece]
    mid_te = datasets[1][:, mids:mide]
    succ_te = datasets[1][:, succs:succe]
    test_set = datasets[1]
    y_te = np.asarray(test_set[:, yi], "int32")
    compa1_te = np.asarray(test_set[:, compa1i], "float32")
    compa2_te = np.asarray(test_set[:, compa2i], "float32")
    semclass1_te = np.asarray(test_set[:, semclass1i], "float32")
    semclass2_te = np.asarray(test_set[:, semclass2i], "float32")
    semclass3_te = np.asarray(test_set[:, semclass3i], "float32")
    semclass4_te = np.asarray(test_set[:, semclass4i], "float32")
    semclass5_te = np.asarray(test_set[:, semclass5i], "float32")

    train_set = new_data[:n_train_batches * batch_size, :]
    dev_set = new_data_de[:n_dev_batches * batch_size, :]
    x_tr, y_tr = shared_dataset((train_set[:, :img_h_tot], train_set[:, -1]))
    x_de, y_de = shared_dataset((dev_set[:, :img_h_tot], dev_set[:, -1]))
    iid_tr = train_set[:, idi].flatten()
    iid_de = dev_set[:, idi].flatten()
    iid_te = test_set[:, idi].flatten()
    print('len iid_de %d' % (len(iid_de)))

    #compile theano functions to get train/val/test errors
    dev_model = theano.function(
        [index],
        classifier.preds(y),
        givens={
            c1:
            x_de[index * batch_size:(index + 1) * batch_size, c1s:c1e],
            c2:
            x_de[index * batch_size:(index + 1) * batch_size, c2s:c2e],
            prec:
            x_de[index * batch_size:(index + 1) * batch_size, precs:prece],
            mid:
            x_de[index * batch_size:(index + 1) * batch_size, mids:mide],
            succ:
            x_de[index * batch_size:(index + 1) * batch_size, succs:succe],
            compa1:
            x_de[index * batch_size:(index + 1) * batch_size, compa1i],
            compa2:
            x_de[index * batch_size:(index + 1) * batch_size, compa2i],
            semclass1:
            x_de[index * batch_size:(index + 1) * batch_size, semclass1i],
            semclass2:
            x_de[index * batch_size:(index + 1) * batch_size, semclass2i],
            semclass3:
            x_de[index * batch_size:(index + 1) * batch_size, semclass3i],
            semclass4:
            x_de[index * batch_size:(index + 1) * batch_size, semclass4i],
            semclass5:
            x_de[index * batch_size:(index + 1) * batch_size, semclass5i],
            y:
            y_de[index * batch_size:(index + 1) * batch_size],
        },
        allow_input_downcast=True,
        on_unused_input='warn')
    # test_model computes per-batch classification errors on the training set
    test_model = theano.function(
        [index],
        classifier.errors(y),
        givens={
            c1:
            x_tr[index * batch_size:(index + 1) * batch_size, c1s:c1e],
            c2:
            x_tr[index * batch_size:(index + 1) * batch_size, c2s:c2e],
            prec:
            x_tr[index * batch_size:(index + 1) * batch_size, precs:prece],
            mid:
            x_tr[index * batch_size:(index + 1) * batch_size, mids:mide],
            succ:
            x_tr[index * batch_size:(index + 1) * batch_size, succs:succe],
            compa1:
            x_tr[index * batch_size:(index + 1) * batch_size, compa1i],
            compa2:
            x_tr[index * batch_size:(index + 1) * batch_size, compa2i],
            semclass1:
            x_tr[index * batch_size:(index + 1) * batch_size, semclass1i],
            semclass2:
            x_tr[index * batch_size:(index + 1) * batch_size, semclass2i],
            semclass3:
            x_tr[index * batch_size:(index + 1) * batch_size, semclass3i],
            semclass4:
            x_tr[index * batch_size:(index + 1) * batch_size, semclass4i],
            semclass5:
            x_tr[index * batch_size:(index + 1) * batch_size, semclass5i],
            y:
            y_tr[index * batch_size:(index + 1) * batch_size]
        },
        allow_input_downcast=True)
    train_model = theano.function(
        [index],
        cost,
        updates=grad_updates,
        givens={
            c1:
            x_tr[index * batch_size:(index + 1) * batch_size, c1s:c1e],
            c2:
            x_tr[index * batch_size:(index + 1) * batch_size, c2s:c2e],
            prec:
            x_tr[index * batch_size:(index + 1) * batch_size, precs:prece],
            mid:
            x_tr[index * batch_size:(index + 1) * batch_size, mids:mide],
            succ:
            x_tr[index * batch_size:(index + 1) * batch_size, succs:succe],
            compa1:
            x_tr[index * batch_size:(index + 1) * batch_size, compa1i],
            compa2:
            x_tr[index * batch_size:(index + 1) * batch_size, compa2i],
            semclass1:
            x_tr[index * batch_size:(index + 1) * batch_size, semclass1i],
            semclass2:
            x_tr[index * batch_size:(index + 1) * batch_size, semclass2i],
            semclass3:
            x_tr[index * batch_size:(index + 1) * batch_size, semclass3i],
            semclass4:
            x_tr[index * batch_size:(index + 1) * batch_size, semclass4i],
            semclass5:
            x_tr[index * batch_size:(index + 1) * batch_size, semclass5i],
            y:
            y_tr[index * batch_size:(index + 1) * batch_size]
        },
        allow_input_downcast=True)
    test_pred_layers = []
    test_size = len(y_te)
    c1_te_input = Words[T.cast(c1.flatten(), dtype="int32")].reshape(
        (c1_te.shape[0], 1, c1_te.shape[1], Words.shape[1]))
    c2_te_input = Words[T.cast(c2.flatten(), dtype="int32")].reshape(
        (c2_te.shape[0], 1, c2_te.shape[1], Words.shape[1]))
    prec_te_input = Words[T.cast(prec.flatten(), dtype="int32")].reshape(
        (prec_te.shape[0], 1, prec_te.shape[1], Words.shape[1]))
    mid_te_input = Words[T.cast(mid.flatten(), dtype="int32")].reshape(
        (mid_te.shape[0], 1, mid_te.shape[1], Words.shape[1]))
    succ_te_input = Words[T.cast(succ.flatten(), dtype="int32")].reshape(
        (succ_te.shape[0], 1, succ_te.shape[1], Words.shape[1]))
    test_layer0_input = {
        'c1': c1_te_input,
        'c2': c2_te_input,
        'prec': prec_te_input,
        'mid': mid_te_input,
        'succ': succ_te_input
    }

    cl_id = 0  # conv layer id
    for i in xrange(len(filter_hs)):
        for seg in hlen.keys():
            conv_layer = conv_layers[cl_id]
            test_layer0_output = conv_layer.predict(
                test_layer0_input[seg], test_size
            )  # it doesn't seem to matter if layer0_input is used here instead
            test_pred_layers.append(test_layer0_output.flatten(2))
            cl_id += 1
    test_layer1_input = T.concatenate(test_pred_layers, 1)
    #test_layer1_input = T.horizontal_stack(test_layer1_input, compa_te.reshape((compa_te.shape[0], 1)))
    test_layer1_input = T.horizontal_stack(
        test_layer1_input, compa1.reshape((compa1.shape[0], 1)),
        compa2.reshape((compa2.shape[0], 1)),
        semclass1.reshape((semclass1.shape[0], 1)),
        semclass2.reshape((semclass2.shape[0], 1)),
        semclass3.reshape((semclass3.shape[0], 1)),
        semclass4.reshape((semclass4.shape[0], 1)),
        semclass5.reshape((semclass5.shape[0], 1)))
    test_y_pred = classifier.predict(test_layer1_input)

    test_error = T.mean(T.neq(test_y_pred, y))
    test_model_all = theano.function([
        c1, c2, prec, mid, succ, compa1, compa2, semclass1, semclass2,
        semclass3, semclass4, semclass5
    ],
                                     test_y_pred,
                                     allow_input_downcast=True)

    #start training over mini-batches
    print('... training')
    epoch = 0
    best_dev_perf = 0
    test_perf = 0
    cost_epoch = 0
    while (epoch < n_epochs):
        start_time = time.time()
        epoch = epoch + 1
        if shuffle_batch:
            for minibatch_index in rng.permutation(range(n_train_batches)):
                cost_epoch = train_model(minibatch_index)
                set_zero(zero_vec)
        else:
            for minibatch_index in xrange(n_train_batches):
                cost_epoch = train_model(minibatch_index)
                set_zero(zero_vec)
        train_losses = [
            np.mean(test_model(i)) for i in xrange(n_train_batches)
        ]
        train_perf = 1 - np.mean(train_losses)
        dev_preds = np.asarray([])
        for i in xrange(n_dev_batches):
            dev_sb_preds = dev_model(i)
            y_sb = y_de[i * batch_size:(i + 1) * batch_size].eval()
            dev_sb_errors = dev_sb_preds != y_sb
            err_ind = [j for j, x in enumerate(dev_sb_errors) if x == 1]
            dev_sb = iid_de[i * batch_size:(i + 1) * batch_size]
            dev_preds = np.append(dev_preds, dev_sb_preds)

        dev_perf = 1 - np.mean(y_de.eval() != dev_preds)
        dev_cm = su.confMat(y_de.eval(), dev_preds, hidden_units[1])

        (dev_pres, dev_recs, dev_f1s, dev_mipre, dev_mirec,
         dev_mif) = su.cmPRF(dev_cm, ncstart=1)
        print(
            'epoch: %i, training time: %.2f secs, train perf: %.2f %%, dev_mipre: %.2f %%, dev_mirec: %.2f %%, dev_mif: %.2f %%'
            % (epoch, time.time() - start_time, train_perf * 100.,
               dev_mipre * 100., dev_mirec * 100., dev_mif * 100.))
        if dev_mif >= best_dev_perf:
            best_dev_perf = dev_mif
            test_pred = test_model_all(c1_te, c2_te, prec_te, mid_te, succ_te,
                                       compa1_te, compa2_te, semclass1_te,
                                       semclass2_te, semclass3_te,
                                       semclass4_te, semclass5_te)
            test_preds = extract_preds(rel_te, test_pred, relname)
            test_errors = test_pred != y_te
            err_ind = [j for j, x in enumerate(test_errors) if x == 1]
            test_cm = su.confMat(y_te, test_pred, hidden_units[1])
            print('\n'.join([
                ''.join(['{:10}'.format(int(item)) for item in row])
                for row in test_cm
            ]))
            (pres, recs, f1s, mipre, mirec, mif) = su.cmPRF(test_cm, ncstart=1)
            mipre_de = dev_mipre
            mirec_de = dev_mirec
            mif_de = dev_mif
            print('mipre %s, mirec %s, mif %s' % (mipre, mirec, mif))
    cPickle.dump([y_te, test_pred], open(fnres, "wb"))
    return (mipre, mirec, mif, mipre_de, mirec_de, mif_de, test_cm, test_preds)
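The feature assembly above relies on T.horizontal_stack to join the flattened convolutional outputs with the scalar compa/semclass features as extra columns. A minimal, self-contained sketch of that pattern (the names conv_feats and compa and the shapes are illustrative, not taken from the code above):

import numpy as np
import theano
import theano.tensor as T

conv_feats = T.matrix('conv_feats')   # (n_insts, concat_dim) from the conv layers
compa = T.vector('compa')             # one scalar feature per instance
stacked = T.horizontal_stack(conv_feats, compa.reshape((compa.shape[0], 1)))
f = theano.function([conv_feats, compa], stacked)

# e.g. 2 instances with 3 conv features each plus 1 handcrafted feature -> (2, 4)
print(f(np.ones((2, 3), dtype=theano.config.floatX),
        np.asarray([0.5, 1.5], dtype=theano.config.floatX)).shape)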
Example #26
    def get_output_for(self, inputs, deterministic=False, **kwargs):

        # extract inputs
        H1, H2 = inputs

        # train set size
        m = H1.shape[0].astype(theano.config.floatX)

        # running average projection matrix update
        if not deterministic:

            # compute batch mean
            mean1 = T.mean(H1, axis=0)
            mean2 = T.mean(H2, axis=0)

            # running average updates of means
            mean1 = (floatX(1.0 - self.alpha) * self.mean1 +
                     self.alpha * mean1)
            running_mean1 = theano.clone(self.mean1, share_inputs=False)
            running_mean1.default_update = mean1
            mean1 += 0 * running_mean1

            mean2 = (floatX(1.0 - self.alpha) * self.mean2 +
                     self.alpha * mean2)
            running_mean2 = theano.clone(self.mean2, share_inputs=False)
            running_mean2.default_update = mean2
            mean2 += 0 * running_mean2

            # hidden representations
            H1bar = H1 - mean1
            H2bar = H2 - mean2

            # transpose to formulas in paper
            H1bar = H1bar.T
            H2bar = H2bar.T

            # cross-covariance
            S12 = (1.0 / (m - 1)) * T.dot(H1bar, H2bar.T)

            # covariance 1
            S11 = (1.0 / (m - 1)) * T.dot(H1bar, H1bar.T)
            S11 = S11 + self.r1 * T.identity_like(S11)

            # covariance 2
            S22 = (1.0 / (m - 1)) * T.dot(H2bar, H2bar.T)
            S22 = S22 + self.r2 * T.identity_like(S22)

            # running average updates of statistics
            S12 = (floatX(1.0 - self.alpha) * self.S12 + self.alpha * S12)
            running_S12 = theano.clone(self.S12, share_inputs=False)
            running_S12.default_update = S12
            S12 += 0 * running_S12

            S11 = (floatX(1.0 - self.alpha) * self.S11 + self.alpha * S11)
            running_S11 = theano.clone(self.S11, share_inputs=False)
            running_S11.default_update = S11
            S11 += 0 * running_S11

            S22 = (floatX(1.0 - self.alpha) * self.S22 + self.alpha * S22)
            running_S22 = theano.clone(self.S22, share_inputs=False)
            running_S22.default_update = S22
            S22 += 0 * running_S22

            S21 = S12.T

            # theano optimized version of paper
            S11c = T.slinalg.cholesky(S11)
            S11ci = T.nlinalg.matrix_inverse(S11c)
            S11_inv = T.nlinalg.matrix_inverse(S11)

            S22c = T.slinalg.cholesky(S22)
            S22ci = T.nlinalg.matrix_inverse(S22c)
            S22_inv = T.nlinalg.matrix_inverse(S22)

            # compute correlation (regularized)
            M1 = S11ci.dot(S12).dot(S22_inv).dot(S21).dot(S11ci.T)
            M2 = S22ci.dot(S21).dot(S11_inv).dot(S12).dot(S22ci.T)

            M1 += self.rT * T.identity_like(M1)
            M2 += self.rT * T.identity_like(M2)

            # compute eigen decomposition
            E1, E = T.nlinalg.eigh(M1)
            _, F = T.nlinalg.eigh(M2)

            # maximize correlation
            E1 = T.clip(E1, 1e-7, 1.0)
            E1 = T.sqrt(E1)
            self.loss = -T.mean(E1) * self.wl
            self.corr = E1

            # compute projection matrices
            U = S11ci.T.dot(E)
            V_prime = S22ci.T.dot(F)

            # project data
            lv1_cca = H1bar.T.dot(U)
            lv2_cca = H2bar.T.dot(V_prime)

            # workaround to flip axis of projection vector
            def compute_corr(d, lv1_cca, lv2_cca):
                CX = lv1_cca[:, d].T.dot(lv2_cca[:, d])
                C1 = lv1_cca[:, d].T.dot(lv1_cca[:, d])
                C2 = lv2_cca[:, d].T.dot(lv2_cca[:, d])
                c = CX / (T.sqrt(C1) * T.sqrt(C2))
                return T.sgn(c)

            dims = T.arange(0, lv1_cca.shape[1])
            corrs, _ = theano.scan(fn=compute_corr,
                                   outputs_info=None,
                                   sequences=[dims],
                                   non_sequences=[lv1_cca, lv2_cca])

            # fix projection matrix and reproject data
            V = V_prime * corrs

            # some casting is required here
            U = T.cast(U, 'float32')
            V = T.cast(V, 'float32')

            # update of projection matrices
            running_U = theano.clone(self.U, share_inputs=False)
            running_U.default_update = U
            U += floatX(0) * running_U

            running_V = theano.clone(self.V, share_inputs=False)
            running_V.default_update = V
            V += floatX(0) * running_V

        # use projections of layer
        else:

            # hidden representations
            H1bar = H1 - self.mean1
            H2bar = H2 - self.mean2

            # transpose to formulas in paper
            H1bar = H1bar.T
            H2bar = H2bar.T

            U, V = self.U, self.V

        # re-project data
        lv1_cca = H1bar.T.dot(U)
        lv2_cca_fixed = H2bar.T.dot(V)

        output = T.horizontal_stack(lv1_cca, lv2_cca_fixed)

        return output
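The compute_corr scan above only determines a sign per canonical dimension, which is then used to flip the corresponding columns of the projection. A minimal NumPy sketch of that sign-correction idea on toy data (assumed values, not the layer's actual tensors):

import numpy as np

rng = np.random.RandomState(0)
lv1 = rng.randn(100, 4)                       # first view, already projected
lv2 = lv1 * np.array([1.0, -1.0, 1.0, -1.0])  # second view with two flipped axes

# per-dimension correlation sign, as computed by compute_corr
signs = np.sign(np.sum(lv1 * lv2, axis=0) /
                (np.linalg.norm(lv1, axis=0) * np.linalg.norm(lv2, axis=0)))

# flipping the affected columns (V * corrs in the layer) restores positive correlation
lv2_fixed = lv2 * signs
assert np.all(np.sum(lv1 * lv2_fixed, axis=0) >= 0)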
Example #27
    def __call__(self, v):
        return T.horizontal_stack(*self.psamples[1:])
Example #28
    def __init__(self, rng=None, x_in=None, \
            p_s0_obs_given_z_obs=None, p_hi_given_si=None, p_sip1_given_si_hi=None, \
            p_x_given_si_hi=None, q_z_given_x=None, q_hi_given_x_si=None, \
            obs_dim=None, z_rnn_dim=None, z_obs_dim=None, h_dim=None, \
            model_init_obs=True, model_init_rnn=True, ir_steps=2, \
            params=None):
        # setup a rng for this GIPair
        self.rng = RandStream(rng.randint(100000))

        # TODO: implement functionality for working with "latent" si
        assert (p_x_given_si_hi is None)

        # decide whether to initialize from a model or from a "constant"
        self.model_init_obs = model_init_obs
        self.model_init_rnn = model_init_rnn

        # grab the user-provided parameters
        self.params = params
        self.x_type = self.params['x_type']
        assert ((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        if 'obs_transform' in self.params:
            assert((self.params['obs_transform'] == 'sigmoid') or \
                    (self.params['obs_transform'] == 'none'))
            if self.params['obs_transform'] == 'sigmoid':
                self.obs_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.obs_transform = lambda x: x
        else:
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        if self.x_type == 'bernoulli':
            self.obs_transform = lambda x: T.nnet.sigmoid(x)

        # record the dimensions of various spaces relevant to this model
        self.obs_dim = obs_dim
        self.rnn_dim = z_rnn_dim
        self.z_dim = z_rnn_dim + z_obs_dim
        self.z_rnn_dim = z_rnn_dim
        self.z_obs_dim = z_obs_dim
        self.h_dim = h_dim
        self.ir_steps = ir_steps

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this MultiStageModel
        self.x = x_in
        self.batch_reps = T.lscalar()

        # setup switching variable for changing between sampling/training
        zero_ary = np.zeros((1, )).astype(theano.config.floatX)
        self.train_switch = theano.shared(value=zero_ary,
                                          name='msm_train_switch')
        self.set_train_switch(1.0)
        # setup a weight for pulling priors over hi given si towards a
        # shared global prior -- e.g. zero mean and unit variance.
        self.kzg_weight = theano.shared(value=zero_ary, name='msm_kzg_weight')
        self.set_kzg_weight(0.1)
        # this weight balances l1 vs. l2 penalty on posterior KLds
        self.l1l2_weight = theano.shared(value=zero_ary,
                                         name='msm_l1l2_weight')
        self.set_l1l2_weight(1.0)

        #############################
        # Setup self.z and self.s0. #
        #############################
        print("Building MSM step 0...")
        obs_scale = 0.0
        rnn_scale = 0.0
        if self.model_init_obs:  # initialize obs state from generative model
            obs_scale = 1.0
        if self.model_init_rnn:  # initialize rnn state from generative model
            rnn_scale = 1.0
        self.q_z_given_x = q_z_given_x.shared_param_clone(rng=rng, Xd=self.x)
        self.z = self.q_z_given_x.output
        self.z_rnn = self.z[:, :self.z_rnn_dim]
        self.z_obs = self.z[:, self.z_rnn_dim:]
        self.p_s0_obs_given_z_obs = p_s0_obs_given_z_obs.shared_param_clone( \
                rng=rng, Xd=self.z_obs)
        _s0_obs_model = self.p_s0_obs_given_z_obs.output_mean
        _s0_obs_const = self.p_s0_obs_given_z_obs.mu_layers[-1].b
        self.s0_obs = (obs_scale * _s0_obs_model) + \
                ((1.0 - obs_scale) * _s0_obs_const)
        _s0_rnn_model = self.z_rnn
        _s0_rnn_const = self.q_z_given_x.mu_layers[-1].b[:self.z_rnn_dim]
        self.s0_rnn = (rnn_scale * _s0_rnn_model) + \
                ((1.0 - rnn_scale) * _s0_rnn_const)
        self.s0_jnt = T.horizontal_stack(self.s0_obs, self.s0_rnn)
        self.output_logvar = self.p_s0_obs_given_z_obs.sigma_layers[-1].b
        self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.output_logvar)

        ###############################################################
        # Setup the iterative refinement loop, starting from self.s0. #
        ###############################################################
        self.p_hi_given_si = []  # holds p_hi_given_si for each i
        self.p_sip1_given_si_hi = []  # holds p_sip1_given_si_hi for each i
        self.q_hi_given_x_si = []  # holds q_hi_given_x_si for each i
        self.si = [self.s0_jnt]  # holds si for each i
        self.hi = []  # holds hi for each i
        for i in range(self.ir_steps):
            print("Building MSM step {0:d}...".format(i + 1))
            _si = self.si[i]
            si_obs = _si[:, :self.obs_dim]
            si_rnn = _si[:, self.obs_dim:]
            # get samples of next hi, conditioned on current si
            self.p_hi_given_si.append( \
                    p_hi_given_si.shared_param_clone(rng=rng, \
                    Xd=T.horizontal_stack( \
                    self.obs_transform(si_obs), si_rnn)))
            hi_p = self.p_hi_given_si[i].output
            # now we build the model for variational hi given si
            grad_ll = self.x - self.obs_transform(si_obs)
            self.q_hi_given_x_si.append(\
                    q_hi_given_x_si.shared_param_clone(rng=rng, \
                    Xd=T.horizontal_stack( \
                    grad_ll, self.obs_transform(si_obs), si_rnn)))
            hi_q = self.q_hi_given_x_si[i].output
            # make hi samples that can be switched between hi_p and hi_q
            self.hi.append( ((self.train_switch[0] * hi_q) + \
                    ((1.0 - self.train_switch[0]) * hi_p)) )
            # p_sip1_given_si_hi is conditioned on hi and the "rnn" part of si.
            self.p_sip1_given_si_hi.append( \
                    p_sip1_given_si_hi.shared_param_clone(rng=rng, \
                    Xd=T.horizontal_stack(self.hi[i], si_rnn)))
            # construct the update from si_obs/si_rnn to sip1_obs/sip1_rnn
            sip1_obs = si_obs + self.p_sip1_given_si_hi[i].output_mean
            sip1_rnn = si_rnn
            sip1_jnt = T.horizontal_stack(sip1_obs, sip1_rnn)
            # record the updated state of the generative process
            self.si.append(sip1_jnt)
        # check that input/output dimensions of our models agree
        self._check_model_shapes()

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = np.zeros((1, )).astype(theano.config.floatX)
        self.lr_1 = theano.shared(value=zero_ary, name='msm_lr_1')
        self.lr_2 = theano.shared(value=zero_ary, name='msm_lr_2')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='msm_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='msm_mom_2')
        self.it_count = theano.shared(value=zero_ary, name='msm_it_count')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='msm_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_1 = theano.shared(value=zero_ary, name='msm_lam_kld_1')
        self.lam_kld_2 = theano.shared(value=zero_ary, name='msm_lam_kld_2')
        self.set_lam_kld(lam_kld_1=1.0, lam_kld_2=1.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # Grab all of the "optimizable" parameters in "group 1"
        self.group_1_params = []
        self.group_1_params.extend(self.q_z_given_x.mlp_params)
        self.group_1_params.extend(self.p_s0_obs_given_z_obs.mlp_params)
        # Grab all of the "optimizable" parameters in "group 2"
        self.group_2_params = []
        for i in range(self.ir_steps):
            self.group_2_params.extend(self.q_hi_given_x_si[i].mlp_params)
            self.group_2_params.extend(self.p_hi_given_si[i].mlp_params)
            self.group_2_params.extend(self.p_sip1_given_si_hi[i].mlp_params)
        # Make a joint list of parameters group 1/2
        self.joint_params = self.group_1_params + self.group_2_params

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_z, self.kld_hi_cond, self.kld_hi_glob = \
                self._construct_kld_costs()
        self.kld_cost = (self.lam_kld_1[0] * T.mean(self.kld_z)) + \
                (self.lam_kld_2[0] * (T.mean(self.kld_hi_cond) + \
                (self.kzg_weight[0] * T.mean(self.kld_hi_glob))))
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        self.nll_costs = self._construct_nll_costs()
        self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost

        # Get the gradient of the joint cost for all optimizable parameters
        self.joint_grads = OrderedDict()
        for p in self.joint_params:
            self.joint_grads[p] = T.grad(self.joint_cost, p)

        # Construct the updates for the generator and inferencer networks
        self.group_1_updates = get_param_updates(params=self.group_1_params, \
                grads=self.joint_grads, alpha=self.lr_1, \
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
                mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)
        self.group_2_updates = get_param_updates(params=self.group_2_params, \
                grads=self.joint_grads, alpha=self.lr_2, \
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
                mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0)
        self.joint_updates = OrderedDict()
        for k in self.group_1_updates:
            self.joint_updates[k] = self.group_1_updates[k]
        for k in self.group_2_updates:
            self.joint_updates[k] = self.group_2_updates[k]

        # Construct a function for jointly training the generator/inferencer
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        self.compute_post_klds = self._construct_compute_post_klds()
        self.compute_fe_terms = self._construct_compute_fe_terms()
        self.sample_from_prior = self._construct_sample_from_prior()
        # make easy access points for some interesting parameters
        self.inf_1_weights = self.q_z_given_x.shared_layers[0].W
        self.gen_1_weights = self.p_s0_obs_given_z_obs.mu_layers[-1].W
        self.inf_2_weights = self.q_hi_given_x_si[0].shared_layers[0].W
        self.gen_2_weights = self.p_sip1_given_si_hi[0].mu_layers[-1].W
        self.gen_inf_weights = self.p_hi_given_si[0].shared_layers[0].W
        return
Example #29
    def __call__(self, v):
        return T.horizontal_stack(*self.psamples[1:])
Example #30
def model_mimlcnn(datasets, Wordv, PF1v, PF2v, img_h, WForATData, linearW):
    """
    Build the model.
    :param datasets: the datasets passed in.
    :param Wordv: "word/token - embedding" matrix.
    :param PF1v: "position feature 1 - embedding" matrix.
    :param PF2v: "position feature 2 - embedding" matrix.
    :param img_h: sentence length after padding.
    :param WForATData: matrix used by the attention mechanism to score each word.
    :param linearW: linear transform applied to the attention-weighted sentence representation.
    :return: all compiled models.
    """

    # 1. Hyperparameter setup
    logging.info('-------------- Model Settings --------------------')
    # if word embedding is initialized with 'word2vec', then 'length' is set by the dimension of the word vector automatically;
    # if initialized with 'rand', then the specified value of 'length' is used.
    # is_static = False
    w2v_static = conf.getboolean('word_vector', 'is_static')

    # pfv_length = 5 wordv_length = 50 img_W = 50 + 5 * 2 = 60
    image_shape = (None, 1, img_h, _IMG_W)

    cp1_filter_shapes = []
    cp1_pool_sizes = []

    cp2_filter_shape = None
    cp2_pool_size = None

    assert len(_CP1_FILTER_HS) == 1

    # use_stacked_cp = False
    if not _USE_STACK_CP:
        # _CP1_FILTER_HS = [3]
        for filter_h in _CP1_FILTER_HS:
            # cp1_n_filters = 230
            # _IMG_W = _LEN_WORDV + 2 * _LEN_PFV _CP1_FILTER_W = _IMG_W
            # 230,1,3,60
            # the filter extracts features over 3 words at a time
            cp1_filter_shapes.append(
                (_CP1_N_FILTERS, 1, filter_h, _CP1_FILTER_W))
            # number of rows after filtering: 86, 1
            cp1_pool_sizes.append(
                (img_h - filter_h + 1, _IMG_W - _CP1_FILTER_W + 1))
    else:

        cp1_filter_shapes.append(
            (_CP1_N_FILTERS, 1, _CP1_FILTER_HS[0], _CP1_FILTER_W))

        cp1_pool_sizes.append(_CP1_POOL_SIZE_4SCP)

        cp2_filter_shape = [_CP2_N_FILTERS, _CP1_N_FILTERS, _CP2_FILTER_H, 1]
        cp1_fm_img_h = image_shape[2] - _CP1_FILTER_HS[0] + 1
        cp2_img_h = int(np.ceil(cp1_fm_img_h / float(cp1_pool_sizes[0][0])))
        cp2_pool_size = [cp2_img_h - _CP2_FILTER_H + 1, 1]

    logging.info('|     - image_shape: {0}'.format(image_shape))
    logging.info('|     - cp1_filter_shapes: {0}'.format(cp1_filter_shapes))
    logging.info('|     - cp1_non_linear: {0}'.format(_CP1_NON_LINEAR))
    logging.info('|     - cp1_pool_sizes: {0}'.format(cp1_pool_sizes))

    if _USE_STACK_CP:
        logging.info('|     - cp2_filter_shape: {0}'.format(cp2_filter_shape))
        logging.info('|     - cp2_non_linear: {0}'.format(_CP2_NON_LINEAR))
        logging.info('|     - cp2_pool_sizes: {0}'.format(cp2_pool_size))

    logging.info('|     - initial mlp_shape: {0}'.format(_MLP_SHAPE))
    logging.info('|     - dropout_rates: {0}'.format(_DROPOUT_RATES))
    logging.info('|     - batch_size: {0}'.format(_BATCH_SIZE))
    logging.info('|     - word_embedding_length: {0}'.format(_LEN_WORDV))
    logging.info('|     - word_embedding_initialization: {0}'.format(
        conf.get('word_vector', 'initialization')))
    logging.info('|     - word_embedding_static: {0}'.format(w2v_static))
    logging.info('|     - shuffle_batch: {0}'.format(_SHUFFLE_BATCH))
    logging.info('|     - lr_decay: {0}'.format(_LR_DECAY))
    logging.info('|     - sqr_norm_lim: {0}'.format(_SQR_NORM_LIM))
    logging.info('|     - learning_rate: {0}'.format(_LR))
    logging.info('|     - cost_type: {0}'.format(conf.get('mode',
                                                          'cost_type')))
    logging.info('|         - pr_margin: {0}'.format(
        conf.getfloat('mode', 'pr_margin')))
    logging.info('|         - score_margin: {0}'.format(
        conf.getfloat('mode', 'score_margin')))
    logging.info('|     - prediction_type: larger than 0.5 for each label')
    logging.info('--------------------------------------------------')

    # 2. Build the model inputs
    logging.info('  - Defining model variables for one mini-batch')

    bch_idx = T.scalar('batch_idx', dtype='int32')
    # bch_idx.tag.test_value = 1

    xs = T.matrix('xs', dtype='int32')

    # 3 sentences
    # do not be misled by the test_value: xs is (n_sentences x 88)
    xs.tag.test_value = np.asarray(
        [[0, 0, 0, 0, 3, 2, 3, 7, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 1, 2, 4, 1, 8, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 3, 1, 6, 5, 3, 0, 0, 0, 0, 0, 0]],
        dtype='int32')

    pfinfos = T.matrix('pfinfos', dtype='int32')
    pfinfos.tag.test_value = np.array([[4, 2, 1], [5, 1, 3], [5, 0, 1]])

    # p0_in_PFv = 52
    p0_in_PFv = conf.getint("settings", "p0_in_PFv")

    # pad the position-feature information
    def cal_padded_sentpf(pfinfo_m):

        slen = pfinfo_m[0]
        e1i = pfinfo_m[1]
        e2i = pfinfo_m[2]

        pf1 = T.arange(p0_in_PFv - e1i,
                       p0_in_PFv + (slen - e1i),
                       dtype='int32')

        pf2 = T.arange(p0_in_PFv - e2i,
                       p0_in_PFv + (slen - e2i),
                       dtype='int32')

        # clip to a minimum of 1 and a maximum of 101; length equals the sentence length
        clipped_pf1 = T.clip(pf1, 1, 101)
        clipped_pf2 = T.clip(pf2, 1, 101)

        # _N_PAD_HEAD = 4
        pad_head = T.zeros(shape=(_N_PAD_HEAD, ), dtype='int32')
        pad_tail = T.zeros(shape=(img_h - _N_PAD_HEAD - slen, ), dtype='int32')

        # concatenate the pieces into a single list
        pf1_padded = T.concatenate([pad_head, clipped_pf1,
                                    pad_tail])  # the three parts sum to the padded length
        pf2_padded = T.concatenate([pad_head, clipped_pf2, pad_tail])

        return pf1_padded, pf2_padded

    # Pads three pieces of information: sentence length, entity-1 position and entity-2
    # position. After padding the vector length is 88; the head and tail entries are 0,
    # and the relative-position entries for entities 1 and 2 lie between 1 and 101.
    # Returns [entity-1 position padding], [entity-2 position padding], both 88-dim:
    # the first 4 entries are 0, the middle span has the sentence's length, the rest is 0.
    (pf1s, pf2s), _ = theano.scan(fn=cal_padded_sentpf, sequences=[pfinfos])
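    # Worked example (assumed toy values, not taken from the data): with
    # p0_in_PFv = 52, _N_PAD_HEAD = 4 and img_h = 15, a row pfinfo_m = [5, 1, 3]
    # (sentence length 5, entity 1 at index 1, entity 2 at index 3) gives
    #   pf1 = arange(51, 56) = [51, 52, 53, 54, 55]   (already within [1, 101])
    #   pf1_padded = [0, 0, 0, 0, 51, 52, 53, 54, 55, 0, 0, 0, 0, 0, 0]
    # i.e. 4 leading zeros, one clipped offset per word, then zeros up to img_h.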

    e1is = pfinfos[:, 1]
    e2is = pfinfos[:, 2]

    # Each contiguous x_m slice passed in is indexed from 0, while ep2m still indexes
    # into the full set of x_m in the dataset. So subtract the initial offset from every
    # element of ep2m so that the indices of xs line up with each start position in ep2m.
    ep2m_raw = T.matrix('ep2m_raw', dtype='int32')
    ep2m_raw.tag.test_value = np.asarray([[25, 27], [27, 28]], dtype='int32')

    ep2m = ep2m_raw - ep2m_raw[0][0]
    ep2m.tag.test_value = np.asarray([[0, 2], [2, 3]], dtype='int32')

    ys = T.matrix('ys', dtype='int32')
    # _N_RELATIONS = 26
    yl1 = [0] * _N_RELATIONS
    yl1[2] = 1
    yl2 = [0] * _N_RELATIONS
    yl2[5] = 1
    ys.tag.test_value = np.asarray([yl1, yl2], dtype='int32')

    # 3. Define the model structure, outputs and loss
    assert pf1s.dtype == 'int32' and pf2s.dtype == 'int32' and xs.dtype == 'int32'

    _use_my_input = True

    if _use_my_input:
        # 1. My concatenation method
        # Wordv holds the word vectors initialized with word2vec
        # concatenate the inputs along dim 1; each sentence, originally a vector of
        # indices, becomes a matrix of embeddings
        # Wordv[xs.flatten()] = [50-dim word vectors]
        # pf1s =
        fltn_vec_stk = T.horizontal_stack(Wordv[xs.flatten()],
                                          PF1v[pf1s.flatten()],
                                          PF2v[pf2s.flatten()])
        # n_sentences x 1 x 88 x 60
        cp_layer_input = fltn_vec_stk.reshape(
            (xs.shape[0], 1, xs.shape[1], _IMG_W))

    else:
        # 2. Zeng's concatenation method
        input_words = Wordv[xs.flatten()].reshape(
            (xs.shape[0], 1, xs.shape[1], _LEN_WORDV))
        input_pf1s = PF1v[pf1s.flatten()].reshape(
            (pf1s.shape[0], 1, pf1s.shape[1], _LEN_PFV))
        input_pf2s = PF2v[pf2s.flatten()].reshape(
            (pf2s.shape[0], 1, pf2s.shape[1], _LEN_PFV))
        cp_layer_input = T.concatenate([input_words, input_pf1s, input_pf2s],
                                       axis=3)

    logging.info('  - Defining and assembling CP layer')
    cp_params = []

    # n_sentences x 1 x 88 x 60
    # cp_layer_input =

    # produce a 60-dim vector here, to be appended after the conv layer output
    # input = 1×88×60
    def atData(input, left, right):
        sentence = input[0]

        min = T.switch(T.lt(left, right), left, right)
        max = T.switch(T.lt(left, right), right, left)

        sentenceHead = sentence[:(min + _N_PAD_HEAD)]
        sentenceMiddle = sentence[(min + _N_PAD_HEAD + 1):(max + _N_PAD_HEAD)]
        sentenceTail = sentence[(max + _N_PAD_HEAD + 1):]

        # the two entity-pair tokens are removed
        # 86×60
        newSentence = T.vertical_stack(sentenceHead, sentenceMiddle,
                                       sentenceTail)

        leftEntity = sentence[min + _N_PAD_HEAD]
        rightEntity = sentence[max + _N_PAD_HEAD]

        LRConnect = T.concatenate([leftEntity, rightEntity])

        def AtLayerData(LRConnect, newSentenceCon):
            def forEveryWord(word):
                temp = T.concatenate([word, LRConnect])
                # return T.concatenate(temp, rightEntity)
                return temp

            # append the two entity-pair vectors to each word
            # 86×180
            sentenceAfAdd, _ = theano.scan(forEveryWord,
                                           sequences=newSentenceCon)

            eForWord = T.dot(sentenceAfAdd, WForATData)

            aForWord = T.nnet.softmax(eForWord)[0]

            def mulWeight(word, weight):
                return word * weight

            # 86×60
            newSRep, _ = theano.scan(mulWeight,
                                     sequences=[newSentence, aForWord])

            # 1×60
            finalSRep = T.sum(newSRep, axis=0)

            return T.dot(finalSRep, linearW)

        finalSRep, _ = theano.scan(AtLayerData,
                                   outputs_info=LRConnect,
                                   non_sequences=newSentence,
                                   n_steps=NUMBER_DATA)

        return finalSRep[-1]

    myobser1, _ = theano.scan(atData, sequences=[cp_layer_input, e1is, e2is])

    # No CNN

    # cp_out = n_sentences x 690; myobser1 = n_sentences x 120
    # new_cp_out = n_sentences x 120
    new_cp_out = myobser1

    # ****************
    # ***** original code *****
    # *****************
    #
    #

    def ep_max_pooling(ep_mr, csmp_input):
        # the extracted slice is still an (n_sentences x 120) matrix
        input_41ep = csmp_input[ep_mr[0]:ep_mr[1]]
        # Cross-sentence Max-pooling
        max_pooling_out = T.max(input_41ep, axis=0)
        # the return value is the entity-pair representation
        return max_pooling_out

    logging.info('  - Assembling second Max-Pooling layer')

    # list of entity-pair representations
    # n_examples x (690+120)
    sec_maxPooling_out, _ = theano.scan(fn=ep_max_pooling,
                                        sequences=ep2m,
                                        non_sequences=new_cp_out)
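    # Worked example (assumed toy values): if new_cp_out were
    #   [[1, 4], [3, 2], [5, 0]]
    # and ep2m = [[0, 2], [2, 3]], then sec_maxPooling_out would be
    #   [[3, 4],   # element-wise max over rows 0..1 (first entity pair)
    #    [5, 0]]   # row 2 alone (second entity pair)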

    logging.info('  - Defining MLP layer')
    if not _USE_STACK_CP and _USE_PIECEWISE_POOLING_41CP:
        _MLP_SHAPE[0] = 2 * _IMG_W
        logging.info(
            "    - MLP shape changes to {0}, because of piecewise max-pooling".
            format(_MLP_SHAPE))

    mlp_layer = MLPDropout(rng,
                           layer_sizes=_MLP_SHAPE,
                           activations=_MLP_ACTIVATIONS,
                           dropout_rates=_DROPOUT_RATES)
    mlp_layer.feed(sec_maxPooling_out,
                   input_shape=(_BATCH_SIZE, _MLP_SHAPE[0]))
    dropout_score_batch = mlp_layer.dropout_layers[-1].score
    score_batch = mlp_layer.layers[-1].score
    dropout_p_ygx_batch = T.nnet.sigmoid(dropout_score_batch)
    p_ygx_batch = T.nnet.sigmoid(score_batch)
    obz_lr_masks = mlp_layer.lrmask
    predictions = predict_relations(p_ygx_batch)
    pred_pscores = p_ygx_batch

    # **************************
    # ***** add the attention mechanism *****
    # **************************

    # for a single relation

    # def forEveryExample(ep_mr, csmp_input):
    #     # the extracted slice is still an (n_sentences x 690) matrix
    #     # these sentences all belong to one entity pair
    #
    #     input_41ep = csmp_input[ep_mr[0]: ep_mr[1]]
    #
    #     def forEverySentence(item):
    #         temp = T.dot(item, WForAT)
    #         # ???? change this
    #         re = T.dot(temp, RForAT[0])
    #         return re
    #
    #     slist, noup = theano.scan(forEverySentence, sequences=input_41ep)
    #
    #     aForRj = T.nnet.softmax(slist)[0]
    #
    #     def mulWeight(sentence, weight):
    #         return sentence * weight
    #
    #     newSRep, noup = theano.scan(mulWeight, sequences=[input_41ep, aForRj])
    #
    #     finalresult = T.sum(newSRep, axis=0)
    #
    #     # return finalresult
    #     return finalresult
    #
    # # # list of entity-pair representations
    # my_sec_add_out, _ = theano.scan(fn=forEveryExample, sequences=ep2m, non_sequences=cp_out)
    #
    # logging.info('  - Defining MLP layer')
    # # _USE_STACK_CP = False
    # # _USE_PIECEWISE_POOLING_41CP = True
    # if not _USE_STACK_CP and _USE_PIECEWISE_POOLING_41CP:
    #     # _MLP_SHAPE = [230, 26]
    #     _MLP_SHAPE[0] *= 3
    #     # _MLP_SHAPE = [690, 26]
    #     logging.info("    - MLP shape changes to {0}, because of piecewise max-pooling".format(_MLP_SHAPE))
    #
    # # _MLP_SHAPE = [690,26] _MLP_ACTIVATIONS = [Iden] dropout_rates = [0.5]
    # mlp_layer = MLPDropout(rng, layer_sizes=_MLP_SHAPE, activations=_MLP_ACTIVATIONS, dropout_rates=_DROPOUT_RATES)
    # # input_shape = (batch_size = 50,_MLP_SHAPE[0] = 690)
    # mlp_layer.feed(my_sec_add_out, input_shape=(_BATCH_SIZE, _MLP_SHAPE[0]))

    #
    # # for all 26 relations
    #
    # logging.info('  - Defining MLP layer')
    #
    # assert _USE_STACK_CP == False
    # assert _USE_PIECEWISE_POOLING_41CP == True
    #
    # if not _USE_STACK_CP and _USE_PIECEWISE_POOLING_41CP:
    #     # _MLP_SHAPE = [230, 26]
    #     _MLP_SHAPE[0] *= 3
    #     # _MLP_SHAPE = [690, 26]
    #     logging.info("    - MLP shape changes to {0}, because of piecewise max-pooling".format(_MLP_SHAPE))
    #
    # # _MLP_SHAPE = [690,26] _MLP_ACTIVATIONS = [Iden] dropout_rates = [0.5]
    #
    #
    # my_mlp_layer = MyMLPDropout(rng, layer_sizes=[[_MLP_SHAPE[0] + _IMG_W * 2, 2]], activations=_MLP_ACTIVATIONS,
    #                             dropout_rates=_DROPOUT_RATES)
    #
    # def forEveryRelation(idx, ep2m, cp_out):
    #     def forEveryExample(ep_mr, csmp_input):
    #         # the extracted slice is still an (n_sentences x (690+120)) matrix
    #         # these sentences all belong to one entity pair
    #
    #         input_41ep = csmp_input[ep_mr[0]: ep_mr[1]]
    #
    #         def attentionLayer(R, input_41ep_out):
    #             def forEverySentence(item):
    #                 temp = T.dot(item, WForAT)
    #                 # ???? change this
    #                 re = T.dot(temp, R)
    #                 return re
    #
    #             # slist is the e_i scores
    #             slist, noup = theano.scan(forEverySentence, sequences=input_41ep_out)
    #
    #             aForRj = T.nnet.softmax(slist)[0]
    #
    #             def mulWeight(sentence, weight):
    #                 return sentence * weight
    #
    #             # n_sentences x (690+120)
    #             newSRep, noup = theano.scan(mulWeight, sequences=[input_41ep_out, aForRj])
    #
    #             # 1×(690+120)
    #             finalresult = T.sum(newSRep, axis=0)
    #
    #             return finalresult
    #
    #         # n_attention_layers x 1 x (690+120)
    #         newSRepAf, _ = theano.scan(attentionLayer, outputs_info=RForAT[idx],
    #                                    non_sequences=input_41ep, n_steps=NUMBER)
    #
    #         # finalresult = T.sum(newSRepAf[-1], axis=0)
    #
    #         # return finalresult
    #         # do it all in one pass
    #
    #         return newSRepAf[-1]
    #
    #     # (50, (690+120))
    #     my_sec_add_out, _ = theano.scan(fn=forEveryExample, sequences=ep2m, non_sequences=[cp_out])
    #
    #     return my_sec_add_out
    #
    # idx = T.ivector()
    # # ok = (26,50,(690+120))
    # ok, up = theano.scan(forEveryRelation, sequences=[idx],
    #                      non_sequences=[ep2m, new_cp_out])
    #
    # # (26, 50, (690 + 120))
    # normalre, dropoutre = my_mlp_layer.feed(idx, ok,
    #                                         input_shape=(_N_RELATIONS, _BATCH_SIZE, (_MLP_SHAPE[0] + 2 * _IMG_W)))

    logging.info(' - Cost, params and grads ...')
    # use this cost to update the weights
    # the loss does not use the raw score (the 26-dim vector from the matrix product);
    # it is computed from the score normalized with a sigmoid (p_ygx)
    dropout_cost = compute_cost(dropout_p_ygx_batch, ys)
    cost = compute_cost(p_ygx_batch, ys)

    op_params = []
    params = []
    op_params += [Wordv]
    # is_static = False
    if not w2v_static:  # if word vectors are allowed to change, add them as model parameters
        params += [Wordv]

    op_params += [PF1v, PF2v]
    params += [PF1v, PF2v]

    op_params += cp_params
    params += cp_params

    op_params += mlp_layer.params
    params += mlp_layer.params

    op_params += [WForATData, linearW]
    params += [WForATData, linearW]

    # op_params += [WForAT, RForAT]
    # params += [WForAT, RForAT]

    logging.info('Params to update: ' +
                 str(', '.join([param.name for param in params])))
    logging.info('Params to output: ' +
                 str(', '.join([op_param.name for op_param in op_params])))

    # 5. Weight update rule
    grad_updates = lasagne.updates.adadelta(dropout_cost, params)

    # 6. Define the theano functions
    train_x_m, train_PFinfo_m, train_ep2m, train_y, test_x_m, test_PFinfo_m, test_ep2m, test_y = datasets

    logging.info('Compiling train_update_model...')
    output_list = [cost, dropout_cost]
    if conf.getboolean('mode', 'output_details'):
        output_list += ([obz_lr_masks, p_ygx_batch] + op_params)
    train_update_model = theano.function(
        [bch_idx],
        output_list,
        updates=grad_updates,
        name='train_update_model',
        givens={
            xs: get_1batch_x_m(bch_idx, train_x_m, train_ep2m),
            pfinfos: get_1batch_x_m(bch_idx, train_PFinfo_m, train_ep2m),
            ep2m_raw:
            train_ep2m[bch_idx * _BATCH_SIZE:(bch_idx + 1) * _BATCH_SIZE],
            ys: train_y[bch_idx * _BATCH_SIZE:(bch_idx + 1) * _BATCH_SIZE],
        },
    )
    #  }, on_unused_input='warn')

    logging.info('Compiling set_zero function ...')
    Wordv_0sline = T.vector("Wordv_0sline", dtype=theano.config.floatX)
    PFv_0sline = T.vector("PFv_0sline", dtype=theano.config.floatX)
    set_zero = theano.function(
        [Wordv_0sline, PFv_0sline],
        updates=[(Wordv, T.set_subtensor(Wordv[0, :], Wordv_0sline)),
                 (PF1v, T.set_subtensor(PF1v[0, :], PFv_0sline)),
                 (PF2v, T.set_subtensor(PF2v[0, :], PFv_0sline))])

    logging.info('Compiling trainset_error_model ...')
    trainset_error_model = theano.function(
        [bch_idx], [predictions, pred_pscores],
        givens={
            xs:
            get_1batch_x_m(bch_idx, train_x_m, train_ep2m),
            pfinfos:
            get_1batch_x_m(bch_idx, train_PFinfo_m, train_ep2m),
            ep2m_raw:
            train_ep2m[bch_idx * _BATCH_SIZE:(bch_idx + 1) * _BATCH_SIZE],
        })

    logging.info('Compiling testset_error_model ...')
    testset_error_model = theano.function(
        [bch_idx], [predictions, pred_pscores],
        givens={
            xs: get_1batch_x_m(bch_idx, test_x_m, test_ep2m),
            pfinfos: get_1batch_x_m(bch_idx, test_PFinfo_m, test_ep2m),
            ep2m_raw:
            test_ep2m[bch_idx * _BATCH_SIZE:(bch_idx + 1) * _BATCH_SIZE],
        })

    init_LR_W = mlp_layer.dropout_layers[-1].W.get_value()
    init_LR_b = mlp_layer.dropout_layers[-1].b.get_value()

    return train_update_model, trainset_error_model, testset_error_model, set_zero, init_LR_W, init_LR_b
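A hypothetical driver loop for the compiled functions returned above (a sketch under assumptions: the batch count, epoch count and zero-vector lengths below are illustrative, not taken from the source):

import numpy as np

(train_update_model, trainset_error_model, testset_error_model,
 set_zero, init_LR_W, init_LR_b) = model_mimlcnn(datasets, Wordv, PF1v, PF2v,
                                                 img_h, WForATData, linearW)

n_train_batches = 100                        # assumed; roughly len(train_ep2m) // _BATCH_SIZE
wordv_zero = np.zeros(50, dtype='float32')   # assumed word embedding length
pfv_zero = np.zeros(5, dtype='float32')      # assumed position-feature embedding length

for epoch in range(10):
    for bch_idx in np.random.permutation(n_train_batches).astype('int32'):
        cost, dropout_cost = train_update_model(bch_idx)[:2]
        set_zero(wordv_zero, pfv_zero)       # keep the padding embeddings at zero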
Example #31
import theano
import theano.tensor as T
import numpy as np

a = T.imatrix()
b = T.imatrix()

ok = T.horizontal_stack(a, b)

myfunc = theano.function([a, b], ok)

a_init = np.reshape(np.arange(10, dtype='int32'), (2, 5))
b_init = np.reshape(np.arange(10, 20, dtype='int32'), (2, 5))

ok = myfunc(a_init, b_init)

print(ok)
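# With the inputs above, T.horizontal_stack concatenates the two (2, 5) matrices
# along axis 1, so the printed result should be:
# [[ 0  1  2  3  4 10 11 12 13 14]
#  [ 5  6  7  8  9 15 16 17 18 19]]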
    def __init__(self, rng=None, x_in=None, \
            p_s0_obs_given_z_obs=None, p_hi_given_si=None, p_sip1_given_si_hi=None, \
            p_x_given_si_hi=None, q_z_given_x=None, q_hi_given_x_si=None, \
            obs_dim=None, z_dim=None, h_dim=None, \
            model_init_obs=True, ir_steps=2, \
            params=None):
        # setup a rng for this GIPair
        self.rng = RandStream(rng.randint(100000))

        # TODO: implement functionality for working with "latent" si
        assert(p_x_given_si_hi is None)

        # decide whether to initialize from a model or from a "constant"
        self.model_init_obs = model_init_obs

        # grab the user-provided parameters
        self.params = params
        self.x_type = self.params['x_type']
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        if 'obs_transform' in self.params:
            assert((self.params['obs_transform'] == 'sigmoid') or \
                    (self.params['obs_transform'] == 'none'))
            if self.params['obs_transform'] == 'sigmoid':
                self.obs_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.obs_transform = lambda x: x
        else:
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        if self.x_type == 'bernoulli':
            self.obs_transform = lambda x: T.nnet.sigmoid(x)

        # record the dimensions of various spaces relevant to this model
        self.obs_dim = obs_dim
        self.z_dim = z_dim
        self.h_dim = h_dim
        self.ir_steps = ir_steps

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this MultiStageModel
        self.x = x_in
        self.batch_reps = T.lscalar()

        # setup switching variable for changing between sampling/training
        zero_ary = np.zeros((1,)).astype(theano.config.floatX)
        self.train_switch = theano.shared(value=zero_ary, name='msm_train_switch')
        self.set_train_switch(1.0)
        # setup a weight for pulling priors over hi given si towards a
        # shared global prior -- e.g. zero mean and unit variance.
        self.kzg_weight = theano.shared(value=zero_ary, name='msm_kzg_weight')
        self.set_kzg_weight(0.1)
        # this weight balances l1 vs. l2 penalty on posterior KLds
        self.l1l2_weight = theano.shared(value=zero_ary, name='msm_l1l2_weight')
        self.set_l1l2_weight(1.0)
        # this parameter controls dropout rate in the generator read function
        self.drop_rate = theano.shared(value=zero_ary, name='msm_drop_rate')
        self.set_drop_rate(0.0)

        #############################
        # Setup self.z and self.s0. #
        #############################
        print("Building MSM step 0...")
        obs_scale = 0.0
        if self.model_init_obs: # initialize obs state from generative model
            obs_scale = 1.0
        self.q_z_given_x = q_z_given_x.shared_param_clone(rng=rng, Xd=self.x)
        self.z = self.q_z_given_x.output
        self.p_s0_obs_given_z_obs = p_s0_obs_given_z_obs.shared_param_clone( \
                rng=rng, Xd=self.z)
        _s0_obs_model = self.p_s0_obs_given_z_obs.output_mean
        _s0_obs_const = self.p_s0_obs_given_z_obs.mu_layers[-1].b
        self.s0_obs = (obs_scale * _s0_obs_model) + \
                ((1.0 - obs_scale) * _s0_obs_const)
        self.output_logvar = self.p_s0_obs_given_z_obs.sigma_layers[-1].b
        self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.output_logvar)

        ###############################################################
        # Setup the iterative refinement loop, starting from self.s0. #
        ###############################################################
        self.p_hi_given_si = []       # holds p_hi_given_si for each i
        self.p_sip1_given_si_hi = []  # holds p_sip1_given_si_hi for each i
        self.q_hi_given_x_si = []     # holds q_hi_given_x_si for each i
        self.si = [self.s0_obs]       # holds si for each i
        self.hi = []                  # holds hi for each i
        for i in range(self.ir_steps):
            print("Building MSM step {0:d}...".format(i+1))
            si_obs = self.si[i]
            # get samples of next hi, conditioned on current si
            self.p_hi_given_si.append( \
                    p_hi_given_si.shared_param_clone(rng=rng, \
                    Xd=self.obs_transform(si_obs)))
            hi_p = self.p_hi_given_si[i].output
            # now we build the model for variational hi given si
            grad_ll = self.x - self.obs_transform(si_obs)
            self.q_hi_given_x_si.append(\
                    q_hi_given_x_si.shared_param_clone(rng=rng, \
                    Xd=T.horizontal_stack( \
                    grad_ll, self.obs_transform(si_obs))))
            hi_q = self.q_hi_given_x_si[i].output
            # make hi samples that can be switched between hi_p and hi_q
            self.hi.append( ((self.train_switch[0] * hi_q) + \
                    ((1.0 - self.train_switch[0]) * hi_p)) )
            # p_sip1_given_si_hi is conditioned on hi.
            self.p_sip1_given_si_hi.append( \
                    p_sip1_given_si_hi.shared_param_clone(rng=rng, \
                    Xd=self.hi[i]))
            # construct the update from si_obs to sip1_obs
            sip1_obs = si_obs + self.p_sip1_given_si_hi[i].output_mean
            # record the updated state of the generative process
            self.si.append(sip1_obs)

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = np.zeros((1,)).astype(theano.config.floatX)
        self.lr_1 = theano.shared(value=zero_ary, name='msm_lr_1')
        self.lr_2 = theano.shared(value=zero_ary, name='msm_lr_2')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='msm_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='msm_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='msm_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_1 = theano.shared(value=zero_ary, name='msm_lam_kld_1')
        self.lam_kld_2 = theano.shared(value=zero_ary, name='msm_lam_kld_2')
        self.set_lam_kld(lam_kld_1=1.0, lam_kld_2=1.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # Grab all of the "optimizable" parameters in "group 1"
        self.group_1_params = []
        self.group_1_params.extend(self.q_z_given_x.mlp_params)
        self.group_1_params.extend(self.p_s0_obs_given_z_obs.mlp_params)
        # Grab all of the "optimizable" parameters in "group 2"
        self.group_2_params = []
        for i in range(self.ir_steps):
            self.group_2_params.extend(self.q_hi_given_x_si[i].mlp_params)
            self.group_2_params.extend(self.p_hi_given_si[i].mlp_params)
            self.group_2_params.extend(self.p_sip1_given_si_hi[i].mlp_params)
        # Make a joint list of parameters group 1/2
        self.joint_params = self.group_1_params + self.group_2_params

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_z, self.kld_hi_cond, self.kld_hi_glob = \
                self._construct_kld_costs()
        self.kld_cost = (self.lam_kld_1[0] * T.mean(self.kld_z)) + \
                (self.lam_kld_2[0] * (T.mean(self.kld_hi_cond) + \
                (self.kzg_weight[0] * T.mean(self.kld_hi_glob))))
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        self.nll_costs = self._construct_nll_costs()
        self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the generator and inferencer networks
        self.group_1_updates = get_adam_updates(params=self.group_1_params, \
                grads=self.joint_grads, alpha=self.lr_1, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        self.group_2_updates = get_adam_updates(params=self.group_2_params, \
                grads=self.joint_grads, alpha=self.lr_2, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        self.joint_updates = OrderedDict()
        for k in self.group_1_updates:
            self.joint_updates[k] = self.group_1_updates[k]
        for k in self.group_2_updates:
            self.joint_updates[k] = self.group_2_updates[k]

        # Construct a function for jointly training the generator/inferencer
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        self.compute_post_klds = self._construct_compute_post_klds()
        self.compute_fe_terms = self._construct_compute_fe_terms()
        self.sample_from_prior = self._construct_sample_from_prior()
        # make easy access points for some interesting parameters
        self.inf_1_weights = self.q_z_given_x.shared_layers[0].W
        self.gen_1_weights = self.p_s0_obs_given_z_obs.mu_layers[-1].W
        self.inf_2_weights = self.q_hi_given_x_si[0].shared_layers[0].W
        self.gen_2_weights = self.p_sip1_given_si_hi[0].mu_layers[-1].W
        self.gen_inf_weights = self.p_hi_given_si[0].shared_layers[0].W
        return
    def __init__(self, rng=None, \
            x_in=None, x_out=None, \
            p_s_given_z=None, \
            p_h_given_s=None, \
            p_x_given_s_h=None, \
            q_z_given_x=None, \
            q_h_given_x_s=None, \
            x_dim=None, \
            z_dim=None, \
            s_dim=None, \
            h_dim=None, \
            params=None, \
            shared_param_dicts=None):
        # setup a rng for this model
        self.rng = RandStream(rng.randint(100000))

        # grab the user-provided parameters
        self.params = params
        self.x_type = self.params['x_type']
        assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian'))
        if 'obs_transform' in self.params:
            assert((self.params['obs_transform'] == 'sigmoid') or \
                    (self.params['obs_transform'] == 'none'))
            if self.params['obs_transform'] == 'sigmoid':
                self.obs_transform = lambda x: T.nnet.sigmoid(x)
            else:
                self.obs_transform = lambda x: x
        else:
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        if self.x_type == 'bernoulli':
            self.obs_transform = lambda x: T.nnet.sigmoid(x)
        self.shared_param_dicts = shared_param_dicts

        # record the dimensions of various spaces relevant to this model
        self.x_dim = x_dim
        self.z_dim = z_dim
        self.s_dim = s_dim
        self.h_dim = h_dim

        # grab handles to the relevant InfNets
        self.q_z_given_x = q_z_given_x
        self.q_h_given_x_s = q_h_given_x_s
        self.p_s_given_z = p_s_given_z
        self.p_h_given_s = p_h_given_s
        self.p_x_given_s_h = p_x_given_s_h

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this MultiStageModel
        self.x_in = x_in
        self.x_out = x_out
        self.batch_reps = T.lscalar()

        # setup switching variable for changing between sampling/training
        zero_ary = to_fX( np.zeros((1,)) )
        self.train_switch = theano.shared(value=zero_ary, name='msm_train_switch')
        self.set_train_switch(1.0)
        # setup a variable for controlling dropout noise
        self.drop_rate = theano.shared(value=zero_ary, name='msm_drop_rate')
        self.set_drop_rate(0.0)
        # this weight balances l1 vs. l2 penalty on posterior KLds
        self.lam_kld_l1l2 = theano.shared(value=zero_ary, name='msm_lam_kld_l1l2')
        self.set_lam_kld_l1l2(1.0)

        if self.shared_param_dicts is None:
            # initialize "optimizable" parameters specific to this MSM
            init_vec = to_fX( np.zeros((self.z_dim,)) )
            self.p_z_mean = theano.shared(value=init_vec, name='msm_p_z_mean')
            self.p_z_logvar = theano.shared(value=init_vec, name='msm_p_z_logvar')
            init_vec = to_fX( np.zeros((self.x_dim,)) )
            self.obs_logvar = theano.shared(value=zero_ary, name='msm_obs_logvar')
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)
            self.shared_param_dicts = {}
            self.shared_param_dicts['p_z_mean'] = self.p_z_mean
            self.shared_param_dicts['p_z_logvar'] = self.p_z_logvar
            self.shared_param_dicts['obs_logvar'] = self.obs_logvar
        else:
            self.p_z_mean = self.shared_param_dicts['p_z_mean']
            self.p_z_logvar = self.shared_param_dicts['p_z_logvar']
            self.obs_logvar = self.shared_param_dicts['obs_logvar']
            self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar)

        # get a drop mask that drops things with probability p
        drop_scale = 1. / (1. - self.drop_rate[0])
        drop_rnd = self.rng.uniform(size=self.x_out.shape, \
                low=0.0, high=1.0, dtype=theano.config.floatX)
        drop_mask = drop_scale * (drop_rnd > self.drop_rate[0])
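        # this is "inverted dropout": entries that survive are scaled up by
        # 1 / (1 - drop_rate), so the masked input has the same expected value
        # as the unmasked input.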

        ###############################################
        # Setup the TwoStageModel's main computation. #
        ###############################################
        print("Building TSM...")
        # samples of "first" latent state
        drop_x = drop_mask * self.x_in
        z_q_mean, z_q_logvar, self.z = \
                self.q_z_given_x.apply(drop_x, do_samples=True)
        # compute relevant KLds for this step
        self.kld_z_q2ps = gaussian_kld(z_q_mean, z_q_logvar, \
                                       self.p_z_mean, self.p_z_logvar)
        self.kld_z_p2qs = gaussian_kld(self.p_z_mean, self.p_z_logvar, \
                                       z_q_mean, z_q_logvar)
        # transform "first" latent state into "second" latent state
        self.s, _ = self.p_s_given_z.apply(self.z, do_samples=False)

        # get samples of h, conditioned on current s
        h_p_mean, h_p_logvar, h_p = self.p_h_given_s.apply( \
                self.s, do_samples=True)
        # get variational samples of h, given s and x_out
        h_q_mean, h_q_logvar, h_q = self.q_h_given_x_s.apply( \
                T.horizontal_stack(self.x_out, self.s), \
                do_samples=True)

        # make h samples that can be switched between h_p and h_q
        self.h = (self.train_switch[0] * h_q) + \
                 ((1.0 - self.train_switch[0]) * h_p)
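        # train_switch[0] == 1 selects the variational samples h_q (as set
        # during training above); 0 would select the conditional prior samples
        # h_p, e.g. when sampling from the model.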

        # compute relevant KLds for this step
        self.kld_h_q2ps = gaussian_kld(h_q_mean, h_q_logvar, \
                                       h_p_mean, h_p_logvar)
        self.kld_h_p2qs = gaussian_kld(h_p_mean, h_p_logvar, \
                                       h_q_mean, h_q_logvar)

        # p_x_given_s_h is conditioned on s and h.
        self.x_gen, _ = self.p_x_given_s_h.apply( \
                T.horizontal_stack(self.s, self.h), \
                do_samples=False)

        ######################################################################
        # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE #
        ######################################################################

        # shared var learning rate for generator and inferencer
        zero_ary = to_fX( np.zeros((1,)) )
        self.lr_1 = theano.shared(value=zero_ary, name='msm_lr_1')
        self.lr_2 = theano.shared(value=zero_ary, name='msm_lr_2')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='msm_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='msm_mom_2')
        # init parameters for controlling learning dynamics
        self.set_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='msm_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting prior kld against reconstruction
        self.lam_kld_z = theano.shared(value=zero_ary, name='msm_lam_kld_z')
        self.lam_kld_q2p = theano.shared(value=zero_ary, name='msm_lam_kld_q2p')
        self.lam_kld_p2q = theano.shared(value=zero_ary, name='msm_lam_kld_p2q')
        self.set_lam_kld(lam_kld_z=1.0, lam_kld_q2p=0.7, lam_kld_p2q=0.3)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w')
        self.set_lam_l2w(1e-5)

        # Grab all of the "optimizable" parameters in "group 1"
        self.group_1_params = []
        self.group_1_params.extend(self.q_z_given_x.mlp_params)
        self.group_1_params.extend(self.q_h_given_x_s.mlp_params)
        # Grab all of the "optimizable" parameters in "group 2"
        self.group_2_params = [self.p_z_mean, self.p_z_logvar]
        self.group_2_params.extend(self.p_s_given_z.mlp_params)
        self.group_2_params.extend(self.p_h_given_s.mlp_params)
        self.group_2_params.extend(self.p_x_given_s_h.mlp_params)

        # Make a joint list of parameters group 1/2
        self.joint_params = self.group_1_params + self.group_2_params

        #################################
        # CONSTRUCT THE KLD-BASED COSTS #
        #################################
        self.kld_z_q2p, self.kld_z_p2q, self.kld_h_q2p, self.kld_h_p2q = \
                self._construct_kld_costs(p=1.0)
        self.kld_z = (self.lam_kld_q2p[0] * self.kld_z_q2p) + \
                     (self.lam_kld_p2q[0] * self.kld_z_p2q)
        self.kld_h = (self.lam_kld_q2p[0] * self.kld_h_q2p) + \
                     (self.lam_kld_p2q[0] * self.kld_h_p2q)
        self.kld_costs = (self.lam_kld_z[0] * self.kld_z) + self.kld_h
        # now do l2 KLd costs
        self.kl2_z_q2p, self.kl2_z_p2q, self.kl2_h_q2p, self.kl2_h_p2q = \
                self._construct_kld_costs(p=2.0)
        self.kl2_z = (self.lam_kld_q2p[0] * self.kl2_z_q2p) + \
                     (self.lam_kld_p2q[0] * self.kl2_z_p2q)
        self.kl2_h = (self.lam_kld_q2p[0] * self.kl2_h_q2p) + \
                     (self.lam_kld_p2q[0] * self.kl2_h_p2q)
        self.kl2_costs = (self.lam_kld_z[0] * self.kl2_z) + self.kl2_h
        # compute joint l1/l2 KLd cost
        self.kld_l1l2_costs = (self.lam_kld_l1l2[0] * self.kld_costs) + \
                ((1.0 - self.lam_kld_l1l2[0]) * self.kl2_costs)
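        # lam_kld_l1l2 linearly mixes the p=1 KL penalty with the squared
        # (p=2) KL penalty; the default of 1.0 set above keeps only the
        # standard KL terms.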
        # compute "mean" (rather than per-input) costs
        self.kld_cost = T.mean(self.kld_costs)
        self.kl2_cost = T.mean(self.kl2_costs)
        self.kld_l1l2_cost = T.mean(self.kld_l1l2_costs)
        #################################
        # CONSTRUCT THE NLL-BASED COSTS #
        #################################
        self.nll_costs = self._construct_nll_costs(self.x_out)
        self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs)
        ########################################
        # CONSTRUCT THE REST OF THE JOINT COST #
        ########################################
        param_reg_cost = self._construct_reg_costs()
        self.reg_cost = self.lam_l2w[0] * param_reg_cost
        self.joint_cost = self.nll_cost + self.kld_l1l2_cost + self.reg_cost
        ##############################
        # CONSTRUCT A PER-INPUT COST #
        ##############################
        self.obs_costs = self.nll_costs + self.kld_l1l2_costs

        # Get the gradient of the joint cost for all optimizable parameters
        print("Computing gradients of self.joint_cost...")
        self.joint_grads = OrderedDict()
        grad_list = T.grad(self.joint_cost, self.joint_params)
        for i, p in enumerate(self.joint_params):
            self.joint_grads[p] = grad_list[i]

        # Construct the updates for the generator and inferencer networks
        self.group_1_updates = get_adam_updates(params=self.group_1_params, \
                grads=self.joint_grads, alpha=self.lr_1, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        self.group_2_updates = get_adam_updates(params=self.group_2_params, \
                grads=self.joint_grads, alpha=self.lr_2, \
                beta1=self.mom_1, beta2=self.mom_2, \
                mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0)
        self.joint_updates = OrderedDict()
        for k in self.group_1_updates:
            self.joint_updates[k] = self.group_1_updates[k]
        for k in self.group_2_updates:
            self.joint_updates[k] = self.group_2_updates[k]

        # Construct a function for jointly training the generator/inferencer
        print("Compiling training function...")
        self.train_joint = self._construct_train_joint()
        print("Compiling free-energy sampler...")
        self.compute_fe_terms = self._construct_compute_fe_terms()
        print("Compiling open-loop model sampler...")
        self.sample_from_prior = self._construct_sample_from_prior()
        print("Compiling data-guided model sampler...")
        self.sample_from_input = self._construct_sample_from_input()
        # make easy access points for some interesting parameters
        self.gen_gen_weights = self.p_x_given_s_h.mu_layers[-1].W
        return
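
# A standalone sketch (illustrative only, with a hypothetical name) of the
# closed-form KL divergence between diagonal Gaussians that a helper such as
# gaussian_kld is assumed to compute, with the first pair of arguments giving
# the "from" distribution, i.e. KL(N(mu_1, exp(logvar_1)) || N(mu_2, exp(logvar_2))).
import numpy as np
import theano
import theano.tensor as T


def gaussian_kld_sketch(mu_1, logvar_1, mu_2, logvar_2):
    # elementwise KL between two diagonal Gaussians
    return 0.5 * (logvar_2 - logvar_1
                  + (T.exp(logvar_1) + (mu_1 - mu_2)**2) / T.exp(logvar_2)
                  - 1.0)


if __name__ == '__main__':
    mu_q, lv_q, mu_p, lv_p = T.vectors('mu_q', 'lv_q', 'mu_p', 'lv_p')
    kld_fn = theano.function([mu_q, lv_q, mu_p, lv_p],
                             gaussian_kld_sketch(mu_q, lv_q, mu_p, lv_p))
    zeros = np.zeros(3, dtype=theano.config.floatX)
    print(kld_fn(zeros, zeros, zeros, zeros))  # identical Gaussians: all zeros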
Example #34
0
    def predict_symbolic(self, mx, Sx, unroll_scan=False):
        idims = self.D
        odims = self.E

        Ms = self.sr.shape[1]
        sf2M = (self.hyp[:, idims]**2)/tt.cast(Ms, floatX)
        sn2 = self.hyp[:, idims+1]**2

        # TODO this should just fallback to the method from the SSGP class
        if Sx is None:
            # first check if we received a vector [D] or a matrix [nxD]
            if mx.ndim == 1:
                mx = mx[None, :]

            srdotx = self.sr.dot(self.X.T).transpose(0,2,1)
            phi_x = tt.concatenate([tt.sin(srdotx), tt.cos(srdotx)], 2)
            M = (phi_x*self.beta_ss[:, None, :]).sum(-1)
            phi_x_L = tt.stack([
                solve_lower_triangular(self.Lmm[i], phi_x[i].T)
                for i in range(odims)])
            S = sn2[:, None]*(1 + (sf2M[:, None])*(phi_x_L**2).sum(-2)) + 1e-6

            return M, S

        # precompute some variables
        srdotx = self.sr.dot(mx)
        srdotSx = self.sr.dot(Sx)
        srdotSxdotsr = tt.sum(srdotSx*self.sr, 2)
        e = tt.exp(-0.5*srdotSxdotsr)
        cos_srdotx = tt.cos(srdotx)
        sin_srdotx = tt.sin(srdotx)
        cos_srdotx_e = cos_srdotx*e
        sin_srdotx_e = sin_srdotx*e
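        # For x ~ N(mx, Sx), the expected trigonometric features have the
        # closed forms (assumed here): E[sin(s'x)] = sin(s'mx) * exp(-0.5 s'Sx s)
        # and E[cos(s'x)] = cos(s'mx) * exp(-0.5 s'Sx s); e provides that
        # exponential damping factor for each spectral point s.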

        # compute the mean vector
        mphi = tt.horizontal_stack(sin_srdotx_e, cos_srdotx_e)  # E x 2*Ms
        M = tt.sum(mphi*self.beta_ss, 1)

        # input output covariance
        mx_c = mx.dimshuffle(0, 'x')
        sin_srdotx_e_r = sin_srdotx_e.dimshuffle(0, 'x', 1)
        cos_srdotx_e_r = cos_srdotx_e.dimshuffle(0, 'x', 1)
        srdotSx_tr = srdotSx.transpose(0, 2, 1)
        c = tt.concatenate([mx_c*sin_srdotx_e_r + srdotSx_tr*cos_srdotx_e_r,
                            mx_c*cos_srdotx_e_r - srdotSx_tr*sin_srdotx_e_r],
                           axis=2)  # E x D x 2*Ms
        beta_ss_r = self.beta_ss.dimshuffle(0, 'x', 1)

        # input output covariance (notice this is not premultiplied by the
        # input covariance inverse)
        V = tt.sum(c*beta_ss_r, 2).T - tt.outer(mx, M)

        srdotSxdotsr_c = srdotSxdotsr.dimshuffle(0, 1, 'x')
        srdotSxdotsr_r = srdotSxdotsr.dimshuffle(0, 'x', 1)
        M2 = tt.zeros((odims, odims))

        # initialize indices
        triu_indices = np.triu_indices(odims)
        indices = [tt.as_index_variable(idx) for idx in triu_indices]

        def second_moments(i, j, M2, beta, iA, sn2, sf2M, sr, srdotSx,
                           srdotSxdotsr_c, srdotSxdotsr_r,
                           sin_srdotx, cos_srdotx, *args):
            # compute the second moments of the spectrum feature vectors
            siSxsj = srdotSx[i].dot(sr[j].T)  # Ms x Ms
            sijSxsij = -0.5*(srdotSxdotsr_c[i] + srdotSxdotsr_r[j])
            em = tt.exp(sijSxsij+siSxsj)      # MsxMs
            ep = tt.exp(sijSxsij-siSxsj)     # MsxMs
            si = sin_srdotx[i]       # Msx1
            ci = cos_srdotx[i]       # Msx1
            sj = sin_srdotx[j]       # Msx1
            cj = cos_srdotx[j]       # Msx1
            sicj = tt.outer(si, cj)  # MsxMs
            cisj = tt.outer(ci, sj)  # MsxMs
            sisj = tt.outer(si, sj)  # MsxMs
            cicj = tt.outer(ci, cj)  # MsxMs
            sm = (sicj-cisj)*em
            sp = (sicj+cisj)*ep
            cm = (sisj+cicj)*em
            cp = (cicj-sisj)*ep

            # Populate the second moment matrix of the feature vector
            Q_up = tt.concatenate([cm-cp, sm+sp], axis=1)
            Q_lo = tt.concatenate([sp-sm, cm+cp], axis=1)
            Q = tt.concatenate([Q_up, Q_lo], axis=0)

            # Compute the second moment of the output
            m2 = 0.5*matrix_dot(beta[i], Q, beta[j].T)

            m2 = theano.ifelse.ifelse(
                tt.eq(i, j),
                m2 + sn2[i]*(1.0 + sf2M[i]*tt.sum(self.iA[i]*Q)) + 1e-6,
                m2)
            M2 = tt.set_subtensor(M2[i, j], m2)
            return M2

        nseq = [self.beta_ss, self.iA, sn2, sf2M, self.sr, srdotSx,
                srdotSxdotsr_c, srdotSxdotsr_r, sin_srdotx, cos_srdotx,
                self.Lmm]

        if unroll_scan:
            from lasagne.utils import unroll_scan
            [M2_] = unroll_scan(second_moments, indices,
                                [M2], nseq, len(triu_indices[0]))
            updts = {}
        else:
            M2_, updts = theano.scan(fn=second_moments,
                                     sequences=indices,
                                     outputs_info=[M2],
                                     non_sequences=nseq,
                                     allow_gc=False,
                                     name="%s>M2_scan" % (self.name))

        M2 = M2_[-1]
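        # the scan above only fills the upper triangle of the second-moment
        # matrix, so mirror it across the diagonal; the predictive covariance
        # is then the second moment minus the outer product of the means.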
        M2 = M2 + tt.triu(M2, k=1).T
        S = M2 - tt.outer(M, M)

        return M, S, V
Example #35
0
def test_TransMatConjugateStep_subtensors():

    # Confirm that Dirichlet/non-Dirichlet mixed rows can be
    # parsed
    with pm.Model():
        d_0_rv = pm.Dirichlet("p_0", np.r_[1, 1])
        d_1_rv = pm.Dirichlet("p_1", np.r_[1, 1])

        p_0_rv = tt.as_tensor([0, 0, 1])
        p_1_rv = tt.zeros(3)
        p_1_rv = tt.set_subtensor(p_1_rv[[0, 2]], d_0_rv)
        p_2_rv = tt.zeros(3)
        p_2_rv = tt.set_subtensor(p_2_rv[[1, 2]], d_1_rv)

        P_tt = tt.stack([p_0_rv, p_1_rv, p_2_rv])
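        # the stacked matrix has a fixed first row [0, 0, 1], a second row
        # whose free (Dirichlet) entries sit at columns [0, 2], and a third
        # row with Dirichlet entries at columns [1, 2]; the assertions below
        # check that TransMatConjugateStep recovers this row/column layout.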
        P_rv = pm.Deterministic("P_tt", tt.shape_padleft(P_tt))
        DiscreteMarkovChain("S_t", P_rv, np.r_[1, 0, 0], shape=(10, ))

        transmat = TransMatConjugateStep(P_rv)

    assert transmat.row_remaps == {0: 1, 1: 2}
    exp_slices = {0: np.r_[0, 2], 1: np.r_[1, 2]}
    assert exp_slices.keys() == transmat.row_slices.keys()
    assert all(
        np.array_equal(transmat.row_slices[i], exp_slices[i])
        for i in exp_slices.keys())

    # Same thing, just with some manipulations of the transition matrix
    with pm.Model():
        d_0_rv = pm.Dirichlet("p_0", np.r_[1, 1])
        d_1_rv = pm.Dirichlet("p_1", np.r_[1, 1])

        p_0_rv = tt.as_tensor([0, 0, 1])
        p_1_rv = tt.zeros(3)
        p_1_rv = tt.set_subtensor(p_1_rv[[0, 2]], d_0_rv)
        p_2_rv = tt.zeros(3)
        p_2_rv = tt.set_subtensor(p_2_rv[[1, 2]], d_1_rv)

        P_tt = tt.horizontal_stack(p_0_rv[..., None], p_1_rv[..., None],
                                   p_2_rv[..., None])
        P_rv = pm.Deterministic("P_tt", tt.shape_padleft(P_tt.T))
        DiscreteMarkovChain("S_t", P_rv, np.r_[1, 0, 0], shape=(10, ))

        transmat = TransMatConjugateStep(P_rv)

    assert transmat.row_remaps == {0: 1, 1: 2}
    exp_slices = {0: np.r_[0, 2], 1: np.r_[1, 2]}
    assert exp_slices.keys() == transmat.row_slices.keys()
    assert all(
        np.array_equal(transmat.row_slices[i], exp_slices[i])
        for i in exp_slices.keys())

    # Use an observed `DiscreteMarkovChain` and check the conjugate results
    with pm.Model():
        d_0_rv = pm.Dirichlet("p_0", np.r_[1, 1])
        d_1_rv = pm.Dirichlet("p_1", np.r_[1, 1])

        p_0_rv = tt.as_tensor([0, 0, 1])
        p_1_rv = tt.zeros(3)
        p_1_rv = tt.set_subtensor(p_1_rv[[0, 2]], d_0_rv)
        p_2_rv = tt.zeros(3)
        p_2_rv = tt.set_subtensor(p_2_rv[[1, 2]], d_1_rv)

        P_tt = tt.horizontal_stack(p_0_rv[..., None], p_1_rv[..., None],
                                   p_2_rv[..., None])
        P_rv = pm.Deterministic("P_tt", tt.shape_padleft(P_tt.T))
        DiscreteMarkovChain("S_t",
                            P_rv,
                            np.r_[1, 0, 0],
                            shape=(4, ),
                            observed=np.r_[0, 1, 0, 2])

        transmat = TransMatConjugateStep(P_rv)
def learnAndPredict(Ti, C, TOList):

    rng = np.random.RandomState(SEED)
    learning_rate = learning_rate0
    print np.mean(Ti[1000, :])
    aminW = np.amin(Ti[:1000, :])
    amaxW = np.amax(Ti[:1000, :])
    Ti[:1000, :] = (Ti[:1000, :] - aminW) / (amaxW - aminW)
    astdW = np.std(Ti[:1000, :])
    ameanW = np.mean(Ti[:1000, :])
    Ti[:1000, :] = (Ti[:1000, :] - ameanW) / astdW
    aminacW = np.amin(Ti[1000, :])
    amaxacW = np.amax(Ti[1000, :])
    print aminW, amaxW, aminacW, amaxacW
    Ti[1000, :] = (Ti[1000, :] - aminacW) / (amaxacW - aminacW)
    astdacW = np.std(Ti[1000, :])
    ameanacW = np.mean(Ti[1000, :])
    Ti[1000, :] = (Ti[1000, :] - ameanacW) / astdacW

    ile__ = len(TOList)
    ileList = np.zeros(ile__)
    for titer in range(len(TOList)):
        print np.mean(TOList[titer][1000, :])
        TOList[titer][:1000, :] = (TOList[titer][:1000, :] - aminW) / (amaxW -
                                                                       aminW)
        TOList[titer][:1000, :] = (TOList[titer][:1000, :] - ameanW) / astdW
        TOList[titer][1000, :] = (TOList[titer][1000, :] -
                                  aminacW) / (amaxacW - aminacW)
        TOList[titer][1000, :] = (TOList[titer][1000, :] - ameanacW) / astdacW
        _, ileList[titer] = TOList[titer].shape

    _, ile = Ti.shape
    N = NN

    data = []
    yyy = []
    need = 1
    BYL = {}
    j = 0
    dwa = 0
    ONES = []
    ZEROS = []
    for i in range(NN):
        for j in range(NN):
            if i != j:
                if C[i][j] == 1:
                    ONES.append((i, j))
                else:
                    ZEROS.append((i, j))
    Nones = len(ONES)
    rng.shuffle(ONES)
    Nzeros = len(ZEROS)
    print Nones
    print Nzeros
    Needed = NUM_TRAIN / 2
    onesPerPair = Needed / Nones + 1
    onesIter = 0
    jj = 0
    while jj < NUM_TRAIN:
        if jj % 300000 == 0:
            print jj / 300000,
        need = 1 - need
        if need == 1:
            pairNo = onesIter % Nones
            ppp = onesIter / Nones
            s, t = ONES[pairNo]
            shift = rng.randint(0, ile - L)
            onesIter += 1
        if need == 0:
            zer = rng.randint(Nzeros)
            s, t = ZEROS[zer]
            del ZEROS[zer]
            Nzeros -= 1
            shift = rng.randint(0, ile - L)
        x = np.hstack((Ti[s][shift:shift + L], Ti[t][shift:shift + L],
                       Ti[1000][shift:shift + L]))
        y = C[s][t]
        data.append(x)
        yyy.append(y)
        jj += 1

    data = np.array(data, dtype=theano.config.floatX)
    is_train = np.array(([0] * 96 + [1, 1, 2, 2]) * (NUM_TRAIN / 100))
    yyy = np.array(yyy)

    train_set_x0, train_set_y0 = np.array(
        data[is_train == 0]), yyy[is_train == 0]
    test_set_x, test_set_y = np.array(data[is_train == 1]), yyy[is_train == 1]
    valid_set_x, valid_set_y = np.array(
        data[is_train == 2]), yyy[is_train == 2]
    n_train_batches = len(train_set_y0) / batch_size
    n_valid_batches = len(valid_set_y) / batch_size
    n_test_batches = len(test_set_y) / batch_size
    epoch = T.scalar()
    index = T.lscalar()
    x = T.matrix('x')
    inone2 = T.matrix('inone2')
    y = T.ivector('y')
    print '... building the model'
    #-------- my layers -------------------

    #---------------------
    layer0_input = x.reshape((batch_size, 1, 3, L))
    Cx = 5
    layer0 = ConvolutionalLayer(rng,
                                input=layer0_input,
                                image_shape=(batch_size, 1, 3, L),
                                filter_shape=(nkerns[0], 1, 2, Cx),
                                poolsize=(1, 1),
                                fac=0)
    ONE = (3 - 2 + 1) / 1
    L2 = (L - Cx + 1) / 1
    #---------------------
    Cx2 = 5
    layer1 = ConvolutionalLayer(rng,
                                input=layer0.output,
                                image_shape=(batch_size, nkerns[0], ONE, L2),
                                filter_shape=(nkerns[1], nkerns[0], 2, Cx2),
                                poolsize=(1, 1),
                                activation=ReLU,
                                fac=0)
    ONE = (ONE - 2 + 1) / 1
    L3 = (L2 - Cx2 + 1) / 1
    #---------------------
    Cx3 = 1
    layer1b = ConvolutionalLayer(rng,
                                 input=layer1.output,
                                 image_shape=(batch_size, nkerns[1], ONE, L3),
                                 filter_shape=(nkerns[2], nkerns[1], 1, Cx3),
                                 poolsize=(1, POOL),
                                 activation=ReLU,
                                 fac=0)
    ONE = (ONE - 1 + 1) / 1
    L4 = (L3 - Cx3 + 1) / POOL

    REGx = 100
    #---------------------
    layer2_input = layer1b.output.flatten(2)
    print layer2_input.shape
    use_b = False
    layer2 = HiddenLayer(rng,
                         input=layer2_input,
                         n_in=nkerns[2] * L4,
                         n_out=REGx,
                         activation=T.tanh,
                         use_bias=use_b)
    layer3 = LogisticRegression(input=layer2.output, n_in=REGx, n_out=2)

    cost = layer3.negative_log_likelihood(y)
    out_x2 = theano.shared(
        np.asarray(np.zeros((N, L)), dtype=theano.config.floatX))
    inone2 = theano.shared(
        np.asarray(np.zeros((1, L)), dtype=theano.config.floatX))
    inone3 = theano.shared(
        np.asarray(np.zeros((1, L)), dtype=theano.config.floatX))
    inone4 = theano.shared(
        np.asarray(np.zeros((1, L)), dtype=theano.config.floatX))
    test_set_x = theano.shared(
        np.asarray(test_set_x, dtype=theano.config.floatX))
    train_set_x = theano.shared(
        np.asarray(train_set_x0, dtype=theano.config.floatX))
    train_set_y = T.cast(
        theano.shared(np.asarray(train_set_y0, dtype=theano.config.floatX)),
        'int32')
    test_set_y = T.cast(
        theano.shared(np.asarray(test_set_y, dtype=theano.config.floatX)),
        'int32')
    valid_set_y = T.cast(
        theano.shared(np.asarray(valid_set_y, dtype=theano.config.floatX)),
        'int32')
    valid_set_x = theano.shared(
        np.asarray(valid_set_x, dtype=theano.config.floatX))

    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    mom_start = 0.5
    mom_end = 0.98
    mom_epoch_interval = n_epochs * 1.0
    #### @@@@@@@@@@@
    class_params0 = [layer3, layer2, layer1, layer1b, layer0]
    class_params = [param for layer in class_params0 for param in layer.params]

    gparams = []
    for param in class_params:
        gparam = T.grad(cost, param)
        gparams.append(gparam)
    gparams_mom = []
    for param in class_params:
        gparam_mom = theano.shared(
            np.zeros(param.get_value(borrow=True).shape,
                     dtype=theano.config.floatX))
        gparams_mom.append(gparam_mom)
    mom = ifelse(
        epoch < mom_epoch_interval,
        mom_start * (1.0 - epoch / mom_epoch_interval) + mom_end *
        (epoch / mom_epoch_interval), mom_end)
    updates = OrderedDict()
    for gparam_mom, gparam in zip(gparams_mom, gparams):
        updates[gparam_mom] = mom * gparam_mom - (1. -
                                                  mom) * learning_rate * gparam
    for param, gparam_mom in zip(class_params, gparams_mom):
        stepped_param = param + updates[gparam_mom]
        squared_filter_length_limit = 15.0
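        # max-norm constraint: columns of 2-D weight matrices are rescaled so
        # their L2 norm never exceeds sqrt(squared_filter_length_limit).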
        if param.get_value(borrow=True).ndim == 2:
            col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
            desired_norms = T.clip(col_norms, 0,
                                   T.sqrt(squared_filter_length_limit))
            scale = desired_norms / (1e-7 + col_norms)
            updates[param] = stepped_param * scale
        else:
            updates[param] = stepped_param

    output = cost
    train_model = theano.function(
        inputs=[epoch, index],
        outputs=output,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    keep = theano.function(
        [index],
        layer3.errorsFull(y),
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        },
        on_unused_input='warn')

    timer = time.clock()
    print "finished reading", (timer - start_time0) / 60., "minutes "

    # TRAIN MODEL #
    print '... training'
    validation_frequency = n_train_batches
    best_params = None
    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.
    epochc = 0

    while (epochc < n_epochs):
        epochc = epochc + 1
        learning_rate = learning_rate0 * (1.2 - ((1.0 * epochc) / n_epochs))
        for minibatch_index in xrange(n_train_batches):
            iter = (epochc - 1) * n_train_batches + minibatch_index
            cost_ij = train_model(epochc, minibatch_index)
            if (iter + 1) % validation_frequency == 0:
                validation_losses = [
                    validate_model(i) for i in xrange(n_valid_batches)
                ]
                this_validation_loss = np.mean(validation_losses)
                print(' %i) err %.2f ' % (epochc, this_validation_loss / 10)
                      ), L, nkerns, REGx, "|", Cx, Cx2, Cx3, batch_size
                if this_validation_loss < best_validation_loss or epochc % 30 == 0:
                    best_validation_loss = this_validation_loss
                    best_iter = iter
                    test_losses = [
                        test_model(i) for i in xrange(n_test_batches)
                    ]
                    test_score = np.mean(test_losses)
                    print(
                        ('     epoch %i, minibatch %i/%i, test error of best '
                         'model %f %%') % (epochc, minibatch_index + 1,
                                           n_train_batches, test_score / 10))
    ############
    timel = time.clock()
    print "finished learning", (timel - timer) / 60., "minutes "
    ppm = theano.function(
        [index],
        layer3.pred_proba_mine(),
        givens={
            x:
            T.horizontal_stack(
                T.tile(inone2, (batch_size, 1)),
                out_x2[index * batch_size:(index + 1) * batch_size],
                T.tile(inone3, (batch_size, 1))),
            y:
            train_set_y[0 * (batch_size):(0 + 1) * (batch_size)]
        },
        on_unused_input='warn')
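    # ppm scores candidate pairs for one fixed series at a time: the classifier
    # input is built by horizontally stacking a tiled copy of series i
    # (inone2), a batch of candidate partner series (out_x2), and a tiled copy
    # of the shared reference series (inone3).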

    NONZERO = (N * N - N)
    gc.collect()
    RESList = [np.zeros((N, N)) for it in range(ile__)]
    for __net in range(ile__):
        TO = TOList[__net]
        ileO = ileList[__net]
        RES = RESList[__net]
        shift = 0.1
        DELTAshift = (ileO - L) / (Q - 1)
        print "DELTAshift:", DELTAshift
        for q in range(Q):
            dataO = []
            print(q + 1), "/", Q, "  ",
            out_x2.set_value(
                np.asarray(np.array(TO[:, shift:shift + L]),
                           dtype=theano.config.floatX))
            PARTIAL = np.zeros((N, N))
            inone3.set_value(
                np.asarray(np.array(TO[1000][shift:shift + L]).reshape(1, L),
                           dtype=theano.config.floatX))
            for i in range(N):
                inone2.set_value(
                    np.asarray(np.array(TO[i][shift:shift + L]).reshape(1, L),
                               dtype=theano.config.floatX))
                p = [ppm(ii) for ii in xrange(N / batch_size)]
                for pos in range(N):
                    if pos != i:
                        PARTIAL[i][pos] += p[pos / batch_size][pos %
                                                               batch_size][1]
            for i in range(N):
                for j in range(N):
                    RES[i][j] += PARTIAL[i][j]
            shift += DELTAshift
        print "Finished", __net
        RESList[__net] = RES / np.max(RES)
        gc.collect()

    end_time = time.clock()
    print "finished predicting", (end_time - timel) / 60., "minutes ", str(
        nkerns), "using SEED = ", SEED
    print('The code for file ' + os.path.split(__file__)[1] +
          ' ran for %.2fm' % ((end_time - start_time0) / 60.))
    return RESList
        # construct a fully-connected sigmoidal layer
        layer2 = HiddenLayer(rng, input=layer2_input,
                            n_in=50 * ((l1ims-4)/2)**2, n_out=500,
                            activation=T.tanh)

        # classify the values of the fully-connected sigmoidal layer
        layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=2)

    else:
        # Output (14,14) -> (5, 5)

        # the HiddenLayer being fully-connected, it operates on 2D matrices of
        # shape (batch_size,num_pixels) (i.e matrix of rasterized images).
        # This will generate a matrix of shape (20, 32 * 4 * 4) = (20, 512)
        layer2_input = T.horizontal_stack(layer1.output.flatten(2), x_extra)

        # construct a fully-connected sigmoidal layer
        layer2 = HiddenLayer(rng, input=layer2_input,
                            n_in=50 * ((l1ims-4)/2)**2 + ExtraColumns, n_out=500,
                            activation=T.tanh)

        # classify the values of the fully-connected sigmoidal layer
        layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=2)

    model = [layer0, layer1, layer2, layer3]

    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    cost = layer3.negative_log_likelihood(y)
Example #38
0
    def get_output_for(self, inputs, deterministic=False, **kwargs):

        # extract inputs
        H1, H2 = inputs

        # train set size
        m = H1.shape[0].astype(theano.config.floatX)

        # running average projection matrix update
        if not deterministic:

            # compute batch mean
            mean1 = T.mean(H1, axis=0)
            mean2 = T.mean(H2, axis=0)

            # running average updates of means
            mean1 = (floatX(1.0 - self.alpha) * self.mean1 +
                     self.alpha * mean1)
            running_mean1 = theano.clone(self.mean1, share_inputs=False)
            running_mean1.default_update = mean1
            mean1 += 0 * running_mean1

            mean2 = (floatX(1.0 - self.alpha) * self.mean2 +
                     self.alpha * mean2)
            running_mean2 = theano.clone(self.mean2, share_inputs=False)
            running_mean2.default_update = mean2
            mean2 += 0 * running_mean2

            # hidden representations
            H1bar = H1 - mean1
            H2bar = H2 - mean2

            # transpose to formulas in paper
            H1bar = H1bar.T
            H2bar = H2bar.T

            # cross-covariance
            S12 = (1.0 / (m - 1)) * T.dot(H1bar, H2bar.T)

            # covariance 1
            S11 = (1.0 / (m - 1)) * T.dot(H1bar, H1bar.T)
            S11 = S11 + self.r1 * T.identity_like(S11)

            # covariance 2
            S22 = (1.0 / (m - 1)) * T.dot(H2bar, H2bar.T)
            S22 = S22 + self.r2 * T.identity_like(S22)

            # running average updates of statistics
            S12 = (floatX(1.0 - self.alpha) * self.S12 + self.alpha * S12)
            running_S12 = theano.clone(self.S12, share_inputs=False)
            running_S12.default_update = S12
            S12 += 0 * running_S12

            S11 = (floatX(1.0 - self.alpha) * self.S11 + self.alpha * S11)
            running_S11 = theano.clone(self.S11, share_inputs=False)
            running_S11.default_update = S11
            S11 += 0 * running_S11

            S22 = (floatX(1.0 - self.alpha) * self.S22 + self.alpha * S22)
            running_S22 = theano.clone(self.S22, share_inputs=False)
            running_S22.default_update = S22
            S22 += 0 * running_S22

            # theano-compatible formulation of paper
            d, A = T.nlinalg.eigh(S11)
            S11si = (A * np.reciprocal(np.sqrt(d))).dot(A.T)  # = S11^-.5
            d, A = T.nlinalg.eigh(S22)
            S22si = (A * np.reciprocal(np.sqrt(d))).dot(A.T)  # = S22^-.5

            # compute TT' and T'T (regularized)
            Tnp = S11si.dot(S12).dot(S22si)
            M1 = Tnp.dot(Tnp.T)
            M2 = Tnp.T.dot(Tnp)
            M1 += self.rT * T.identity_like(M1)
            M2 += self.rT * T.identity_like(M2)

            # compute eigen decomposition
            E1, E = T.nlinalg.eigh(M1)
            _, F = T.nlinalg.eigh(M2)

            # maximize correlation
            E1 = T.clip(E1, 1e-7, 1.0)
            E1 = T.sqrt(E1)
            self.loss = -T.mean(E1) * self.wl
            self.corr = E1
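            # E1 holds the clipped canonical correlations (square roots of the
            # eigenvalues of T T'); the loss above is their negative mean,
            # scaled by wl, so minimizing it maximizes total correlation
            # (the deep CCA objective).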

            # compute projection matrices
            U = S11si.dot(E)
            V = S22si.dot(F)

            # flip signs of projections to match
            # (needed because we do two decompositions as opposed to an SVD)
            s = T.sgn(U.T.dot(S12).dot(V).diagonal())
            U *= s

            # update of projection matrices
            running_U = theano.clone(self.U, share_inputs=False)
            running_U.default_update = U
            U += floatX(0) * running_U

            running_V = theano.clone(self.V, share_inputs=False)
            running_V.default_update = V
            V += floatX(0) * running_V

        # use projections of layer
        else:

            # hidden representations
            H1bar = H1 - self.mean1
            H2bar = H2 - self.mean2

            # transpose to formulas in paper
            H1bar = H1bar.T
            H2bar = H2bar.T

            U, V = self.U, self.V

        # re-project data
        lv1_cca = H1bar.T.dot(U)
        lv2_cca_fixed = H2bar.T.dot(V)

        output = T.horizontal_stack(lv1_cca, lv2_cca_fixed)

        return output
Example #39
0
def hstack(tensors):
    return T.horizontal_stack(*tensors)
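# A minimal usage sketch for the wrapper above (all names below are
# illustrative): T.horizontal_stack concatenates its 2-D arguments along
# axis 1, so hstack behaves like np.hstack for matrices.
import numpy as np
import theano
import theano.tensor as T

a = T.matrix('a')
b = T.matrix('b')
hstack_fn = theano.function([a, b], hstack([a, b]))
left = np.ones((2, 3), dtype=theano.config.floatX)
right = np.zeros((2, 2), dtype=theano.config.floatX)
assert hstack_fn(left, right).shape == (2, 5)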
Example #40
0
    def __init__(self, rng=None, \
            Xd=None, Yd=None, Xc=None, Xm=None, \
            g_net=None, i_net=None, p_net=None, \
            data_dim=None, prior_dim=None, label_dim=None, \
            batch_size=None, \
            params=None, shared_param_dicts=None):
        # TODO: refactor for use with "encoded" inferencer/generator
        assert(not (i_net.use_encoder or g_net.use_encoder))

        # setup a rng for this GITrip
        self.rng = RandStream(rng.randint(100000))
        # setup the prior distribution over the categorical variable
        if params is None:
            self.params = {}
        else:
            self.params = params

        # record the dimensionality of the data handled by this GITrip
        self.data_dim = data_dim
        self.label_dim = label_dim
        self.prior_dim = prior_dim
        self.batch_size = batch_size

        # create a mask for disabling and/or reweighting input dimensions
        row_mask = np.ones((self.data_dim,)).astype(theano.config.floatX)
        self.input_mask = theano.shared(value=row_mask, name='git_input_mask')
        
        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this GITrip
        self.Xd = self.input_mask * Xd
        self.Yd = Yd
        self.Xc = Xc
        self.Xm = Xm
        
        # construct a vertically-repeated identity matrix for marginalizing
        # over possible values of the categorical latent variable.
        Ic = np.vstack([np.identity(label_dim) for i in range(batch_size)])
        self.Ic = theano.shared(value=Ic.astype(theano.config.floatX), name='git_Ic')
        # create "shared-parameter" clones of the continuous and categorical
        # inferencers that this GITrip will be built on.
        self.IN = i_net.shared_param_clone(rng=rng, \
                Xd=self.Xd, Xc=self.Xc, Xm=self.Xm)
        self.PN = p_net.shared_param_clone(rng=rng, Xd=self.Xd)
        # create symbolic variables for the approximate posteriors over the 
        # continuous and categorical latent variables
        self.Xp = self.IN.output
        self.Yp = safe_softmax(self.PN.output_spawn[0])
        self.Yp_proto = safe_softmax(self.PN.output_proto)
        # create a symbolic variable structured to allow easy "marginalization"
        # over possible settings of the categorical latent variable. the left
        # matrix (i.e. self.Ic) comprises batch_size copies of the label_dim
        # dimensional identity matrix stacked on top of each other, and the
        # right matrix comprises a single sample from the approximate posterior
        # over the continuous latent variables for each of batch_size examples
        # with each sample repeated label_dim times.
        self.XYp = T.horizontal_stack(self.Ic, T.repeat(self.Xp, \
                self.label_dim, axis=0))
        # pipe the "convenient marginalization" matrix into a shared parameter
        # clone of the generator network
        self.GN = g_net.shared_param_clone(rng=rng, Xp=self.XYp)
        # capture a handle for sampled reconstructions from the generator
        self.Xg = self.GN.output

        # we will be assuming one proto-net in the pseudo-ensemble represented
        # by self.PN, and either one or two spawn-nets for that proto-net.
        assert(len(self.PN.proto_nets) == 1)
        assert((len(self.PN.spawn_nets) == 1) or \
                (len(self.PN.spawn_nets) == 2))
        # output of the generator and input to the inferencer should both be
        # equal to self.data_dim
        assert(self.data_dim == self.GN.mlp_layers[-1].out_dim)
        assert(self.data_dim == self.IN.shared_layers[0].in_dim)
        assert(self.data_dim == self.PN.proto_nets[0][0].in_dim)
        # mu/sigma outputs of self.IN should be equal to prior_dim, output of
        # self.PN should be equal to label_dim, and input of self.GN should be
        # equal to prior_dim + label_dim
        assert(self.prior_dim == self.IN.mu_layers[-1].out_dim)
        assert(self.prior_dim == self.IN.sigma_layers[-1].out_dim)
        assert(self.label_dim == self.PN.proto_nets[0][-1].out_dim)
        assert((self.prior_dim + self.label_dim) == self.GN.mlp_layers[0].in_dim)

        # determine whether this GITrip is a clone or an original
        if shared_param_dicts is None:
            # This is not a clone, and we will need to make a dict for
            # referring to some important shared parameters.
            self.shared_param_dicts = {}
            self.is_clone = False
        else:
            # This is a clone, and its layer parameters can be found by
            # referring to the given param dict (i.e. shared_param_dicts).
            self.shared_param_dicts = shared_param_dicts
            self.is_clone = True

        if not self.is_clone:
            # shared var learning rate for generator and inferencer
            zero_ary = np.zeros((1,)).astype(theano.config.floatX)
            self.lr_gn = theano.shared(value=zero_ary, name='git_lr_gn')
            self.lr_in = theano.shared(value=zero_ary, name='git_lr_in')
            self.lr_pn = theano.shared(value=zero_ary, name='git_lr_pn')
            # shared var momentum parameters for generator and inferencer
            self.mo_gn = theano.shared(value=zero_ary, name='git_mo_gn')
            self.mo_in = theano.shared(value=zero_ary, name='git_mo_in')
            self.mo_pn = theano.shared(value=zero_ary, name='git_mo_pn')
            # init parameters for controlling learning dynamics
            self.set_all_sgd_params()
            # init shared var for weighting nll of data given posterior sample
            self.lam_nll = theano.shared(value=zero_ary, name='git_lam_nll')
            self.set_lam_nll(lam_nll=1.0)
            # init shared var for weighting posterior KL-div from prior
            self.lam_kld = theano.shared(value=zero_ary, name='git_lam_kld')
            self.set_lam_kld(lam_kld=1.0)
            # init shared var for weighting semi-supervised classification
            self.lam_cat = theano.shared(value=zero_ary, name='git_lam_cat')
            self.set_lam_cat(lam_cat=0.0)
            # init shared var for weighting ensemble agreement regularization
            self.lam_pea = theano.shared(value=zero_ary, name='git_lam_pea')
            self.set_lam_pea(lam_pea=0.0)
            # init shared var for weighting entropy regularization on the
            # inferred posteriors over the categorical variable of interest
            self.lam_ent = theano.shared(value=zero_ary, name='git_lam_ent')
            self.set_lam_ent(lam_ent=0.0)
            # init shared var for weighting dirichlet regularization on the
            # inferred posteriors over the categorical variable of interest
            self.lam_dir = theano.shared(value=zero_ary, name='git_lam_dir')
            self.set_lam_dir(lam_dir=0.0)
            # init shared var for controlling l2 regularization on params
            self.lam_l2w = theano.shared(value=zero_ary, name='git_lam_l2w')
            self.set_lam_l2w(lam_l2w=1e-3)
            # record shared parameters that are to be shared among clones
            self.shared_param_dicts['git_lr_gn'] = self.lr_gn
            self.shared_param_dicts['git_lr_in'] = self.lr_in
            self.shared_param_dicts['git_lr_pn'] = self.lr_pn
            self.shared_param_dicts['git_mo_gn'] = self.mo_gn
            self.shared_param_dicts['git_mo_in'] = self.mo_in
            self.shared_param_dicts['git_mo_pn'] = self.mo_pn
            self.shared_param_dicts['git_lam_nll'] = self.lam_nll
            self.shared_param_dicts['git_lam_kld'] = self.lam_kld
            self.shared_param_dicts['git_lam_cat'] = self.lam_cat
            self.shared_param_dicts['git_lam_pea'] = self.lam_pea
            self.shared_param_dicts['git_lam_ent'] = self.lam_ent
            self.shared_param_dicts['git_lam_dir'] = self.lam_dir
            self.shared_param_dicts['git_lam_l2w'] = self.lam_l2w
            self.shared_param_dicts['git_input_mask'] = self.input_mask
        else:
            # use some shared parameters that are shared among all clones of
            # some "base" GITrip
            self.lr_gn = self.shared_param_dicts['git_lr_gn']
            self.lr_in = self.shared_param_dicts['git_lr_in']
            self.lr_pn = self.shared_param_dicts['git_lr_pn']
            self.mo_gn = self.shared_param_dicts['git_mo_gn']
            self.mo_in = self.shared_param_dicts['git_mo_in']
            self.mo_pn = self.shared_param_dicts['git_mo_pn']
            self.lam_nll = self.shared_param_dicts['git_lam_nll']
            self.lam_kld = self.shared_param_dicts['git_lam_kld']
            self.lam_cat = self.shared_param_dicts['git_lam_cat']
            self.lam_pea = self.shared_param_dicts['git_lam_pea']
            self.lam_ent = self.shared_param_dicts['git_lam_ent']
            self.lam_dir = self.shared_param_dicts['git_lam_dir']
            self.lam_l2w = self.shared_param_dicts['git_lam_l2w']
            self.input_mask = self.shared_param_dicts['git_input_mask']

        # Grab the full set of "optimizable" parameters from the generator
        # and inferencer networks that we'll be working with.
        self.gn_params = [p for p in self.GN.mlp_params]
        self.in_params = [p for p in self.IN.mlp_params]
        self.pn_params = [p for p in self.PN.proto_params]

        ###################################
        # CONSTRUCT THE COSTS TO OPTIMIZE #
        ###################################
        self.data_nll_cost = self.lam_nll[0] * self._construct_data_nll_cost()
        self.post_kld_cost = self.lam_kld[0] * self._construct_post_kld_cost()
        self.post_cat_cost = self.lam_cat[0] * self._construct_post_cat_cost()
        self.post_pea_cost = self.lam_pea[0] * self._construct_post_pea_cost()
        self.post_ent_cost = self.lam_ent[0] * self._construct_post_ent_cost()
        self.post_dir_cost = self.lam_dir[0] * self._construct_post_dir_cost()
        self.other_reg_costs = self._construct_other_reg_cost()
        self.other_reg_cost = self.other_reg_costs[0]
        self.joint_cost = self.data_nll_cost + self.post_kld_cost + self.post_cat_cost + \
                self.post_pea_cost + self.post_ent_cost + self.post_dir_cost + \
                self.other_reg_cost

        # Initialize momentums for mini-batch SGD updates. All parameters need
        # to be safely nestled in their lists by now.
        self.joint_moms = OrderedDict()
        self.gn_moms = OrderedDict()
        self.in_moms = OrderedDict()
        self.pn_moms = OrderedDict()
        for p in self.gn_params:
            p_mo = np.zeros(p.get_value(borrow=True).shape) + 5.0
            self.gn_moms[p] = theano.shared(value=p_mo.astype(theano.config.floatX))
            self.joint_moms[p] = self.gn_moms[p]
        for p in self.in_params:
            p_mo = np.zeros(p.get_value(borrow=True).shape) + 5.0
            self.in_moms[p] = theano.shared(value=p_mo.astype(theano.config.floatX))
            self.joint_moms[p] = self.in_moms[p]
        for p in self.pn_params:
            p_mo = np.zeros(p.get_value(borrow=True).shape) + 5.0
            self.pn_moms[p] = theano.shared(value=p_mo.astype(theano.config.floatX))
            self.joint_moms[p] = self.pn_moms[p]

        # Now, we need to construct updates for inferencers and the generator
        self.joint_updates = OrderedDict()
        self.gn_updates = OrderedDict()
        self.in_updates = OrderedDict()
        self.pn_updates = OrderedDict()
        self.grad_sq_sums = []
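        # Note: the per-parameter updates below are RMSProp-style; each
        # "momentum" shared variable tracks an exponential moving average of
        # the squared (clipped) gradient, and the parameter step divides the
        # gradient by the square root of that average.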
        #######################################
        # Construct updates for the generator #
        #######################################
        for var in self.gn_params:
            # these updates are for trainable params in the generator net...
            # first, get gradient of cost w.r.t. var
            var_grad = T.grad(self.joint_cost, var, \
                    consider_constant=[self.GN.dist_mean, self.GN.dist_cov]).clip(-1.0,1.0)
            #var_grad = ifelse(T.any(T.isnan(nan_grad)), T.zeros_like(nan_grad), nan_grad)
            #self.grad_sq_sums.append(T.sum(var_grad**2.0))
            # get the momentum for this var
            var_mom = self.gn_moms[var]
            # update the momentum for this var using its grad
            self.gn_updates[var_mom] = (self.mo_gn[0] * var_mom) + \
                    ((1.0 - self.mo_gn[0]) * (var_grad**2.0))
            self.joint_updates[var_mom] = self.gn_updates[var_mom]
            # make basic update to the var
            var_new = var - (self.lr_gn[0] * (var_grad / T.sqrt(var_mom + 1e-2)))
            self.gn_updates[var] = var_new
            # add this var's update to the joint updates too
            self.joint_updates[var] = self.gn_updates[var]
        ###################################################
        # Construct updates for the continuous inferencer #
        ###################################################
        for var in self.in_params:
            # these updates are for trainable params in the inferencer net...
            # first, get gradient of cost w.r.t. var
            var_grad = T.grad(self.joint_cost, var, \
                    consider_constant=[self.GN.dist_mean, self.GN.dist_cov]).clip(-1.0,1.0)
            #var_grad = ifelse(T.any(T.isnan(nan_grad)), T.zeros_like(nan_grad), nan_grad)
            #self.grad_sq_sums.append(T.sum(var_grad**2.0))
            # get the momentum for this var
            var_mom = self.in_moms[var]
            # update the momentum for this var using its grad
            self.in_updates[var_mom] = (self.mo_in[0] * var_mom) + \
                    ((1.0 - self.mo_in[0]) * (var_grad**2.0))
            self.joint_updates[var_mom] = self.in_updates[var_mom]
            # make basic update to the var
            var_new = var - (self.lr_in[0] * (var_grad / T.sqrt(var_mom + 1e-2)))
            self.in_updates[var] = var_new
            # add this var's update to the joint updates too
            self.joint_updates[var] = self.in_updates[var]
        ####################################################
        # Construct updates for the categorical inferencer #
        ####################################################
        for var in self.pn_params:
            # these updates are for trainable params in the inferencer net...
            # first, get gradient of cost w.r.t. var
            var_grad = T.grad(self.joint_cost, var, \
                    consider_constant=[self.GN.dist_mean, self.GN.dist_cov]).clip(-1.0,1.0)
            #var_grad = ifelse(T.any(T.isnan(nan_grad)), T.zeros_like(nan_grad), nan_grad)
            #self.grad_sq_sums.append(T.sum(var_grad**2.0))
            # get the momentum for this var
            var_mom = self.pn_moms[var]
            # update the momentum for this var using its grad
            self.pn_updates[var_mom] = (self.mo_pn[0] * var_mom) + \
                    ((1.0 - self.mo_pn[0]) * (var_grad**2.0))
            self.joint_updates[var_mom] = self.pn_updates[var_mom]
            # make basic update to the var
            var_new = var - (self.lr_pn[0] * (var_grad / T.sqrt(var_mom + 1e-2)))
            self.pn_updates[var] = var_new
            # add this var's update to the joint updates too
            self.joint_updates[var] = self.pn_updates[var]
        # Record the sum of squared gradients (for NaN checking)
        self.grad_sq_sum = T.sum(self.grad_sq_sums)

        # Construct batch-based training functions for the generator and
        # inferer networks, as well as a joint training function.
        #self.train_gn = self._construct_train_gn()
        #self.train_in = self._construct_train_in()
        self.train_joint = self._construct_train_joint()
        return
Example #41
0
    def get_output_for(self, inputs, deterministic=False, **kwargs):

        # extract inputs
        H1, H2 = inputs

        # train set size
        m = H1.shape[0].astype(theano.config.floatX)

        # running average projection matrix update
        if not deterministic:

            # compute batch mean
            mean1 = T.mean(H1, axis=0)
            mean2 = T.mean(H2, axis=0)

            # running average updates of means
            mean1 = (floatX(1.0 - self.alpha) * self.mean1 +
                     self.alpha * mean1)
            running_mean1 = theano.clone(self.mean1, share_inputs=False)
            running_mean1.default_update = mean1
            mean1 += 0 * running_mean1

            mean2 = (floatX(1.0 - self.alpha) * self.mean2 +
                     self.alpha * mean2)
            running_mean2 = theano.clone(self.mean2, share_inputs=False)
            running_mean2.default_update = mean2
            mean2 += 0 * running_mean2

            # hidden representations
            H1bar = H1 - mean1
            H2bar = H2 - mean2

            # transpose to correlation format
            H1bar = H1bar.T
            H2bar = H2bar.T

            # cross-covariance
            S12 = (1.0 / (m - 1)) * T.dot(H1bar, H2bar.T)

            # covariance 1
            S11 = (1.0 / (m - 1)) * T.dot(H1bar, H1bar.T)
            S11 = S11 + self.r1 * T.identity_like(S11)

            # covariance 2
            S22 = (1.0 / (m - 1)) * T.dot(H2bar, H2bar.T)
            S22 = S22 + self.r2 * T.identity_like(S22)

            # theano-compatible formulation of paper
            d, A = T.nlinalg.eigh(S11)
            S11si = (A * np.reciprocal(np.sqrt(d))).dot(A.T)  # = S11^-.5
            d, A = T.nlinalg.eigh(S22)
            S22si = (A * np.reciprocal(np.sqrt(d))).dot(A.T)  # = S22^-.5

            # compute TT' and T'T (regularized)
            Tnp = S11si.dot(S12).dot(S22si)
            M1 = Tnp.dot(Tnp.T)
            M2 = Tnp.T.dot(Tnp)
            M1 += self.rT * T.identity_like(M1)
            M2 += self.rT * T.identity_like(M2)

            # compute eigen decomposition
            E1, E = T.nlinalg.eigh(M1)
            _, F = T.nlinalg.eigh(M2)

            # compute correlation
            E1 = T.clip(E1, 1e-7, 1.0)
            E1 = T.sqrt(E1)
            self.corr = E1

            # transpose back to network format
            H1bar = H1bar.T
            H2bar = H2bar.T

        # use means of layer
        else:

            # hidden representations
            H1bar = H1 - self.mean1
            H2bar = H2 - self.mean2

        # re-project data
        lv1_cca = H1bar.dot(self.U)
        lv2_cca = H2bar.dot(self.V)

        output = T.horizontal_stack(lv1_cca, lv2_cca)

        return output