def loss(lv1, lv2):
        """ Contrastive cosine distance optimization target """

        n = lv1.shape[0]

        # direction 1
        D = lv1.dot(lv2.T)
        d = D.diagonal().reshape((-1, 1))

        M = T.identity_like(D)
        O = D[(M <= 0).nonzero()].reshape((n, n - 1))

        L = gamma - d
        L = T.repeat(L, n - 1, 1)
        L += O
        L = T.clip(L, 0, 1000)

        loss = L.mean()

        # direction 2
        if symmetric:
            D = lv2.dot(lv1.T)
            d = D.diagonal().reshape((-1, 1))

            M = T.identity_like(D)
            O = D[(M <= 0).nonzero()].reshape((n, n - 1))

            L = gamma - d
            L = T.repeat(L, n - 1, 1)
            L += O
            L = T.clip(L, 0, 1000)

            loss += L.mean()

        return weight * loss
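A minimal NumPy sketch of the same hinge, assuming `lv1` and `lv2` are L2-normalized so their dot products are cosine similarities; `gamma` (margin) and `weight` stand in for the values the closure above captures:

import numpy as np

def contrastive_cosine_loss_np(lv1, lv2, gamma=0.5, weight=1.0):
    n = lv1.shape[0]
    D = lv1 @ lv2.T                                   # cosine similarities (n, n)
    d = np.diag(D).reshape(-1, 1)                     # similarities of matching pairs
    O = D[~np.eye(n, dtype=bool)].reshape(n, n - 1)   # similarities of non-matching pairs
    L = np.clip(gamma - d + O, 0.0, 1000.0)           # hinge: penalize non-matches within the margin
    return weight * L.mean()

rng = np.random.default_rng(0)
x = rng.normal(size=(4, 8)); x /= np.linalg.norm(x, axis=1, keepdims=True)
y = x + 0.05 * rng.normal(size=x.shape); y /= np.linalg.norm(y, axis=1, keepdims=True)
print(contrastive_cosine_loss_np(x, y))               # small: matches beat non-matches by the margin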
Example #2
    def _get_orthogonal_matrix_inv(self):
        # A: skew-symmetric matrix
        # O: orthogonal matrix
        # O = (I + A)(I - A)^-1
        # 1. create upper triangular matrix using self.decorr
        # 2. create lower triangular matrix using -self.decorr
        # 3. add them up and apply the Cayley transform (I + A)(I - A)^-1

        n = self.ortho_n
        num_triu_entries = (n - 1) * n // 2  # integer count of strictly upper-triangular entries
        triu_index_matrix = np.zeros([n, n], dtype=np.int32)
        triu_index_matrix[np.triu_indices(n, 1)] = np.arange(num_triu_entries)
        triu_index_matrix[np.triu_indices(
            n, 1)[::-1]] = np.arange(num_triu_entries)
        triu_mat = self.decorr[
            triu_index_matrix]  # symmetric matrix whose diagonal entries are the first element of self.decorr
        triu_mat = tt.extra_ops.fill_diagonal(triu_mat,
                                              0)  # set diagonal values to zero
        triu_mat = tt.set_subtensor(triu_mat[np.triu_indices(n, 1)[::-1]],
                                    triu_mat[np.triu_indices(n, 1)[::-1]] * -1)

        part1 = tt.identity_like(triu_mat) + triu_mat
        part2 = tt.nlinalg.MatrixInverse()(tt.identity_like(triu_mat) -
                                           triu_mat)
        orth_mat = K.dot(part1, part2)
        return orth_mat
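A small NumPy check (independent of `self.decorr`, shapes made up) that the Cayley transform used above maps a skew-symmetric A to an orthogonal matrix:

import numpy as np

rng = np.random.default_rng(0)
n = 5
upper = np.triu(rng.normal(size=(n, n)), k=1)
A = upper - upper.T                                    # skew-symmetric: A.T == -A
O = (np.eye(n) + A) @ np.linalg.inv(np.eye(n) - A)     # Cayley transform
print(np.allclose(O.T @ O, np.eye(n)))                 # True: O is orthogonal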
 def get_opt_A(self, sn_trf, EPhiTPhi, XT_EPhi, K_MM):
     cholSigInv = sT.cholesky(EPhiTPhi + sn_trf * T.identity_like(K_MM))
     cholK_MM = sT.cholesky(K_MM + 1e-6 * T.identity_like(K_MM))
     invCholSigInv = sT.matrix_inverse(cholSigInv)
     invCholK_MM = sT.matrix_inverse(cholK_MM)
     InvSig = invCholSigInv.T.dot(invCholSigInv)
     InvK_MM = invCholK_MM.T.dot(invCholK_MM)
     Sig_EPhiT_X = InvSig.dot(XT_EPhi.T)
     return Sig_EPhiT_X, cholSigInv, cholK_MM, InvK_MM
def exact_proj_sqrtm(x, x_test, gp_params, indep_noise, batch_size):
    Ktt = cov_mat(x_test, x_test, gp_params)
    Kxt = cov_mat(x, x_test, gp_params)
    Kxx = cov_mat(x, x, gp_params)
    Kxx = Kxx + indep_noise * T.identity_like(Kxx)
    KxtT_Kxxinv = Kxt.T.dot(T.nlinalg.matrix_inverse(Kxx))
    K = Ktt - KxtT_Kxxinv.dot(Kxt)
    K = K + 1e-10 * T.identity_like(K)
    R = theano_sqrtm(K)
    eps = rng.normal(size=(batch_size, x_test.shape[0]))
    return R.dot(eps.T).T
def add_withening_regularization(hidden_x, hidden_y_reversed):
    hooks_temp = {}
    loss_withen = T.constant(0)
    for x, y in zip(hidden_x, hidden_y_reversed):
        x_value = lasagne.layers.get_output(x, moving_avg_hooks=hooks_temp)
        y_value = lasagne.layers.get_output(y, moving_avg_hooks=hooks_temp)

        cov_x = T.dot(x_value.T, x_value) / x_value.shape[0]
        cov_y = T.dot(y_value.T, y_value) / y_value.shape[0]

        loss_withen += Params.WITHEN_REG_X * T.mean(T.sum(abs(cov_x - T.identity_like(cov_x)), axis=0))
        loss_withen += Params.WITHEN_REG_Y * T.mean(T.sum(abs(cov_y - T.identity_like(cov_y)), axis=0))
    return loss_withen
def log_joint_scan_fn(n, llik, y, cov, mask):
    
    partial_cov = T.outer(mask[:, n], mask[:, n]) * cov + (1 - mask[:, n]) * T.identity_like(cov)
    llik += (-1 / 2.0) * T.log(T.nlinalg.Det()(partial_cov)) - (1 / 2.0) * T.dot(y[:, n].T, T.dot(
        T.nlinalg.MatrixInverse()(partial_cov), y[:, n]))

    return llik
Example #7
def chi2_test_statistic(M, Obs, K, num_M, num_Obs):
    #Getting frequencies from observations
    Ns = T.dot(Obs,T.ones((K,1)))
    p = Obs/Ns
        
    #Find the zeros so we can deal with them later
    pZEROs = T.eq(p, 0)
    mZEROs = T.eq(M, 0)
    
    #log probabilities, with -INF as log(0)
    lnM = T.log(M + mZEROs) - INF*mZEROs
    lnp = T.log(p + pZEROs) - INF*pZEROs


    #Using Kronecker products so every row of M hits every row of P in the difference klnM - klnP
    O_ones = T.ones((num_Obs,1))
    M_ones = T.ones((num_M,1))
    klnM = kron(lnM,O_ones)
    klnP = kron(M_ones, lnp)
    klnP_M = klnP - klnM
    kObs = kron(M_ones, Obs)
    
    G = 2.0*T.dot(klnP_M ,kObs.T)
    
    G = G*T.identity_like(G)
    G = T.dot(G,T.ones((num_M*num_Obs,1)))   
    G = T.reshape(G,(num_M,num_Obs))
    
    #The following quotient improves the convergence to chi^2 by an order of magnitude
    #source: http://en.wikipedia.org/wiki/Multinomial_test
    
    #numerator = T.dot(- 1.0/(M + 0.01),T.ones((K,1))) - T.ones((num_M,1))    
    #q1 = T.ones((num_M,num_Obs)) + T.dot(numerator,1.0/Ns.T/6.0)/(K-1.0)
        
    return G#/q1 
def exact_post_mean(x, x_test, gp_params, indep_noise, y):
    Kxt = cov_mat(x, x_test, gp_params)
    Kxx = cov_mat(x, x, gp_params)
    Kxx = Kxx + indep_noise * T.identity_like(Kxx)
    KxtT_Kxxinv = Kxt.T.dot(T.nlinalg.matrix_inverse(Kxx))
    mu = KxtT_Kxxinv.dot(y)
    return mu
Example #9
    def __init__(self, n_comp=10, verbose=False):

        # Theano initialization
        self.T_weights = shared(np.eye(n_comp, dtype=np.float32))
        self.T_bias = shared(np.ones((n_comp, 1), dtype=np.float32))

        T_p_x_white = T.fmatrix()
        T_lrate = T.fscalar()
        T_block = T.fscalar()
        T_unmixed = T.dot(self.T_weights,T_p_x_white) + T.addbroadcast(self.T_bias,1)
        T_logit = 1 - 2 / (1 + T.exp(-T_unmixed))

        T_out =  self.T_weights +  T_lrate * T.dot(T_block * T.identity_like(self.T_weights) + T.dot(T_logit, T.transpose(T_unmixed)), self.T_weights)
        T_bias_out = self.T_bias + T_lrate * T.reshape(T_logit.sum(axis=1), (-1,1))
        T_max_w = T.max(self.T_weights)
        T_isnan = T.any(T.isnan(self.T_weights))

        self.w_up_fun = theano.function([T_p_x_white, T_lrate, T_block],
                                        [T_max_w, T_isnan],
                                        updates=[(self.T_weights, T_out),
                                                 (self.T_bias, T_bias_out)],
                                        allow_input_downcast=True)

        T_matrix = T.fmatrix()
        T_cov = T.dot(T_matrix,T.transpose(T_matrix))/T_block
        self.cov_fun = theano.function([T_matrix, T_block], T_cov, allow_input_downcast=True)
        
        self.loading = None
        self.sources = None
        self.weights = None
        self.n_comp = n_comp
        self.verbose = verbose
Example #10
    def get_output_for(self, inputs, **kwargs):
        """
        Compute diffusion convolution of inputs.

        """

        A = inputs[0]
        X = inputs[1]

        # Normalize by degree.
        A = A / (T.sum(A, 0) + 1.0)

        Apow_list = [T.identity_like(A)]
        for i in range(1, self.parameters.num_hops + 1):
            Apow_list.append(A.dot(Apow_list[-1]))
        Apow = T.stack(Apow_list)

        Apow_dot_X = T.dot(Apow, X)

        Apow_dot_X_times_W = Apow_dot_X * self.W

        out = self.nonlinearity(
            T.mean(
                T.reshape(T.mean(Apow_dot_X_times_W, 1),
                          (1,
                           (self.parameters.num_hops + 1), self.num_features)),
                2))

        return out
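A NumPy sketch of the hop-power construction above, with a made-up 3-node adjacency; each slice of `Apow` applies the degree-normalized adjacency once more:

import numpy as np

def diffusion_powers(A, X, num_hops):
    A = A / (A.sum(axis=0) + 1.0)              # degree normalization, as above
    powers = [np.eye(A.shape[0])]              # hop 0: identity
    for _ in range(num_hops):
        powers.append(A @ powers[-1])          # hop k: one more application of A
    Apow = np.stack(powers)                    # (num_hops + 1, n, n)
    return Apow @ X                            # (num_hops + 1, n, n_features)

A = np.array([[0., 1., 0.], [1., 0., 1.], [0., 1., 0.]])
X = np.arange(6.0).reshape(3, 2)
print(diffusion_powers(A, X, num_hops=2).shape)   # (3, 3, 2)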
Example #11
    def build(self,
              loss1="crossentropy",
              loss2="mse",
              optimizer="rmsprop",
              lr=0.01,
              rho=0.9,
              epsilon=1e-6):
        self.loss1 = losses.get(loss1)
        self.loss2 = losses.get(loss2)
        optim = optimizers.get(optimizer, inst=False)

        if optim.__name__ == "RMSprop":
            self.optimizer = optim(lr=lr, rho=rho, epsilon=epsilon)
        elif optim.__name__ == "Adagrad":
            self.optimizer = optim(lr=lr, epsilon=epsilon)
        else:
            self.optimizer = optim(lr=lr)

        # get input to model
        self.X_c = TT.fmatrix(name="X_c")  # n_batches*input_dim
        # output label
        self.Y = TT.matrix(dtype=theano.config.floatX,
                           name="Y")  # n_batches*nc
        self.X_recon, self.Y_pred, self.Y_class, self.ave_emb, self.group_ids = self.get_output(
        )  # Y_pred: n_batches*nc

        # prediction_loss + reconstruction_loss + reg_loss
        train_loss_pred = self.get_loss1(self.Y, self.Y_pred) + self.get_loss2(
            self.X_c, self.X_recon)
        reg1_loss = TT.sqr(self.W_g).mean() + TT.sqr(
            self.W_enc).mean() + TT.sqr(self.W_dec).mean()
        Wg_norm, _ = theano.scan(lambda row: row / LA.norm(row),
                                 sequences=[self.W_g])
        inter_Wg = TT.dot(Wg_norm, TT.transpose(Wg_norm))
        reg2_loss = self.get_loss2(inter_Wg, TT.identity_like(inter_Wg))
        train_loss = train_loss_pred + 0.1 * reg1_loss + 0.0001 * reg2_loss

        updates = self.optimizer.get_updates(self.params, cost=train_loss)

        self.grad_h = theano.function(inputs=[self.X_c, self.Y],
                                      on_unused_input='warn',
                                      outputs=optimizers.get_gradients(
                                          train_loss, self.X_c),
                                      allow_input_downcast=True)
        self.train = theano.function(inputs=[self.X_c, self.Y],
                                     on_unused_input='warn',
                                     outputs=train_loss_pred,
                                     updates=updates,
                                     allow_input_downcast=True,
                                     mode=None)
        self.predict = theano.function(inputs=[self.X_c],
                                       on_unused_input='warn',
                                       outputs=[self.Y_pred, self.Y_class],
                                       allow_input_downcast=True)

        self.get_emb = theano.function(inputs=[self.X_c],
                                       on_unused_input='warn',
                                       outputs=[self.ave_emb, self.group_ids],
                                       allow_input_downcast=True,
                                       mode=None)
Example #12
def MVNormalScan(n, is_observed_matrix, covariance_matrix_mvn_scan, w_mvn_scan,
                 zY, zK, true_full_matrix):
    #construct is_unobserved_matrix, a vector of 1s and 0s where the ith coord is 1 if the ith coord of y_n is unobserved
    is_unobserved_matrix = -(is_observed_matrix[:, n] - T.ones(D))
    #construct covariance of the observed entries where the rows/columns with nothing have a 1 on diag (so invertible)
    sigma_observed = T.outer(
        is_observed_matrix[:, n],
        is_observed_matrix[:, n]) * covariance_matrix_mvn_scan + (
            is_unobserved_matrix * T.identity_like(covariance_matrix_mvn_scan))
    sigma_unobs_obs = (T.outer(
        is_unobserved_matrix,
        is_observed_matrix[:, n])) * covariance_matrix_mvn_scan
    sigma_observed_inv = T.nlinalg.MatrixInverse()(sigma_observed)
    dummy_y = T.zeros(D)
    #draw the mean vector dummy_y from N(0, wwT+sigma^2I) using a computationally fast trick
    dummy_results, dummy_updates = theano.scan(
        lambda prior_result, sigma, zY, w_mvn_scan, zK, n: T.sqrt(
            sigma) * zY[:, n] + T.dot(w_mvn_scan, zK[:, n]) + prior_result,
        sequences=None,
        outputs_info=T.zeros(D),
        non_sequences=[sigma, zY, w_mvn_scan, zK, n],
        n_steps=R)
    dummy_y = dummy_results[-1]
    dummy_y /= R
    dummy_y_obs = is_observed_matrix[:, n] * dummy_y
    dummy_y_unobs = is_unobserved_matrix * dummy_y
    y_unobserved = dummy_y_unobs + T.dot(
        T.dot(sigma_unobs_obs, sigma_observed_inv),
        (theano_observed_matrix[:, n] - dummy_y_obs))
    y_unobserved = (y_unobserved * is_unobserved_matrix) + (
        true_full_matrix[:, n] + ERROR[0]) * is_observed_matrix[:, n]
    #Add true_full_matrix for BETA discount method instead of subtracting infinity from observed entries
    return [y_unobserved, sigma_unobs_obs, sigma_observed_inv]
    def _svi(N, D, K, mnp, masknp, w_init, r_init):
        Shared = lambda shape, name: theano.shared(value=np.ones(shape, dtype=theano.config.floatX),
                                                   name=name, borrow=True)
        srng = T.shared_randomstreams.RandomStreams(seed=120)

        mask = Shared((D, N), 'mask')
        mask.set_value(masknp)
        m = T.as_tensor_variable(mnp)
        y = mask * m
        zero_y = T.as_tensor_variable(np.zeros((D, N)))
        zero2 = T.as_tensor_variable(np.zeros((D, D)))
        zero = T.as_tensor_variable(np.zeros(D))
        st = T.sum(T.neq(y, zero_y), axis=0)
        s = st.eval()

        # Define variational parameters
        m_w = Shared((D, K), 'm_w')
        m_w.set_value(w_init)
        s_w = Shared((D, K), "s_w")
        m_r = Shared((K), 'm_r')
        m_r.set_value(r_init)
        s_r = Shared((K), 's_r')
        m_gamma = Shared((1), 'm_gamma')
        s_gamma = Shared((1), 's_gamma')
        m_gamma0 = Shared((1), 'm_gamma0')
        s_gamma0 = Shared((1), 's_gamma0')
        m_c0 = Shared((1), 'm_c0')
        s_c0 = Shared((1), 's_c0')
        m_sigma = Shared((1), 'm_sigma')
        s_sigma = Shared((1), 's_sigma')

        # Define noise for model parameters
        z_w = srng.normal((D, K))
        z_r = srng.normal([K])
        z_gamma = srng.normal([1])
        z_gamma0 = srng.normal([1])
        z_c0 = srng.normal([1])
        z_sigma = srng.normal([1])

        # Define variational parameters
        # All model parameters have a log-normal variational posterior
        w = T.exp(m_w + z_w * s_w)
        r = T.exp(m_r + z_r * s_r)
        gamma = T.exp(m_gamma + z_gamma * s_gamma)
        gamma0 = T.exp(m_gamma0 + z_gamma0 * s_gamma0)
        c0 = T.exp(m_c0 + z_c0 * s_c0)
        sigma = T.exp(m_sigma + z_sigma * s_sigma)

        # Define random variables for mVNscan component
        z_y = srng.normal([D])
        z_k = srng.normal([K])
        z_eps = srng.normal()

        # For data given sequentially we need a different covariance matrix for each yn
        wwT = T.dot(w, w.T)
        cov = Shared((D, D), 'cov')
        cov = wwT + sigma[0] * T.identity_like(wwT)

        return mask, m, y, zero_y, zero2, zero, st, m_w, s_w, w, m_r, s_r, r, m_gamma, s_gamma, gamma, m_gamma0, \
               s_gamma0, gamma0, m_c0, s_c0, c0, m_sigma, s_sigma, sigma, z_y, z_k, z_eps, wwT, cov
Example #14
 def _build_conditional(self, Xnew, pred_noise, diag, X, Xu, y, sigma, cov_total, mean_total):
     sigma2 = tt.square(sigma)
     Kuu = cov_total(Xu)
     Kuf = cov_total(Xu, X)
     Luu = cholesky(stabilize(Kuu))
     A = solve_lower(Luu, Kuf)
     Qffd = tt.sum(A * A, 0)
     if self.approx == "FITC":
         Kffd = cov_total(X, diag=True)
         Lamd = tt.clip(Kffd - Qffd, 0.0, np.inf) + sigma2
     else:  # VFE or DTC
         Lamd = tt.ones_like(Qffd) * sigma2
     A_l = A / Lamd
     L_B = cholesky(tt.eye(Xu.shape[0]) + tt.dot(A_l, tt.transpose(A)))
     r = y - mean_total(X)
     r_l = r / Lamd
     c = solve_lower(L_B, tt.dot(A, r_l))
     Kus = self.cov_func(Xu, Xnew)
     As = solve_lower(Luu, Kus)
     mu = self.mean_func(Xnew) + tt.dot(tt.transpose(As), solve_upper(tt.transpose(L_B), c))
     C = solve_lower(L_B, As)
     if diag:
         Kss = self.cov_func(Xnew, diag=True)
         var = Kss - tt.sum(tt.square(As), 0) + tt.sum(tt.square(C), 0)
         if pred_noise:
             var += sigma2
         return mu, var
     else:
         cov = (self.cov_func(Xnew) - tt.dot(tt.transpose(As), As) +
                tt.dot(tt.transpose(C), C))
         if pred_noise:
             cov += sigma2 * tt.identity_like(cov)
         return mu, stabilize(cov)
Example #15
 def get_opt_A(self, sn_trf, EPhiTPhi, XT_EPhi):
     SigInv = EPhiTPhi + (sn_trf + 1e-6) * T.identity_like(EPhiTPhi)
     cholSigInv = sT.cholesky(SigInv)
     invCholSigInv = sT.matrix_inverse(cholSigInv)
     InvSig = invCholSigInv.T.dot(invCholSigInv)
     Sig_EPhiT_X = InvSig.dot(XT_EPhi.T)
     return Sig_EPhiT_X, cholSigInv
Example #16
    def calcKFAC(grad_vec, damp):
        self.grads = []
        # self.acts = [TT.concatenate([self.model.x, TT.ones((self.model.x.shape[0], 1))], axis=1)]
        self.acts = [self.model.x]
        for l in self.model.layers:
            S = TT.grad(self.loss, l.s)
            self.grads.append(S)
            self.acts.append(l.a)

        self.G = []
        self.A = []
        self.F_block = []
        self.F = []

        cnt = TT.cast(self.grads[0].shape[0], theano.config.floatX)
        for i in range(len(self.grads)):
            self.G += [[]]
            self.A += [[]]
            for j in range(len(self.grads)):
                # self.G[-1] += [TT.mean(TT.batched_dot(self.grads[i].dimshuffle(0, 1, 'x'), self.grads[j].dimshuffle(0, 'x', 1)), 0).dimshuffle('x', 0, 1)]
                # self.A[-1] += [TT.mean(TT.batched_dot(self.acts[i].dimshuffle(0, 1, 'x'), self.acts[j].dimshuffle(0, 'x', 1)), 0).dimshuffle('x', 0, 1)]

                # self.G[-1] += [TT.batched_dot(self.grads[i].dimshuffle(0, 1, 'x'), self.grads[j].dimshuffle(0, 'x', 1))]
                # self.A[-1] += [TT.batched_dot(self.acts[i].dimshuffle(0, 1, 'x'), self.acts[j].dimshuffle(0, 'x', 1))]

                self.G[-1] += [
                    self.grads[i].T.dot(self.grads[j]).dimshuffle('x', 0, 1) /
                    cnt
                ]
                self.A[-1] += [
                    self.acts[i].T.dot(self.acts[j]).dimshuffle('x', 0, 1) /
                    cnt
                ]

                if self.diag:
                    self.G[-1][-1] *= float(i == j)
                    self.A[-1][-1] *= float(i == j)

        for i in range(len(self.grads)):
            self.F_block += [[]]
            for j in range(len(self.grads)):
                # depends on whether you want to compute the real Fisher with this or the Kronecker approximation
                # since numpy-based fast_kron computes 3D tensors faster than theano

                # cblock = fast_kron(self.A[i][j], self.G[i][j])
                cblock = native_kron(self.A[i][j], self.G[i][j])

                cblock = cblock.reshape(cblock.shape[1:], ndim=2)
                self.F_block[i] += [cblock]
            self.F.append(TT.concatenate(self.F_block[-1], axis=1))
        self.F = TT.concatenate(self.F, axis=0)
        self.F = (self.F + self.F.T) / 2

        self.Fdamp = self.F + TT.identity_like(self.F) * damp

        # new_grad_vec = theano.tensor.slinalg.solve(self.Fdamp, grad_vec.dimshuffle(0, 'x'))
        new_grad_vec = solve_sym_pos(self.Fdamp, grad_vec)
        # new_grad_vec = gpu_solve(self.Fdamp, grad_vec.dimshuffle(0, 'x'))

        return new_grad_vec
Example #17
    def build_L(self,
                sentences,
                context,
                activation=T.nnet.sigmoid,
                biased_diagonal=False):
        """Constructs the matrix L (L-Ensemble from the paper 'Determinantal point processes for
        machine learning' (Kulesza, Taskar, 2013). L_mn = p_m'*p_n*(2*f(p_m'*context)-1)*(2*f(p_n'*context)-1),
        where p_m and p_n are the feature vector of items m and n. f is the activation function and context the
        context vector._

        :type sentences: T.matrix, shape = (num_items_in_set, dim_per_item)
        :param sentences: The input matrix for the DPP. Each row of the DPP encodes the feature vector for one item
        of a ground set S.

        :type context: T.vector, shape = (dim_per_item)
        :param context: The feature vector encoding the context

        :type activation: Theano tensor function
        :param activation: An activation function (a sigmoid is recommended to get reasonable values).

        :type return value: T.matrix, shape = (num_items_in_set, num_items_in_set)
        :return return value: The Ensemble Matrix L.

        """

        f_sentT_cont = activation(T.dot(sentences, context))
        f2_1 = f_sentT_cont

        B = sentences * f2_1.dimshuffle((0, 'x'))
        B_BT = T.dot(B, B.T)
        if biased_diagonal:
            return B_BT + T.identity_like(B_BT) * 0.1
        else:
            return B_BT
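A NumPy sketch (illustrative shapes, plain sigmoid in place of the generic activation) showing that the `B.dot(B.T)` construction above yields L_mn = q_m * q_n * (p_m . p_n), a quality-modulated Gram matrix:

import numpy as np

def build_L_np(sentences, context, biased_diagonal=False):
    q = 1.0 / (1.0 + np.exp(-sentences @ context))    # per-item score f(p_m . context)
    B = sentences * q[:, None]
    L = B @ B.T                                        # L_mn = q_m * q_n * (p_m . p_n)
    return L + 0.1 * np.eye(len(L)) if biased_diagonal else L

rng = np.random.default_rng(0)
S, c = rng.normal(size=(4, 6)), rng.normal(size=6)
q = 1.0 / (1.0 + np.exp(-S @ c))
L = build_L_np(S, c)
print(np.allclose(L[1, 2], q[1] * q[2] * (S[1] @ S[2])))   # True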
Example #18
    def get_monitoring_channels(self, model, X, Y=None, **kwargs):

        if not self.supervised:
            Y = None

        WBW = T.dot(model.W.T * model.beta,  model.W)
        target = T.identity_like(WBW)
        err = WBW - target
        penalty = T.sqr(err).sum()
        log_likelihood =  model.log_likelihood(X).mean()

        diag = (T.sqr(model.W) * model.beta.dimshuffle(0,'x')).sum(axis=0)
        diag_penalty = T.sqr(diag-1.).sum()

        rval = {
                'constraint_sum_sq_err' : penalty,
                'diagonal_constraint_sum_sq_err' : diag_penalty,
                'log_likelihood' : log_likelihood }

        if self.use_admm:
            dual = model.dual
            rval['dual_min'] = dual.min()
            rval['dual_max'] = dual.max()
            rval['dual_mean'] = dual.mean()
            abs_dual = abs(dual)
            rval['abs_dual_min'] = abs_dual.min()
            rval['abs_dual_mean'] = abs_dual.mean()
            rval['abs_dual_max'] = abs_dual.max()

        return rval
Example #19
        def __init__(self, x, n_in, n_hidden, n_out, steps, rng=rng):
            """
            Initialize a basic single-layer RNN
            
            x:    symbolic input tensor
            n_in:    input dimensionality
            n_hidden:    # of hidden units
            n_out:    # of output units
            steps:    # of time steps to truncate BPTT at
            """
            self.Wx = _uniform_weight(n_in, n_hidden, rng)
            self.Wh = _ortho_weight(n_hidden, rng)
            self.Wy = _uniform_weight(n_hidden, n_out, rng)
            self.bh = _zero_bias(n_hidden)
            self.by = _zero_bias(n_out)
            self.params = [self.Wx, self.Wh, self.Wy, self.bh, self.by]

            def step(x_t, h_tm1, Wx, Wh, Wy, bh, by):
                h_t = relu(T.dot(x_t, Wx) + T.dot(h_tm1, Wh) + bh)
                y_t = relu(T.dot(h_t, Wy) + by)
                return [h_t, y_t]

            h0 = T.zeros((n_hidden, ), dtype=theano.config.floatX)
            ([h, self.output], _) = theano.scan(
                fn=step,
                sequences=x.dimshuffle([1, 0, 2]),
                outputs_info=[T.alloc(h0, x.shape[0], n_hidden), None],
                non_sequences=[self.Wx, self.Wh, self.Wy, self.bh, self.by],
                strict=True,
                truncate_gradient=steps)
            self.orthogonality = T.sum(
                T.sqr(T.dot(self.Wh, self.Wh.T) - T.identity_like(self.Wh)))
Example #20
 def _build_conditional(self, Xnew, pred_noise, diag, X, Xu, y, sigma,
                        cov_total, mean_total):
     sigma2 = tt.square(sigma)
     Kuu = cov_total(Xu)
     Kuf = cov_total(Xu, X)
     Luu = cholesky(stabilize(Kuu))
     A = solve_lower(Luu, Kuf)
     Qffd = tt.sum(A * A, 0)
     if self.approx == "FITC":
         Kffd = cov_total(X, diag=True)
         Lamd = tt.clip(Kffd - Qffd, 0.0, np.inf) + sigma2
     else:  # VFE or DTC
         Lamd = tt.ones_like(Qffd) * sigma2
     A_l = A / Lamd
     L_B = cholesky(tt.eye(Xu.shape[0]) + tt.dot(A_l, tt.transpose(A)))
     r = y - mean_total(X)
     r_l = r / Lamd
     c = solve_lower(L_B, tt.dot(A, r_l))
     Kus = self.cov_func(Xu, Xnew)
     As = solve_lower(Luu, Kus)
     mu = self.mean_func(Xnew) + tt.dot(tt.transpose(As),
                                        solve_upper(tt.transpose(L_B), c))
     C = solve_lower(L_B, As)
     if diag:
         Kss = self.cov_func(Xnew, diag=True)
         var = Kss - tt.sum(tt.square(As), 0) + tt.sum(tt.square(C), 0)
         if pred_noise:
             var += sigma2
         return mu, var
     else:
         cov = (self.cov_func(Xnew) - tt.dot(tt.transpose(As), As) +
                tt.dot(tt.transpose(C), C))
         if pred_noise:
             cov += sigma2 * tt.identity_like(cov)
         return mu, cov if pred_noise else stabilize(cov)
Example #21
 def cal_encoder_step(self, encoder_val):
     '''
         Calculate the weight ratios in encoder.
         
          :type encoder_val: class
          :param encoder_val: the class which stores the intermediate variables in the encoder
         
         :returns: R_h_x, R_h_h are theano variables, weight ratios in encoder
     '''
     encoder_val.x = encoder_val.x.dimshuffle(0, 1, 'x')
     R_state_in_x = (encoder_val.x * self.input_emb + self.input_emb_offset
                     ) / (self.ep * TT.sgn(encoder_val.state_in) +
                          encoder_val.state_in).dimshuffle(0, 'x', 1)
     R_state_in_x = R_state_in_x.dimshuffle(0, 2, 1)
     R_reset_in_x = encoder_val.x * self.reset_emb / (
         encoder_val.reset_in +
         self.ep * TT.sgn(encoder_val.reset_in)).dimshuffle(0, 'x', 1)
     R_reset_in_x = R_reset_in_x.dimshuffle(0, 2, 1)
     R_gate_in_x = encoder_val.x * self.gate_emb / (
         encoder_val.gate_in +
         self.ep * TT.sgn(encoder_val.gate_in)).dimshuffle(0, 'x', 1)
     R_gate_in_x = R_gate_in_x.dimshuffle(0, 2, 1)
     h_before = encoder_val.h_before.dimshuffle(0, 1, 'x')
     R_gate_h = h_before * self.gate_hidden / (
         encoder_val.gate + self.ep * TT.sgn(encoder_val.gate)).dimshuffle(
             0, 'x', 1)
     R_gate_x = R_gate_in_x * (encoder_val.gate_in / (
         encoder_val.gate + self.ep * TT.sgn(encoder_val.gate))).dimshuffle(
             0, 1, 'x')
     R_reset_h = h_before * self.reset_hidden / (
         encoder_val.reset +
         self.ep * TT.sgn(encoder_val.reset)).dimshuffle(0, 'x', 1)
     R_reset_x = R_reset_in_x * (
         encoder_val.reset_in /
         (encoder_val.reset +
          self.ep * TT.sgn(encoder_val.reset))).dimshuffle(0, 1, 'x')
     R_reseted_h = R_reset_h * self.weight + TT.eye(self.dim,
                                                    self.dim) * self.weight
     R_reseted_x = R_reset_x * self.weight
     encoder_val.reseted = encoder_val.reseted.dimshuffle(0, 1, 'x')
     R_state_reseted = encoder_val.reseted * self.input_hidden / (
         encoder_val.state +
         self.ep * TT.sgn(encoder_val.state)).dimshuffle(0, 'x', 1)
     R_state_reseted = R_state_reseted.dimshuffle(0, 2, 1)
     R_state_h = TT.batched_dot(R_state_reseted, R_reseted_h)
     R_state_x = TT.batched_dot(R_state_reseted, R_reseted_x)
     R_state_x += R_state_in_x * (
         (encoder_val.state_in /
          (encoder_val.state +
           self.ep * TT.sgn(encoder_val.state))).dimshuffle(0, 1, 'x'))
     R_h = (encoder_val.gate * encoder_val.state /
            (encoder_val.h + self.ep * TT.sgn(encoder_val.h))).dimshuffle(
                0, 1, 'x') * self.weight
     R_h_h = R_state_h * R_h + R_gate_h * R_h
     R_h2 = ((1 - encoder_val.gate) * encoder_val.h_before /
             (encoder_val.h + self.ep * TT.sgn(encoder_val.h))).dimshuffle(
                 0, 1, 'x')
     R_h_h = TT.identity_like(R_h_h[0]) * R_h2
     R_h_x = R_gate_x * R_h + R_state_x * R_h
     return R_h_x, R_h_h
    def loss(lv1, lv2):
        """ Contrastive cosine distance optimization target """

        # number of samples in batch
        n = lv1.shape[0]

        # compute cosine distance
        D = lv1.dot(lv2.T)

        # compute arccos -> converts similarity into distance
        D = T.arccos(D)

        # distance between matching pairs
        d = D.diagonal().reshape((-1, 1))

        # distance between non-matching pairs
        M = T.identity_like(D)
        O = D[(M <= 0).nonzero()].reshape((n, n - 1))

        # max margin hinge loss
        L = gamma + d
        L = T.repeat(L, n - 1, 1)
        L -= O
        L = T.clip(L, 0, 1000)

        # compute batch mean
        loss = L.mean()

        return weight * loss
Example #24
def batch_jacobian(f, wrt, size=None, *args, **kwargs):
    """Computes the jacobian of f(x) w.r.t. x in parallel.

    Args:
        f: Symbolic function.
        wrt: Variables to differentiate with respect to.
        size: Expected vector size of f(x).
        *args: Additional positional arguments to pass to `f()`.
        **kwargs: Additional key-word arguments to pass to `f()`.

    Returns:
        Theano tensor.
    """
    if isinstance(wrt, T.TensorVariable):
        if size is None:
            size = f(wrt, *args, **kwargs).shape[-1]
        x_rep = T.tile(wrt, (size, 1))
        y_rep = f(x_rep, *args, **kwargs)
    else:
        if size is None:
            size = f(*wrt, *args, **kwargs).shape[-1]
        x_rep = [T.tile(x, (size, 1)) for x in wrt]
        y_rep = f(*x_rep, *args, **kwargs)

    J = T.grad(
        cost=None,
        wrt=x_rep,
        known_grads={y_rep: T.identity_like(y_rep)},
        disconnected_inputs="ignore",
    )
    return J
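A hedged usage sketch, assuming Theano is available and `batch_jacobian` above is in scope; the Jacobian of the elementwise square at x is diag(2x):

# Hypothetical usage sketch only.
import theano
import theano.tensor as T

x = T.dvector('x')
square = lambda v: v ** 2                 # elementwise square; Jacobian at x is diag(2x)
J = batch_jacobian(square, x, size=3)     # size given explicitly: f maps R^3 -> R^3
jac_fn = theano.function([x], J)
print(jac_fn([1.0, 2.0, 3.0]))            # approximately diag([2, 4, 6])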
Example #26
    def __init__(self,
                 steps      = 1,
                 num_layers = 2,
                 num_units  = 32,
                 eps        = 1e-2):

        self.X, self.Z         = T.fvectors('X','Z')
        self.P, self.Q, self.R = T.fmatrices('P','Q','R')
        self.dt                = T.scalar('dt')

        self.matrix_inv = T.nlinalg.MatrixInverse()

        self.ar = AutoRegressiveModel(steps      = steps,
                                      num_layers = num_layers,
                                      num_units  = num_units,
                                      eps        = eps)

        l = InputLayer(input_var = self.X,
                       shape     = (steps,))
        l = ReshapeLayer(l, shape = (1,steps,))
        l = self.ar.network(l)
        l = ReshapeLayer(l, shape=(1,))

        self.l_ = l
        self.f_ = get_output(self.l_)

        self.X_  = T.concatenate([self.f_, T.dot(T.eye(steps)[:-1], self.X)], axis=0)
        self.fX_ = G.jacobian(self.X_.flatten(), self.X)
        self.P_  = T.dot(T.dot(self.fX_, self.P), T.transpose(self.fX_)) + \
                    T.dot(T.dot(T.eye(steps)[:,0:1], self.dt * self.Q), T.eye(steps)[0:1,:])

        self.h = T.dot(T.eye(steps)[0:1], self.X_)
        self.y = self.Z - self.h

        self.hX_ = G.jacobian(self.h, self.X_)

        self.S = T.dot(T.dot(self.hX_, self.P_), T.transpose(self.hX_)) + self.R
        self.K = T.dot(T.dot(self.P_, T.transpose(self.hX_)), self.matrix_inv(self.S))

        self.X__ = self.X_ + T.dot(self.K, self.y)
        self.P__ = T.dot(T.identity_like(self.P) - T.dot(self.K, self.hX_), self.P_)


        self.prediction = theano.function(inputs  = [self.X,
                                                     self.P,
                                                     self.Q,
                                                     self.dt],
                                          outputs = [self.X_,
                                                     self.P_],
                                          allow_input_downcast = True)

        self.update = theano.function(inputs  = [self.X,
                                                 self.Z,
                                                 self.P,
                                                 self.Q,
                                                 self.R,
                                                 self.dt],
                                      outputs = [self.X__,
                                                 self.P__],
                                      allow_input_downcast = True)
Example #27
def ncac(target, embedding):
    """Return the sample wise NCA for classification method.

    This corresponds to the probability that a point is correctly classified
    with a soft knn classifier using leave-one-out. Each neighbour is weighted
    according to an exponential of its negative Euclidean distance. Afterwards,
    a probability is calculated for each class depending on the weights of the
    neighbours. For details, we refer you to

    'Neighbourhood Component Analysis' by
    J Goldberger, S Roweis, G Hinton, R Salakhutdinov (2004).

    :param target: An array of shape `(n,)` where `n` is the number of
        samples. Each entry of the array should be an integer between `0` and
        `k-1`, where `k` is the number of classes.
    :param embedding: An array of shape `(n, d)` where each row represents
        a point in d dimensional space.
    :returns: Array of shape `(n, 1)`.
    """
    # Matrix of the distances of points.
    dist = distance_matrix(embedding)
    thisid = T.identity_like(dist)

    # Probability that a point is neighbour of another point based on
    # the distances.
    top = T.exp(-dist) + 1e-8  # Add a small constant for stability.
    bottom = (top - thisid * top).sum(axis=0)
    p = top / bottom

    # Create a matrix that matches same classes.
    sameclass = T.eq(distance_matrix(target), 0) - thisid
    loss_vector = -(p * sameclass).sum(axis=1)
    # To be compatible with the API, we make this a (n, 1) matrix.
    return T.shape_padright(loss_vector)
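A NumPy sketch of the leave-one-out neighbour probabilities used above, with pairwise Euclidean distances standing in for the assumed `distance_matrix` helper:

import numpy as np

def nca_neighbour_probs(emb):
    diff = emb[:, None, :] - emb[None, :, :]
    dist = np.sqrt((diff ** 2).sum(-1))          # stand-in for the assumed distance_matrix helper
    eye = np.eye(len(emb))
    top = np.exp(-dist) + 1e-8                   # unnormalized neighbour weights
    bottom = (top - eye * top).sum(axis=0)       # normalize, excluding self-neighbourship
    return top / bottom

emb = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0]])
p = nca_neighbour_probs(emb)
print(p[0, 1] > p[0, 2])                         # True: nearby points get higher probability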
Example #28
 def get_opt_A(self, tau, EPhiTPhi, YT_EPhi):
     SigInv = EPhiTPhi + (tau**-1 + 1e-4) * T.identity_like(EPhiTPhi)
     cholTauSigInv = tau**0.5 * sT.cholesky(SigInv)
     invCholTauSigInv = sT.matrix_inverse(cholTauSigInv)
     tauInvSig = invCholTauSigInv.T.dot(invCholTauSigInv)
     Sig_EPhiT_Y = tau * tauInvSig.dot(YT_EPhi.T)
     return Sig_EPhiT_Y, tauInvSig, cholTauSigInv
Example #30
def ncar(target, embedding):
    """Return the NCA for regression loss.

    This is similar to NCA for classification, except that regression
    performance rather than soft KNN classification is maximized. (Actually,
    the negative performance is minimized.)

    For details, we refer you to

    'Pose-sensitive embedding by nonlinear nca regression' by
    Taylor, G. and Fergus, R. and Williams, G. and Spiro, I. and Bregler, C.
    (2010)

    Parameters
    ----------

    target : Theano variable
        An array of shape ``(n, d)`` where ``n`` is the number of samples and
        ``d`` the dimensionality of the target space.
    embedding : Theano variable
        An array of shape ``(n, d)`` where each row represents a point in
        ``d``-dimensional space.

    Returns
    -------

    res : Theano variable
        Array of shape ``(n, 1)``.
    """
    # Matrix of the distances of points.
    dist = distance_matrix(embedding) ** 2
    thisid = T.identity_like(dist)

    # Probability that a point is neighbour of another point based on
    # the distances.
    top = T.exp(-dist) + 1E-8  # Add a small constant for stability.
    bottom = (top - thisid * top).sum(axis=0)
    p = top / bottom

    # Create matrix of distances.
    target_distance = distance_matrix(target, target, 'soft_l1')
    # Set diagonal to 0.
    target_distance -= target_distance * T.identity_like(target_distance)

    loss_vector = (p * target_distance ** 2).sum(axis=1)
    # To be compatible with the API, we make this a (n, 1) matrix.
    return T.shape_padright(loss_vector)
Example #32
    def get_model(self, X, Y, x_test):
        '''
        Gaussian Process Regression model.
        Reference: C.E. Rasmussen and C.K.I. Williams, "Gaussian Processes for Machine Learning", MIT Press, 2006

        Args:
            X: tensor matrix, training data
            Y: tensor matrix, training target
            x_test: tensor matrix, testing data
        
        Returns:
            K: prior cov matrix
            Ks: prior joint cov matrix
            Kss: prior cov matrix for testing data
            Posterior Distribution:
                alpha: alpha = inv(K)*(mu-m)
                sW: vector containing diagonal of sqrt(W)
                L: L = chol(sW*K*sW+eye(n))
            y_test_mu: predictive mean
            y_test_var: predictive variance
            fs2: predictive latent variance
        Note: the cov matrix inverse is computed through Cholesky factorization
        https://makarandtapaswi.wordpress.com/2011/07/08/cholesky-decomposition-for-matrix-inversion/
        '''
        # Compute GP prior distribution: mean and covariance matrices (eq 2.13, 2.14)
        K = self.covFunc(X, X, 'K')  # prior cov
        #m = T.mean(Y)*T.ones_like(Y) # prior mean
        m = self.mean * T.ones_like(Y)  # prior mean

        # Compute GP joint prior distribution between training and test (eq 2.18)
        Ks = self.covFunc(X, x_test, 'Ks')
        # Note: this is the self-covariance matrix of the test inputs.
        Kss = self.covFunc(x_test, x_test, 'Kss', mode='self_test')

        # Compute posterior distribution with noise: L,alpha,sW,and log_likelihood.
        sn2 = T.exp(2 * self.sigma_n)  # noise variance of likGauss
        L = sT.cholesky(K / sn2 + T.identity_like(K))
        sl = sn2
        alpha = T.dot(sT.matrix_inverse(L.T),
                      T.dot(sT.matrix_inverse(L), (Y - m))) / sl
        sW = T.ones_like(T.sum(K, axis=1)).reshape(
            (K.shape[0], 1)) / T.sqrt(sl)
        log_likelihood = T.sum(-0.5 * (T.dot((Y - m).T, alpha)) -
                               T.sum(T.log(T.diag(L))) -
                               X.shape[0] / 2 * T.log(2. * np.pi * sl))

        # Compute predictive distribution using the computed posterior distribution.
        fmu = m + T.dot(Ks.T, alpha)  # Prediction Mu fs|f, eq 2.25
        V = T.dot(sT.matrix_inverse(L),
                  T.extra_ops.repeat(sW, x_test.shape[0], axis=1) * Ks)
        fs2 = Kss - (T.sum(V * V, axis=0)).reshape(
            (1, V.shape[1])).T  # Prediction Sigma, eq 2.26
        fs2 = T.maximum(fs2, 0)  # remove negative variance noise
        #fs2 = T.sum(fs2,axis=1) # in case x has multiple dimensions

        y_test_mu = fmu
        y_test_var = fs2 + sn2

        return K, Ks, Kss, y_test_mu, y_test_var, log_likelihood, L, alpha, V, fs2, sW
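A compact NumPy sketch of the same Cholesky-based posterior (a squared-exponential kernel is assumed purely for illustration), following Rasmussen & Williams, Algorithm 2.1:

import numpy as np

def rbf(a, b, ell=1.0, sf=1.0):
    d2 = ((a[:, None, :] - b[None, :, :]) ** 2).sum(-1)
    return sf ** 2 * np.exp(-0.5 * d2 / ell ** 2)

def gp_predict(X, y, Xs, sn2=0.01):
    K = rbf(X, X) + sn2 * np.eye(len(X))                    # noisy prior covariance
    L = np.linalg.cholesky(K)
    alpha = np.linalg.solve(L.T, np.linalg.solve(L, y))     # alpha = K^-1 y
    Ks = rbf(X, Xs)
    mu = Ks.T @ alpha                                       # predictive mean
    v = np.linalg.solve(L, Ks)
    var = np.diag(rbf(Xs, Xs)) - (v ** 2).sum(axis=0)       # predictive latent variance
    return mu, var

X = np.linspace(0.0, 1.0, 5)[:, None]
y = np.sin(2 * np.pi * X[:, 0])
print(gp_predict(X, y, np.array([[0.25], [0.75]])))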
def onestep_attend_copy():

	i_t = T.dot(x_t, Wi) + T.dot(pre_h, Ui) + T.dot(pre_z, Zi)
	i_t_shape = T.shape(i_t)

	bi_reshape = T.repeat(bi, i_t_shape[0], 0)
	bi_reshape_2x = T.repeat(bi_reshape, i_t_shape[1], 1)

	bf_reshape = T.repeat(bf, i_t_shape[0], 0)
	bf_reshape_2x = T.repeat(bf_reshape, i_t_shape[1], 1)

	bc_reshape = T.repeat(bc, i_t_shape[0], 0)
	bc_reshape_2x = T.repeat(bc_reshape, i_t_shape[1], 1)

	bo_reshape = T.repeat(bo, i_t_shape[0], 0)
	bo_reshape_2x = T.repeat(bo_reshape, i_t_shape[1], 1)

	i_t_new= sigmoid(i_t + bi_reshape_2x)
	f_t= sigmoid(T.dot(x_t, Wf) + T.dot(pre_h, Uf) + T.dot(pre_z, Zf) + bf_reshape_2x)
	o_t= sigmoid(T.dot(x_t, Wo) + T.dot(pre_h, Uo) + T.dot(pre_z, Zo) + bo_reshape_2x)
	c_th = tanh(T.dot(x_t, Wc)  + T.dot(pre_h, Uc) + T.dot(pre_z, Zc) + bc_reshape_2x)

	c_t = f_t*pre_c + i_t_new*c_th

	h_t = o_t*T.tanh(c_t) #shape (1, N, h_dim)

	h_t_context = T.repeat(h_t, image_feature_region.shape[1], axis = 0) #new shape (No_region, N, h_dim)
	image_feature_reshape = T.transpose(image_feature_region, (1, 0, 2))
	#compute non-linear correlation between h_t(current text) to image_feature_region (64 for 128*128 and 196 for 224*224)
	# pdb.set_trace()
	m_t = T.tanh(T.dot(h_t_context, Hcontext) + T.dot(image_feature_reshape, Zcontext)) #shape (No_region, N, context_dim)

	e = T.dot(m_t, Va) #No_region, N, 1
	e_reshape = e.reshape((e.shape[0], T.prod(e.shape[1:])))

	e_softmax = softmax_along_axis(e_reshape, axis = 0) #shape No_region, N

	e_t = T.transpose(e_softmax, (1,0)) #shape N, No_region
	e_t_r = e_t.reshape([-1, e_softmax.shape[0], e_softmax.shape[1]]) #3D tensor 1, N, No_region
	e_t_r_t = T.transpose(e_t_r, (1,0, 2)) # shape N, 1, No_region
	e_3D = T.repeat(e_t_r_t, e_t_r_t.shape[2], axis = 1) #shape N, No_region, No_region  image_feature_region.shape[1]
	e_3D_t = T.transpose(e_3D, (1,2,0)) #No_region, No_region, N

	identity_2D = T.identity_like(e_3D_t)# shape No_region, No_region
	identity_3D = identity_2D.reshape([-1, identity_2D.shape[0], identity_2D.shape[1]]) # shape 1, No_region, No_region
	identity_3D_t = T.repeat(identity_3D,  image_feature_region.shape[0], axis = 0)
	e_3D_diagonal = e_3D*identity_3D_t #diagonal tensor 3D  (N, No_region, No_region)

	out_weight_y, updates = theano.scan(fn=onestep_weight_feature_multiply,
	                                   outputs_info=[weight_y],
	                                   sequences=[e_3D_diagonal, image_feature_region],
	                                   non_sequences=[])

	z_t = T.sum(out_weight_y, axis = 1) #shape (N, feature_dim)

	z_t_r = z_t.reshape((-1,z_t.shape[0],z_t.shape[1]))


	return [h_t, c_t, z_t_r]
Example #34
def ldet( theta = Th.dvector('theta'), M    = Th.dmatrix('M') ,
          STA   = Th.dvector('STA'), STC  = Th.dmatrix('STC'), **other):
    '''
    Return log-det of I-sym(M), for display/debugging purposes.
    '''
    ImM = Th.identity_like(M)-(M+M.T)/2
    w, v = eig(ImM)
    return Th.sum(Th.log(w))
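A NumPy check of the identity relied on above: the log-determinant of I - sym(M) equals the sum of the logs of its eigenvalues (standalone, made-up M):

import numpy as np

rng = np.random.default_rng(0)
M = 0.1 * rng.normal(size=(4, 4))
ImM = np.eye(4) - (M + M.T) / 2                # I - sym(M), symmetric and (here) positive definite
w = np.linalg.eigvalsh(ImM)
print(np.allclose(np.log(w).sum(), np.linalg.slogdet(ImM)[1]))   # True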
Example #35
 def correlation(self, H1, H2, m):
     H1bar = H1
     H2bar = H2
     SigmaHat12 = (1.0/(m-1))*T.dot(H1bar, H2bar.T)
     SigmaHat11 = (1.0/(m-1))*T.dot(H1bar, H1bar.T)
     SigmaHat11 = SigmaHat11 + self.r1*T.identity_like(SigmaHat11)
     SigmaHat22 = (1.0/(m-1))*T.dot(H2bar, H2bar.T)
     SigmaHat22 = SigmaHat22 + self.r2*T.identity_like(SigmaHat22)
     Tval = T.dot(SigmaHat11**(-0.5), T.dot(SigmaHat12, SigmaHat22**(-0.5)))
     corr = T.nlinalg.trace(T.dot(Tval.T, Tval))**(0.5)
     self.SigmaHat11 = SigmaHat11
     self.SigmaHat12 = SigmaHat12
     self.SigmaHat22 = SigmaHat22
     self.H1bar = H1bar
     self.H2bar = H2bar
     self.Tval = Tval
     return -1*corr
def orthogonality(x):
    '''
    Penalty for deviation from orthogonality:
    
    ||dot(x.T, x) - I||**2
    '''
    xTx = T.dot(x.T, x)
    return T.sum(T.square(xTx - T.identity_like(xTx)))
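A quick NumPy check of this penalty: it vanishes for a matrix with orthonormal columns and grows when the columns are rescaled:

import numpy as np

def orthogonality_np(x):
    xTx = x.T @ x
    return ((xTx - np.eye(xTx.shape[0])) ** 2).sum()

rng = np.random.default_rng(0)
Q, _ = np.linalg.qr(rng.normal(size=(6, 3)))   # orthonormal columns
print(orthogonality_np(Q))                     # ~0
print(orthogonality_np(2.0 * Q))               # 3 * (4 - 1)**2 = 27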
Example #37
def eigs( theta = Th.dvector('theta'), M    = Th.dmatrix('M') ,
          STA   = Th.dvector('STA')  , STC   = Th.dmatrix('STC'), **other):
    '''
    Return eigenvalues of I-sym(M), for display/debugging purposes.
    '''
    ImM = Th.identity_like(M)-(M+M.T)/2
    w,v = eig( ImM )
    return w
Example #38
    def inner_lda_objective(y_true, y_pred):
        """
        It is the loss function of LDA as introduced in the original paper.
        It is adapted from the original implementation at the following link:
        https://github.com/CPJKU/deep_lda
        Note: it is implemented by Theano tensor operations, and does not work on Tensorflow backend
        """
        r = 1e-4

        # init groups
        yt = T.cast(y_true.flatten(), "int32")
        groups = numpy_unique(yt)

        def compute_cov(group, Xt, yt):
            Xgt = Xt[T.eq(yt, group).nonzero()[0], :]
            Xgt_bar = Xgt - T.mean(Xgt, axis=0)
            m = T.cast(Xgt_bar.shape[0], 'float32')
            return (1.0 / (m - 1)) * T.dot(Xgt_bar.T, Xgt_bar)

        # scan over groups
        covs_t, _ = theano.scan(
            fn=compute_cov,
            outputs_info=None,
            sequences=[groups],
            non_sequences=[y_pred, yt],
            #   mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True),
            mode='DebugMode')

        # compute average covariance matrix (within scatter)
        Sw_t = T.mean(covs_t, axis=0)

        # compute total scatter
        Xt_bar = y_pred - T.mean(y_pred, axis=0)
        m = T.cast(Xt_bar.shape[0], 'float32')
        St_t = (1.0 / (m - 1)) * T.dot(Xt_bar.T, Xt_bar)

        # compute between scatter
        Sb_t = St_t - Sw_t

        # cope for numerical instability (regularize)
        Sw_t += T.identity_like(Sw_t) * r

        # return T.cast(T.neq(yt[0], -1), 'float32')*T.nlinalg.trace(T.dot(T.nlinalg.matrix_inverse(St_t), Sb_t))

        # compute eigenvalues
        evals_t = T.slinalg.eigvalsh(Sb_t, Sw_t)

        # get eigenvalues
        top_k_evals = evals_t[-n_components:]

        # maximize variance between classes
        # (k smallest eigenvalues below threshold)
        thresh = T.min(top_k_evals) + margin
        top_k_evals = top_k_evals[(top_k_evals <= thresh).nonzero()]
        costs = T.mean(top_k_evals)

        return -costs
Example #39
 def _calc_caylay_delta(step_size, param, gradient):
     A = Tensor.dot(((step_size / 2) * gradient).T, param) - Tensor.dot(param.T, ((step_size / 2) * gradient))
     I = Tensor.identity_like(A)
     temp = I + A
     # Q = Tensor.dot(batched_inv(temp.dimshuffle('x',0,1))[0], (I - A))
     Q = Tensor.dot(matrix_inverse(temp), I - A)
     update = Tensor.dot(param, Q)
     delta = (step_size / 2) * Tensor.dot((param + update), A)
     return update, delta
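A NumPy sanity check of the Cayley step above (made-up shapes): A is skew-symmetric, so Q = (I + A)^-1 (I - A) is orthogonal and an orthonormal `param` stays orthonormal after `param.dot(Q)`:

import numpy as np

rng = np.random.default_rng(0)
param, _ = np.linalg.qr(rng.normal(size=(7, 4)))   # orthonormal columns: param.T @ param = I
gradient = rng.normal(size=(7, 4))
step_size = 0.1

A = ((step_size / 2) * gradient).T @ param - param.T @ ((step_size / 2) * gradient)   # skew-symmetric
I = np.eye(4)
Q = np.linalg.solve(I + A, I - A)                  # (I + A)^-1 (I - A), orthogonal
update = param @ Q
print(np.allclose(update.T @ update, I))           # True: orthonormality preserved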
Example #40
    def get_output_singlesample(self, M):
        """
        Given a molecule tensor M, calculate its fingerprint.
        """
        # if the incoming tensor M has padding, remove the padding first
        # (this is the part causing the slow-down)
        if self.padding:
            rowsum = M.sum(axis=0)
            trim = rowsum[:, -1]
            trim_to = T.eq(trim,
                           0).nonzero()[0][0]  # first index with no bonds
            M = M[:trim_to, :trim_to, :]  # reduced graph

        # dimshuffle to get diagonal items to
        # form atom matrix A
        (A_tmp, updates) = theano.scan(lambda x: x.diagonal(),
                                       sequences=M[:, :, :-1].dimshuffle(
                                           (2, 0, 1)))
        # Now the attributes is (N_features x N_atom), so we need to transpose
        A = A_tmp.T

        # get connectivity matrix: N_atom * N_atom
        C = M[:, :, -1] + T.identity_like(M[:, :, -1])

        # get bond tensor: N_atom * N_atom * (N_features-1)
        B_tmp = M[:, :, :-1] - A
        coeff = K.concatenate([M[:, :, -1:]] * self.inner_dim, axis=2)
        B = merge([B_tmp, coeff], mode="mul")

        # Get initial fingerprint
        presum_fp = self.attributes_to_fp_contribution(A, 0)
        fp_all_depth = presum_fp

        # Iterate through different depths, updating atom matrix each time
        A_new = A
        for depth in range(self.depth):
            temp = K.dot(K.dot(C, A_new) + K.sum(B, axis=1), self.W_inner[depth+1, :, :])\
                + self.b_inner[depth+1, 0, :]

            if self.dropout_rate_inner != 0.0:
                mask = K.variable(
                    np.ones(shape=(self.padding_final_size, self.inner_dim),
                            dtype=np.float32))
                self.mask_inner.append(mask)
                n_atom = K.shape(temp)[0]
                temp *= mask[:n_atom, :]

            A_new = self.activation_inner(temp)

            presum_fp_new = self.attributes_to_fp_contribution(
                A_new, depth + 1)
            fp_all_depth = fp_all_depth + presum_fp_new

        fp = K.sum(fp_all_depth, axis=0)  # sum across atom contributions

        return fp
    def compute(self, symmetric_double_encoder, params):

        regularization = 0

        for layer in symmetric_double_encoder:

            OutputLog().write('Adding orthonormal regularization for layer')

            Wy_Square = Tensor.dot(layer.Wy.T, layer.Wy)
            Wx_Square = Tensor.dot(layer.Wx.T, layer.Wx)

            regularization += Tensor.sum((Wy_Square - Tensor.identity_like(Wy_Square)) ** 2, dtype=Tensor.config.floatX)
            regularization += Tensor.sum((Wx_Square - Tensor.identity_like(Wx_Square)) ** 2, dtype=Tensor.config.floatX)

        OutputLog().write('Computing regularization')

        regularization -= self._zeroing_param

        return regularization * (self.weight / 2) * (regularization > 0)
Example #42
def eig_pos_barrier( theta = Th.dvector('theta'), M    = Th.dmatrix('M') ,
                 STA   = Th.dvector('STA'), STC  = Th.dmatrix('STC'), 
                 U = Th.dmatrix('U') , V1 = Th.dvector('V1'), **other):
     '''
     A barrier enforcing that the log-det of M should be > exp(-6), 
     and all the eigenvalues of M > 0.  Returns true if barrier is violated.
     '''
     ImM = Th.identity_like(M)-(M+M.T)/2
     w,v = eig( ImM )
     return 1-(Th.sum(Th.log(w))>-250)*(Th.min(w)>0)*(Th.min(V1.flatten())>0) \
Example #43
 def correlation(self, H1, H2):
     # H1 = self.output.T
     m = 10000
     H1bar = H1  # - (1.0/m)*T.dot(H1, T.shared(numpy.ones((m,m))))
     H2bar = H2  # - (1.0/m)*T.dot(H1, T.ones_like(numpy.ones((m,m))))
     SigmaHat12 = (1.0 / (m - 1)) * T.dot(H1bar, H2bar.T)
     SigmaHat11 = (1.0 / (m - 1)) * T.dot(H1bar, H1bar.T)
     SigmaHat11 = SigmaHat11 + self.r1 * T.identity_like(SigmaHat11)
     SigmaHat22 = (1.0 / (m - 1)) * T.dot(H2bar, H2bar.T)
     SigmaHat22 = SigmaHat22 + self.r2 * T.identity_like(SigmaHat22)
     Tval = T.dot(SigmaHat11 ** (-0.5), T.dot(SigmaHat12, SigmaHat22 ** (-0.5)))
     corr = T.nlinalg.trace(T.dot(Tval.T, Tval)) ** (0.5)
     self.SigmaHat11 = SigmaHat11
     self.SigmaHat12 = SigmaHat12
     self.SigmaHat22 = SigmaHat22
     self.H1bar = H1bar
     self.H2bar = H2bar
     self.Tval = Tval
     return -1 * corr
Beispiel #44
0
    def get_model(self,X, Y, x_test):
        '''
        Gaussian Process Regression model.
        Reference: C.E. Rasmussen, "Gaussian Process for Machine Learning", MIT Press 2006

        Args:
            X: tensor matrix, training data
            Y: tensor matrix, training target
            x_test: tensor matrix, testing data
        
        Returns:
            K: prior cov matrix
            Ks: prior joint cov matrix
            Kss: prior cov matrix for testing data
            Posterior Distribution:
                alpha: alpha = inv(K)*(mu-m)
                sW: vector containing diagonal of sqrt(W)
                L: L = chol(sW*K*sW+eye(n))
            y_test_mu: predictive mean
            y_test_var: predictive variance
            fs2: predictive latent variance
        Note: the cov matrix inverse is computed through Cholesky factorization
        https://makarandtapaswi.wordpress.com/2011/07/08/cholesky-decomposition-for-matrix-inversion/
        '''
        # Compute GP prior distribution: mean and covariance matrices (eq 2.13, 2.14)
        K = self.covFunc(X,X,'K') # pior cov
        #m = T.mean(Y)*T.ones_like(Y) # pior mean
        m = self.mean*T.ones_like(Y) # pior mean

        # Compute GP joint prior distribution between training and test (eq 2.18)
        Ks = self.covFunc(X,x_test,'Ks')
        # Pay attention!! here is the self test cov matrix.
        Kss = self.covFunc(x_test,x_test,'Kss',mode='self_test')

        # Compute posterior distribution with noise: L,alpha,sW,and log_likelihood.
        sn2 = T.exp(2*self.sigma_n) # noise variance of likGauss
        L = sT.cholesky(K/sn2 + T.identity_like(K))
        sl = sn2
        alpha = T.dot(sT.matrix_inverse(L.T), 
                      T.dot(sT.matrix_inverse(L), (Y-m)) ) / sl
        sW = T.ones_like(T.sum(K,axis=1)).reshape((K.shape[0],1)) / T.sqrt(sl)
        log_likelihood = T.sum(-0.5 * (T.dot((Y-m).T, alpha)) - T.sum(T.log(T.diag(L))) - X.shape[0] / 2 * T.log(2.*np.pi*sl))
        
        
        # Compute predictive distribution using the computed posterior distribution.
        fmu = m + T.dot(Ks.T, alpha) # Prediction Mu fs|f, eq 2.25 
        V = T.dot(sT.matrix_inverse(L),T.extra_ops.repeat(sW,x_test.shape[0],axis=1)*Ks)
        fs2 = Kss - (T.sum(V*V,axis=0)).reshape((1,V.shape[1])).T # Predictive variance, eq 2.26
        fs2 = T.maximum(fs2,0) # clip numerically negative variances at zero
        #fs2 = T.sum(fs2,axis=1) # in case x has multiple dimensions

        y_test_mu = fmu
        y_test_var = fs2 + sn2

        return K, Ks, Kss, y_test_mu, y_test_var, log_likelihood, L, alpha,V, fs2,sW
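# Added sketch (not part of the class above): the Cholesky-based matrix inverse
# referenced in the docstring, checked in plain NumPy with a toy RBF kernel
# standing in for covFunc.
import numpy as np

def _chol_alpha(X, y, sigma_n=0.1, ell=1.0):
    """X: (n, d) inputs, y: (n,) targets; returns alpha = (K + sn2*I)^-1 (y - m)."""
    sqd = ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1)
    K = np.exp(-0.5 * sqd / ell ** 2)                      # toy RBF prior covariance
    sn2 = sigma_n ** 2
    L = np.linalg.cholesky(K / sn2 + np.eye(K.shape[0]))   # same factorization as above
    alpha = np.linalg.solve(L.T, np.linalg.solve(L, y - y.mean())) / sn2
    # alpha solves (K + sn2*I) alpha = y - m without forming an explicit inverse
    assert np.allclose((K + sn2 * np.eye(K.shape[0])).dot(alpha), y - y.mean())
    return alpha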
    def compute(self, symmetric_double_encoder, params):

        regularization = 0

        layer_number = len(symmetric_double_encoder)

        for ndx, layer in enumerate(symmetric_double_encoder):

            hidden_x = layer.output_forward_y
            hidden_y = layer.output_forward_x

            cov_x = Tensor.dot(hidden_x.T, hidden_x)
            cov_y = Tensor.dot(hidden_y.T, hidden_y)

            # weight each view by layer depth; explicit float division avoids
            # Python 2 integer truncation (which would always give 0 here)
            gamma = ndx / float(layer_number)

            regularization += gamma * 0.5 * nlinalg.trace(cov_x - Tensor.identity_like(cov_x))
            regularization += (1 - gamma) * 0.5 * nlinalg.trace(cov_y - Tensor.identity_like(cov_y))

        return regularization
Example #46
def orthogonal_pools(W, pool_size):
    """
    Returns the orthogonality penalty ||W^T W - I||.
    :param W: T.matrix, storing filters in column format
    """
    (n_v, n_h) = W.shape
    n_pools = n_h // pool_size  # integer division; pool_size must divide n_h
    W3 = T.reshape(W.T, (n_pools, pool_size, n_v), ndim=3)
    W3T = W3.dimshuffle([0,2,1])
    WTW = blas.gpu_gemm_batched(W3, W3T)
    I = T.shape_padleft(T.identity_like(WTW[0]))
    penalty = T.sum((WTW - I)**2)
    return penalty
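# Added sketch (hypothetical helper, not from the original module): the
# un-pooled version of the same penalty over all columns of W is a one-liner.
import theano.tensor as T

def orthogonal_full(W):
    """Squared Frobenius norm ||W^T W - I||^2."""
    WTW = T.dot(W.T, W)
    return T.sum((WTW - T.identity_like(WTW)) ** 2)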
Example #47
    def predict(self, X1, y1, X2):
   
        cov_train = self.compute_cov_s(X1,self.N)
        cov_test  = self.compute_cov_s(X2,self.M)
        cov_te_tr = self.compute_cov(X1,X2,self.N,self.M)     
        cov_tr_te = cov_te_tr.T

        # T.inv is elementwise reciprocal in Theano; the GP posterior needs the
        # matrix inverse of the noise-stabilized training covariance
        arg0  = T.nlinalg.matrix_inverse(cov_train + self.noise**2 * T.identity_like(cov_train))
        #arg0  = T.nlinalg.matrix_inverse(cov_train)
        arg1  = T.dot(cov_te_tr, arg0)
        mu    = T.dot(arg1,y1)
        sigma = cov_test - T.dot(arg1, cov_tr_te) 

        return mu,T.diag(sigma)
Example #48
def LNLEP( theta = Th.dvector('theta'), M    = Th.dmatrix('M') ,
                       STA   = Th.dvector('STA')  , STC  = Th.dmatrix('STC'), 
                       N_spike = Th.dscalar('N_spike'), **other):
    '''
    The actual quadratic-Poisson model, as a function of theta and M, 
    without any barriers or priors.
    '''
    ImM = Th.identity_like(M)-(M+M.T)/2
    ldet = logdet(ImM)  # = Th.log(det(ImM))
    return -0.5 * N_spike *( 
             ldet \
             - Th.sum(Th.dot(matrix_inverse(ImM),theta) * theta) \
             + 2. * Th.sum( theta * STA ) \
             + Th.sum( M * (STC + Th.outer(STA,STA)) ))
Example #49
def zero_diagonal(X):
    """Given a square matrix ``X``, return a theano variable with the diagonal
    of ``X`` set to zero.

    Parameters
    ----------

    X : theano 2d tensor

    Returns
    -------

    Y : theano 2d tensor
        Copy of ``X`` with its diagonal entries set to zero."""
    thisid = T.identity_like(X)
    return (X - thisid * X)
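# Added usage sketch (not part of the original module):
import numpy as np
import theano
import theano.tensor as T

X_sym = T.matrix('X')
strip_diag = theano.function([X_sym], zero_diagonal(X_sym))
out = strip_diag(np.ones((3, 3), dtype=theano.config.floatX))
assert out.trace() == 0.0 and out.sum() == 6.0  # diagonal zeroed, off-diagonal kept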
Example #50
def objective(Xt, yt):
    """
    DeepLDA optimization target
    """

    # init groups
    groups = T.arange(0, n_classes)

    def compute_cov(group, Xt, yt):
        """
        Compute class covariance matrix for group
        """
        Xgt = Xt[T.eq(yt, group).nonzero()]
        Xgt_bar = Xgt - T.mean(Xgt, axis=0)
        m = T.cast(Xgt_bar.shape[0], 'float32')
        return (1.0 / (m - 1)) * T.dot(Xgt_bar.T, Xgt_bar)

    # scan over groups
    covs_t, updates = theano.scan(fn=compute_cov, outputs_info=None,
                                  sequences=[groups], non_sequences=[Xt, yt])

    # compute average covariance matrix (within scatter)
    Sw_t = T.mean(covs_t, axis=0)

    # compute total scatter
    Xt_bar = Xt - T.mean(Xt, axis=0)
    m = T.cast(Xt_bar.shape[0], 'float32')
    St_t = (1.0 / (m - 1)) * T.dot(Xt_bar.T, Xt_bar)

    # compute between scatter
    Sb_t = St_t - Sw_t

    # cope for numerical instability (regularize)
    Sw_t += T.identity_like(Sw_t) * r

    # compute eigenvalues
    evals_t = slinalg.eigvalsh(Sb_t, Sw_t)

    # keep the n_components largest eigenvalues (eigvalsh returns them in ascending order)
    top_k_evals = evals_t[-n_components:]

    # maximize variance between classes by pushing up only the smallest of the
    # retained eigenvalues (those within 1.0 of the current minimum)
    thresh = T.min(top_k_evals) + 1.0
    top_k_evals = top_k_evals[(top_k_evals <= thresh).nonzero()]
    costs = -T.mean(top_k_evals)

    return costs
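# Added sketch (toy matrices, not DeepLDA internals): the generalized
# eigenvalues returned by slinalg.eigvalsh(Sb, Sw) satisfy Sb v = w * Sw v,
# which this NumPy/SciPy check verifies.
import numpy as np
import scipy.linalg

_rng = np.random.RandomState(0)
_A = _rng.randn(5, 5)
_Sb = _A.dot(_A.T)                    # symmetric "between class" scatter
_B = _rng.randn(5, 5)
_Sw = _B.dot(_B.T) + np.eye(5)        # regularized "within class" scatter
_w, _V = scipy.linalg.eigh(_Sb, _Sw)  # ascending eigenvalues, like eigvalsh above
assert np.allclose(_Sb.dot(_V), _Sw.dot(_V) * _w)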
def get_updates(h, c, U, V, d, bias=1e-5, decomposition="svd", zca=True):
    updates = []
    checks = []

    # theano applies updates in parallel, so all updates are in terms
    # of the old values.  use this and assign the return value, i.e.
    # x = update(x, foo()).  x is then a non-shared variable that
    # refers to the updated value.
    def update(variable, new_value):
        updates.append((variable, new_value))
        return new_value

    # compute canonical parameters
    W = T.dot(U, V)
    b = d - T.dot(c, W)

    # update estimates of c, U
    c = update(c, h.mean(axis=0))
    U = update(U, whiten_by[decomposition](h - c, bias, zca))

    # check that the new covariance is indeed identity
    n = h.shape[0].astype(theano.config.floatX)
    covar = T.dot((h - c).T, (h - c)) / (n - 1)
    whiteh = T.dot(h - c, U)
    whitecovar = T.dot(whiteh.T, whiteh) / (n - 1)
    checks.append(PdbBreakpoint
                  ("correlated after whitening")
                  (1 - T.allclose(whitecovar,
                                  T.identity_like(whitecovar),
                                  rtol=1e-3, atol=1e-3),
                   c, U, covar, whitecovar, h)[0])

    # adjust V, d so that the total transformation is unchanged
    # (lstsq is much more stable than T.inv)
    V = update(V, util.lstsq()(U, W, -1)[0])
    d = update(d, b + T.nlinalg.matrix_dot(c, U, V))

    # check that the total transformation is unchanged
    before = b + T.dot(h, W)
    after = d + T.nlinalg.matrix_dot(h - c, U, V)
    checks.append(
        PdbBreakpoint
        ("transformation changed")
        (1 - T.allclose(before, after,
                        rtol=1e-3, atol=1e-3),
         T.constant(0.0), W, b, c, U, V, d, h, before, after)[0])

    return updates, checks
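# Added sketch: `whiten_by[decomposition]` is defined elsewhere and not shown;
# purely as an illustration (assumed, not the original helper), an
# eigendecomposition-based whitening matrix with an optional ZCA rotation
# could be computed like this:
import theano
import theano.tensor as T
from theano.tensor import nlinalg

def eig_whitening_matrix(centered_h, bias, zca):
    n = centered_h.shape[0].astype(theano.config.floatX)
    covar = T.dot(centered_h.T, centered_h) / (n - 1)
    eigvals, eigvecs = nlinalg.eigh(covar + bias * T.identity_like(covar))
    U = eigvecs / T.sqrt(eigvals)    # maps centered h to unit covariance
    if zca:
        U = T.dot(U, eigvecs.T)      # rotate back so the transform stays near identity
    return U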
Example #52
def quadratic_Poisson( theta = Th.dvector('theta'), M    = Th.dmatrix('M') ,
                       STA   = Th.dvector('STA')  , STC  = Th.dmatrix('STC'), 
                       N_spike = Th.dscalar('N_spike'), logprior = 0 , 
                       **other):
    '''
    The actual quadratic-Poisson model, as a function of theta and M,
    with a barrier on the log-det term and a prior.  The -1./(ldet+250.)**2
    term keeps logdet(I-(M+M.T)/2) away from the -250 threshold checked by
    eig_pos_barrier.
    '''
    ImM = Th.identity_like(M)-(M+M.T)/2
    ldet = logdet(ImM)  # = Th.log(det(ImM))
    return -0.5 * N_spike *(
             ldet + logprior \
             - 1./(ldet+250.)**2. \
             - Th.sum(Th.dot(matrix_inverse(ImM),theta) * theta) \
             + 2. * Th.sum( theta * STA ) \
             + Th.sum( M * (STC + Th.outer(STA,STA)) ))
Example #53
    def test_grad_W(self):
        """tests that the gradient of the log probability with respect to W
        matches my analytical derivation """

        #self.model.set_param_values(self.new_params)

        g = T.grad(self.prob, self.model.W, consider_constant = self.mf_obs.values())

        B = self.model.B
        W = self.model.W
        mean_hsv = self.stats.d['mean_hsv']

        mean_sq_hs = self.stats.d['mean_sq_hs']

        mean_HS = self.mf_obs['H_hat'] * self.mf_obs['S_hat']

        m = mean_HS.shape[0]

        outer_prod = T.dot(mean_HS.T,mean_HS)
        outer_prod.name = 'outer_prod<from_observations>'
        outer = outer_prod/m
        mask = T.identity_like(outer)
        second_hs = (1.-mask) * outer + alloc_diag(mean_sq_hs)


        term1 = (B * mean_hsv).T
        term2 = - B.dimshuffle(0,'x') * T.dot(W, second_hs)

        analytical = term1 + term2

        f = function([],(g,analytical))

        gv, av = f()

        assert gv.shape == av.shape

        max_diff = np.abs(gv-av).max()

        if max_diff > self.tol:
            print "gv"
            print gv
            print "av"
            print av
            raise Exception("analytical gradient on W deviates from theano gradient on W by up to "+str(max_diff))
Example #54
    def dex_cost(self, I, dex_lam=0.00):
        """
        Simple exemplar-svm-like function to optimize.

        This loss is based on unnormalized grounded grounded density
        estimation via Negative Sampling -- Noise-Contrastive Estimation.
        """
        #assert(I.shape[0] == self.X_in.shape[0])
        Wt = T.take(self.W, I, axis=0)
        bt = T.take(self.b, I)
        k = I.size - 1
        F = T.dot(self.X_in, Wt.T) + bt
        #F = T.dot(self.X_in, self.X_in.T)
        mask = T.ones_like(F) - T.identity_like(F)
        dex_loss = T.sum((mask * F) + T.log(1.0 + k*T.exp(-F))) / (k + 1)
        reg_loss = dex_lam * T.sum(F**2.0) / (k + 1)
        C = dex_loss + reg_loss
        self.dW = T.grad(C, Wt)
        self.db = T.grad(C, bt)
        return C
Example #55
    def __call__(self, model, X, Y=None, dual=None, **kwargs):

        assert (Y is None) == (not self.supervised)

        WBW = T.dot(model.W.T * model.beta,  model.W)
        target = T.identity_like(WBW)
        err = WBW - target
        penalty = T.sqr(err).sum()

        basic_cost = - model.log_likelihood(X).mean() + self.constraint_coeff * penalty

        if self.use_admm:
            if dual is None:
                if not hasattr(model, 'dual'):
                    model.dual = sharedX(np.zeros((model.nhid, model.nhid)), 'lambda')
                dual = model.dual
            augmented_lagrangian = basic_cost + (dual * err).sum()
            return augmented_lagrangian
        else:
            return basic_cost
        assert False # should be unreached
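# Added sketch (toy shapes, not the model's real parameters): what the
# constraint penalty above measures, written out in plain NumPy.
import numpy as np

_W = np.random.randn(20, 5)
_beta = np.ones(20)
_WBW = (_W.T * _beta).dot(_W)               # same contraction as T.dot(model.W.T * model.beta, model.W)
_penalty = ((_WBW - np.eye(5)) ** 2).sum()  # zero iff the columns of W are orthonormal under diag(beta)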
Example #56
    def get_gradients(self, model, X, Y=None, **kwargs):

        assert 'dual' not in kwargs
        updates = {}

        if self.use_admm:
            rho = self.constraint_coeff * 2.
            dual = model.dual
            WBW = T.dot(model.W.T * model.beta, model.W)
            target = T.identity_like(WBW)
            err = WBW - target
            new_dual = dual + rho * err
            new_dual = block_gradient(new_dual)
            kwargs['dual'] = new_dual
            updates[dual] = new_dual

        cost = self(model, X, Y, **kwargs)

        params = model.get_params()
        assert not isinstance(params, set)
        return dict(zip(params, T.grad(cost, params))), updates
Example #57
    def _build_conditional(self, Xnew, pred_noise, diag):
        Xs, y, sigma = self.Xs, self.y, self.sigma

        # Old points
        X = cartesian(*Xs)
        delta = y - self.mean_func(X)
        Kns = [f(x) for f, x in zip(self.cov_funcs, Xs)]
        eigs_sep, Qs = zip(*map(eigh, Kns))  # Unzip
        QTs = list(map(tt.transpose, Qs))
        eigs = kron_diag(*eigs_sep)  # Combine separate eigs
        if sigma is not None:
            eigs += sigma**2

        # New points
        Km = self.cov_func(Xnew, diag=diag)
        Knm = self.cov_func(X, Xnew)
        Kmn = Knm.T

        # Build conditional mu
        alpha = kron_dot(QTs, delta)
        alpha = alpha/eigs[:, None]
        alpha = kron_dot(Qs, alpha)
        mu = tt.dot(Kmn, alpha).ravel() + self.mean_func(Xnew)

        # Build conditional cov
        A = kron_dot(QTs, Knm)
        A = A/tt.sqrt(eigs[:, None])
        if diag:
            Asq = tt.sum(tt.square(A), 0)
            cov = Km - Asq
            if pred_noise:
                cov += sigma
        else:
            Asq = tt.dot(A.T, A)
            cov = Km - Asq
            if pred_noise:
                cov += sigma * tt.identity_like(cov)
        return mu, cov
Example #58
def stabilize(K):
    """ adds small diagonal to a covariance matrix """
    return K + 1e-6 * tt.identity_like(K)
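# Added usage sketch (toy RBF kernel, not from this module): stabilize() is
# typically applied right before a Cholesky factorization.
import theano
import theano.tensor as tt
import theano.tensor.slinalg as sla

_X = tt.dmatrix('X')
_sqd = ((_X[:, None, :] - _X[None, :, :]) ** 2).sum(-1)
_chol = theano.function([_X], sla.cholesky(stabilize(tt.exp(-0.5 * _sqd))))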
 def __call__(self, loss):
     # orthogonality regularizer: penalize ||p p^T - I||^2; note that
     # T.identity_like(self.p) only matches p.dot(p.T) when self.p is square
     loss += K.sum(K.square(self.p.dot(self.p.T) - T.identity_like(self.p))) * self.strength
     return loss