Beispiel #1
0
 def get_latent(self, pca_rotation=True):
     """Return a centred, PCA-transformed and standardised copy of the latent space.

     Works on a copy of ``self.model.X``, so the model itself is not modified.
     The copy is centred column-wise, passed through ``pca`` keeping as many
     components as there are columns, and the first element of the ``pca``
     result is then centred again and scaled to unit standard deviation per
     column.

     NOTE(review): ``pca_rotation`` is accepted but never consulted by this
     body -- the PCA step always runs. Confirm whether it was meant to be
     optional.

     :param pca_rotation: currently unused.
     :returns: the transformed latent array.
     """
     latent = self.model.X.copy()
     latent -= latent.mean(axis=0)
     n_components = latent.shape[1]
     # Only the first element of pca's result is used (presumably the
     # projected data -- verify against the pca helper).
     rotated = pca(latent, n_components)[0]
     rotated -= rotated.mean(axis=0)
     rotated /= rotated.std(axis=0)
     return rotated
Beispiel #2
0
Datei: mrd.py Projekt: Dalar/GPy
 def _init_X(self, init='PCA', likelihood_list=None):
     """Build the initial latent matrix, store it on ``self.X`` and return it.

     :param init: initialisation scheme. The value is matched with a
         *substring* test against ``"PCA_concat"`` first and ``"PCA_single"``
         second, so the default ``'PCA'`` (and any other substring of
         ``"PCA_concat"``, including the empty string) selects the
         concatenated variant. Anything matching neither falls back to
         random initialisation.

         * concat: ``pca`` of all datasets stacked column-wise
           (``numpy.hstack``), keeping ``self.input_dim`` components.
         * single: the ``self.input_dim`` latent columns are split into
           ``len(Ylist)`` contiguous groups (``numpy.array_split``) and each
           group is filled from the ``pca`` of one dataset.
         * random: draws from ``numpy.random.randn``.
     :param likelihood_list: iterable whose entries are either numpy arrays
         or objects exposing their data as ``.Y``; defaults to
         ``self.likelihood_list``.
     :returns: the initial latent matrix with ``self.input_dim`` columns and
         as many rows as the first dataset.
     """
     if likelihood_list is None:
         likelihood_list = self.likelihood_list
     # isinstance (not an exact type comparison) so that ndarray subclasses
     # are also accepted as raw data instead of being asked for ``.Y``.
     Ylist = [entry if isinstance(entry, numpy.ndarray) else entry.Y
              for entry in likelihood_list]
     # NOTE: ``in`` on a string is a substring test; kept as-is because the
     # default ``init='PCA'`` relies on it.
     if init in "PCA_concat":
         X = pca(numpy.hstack(Ylist), self.input_dim)[0]
     elif init in "PCA_single":
         X = numpy.zeros((Ylist[0].shape[0], self.input_dim))
         dim_groups = numpy.array_split(numpy.arange(self.input_dim),
                                        len(Ylist))
         # Built-in zip: itertools.izip was removed in Python 3.
         for qs, Y in zip(dim_groups, Ylist):
             X[:, qs] = pca(Y, len(qs))[0]
     else:  # init == 'random'
         X = numpy.random.randn(Ylist[0].shape[0], self.input_dim)
     self.X = X
     return X
Beispiel #3
0
 def _init_X(self, init='PCA', likelihood_list=None):
     """Compute the starting latent matrix, assign it to ``self.X`` and return it.

     :param init: name of the initialisation scheme. It is compared using a
         *substring* test, first against ``"PCA_concat"`` and then against
         ``"PCA_single"``; the default ``'PCA'`` therefore picks the
         concatenated scheme (as does every other substring of
         ``"PCA_concat"``, the empty string included). A value matching
         neither results in random initialisation.

         * concat: ``pca`` applied to the column-wise stack of all datasets
           with ``self.input_dim`` components.
         * single: latent columns are divided into ``len(Ylist)`` contiguous
           groups via ``numpy.array_split``; each group is filled from the
           ``pca`` of the corresponding dataset.
         * random: values drawn with ``numpy.random.randn``.
     :param likelihood_list: iterable of numpy arrays and/or objects carrying
         their data in a ``.Y`` attribute; ``self.likelihood_list`` is used
         when omitted.
     :returns: the initial latent matrix; it has ``self.input_dim`` columns
         and the row count of the first dataset.
     """
     if likelihood_list is None:
         likelihood_list = self.likelihood_list
     Ylist = []
     for item in likelihood_list:
         # isinstance also accepts ndarray subclasses as raw data; an exact
         # ``type(...) is`` check would wrongly look for ``.Y`` on them.
         if isinstance(item, numpy.ndarray):
             Ylist.append(item)
         else:
             Ylist.append(item.Y)
     # NOTE: substring matching is deliberate here -- the default
     # ``init='PCA'`` depends on it.
     if init in "PCA_concat":
         X = pca(numpy.hstack(Ylist), self.input_dim)[0]
     elif init in "PCA_single":
         X = numpy.zeros((Ylist[0].shape[0], self.input_dim))
         column_groups = numpy.array_split(
             numpy.arange(self.input_dim), len(Ylist))
         # zip replaces itertools.izip, which is unavailable on Python 3.
         for qs, Y in zip(column_groups, Ylist):
             X[:, qs] = pca(Y, len(qs))[0]
     else:  # init == 'random'
         X = numpy.random.randn(Ylist[0].shape[0], self.input_dim)
     self.X = X
     return X
Beispiel #4
0
def create_model(Y, X_init=None, num_inducing=10, nonlinear_dims=5, linear_dims=0):
    """
    Create a BayesianGPLVM model for the expression values in Y.

    Y has the cells on the rows and the genes across the columns:
        Y.shape == (#cells, #genes)

    X_init is the initial latent space for the model.
    Usually it is initialized using simulation.run_methods:
        X_init, dims = run_methods(Y, methods)

    num_inducing is the number of inducing inputs. It is a number `M`
    between `0` and the number of datapoints you have, and it controls
    the complexity of your model. We usually use 10 to 20 inducing
    inputs, but if you are having trouble with the accuracy of the
    landscape found, you can try increasing this number. Note that the
    method gets slower with higher numbers of inducing inputs. Also, if
    you use RNASeq data, it is recommended to use a lower number
    (i.e. 10) of inducing inputs so the BayesianGPLVM is forced to
    generalise over patterns and cannot explain the zeros in the data
    by inducing inputs.

    nonlinear_dims is the number of latent dimensions modelled as a
    nonlinear relationship between the latent space and the observed gene
    expression values along the samples. This value is ignored if X_init
    is given; the number of nonlinear dimensions is then the number of
    columns of X_init. If X_init is not given, it is created by PCA.

    linear_dims is the number of linear dimensions to add to the latent
    space. Linear dimensions are used for modelling linear relationships
    in the latent space independently of the non-linear ones. That is, the
    last linear_dims dimensions of the latent space are modelled by a
    linear kernel and are initialized by PCA of Y. We recommend first
    running without linear dimensions to see what the BayesianGPLVM can
    learn. If there is a considerable amount of confounding variation,
    the linear dimensions can help to find this variation and explain it
    away from the rest. It can also lead to unexpected results...

    Missing Data: If you have missing data, you can set the missing
    values in Y to np.nan and the BayesianGPLVM will assume data missing
    at random for those. This will include the dimensionality in the
    runtime of the method and will slow down progress significantly. Thus,
    only include missing data in the model if you are certain you want to
    use it.

    Usage example:

        from .simulation import run_methods
        Y -= Y.mean(0) # Normalization of data, zero mean is usually what you want.
        Y /= Y.std(0) # Beware of your data and decide whether you want to normalize the variances!
        X_init, dims = run_methods(Y, methods)
        m = create_model(Y, X_init, num_inducing=10)
        optimize_model(m)

    returns a BayesianGPLVM model for the given data matrix Y.
    """
    from GPy.models.bayesian_gplvm_minibatch import BayesianGPLVMMiniBatch
    from GPy.kern import Linear, RBF, Add
    from GPy.util.linalg import pca

    try:
        # Containers exposing their array as ``.values`` (e.g. pandas
        # objects) are unwrapped; the copy protects the caller's data.
        Y = Y.values.copy()
    except AttributeError:
        # Anything without ``.values`` is coerced to a float array. Only
        # AttributeError is caught so that unrelated failures (and
        # KeyboardInterrupt/SystemExit) are not silently swallowed.
        Y = np.asarray(Y, float).copy()

    if X_init is None:
        X_init = pca(Y, nonlinear_dims)[0]

    n_nonlinear = X_init.shape[1]
    if linear_dims > 0:
        Q = n_nonlinear + linear_dims
        # Append PCA-initialized columns for the linear part so X has Q columns.
        X = np.c_[X_init, pca(Y, linear_dims)[0]]
        kernel = Add(
            [
                RBF(n_nonlinear, ARD=True, active_dims=np.arange(0, n_nonlinear)),
                Linear(linear_dims, ARD=True, active_dims=np.arange(n_nonlinear, Q)),
            ]
        )
    else:
        Q = n_nonlinear
        X = X_init
        kernel = RBF(Q, ARD=True, active_dims=np.arange(0, n_nonlinear))

    m = BayesianGPLVMMiniBatch(
        Y,
        Q,
        X=X,
        kernel=kernel,
        num_inducing=num_inducing,
        # NaN entries in Y switch on the model's missing-data handling.
        missing_data=np.any(np.isnan(Y)),
    )

    return m
Beispiel #5
0
def create_model(Y,
                 X_init=None,
                 num_inducing=10,
                 nonlinear_dims=5,
                 linear_dims=0,
                 white_variance=1):
    """
    Create a BayesianGPLVM model for the expression values in Y.

    Y has the cells on the rows and the genes across the columns:
        Y.shape == (#cells, #genes)

    X_init is the initial latent space for the model.
    Usually it is initialized using simulation.run_methods:
        X_init, dims = run_methods(Y, methods)

    num_inducing is the number of inducing inputs. It is a number `M`
    between `0` and the number of datapoints you have, and it controls
    the complexity of your model. We usually use 10 to 20 inducing
    inputs, but if you are having trouble with the accuracy of the
    landscape found, you can try increasing this number. Note that the
    method gets slower with higher numbers of inducing inputs. Also, if
    you use RNASeq data, it is recommended to use a lower number
    (i.e. 10) of inducing inputs so the BayesianGPLVM is forced to
    generalise over patterns and cannot explain the zeros in the data
    by inducing inputs.

    nonlinear_dims is the number of latent dimensions modelled as a
    nonlinear relationship between the latent space and the observed gene
    expression values along the samples. This value is ignored if X_init
    is given; the number of nonlinear dimensions is then the number of
    columns of X_init. If X_init is not given, it is created by PCA.

    linear_dims is the number of linear dimensions to add to the latent
    space. Linear dimensions are used for modelling linear relationships
    in the latent space independently of the non-linear ones. That is, the
    last linear_dims dimensions of the latent space are modelled by a
    linear kernel and are initialized by PCA of Y. We recommend first
    running without linear dimensions to see what the BayesianGPLVM can
    learn. If there is a considerable amount of confounding variation,
    the linear dimensions can help to find this variation and explain it
    away from the rest. It can also lead to unexpected results...

    white_variance is the variance (float) of a white-noise kernel added to
    the kernel. If it is None, no white kernel is added.

    Missing Data: If you have missing data, you can set the missing
    values in Y to np.nan and the BayesianGPLVM will assume data missing
    at random for those. This will include the dimensionality in the
    runtime of the method and will slow down progress significantly. Thus,
    only include missing data in the model if you are certain you want to
    use it.

    Usage example:

        from .simulation import run_methods
        Y -= Y.mean(0) # Normalization of data, zero mean is usually what you want.
        Y /= Y.std(0) # Beware of your data and decide whether you want to normalize the variances!
        X_init, dims = run_methods(Y, methods)
        m = create_model(Y, X_init, num_inducing=10)
        optimize_model(m)

    returns a BayesianGPLVM model for the given data matrix Y.
    """
    from GPy.models.bayesian_gplvm_minibatch import BayesianGPLVMMiniBatch
    from GPy.kern import Linear, RBF, Add, White
    from GPy.util.linalg import pca

    try:
        # Containers exposing their array as ``.values`` (e.g. pandas
        # objects) are unwrapped; the copy protects the caller's data.
        Y = Y.values.copy()
    except AttributeError:
        # Anything without ``.values`` is coerced to a float array. Only
        # AttributeError is caught so that unrelated failures (and
        # KeyboardInterrupt/SystemExit) are not silently swallowed.
        Y = np.asarray(Y, float).copy()

    if X_init is None:
        X_init = pca(Y, nonlinear_dims)[0]

    n_nonlinear = X_init.shape[1]
    kernels = []

    if linear_dims > 0:
        Q = n_nonlinear + linear_dims
        # The Linear kernel acts on columns n_nonlinear..Q-1, so the initial
        # latent space must actually contain them: append PCA-initialized
        # columns so that X has Q columns, matching the declared input
        # dimension and the kernels' active_dims.
        X = np.c_[X_init, pca(Y, linear_dims)[0]]
        kernels.extend([
            RBF(n_nonlinear, ARD=True, active_dims=np.arange(0, n_nonlinear)),
            Linear(linear_dims, ARD=True,
                   active_dims=np.arange(n_nonlinear, Q)),
        ])
    else:
        Q = n_nonlinear
        X = X_init
        kernels.append(
            RBF(Q, ARD=True, active_dims=np.arange(0, n_nonlinear)))

    if white_variance is not None:
        kernels.append(White(Q, variance=white_variance))

    # Only wrap in Add when there is something to combine.
    kernel = Add(kernels) if len(kernels) > 1 else kernels[0]

    m = BayesianGPLVMMiniBatch(Y,
                               Q,
                               X=X,
                               kernel=kernel,
                               num_inducing=num_inducing,
                               # NaN entries in Y switch on the model's
                               # missing-data handling.
                               missing_data=np.any(np.isnan(Y)))

    return m