def test_kernels(self):
    """Run the ``_test_*`` helper checks against a fixed set of ARD kernels.

    Every kernel is built with the column count of ``self.Z`` as its input
    dimension, randomized, and then handed to ``_test_kernel_param``,
    ``_test_Z`` and ``_test_qX`` -- first with their default arguments and
    then a second time with ``psi2n=True``.
    """
    from GPy.kern import RBF, Linear, MLP, Bias, White

    input_dim = self.Z.shape[1]

    # Three plain kernels followed by three additive combinations.
    candidates = [
        RBF(input_dim, ARD=True),
        Linear(input_dim, ARD=True),
        MLP(input_dim, ARD=True),
        RBF(input_dim, ARD=True)
        + Linear(input_dim, ARD=True)
        + Bias(input_dim)
        + White(input_dim),
        RBF(input_dim, ARD=True) + Bias(input_dim) + White(input_dim),
        Linear(input_dim, ARD=True) + Bias(input_dim) + White(input_dim),
    ]

    for kern in candidates:
        kern.randomize()
        # First pass passes no extra keywords (helper defaults apply);
        # second pass repeats the same three checks with psi2n=True.
        for extra in ({}, {"psi2n": True}):
            self._test_kernel_param(kern, **extra)
            self._test_Z(kern, **extra)
            self._test_qX(kern, **extra)
def get_data(kernel_name, variance_value=1.0, n_traces=3, lengthscale=1.0):
    """Build animated Gaussian-process traces for the kernel called *kernel_name*.

    The kernel is evaluated on 100 evenly spaced points in ``[0, 10]`` and the
    resulting matrix is handed to ``GaussianProcessAnimation`` (20 frames).

    Parameters
    ----------
    kernel_name : str
        One of "RBF", "Brownian", "Matern32", "Cosine", "Exponential",
        "Linear", "GridRBF", "MLP", "PeriodicMatern32", "Spline", "White",
        "StdPeriodic".
    variance_value : float
        Passed as ``variance`` to every kernel except "Linear", which is
        built with ``input_dim`` only.
    n_traces : int
        Forwarded to ``GaussianProcessAnimation.get_traces``.
    lengthscale : float
        Only used by the "RBF" kernel; the other kernels ignore it.

    Returns
    -------
    The traces stacked with ``np.stack`` and transposed with axis order
    ``(2, 0, 1)``, i.e. the last axis of the stacked array comes first.

    Raises
    ------
    ValueError
        If *kernel_name* matches none of the names listed above.
    """
    n_dims = 100
    n_frames = 20
    x = np.linspace(0, 10, n_dims)[:, np.newaxis]

    # Ordered (name, factory) pairs. The factories take no arguments and are
    # only called for the matching name, so a kernel class is looked up only
    # when it is actually requested.
    factories = (
        ("RBF", lambda: RBF(input_dim=1, variance=variance_value,
                            lengthscale=lengthscale)),
        ("Brownian", lambda: Brownian(input_dim=1, variance=variance_value)),
        ("Matern32", lambda: Matern32(input_dim=1, variance=variance_value)),
        ("Cosine", lambda: Cosine(input_dim=1, variance=variance_value)),
        ("Exponential", lambda: Exponential(input_dim=1,
                                            variance=variance_value)),
        ("Linear", lambda: Linear(input_dim=1)),
        ("GridRBF", lambda: GridRBF(input_dim=1, variance=variance_value)),
        ("MLP", lambda: MLP(input_dim=1, variance=variance_value)),
        ("PeriodicMatern32", lambda: PeriodicMatern32(input_dim=1,
                                                      variance=variance_value)),
        ("Spline", lambda: Spline(input_dim=1, variance=variance_value)),
        ("White", lambda: White(input_dim=1, variance=variance_value)),
        ("StdPeriodic", lambda: StdPeriodic(input_dim=1,
                                            variance=variance_value)),
    )

    for known_name, build in factories:
        if kernel_name == known_name:
            kernel = build()
            break
    else:
        # Loop finished without a match.
        raise ValueError("Unknown Kernel name")

    covariance = kernel.K(x, x)
    animation = GaussianProcessAnimation(covariance, n_dims=n_dims,
                                         n_frames=n_frames)
    traces = animation.get_traces(n_traces)
    return np.stack(traces).transpose((2, 0, 1))
def create_model(Y, X_init=None, num_inducing=10, nonlinear_dims=5, linear_dims=0, white_variance=1):
    """
    Create a BayesianGPLVM model for the expression values in Y.

    Y has the cells on the rows and the genes across the columns:

        Y.shape == (#cells, #genes)

    X_init is the initial latent space for the model. Usually it is
    initialized by using simulation.run_methods:

        X_init, dims = run_methods(Y, methods)

    num_inducing is the number of inducing inputs. It is a number `M`
    between `0` and the number of datapoints you have, and it controls the
    complexity of your model. We usually use 10 to 20 inducing inputs, but
    if you are having trouble with the accuracy of the landscape found, you
    can try to increase this number. Note that the method gets slower with
    higher numbers of inducing inputs. Also, if you use RNASeq data, it is
    recommended to use a lower number (i.e. 10) of inducing inputs, so that
    the BayesianGPLVM is forced to generalise over patterns and cannot
    explain the zeros in the data by inducing inputs.

    nonlinear_dims is the number of latent dimensions modelled as a
    nonlinear relationship between the latent space and the observed gene
    expression values along the samples. This value is ignored if X_init is
    given; the number of nonlinear dimensions is then the number of
    dimensions in X_init. If X_init is not given, it is created by PCA.

    linear_dims is the number of linear dimensions to add to the latent
    space. Linear dimensions are used for modelling linear relationships in
    the latent space independently from the nonlinear ones. That is, the
    last linear_dims dimensions in the latent space will be modelled by a
    linear kernel. We recommend first running without linear dimensions to
    see what the BayesianGPLVM can learn. If there is a considerable amount
    of confounding variation, the linear dimensions can help to find this
    variation and explain it away from the rest. It can also lead to
    unexpected results...

    white_variance is the variance (float) of a white noise kernel added to
    the kernel. If it is None, no white kernel is added to the analysis.

    Missing Data:

        If you have missing data, set the missing values in Y to np.nan and
        the BayesianGPLVM will assume data missing at random over those.
        This brings the dimensionality into the runtime of the method and
        will slow down progress significantly. Thus, only include missing
        data in the model if you are certain you want to use it.

    Usage example:

        from .simulation import run_methods

        Y -= Y.mean(0)  # Normalization of data, zero mean is usually what you want.
        Y /= Y.std(0)   # Beware of your data and decide whether you want to normalize the variances!

        X_init, dims = run_methods(Y, methods)
        m = create_model(Y, X_init, num_inducing=10)
        optimize_model(m)

    returns a BayesianGPLVM model for the given data matrix Y.
    """
    from GPy.models.bayesian_gplvm_minibatch import BayesianGPLVMMiniBatch
    from GPy.kern import Linear, RBF, Add, White
    from GPy.util.linalg import pca

    # Inputs exposing ``.values`` (e.g. a DataFrame) are unwrapped; anything
    # else is coerced to a float array. Only a missing attribute counts as
    # "no .values" -- a bare ``except`` here would also swallow unrelated
    # failures such as KeyboardInterrupt.
    try:
        Y = Y.values.copy()
    except AttributeError:
        Y = np.asarray(Y, float).copy()

    if X_init is None:
        X_init = pca(Y, nonlinear_dims)[0]

    n_nonlinear = X_init.shape[1]
    kernels = []

    if linear_dims > 0:
        # RBF acts on the leading columns, Linear on the trailing
        # ``linear_dims`` columns of the latent space.
        # NOTE(review): Q is larger than X_init.shape[1] in this branch while
        # X_init is passed to the model unchanged -- confirm that
        # BayesianGPLVMMiniBatch accepts an X with fewer columns than Q.
        Q = n_nonlinear + linear_dims
        kernels.extend([
            RBF(n_nonlinear, ARD=True, active_dims=np.arange(0, n_nonlinear)),
            Linear(linear_dims, ARD=True, active_dims=np.arange(n_nonlinear, Q)),
        ])
    else:
        Q = n_nonlinear
        kernels.append(
            RBF(Q, ARD=True, active_dims=np.arange(0, n_nonlinear)))

    if white_variance is not None:
        kernels.append(White(Q, variance=white_variance))

    # A single kernel is used directly; several are summed.
    if len(kernels) > 1:
        kernel = Add(kernels)
    else:
        kernel = kernels[0]

    # Missing-data handling is switched on only when Y contains NaN.
    m = BayesianGPLVMMiniBatch(Y, Q, X=X_init,
                               kernel=kernel,
                               num_inducing=num_inducing,
                               missing_data=np.any(np.isnan(Y)))

    return m