def test_kernels(self):
    from GPy.kern import RBF, Linear, MLP, Bias, White
    Q = self.Z.shape[1]
    kernels = [
        RBF(Q, ARD=True),
        Linear(Q, ARD=True),
        MLP(Q, ARD=True),
        RBF(Q, ARD=True) + Linear(Q, ARD=True) + Bias(Q) + White(Q),
        RBF(Q, ARD=True) + Bias(Q) + White(Q),
        Linear(Q, ARD=True) + Bias(Q) + White(Q),
    ]
    for k in kernels:
        k.randomize()
        self._test_kernel_param(k)
        self._test_Z(k)
        self._test_qX(k)
        self._test_kernel_param(k, psi2n=True)
        self._test_Z(k, psi2n=True)
        self._test_qX(k, psi2n=True)
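# For context, a minimal sketch of the fixture this test method assumes; the
# real test class defines self.Z and the _test_* helpers elsewhere, and the
# class name and shapes below are hypothetical.
import numpy as np
import unittest

class PsiStatisticsTest(unittest.TestCase):
    def setUp(self):
        rng = np.random.RandomState(0)
        self.Z = rng.randn(20, 3)  # 20 inducing points in a Q=3 input space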
import numpy as np
from GPy.kern import (RBF, Brownian, Matern32, Cosine, Exponential, Linear,
                      GridRBF, MLP, PeriodicMatern32, Spline, White, StdPeriodic)


def get_data(kernel_name, variance_value=1.0, n_traces=3, lengthscale=1.0):
    n_dims = 100
    n_frames = 20
    x = np.linspace(0, 10, n_dims)[:, np.newaxis]

    # Kernels that only take (input_dim, variance); RBF and Linear are special-cased.
    kernel_classes = {
        "Brownian": Brownian,
        "Matern32": Matern32,
        "Cosine": Cosine,
        "Exponential": Exponential,
        "GridRBF": GridRBF,
        "MLP": MLP,
        "PeriodicMatern32": PeriodicMatern32,
        "Spline": Spline,
        "White": White,
        "StdPeriodic": StdPeriodic,
    }
    if kernel_name == "RBF":
        kernel = RBF(input_dim=1, variance=variance_value, lengthscale=lengthscale)
    elif kernel_name == "Linear":
        kernel = Linear(input_dim=1)  # Linear takes no 'variance' argument; keep defaults
    elif kernel_name in kernel_classes:
        kernel = kernel_classes[kernel_name](input_dim=1, variance=variance_value)
    else:
        raise ValueError("Unknown kernel name: {}".format(kernel_name))

    kernel_matrix = kernel.K(x, x)
    gaussian_process_animation = GaussianProcessAnimation(
        kernel_matrix, n_dims=n_dims, n_frames=n_frames)
    frames = gaussian_process_animation.get_traces(n_traces)
    data = np.stack(frames).transpose((2, 0, 1))
    return data
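# Usage sketch: GaussianProcessAnimation is assumed to be this repo's helper
# class (it is not defined in this snippet), and the exact output shape
# depends on what its get_traces returns before the transpose((2, 0, 1)).
if __name__ == "__main__":
    data = get_data("RBF", variance_value=1.0, n_traces=3, lengthscale=2.0)
    print(data.shape)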
import warnings
from GPy.kern import RBF, Matern52, Linear


def _init_kernel_function(self, kern_types=None, hyp=None):
    """Initialize the GPy kernel for each output dimension from its type
    name, checking that the kernel type is supported.

    Parameters
    ----------
    kern_types: n_s_out x 0 array_like[str]
        The names of the kernels for each output dimension
    hyp: list[dict], optional
        Per-dimension hyperparameter values to set and fix on the kernel

    Side effects
    ------------
    Sets self.base_kerns (list of GPy.kern.Kern) and self.kern_types.
    """
    input_dim = self.n_s_in + self.n_u
    kerns = [None] * self.n_s_out
    if hyp is None:
        hyp = [None] * self.n_s_out
    warnings.warn(
        "Changed the kernel structure from the cdc paper implementation, "
        "see old structure commented out")
    """
    if kern_types[i] == "rbf":
        kern_i = RBF(input_dim, ARD = True)
    elif kern_types[i] == "lin_rbf":
        kern_i = Linear(1,active_dims = [1])*RBF(1,active_dims=[1]) + Linear(input_dim,ARD=True)
    elif kern_types[i] == "lin_mat52":
        kern_i = Linear(1,active_dims = [1])*Matern52(1,active_dims=[1]) + Linear(input_dim,ARD=True)
    else:
    """
    if kern_types is None:
        kern_types = [None] * self.n_s_out
        for i in range(self.n_s_out):
            kern_types[i] = "rbf"
            kerns[i] = RBF(input_dim, ARD=True)
    else:
        for i in range(self.n_s_out):
            hyp_i = hyp[i]
            if kern_types[i] == "rbf":
                kern_i = RBF(input_dim, ARD=True)
            elif kern_types[i] == "mat52":
                kern_i = Matern52(input_dim, ARD=True)
            elif kern_types[i] == "lin_rbf":
                kern_i = Linear(input_dim) * RBF(input_dim) \
                    + Linear(input_dim, ARD=True)
            elif kern_types[i] == "lin_mat52":
                kern_i = Linear(input_dim) * Matern52(input_dim) \
                    + Linear(input_dim, ARD=True)
            else:
                raise ValueError(
                    "kernel type '{}' not supported".format(kern_types[i]))
            if hyp_i is not None:
                for k, v in list(hyp_i.items()):
                    try:
                        rsetattr(kern_i, k, v)
                        kern_hyp = rgetattr(kern_i, k)
                        kern_hyp.fix()
                    except Exception:
                        warnings.warn(
                            "Cannot set and fix hyperparameter: {}".format(k))
            kerns[i] = kern_i
    self.base_kerns = kerns
    self.kern_types = kern_types
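# The hyperparameter loop above relies on rsetattr/rgetattr to follow dotted
# parameter paths such as "rbf.lengthscale". They are defined elsewhere in
# the repo; a common recipe (a sketch, assuming no extra behavior) is:
import functools

def rgetattr(obj, attr, *args):
    # getattr that follows dotted paths, e.g. rgetattr(kern, "rbf.lengthscale")
    def _getattr(obj, name):
        return getattr(obj, name, *args)
    return functools.reduce(_getattr, [obj] + attr.split("."))

def rsetattr(obj, attr, val):
    # setattr that follows dotted paths: resolve the parent, then set the leaf
    pre, _, post = attr.rpartition(".")
    return setattr(rgetattr(obj, pre) if pre else obj, post, val)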
def create_model(Y, X_init=None, num_inducing=10, nonlinear_dims=5,
                 linear_dims=0, white_variance=1):
    """
    Create a BayesianGPLVM model for the expression values in Y.

    Y has the cells on the rows and genes across dimensions:
    Y.shape == (#cells, #genes)

    X_init is the initial latent space for the model. Usually it is
    initialized with simulation.run_methods:

        X_init, dims = run_methods(Y, methods)

    num_inducing is the number of inducing inputs: a number M between 0 and
    the number of datapoints, which controls the complexity of the model.
    We usually use 10 to 20 inducing inputs, but if the accuracy of the
    found landscape is unsatisfying, you can try increasing this number.
    Note that the method slows down as the number of inducing inputs grows.
    Also, for RNASeq data it is recommended to use a lower number (e.g. 10)
    of inducing inputs, so the BayesianGPLVM is forced to generalise over
    patterns and cannot explain the zeros in the data by inducing inputs.

    nonlinear_dims is the number of latent dimensions modelled as a
    nonlinear relationship between the latent space and the observed gene
    expression values along the samples. This value is ignored if X_init is
    given; the number of nonlinear dimensions is then the number of
    dimensions in X_init. If X_init is not given, it is created by PCA.

    linear_dims is the number of linear dimensions to add to the latent
    space. Linear dimensions model linear relationships in the latent space
    independently of the nonlinear ones; that is, the last linear_dims
    dimensions of the latent space are modelled by a linear kernel. We
    recommend first running without linear dimensions and seeing what the
    BayesianGPLVM can learn. If there is a considerable amount of
    confounding variation, a linear dimension can help to find this
    variation and explain it away from the rest. It can also lead to
    unexpected results...

    white_variance is the variance (float) of an optional white-noise term
    added to the kernel. If it is None, no white variance kernel is added
    to the analysis.

    Missing data: if you have missing data, set the missing values in Y to
    np.nan and the BayesianGPLVM will assume they are missing at random.
    This adds the output dimensionality to the runtime of the method and
    slows down progress significantly, so only include missing data in the
    model if you are certain you want to use it.

    Usage example:

        from .simulation import run_methods
        Y -= Y.mean(0)  # Normalization of data, zero mean is usually what you want.
        Y /= Y.std(0)   # Beware of your data and decide whether you want to normalize the variances!
        X_init, dims = run_methods(Y, methods)
        m = create_model(Y, X_init, num_inducing=10)
        optimize_model(m)

    Returns a BayesianGPLVM model for the given data matrix Y.
""" from GPy.models.bayesian_gplvm_minibatch import BayesianGPLVMMiniBatch from GPy.kern import Linear, RBF, Add, White from GPy.util.linalg import pca try: Y = Y.values.copy() except: Y = np.asarray(Y, float).copy() if X_init is None: X_init = pca(Y, nonlinear_dims)[0] kernels = [] if linear_dims > 0: Qlin = linear_dims Q = X_init.shape[1] + Qlin kernels.extend([ RBF(Q - Qlin, ARD=True, active_dims=np.arange(0, X_init.shape[1])), Linear(Qlin, ARD=True, active_dims=np.arange(X_init.shape[1], Q)) ]) else: Q = X_init.shape[1] kernels.append( RBF(Q, ARD=True, active_dims=np.arange(0, X_init.shape[1]))) if white_variance is not None: kernels.append(White(Q, variance=white_variance)) if len(kernels) > 1: kernel = Add(kernels) else: kernel = kernels[0] m = BayesianGPLVMMiniBatch(Y, Q, X=X_init, kernel=kernel, num_inducing=num_inducing, missing_data=np.any(np.isnan(Y))) return m
import numpy as np
from GPy.kern import RBF, Linear, Bias
from GPy.models import GPRegression, WarpedGP


def gp_on_fold(feature_sets, train, test, y, y_all, learn_options):
    # WeightedDegree is this repo's custom string kernel over the guide sequences.
    sequences = np.array([str(x) for x in y_all.index.get_level_values(0).tolist()])
    kern = WeightedDegree(
        1, sequences, d=learn_options["kernel degree"], active_dims=[0]
    )
    X = np.arange(len(train))[:, None]
    current_dim = 1

    # (feature set, kernel class, ARD, kernel name); columns are appended in
    # this order. gc_count is assumed single-column, as in the original
    # unrolled code, where its RBF covered exactly one dimension.
    feature_kernels = [
        ("gc_count", RBF, False, "GC_rbf"),
        ("drug", Linear, False, "drug_lin"),
        ("gene effect", Linear, False, "gene_lin"),
        ("Percent Peptide", RBF, False, "percent_pept"),
        ("Nucleotide cut position", RBF, False, "nucleo_cut"),
        ("Strand effect", Linear, False, "strand"),
        ("NGGX", Linear, False, "NGGX"),
        ("TM", RBF, True, "TM"),
        ("gene features", Linear, True, "genefeat"),
    ]
    for feature, kernel_cls, ard, name in feature_kernels:
        if feature not in feature_sets:
            continue
        values = feature_sets[feature].values
        Q = values.shape[1]
        kern += kernel_cls(
            Q, ARD=ard,
            active_dims=list(range(current_dim, current_dim + Q)),
            name=name,
        )
        X = np.concatenate((X, values), axis=1)
        current_dim += Q
        if X.shape[1] != current_dim:
            raise AssertionError("incorrect number of columns")
    kern += Bias(X.shape[1])

    if learn_options["warpedGP"]:
        m = WarpedGP(X[train], y[train], kernel=kern)
    else:
        m = GPRegression(X[train], y[train], kernel=kern)
    m.optimize_restarts(3)
    y_pred, _ = m.predict(X[test])
    # TODO: add offset such that low scores are around 0 (not -4 or so)
    return y_pred, m[:]
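# The pattern above -- one additive kernel per feature block, each restricted
# to its own columns via active_dims -- in a standalone toy example:
if __name__ == "__main__":
    rng = np.random.RandomState(0)
    X_demo = rng.randn(50, 3)
    y_demo = np.sin(X_demo[:, :1]) + 0.5 * X_demo[:, 1:2] + 0.1 * rng.randn(50, 1)

    # RBF acts on column 0, Linear on columns 1-2; their sum covers all of X.
    k = RBF(1, active_dims=[0], name="rbf0") + Linear(2, active_dims=[1, 2], name="lin12")
    m_demo = GPRegression(X_demo, y_demo, kernel=k)
    m_demo.optimize()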
def single_model(args):
    import os
    import h5py
    import numpy as np
    import dill as pickle
    from utils import read_hdf5_dataset, prepare_output_file, read_hdf5_single
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import r2_score, mean_squared_error
    from tqdm import tqdm

    logger.info('read phenotypes from file: ' + args.phenotype_file)
    phenotypes = read_hdf5_dataset(args.phenotype_file)
    logger.info('read genotypes from file: ' + args.genotype_file)
    X = read_hdf5_dataset(args.genotype_file)
    if args.transpose_x:
        logger.info('transpose X')
        X = X.T
    y = phenotypes
    if args.feature_indices_file:
        logger.info('read feature indices from: ' + args.feature_indices_file)
        feature_indices = read_hdf5_dataset(args.feature_indices_file)
        X = np.take(X, feature_indices, axis=1)
    if args.normalize_x:
        logger.info('normalize X')
        X = StandardScaler().fit_transform(X)
    if args.sample_indices_file:
        logger.info('read sample indices from: ' + args.sample_indices_file)
        sample_indices = read_hdf5_dataset(args.sample_indices_file)
    else:
        sample_indices = np.nonzero(~np.isnan(phenotypes))[0]
    X_train = X[sample_indices]
    y_train = y[sample_indices]
    logger.info('read parent table from file: ' + args.parent_table_file)
    parent_table = read_hdf5_single(args.parent_table_file)
    logger.info('use model ' + args.model_name)
    logger.info('X.shape = %s, y.shape = %s' % (repr(X.shape), repr(y.shape)))

    if args.model_name == 'ridge':
        from sklearn.linear_model import Ridge
        model = Ridge(alpha=10000)
        model.fit(X_train, y_train)
        y_pred = np.ravel(model.predict(X))
        y_pred_train = y_pred[sample_indices]
    elif args.model_name == 'ridge_cv':
        from sklearn.linear_model import Ridge
        alphas = 10.0 ** np.arange(1, 6)
        train_masks, test_masks = generate_cv_masks(
            sample_indices, parent_table, k_female=5, k_male=5)
        cv_metrics = {}
        cv_metrics['mse'] = np.zeros((len(alphas), train_masks.shape[0]))
        cv_metrics['r2'] = np.zeros((len(alphas), train_masks.shape[0]))
        pbar = tqdm(total=len(alphas) * train_masks.shape[0])
        for i, alpha in enumerate(alphas):
            for j in range(train_masks.shape[0]):
                model = Ridge(alpha=alpha)
                model.fit(X[train_masks[j]], y[train_masks[j]])
                y_pred = model.predict(X[test_masks[j]])
                cv_metrics['mse'][i, j] = mean_squared_error(
                    y[test_masks[j]], y_pred)
                cv_metrics['r2'][i, j] = r2_score(y[test_masks[j]], y_pred)
                pbar.update(1)
        pbar.close()
        best_alpha = alphas[cv_metrics['r2'].mean(axis=1).argmax()]
        logger.info('optimized alpha = %f' % best_alpha)
        model = Ridge(alpha=best_alpha)
        model.fit(X_train, y_train)
        y_pred = np.ravel(model.predict(X))
        y_pred_train = y_pred[sample_indices]
    elif args.model_name == 'gpr':
        from sklearn.gaussian_process import GaussianProcessRegressor
        from sklearn.gaussian_process.kernels import WhiteKernel, RBF
        kernel = RBF() + WhiteKernel()
        model = GaussianProcessRegressor(kernel=kernel)
        model.fit(X_train, y_train)
        logger.info('kernel params: %s' % repr(model.get_params()))
        y_pred_train = np.ravel(model.predict(X_train))
        y_pred = np.ravel(model.predict(X))
    elif args.model_name == 'gpy':
        from GPy.kern import Linear
        from GPy.models import GPRegression
        # note: input_dim should match the number of feature columns in X_train
        kernel = Linear(input_dim=2, name='linear')
        # GPy expects a 2-D target array
        model = GPRegression(X_train, y_train.reshape(-1, 1), kernel=kernel)
        model.optimize()
        y_pred = np.ravel(model.predict(X)[0])
        y_pred_train = y_pred[sample_indices]
    else:
        raise ValueError('unknown model name: ' + args.model_name)
    logger.info('r2 score = %f' % r2_score(y_train, y_pred_train))

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    model_file = os.path.join(args.output_dir, 'model')
    logger.info('save model file: ' + model_file)
    with open(model_file, 'wb') as f:
        pickle.dump(model, f)

    pred_file = os.path.join(args.output_dir, 'predictions')
    logger.info('save predictions to file: ' + pred_file)
    with h5py.File(pred_file, 'w') as f:
        if args.output_residuals:
            f.create_dataset('residual', data=(y - y_pred))
        f.create_dataset('y_true', data=y)
        f.create_dataset('y_pred', data=y_pred)
        f.create_dataset('y_pred_train', data=y_pred_train)
        f.create_dataset('indices_train', data=sample_indices)
        if args.model_name == 'ridge_cv':
            f.create_dataset('alpha', data=alphas)
            g = f.create_group('cv_metrics')
            for key in cv_metrics.keys():
                g.create_dataset(key, data=cv_metrics[key])
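# Reading the saved predictions back (a sketch mirroring the datasets written
# above; the path is hypothetical -- it is args.output_dir + '/predictions').
if __name__ == "__main__":
    import h5py
    with h5py.File('output_dir/predictions', 'r') as f:
        y_pred = f['y_pred'][:]
        y_pred_train = f['y_pred_train'][:]
        train_indices = f['indices_train'][:]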