Esempio n. 1
0
    def test_kernels(self):
        """Run the kernel-parameter, Z and q(X) checks on several kernels.

        Every kernel takes its input dimension from the number of columns of
        ``self.Z``. Each kernel is randomized and then handed to the three
        ``_test_*`` helpers, first with their default arguments and then a
        second time with ``psi2n=True``.
        """
        from GPy.kern import RBF, Linear, MLP, Bias, White

        input_dim = self.Z.shape[1]
        candidates = [
            RBF(input_dim, ARD=True),
            Linear(input_dim, ARD=True),
            MLP(input_dim, ARD=True),
            (RBF(input_dim, ARD=True) + Linear(input_dim, ARD=True)
             + Bias(input_dim) + White(input_dim)),
            RBF(input_dim, ARD=True) + Bias(input_dim) + White(input_dim),
            Linear(input_dim, ARD=True) + Bias(input_dim) + White(input_dim),
        ]

        checks = (self._test_kernel_param, self._test_Z, self._test_qX)
        for kernel in candidates:
            kernel.randomize()
            # First pass relies on the helpers' defaults, second pass sets
            # psi2n explicitly; within a pass the helpers run in fixed order.
            for extra in ({}, {"psi2n": True}):
                for check in checks:
                    check(kernel, **extra)
Esempio n. 2
0
def get_data(kernel_name, variance_value=1.0, n_traces=3, lengthscale=1.0):
    """Generate animated Gaussian-process traces for a named kernel.

    The selected kernel is evaluated on 100 evenly spaced points in [0, 10];
    the resulting covariance matrix drives a ``GaussianProcessAnimation``
    with 20 frames, from which ``n_traces`` traces are requested.

    Parameters
    ----------
    kernel_name : str
        One of "RBF", "Brownian", "Matern32", "Cosine", "Exponential",
        "Linear", "GridRBF", "MLP", "PeriodicMatern32", "Spline", "White"
        or "StdPeriodic".
    variance_value : float
        Passed as ``variance`` to every kernel except "Linear".
    n_traces : int
        Forwarded to ``GaussianProcessAnimation.get_traces``.
    lengthscale : float
        Only used by the "RBF" kernel.

    Returns
    -------
    numpy.ndarray
        The frames returned by ``get_traces`` stacked along a new first axis
        and then transposed with axis order ``(2, 0, 1)``.

    Raises
    ------
    ValueError
        If ``kernel_name`` is not one of the names listed above.
    """
    n_dims = 100
    n_frames = 20

    grid = np.linspace(0, 10, n_dims)[:, np.newaxis]

    # Ordered (name, factory) pairs. The factories are lambdas so a kernel
    # class is only looked up once its name has actually been selected.
    factories = (
        ("RBF", lambda: RBF(input_dim=1,
                            variance=variance_value,
                            lengthscale=lengthscale)),
        ("Brownian", lambda: Brownian(input_dim=1, variance=variance_value)),
        ("Matern32", lambda: Matern32(input_dim=1, variance=variance_value)),
        ("Cosine", lambda: Cosine(input_dim=1, variance=variance_value)),
        ("Exponential",
         lambda: Exponential(input_dim=1, variance=variance_value)),
        ("Linear", lambda: Linear(input_dim=1)),
        ("GridRBF", lambda: GridRBF(input_dim=1, variance=variance_value)),
        ("MLP", lambda: MLP(input_dim=1, variance=variance_value)),
        ("PeriodicMatern32",
         lambda: PeriodicMatern32(input_dim=1, variance=variance_value)),
        ("Spline", lambda: Spline(input_dim=1, variance=variance_value)),
        ("White", lambda: White(input_dim=1, variance=variance_value)),
        ("StdPeriodic",
         lambda: StdPeriodic(input_dim=1, variance=variance_value)),
    )

    for name, build in factories:
        if kernel_name == name:
            kernel = build()
            break
    else:
        raise ValueError("Unknown Kernel name")

    covariance = kernel.K(grid, grid)
    animation = GaussianProcessAnimation(covariance,
                                         n_dims=n_dims,
                                         n_frames=n_frames)
    traces = animation.get_traces(n_traces)
    return np.stack(traces).transpose((2, 0, 1))
    def _init_kernel_function(self, kern_types=None, hyp=None):
        """ Initialize one GPy kernel per output dimension based on its name.

        Builds a kernel for each of the ``self.n_s_out`` outputs over
        ``self.n_s_in + self.n_u`` input dimensions, optionally sets and fixes
        hyperparameters, and stores the results on the instance.

        Parameters
        ----------
        kern_types: n_s_out x 0 array_like[str], optional
            The names of the kernels for each output dimension. Supported
            names are "rbf", "mat52", "lin_rbf" and "lin_mat52". If None,
            "rbf" is used for every output.
        hyp: n_s_out x 0 array_like[dict], optional
            For each output either None or a dict of hyperparameters. Every
            key/value pair is handed to ``rsetattr(kernel, key, value)`` and
            the attribute read back with ``rgetattr`` is then fixed
            (presumably keys are attribute paths on the kernel; confirm
            against those helpers). Entries that cannot be set or fixed only
            trigger a warning. These are applied for the default "rbf"
            kernels as well.

        Raises
        ------
        ValueError
            If a kernel type name is not supported.

        Notes
        -----
        Nothing is returned. The kernels are stored in ``self.base_kerns``
        and the kernel type names in ``self.kern_types``.
        """

        input_dim = self.n_s_in + self.n_u
        n_out = self.n_s_out

        if hyp is None:
            hyp = [None] * n_out
        if kern_types is None:
            # Default to an ARD RBF kernel for every output. Going through the
            # common loop below means given hyperparameters are applied here
            # too instead of being silently dropped.
            kern_types = ["rbf"] * n_out

        warnings.warn(
            "Changed the kernel structure from the cdc paper implementation, see old structure commented out"
        )
        # Old structure from the cdc paper implementation, kept for reference:
        #   if kern_types[i] == "rbf":
        #       kern_i = RBF(input_dim, ARD = True)
        #   elif kern_types[i] == "lin_rbf":
        #       kern_i = Linear(1,active_dims = [1])*RBF(1,active_dims=[1]) + Linear(input_dim,ARD=True)
        #   elif kern_types[i] == "lin_mat52":
        #       kern_i = Linear(1,active_dims = [1])*Matern52(1,active_dims=[1]) + Linear(input_dim,ARD=True)

        kerns = [None] * n_out
        for i in range(n_out):
            hyp_i = hyp[i]
            kern_type = kern_types[i]

            if kern_type == "rbf":
                kern_i = RBF(input_dim, ARD=True)
            elif kern_type == "mat52":
                kern_i = Matern52(input_dim, ARD=True)
            elif kern_type == "lin_rbf":
                kern_i = Linear(input_dim) * RBF(input_dim) + Linear(
                    input_dim, ARD=True)
            elif kern_type == "lin_mat52":
                kern_i = Linear(input_dim) * Matern52(input_dim) + Linear(
                    input_dim, ARD=True)
            else:
                raise ValueError(
                    "kernel type '{}' not supported".format(kern_type))

            if hyp_i is not None:
                for k, v in hyp_i.items():
                    # Best effort: a hyperparameter that cannot be applied is
                    # reported and skipped. Only ordinary exceptions are
                    # caught so KeyboardInterrupt/SystemExit still propagate.
                    try:
                        rsetattr(kern_i, k, v)
                        kern_hyp = rgetattr(kern_i, k)
                        kern_hyp.fix()
                    except Exception:
                        warnings.warn(
                            "Cannot set and fix hyperparameter: {}".format(k))
            kerns[i] = kern_i

        self.base_kerns = kerns
        self.kern_types = kern_types
Esempio n. 4
0
def create_model(Y,
                 X_init=None,
                 num_inducing=10,
                 nonlinear_dims=5,
                 linear_dims=0,
                 white_variance=1):
    """
    Create a BayesianGPLVM model for the expression values in Y.

    Y has the cells on the rows and genes across dimensions:
        Y.shape == (#cells, #genes)
    Y may be a pandas object (its ``.values`` are copied) or anything
    ``np.asarray(Y, float)`` accepts; the caller's data is never modified.

    X_init is the initial latent space for the model.
    Usually it is initialized by using simulation.run_methods:
        X_init, dims = run_methods(Y, methods)

    num_inducing is the number of inducing inputs. It is a number `M`
    between `0` and the number of datapoints you have and controls
    the complexity of your model. We usually use 10 to 20
    inducing inputs, but if you are having trouble with accuracy in
    your found landscape, you can try to increase this number. Note that
    the method gets slower with higher numbers of inducing inputs.
    Also, if you use RNASeq data, it is recommended to use a
    lower number (i.e. 10) of inducing inputs so the BayesianGPLVM is
    forced to generalise over patterns and cannot explain the zeros in the
    data by inducing inputs.

    nonlinear_dims is the number of latent dimensions modelled as a nonlinear
    relationship between latent space and observed gene expression values
    along the samples. This value is ignored if X_init is given; the number
    of nonlinear dimensions is then the number of columns of X_init. If
    X_init is not given, it is created by PCA with nonlinear_dims components.

    linear_dims is the number of linear dimensions to add to the latent space.
    Linear dimensions are used for modelling linear relationships in the latent
    space independently from the non-linear ones. That is, the last linear_dims
    dimensions in the latent space will be modelled by a linear kernel. We
    recommend to first run without linear dimensions and see what the
    BayesianGPLVM can learn. If there is a considerable amount of confounding
    variation, the linear dimensions can help to find this variation
    and explain it away from the rest. It can also lead to unexpected results...

    white_variance is the variance value (float) of a white noise kernel that
    is added to the kernel. If it is None, no white kernel is added.

    Missing Data: If you have missing data, you can set the missing values
    in Y to np.nan and the BayesianGPLVM will assume missing
    data at random over those. This will include the dimensionality in
    the runtime of the method and will slow down progress significantly. Thus,
    only include missing data in the model if you are certain you want to
    use it.

    Usage example:

        from .simulation import run_methods
        Y -= Y.mean(0) # Normalization of data, zero mean is usually what you want.
        Y /= Y.std(0) # Beware of your data and decide whether you want to normalize the variances!
        X_init, dims = run_methods(Y, methods)
        m = create_model(Y, X_init, num_inducing=10)
        optimize_model(m)

    returns a BayesianGPLVM model for the given data matrix Y.
    """
    from GPy.models.bayesian_gplvm_minibatch import BayesianGPLVMMiniBatch
    from GPy.kern import Linear, RBF, Add, White
    from GPy.util.linalg import pca

    try:
        Y = Y.values.copy()
    except AttributeError:
        # No ``.values`` (or no ``.copy`` on it): treat Y as array-like.
        # Only this case is handled so unrelated errors are not hidden.
        Y = np.asarray(Y, float).copy()

    if X_init is None:
        X_init = pca(Y, nonlinear_dims)[0]

    # The RBF kernel always acts on the columns supplied by X_init.
    n_nonlinear = X_init.shape[1]
    Q = n_nonlinear
    kernels = [
        RBF(n_nonlinear, ARD=True, active_dims=np.arange(0, n_nonlinear))
    ]

    if linear_dims > 0:
        # The linear kernel acts on linear_dims extra latent dimensions that
        # follow the nonlinear ones.
        # NOTE(review): X_init is passed to the model unchanged below, so it
        # has n_nonlinear columns while the model is told there are
        # n_nonlinear + linear_dims latent dimensions. Confirm that
        # BayesianGPLVMMiniBatch handles this as intended.
        Q = n_nonlinear + linear_dims
        kernels.append(
            Linear(linear_dims,
                   ARD=True,
                   active_dims=np.arange(n_nonlinear, Q)))

    if white_variance is not None:
        kernels.append(White(Q, variance=white_variance))

    kernel = Add(kernels) if len(kernels) > 1 else kernels[0]

    m = BayesianGPLVMMiniBatch(Y,
                               Q,
                               X=X_init,
                               kernel=kernel,
                               num_inducing=num_inducing,
                               missing_data=np.any(np.isnan(Y)))

    return m
Esempio n. 5
0
def gp_on_fold(feature_sets, train, test, y, y_all, learn_options):
    """Fit a GP on the training rows of one fold and predict the test rows.

    The design matrix starts with one index column (``0 .. len(train) - 1``)
    that is consumed by a ``WeightedDegree`` kernel built from the stringified
    level-0 index values of ``y_all``. Every recognised entry of
    ``feature_sets`` then appends its ``.values`` columns to the matrix and
    adds a kernel restricted to exactly those columns. A ``Bias`` kernel over
    the full width is added last.

    Recognised ``feature_sets`` keys, in the order they are appended:
    "gc_count", "drug", "gene effect", "Percent Peptide",
    "Nucleotide cut position", "Strand effect", "NGGX", "TM",
    "gene features".

    ``learn_options`` keys read here: "kernel degree" and "warpedGP" (selects
    ``WarpedGP`` instead of ``GPRegression``).

    Returns the predicted means for ``X[test]`` and ``m[:]`` of the fitted
    model (presumably its parameter values; verify against GPy).

    Raises AssertionError if the matrix width disagrees with the running
    column count after a feature set was appended.
    """
    sequences = np.array(
        [str(item) for item in y_all.index.get_level_values(0).tolist()]
    )

    kern = WeightedDegree(
        1, sequences, d=learn_options["kernel degree"], active_dims=[0]
    )
    design = np.arange(len(train))[:, None]
    current_dim = 1

    # "gc_count" is the only feature set that is assumed to be one column.
    if "gc_count" in feature_sets:
        kern += RBF(1, active_dims=[current_dim], name="GC_rbf")
        design = np.concatenate(
            (design, feature_sets["gc_count"].values), axis=1
        )
        current_dim += 1
        if design.shape[1] != current_dim:
            raise AssertionError("incorrect number of columns")

    # (feature key, kernel family, kernel name, extra kernel keyword args)
    multi_column_features = (
        ("drug", "linear", "drug_lin", {}),
        ("gene effect", "linear", "gene_lin", {}),
        ("Percent Peptide", "rbf", "percent_pept", {}),
        ("Nucleotide cut position", "rbf", "nucleo_cut", {}),
        ("Strand effect", "linear", "strand", {}),
        ("NGGX", "linear", "NGGX", {}),
        ("TM", "rbf", "TM", {"ARD": True}),
        ("gene features", "linear", "genefeat", {"ARD": True}),
    )

    for key, family, label, extra in multi_column_features:
        if key not in feature_sets:
            continue
        columns = feature_sets[key].values
        width = columns.shape[1]
        # Resolve the class only for feature sets that are actually present.
        kernel_cls = RBF if family == "rbf" else Linear
        kern += kernel_cls(
            width,
            active_dims=range(current_dim, current_dim + width),
            name=label,
            **extra
        )
        design = np.concatenate((design, columns), axis=1)
        current_dim += width
        if design.shape[1] != current_dim:
            raise AssertionError("incorrect number or columns")

    kern += Bias(design.shape[1])

    model_cls = WarpedGP if learn_options["warpedGP"] else GPRegression
    m = model_cls(design[train], y[train], kernel=kern)

    m.optimize_restarts(3)
    y_pred, _ = m.predict(design[test])

    # TODO add offset such that low scores are around 0 (not -4 or so)

    return y_pred, m[:]
Esempio n. 6
0
def single_model(args):
    """Fit one regression model to genotype/phenotype data and save it.

    Steps:
      1. Read phenotypes (``y``) and genotypes (``X``) from HDF5 files;
         optionally transpose ``X``, select feature columns and standardize.
      2. Pick the training rows: either the indices stored in
         ``args.sample_indices_file`` or all rows whose phenotype is not NaN.
      3. Fit the model named by ``args.model_name``: 'ridge', 'ridge_cv'
         (alpha chosen by mean cross-validated r2), 'gpr' (scikit-learn) or
         'gpy' (GPy).
      4. Pickle the model to ``<output_dir>/model`` and write the predictions
         to the HDF5 file ``<output_dir>/predictions``.

    Attributes read from ``args``: phenotype_file, genotype_file,
    transpose_x, feature_indices_file, normalize_x, sample_indices_file,
    parent_table_file, model_name, output_dir, output_residuals.

    Raises
    ------
    ValueError
        If ``args.model_name`` is not one of the supported names. This is
        only detected after all input files have been read.
    """
    import h5py
    import numpy as np
    import dill as pickle
    from utils import read_hdf5_dataset, read_hdf5_single
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import r2_score, mean_squared_error
    from tqdm import tqdm

    logger.info('read phenotypes from file: %s', args.phenotype_file)
    phenotypes = read_hdf5_dataset(args.phenotype_file)
    logger.info('read genotypes from file: %s', args.genotype_file)
    X = read_hdf5_dataset(args.genotype_file)
    if args.transpose_x:
        logger.info('transpose X')
        X = X.T
    y = phenotypes
    if args.feature_indices_file:
        logger.info('read feature indices from: %s', args.feature_indices_file)
        feature_indices = read_hdf5_dataset(args.feature_indices_file)
        X = np.take(X, feature_indices, axis=1)
    if args.normalize_x:
        logger.info('normalize X')
        X = StandardScaler().fit_transform(X)
    if args.sample_indices_file:
        logger.info('read sample indices from: %s', args.sample_indices_file)
        sample_indices = read_hdf5_dataset(args.sample_indices_file)
    else:
        # Default training set: every row with a non-missing phenotype.
        sample_indices = np.nonzero(~np.isnan(phenotypes))[0]
    X_train = X[sample_indices]
    y_train = y[sample_indices]
    logger.info('read parent table from file: %s', args.parent_table_file)
    parent_table = read_hdf5_single(args.parent_table_file)

    logger.info('use model %s', args.model_name)
    logger.info('X.shape = %s, y.shape = %s', repr(X.shape), repr(y.shape))
    if args.model_name == 'ridge':
        from sklearn.linear_model import Ridge
        model = Ridge(alpha=10000)
        model.fit(X_train, y_train)
        y_pred = np.ravel(model.predict(X))
        y_pred_train = y_pred[sample_indices]
    elif args.model_name == 'ridge_cv':
        from sklearn.linear_model import Ridge
        alphas = 10.0**np.arange(1, 6)
        train_masks, test_masks = generate_cv_masks(sample_indices,
                                                    parent_table,
                                                    k_female=5,
                                                    k_male=5)
        n_folds = train_masks.shape[0]
        cv_metrics = {
            'mse': np.zeros((len(alphas), n_folds)),
            'r2': np.zeros((len(alphas), n_folds)),
        }
        # The context manager closes the progress bar even if a fit fails.
        with tqdm(total=len(alphas) * n_folds) as pbar:
            for i, alpha in enumerate(alphas):
                for j in range(n_folds):
                    model = Ridge(alpha=alpha)
                    model.fit(X[train_masks[j]], y[train_masks[j]])
                    y_true_fold = y[test_masks[j]]
                    y_pred_fold = model.predict(X[test_masks[j]])
                    cv_metrics['mse'][i, j] = mean_squared_error(
                        y_true_fold, y_pred_fold)
                    cv_metrics['r2'][i, j] = r2_score(y_true_fold,
                                                      y_pred_fold)
                    pbar.update(1)
        # Refit on the full training set with the alpha that has the best
        # mean r2 across folds.
        best_alpha = alphas[cv_metrics['r2'].mean(axis=1).argmax()]
        logger.info('optmized alpha = %f', best_alpha)
        model = Ridge(alpha=best_alpha)
        model.fit(X_train, y_train)
        y_pred = np.ravel(model.predict(X))
        y_pred_train = y_pred[sample_indices]
    elif args.model_name == 'gpr':
        from sklearn.gaussian_process import GaussianProcessRegressor
        from sklearn.gaussian_process.kernels import WhiteKernel, RBF
        kernel = RBF() + WhiteKernel()
        model = GaussianProcessRegressor(kernel=kernel)
        model.fit(X_train, y_train)
        logger.info('kernel params: %s', repr(model.get_params()))
        y_pred_train = np.ravel(model.predict(X_train))
        y_pred = np.ravel(model.predict(X))
    elif args.model_name == 'gpy':
        from GPy.kern import Linear
        from GPy.models import GPRegression
        # NOTE(review): input_dim is hard-coded to 2 regardless of the number
        # of columns in X - confirm this is intended.
        kernel = Linear(input_dim=2, name='linear')
        model = GPRegression(X_train, y_train, kernel=kernel)
        model.optimize()
        # GPy's predict returns (mean, variance); keep only the mean. These
        # two values were previously never set in this branch, which made the
        # r2 logging and the prediction output below fail with a NameError.
        y_pred = np.ravel(model.predict(X)[0])
        y_pred_train = y_pred[sample_indices]
    else:
        raise ValueError('unknown model name: ' + args.model_name)

    logger.info('r2 score = %f', r2_score(y_train, y_pred_train))

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    model_file = os.path.join(args.output_dir, 'model')
    logger.info('save model file: %s', model_file)
    with open(model_file, 'wb') as f:
        pickle.dump(model, f)
    pred_file = os.path.join(args.output_dir, 'predictions')
    logger.info('save predictions to file: %s', pred_file)
    with h5py.File(pred_file, 'w') as f:
        if args.output_residuals:
            f.create_dataset('residual', data=(y - y_pred))
        f.create_dataset('y_true', data=y)
        f.create_dataset('y_pred', data=y_pred)
        f.create_dataset('y_pred_train', data=y_pred_train)
        f.create_dataset('indices_train', data=sample_indices)
        if args.model_name == 'ridge_cv':
            # Keep the searched grid and per-fold scores next to the result.
            f.create_dataset('alpha', data=alphas)
            g = f.create_group('cv_metrics')
            for key, values in cv_metrics.items():
                g.create_dataset(key, data=values)