Example 1
    def predict_log_marginal_probabilities(self, X: np.ndarray) -> np.ndarray:
        # TODO: Check against GPFlow.
        # TODO: Is this really worth it? Could just use predict_y.

        assert self.is_fit

        X = self.scaler.transform(X)

        # Run the prediction for each model
        results = list()

        for cur_model in self.models:

            if self.use_cache:
                # Load model
                cur_model = load_saved_gpflow_model(cur_model)

            # Predict f, the latent probability on the probit scale
            f_mean, f_var = cur_model.predict_f(X)
            f_std = np.sqrt(f_var)

            if self.use_cache:
                gpflow.reset_default_graph_and_session()

            result = log_probability_via_sampling(
                np.squeeze(f_mean), np.squeeze(f_std), self.n_draws_predict)

            results.append(result)

        results = np.stack(results, axis=1)

        return results
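
The helper log_probability_via_sampling is not defined in any of these examples. As a rough guide, here is a minimal sketch of what such a helper could look like, assuming a Bernoulli likelihood with a probit link and a Gaussian posterior over the latent f (hypothetical implementation, not the project's actual code):

import numpy as np
from scipy.special import logsumexp
from scipy.stats import norm


def log_probability_via_sampling(f_mean, f_std, n_draws):
    # Monte Carlo estimate of log p(y=1 | x): draw latent values from the
    # Gaussian posterior and average the probit probabilities in log space.
    draws = np.random.normal(f_mean, f_std, size=(n_draws,) + f_mean.shape)
    return logsumexp(norm.logcdf(draws), axis=0) - np.log(n_draws)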
Example 2
def evalMCMCSamples(X, Y, x):
    gpflow.reset_default_graph_and_session()

    Y = np.atleast_2d(Y).T
    traces, m = evalMCMC(X, Y)

    f_samples = []
    nn = 1

    for i, s in traces.iloc[::10].iterrows():
        f = m.predict_f_samples(x,
                                nn,
                                initialize=False,
                                feed_dict=m.sample_feed_dict(s))
        f_samples.append(f)

    f_samples = np.array(f_samples)

    # print("f_samples.shape=", f_samples.shape)
    # print("x.shape=", x.shape)
    out = f_samples[:, 0, :, 0].reshape(f_samples.shape[0], f_samples.shape[2])
    # print("out.shape=", out.shape)

    m.clear()
    return x, out
Example 3
def evalMLESamples(X, Y, x):
    gpflow.reset_default_graph_and_session()

    Y = np.atleast_2d(Y).T
    _, m = evalMLE(X, Y)

    num_samples = 10
    ff = m.predict_f_samples(x, num_samples, initialize=False)

    # print("ff.shape=", ff.shape)

    m.clear()

    return x, ff[:, :, 0]
Example 4
    def calculate_log_likelihood(self, X, y):

        assert self.is_fit

        assert y.shape[1] == len(self.models)

        X = self.scaler.transform(X)

        means, sds = list(), list()

        for cur_model in self.models:

            if self.use_cache:
                cur_model = load_saved_gpflow_model(cur_model)

            cur_mean, cur_vars = cur_model.predict_f(X)
            cur_sds = np.sqrt(cur_vars)

            if self.use_cache:
                gpflow.reset_default_graph_and_session()

            means.append(np.squeeze(cur_mean))
            sds.append(np.squeeze(cur_sds))

        means = np.stack(means, axis=1)
        sds = np.stack(sds, axis=1)

        site_log_liks = np.zeros(means.shape[0])

        # Estimate site by site
        for i, (cur_y, cur_mean, cur_sd) in enumerate(zip(y, means, sds)):

            draws = np.random.normal(
                cur_mean, cur_sd, size=(self.n_draws_predict, means.shape[1]))

            log_lik = calculate_log_joint_bernoulli_likelihood(draws, cur_y)

            site_log_liks[i] = log_lik

        return site_log_liks
Example 5
def main():
    X = np.loadtxt("../data/neur.X.txt")
    Y = np.loadtxt("../data/neur.Y.txt")

    gpflow.reset_default_graph_and_session()
    name = 'test'
    minibatch_size = 500

    W1_init = normalize(np.random.random(size=(C, K1)))
    W2_init = normalize(np.random.random(size=(G, K2)))

    with gpflow.defer_build():
        kernel = mk.SharedIndependentMok(
            gpflow.kernels.RBF(1, active_dims=[0]), K1 * K2)
        Z = np.linspace(0, 1, T)[:, None].astype(np.float64)
        feature = gpflow.features.InducingPoints(Z)
        feature = mf.SharedIndependentMof(feature)

        model = SplitGPM(X,
                         Y,
                         np.log(W1_init + 1e-5),
                         np.log(W2_init + 1e-5),
                         kernel,
                         gpflow.likelihoods.Gaussian(),
                         feat=feature,
                         minibatch_size=minibatch_size,
                         name=name)
    model.compile()

    model.W1.set_trainable(True)  # learn cell assignments
    model.W2.set_trainable(True)  # learn gene assignments
    model.feature.set_trainable(True)  # move inducing points
    model.kern.set_trainable(True)  # learn kernel parameters
    model.likelihood.set_trainable(True)  # learn likelihood parameters

    adam = gpflow.train.AdamOptimizer(0.005)
    adam.minimize(model, maxiter=10000)

    save_model(model)
Example 6
    def fit(self, X, y):

        self.scaler = StandardScaler()
        X = self.scaler.fit_transform(X)

        Z = find_starting_z(X, num_inducing=self.n_inducing,
                            use_minibatching=False)

        self.models = list()

        # We need to fit each species separately
        for cur_output in tqdm(range(y.shape[1])):

            cur_kernel = self.kernel_function()
            cur_likelihood = gpflow.likelihoods.Bernoulli()

            cur_y = y[:, [cur_output]]

            cur_m = gpflow.models.SVGP(X, cur_y, kern=cur_kernel,
                                       likelihood=cur_likelihood, Z=Z)

            opt = gpflow.train.ScipyOptimizer(
                options={'maxfun': self.maxiter})

            opt.minimize(cur_m, maxiter=self.maxiter, disp=self.verbose_fit)

            if self.use_cache:
                # Store in cache dir
                save_dir = join(self.cache_dir, f'model_{cur_output}')
                save_gpflow_model(cur_m, save_dir)
                self.models.append(save_dir)
                # Reset graph
                gpflow.reset_default_graph_and_session()
            else:
                # Append directly
                self.models.append(cur_m)

        self.is_fit = True
Example 7
idx = [d[x] for x in zip(data.line, data.time)]

library_size = data.groupby(['line', 'time']).value.sum().values[idx]
gene_lengths = gene_attributes['size'].values[W2_idx]
gc_content = gene_attributes.percentage_gene_gc_content.values[W2_idx] / 100


# make feature
Zlibsize = np.quantile(library_size, np.linspace(0.01, 0.99, 200))
Zgenesize = np.quantile(gene_lengths, np.linspace(0.01, 0.99, 200))
Z = np.log(np.vstack([Zlibsize, Zgenesize]).T)

X_aug = np.log(np.stack([library_size.flatten(), gene_lengths.flatten()]).T)

# build model
gpflow.reset_default_graph_and_session()
with gpflow.defer_build():
    kernel = gpflow.kernels.RBF(1, active_dims=[0]) + gpflow.kernels.RBF(1, active_dims=[1])
    feature = gpflow.features.InducingPoints(Z)
    
    model = gpflow.models.SVGP(
        X_aug, Y, kernel, NegativeBinomial(), 
        feat=feature, minibatch_size=500, name=name)
model.compile()


# restore/create monitor session
lr = 0.01
monitor_tasks, session, global_step, file_writer = build_monitor(model, path)

optimiser = gpflow.train.AdamOptimizer(lr)
Example 8
def fitGP(method, normalize=True):
    assert method in ('VFE', 'FITC', 'GP')
    if data_directory == 'year-prediction-MSD' and method == 'FITC':
        return  # big data, thus using only SVGP, no FITC

    perf = np.nan * np.zeros((n_splits, 2))

    np.random.seed(1)
    for split in range(n_splits):
        (X_train_normalized, y_train_normalized, y_train, X_test_normalized,
         X_test, y_test, mean_X_train, std_X_train, mean_y_train,
         std_y_train) = _split_data(split, normalize)

        if data_directory != 'year-prediction-MSD':
            if method == 'GP':
                gp = GPy.models.GPRegression(
                    X_train_normalized, y_train_normalized[:, None],
                    GPy.kern.RBF(X_train_normalized.shape[1], ARD=True))
            else:
                gp = GPy.models.SparseGPRegression(
                    X_train_normalized,
                    y_train_normalized[:, None],
                    GPy.kern.RBF(X_train_normalized.shape[1], ARD=True),
                    num_inducing=n_hidden)
            if method == 'FITC':
                gp.inference_method = GPy.inference.latent_function_inference.FITC(
                )
            success = False
            for _ in range(10):
                try:
                    gp.optimize_restarts(robust=True)
                    success = True
                    break
                except Exception:
                    # Optimisation occasionally fails; retry (up to 10 times)
                    pass
            if success:
                gp.save('results/%s/%s_split%g.hdf5' %
                        (method, data_directory, split))
            else:
                continue
        else:
            gpflow.reset_default_graph_and_session()
            Z = X_train_normalized[np.random.choice(np.arange(
                len(X_train_normalized)),
                                                    n_hidden,
                                                    replace=False)].copy()
            gp = gpflow.models.SVGP(X_train_normalized,
                                    y_train_normalized[:, None],
                                    gpflow.kernels.RBF(
                                        X_train_normalized.shape[1], ARD=True),
                                    gpflow.likelihoods.Gaussian(),
                                    Z,
                                    minibatch_size=1000)
            adam = gpflow.train.AdamOptimizer().make_optimize_action(gp)
            gpflow.actions.Loop(adam, stop=30000)()
            gp.anchor(gp.enquire_session())
            saver = gpflow.saver.Saver()
            saver.save(
                'results/%s/%s_split%g' % (method, data_directory, split), gp)

        if data_directory != 'year-prediction-MSD':
            m, v = np.squeeze(gp.predict(X_test_normalized))
        else:
            m, v = np.squeeze(gp.predict_y(X_test_normalized))
        if normalize:
            v *= std_y_train**2
            m = m * std_y_train + mean_y_train
        perf[split] = np.sqrt(np.mean(
            (y_test - m)**2)), -logpdf(y_test - m, v).mean()

    np.save('results/%s/%s' % (method, data_directory), perf)
Example 9
def fitANN(normalize=True):
    etas = np.array([1, 2, 5, 10, 20, 50, 100]) * {
        'bostonHousing': 1e-7,
        'concrete': 1e-7,
        'energy': 1e-10,
        'kin8nm': 1e-5,
        'naval-propulsion-plant': 1e-9,
        'power-plant': 1e-6,
        'protein-tertiary-structure': 1e-5,
        'wine-quality-red': 1e-6,
        'yacht': 1e-9,
        'year-prediction-MSD': 1e-4
    }[data_directory]

    perf = np.nan * np.zeros((n_splits, 8))

    np.random.seed(1)
    for split in range(n_splits):
        (X_train_normalized, y_train_normalized, y_train, X_test_normalized,
         X_test, y_test, mean_X_train, std_X_train, mean_y_train,
         std_y_train) = _split_data(split, normalize)

        if data_directory != 'year-prediction-MSD':
            gp = GPy.models.SparseGPRegression(X_train_normalized,
                                               y_train_normalized[:, None],
                                               GPy.kern.RBF(
                                                   X_train_normalized.shape[1],
                                                   ARD=True),
                                               num_inducing=n_hidden)
            gp[:] = h5py.File(
                'results/VFE/%s_split%g.hdf5' % (data_directory, split),
                'r')['param_array']
            var = gp.Gaussian_noise.variance
            varK = gp.kern.variance
            Kfu = gp.kern.K(X_train_normalized, gp.inducing_inputs)
            Kfu_test = gp.kern.K(X_test_normalized, gp.inducing_inputs)
            w = gp.posterior.woodbury_vector.ravel()
            woodbury_inv = gp.posterior.woodbury_inv
        else:
            gpflow.reset_default_graph_and_session()
            saver = gpflow.saver.Saver()
            gp = saver.load('results/VFE/%s_split%g' % (data_directory, split))
            var = gp.likelihood.variance.value
            varK = gp.kern.variance.value
            Kfu = gp.kern.compute_K(X_train_normalized, gp.feature.Z.value)
            Kuu = gp.kern.compute_K(gp.feature.Z.value, gp.feature.Z.value)
            Kfu_test = gp.kern.compute_K(X_test_normalized, gp.feature.Z.value)
            Sigma = np.linalg.inv(Kfu.T.dot(Kfu) + var * Kuu)
            w = Sigma.dot(Kfu.T.dot(y_train_normalized))
            woodbury_inv = np.linalg.inv(Kuu) - var * Sigma

        def custom_loss():  # neg loglikelihood
            def loss(y_true, y_pred):
                return tf.divide(tf.square(y_pred[..., 0] - y_true[..., 0]), y_pred[..., 1]) + \
                    tf.math.log(y_pred[..., 1])

            return loss

        def build_model(eta):
            u, s, v = np.linalg.svd(woodbury_inv)
            U = (u + v.T).dot(np.diag(np.sqrt(s))) / 2
            inputs = layers.Input(shape=(n_hidden, ))
            m = layers.Dense(1,
                             kernel_initializer=tf.constant_initializer(w),
                             trainable=False)(inputs)
            x = layers.Dense(n_hidden,
                             kernel_initializer=tf.constant_initializer(U),
                             activation=tf.square)(inputs)

            def act(a):
                return tf.math.softplus(a / var / 2) * var * 2

            v = layers.Dense(1,
                             kernel_initializer=tf.constant_initializer(
                                 -np.ones((1, n_hidden))),
                             bias_initializer=tf.constant_initializer(var +
                                                                      varK),
                             activation=act)(x)
            outputs = layers.concatenate([m, v])
            model = tf.keras.Model(inputs=inputs, outputs=outputs)
            model.compile(loss=custom_loss(),
                          optimizer=tf.keras.optimizers.Adam(eta))
            return model

        # find best learning rate using 5-fold cross validation
        best_loss = np.inf
        best_eta = etas[0]
        for eta in etas:
            loss = 0
            for fold in range(5):
                model = build_model(eta)
                train_idx = np.ones(X_train_normalized.shape[0], dtype=bool)
                train_idx[fold::5] = False
                history = model.fit(
                    Kfu[train_idx],
                    y_train_normalized[train_idx],
                    epochs=n_epochs,
                    validation_data=(Kfu[~train_idx],
                                     y_train_normalized[~train_idx]),
                    verbose=0)
                loss += history.history['val_loss'][-1]
            if loss < best_loss:
                best_loss = loss
                best_eta = eta

        model = build_model(best_eta)
        history = model.fit(Kfu,
                            y_train_normalized,
                            epochs=n_epochs,
                            verbose=0)
        if data_directory != 'year-prediction-MSD':
            m = np.squeeze(gp.predict(X_test_normalized))[0]
        else:
            m = np.squeeze(gp.predict_y(X_test_normalized))[0]
        v = np.squeeze(model.predict(Kfu_test)).T[1]
        if normalize:
            m = m * std_y_train + mean_y_train
            v = v * std_y_train**2
        perf[split, :2] = np.sqrt(np.mean(
            (y_test - m)**2)), -logpdf(y_test - m, v).mean()
        perf[split, 2] = best_eta

        # measure prediction time
        if data_directory != 'year-prediction-MSD':
            U, Ub, _, _, w, wb = model.get_weights()
            m = gp.posterior.woodbury_vector
            var = 2 * gp.Gaussian_noise.variance

            def act(a):
                return np.log(1 + np.exp(a / var)) * var

            if normalize:

                def predict(X_test):
                    X_test_normalized = (X_test - mean_X_train) / std_X_train
                    K = gp.kern.K(X_test_normalized, gp.inducing_inputs)
                    return np.concatenate([
                        K.dot(m) * std_y_train + mean_y_train,
                        act(((K.dot(U) + Ub)**2).dot(w) + wb) * std_y_train**2
                    ], 1)
            else:

                def predict(X_test):
                    K = gp.kern.K(X_test, gp.inducing_inputs)
                    return np.concatenate(
                        [K.dot(m),
                         act(((K.dot(U) + Ub)**2).dot(w) + wb)], 1)
        else:
            if normalize:

                def predict(X_test):
                    X_test_normalized = (X_test - mean_X_train) / std_X_train
                    K = gp.kern.compute_K(X_test_normalized,
                                          gp.feature.Z.value)
                    m, v = np.squeeze(model.predict(K)).T
                    return np.array(
                        [m * std_y_train + mean_y_train, v * std_y_train**2])
            else:

                def predict(X_test):
                    K = gp.kern.compute_K(X_test, gp.feature.Z.value)
                    m, v = np.squeeze(model.predict(K)).T
                    return np.array([m, v])

        for i in range(5):
            t = -time()
            _ = predict(X_test)
            t += time()
            perf[split, 3 + i] = t

    np.save('results/ANN/' + data_directory, perf)
Example 10
def fitBioNN(normalize=True):
    perf = np.nan * np.zeros((n_splits, 8))

    np.random.seed(1)
    for split in range(n_splits):
        (X_train_normalized, y_train_normalized, y_train, X_test_normalized,
         X_test, y_test, mean_X_train, std_X_train, mean_y_train,
         std_y_train) = _split_data(split, normalize)

        if data_directory != 'year-prediction-MSD':
            vfe = GPy.models.SparseGPRegression(
                X_train_normalized,
                y_train_normalized[:, None],
                GPy.kern.RBF(X_train_normalized.shape[1], ARD=True),
                num_inducing=n_hidden)
            vfe[:] = h5py.File(
                'results/VFE/%s_split%g.hdf5' % (data_directory, split),
                'r')['param_array']
            nn = BioNN(X_train_normalized, y_train_normalized[:, None],
                       vfe.inducing_inputs, vfe.kern.lengthscale)
        else:
            gpflow.reset_default_graph_and_session()
            saver = gpflow.saver.Saver()
            vfe = saver.load('results/VFE/%s_split%g' %
                             (data_directory, split))
            nn = BioNN(X_train_normalized, y_train_normalized[:, None],
                       vfe.feature.Z.value, vfe.kern.lengthscales.value)

        m, v = np.squeeze(nn.predict(X_test_normalized))
        if normalize:
            m = m * std_y_train + mean_y_train
            v = v * std_y_train**2
        perf[split, :2] = np.sqrt(np.mean(
            (y_test - m)**2)), -logpdf(y_test - m, v).mean()
        perf[split, 2] = v.var()

        # measure prediction time
        if normalize:

            def predict(X_test):
                X_test_normalized = (X_test - mean_X_train) / std_X_train
                K = nn.kern.K(X_test_normalized, nn.inducing_inputs)
                m = K.dot(nn.w_mean)
                SNRinv = np.maximum(1 - np.sum(K**2, 1), 0)
                v = np.vstack([SNRinv, np.ones(len(m))]).T.dot(nn.wb_var)
                return np.concatenate(
                    [m * std_y_train + mean_y_train, v * std_y_train**2], 1)
        else:

            def predict(X_test):
                K = nn.kern.K(X_test, nn.inducing_inputs)
                m = K.dot(nn.w_mean)
                SNRinv = np.maximum(1 - np.sum(K**2, 1), 0)
                v = np.vstack([SNRinv, np.ones(len(m))]).T.dot(nn.wb_var)
                return np.concatenate([m, v], 1)

        for i in range(5):
            t = -time()
            _ = predict(X_test)
            t += time()
            perf[split, 3 + i] = t

    np.save('results/BioNN/' + data_directory, perf)
Example 11
def load_saved_gpflow_model(gpflow_model_path: str):

    gpflow.reset_default_graph_and_session()
    m = gpflow.saver.Saver().load(gpflow_model_path)
    return m
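
The matching save_gpflow_model helper used in Example 6 is not shown. A minimal sketch, assuming the same GPflow 1.x Saver API that Example 8 uses (hypothetical, for illustration only):

import gpflow


def save_gpflow_model(model, save_path: str) -> None:
    # Anchor the trained parameter values to the session before serialising,
    # then write the model with gpflow's Saver (as in Example 8).
    model.anchor(model.enquire_session())
    gpflow.saver.Saver().save(save_path, model)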
Example 12
    def fit(self, X, y):

        n_dims = X.shape[1]
        n_outputs = y.shape[1]

        kern_fun = partial(self.kernel_fun, n_dims=n_dims, n_outputs=n_outputs)

        def get_model(w_prior, bias_var):

            # We need to make a model creation function.
            cur_kernel = kern_fun(w_prior=w_prior, bias_var=bias_var)
            model_fun = partial(self.model_fun, kernel=cur_kernel)
            return model_fun()

        scores = list()

        for cur_variance in self.variances_to_try:

            # Compute the bias variance so that we have a variance of 0.4
            # for that overall
            bias_var = 0.4 / cur_variance

            print(f'Fitting {cur_variance:.2f} with bias var {bias_var:.2f}')

            model_fun = lambda: get_model(cur_variance, bias_var)  # NOQA

            cur_mean_score, cur_stderr = MultiOutputGP.cross_val_score(
                X,
                y,
                model_fun,
                save_dir=join(self.cv_save_dir, f'{cur_variance:.4f}'),
                n_folds=self.n_folds)

            gpf.reset_default_graph_and_session()

            print(f'Mean likelihood is {cur_mean_score}')

            scores.append({
                'mean': cur_mean_score,
                'stderr': cur_stderr,
                'variance': cur_variance
            })

        scores = pd.DataFrame(scores)

        # Sort by ascending complexity
        scores = scores.sort_values('variance')

        # Find best index; invert mean since error rule expects errors,
        # where smaller is better, rather than likelihoods where higher is
        # better.
        best_idx = select_using_standard_error_rule(-scores['mean'].values,
                                                    scores['stderr'].values)

        best_variance = scores.iloc[best_idx]['variance']

        print(f'Selected model using one standard error rule has variance '
              f'{best_variance:.2f}')

        bias_var = 0.4 / best_variance

        best_model = get_model(best_variance, bias_var)

        best_model.fit(X, y)

        self.is_fit = True
        self.model = best_model
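
The select_using_standard_error_rule helper is not shown in these examples. A minimal sketch of the one-standard-error rule described in the comments above, assuming the inputs are errors (lower is better) with matching standard errors, ordered from simplest to most complex model (hypothetical implementation):

import numpy as np


def select_using_standard_error_rule(errors, stderrs):
    # Pick the simplest model whose error is within one standard error of
    # the best (lowest-error) model.
    best = np.argmin(errors)
    threshold = errors[best] + stderrs[best]
    return int(np.nonzero(errors <= threshold)[0][0])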
Example 13
    def del_graph(self):
        gpflow.reset_default_graph_and_session()
        return
Example 14
    def cross_val_score(X, y, model_creation_fun, save_dir, n_folds=4):

        kfold = KFold(n_splits=n_folds)
        fold_liks = np.empty(n_folds)

        for i, (cur_train_ind,
                cur_test_ind) in tqdm(enumerate(kfold.split(X, y))):

            cur_X = X[cur_train_ind]
            cur_y = y[cur_train_ind]

            gpf.reset_default_graph_and_session()

            model = model_creation_fun()

            model.fit(cur_X, cur_y)

            cur_save_dir = join(save_dir, f'fold_{i + 1}')
            os.makedirs(cur_save_dir, exist_ok=True)

            model.save_model(cur_save_dir)

            cur_test_x = X[cur_test_ind]
            cur_test_y = y[cur_test_ind]

            log_liks = model.calculate_log_likelihood(cur_test_x, cur_test_y)
            marg_pred = pd.DataFrame(
                model.predict_marginal_probabilities(cur_test_x))

            marg_pred.to_csv(join(cur_save_dir, 'marginal_probs.csv'))
            pd.DataFrame(cur_test_y).to_csv(join(cur_save_dir, 'y_t.csv'))

            # I am also interested in the log loss.
            y_t_df = pd.DataFrame(cur_test_y)
            neg_log_loss_results = multi_class_eval(marg_pred, y_t_df,
                                                    neg_log_loss_with_labels,
                                                    'log_lik')

            neg_log_loss_results.to_csv(
                join(cur_save_dir, 'marginal_species_log_lik.csv'))

            pd.Series(neg_log_loss_results.mean()).to_csv(
                join(cur_save_dir, 'neg_log_loss_mean.csv'))

            fold_liks[i] = np.mean(log_liks)

            np.savez(join(cur_save_dir, 'cv_results'),
                     site_log_liks=log_liks,
                     cur_train_X=cur_X,
                     cur_train_y=cur_y,
                     cur_test_X=cur_test_x,
                     cur_test_y=cur_test_y,
                     train_ind=cur_train_ind,
                     test_ind=cur_test_ind)

        pd.Series({
            'mean_lik': np.mean(fold_liks)
        }).to_csv(join(save_dir, 'mean_lik.csv'))

        pd.Series(fold_liks, index=[f'fold_{i+1}' for i in range(n_folds)
                                    ]).to_csv(join(save_dir, 'fold_liks.csv'))

        return np.mean(fold_liks), np.std(fold_liks) / np.sqrt(len(fold_liks))