Example #1
def get_data(args) -> Data:
    """Gets the dataset specified in the command line arguments from Bayesian benchmarks."""
    if args.dataset == "two_points":
        X = np.array([[-1.0], [1.0]])
        Y = np.array([[1.0], [2.0]])
        return Data(X_train=X, Y_train=Y, X_test=X, Y_test=Y)

    elif args.dataset == "demo":
        X_train, Y_train, X_test, Y_test = demo_dataset.create_demo_data()
        return Data(X_train=X_train,
                    Y_train=Y_train,
                    X_test=X_test,
                    Y_test=Y_test)
    elif args.dataset == "demo_normalized":
        X_train, Y_train, X_test, Y_test = _normalize(
            demo_dataset.create_demo_data())
        return Data(X_train=X_train,
                    Y_train=Y_train,
                    X_test=X_test,
                    Y_test=Y_test)

    else:
        if not args.disable_split_fix:
            dataset = get_regression_data(args.dataset, split=args.split)
        else:
            dataset = get_regression_data(args.dataset)
        X_test = dataset.X_test[:10000]
        Y_test = dataset.Y_test[:10000]
        return _remove_unused_dimensions(
            Data(dataset.X_train, dataset.Y_train, X_test, Y_test))
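For reference, a minimal, hypothetical way to exercise get_data above is to hand it an argparse-style namespace; the attribute names (dataset, split, disable_split_fix) are taken from the example itself, and only the built-in two_points branch is touched.

# Hypothetical driver (not part of the original module); it assumes get_data and
# its Data container are importable from the surrounding module.
import argparse

args = argparse.Namespace(dataset="two_points", split=0, disable_split_fix=False)
data = get_data(args)
print(data.X_train.shape)    # (2, 1)
print(data.Y_train.ravel())  # [1. 2.]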
Example #2
def run(ARGS, data=None, model=None, is_test=False):
    data = data or get_regression_data(ARGS.dataset, split=ARGS.split)
    model = model or get_regression_model(ARGS.model)(is_test=is_test, seed=ARGS.seed)

    model.fit(data.X_train, data.Y_train)

    res = {}

    samples = model.sample(data.X_test, ARGS.num_samples)
    data_tiled = np.tile(data.X_test[None, :, :], [ARGS.num_samples, 1, 1])
    shape = [ARGS.num_samples * data.X_test.shape[0], data.X_test.shape[1] + data.Y_test.shape[1]]
    A = np.reshape(np.concatenate([data_tiled, samples], -1), shape)
    B = np.concatenate([data.X_test, data.Y_test], -1)


    if ARGS.pca_dim > 0:
        AB = np.concatenate([A, B], 0)
        pca = PCA(n_components=ARGS.pca_dim).fit(AB)
        A = pca.transform(A)
        B = pca.transform(B)

    # import matplotlib.pyplot as plt
    # plt.scatter(A[:, 0], A[:, 1], color='b')
    # plt.scatter(B[:, 0], B[:, 1], color='r')
    # plt.show()

    kernel = gpflow.kernels.RBF(A.shape[-1])
    res['mmd'] = mmd(A, B, kernel)

    print(res)

    res.update(ARGS.__dict__)
    if not is_test:  # pragma: no cover
        with Database(ARGS.database_path) as db:
            db.write('mmd', res)
Example #3
def main(file_path: str, datasets: List[str]):
    if file_path.endswith("tar.gz"):
        mode = "r:gz"
    elif file_path.endswith("tar"):
        mode = "r:"
    else:
        raise ValueError

    extract_path = os.path.join(tempfile.gettempdir(), f"uci-{uuid.uuid4()}")
    uci_path = os.path.join(extract_path, "uci")
    try:
        with tarfile.open(file_path, mode) as tar:
            assert len(tar.members) == 1
            tar.extractall(extract_path)

        for dataset in datasets:
            dataset_path = os.path.join(uci_path, dataset)
            if not os.path.isdir(dataset_path):
                raise ValueError(
                    f"Unknown dataset {dataset} (available: {tar.members})")

            dest_path = os.path.join(DATA_PATH, "uci", dataset)

            if os.path.isdir(dest_path):
                print(f"Skipping existing dataset {dataset}")
            else:
                print(f"Moving dataset {dataset} to {dest_path}")
                shutil.move(dataset_path, dest_path)

            assert get_regression_data(f"wilson_{dataset}") is not None
    finally:
        shutil.rmtree(extract_path)
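The main signature suggests it is driven from the command line; a hedged sketch of such a driver follows (the flag layout is an assumption, not taken from the source).

# Hypothetical command-line driver for main(); argument names are assumptions.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('file_path', help='Path to a uci .tar or .tar.gz archive')
    parser.add_argument('datasets', nargs='+', help='Dataset directory names inside the archive')
    cli = parser.parse_args()
    main(cli.file_path, cli.datasets)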
Example #4
def test_regression(d):
    data = get_regression_data(d)

    assert_almost_equal(
        np.average(np.concatenate([data.X_train, data.X_test], 0), 0),
        np.zeros(data.X_train.shape[1]))

    assert_almost_equal(np.std(np.concatenate([data.X_train, data.X_test], 0),
                               0),
                        np.ones(data.X_train.shape[1]),
                        decimal=3)

    assert_almost_equal(
        np.average(np.concatenate([data.Y_train, data.Y_test], 0), 0),
        np.zeros(data.Y_train.shape[1]))

    assert_almost_equal(np.std(np.concatenate([data.Y_train, data.Y_test], 0),
                               0),
                        np.ones(data.Y_train.shape[1]),
                        decimal=3)

    assert data.X_train.shape[0] == data.Y_train.shape[0]
    assert data.X_test.shape[0] == data.Y_test.shape[0]
    assert data.X_train.shape[1] == data.X_test.shape[1]
    assert data.Y_train.shape[1] == data.Y_test.shape[1]
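The test takes the dataset name as a parameter, which suggests it is run under a pytest parametrization over the registered regression datasets. A sketch of such a wrapper is below; the dataset-name list is illustrative only, not taken from the source.

# Sketch only: drive the standardization test over several datasets via pytest.
import pytest

@pytest.mark.parametrize('d', ['boston', 'concrete', 'energy'])
def test_regression_parametrized(d):
    test_regression(d)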
Example #5
def run(ARGS, is_test=False):
    data = get_regression_data(ARGS.dataset, split=ARGS.split)

    Model = get_regression_model(ARGS.model)
    model = Model(is_test=is_test, seed=ARGS.seed)
    model.fit(data.X_train, data.Y_train)
    m, v = model.predict(data.X_test)

    res = {}

    l = norm.logpdf(data.Y_test, loc=m, scale=v**0.5)
    res['test_loglik'] = np.average(l)

    lu = norm.logpdf(data.Y_test * data.Y_std, loc=m * data.Y_std, scale=(v**0.5) * data.Y_std)
    res['test_loglik_unnormalized'] = np.average(lu)

    d = data.Y_test - m
    du = d * data.Y_std

    res['test_mae'] = np.average(np.abs(d))
    res['test_mae_unnormalized'] = np.average(np.abs(du))

    res['test_rmse'] = np.average(d**2)**0.5
    res['test_rmse_unnormalized'] = np.average(du**2)**0.5

    res.update(ARGS.__dict__)

    if not is_test:  # pragma: no cover
        with Database(ARGS.database_path) as db:
            db.write('regression', res)
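The "unnormalized" metrics above simply undo the dataset standardization by rescaling with Y_std: for a Gaussian, scaling y, the mean, and the standard deviation by the same factor shifts the log-density by -log(Y_std). A quick numeric check of that identity (not from the source):

# Verify that norm.logpdf(y*s, m*s, sd*s) == norm.logpdf(y, m, sd) - log(s),
# which is how test_loglik_unnormalized relates to test_loglik for scalar Y_std.
import numpy as np
from scipy.stats import norm

y, m, sd, y_std = 0.3, 0.1, 0.5, 4.2
lhs = norm.logpdf(y * y_std, loc=m * y_std, scale=sd * y_std)
rhs = norm.logpdf(y, loc=m, scale=sd) - np.log(y_std)
assert np.isclose(lhs, rhs)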
Example #6
def test_pos_def():
    # N = 10
    # Dx = 3
    # Dy = 1
    # K = 5
    from bayesian_benchmarks.data import get_regression_data
    data = get_regression_data('wilson_3droad')
    X = data.X_train
    Y = data.Y_train
    M = 128
    from scipy.cluster.vq import kmeans2
    Z = kmeans2(X, M, minit='points')[0]

    N, Dx = X.shape
    Dy = Y.shape[1]
    K = 1

    lik = gpflow.likelihoods.Gaussian(variance=0.1)
    kern = gpflow.kernels.RBF(Dx, lengthscales=0.1)

    X = np.random.randn(N, Dx)
    Y = np.random.randn(N, Dy)

    layers_vi = [LatentVariableLayer(Dx, XY_dim=Dx + Dy), GPLayer(kern, Z, Dy)]

    layers_iw = [LatentVariableLayer(Dx, XY_dim=Dx + Dy), GPLayer(kern, Z, Dy)]

    m_dgp_vi = DGP_VI(X, Y, layers_vi, lik, num_samples=K, minibatch_size=512)
    m_dgp_iw = DGP_IWVI(X,
                        Y,
                        layers_iw,
                        lik,
                        num_samples=K,
                        minibatch_size=512)

    for model in [m_dgp_vi, m_dgp_iw]:

        model.layers[-1].q_mu.set_trainable(False)
        model.layers[-1].q_sqrt.set_trainable(False)

        optimizer_adam = gpflow.train.AdamOptimizer(0.005)
        adam_op = optimizer_adam.make_optimize_tensor(model)

        optimizer_ng = gpflow.train.NatGradOptimizer(gamma=0.01)
        ng_op = optimizer_ng.make_optimize_tensor(
            model, var_list=[[model.layers[-1].q_mu, model.layers[-1].q_sqrt]])
        sess = model.enquire_session()
        for _ in range(10):
            print('{} {:.2f}'.format(_, sess.run(model.likelihood_tensor)))
            sess.run(ng_op)
            sess.run(adam_op)

    L_vi = [m_dgp_vi.compute_log_likelihood() for _ in range(100)]
    L_iw = [m_dgp_iw.compute_log_likelihood() for _ in range(100)]

    L_vi = np.average(L_vi)
    L_iw = np.average(L_iw)

    print(L_vi, L_iw)
Example #7
def run(ARGS, is_test):
    data = get_regression_data(ARGS.dataset, split=ARGS.split, prop=1.)

    ind = np.zeros(data.X_train.shape[0]).astype(bool)
    ind[:ARGS.num_initial_points] = True

    X, Y = data.X_train, data.Y_train

    Model = non_bayesian_model(ARGS.model, 'regression') or\
            import_module('bayesian_benchmarks.models.{}.models'.format(ARGS.model)).RegressionModel
    model = Model(is_test=is_test, seed=ARGS.seed)

    test_ll = []
    train_ll = []
    all_ll = []
    test_rmse = []
    train_rmse = []
    all_rmse = []

    for _ in range(min(ARGS.iterations, X.shape[0] - ARGS.num_initial_points)):
        model.fit(X[ind], Y[ind])

        m, v = model.predict(X)  # ND

        vars = v.copy()

        # set the seen ones to -inf so we don't choose them
        vars[ind] = -np.inf

        # choose the highest variance point
        i = np.argmax(vars)
        ind[i] = True

        logp = norm.logpdf(Y, loc=m, scale=v**0.5)  # N
        d2 = (Y - m)**2

        test_ll.append(np.average(logp[np.invert(ind)]))
        train_ll.append(np.average(logp[ind]))
        all_ll.append(np.average(logp))
        test_rmse.append(np.average(d2[np.invert(ind)])**0.5)
        train_rmse.append(np.average(d2[ind])**0.5)
        all_rmse.append(np.average(d2)**0.5)

    # save
    res = {
        'test_loglik': np.array(test_ll),
        'train_loglik': np.array(train_ll),
        'total_loglik': np.array(all_ll),
        'test_rmse': np.array(test_rmse),
        'train_rmse': np.array(train_rmse),
        'total_rmse': np.array(all_rmse),
    }
    res.update(ARGS.__dict__)

    if not is_test:  # pragma: no cover
        with Database() as db:
            db.write('active_learning_continuous', res)
Example #8
def run(ARGS, data=None, model=None, is_test=False):

    data = data or get_regression_data(ARGS.dataset, split=ARGS.split)
    model = model or get_regression_model(ARGS.model)(is_test=is_test,
                                                      seed=ARGS.seed)
    res = {}

    print('data standard deviation is: ', data.Y_std)
    start = time.time()
    model.fit(data.X_train, data.Y_train)
    fit_time = time.time() - start
    res['fit_time'] = fit_time

    start = time.time()
    m, v = model.predict(data.X_test)
    infer_time = time.time() - start
    res['infer_time'] = infer_time

    l = norm.logpdf(data.Y_test, loc=m, scale=v**0.5)
    res['test_loglik'] = np.average(l)

    lu = norm.logpdf(data.Y_test * data.Y_std,
                     loc=m * data.Y_std,
                     scale=(v**0.5) * data.Y_std)
    res['test_loglik_unnormalized'] = np.average(lu)

    d = data.Y_test - m
    std = v**0.5
    cal = (d < 1.96 * std) * (d > -1.96 * std)

    du = d * data.Y_std

    res['test_mae'] = np.average(np.abs(d))
    res['test_mae_unnormalized'] = np.average(np.abs(du))

    res['test_rmse'] = np.average(d**2)**0.5
    res['test_rmse_unnormalized'] = np.average(du**2)**0.5

    res['test_calibration'] = np.average(cal)

    res.update(ARGS.__dict__)

    if not is_test:  # pragma: no cover
        print("HERE!!!!! DB IS {}".format(ARGS.database_path))
        with Database(ARGS.database_path) as db:
            db.write('regression', res)

    return res
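The test_calibration metric is the empirical coverage of the central 95% predictive interval (the 1.96-sigma band). On perfectly calibrated Gaussian predictions it should sit near 0.95, as this small synthetic check (not from the source) illustrates:

# Sanity check: coverage of the 1.96-sigma interval on calibrated Gaussian predictions.
import numpy as np

rng = np.random.default_rng(0)
m = np.zeros(100_000)
std = np.ones(100_000)
y = rng.normal(m, std)
d = y - m
cal = (d < 1.96 * std) * (d > -1.96 * std)
print(np.average(cal))  # approximately 0.95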
Example #9
def run(ARGS, data=None, model=None, is_test=False):

    data = data or get_regression_data(ARGS.dataset, split=ARGS.split)
    model = model or get_regression_model(ARGS.model)(
        is_test=is_test, seed=ARGS.seed, lr=ARGS.lr, iters=ARGS.iters)

    model.fit(data.X_train, data.Y_train)
    m, v = model.predict(data.X_test)

    res = {}

    l = norm.logpdf(data.Y_test, loc=m, scale=v**0.5)
    res['test_loglik'] = np.average(l)

    lu = norm.logpdf(data.Y_test * data.Y_std,
                     loc=m * data.Y_std,
                     scale=(v**0.5) * data.Y_std)
    res['test_loglik_unnormalized'] = np.average(lu)

    d = data.Y_test - m
    du = d * data.Y_std

    res['test_mae'] = np.average(np.abs(d))
    res['test_mae_unnormalized'] = np.average(np.abs(du))

    res['test_rmse'] = np.average(d**2)**0.5
    res['test_rmse_unnormalized'] = np.average(du**2)**0.5

    res.update(ARGS.__dict__)
    res['model'] = '{}_{}'.format(res['model'], res['num_gpus'])

    if not is_test:  # pragma: no cover
        with Database(ARGS.database_path) as db:
            db.write('regression', res)

    return res
Example #10
if torch.cuda.is_available():  # assumed guard; the snippet begins mid-conditional
    args.device = torch.device('cuda')
else:
    args.device = torch.device('cpu')

print('Preparing directory %s' % args.dir)
os.makedirs(args.dir, exist_ok=True)
with open(os.path.join(args.dir, 'command.sh'), 'w') as f:
    f.write(' '.join(sys.argv))
    f.write('\n')

torch.backends.cudnn.benchmark = True
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)

print('Preparing dataset %s' % args.dataset)
dataset = get_regression_data(args.dataset, split=args.split)
print(dataset.N, dataset.D, dataset.name)
print('Using model %s' % args.model)
model_cfg = getattr(models, args.model)

print('Preparing model')
print(*model_cfg.args)

if not args.uci_small:
    if dataset.N > 6000:
        model_cfg.kwargs['dimensions'] = [1000, 1000, 500, 50, 2]
    else:
        model_cfg.kwargs['dimensions'] = [1000, 500, 50, 2]
else:
    # similarly to DVI paper;
    # protein dataset case
Example #11
def run(ARGS, data=None, model=None, is_test=False):

    data = data or get_regression_data(ARGS.dataset, split=ARGS.split)
    model = model or get_regression_model(ARGS.model)(is_test=is_test,
                                                      seed=ARGS.seed)

    model.fit(data.X_train, data.Y_train)
    m, v = model.predict(
        data.X_test
    )  # both [data points x output dim] or [samples x data points x output dim]

    assert m.ndim == v.ndim
    assert m.ndim in {2, 3}  # 3-dim in case of approximate predictions (multiple samples per each X)
    assert np.all(v >= 0.0)

    res = {}
    log_eps = np.log(1e-12)  # log probability threshold
    log_1_minus_eps = np.log(1.0 - 1e-12)

    if m.ndim == 2:  # keep analysis as in the original code in case of 2-dim predictions

        l = norm.logpdf(data.Y_test, loc=m, scale=v**0.5)  # []
        l = np.clip(l, log_eps, log_1_minus_eps)  # clip
        res['test_loglik'] = np.average(l)

        lu = norm.logpdf(data.Y_test * data.Y_std,
                         loc=m * data.Y_std,
                         scale=(v**0.5) * data.Y_std)
        lu = np.clip(lu, log_eps, log_1_minus_eps)  # clip
        res['test_loglik_unnormalized'] = np.average(lu)

        d = data.Y_test - m
        du = d * data.Y_std

        res['test_mae'] = np.average(np.abs(d))
        res['test_mae_unnormalized'] = np.average(np.abs(du))

        res['test_rmse'] = np.average(d**2)**0.5
        res['test_rmse_unnormalized'] = np.average(du**2)**0.5

    else:  # compute metrics in case of 3-dim predictions

        res['test_loglik'] = []
        res['test_loglik_unnormalized'] = []

        for n in range(m.shape[0]):  # iterate through samples
            l = norm.logpdf(data.Y_test, loc=m[n], scale=v[n]**0.5)
            l = np.clip(l, log_eps, log_1_minus_eps)  # clip
            res['test_loglik'].append(l)

            lu = norm.logpdf(data.Y_test * data.Y_std,
                             loc=m[n] * data.Y_std,
                             scale=(v[n]**0.5) * data.Y_std)
            lu = np.clip(lu, log_eps, log_1_minus_eps)  # clip
            res['test_loglik_unnormalized'].append(lu)

        # Mixture test likelihood (mean over per data point evaluations)
        res['test_loglik'] = meanlogsumexp(res['test_loglik'])

        # Mixture test likelihood (mean over per data point evaluations)
        res['test_loglik_unnormalized'] = meanlogsumexp(
            res['test_loglik_unnormalized'])

        d = data.Y_test - np.mean(m, axis=0)
        du = d * data.Y_std

        res['test_mae'] = np.average(np.abs(d))
        res['test_mae_unnormalized'] = np.average(np.abs(du))

        res['test_rmse'] = np.average(d**2)**0.5
        res['test_rmse_unnormalized'] = np.average(du**2)**0.5

    if not is_test:
        res.update(ARGS.__dict__)

    if not is_test:  # pragma: no cover
        with Database(ARGS.database_path) as db:
            db.write('regression', res)

    return res
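In the 3-dim branch above, meanlogsumexp turns the per-sample log-densities into the log-density of an equally weighted mixture over the S samples, averaged over test points. A hedged sketch of what that helper presumably computes (its actual implementation is not shown in the source):

# Sketch of meanlogsumexp: logsumexp over the sample axis minus log(S), then mean.
import numpy as np
from scipy.special import logsumexp

def meanlogsumexp_sketch(per_sample_logliks):
    ls = np.stack(per_sample_logliks, axis=0)          # [S, N, Dy]
    mixture = logsumexp(ls, axis=0) - np.log(ls.shape[0])
    return np.average(mixture)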
Example #12
tensorboard_path = os.path.join(tensorboard_path_base, file_name)
checkpoint_path = os.path.join(checkpoints_path_base, file_name)
figs_path = os.path.join(figs_path_base, file_name + '.png')
results_path = os.path.join(ARGS.results_path, 'results.db')

for p in [
        ARGS.results_path, tensorboard_path_base, checkpoints_path_base,
        figs_path_base
]:
    if not os.path.isdir(p):
        os.mkdir(p)

#################################### data

from bayesian_benchmarks.data import get_regression_data
data = get_regression_data(ARGS.dataset)
data.X_test = data.X_test[:10000]
data.Y_test = data.Y_test[:10000]

#################################### model
from build_models import build_model

model = build_model(ARGS, data.X_train, data.Y_train)

#################################### init

sess = model.enquire_session()
model.init_op(sess)

#################################### monitoring
Example #13
def run(ARGS, data=None, model=None, is_test=False):

    # Set list of softmax scaling we want to train experts with
    powers = [100]

    # Set list of models (and their weighting methods) to be trained
    dict_models = {
        'bar': ['variance'],
        'gPoE': ['uniform', 'variance'],
        'rBCM': ['diff_entr', 'variance'],
        'BCM': ['no_weights'],
        'PoE': ['no_weights']
    }

    # Gather the data
    data = data or get_regression_data(ARGS.dataset, split=ARGS.split)
    print(data.X_train.shape)

    # Initialize the model
    model = model or get_regression_model(ARGS.model)(is_test=is_test,
                                                      seed=ARGS.seed)
    if ARGS.model == 'gp' and data.N > 6000:
        return 'data too large for a full GP'

    # Optimize the model by maximizing sum of log-marginal likelihoods
    print('model fitting')
    model.fit(data.X_train, data.Y_train)

    if 'expert' in ARGS.model:

        minibatching = 'minibatching' in ARGS.model

        # Gather the predictions of all experts at all test inputs with an option to minibatch. mu_s, var_s are n_expert x n_test
        print('gathering predictions')
        mu_s, var_s = expert_predictions(data.X_test,
                                         model,
                                         minibatching=minibatching,
                                         gather=True)

        # Loop over models (Poe,...), weighting methods (Wass,variance,...) and powers  (softmax scaling)
        for model_name in dict_models.keys():
            for weighting in dict_models[model_name]:
                for power in powers:
                    model.power = power
                    model.model = model_name
                    model.weighting = weighting

                    #Aggregate predictions for a single model (using a specific weighting scheme, e.g gPoE_var with T=100). m,v are n_test x 1
                    print('prediction aggregation')
                    m, v = expert_predictions(data.X_test,
                                              model,
                                              mu_s=mu_s,
                                              var_s=var_s,
                                              minibatching=minibatching,
                                              gather=False)

                    #Add scores (RMSE/NLPD) of the single model to the database
                    res = update_score_database(m,
                                                v,
                                                data,
                                                ARGS,
                                                is_test,
                                                power=power,
                                                weighting=weighting,
                                                model_name=model_name)

                    if weighting in ['no_weights', 'uniform', 'diff_entr']:
                        break
    else:

        m, v = model.predict(data.X_test)

        res = update_score_database(m, v, data, ARGS, is_test)

    return res