def ridge_objective(trial):
    """Optuna objective: fit Ridge on a train split, return validation RMSE.

    Reads globals:
        DF_TRAIN: pandas DataFrame with "cont*" feature columns and a
            "target" column.
        CONFIG: config object whose get("experiment/training/alpha_range")
            returns (low, high) bounds for the alpha search.

    Returns:
        float: root-mean-squared error on the held-out validation split.
    """
    global DF_TRAIN
    global CONFIG
    df_fit, df_val = train_val_split(DF_TRAIN, 0.8)
    # Fix: drop() with a bare label drops a *row* named "target" (axis=0
    # default in pandas); the intent is to remove the target column from
    # the feature frame passed to the encoders.
    encoders = fit_target_encoders(df_fit.drop(columns="target"),
                                   df_fit["target"])

    # Iterating a pandas DataFrame yields its column names.
    feature_cols = [c for c in df_fit if c.startswith("cont")]

    x_train = df_fit[feature_cols].to_numpy()
    y_train = df_fit["target"].to_numpy()

    x_val = df_val[feature_cols].to_numpy()
    y_val = df_val["target"].to_numpy()

    alpha_start, alpha_end = CONFIG.get("experiment/training/alpha_range")
    # suggest_loguniform is deprecated (removed in Optuna 4.x);
    # suggest_float(..., log=True) is the supported equivalent.
    model = Ridge(
        alpha=trial.suggest_float('alpha', alpha_start, alpha_end, log=True))
    model = model.fit(x_train, y_train)

    y_hat = model.predict(x_val)
    residual = y_val.ravel() - y_hat.ravel()
    rmse = np.sqrt(np.mean(np.square(residual)))

    return rmse
# --- Beispiel #2 ---
    # NOTE(review): this fragment begins midway through an if/elif chain on
    # args.dataset -- the header of this first branch is not visible here
    # (presumably a 'mimic_gap'-style dataset; confirm against the full file).
    x, y, x_test, y_test = util.format_data_torch(
        *utils_load_dataset.load_mimic_patient_gap(
            dir_data, subject_id=args.dataset_option))
elif args.dataset == 'GPdata':
    # Synthetic GP data also returns a validation split and a dense plotting
    # grid; only the train/test arrays go through the torch formatter.
    x, y, x_val, y_val, x_test, y_test, x_plot, f_plot = data.load_GPdata(
        dir_data, lengthscale=args.dataset_option)
    x, y, x_test, y_test = util.format_data_torch(x, y, x_test, y_test)
elif args.dataset == 'finance_nogap':
    # Same loader as the gapped finance variant, but with gaps disabled.
    x, y, x_test, y_test = util.format_data_torch(
        *data.load_finance2(dir_data, gaps=None, seed=args.seed))
elif args.dataset == 'mimic':
    # Single-patient heart-rate series, split 75/25 into train/test.
    x, y = utils_load_dataset.load_mimic_patient(
        patient_id=int(args.dataset_option),
        csv_filename=os.path.join(dir_data, 'hr.csv'))
    x, y, x_test, y_test = util.format_data_torch(
        *data.train_val_split(x, y, frac_train=0.75, seed=args.seed))
else:
    # NOTE(review): execution continues with x/y undefined after this print,
    # so data.standardize below will raise NameError; consider raising a
    # ValueError here instead.
    print('dataset not found')

if args.validation_mode:
    # split training set into smaller training set and validation set, use as training set and test set going forward
    x, y, x_test, y_test = data.train_val_split(x,
                                                y,
                                                frac_train=0.8,
                                                seed=args.seed + 50)

# Standardize inputs/targets (statistics presumably from the training split
# -- confirm in data.standardize).
x, y, x_test, y_test = data.standardize(x, y, x_test, y_test)

# center around zero
x = x - 0.5
x_test = x_test - 0.5
# --- Beispiel #3 ---
# Load a cached train/val split if one exists, otherwise build and cache it.
pickle_path = os.path.join(opt.data_dir, 'data.pickle')
if not os.path.isfile(pickle_path):
    print('Making train val split')
    files, labels, label_names = data.read_grouped_filenames_and_labels(opt.data_dir)
    train_files, train_labels, val_files, val_labels = data.train_val_split(files, labels)
    pickle_dict = {
        'label_names': label_names,
        'train_files': train_files,
        'train_labels': train_labels,
        'val_files': val_files,
        'val_labels': val_labels,
    }
    with open(pickle_path, 'wb') as f:
        pickle.dump(pickle_dict, f)
else:
    print('Loading premade train val split')
    with open(pickle_path, 'rb') as f:
        pickle_dict = pickle.load(f)
    label_names = pickle_dict['label_names']
    train_files = pickle_dict['train_files']
    train_labels = pickle_dict['train_labels']
    # Older caches stored the hold-out set under 'test_*' keys; fall back to
    # those when the 'val_*' keys are absent.
    val_files = pickle_dict['val_files' if 'val_files' in pickle_dict else 'test_files']
    val_labels = pickle_dict['val_labels' if 'val_labels' in pickle_dict else 'test_labels']

# A single pass suffices when only evaluating; otherwise train for 50 epochs.
nepochs = 1 if opt.phase == 'test' else 50
if opt.phase == 'test':
    test_files, test_labels, label_names = data.read_grouped_filenames_and_labels(opt.test_dir)


opt.num_classes = 50  # NOTE: hard-coded; was len(label_names)
# --- Beispiel #4 ---
def run_gp(dataset, dataset_option, seed, x_plot, sig2, dir_data, dir_out):
    """Fit an RBF-kernel GP regression to a 1-D dataset and save results.

    Args:
        dataset: one of 'motorcycle', 'finance', 'mimic_gap', 'GPdata',
            'finance_nogap', 'mimic'.
        dataset_option: dataset-specific selector (patient id, lengthscale, ...).
        seed: seed forwarded to the dataset loaders / splitters.
        x_plot: dense grid on which posterior predictions and samples are drawn.
        sig2: fixed Gaussian observation-noise variance.
        dir_data: directory the dataset loaders read from.
        dir_out: output directory; results are written to 'samples.npy'.

    Raises:
        ValueError: if `dataset` is not one of the supported names.
    """
    ## dataset
    if dataset == 'motorcycle':
        x, y, x_test, y_test = reshape_dataset(
            *data.load_motorcycle(dir_data, seed=seed))
    elif dataset == 'finance':
        x, y, x_test, y_test = reshape_dataset(
            *data.load_finance2(dir_data, seed=seed))
    elif dataset == 'mimic_gap':
        x, y, x_test, y_test = reshape_dataset(
            *utils_load_dataset.load_mimic_patient_gap(
                dir_data, subject_id=dataset_option))
    elif dataset == 'GPdata':
        x, y, x_val, y_val, x_test, y_test, _, _ = data.load_GPdata(
            dir_data, lengthscale=dataset_option)
        x, y, x_test, y_test = reshape_dataset(x, y, x_test, y_test)
    elif dataset == 'finance_nogap':
        x, y, x_test, y_test = reshape_dataset(
            *data.load_finance2(dir_data, gaps=None, seed=seed))
    elif dataset == 'mimic':
        x, y = utils_load_dataset.load_mimic_patient(
            patient_id=int(dataset_option),
            csv_filename=os.path.join(dir_data, 'hr.csv'))
        x, y, x_test, y_test = reshape_dataset(
            *data.train_val_split(x, y, frac_train=0.75, seed=seed))
    else:
        # Previously fell through and crashed later with NameError on `x`;
        # fail fast with a clear message instead.
        raise ValueError('dataset not found: {}'.format(dataset))

    x, y, x_test, y_test = data.standardize(x, y, x_test, y_test)

    ## fit gp
    kernel = GPy.kern.RBF(input_dim=1, variance=1.0, lengthscale=1.0)
    m = GPy.models.GPRegression(x, y, kernel)
    # Noise variance is held fixed at sig2; only kernel hyperparameters are
    # optimized, with the lengthscale bounded away from degenerate values.
    m.Gaussian_noise.variance.fix()
    m.Gaussian_noise.variance = sig2
    m.kern.lengthscale.constrain_bounded(1 / (5 * 2 * np.pi), 10)
    m.optimize_restarts(num_restarts=10, verbose=False)

    # posterior metrics
    # posterior_samples_f returns (n_points, output_dim, n_samples);
    # move the sample axis to the front.
    y_plot_samp = m.posterior_samples_f(x_plot, size=1000)
    y_plot_samp = np.moveaxis(y_plot_samp, [0, 1, 2], [1, 2, 0])

    y_plot_pred, y_plot_pred_var = m.predict(x_plot,
                                             full_cov=False,
                                             include_likelihood=False)
    y_test_pred, y_test_pred_var = m.predict(x_test,
                                             full_cov=False,
                                             include_likelihood=False)

    y_test_samp = m.posterior_samples_f(x_test, size=1000)
    y_test_samp = np.moveaxis(y_test_samp, [0, 1, 2], [1, 2, 0])

    ll_test = np.mean(m.log_predictive_density(x_test, y_test))

    rmse_test = np.sqrt(np.mean((y_test_pred - y_test)**2))

    # save
    samples = {
        'x_plot': x_plot,
        'y_plot_pred': np.squeeze(y_plot_pred),
        # Fix: this key previously stored y_test_pred_var (copy-paste bug),
        # leaving the computed y_plot_pred_var unused.
        'y_plot_pred_var': y_plot_pred_var,
        'y_plot_samp': y_plot_samp,
        'y_test_pred': np.squeeze(y_test_pred),
        'y_test_pred_var': y_test_pred_var,
        'y_test_samp': y_test_samp,
        'x': x,
        'y': y,
        'x_test': x_test,
        'y_test': y_test,
        'sig2': m.Gaussian_noise.variance.item(),
        'll_test': ll_test,
        'rmse_test': rmse_test,
        'kernel_lengthscale': m.kern.lengthscale.item(),
        'kernel_variance': m.kern.variance.item()
    }

    if not os.path.exists(dir_out):
        os.makedirs(dir_out)
    # Saving a dict relies on np.save's pickle fallback (allow_pickle=True).
    np.save(os.path.join(dir_out, 'samples.npy'), samples)