def main(args=sys.argv[1:]):
    args = parse_args(args)
    print(args)

    train_aug_X, train_y, train_patients = read_and_extract_features(args, "train")
    test_aug_X, test_y, test_patients = read_and_extract_features(args, "test")

    # Impute missing values using medians fit on the training data only
    print('Imputing missing values ...')
    imputer = SimpleImputer(strategy="median")
    imputer.fit(train_aug_X)
    imputed_train_X = imputer.transform(train_aug_X)
    print("train data shape", imputed_train_X.shape)
    imputed_test_X = imputer.transform(test_aug_X)

    # Save the imputed datasets along with the fitted imputer
    train_data = Dataset(x=imputed_train_X, y=train_y, group_id=train_patients)
    support_sim_settings = SupportSimSettingsComplex.create_from_dataset(
        train_data.x, args.inflation_factor)
    train_data_dict = {
        "train": train_data,
        "support_sim_settings": support_sim_settings,
        "imputer": imputer,
    }
    pickle_to_file(train_data_dict, args.out_train_data)

    test_data = Dataset(x=imputed_test_X, y=test_y, group_id=test_patients)
    pickle_to_file(test_data, args.out_test_data)
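# Illustrative aside (hedged sketch, not part of the pipeline): the imputation
# above follows the standard fit-on-train / transform-on-both pattern, so the
# test split is filled with *training* medians and no test information leaks
# into the model. The toy arrays below are hypothetical stand-ins for the
# extracted feature matrices.
def _demo_median_imputation():
    import numpy as np
    from sklearn.impute import SimpleImputer

    train_X = np.array([[1.0, np.nan], [3.0, 4.0], [5.0, 6.0]])
    test_X = np.array([[np.nan, 2.0], [7.0, np.nan]])

    imputer = SimpleImputer(strategy="median")
    imputer.fit(train_X)  # medians computed from train only: [3.0, 5.0]
    print(imputer.transform(train_X))  # [[1. 5.] [3. 4.] [5. 6.]]
    print(imputer.transform(test_X))   # [[3. 2.] [7. 5.]]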
def main(args=sys.argv[1:]):
    MIMIC_TEST = "experiment_mimic/_output/data/valid_data_%d_%d.csv"
    args = parse_args(args)
    logging.basicConfig(format="%(message)s", filename=args.log_file, level=logging.DEBUG)
    print(args)
    logging.info(args)

    trial_data = TrialData()
    for time_key in range(args.start_year, args.start_year + args.num_years):
        for quarter in range(4):
            path_time = MIMIC_TEST % (time_key, quarter)
            raw_dataset = np.genfromtxt(path_time)
            if len(raw_dataset.shape) == 1:
                # A single row is read in as a 1-D array; restore the 2-D shape
                raw_dataset = raw_dataset.reshape((1, -1))
                print("VALIDATION DATA ONLY SIZE 1")
            print(raw_dataset.shape)
            if raw_dataset.shape[0] < args.min_batch_size:
                print("SKIPPING THIS BATCH. TOO SMALL", raw_dataset.shape)
                continue
            print("year q", time_key, quarter)
            # First column is the label, the rest are features
            dataset = Dataset(raw_dataset[:, 1:], raw_dataset[:, 0], num_classes=2)
            trial_data.add_batch(dataset)

    nature = FixedNature(trial_data=trial_data)
    pickle_to_file(nature, args.out_file)
def main(args=sys.argv[1:]):
    args = parse_args(args)
    logging.basicConfig(format="%(message)s", filename=args.log_file, level=logging.DEBUG)
    print(args)
    logging.info(args)

    # Read all data
    data_dict = pickle_from_file(args.data_file)

    # Get the appropriate data split
    split_dict = pickle_from_file(args.data_split_file)
    recalib_data = data_dict["train"].subset(split_dict["recalibrate_idxs"])

    # Load the fitted model and recalibrate using the matching density family
    fitted_model = load_model(args.fitted_file)
    family = fitted_model.density_parametric_form
    if family == "gaussian":
        coverage_dict = recalibrate_intervals_gaussian(fitted_model, recalib_data, args)
    elif family == "bernoulli":
        coverage_dict = recalibrate_intervals_bernoulli(fitted_model, recalib_data, args)
    elif "multinomial" in family:
        coverage_dict = recalibrate_intervals_multinomial(fitted_model, recalib_data, args)
    else:
        raise ValueError("Unrecognized density parametric form: %s" % family)

    print(coverage_dict)
    pickle_to_file(coverage_dict, args.out_file)
def main(args=sys.argv[1:]):
    args = parse_args(args)
    logging.basicConfig(format="%(message)s", filename=args.log_file, level=logging.DEBUG)
    print(args)
    logging.info(args)

    # Read all data
    data_dict = pickle_from_file(args.data_file)

    # Get the appropriate data split
    split_dict = pickle_from_file(args.data_split_file)
    recalib_data = data_dict["train"].subset(split_dict["recalibrate_idxs"])

    # Load the fitted model and recalibrate at each requested alpha
    fitted_model = load_model(args.fitted_file)
    coverage_dict = {}
    for alpha in args.alphas:
        recalibrator = DecisionIntervalRecalibrator(fitted_model, alpha)
        inference_dict = recalibrator.recalibrate(recalib_data)
        print("RECALIB INF DICT", inference_dict["cov_given_accept"])
        est_cov_given_accept = inference_dict["cov_given_accept"]["mean"]
        logging.info(
            "Alpha %f, ideal cov %f, est cov|accept %f",
            alpha, 1 - alpha, est_cov_given_accept)
        logging.info(get_normal_ci(inference_dict["cov_given_accept"]))
        coverage_dict[alpha] = inference_dict

    pickle_to_file(coverage_dict, args.out_file)
def main(args=sys.argv[1:]):
    args = parse_args(args)
    logging.basicConfig(format="%(message)s", filename=args.log_file, level=logging.DEBUG)
    print(args)
    logging.info(args)
    np.random.seed(args.seed)

    if args.support_setting == "constant":
        support_sim_settings = SupportSimSettingsUniform(
            args.num_p,
            min_func_name="min_x_func_constant",
            max_func_name="max_x_func_constant")
    elif args.support_setting == "changing":
        # This branch is currently disabled; the settings below are unreachable.
        raise NotImplementedError("The 'changing' support setting is disabled")
        support_sim_settings = SupportSimSettingsNormal(
            args.num_p,
            std_func_name="std_func_changing",
            mu_func_name="mu_func_changing")
    else:
        raise ValueError("Unrecognized support setting: %s" % args.support_setting)

    data_gen = DataGenerator(
        args.density_parametric_form,
        args.sim_func_name,
        support_sim_settings,
        max_y=args.max_y,
        min_y=args.min_y)
    trial_data = TrialData(data_gen, args.batch_sizes)
    for batch_index in range(args.num_batches):
        trial_data.make_new_batch()

    out_dict = {"meta": trial_data.make_meta_data(), "data": trial_data}
    print(out_dict["meta"])
    pickle_to_file(out_dict, args.out_file)
def main(args=sys.argv[1:]):
    args = parse_args(args)
    logging.basicConfig(format="%(message)s", filename=args.log_file, level=logging.DEBUG)
    print(args)
    logging.info(args)
    scratch_dir = make_scratch_dir(args)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    # Read data
    data_dict = pickle_from_file(args.data_file)

    # Get the appropriate data split
    split_dict = pickle_from_file(args.data_split_file)
    train_split_dataset = data_dict["train"].subset(split_dict["train_idxs"])

    # TODO: infer this more cleanly. Append the class count to the
    # multinomial form name so downstream code knows the output dimension.
    if args.density_parametric_form == "multinomial":
        print("num classes", train_split_dataset.num_classes)
        density_parametric_form = "multinomial%d" % train_split_dataset.num_classes
    else:
        density_parametric_form = args.density_parametric_form

    # Set up the hyperparameters we will tune over
    param_grid = [{
        'density_layer_sizes': args.density_layer_sizes,
        'density_parametric_form': [density_parametric_form],
        'density_weight_param': args.density_weight_params,
        'dropout_rate': args.dropout_rate,
        'weight_penalty_type': [args.weight_penalty_type],
        'max_iters': [args.max_iters],
        'num_ensemble': [args.num_ensemble],
        'num_inits': [args.num_inits],
        'act_func': [args.act_func],
        'learning_rate': [args.learning_rate],
        'do_distributed': [args.do_distributed],
        'scratch_dir': [scratch_dir],
    }]

    # Fit the model via cross-validation
    fitted_model, best_hyperparams, cv_results = do_cross_validation(
        train_split_dataset,
        nn_class=EnsembleDensityNN,
        param_grid=param_grid,
        cv=args.cv)
    logging.info("Best hyperparams %s", best_hyperparams)

    # Save the model
    pickle_to_file({
        "nn_class": EnsembleDensityNN,
        "fitted_params": [nn.model_params for nn in fitted_model.nns],
        "hyperparams": best_hyperparams,
        "cv_results": cv_results,
    }, args.fitted_file)
def main(args=sys.argv[1:]):
    args = parse_args(args)
    agg_model_preds_and_targets = AggModelPredsAndTargets()
    for year in range(args.start_year, args.start_year + args.num_years):
        for split_idx in range(args.start_num_year_splits, args.end_num_year_splits):
            prefetch_file = args.path_template % (year, split_idx)
            model_preds_and_targets = pickle_from_file(prefetch_file)
            agg_model_preds_and_targets.append(model_preds_and_targets)
    pickle_to_file(agg_model_preds_and_targets, args.out_file)
def main(args=sys.argv[1:]):
    args = parse_args(args)
    logging.basicConfig(format="%(message)s", filename=args.log_file, level=logging.DEBUG)
    print(args)
    logging.info(args)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    # Read data
    data_dict = pickle_from_file(args.data_file)
    assert data_dict["support_sim_settings"].check_dataset(data_dict["train"])

    # Get the appropriate data split
    split_dict = pickle_from_file(args.data_split_file)
    train_split_dataset = data_dict["train"].subset(split_dict["train_idxs"])

    # Set up the hyperparameters we will tune over
    param_grid = [{
        'interval_alpha': [args.interval_alpha],
        'decision_layer_sizes': args.decision_layer_sizes,
        'interval_layer_sizes': args.interval_layer_sizes,
        'decision_weight_param': args.decision_weight_params,
        'interval_weight_param': args.interval_weight_params,
        'weight_penalty_type': [args.weight_penalty_type],
        'cost_decline': [args.cost_decline],
        'do_no_harm_param': args.do_no_harm_params,
        'log_barrier_param': args.log_barrier_params,
        'max_iters': [args.max_iters],
        'num_inits': [args.num_inits],
        'act_func': [args.act_func],
        'learning_rate': [args.learning_rate],
        'support_sim_settings': [data_dict["support_sim_settings"]],
        'support_sim_num': [args.support_sim_num],
    }]

    # Fit the model via cross-validation
    fitted_model, best_hyperparams, cv_results = do_cross_validation(
        train_split_dataset,
        nn_class=SimultaneousIntervalDecisionNNs,
        param_grid=param_grid,
        cv=args.cv)
    logging.info("Best hyperparams %s", best_hyperparams)

    # Save the model
    pickle_to_file(
        {
            "nn_class": SimultaneousIntervalDecisionNNs,
            "fitted_params": fitted_model.model_params,
            "hyperparams": best_hyperparams,
            "cv_results": cv_results,
        }, args.fitted_file)
def main(args=sys.argv[1:]):
    args = parse_args(args)
    print(args)
    np.random.seed(args.seed)
    all_approval_list = [
        pickle_from_file(history_file) for history_file in args.history_files
    ]
    for hist in all_approval_list:
        print(hist)
    # Index the approval histories by policy name
    all_approval_dict = {x.policy_name: x for x in all_approval_list}
    pickle_to_file(all_approval_dict, args.out_file)
def main(args=sys.argv[1:]):
    args = parse_args(args)
    print(args)
    np.random.seed(args.seed)

    # Read data
    data_dict = pickle_from_file(args.in_data_file)
    full_data = data_dict["train"]

    # Split at the group level so no group straddles the two sets
    unique_groups = np.unique(full_data.group_id)
    shuffled_order = np.random.permutation(unique_groups)
    if args.recalibrate_num is not None:
        num_recalibrate = args.recalibrate_num
        train_groups = shuffled_order[:-num_recalibrate]
        recalibrate_groups = shuffled_order[-num_recalibrate:]
    else:
        fold_size = int(unique_groups.size / args.k_folds) + 1
        start_idx = args.fold_idx * fold_size
        end_idx = min((args.fold_idx + 1) * fold_size, unique_groups.size)
        print("number in recalibrated groups", end_idx - start_idx)
        train_groups = np.concatenate(
            [shuffled_order[:start_idx], shuffled_order[end_idx:]])
        recalibrate_groups = shuffled_order[start_idx:end_idx]

    train_idxs = np.isin(full_data.group_id, train_groups).flatten()
    assert train_idxs.size > 1

    # For recalibration, we only grab one random observation per group
    recalibrate_idxs = []
    for recalib_group_id in recalibrate_groups:
        matching_obs_idxs = np.where(full_data.group_id == recalib_group_id)[0]
        random_matching_obs_idx = np.random.choice(matching_obs_idxs)
        recalibrate_idxs.append(random_matching_obs_idx)
    recalibrate_idxs = np.array(recalibrate_idxs)
    assert recalibrate_idxs.size > 1
    # Double-check we grabbed a single random observation per group
    assert np.unique(
        full_data.group_id[recalibrate_idxs]).size == recalibrate_idxs.size

    # Write data to file
    print("num train", train_idxs.size)
    pickle_to_file(
        {
            "train_idxs": train_idxs,
            "recalibrate_idxs": recalibrate_idxs,
            "support_sim_settings": data_dict["support_sim_settings"],
        }, args.out_file)
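# Illustrative aside (hedged sketch, not part of the pipeline): the split above
# is group-aware -- whole patient groups go to either training or recalibration,
# and each recalibration group contributes exactly one random observation, so
# the recalibration observations are independent across groups. The toy arrays
# below are hypothetical stand-ins for `full_data.group_id`.
def _demo_group_split():
    import numpy as np

    rng = np.random.default_rng(0)
    group_id = np.array([0, 0, 1, 1, 1, 2, 2, 3, 3, 3])

    unique_groups = np.unique(group_id)
    shuffled = rng.permutation(unique_groups)
    train_groups, recalib_groups = shuffled[:-1], shuffled[-1:]

    # Boolean mask over observations whose group landed in the training set
    train_idxs = np.isin(group_id, train_groups)
    # One random observation per held-out group
    recalib_idxs = np.array(
        [rng.choice(np.where(group_id == g)[0]) for g in recalib_groups])
    print(train_idxs, recalib_idxs)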
def main(args=sys.argv[1:]):
    args = parse_args(args)
    models = []
    for year in range(args.start_year, args.start_year + args.num_years):
        for quarter in range(4):
            model_file = args.path_template % (year, quarter)
            print("model", model_file)
            assert os.path.exists(model_file)
            # Load the first model eagerly; keep only file paths for the rest
            if len(models) == 0:
                models.append(pickle_from_file(model_file))
            else:
                models.append(model_file)
    proposer = FixedProposer(models)
    print("pickling...")
    pickle_to_file(proposer, args.out_file)
def main(args=sys.argv[1:]):
    args = parse_args(args)
    logging.basicConfig(format="%(message)s", filename=args.log_file, level=logging.DEBUG)
    print(args)
    logging.info(args)
    np.random.seed(args.seed)
    proposer = LassoProposer(
        args.density_parametric_form,
        eps=args.proposer_eps,
        n_alphas=args.proposer_alphas,
        cv=args.proposer_cv,
        num_back_batches=args.proposer_batches)
    pickle_to_file(proposer, args.out_file)
def main(args=sys.argv[1:]):
    args = parse_args(args)
    YEARS = range(args.start_year, args.start_year + args.num_years)
    MONTHS = range(1, 1 + args.num_months)
    model_paths = []
    prev_model_file = None
    for year in YEARS:
        for month in MONTHS:
            model_file = args.path_template % (year, month)
            print("model", model_file, os.path.exists(model_file))
            if os.path.exists(model_file):
                model_paths.append(model_file)
                prev_model_file = model_file
            else:
                # Fall back to the most recent model that does exist
                assert prev_model_file is not None, "no earlier model available"
                model_paths.append(prev_model_file)
    proposer = FixedProposerFromFile(
        model_paths, criterion_str="l1", max_loss=args.max_loss)
    pickle_to_file(proposer, args.out_file)
def main(args=sys.argv[1:]):
    args = parse_args(args)
    nature = pickle_from_file(args.nature_file)
    approval_hist = ApprovalHistory(human_max_loss=1, policy_name="Placeholder")
    model = pickle_from_file(args.model_file)
    proposer = FixedProposer([model])

    # Begin the simulation by introducing the singleton model
    proposer.propose_model(None, None)
    model_pred_targets = ModelPredsAndTargets()
    for t in range(nature.total_time - 1):
        print("prefetching time", t)
        sub_trial_data = nature.get_trial_data(t + 1)
        obs_batch_data = sub_trial_data.batch_data[-1]
        batch_preds, batch_target = proposer.get_model_preds_and_target(obs_batch_data)
        model_pred_targets.append(batch_preds, batch_target)
        nature.next(approval_hist)
    pickle_to_file(model_pred_targets, args.out_file)
def main(args=sys.argv[1:]):
    args = parse_args(args)
    logging.basicConfig(format="%(message)s", filename=args.log_file, level=logging.DEBUG)
    print(args)
    logging.info(args)

    times = []
    YEARS = range(args.start_year, args.start_year + args.num_years)
    MONTHS = range(1, 1 + args.num_months)
    for year in YEARS:
        for month in MONTHS:
            times.append((year, month))

    trial_data = TrialDataFromDisk()
    for time_key in times:
        path_time = args.valid_data_template % time_key
        trial_data.add_batch(path_time, args.batch_size)
    nature = FixedNature(trial_data=trial_data)
    pickle_to_file(nature, args.out_file)
def main(args=sys.argv[1:]):
    args = parse_args(args)
    print(args)
    np.random.seed(args.seed)
    data_gen = DataGenerator(
        sim_func_form=args.sim_func_form,
        sim_func_name=args.sim_func,
        num_p=args.num_p,
        num_classes=args.num_classes,
        noise_sd=args.sim_noise_sd,
        std_dev_x=args.std_dev_x,
        max_x=args.max_x)
    train_data, support_sim_settings = data_gen.create_data(args.num_train)

    # Write data to file
    pickle_to_file(
        {
            "train": train_data,
            "support_sim_settings": support_sim_settings,
            "data_gen": data_gen,
        }, args.out_data_file)
def main(args=sys.argv[1:]):
    args = parse_args(args)
    logging.basicConfig(format="%(message)s", filename=args.log_file, level=logging.DEBUG)
    print(args)
    logging.info(args)
    np.random.seed(args.seed)

    if args.support_setting == "constant":
        support_sim_settings = SupportSimSettingsUniform(
            args.num_p,
            min_func_name="min_x_func_constant",
            max_func_name="max_x_func_constant")
    elif args.support_setting == "changing":
        # This branch is currently disabled; the settings below are unreachable.
        raise NotImplementedError("The 'changing' support setting is disabled")
        support_sim_settings = SupportSimSettingsNormal(
            args.num_p,
            std_func_name="std_func_changing",
            mu_func_name="mu_func_changing")
    else:
        raise ValueError("Unrecognized support setting: %s" % args.support_setting)

    data_gen = DataGenerator(
        args.density_parametric_form,
        args.sim_func_name,
        support_sim_settings,
        max_y=args.max_y,
        min_y=args.min_y)
    proposer = get_proposer(args, data_gen)
    pickle_to_file(proposer, args.out_file)
def main(args=sys.argv[1:]):
    args = parse_args(args)
    logging.basicConfig(format="%(message)s", filename=args.log_file, level=logging.DEBUG)
    print(args)
    logging.info(args)
    np.random.seed(args.seed)

    nature = pickle_from_file(args.nature_file)
    logging.info("BATCH SIZES %s", nature.batch_sizes)
    proposer = pickle_from_file(args.proposer_file)

    nature.next(None)
    model = proposer.propose_model(nature.get_trial_data(0), None)
    if args.human_max_loss is None:
        # Default the human max loss to the mean score on the initial test data
        args.human_max_loss = np.mean(
            proposer.score_models(
                nature.create_test_data(time_t=0, num_obs=args.num_test_obs))[0])
    logging.info("HUMAN MAX %f", args.human_max_loss)
    nature.next(None)

    print("POLICY")
    policy = create_policy(
        args.policy_name,
        args,
        human_max_loss=args.human_max_loss,
        drift=args.human_max_loss * args.drift_scale,
        total_time=nature.total_time,
        num_experts=nature.total_time,
        batch_size=np.mean(nature.batch_sizes[1:]))

    st_time = time.time()
    if args.prefetched_file is None:
        sim = Simulation(
            nature,
            proposer,
            policy,
            args.human_max_loss,
            num_test_obs=args.num_test_obs,
            holdout_last_batch=args.holdout_last_batch)
    else:
        prefetched = pickle_from_file(args.prefetched_file)
        sim = SimulationPrefetched(
            nature,
            proposer,
            prefetched,
            policy,
            args.human_max_loss,
            num_test_obs=args.num_test_obs,
            holdout_last_batch=args.holdout_last_batch)

    # Run the simulation, checkpointing the approval history as we go
    sim.run(lambda approval_hist: pickle_to_file(approval_hist, args.out_file))
    logging.info(sim.approval_hist)
    print(sim.approval_hist)
    logging.info("run time %d", time.time() - st_time)

    pickle_to_file(sim.approval_hist, args.out_file)
    if args.out_nature_file is not None:
        pickle_to_file(nature.to_fixed(), args.out_nature_file)
def main(args=sys.argv[1:]):
    args = parse_args(args)
    logging.basicConfig(format="%(message)s", filename=args.log_file, level=logging.DEBUG)
    print(args)
    logging.info(args)
    nn_class = EnsembleSimultaneousDensityDecisionNNs
    scratch_dir = make_scratch_dir(args)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    # Read data
    data_dict = pickle_from_file(args.data_file)
    #assert data_dict["support_sim_settings"].check_dataset(data_dict["train"])

    # Get the appropriate data split
    split_dict = pickle_from_file(args.data_split_file)
    print(split_dict["train_idxs"])
    print(data_dict["train"].x.shape)
    print(data_dict["train"].y.shape)
    print(split_dict["train_idxs"].shape)
    train_split_dataset = data_dict["train"].subset(split_dict["train_idxs"])
    print(train_split_dataset.y.shape)

    # Append the class count to the multinomial form name
    if args.density_parametric_form == "multinomial":
        print("num classes", train_split_dataset.num_classes)
        density_parametric_form = "multinomial%d" % train_split_dataset.num_classes
    else:
        density_parametric_form = args.density_parametric_form

    # Optionally replace the support settings with an empirical version
    if args.use_train_data_support:
        support_data = train_split_dataset
        old_support_settings = data_dict["support_sim_settings"]
        data_dict["support_sim_settings"] = SupportSimSettingsEmpirical(
            support_data.x,
            scale=args.support_empirical_scale,
            min_x=old_support_settings.min_x,
            max_x=old_support_settings.max_x)
    elif args.empirical_support_file:
        empirical_support = pickle_from_file(args.empirical_support_file)
        old_support_settings = data_dict["support_sim_settings"]
        data_dict["support_sim_settings"] = SupportSimSettingsEmpirical(
            empirical_support,
            scale=args.support_empirical_scale,
            min_x=old_support_settings.min_x,
            max_x=old_support_settings.max_x)

    # Set up the hyperparameters we will tune over
    param_grid = [{
        'density_layer_sizes': args.density_layer_sizes,
        'decision_layer_sizes': args.decision_layer_sizes,
        'dropout_rate': args.dropout_rate,
        'density_parametric_form': [density_parametric_form],
        'density_weight_param': args.density_weight_params,
        'decision_weight_param': args.decision_weight_params,
        'weight_penalty_type': [args.weight_penalty_type],
        'cost_decline': [args.cost_decline],
        'do_no_harm_param': args.do_no_harm_params,
        'log_barrier_param': args.log_barrier_params,
        'max_iters': [args.max_iters],
        'num_inits': [args.num_inits],
        'num_ensemble': [args.num_ensemble],
        'do_distributed': [args.do_distributed],
        'scratch_dir': [scratch_dir],
        'act_func': [args.act_func],
        'learning_rate': [args.learning_rate],
        'support_sim_settings': [data_dict["support_sim_settings"]],
        'support_sim_num': [args.support_sim_num],
    }]

    # Fit the model via cross-validation
    fitted_model, best_hyperparams, cv_results = do_cross_validation(
        train_split_dataset,
        nn_class=nn_class,
        param_grid=param_grid,
        cv=args.cv)
    logging.info("Best hyperparams %s", best_hyperparams)

    # Save the model
    pickle_to_file({
        "nn_class": nn_class,
        "fitted_params": [m.model_params for m in fitted_model.nns],
        "hyperparams": best_hyperparams,
        "cv_results": cv_results,
    }, args.fitted_file)

    # Double-check the fitted model can still make predictions
    fitted_model.get_accept_prob(train_split_dataset.x[:10, :])
    fitted_model.get_prediction_interval(train_split_dataset.x[:10, :])
def main(args=sys.argv[1:]):
    args = parse_args(args)
    print(args)
    np.random.seed(0)

    # Load MNIST, scale pixels to [0, 1], and flatten the images
    mnist = tf.keras.datasets.mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0
    x_train = x_train.reshape((x_train.shape[0], -1))
    x_test = x_test.reshape((x_test.shape[0], -1))

    # Train on only the first 9 digit classes; the test set keeps all 10
    num_classes = 10
    num_train_classes = 9
    data_mask = y_train < num_train_classes
    x_train = x_train[data_mask]
    y_train = y_train[data_mask]
    y_train_categorical = np.zeros((y_train.size, num_train_classes))
    y_train_categorical[np.arange(y_train.size), y_train] = 1
    y_test_categorical = np.zeros((y_test.size, num_classes))
    y_test_categorical[np.arange(y_test.size), y_test] = 1

    # Fashion-MNIST serves as the out-of-distribution ("weird") data
    (_, _), (weird_x, _) = tf.keras.datasets.fashion_mnist.load_data()
    weird_x = weird_x / 255.0
    weird_x = weird_x.reshape((weird_x.shape[0], -1))

    if args.do_pca:
        pca = PCA(n_components=300, whiten=True)
        x_train = pca.fit_transform(x_train)
        print(pca.explained_variance_ratio_)
        x_test = pca.transform(x_test)
        weird_x = pca.transform(weird_x)

    num_p = x_train.shape[1]
    min_x = np.min(np.concatenate([x_train, x_test]), axis=0).reshape((1, -1))
    max_x = np.max(np.concatenate([x_train, x_test]), axis=0).reshape((1, -1))
    support_sim_settings = SupportSimSettingsContinuousMulti(num_p, min_x=min_x, max_x=max_x)

    train_data = Dataset(x=x_train, y=y_train_categorical, num_classes=num_train_classes)
    train_data_dict = {
        "train": train_data,
        "support_sim_settings": support_sim_settings,
    }
    pickle_to_file(train_data_dict, args.out_train_data)

    random_idx = np.random.choice(x_test.shape[0], size=4000, replace=False)
    test_data = Dataset(
        x=x_test[random_idx, :],
        y=y_test_categorical[random_idx, :],
        num_classes=num_classes)
    pickle_to_file(test_data, args.out_test_data)

    # Keep only the weird observations that fall within the training support
    check_supp = support_sim_settings.check_obs_x(weird_x)
    print("NUM WEIRD", weird_x.shape)
    print("NUM WEIRD IN SUPPORT", np.sum(check_supp))
    weird_x = weird_x[check_supp, :]
    pickle_to_file(weird_x, args.out_weird_data)
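# Illustrative aside (hedged sketch): the one-hot encoding above relies on
# paired fancy indexing -- `y_categorical[np.arange(n), y] = 1` sets entry
# (i, y[i]) for every row i in a single vectorized assignment. Toy labels below.
def _demo_one_hot():
    import numpy as np

    y = np.array([0, 2, 1, 2, 0])
    num_classes = 3
    y_categorical = np.zeros((y.size, num_classes))
    y_categorical[np.arange(y.size), y] = 1  # row i gets a 1 in column y[i]
    print(y_categorical)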
def main(args=sys.argv[1:]):
    args = parse_args(args)
    logging.basicConfig(format="%(message)s", filename=args.log_file, level=logging.DEBUG)
    print(args)
    logging.info(args)
    np.random.seed(args.seed)

    if args.support_setting == "constant":
        support_sim_settings = SupportSimSettingsUniform(
            args.num_p,
            min_func_name="min_x_func_constant",
            max_func_name="max_x_func_constant")
    elif args.support_setting == "changing":
        # This branch is currently disabled; the settings below are unreachable.
        raise NotImplementedError("The 'changing' support setting is disabled")
        support_sim_settings = SupportSimSettingsNormal(
            args.num_p,
            std_func_name="std_func_changing",
            mu_func_name="mu_func_changing")
    else:
        raise ValueError("Unrecognized support setting: %s" % args.support_setting)

    data_gen = DataGenerator(
        args.density_parametric_form,
        args.sim_func_name,
        support_sim_settings,
        noise_sd=args.y_sigma,
        max_y=args.max_y,
        min_y=args.min_y)

    trial_data = TrialData(args.batch_sizes)
    init_coef = np.zeros(args.num_p)
    init_coef[:args.num_coefs] = args.coef_scale
    new_coef = init_coef
    all_coefs = []
    last_coef_change = 0
    did_drift = False
    for batch_index in range(args.num_batches):
        do_drift = (
            batch_index % args.drift_cycle == args.drift_cycle - 1
            if args.drift_cycle > 0
            else False)
        if do_drift:
            # Drift: zero out some active coefficients and activate inactive ones
            print("DRIFT", do_drift)
            new_coef = np.copy(new_coef)
            to0_rand_idx = np.random.choice(
                np.where(np.abs(new_coef) > 0)[0], size=args.num_coef_drift)
            to1_rand_idx = np.random.choice(
                np.where(np.abs(new_coef) <= 1e-10)[0], size=args.num_coef_drift)
            new_coef[to0_rand_idx] = 0
            new_coef[to1_rand_idx] = np.max(init_coef)
            last_coef_change = batch_index - 1
            did_drift = True
        elif did_drift and np.random.rand() < args.prob_revert_drift:
            # Try reverting to the pre-drift coefficients
            print("REVERT", batch_index)
            new_coef = all_coefs[last_coef_change]
            last_coef_change = batch_index - 1
            did_drift = False
        else:
            did_drift = False
        new_data = data_gen.create_data(
            args.batch_sizes[batch_index], batch_index, coef=new_coef)
        all_coefs.append(new_coef)
        trial_data.add_batch(new_data)

    nature = FixedNature(data_gen, trial_data, coefs=all_coefs)
    pickle_to_file(nature, args.out_file)
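# Illustrative aside (hedged sketch, not part of the pipeline): one drift step
# above swaps the coefficient support -- it zeroes `num_coef_drift` random
# active coefficients and activates the same number of inactive ones, so the
# sparsity level stays fixed while the identity of the active features changes.
# Toy values below.
def _demo_drift_step():
    import numpy as np

    rng = np.random.default_rng(0)
    coef = np.zeros(8)
    coef[:3] = 2.0  # three active features
    new_coef = np.copy(coef)
    to_zero = rng.choice(np.flatnonzero(np.abs(new_coef) > 0), size=1)
    to_activate = rng.choice(np.flatnonzero(np.abs(new_coef) <= 1e-10), size=1)
    new_coef[to_zero] = 0
    new_coef[to_activate] = np.max(coef)
    print(coef, new_coef)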
def main(args=sys.argv[1:]):
    train_size = 0.5
    seed = 0

    # Read the y data
    outcomes = pd.read_csv("../data/Outcomes-a.txt")
    subject_outcomes = outcomes[["RecordID", "Length_of_stay", "Survival"]]

    # Create a dictionary of features for each subject.
    # We use a dictionary because some features don't appear in all subjects.
    value_range = {}  # only used for printing out the ranges of the values
    file_folder = "../data/set-a/"
    all_subject_features = {}
    for idx, filename in enumerate(os.listdir(file_folder)[:MAX_PROCESS]):
        df = pd.read_csv("%s%s" % (file_folder, filename))
        # Convert the HH:MM timestamps to minutes
        df["hour"] = np.array([time.split(":")[0] for time in df.Time.values], dtype=int)
        df["minute"] = np.array([time.split(":")[1] for time in df.Time.values], dtype=int)
        df.Time = df.hour * 60 + df.minute
        record_id = int(df.loc[0].Value)
        subject_features = {"RecordID": record_id}
        for feat_name, process_func_list in FEATURES.items():
            if WEIGHTED_MEAN in process_func_list:
                sub_df = df.loc[(df.Parameter == feat_name) & (df.Value > 0)]
            else:
                sub_df = df.loc[(df.Parameter == feat_name) & (df.Value >= 0)]
            if sub_df.shape[0] == 0:
                continue
            # Track the observed value range for each feature
            if feat_name not in value_range:
                value_range[feat_name] = [sub_df.Value.min(), sub_df.Value.max()]
            else:
                value_range[feat_name][0] = min(value_range[feat_name][0], sub_df.Value.min())
                value_range[feat_name][1] = max(value_range[feat_name][1], sub_df.Value.max())
            for func in process_func_list:
                value = func(sub_df)
                if not np.isfinite(value):
                    print(value, feat_name, func.__name__)
                    print(sub_df)
                assert np.isfinite(value)
                full_feature_name = "%s:%s" % (feat_name, func.__name__)
                subject_features[full_feature_name] = value

        # Derive the PaO2/FiO2 ratio when both measurements are available
        fio2_df = df.loc[df.Parameter == "FiO2"]
        pao2_df = df.loc[df.Parameter == "PaO2"]
        if fio2_df.shape[0] and pao2_df.shape[0]:
            fio2_mean = _get_mean(fio2_df)
            pao2_mean = _get_mean(pao2_df)
            if fio2_mean > 0:
                subject_features["O2:_get_ratio"] = pao2_mean / fio2_mean
        all_subject_features[idx] = subject_features

    for k, v in value_range.items():
        print(k, v)
    subjects_x = pd.DataFrame.from_dict(all_subject_features, orient="index")

    # Merge the X and Y data
    icu_subjects = subjects_x.merge(subject_outcomes, on="RecordID")
    print(icu_subjects["Survival"])
    icu_subjects["resp"] = np.maximum(
        icu_subjects["Length_of_stay"], icu_subjects["Survival"])
    icu_subjects = icu_subjects.drop(columns=["RecordID"])
    print(np.mean(icu_subjects["Survival"]))
    print(np.median(icu_subjects["Survival"]))
    print(np.max(icu_subjects["Survival"]))
    print(np.mean(icu_subjects["Length_of_stay"]))
    print(np.median(icu_subjects["Length_of_stay"]))
    print(np.max(icu_subjects["Length_of_stay"]))

    # Grab column names
    column_names = list(icu_subjects.columns.values)
    icu_subjects = icu_subjects.to_numpy()  # `as_matrix` was removed in pandas 1.0

    # Center the x covariates (the last three columns are the responses)
    centering_term = np.nanmean(icu_subjects, axis=0)
    centering_term[-1] = 0
    icu_subjects[:, :-3] -= centering_term[:-3]

    # Randomly split the data
    print(column_names)
    mats = train_test_split(
        icu_subjects,
        train_size=train_size,
        test_size=1.0 - train_size,
        random_state=seed)
    x_train = mats[0][:, :-3]
    y_train = mats[0][:, -1:]
    y_censored_train = mats[0][:, -2:-1] < 0
    x_test = mats[1][:, :-3]
    y_test = mats[1][:, -1:]
    y_censored_test = mats[1][:, -2:-1] < 0

    # Save the data as a pickle
    icu_train_data = data_generator.Dataset(x=x_train, y=y_train, is_censored=y_censored_train)
    icu_test_data = data_generator.Dataset(x=x_test, y=y_test, is_censored=y_censored_test)
    icu_processed_file = "../data/icu_data_processed.pkl"
    pickle_to_file({
        "train": icu_train_data,
        "test": icu_test_data,
    }, icu_processed_file)

    # Also write out the column names for reference
    icu_column_file = "../data/icu_data_column_names.txt"
    with open(icu_column_file, "w") as f:
        for i, col in enumerate(column_names[:-1]):
            f.write("%d, %s\n" % (i, col))
def main(args=sys.argv[1:]):
    args = parse_args(args)
    np.random.seed(args.seed)
    logging.basicConfig(format="%(message)s", filename=args.log_file, level=logging.DEBUG)
    logging.info(args)

    data_dict = pickle_from_file(args.data_file)
    test_data, _ = data_dict["data_gen"].create_data(args.num_test)

    # Load the fitted models and their coverage inference dicts, grouped by alpha
    fitted_models = []
    agg_dict = {}
    for fitted_file, coverage_file in zip(args.fitted_files, args.coverage_files):
        fitted_model = load_model(fitted_file)
        fitted_models.append(fitted_model)
        coverage_dict = pickle_from_file(coverage_file)
        for pi_alpha, inference_dict in coverage_dict.items():
            if pi_alpha not in agg_dict:
                agg_dict[pi_alpha] = []
            agg_dict[pi_alpha].append(inference_dict)

    unif_x = data_dict["support_sim_settings"].support_unif_rvs(args.num_test)
    unif_test_data = data_dict["data_gen"].create_data_given_x(unif_x)

    coverage_agg_results = {}
    for pi_alpha, inference_dicts in agg_dict.items():
        aggregator = DecisionIntervalAggregator(fitted_models, pi_alpha, inference_dicts)
        # Evaluate each model's recalibrated CI on its own fresh test data
        indiv_test_datas = [
            data_dict["data_gen"].create_data(args.num_test)[0]
            for _ in fitted_models
        ]
        indiv_test_inf_dicts = [
            DecisionIntervalRecalibrator(fitted_model, pi_alpha).recalibrate(indiv_test_data)
            for fitted_model, indiv_test_data in zip(fitted_models, indiv_test_datas)
        ]
        individual_is_covereds = []
        for test_coverage_dict, inf_dict in zip(indiv_test_inf_dicts, inference_dicts):
            print(inf_dict)
            test_coverage = test_coverage_dict["cov_given_accept"]["mean"]
            test_coverage_ci = get_normal_ci(
                test_coverage_dict["cov_given_accept"], args.ci_alpha)
            individual_ci = get_normal_ci(inf_dict["cov_given_accept"], args.ci_alpha)
            indiv_covered = individual_ci[0] <= test_coverage <= individual_ci[1]
            logging.info("indiv est %f ci %s",
                         inf_dict["cov_given_accept"]["mean"], individual_ci)
            logging.info("true indiv %f ci %s", test_coverage, test_coverage_ci)
            logging.info("indiv is covered? %s", indiv_covered)
            individual_is_covereds.append(indiv_covered)

        # Calculate the widths of the individual CIs for comparison
        individual_ci_diams = get_individual_ci_diams(inference_dicts, args.ci_alpha)

        # Evaluate whether the true coverage value is covered by the aggregate CI
        agg_cov_given_accept_dict = aggregator.calc_agg_cover_given_accept(args.ci_alpha)
        true_cov_given_accept_dict = aggregator.eval_cov_given_accept(
            test_data)["cov_given_accept"]
        true_cov_given_accept = true_cov_given_accept_dict["mean"]
        agg_ci = agg_cov_given_accept_dict["ci"]
        is_covered = agg_ci[0] < true_cov_given_accept < agg_ci[1]

        # Evaluate coverage when assuming independence across models
        indpt_aggregator = DecisionIntervalIndptAggregator(
            fitted_models, pi_alpha, inference_dicts)
        indpt_agg_cov_given_accept_dict = indpt_aggregator.calc_agg_cover_given_accept(
            args.ci_alpha)
        indpt_ci = indpt_agg_cov_given_accept_dict["ci"]
        indpt_is_covered = indpt_ci[0] < true_cov_given_accept < indpt_ci[1]

        coverage_agg_results[pi_alpha] = {
            "is_covered": {
                "agg": [is_covered],
                "independent": [indpt_is_covered],
                "individual": individual_is_covereds,
            },
            "ci_diams": {
                "agg": [agg_ci[1] - agg_ci[0]],
                "independent": [indpt_ci[1] - indpt_ci[0]],
                "individual": individual_ci_diams,
            },
            "true_cov": {
                "agg": [true_cov_given_accept],
                "independent": [true_cov_given_accept],
                "individual": [
                    test_inf_dict["cov_given_accept"]["mean"]
                    for test_inf_dict in indiv_test_inf_dicts
                ],
            },
        }

        # Evaluate local coverage
        local_coverages = assess_local_agg_coverage_true(
            aggregator, test_data, data_dict["data_gen"])
        for key, val in local_coverages.items():
            coverage_agg_results[pi_alpha][key] = val

        logging.info("PI alpha %f", pi_alpha)
        logging.info("estimated agg cover given accept %f %s",
                     agg_cov_given_accept_dict["mean"], agg_ci)
        logging.info("indept estimated agg cover given accept %f %s",
                     indpt_agg_cov_given_accept_dict["mean"], indpt_ci)
        logging.info("true cov given accept %f, se %f",
                     true_cov_given_accept, true_cov_given_accept_dict["se"])
        logging.info("is covered? %s", is_covered)
        logging.info("indept is covered? %s", indpt_is_covered)

    logging.info(coverage_agg_results)
    pickle_to_file(coverage_agg_results, args.out_file)
def main(args=sys.argv[1:]):
    args = parse_args(args)
    print(args)

    # Load the train and test data
    test_data = pickle_from_file(args.in_test_data)
    train_data_dict = pickle_from_file(args.in_train_data)
    train_data = train_data_dict["train"]

    # Columns are laid out as [3 meta features][features][missingness indicators].
    # Keep the meta features, keep features missing in <10% of observations,
    # and drop all the missingness-indicator columns.
    num_meta_feats = 3
    assert train_data.x.shape[1] % 2 == 1
    num_non_meta_feats = int((train_data.x.shape[1] - num_meta_feats) / 2)
    start_missing_idx = num_non_meta_feats + num_meta_feats
    print(start_missing_idx)
    is_missingness_acceptable = np.mean(
        train_data.x[:, start_missing_idx:], axis=0) < 0.1
    keep_cols = np.concatenate([
        np.array([True, True, True]),
        is_missingness_acceptable,
        np.zeros(num_non_meta_feats, dtype=bool),
    ])
    print(keep_cols)
    print(keep_cols.shape)
    train_data.x = train_data.x[:, keep_cols]
    test_data.x = test_data.x[:, keep_cols]

    orig_support_sim_settings = SupportSimSettingsComplex.create_from_dataset(
        train_data.x, inflation_factor=0)
    orig_cts_feature_idxs = orig_support_sim_settings.cts_feature_idxs
    orig_discrete_feature_idxs = orig_support_sim_settings.discrete_feature_idxs
    print(orig_cts_feature_idxs[:10])
    print(orig_discrete_feature_idxs[:10])

    # Optionally hold out a middle age range from the training data
    if args.holdout_age:
        age = train_data.x[:, 0]
        age_mask = ((age < args.holdout_min_age) + (age > args.holdout_max_age)).astype(bool)
        heldin_train_data = train_data.subset(age_mask)
        # Remove age from the continuous features
        orig_cts_feature_idxs = orig_cts_feature_idxs[1:]
    else:
        heldin_train_data = train_data
    print("max train age", np.max(heldin_train_data.x[:, 0]))

    # Project the continuous features onto their top principal components
    pca = PCA(n_components=args.num_pca, whiten=True)
    print("ORIG SHAPE", heldin_train_data.x.shape)
    heldin_train_data_x_cts = pca.fit_transform(
        heldin_train_data.x[:, orig_cts_feature_idxs])
    print(pca.explained_variance_ratio_)
    print("NUM DIS", orig_discrete_feature_idxs.size)
    test_data_x_cts = pca.transform(test_data.x[:, orig_cts_feature_idxs])
    heldin_train_data.x = np.hstack([
        heldin_train_data.x[:, orig_discrete_feature_idxs],
        heldin_train_data_x_cts,
    ])
    if args.holdout_age:
        test_data.x = np.hstack([
            test_data.x[:, 0:1],  # age feature
            test_data.x[:, orig_discrete_feature_idxs],
            test_data_x_cts,
        ])
    else:
        test_data.x = np.hstack(
            [test_data.x[:, orig_discrete_feature_idxs], test_data_x_cts])
    print('NEW TEST SHAPE', test_data.x.shape)
    print('NEW TRAIN SHAPE', heldin_train_data.x.shape)

    support_sim_settings = SupportSimSettingsComplex.create_from_dataset(
        heldin_train_data.x, inflation_factor=0)
    support_sim_settings._process_feature_ranges()
    print("dataset check", support_sim_settings.check_dataset(heldin_train_data))

    # Save the processed data
    train_data_dict["train"] = heldin_train_data
    train_data_dict["support_sim_settings"] = support_sim_settings
    heldin_train_data.num_p = heldin_train_data.x.shape[1]
    pickle_to_file(train_data_dict, args.out_train_data)
    test_data.num_p = test_data.x.shape[1]
    pickle_to_file(test_data, args.out_test_data)
    print("num obs", heldin_train_data.num_obs)
    print("num obs", train_data.num_obs)
    print("FINAL NUM FEATS", heldin_train_data.num_p)
    print("FINAL NUM FEATS", test_data.num_p)