def main(args=sys.argv[1:]):
    """Recalibrate a fitted model's decision intervals over a grid of alpha levels.

    Loads the recalibration split, runs DecisionIntervalRecalibrator once per
    alpha, logs the estimated coverage-given-accept, and pickles the per-alpha
    inference dicts to args.out_file.
    """
    args = parse_args(args)
    logging.basicConfig(
        format="%(message)s", filename=args.log_file, level=logging.DEBUG)
    print(args)
    logging.info(args)

    # Load the full data and pick out the recalibration subset.
    data_dict = pickle_from_file(args.data_file)
    split_dict = pickle_from_file(args.data_split_file)
    recalib_data = data_dict["train"].subset(split_dict["recalibrate_idxs"])

    # Load the previously fitted model.
    fitted_model = load_model(args.fitted_file)

    coverage_dict = {}
    for alpha in args.alphas:
        recalibrator = DecisionIntervalRecalibrator(fitted_model, alpha)
        inference_dict = recalibrator.recalibrate(recalib_data)
        print("RECALIB INF DICT", inference_dict["cov_given_accept"])
        est_cov_given_accept = inference_dict["cov_given_accept"]["mean"]
        logging.info(
            "Alpha %f, ideal cov %f, est cov|accept %f",
            alpha,
            1 - alpha,
            est_cov_given_accept)
        logging.info(get_normal_ci(inference_dict["cov_given_accept"]))
        coverage_dict[alpha] = inference_dict

    pickle_to_file(coverage_dict, args.out_file)
def main(args=sys.argv[1:]):
    """Plot the accepted/rejected region for a set of fitted models.

    The region plot is only produced when the covariates are two-dimensional.
    """
    args = parse_args(args)
    np.random.seed(args.seed)

    # Load the original data and its recalibration split.
    orig_data_dict = pickle_from_file(args.data_file)
    split_dict = pickle_from_file(args.data_split_file)
    recalib_data = orig_data_dict["train"].subset(
        split_dict["recalibrate_idxs"])
    args.num_p = recalib_data.x.shape[1]

    # Load every fitted model.
    fitted_dicts = [
        {"model": load_model(fitted_file)}
        for fitted_file in args.fitted_files
    ]
    print("fitted dicts", len(fitted_dicts))

    # Fresh draw from the data generator for plotting.
    new_data, _ = orig_data_dict["data_gen"].create_data(args.num_test)

    # Region plots only make sense for 2D covariates.
    if args.num_p == 2:
        plot_accepted_rejected_region(
            orig_data_dict,
            [d["model"] for d in fitted_dicts],
            args)
def main(args=sys.argv[1:]):
    """Recalibrate prediction intervals on the recalibration split.

    Dispatches on the fitted model's density parametric form and pickles
    the resulting coverage dict to args.out_file.

    Raises:
        ValueError: if the density parametric form is not one of the
            supported families (gaussian, bernoulli, multinomial*).
    """
    args = parse_args(args)
    logging.basicConfig(
        format="%(message)s", filename=args.log_file, level=logging.DEBUG)
    print(args)
    logging.info(args)
    # Read all data
    data_dict = pickle_from_file(args.data_file)
    # Get the appropriate datasplit
    split_dict = pickle_from_file(args.data_split_file)
    recalib_data = data_dict["train"].subset(split_dict["recalibrate_idxs"])
    # Load model
    fitted_model = load_model(args.fitted_file)
    family = fitted_model.density_parametric_form
    if family == "gaussian":
        coverage_dict = recalibrate_intervals_gaussian(
            fitted_model, recalib_data, args)
    elif family == "bernoulli":
        coverage_dict = recalibrate_intervals_bernoulli(
            fitted_model, recalib_data, args)
    elif "multinomial" in family:
        coverage_dict = recalibrate_intervals_multinomial(
            fitted_model, recalib_data, args)
    else:
        # Fix: report which family was unrecognized instead of the previous
        # uninformative "dunno what is going on" message.
        raise ValueError("Unsupported density parametric form: %s" % family)
    print(coverage_dict)
    pickle_to_file(coverage_dict, args.out_file)
def main(args=sys.argv[1:]):
    """Cross-validate and fit an EnsembleDensityNN, then pickle the result."""
    args = parse_args(args)
    logging.basicConfig(
        format="%(message)s", filename=args.log_file, level=logging.DEBUG)
    print(args)
    logging.info(args)
    scratch_dir = make_scratch_dir(args)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    # Load data and restrict to the training split.
    data_dict = pickle_from_file(args.data_file)
    split_dict = pickle_from_file(args.data_split_file)
    train_split_dataset = data_dict["train"].subset(split_dict["train_idxs"])

    # TODO: so hacky. so lazy -- the multinomial form string must embed
    # the number of classes.
    if args.density_parametric_form == "multinomial":
        print("num classes", train_split_dataset.num_classes)
        density_parametric_form = "multinomial%d" % train_split_dataset.num_classes
    else:
        density_parametric_form = args.density_parametric_form

    # Hyperparameter grid to tune over via cross-validation.
    param_grid = [{
        'density_layer_sizes': args.density_layer_sizes,
        'density_parametric_form': [density_parametric_form],
        'density_weight_param': args.density_weight_params,
        'dropout_rate': args.dropout_rate,
        'weight_penalty_type': [args.weight_penalty_type],
        'max_iters': [args.max_iters],
        'num_ensemble': [args.num_ensemble],
        'num_inits': [args.num_inits],
        'act_func': [args.act_func],
        'learning_rate': [args.learning_rate],
        'do_distributed': [args.do_distributed],
        'scratch_dir': [scratch_dir],
    }]

    # Fit via cross-validation.
    fitted_model, best_hyperparams, cv_results = do_cross_validation(
        train_split_dataset,
        nn_class=EnsembleDensityNN,
        param_grid=param_grid,
        cv=args.cv)
    logging.info("Best hyperparams %s", best_hyperparams)

    # Persist the per-network fitted parameters plus CV metadata.
    pickle_to_file({
        "nn_class": EnsembleDensityNN,
        "fitted_params": [nn.model_params for nn in fitted_model.nns],
        "hyperparams": best_hyperparams,
        "cv_results": cv_results,
    }, args.fitted_file)
def main(args=sys.argv[1:]):
    """Cross-validate and fit a SimultaneousIntervalDecisionNNs model."""
    args = parse_args(args)
    logging.basicConfig(
        format="%(message)s", filename=args.log_file, level=logging.DEBUG)
    print(args)
    logging.info(args)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    # Load data; sanity-check the training data lies within the support.
    data_dict = pickle_from_file(args.data_file)
    assert data_dict["support_sim_settings"].check_dataset(data_dict["train"])
    split_dict = pickle_from_file(args.data_split_file)
    train_split_dataset = data_dict["train"].subset(split_dict["train_idxs"])

    # Hyperparameter grid to tune over via cross-validation.
    param_grid = [{
        'interval_alpha': [args.interval_alpha],
        'decision_layer_sizes': args.decision_layer_sizes,
        'interval_layer_sizes': args.interval_layer_sizes,
        'decision_weight_param': args.decision_weight_params,
        'interval_weight_param': args.interval_weight_params,
        'weight_penalty_type': [args.weight_penalty_type],
        'cost_decline': [args.cost_decline],
        'do_no_harm_param': args.do_no_harm_params,
        'log_barrier_param': args.log_barrier_params,
        'max_iters': [args.max_iters],
        'num_inits': [args.num_inits],
        'act_func': [args.act_func],
        'learning_rate': [args.learning_rate],
        'support_sim_settings': [data_dict["support_sim_settings"]],
        'support_sim_num': [args.support_sim_num],
    }]

    # Fit via cross-validation.
    fitted_model, best_hyperparams, cv_results = do_cross_validation(
        train_split_dataset,
        nn_class=SimultaneousIntervalDecisionNNs,
        param_grid=param_grid,
        cv=args.cv)
    logging.info("Best hyperparams %s", best_hyperparams)

    # Persist fitted parameters plus CV metadata.
    pickle_to_file(
        {
            "nn_class": SimultaneousIntervalDecisionNNs,
            "fitted_params": fitted_model.model_params,
            "hyperparams": best_hyperparams,
            "cv_results": cv_results,
        }, args.fitted_file)
def main(args=sys.argv[1:]):
    """Diagnostic plots for one fitted model: accept/reject region, density
    quality, and a recalibration-coverage check on fresh test data."""
    args = parse_args(args)
    data_dict = pickle_from_file(args.data_file)
    test_data, _ = data_dict["data_gen"].create_data(args.num_test, args.seed)
    args.num_p = test_data.x.shape[1]

    fitted_model = load_model(args.fitted_file)

    # Look at the region we accepted.
    plot_accepted_rejected_region(data_dict, fitted_model, args)

    # Look at how good the density estimates are in the accept vs reject region.
    plot_densities(test_data, fitted_model, args)

    # Verify the recalibrated intervals actually cover on fresh data.
    recalibrated_dict = pickle_from_file(args.recalibrated_file)
    check_recalibration_covered(fitted_model, recalibrated_dict, test_data)
def main(args=sys.argv[1:]):
    """Plot coverage results as a function of the training-set size."""
    args = parse_args(args)
    np.random.seed(args.seed)
    # Pair each pickled coverage result with its training size.
    coverage_results = [
        (num_train, pickle_from_file(coverage_file))
        for coverage_file, num_train in zip(args.coverage_files, args.num_trains)
    ]
    plot_coverage_vs_num_train(coverage_results, args)
def main(args=sys.argv[1:]):
    """Aggregate approval histories across replicates and plot loss and
    human-use curves with a consistent policy ordering."""
    args = parse_args(args)
    print(args)
    logging.basicConfig(
        format="%(message)s", filename=args.log_file, level=logging.DEBUG)
    np.random.seed(args.seed)
    # Load policy histories, silently skipping replicates whose file is missing.
    all_approval_histories = [
        pickle_from_file(history_file)
        for history_file in args.history_files
        if os.path.exists(history_file)
    ]
    # Assumes every replicate has the same policy keys as the first one.
    approval_history_dict = {x: [] for x in all_approval_histories[0].keys()}
    approval_history_keys = list(approval_history_dict.keys())
    human_max_losses = []
    for curr_approval_hist in all_approval_histories:
        for k in approval_history_dict.keys():
            approval_history_dict[k].append(curr_approval_hist[k])
            # NOTE(review): human_max_loss is collected once per policy key,
            # not once per replicate; the mean below is only meaningful if
            # all policies within a replicate share the same human_max_loss
            # -- confirm.
            human_max_losses.append(curr_approval_hist[k].human_max_loss)
    human_max_loss = np.mean(human_max_losses)
    print("HUMAN MAX", human_max_loss)
    # Sort keys: Learning-to-Approve policies first (original order), then the
    # remaining policies alphabetically, with "Fixed" forced to the end.
    sorted_keys = sorted([
        k for k in approval_history_keys
        if not k.startswith("Learning-to-Approve")
    ])
    ordered_approval_history_keys = [
        k for k in approval_history_keys
        if k.startswith("Learning-to-Approve")
    ] + [k for k in sorted_keys if k != "Fixed"]
    if "Fixed" in sorted_keys:
        ordered_approval_history_keys.append("Fixed")
    sns.set_context("paper", font_scale=2)
    plot_losses(
        approval_history_dict,
        args.loss_plot,
        alpha=human_max_loss * args.scale_loss,
        scale_loss=args.scale_loss,
        ymin=args.y_min,
        ymax=args.y_max,
        plot_mean=args.plot_mean,
        key_order=ordered_approval_history_keys,
        x_start=args.x_start,
        x_skip=args.x_skip,
    )
    plot_human_uses(
        approval_history_dict,
        args.human_plot,
        plot_mean=args.plot_mean,
        key_order=ordered_approval_history_keys,
        x_start=args.x_start,
        x_skip=args.x_skip,
    )
def main(args=sys.argv[1:]):
    """Plot the accepted/rejected region for several fitted models (2D only)."""
    args = parse_args(args)
    np.random.seed(args.seed)

    # Load data plus the recalibration split.
    orig_data_dict = pickle_from_file(args.data_file)
    split_dict = pickle_from_file(args.data_split_file)
    recalib_data = orig_data_dict["train"].subset(
        split_dict["recalibrate_idxs"])
    args.num_p = recalib_data.x.shape[1]

    # Load each fitted model from disk.
    fitted_models = [load_model(path) for path in args.fitted_files]

    # The region plot only makes sense for two-dimensional covariates.
    if args.num_p == 2:
        plot_accepted_rejected_region(orig_data_dict, fitted_models, args)
def main(args=sys.argv[1:]):
    """Concatenate prefetched model predictions/targets across years and
    year-splits into one aggregate and pickle it."""
    args = parse_args(args)
    aggregated = AggModelPredsAndTargets()
    for year in range(args.start_year, args.start_year + args.num_years):
        for split_idx in range(args.start_num_year_splits,
                               args.end_num_year_splits):
            # Path template is filled with (year, split index).
            prefetch_file = args.path_template % (year, split_idx)
            aggregated.append(pickle_from_file(prefetch_file))
    pickle_to_file(aggregated, args.out_file)
def main(args=sys.argv[1:]):
    """Merge individual approval histories into one dict keyed by policy name."""
    args = parse_args(args)
    print(args)
    np.random.seed(args.seed)
    histories = [pickle_from_file(path) for path in args.history_files]
    for hist in histories:
        print(hist)
    # Later history files win if two share a policy name.
    pickle_to_file({h.policy_name: h for h in histories}, args.out_file)
def main(args=sys.argv[1:]):
    """Prefetch model predictions and targets for every batch of a trial.

    Walks the nature forward one step at a time, recording the fixed model's
    predictions and the targets of each newly revealed batch, then pickles
    the accumulated ModelPredsAndTargets.
    """
    args = parse_args(args)
    nature = pickle_from_file(args.nature_file)
    # Placeholder history passed to nature.next(); FixedProposer never adapts.
    approval_hist = ApprovalHistory(human_max_loss=1, policy_name="Placeholder")
    model = pickle_from_file(args.model_file)
    proposer = FixedProposer([model])
    # begin simulation: introduce the singleton model
    proposer.propose_model(None, None)
    model_pred_targets = ModelPredsAndTargets()
    for t in range(nature.total_time - 1):
        # Fix: corrected "prefetcthing" typo in the progress message.
        print("prefetching time", t)
        sub_trial_data = nature.get_trial_data(t + 1)
        obs_batch_data = sub_trial_data.batch_data[-1]
        batch_preds, batch_target = proposer.get_model_preds_and_target(
            obs_batch_data)
        model_pred_targets.append(batch_preds, batch_target)
        nature.next(approval_hist)
    pickle_to_file(model_pred_targets, args.out_file)
def main(args=sys.argv[1:]):
    """Split the training groups into a train set and a recalibration set.

    Either holds out a fixed number of groups (args.recalibrate_num) or the
    args.fold_idx-th of args.k_folds group folds. One random observation per
    recalibration group is kept. Pickles the index sets to args.out_file.

    Fix: parse_args() was previously called without the ``args`` parameter,
    silently ignoring any argument list passed to main().
    """
    args = parse_args(args)
    print(args)
    np.random.seed(args.seed)
    # Read data
    data_dict = pickle_from_file(args.in_data_file)
    full_data = data_dict["train"]
    unique_groups = np.unique(full_data.group_id)
    shuffled_order = np.random.permutation(unique_groups)
    if args.recalibrate_num is not None:
        # Hold out the last recalibrate_num shuffled groups.
        num_recalibrate = args.recalibrate_num
        train_groups = shuffled_order[:-num_recalibrate]
        recalibrate_groups = shuffled_order[-num_recalibrate:]
    else:
        # K-fold style: the fold_idx-th slice of groups is held out.
        fold_size = int(unique_groups.size / args.k_folds) + 1
        start_idx = args.fold_idx * fold_size
        end_idx = min((args.fold_idx + 1) * fold_size, unique_groups.size)
        print("number in recalibrated groups", end_idx - start_idx)
        train_groups = np.concatenate(
            [shuffled_order[:start_idx], shuffled_order[end_idx:]])
        recalibrate_groups = shuffled_order[start_idx:end_idx]
    train_idxs = np.isin(full_data.group_id, train_groups).flatten()
    assert train_idxs.size > 1
    # For recalibration, we only grab one random obs per group.
    recalibrate_idxs = []
    for recalib_group_id in recalibrate_groups:
        matching_obs_idxs = np.where(full_data.group_id == recalib_group_id)[0]
        random_matching_obs_idx = np.random.choice(matching_obs_idxs)
        recalibrate_idxs.append(random_matching_obs_idx)
    recalibrate_idxs = np.array(recalibrate_idxs)
    assert recalibrate_idxs.size > 1
    # Double check we grabbed a single random obs per group
    assert np.unique(
        full_data.group_id[recalibrate_idxs]).size == recalibrate_idxs.size
    # Write data to file
    print("num train", train_idxs.size)
    pickle_to_file(
        {
            "train_idxs": train_idxs,
            "recalibrate_idxs": recalibrate_idxs,
            "support_sim_settings": data_dict["support_sim_settings"],
        }, args.out_file)
def main(args=sys.argv[1:]):
    """Summarize simulation results across approvers and export a LaTeX table.

    Globs each result-file template, concatenates the per-run summaries,
    computes mean bad-approval/bad-std rates, and writes the per-approver
    worst case to args.out_csv as LaTeX.
    """
    args = parse_args(args)
    results = []
    for res_file_template, res_name in zip(args.result_files, args.result_names):
        for res in glob.glob(res_file_template):
            res_df = pickle_from_file(res)["summary"]
            res_df["approver"] = res_name
            results.append(res_df)
    results = pd.concat(results)
    mean_res = results.groupby(["approver", "index"]).mean()
    # Bad-approval / bad-std rates, damped by denom_min so tiny window
    # counts cannot blow up the ratio.
    mean_res["meBAR"] = mean_res["num_bad_approval"] / (
        args.denom_min + mean_res["num_std_window"])
    mean_res["meBSR"] = mean_res["num_bad_std"] / (
        args.denom_min + mean_res["num_std_window"])
    # Fix: the per-approver max was computed twice; hoist it once.
    worst_per_approver = mean_res.groupby(["approver"]).max()
    print(worst_per_approver)
    worst_per_approver.to_latex(args.out_csv, float_format="%.3f")
def main(args=sys.argv[1:]):
    """Build and pickle a FixedProposer over quarterly model files.

    Only the first model is unpickled eagerly; subsequent entries are kept
    as file paths (presumably loaded on demand by FixedProposer -- confirm).
    """
    args = parse_args(args)
    models = []
    for year in range(args.start_year, args.start_year + args.num_years):
        for quarter in range(4):
            model_file = args.path_template % (year, quarter)
            print("model", model_file)
            assert os.path.exists(model_file)
            if models:
                models.append(model_file)
            else:
                models.append(pickle_from_file(model_file))
    proposer = FixedProposer(models)
    print("pickling...")
    pickle_to_file(proposer, args.out_file)
def main(args=sys.argv[1:]):
    """Plot the loss and human-usage curves for a single approval history."""
    args = parse_args(args)
    print(args)
    np.random.seed(args.seed)
    approval_history = pickle_from_file(args.history_file)
    print(approval_history)

    mean_loss = np.mean(approval_history.expected_policy_loss_history)
    mean_human_use = np.mean(approval_history.human_history)
    title = "%s: loss %.3f, human %.2f" % (
        args.policy_name, mean_loss, mean_human_use)

    plot_loss(
        np.array(approval_history.expected_policy_loss_history),
        args.loss_plot,
        title=title,
        alpha=approval_history.human_max_loss,
        ymin=0,
        # Cap the y-axis at 5x the human baseline or the user-supplied max.
        ymax=min(approval_history.human_max_loss * 5, args.y_max),
    )
    plot_human_use(
        np.array(approval_history.human_history),
        args.human_plot,
        title=title)
def load_data(args):
    """Return the (data, meta) pair stored in the pickle at args.data_file."""
    loaded = pickle_from_file(args.data_file)
    return loaded["data"], loaded["meta"]
def main(args=sys.argv[1:]):
    """Cross-validate and fit an ensemble of simultaneous density/decision
    neural nets, pickle the fitted parameters, and smoke-test the result."""
    args = parse_args(args)
    logging.basicConfig(
        format="%(message)s", filename=args.log_file, level=logging.DEBUG)
    print(args)
    logging.info(args)
    nn_class = EnsembleSimultaneousDensityDecisionNNs
    scratch_dir = make_scratch_dir(args)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)
    # Read data
    data_dict = pickle_from_file(args.data_file)
    #assert data_dict["support_sim_settings"].check_dataset(data_dict["train"])
    # Get the appropriate datasplit
    split_dict = pickle_from_file(args.data_split_file)
    print(split_dict["train_idxs"])
    print(data_dict["train"].x.shape)
    print(data_dict["train"].y.shape)
    print(split_dict["train_idxs"].shape)
    train_split_dataset = data_dict["train"].subset(split_dict["train_idxs"])
    print(train_split_dataset.y.shape)
    # The multinomial form string must embed the number of classes.
    if args.density_parametric_form == "multinomial":
        print("num classes", train_split_dataset.num_classes)
        density_parametric_form = "multinomial%d" % train_split_dataset.num_classes
    else:
        density_parametric_form = args.density_parametric_form
    # Optionally replace the support settings with an empirical estimate,
    # taken either from the training split itself or from a precomputed file.
    if args.use_train_data_support:
        support_data = train_split_dataset
        old_support_settings = data_dict["support_sim_settings"]
        data_dict["support_sim_settings"] = SupportSimSettingsEmpirical(
            support_data.x,
            scale=args.support_empirical_scale,
            min_x=old_support_settings.min_x,
            max_x=old_support_settings.max_x)
    elif args.empirical_support_file:
        empirical_support = pickle_from_file(args.empirical_support_file)
        old_support_settings = data_dict["support_sim_settings"]
        data_dict["support_sim_settings"] = SupportSimSettingsEmpirical(
            empirical_support,
            scale=args.support_empirical_scale,
            min_x=old_support_settings.min_x,
            max_x=old_support_settings.max_x)
    # Setup the parameters we will tune over
    param_grid = [{
        'density_layer_sizes': args.density_layer_sizes,
        'decision_layer_sizes': args.decision_layer_sizes,
        'dropout_rate': args.dropout_rate,
        'density_parametric_form': [density_parametric_form],
        'density_weight_param': args.density_weight_params,
        'decision_weight_param': args.decision_weight_params,
        'weight_penalty_type': [args.weight_penalty_type],
        'cost_decline': [args.cost_decline],
        'do_no_harm_param': args.do_no_harm_params,
        'log_barrier_param': args.log_barrier_params,
        'max_iters': [args.max_iters],
        'num_inits': [args.num_inits],
        'num_ensemble': [args.num_ensemble],
        'do_distributed': [args.do_distributed],
        'scratch_dir': [scratch_dir],
        'act_func': [args.act_func],
        'learning_rate': [args.learning_rate],
        'support_sim_settings': [data_dict["support_sim_settings"]],
        'support_sim_num': [args.support_sim_num],
    }]
    # Fit model
    fitted_model, best_hyperparams, cv_results = do_cross_validation(
        train_split_dataset,
        nn_class=nn_class,
        param_grid=param_grid,
        cv=args.cv)
    logging.info("Best hyperparams %s", best_hyperparams)
    # Save model
    pickle_to_file({
        "nn_class": nn_class,
        "fitted_params": [m.model_params for m in fitted_model.nns],
        "hyperparams": best_hyperparams,
        "cv_results": cv_results,
    }, args.fitted_file)
    # DOUBLE CHECKING THINGS WORK: exercise the fitted model on a few rows.
    #pickle_from_file(args.fitted_file)
    fitted_model.get_accept_prob(train_split_dataset.x[:10,:])
    fitted_model.get_prediction_interval(train_split_dataset.x[:10,:])
def main(args=sys.argv[1:]):
    """Evaluate aggregated coverage confidence intervals -- the ensemble
    aggregator, an independence-assumption aggregator, and each individual
    model's CI -- against the true coverage measured on fresh test data,
    then pickle the per-alpha results."""
    args = parse_args(args)
    np.random.seed(args.seed)
    logging.basicConfig(
        format="%(message)s", filename=args.log_file, level=logging.DEBUG)
    logging.info(args)
    data_dict = pickle_from_file(args.data_file)
    test_data, _ = data_dict["data_gen"].create_data(args.num_test)
    # Load every fitted model and group the recalibration inference dicts
    # by prediction-interval alpha level.
    fitted_models = []
    agg_dict = {}
    for fitted_file, coverage_file in zip(args.fitted_files,
                                          args.coverage_files):
        fitted_model = load_model(fitted_file)
        fitted_models.append(fitted_model)
        coverage_dict = pickle_from_file(coverage_file)
        for pi_alpha, inference_dict in coverage_dict.items():
            if pi_alpha not in agg_dict:
                agg_dict[pi_alpha] = []
            agg_dict[pi_alpha].append(inference_dict)
    # NOTE(review): unif_test_data is built but never used below -- confirm
    # whether it was meant to feed one of the evaluations.
    unif_x = data_dict["support_sim_settings"].support_unif_rvs(args.num_test)
    unif_test_data = data_dict["data_gen"].create_data_given_x(unif_x)
    coverage_agg_results = {}
    for pi_alpha, inference_dicts in agg_dict.items():
        aggregator = DecisionIntervalAggregator(
            fitted_models, pi_alpha, inference_dicts)
        # Fresh test draws, one per model, for checking each individual CI.
        indiv_test_datas = [
            data_dict["data_gen"].create_data(args.num_test)[0]
            for _ in fitted_models
        ]
        indiv_test_inf_dicts = [
            DecisionIntervalRecalibrator(
                fitted_model, pi_alpha).recalibrate(indiv_test_data)
            for fitted_model, indiv_test_data in zip(fitted_models,
                                                     indiv_test_datas)
        ]
        # Does each model's recalibration CI cover its own true coverage?
        individual_is_covereds = []
        for test_coverage_dict, inf_dict in zip(indiv_test_inf_dicts,
                                                inference_dicts):
            print(inf_dict)
            test_coverage = test_coverage_dict["cov_given_accept"]["mean"]
            test_coverage_ci = get_normal_ci(
                test_coverage_dict["cov_given_accept"], args.ci_alpha)
            individual_ci = get_normal_ci(
                inf_dict["cov_given_accept"], args.ci_alpha)
            indiv_covered = individual_ci[
                0] <= test_coverage and test_coverage <= individual_ci[1]
            logging.info("indiv est %f ci %s",
                         inf_dict["cov_given_accept"]["mean"], individual_ci)
            logging.info("true indiv %f ci %s", test_coverage,
                         test_coverage_ci)
            logging.info("indiv is covered? %s", indiv_covered)
            individual_is_covereds.append(indiv_covered)
        # Calculate the width of the individual CI diams for comparison
        individual_ci_diams = get_individual_ci_diams(
            inference_dicts, args.ci_alpha)
        # Evaluate if the true coverage value is covered by the aggregate CI.
        agg_cov_given_accept_dict = aggregator.calc_agg_cover_given_accept(
            args.ci_alpha)
        true_cov_given_accept_dict = aggregator.eval_cov_given_accept(
            test_data)["cov_given_accept"]
        true_cov_given_accept = true_cov_given_accept_dict["mean"]
        agg_ci = agg_cov_given_accept_dict["ci"]
        is_covered = true_cov_given_accept > agg_ci[
            0] and true_cov_given_accept < agg_ci[1]
        # Evaluate coverage if using independence assumption
        indpt_aggregator = DecisionIntervalIndptAggregator(
            fitted_models, pi_alpha, inference_dicts)
        indpt_agg_cov_given_accept_dict = indpt_aggregator.calc_agg_cover_given_accept(
            args.ci_alpha)
        indpt_ci = indpt_agg_cov_given_accept_dict["ci"]
        indpt_is_covered = true_cov_given_accept > indpt_ci[
            0] and true_cov_given_accept < indpt_ci[1]
        # Collect covered-flags, CI widths, and true coverages for each method.
        coverage_agg_results[pi_alpha] = {
            "is_covered": {
                "agg": [is_covered],
                "independent": [indpt_is_covered],
                "individual": individual_is_covereds
            },
            "ci_diams": {
                "agg": [agg_ci[1] - agg_ci[0]],
                "independent": [indpt_ci[1] - indpt_ci[0]],
                "individual": individual_ci_diams
            },
            "true_cov": {
                "agg": [true_cov_given_accept],
                "independent": [true_cov_given_accept],
                "individual": [
                    test_inf_dict["cov_given_accept"]["mean"]
                    for test_inf_dict in indiv_test_inf_dicts
                ]
            }
        }
        # Evaluate local coverage
        local_coverages = assess_local_agg_coverage_true(
            aggregator, test_data, data_dict["data_gen"])
        for key, val in local_coverages.items():
            coverage_agg_results[pi_alpha][key] = val
        logging.info("PI alpha %f", pi_alpha)
        logging.info("estimated agg cover given accept %f %s",
                     agg_cov_given_accept_dict["mean"], agg_ci)
        logging.info("indepttt estimated agg cover given accept %f %s",
                     indpt_agg_cov_given_accept_dict["mean"], indpt_ci)
        logging.info("true cov given accept %f, se %f", true_cov_given_accept,
                     true_cov_given_accept_dict["se"])
        logging.info("is covered? %s", is_covered)
        logging.info("indept is covered? %s", indpt_is_covered)
    logging.info(coverage_agg_results)
    pickle_to_file(coverage_agg_results, args.out_file)
def main(args=sys.argv[1:]):
    """Preprocess train/test data: drop high-missingness feature columns,
    optionally hold out an age band from training, PCA-whiten the continuous
    features, rebuild the support settings, and pickle the results."""
    args = parse_args(args)
    print(args)
    # Save things
    test_data = pickle_from_file(args.in_test_data)
    train_data_dict = pickle_from_file(args.in_train_data)
    train_data = train_data_dict["train"]
    # Assumed column layout: 3 meta features, then the non-meta features,
    # then one missingness-indicator column per non-meta feature -- the
    # odd-width assert below is consistent with this, but TODO confirm.
    num_meta_feats = 3
    assert train_data.x.shape[1] % 2 == 1
    num_non_meta_feats = int((train_data.x.shape[1] - num_meta_feats) / 2)
    start_missing_idx = num_non_meta_feats + num_meta_feats
    print(start_missing_idx)
    # Keep only features whose missingness rate is under 10%; the indicator
    # columns themselves are always dropped.
    is_missingness_acceptable = np.mean(train_data.x[:, start_missing_idx:],
                                        axis=0) < 0.1
    keep_cols = np.concatenate([
        np.array([True, True, True]),
        is_missingness_acceptable,
        np.zeros(num_non_meta_feats, dtype=bool)
    ])
    print(keep_cols)
    print(keep_cols.shape)
    train_data.x = train_data.x[:, keep_cols]
    test_data.x = test_data.x[:, keep_cols]
    # Classify remaining columns into continuous vs discrete features.
    orig_support_sim_settings = SupportSimSettingsComplex.create_from_dataset(
        train_data.x, inflation_factor=0)
    orig_cts_feature_idxs = orig_support_sim_settings.cts_feature_idxs
    orig_discrete_feature_idxs = orig_support_sim_settings.discrete_feature_idxs
    print(orig_cts_feature_idxs[:10])
    print(orig_discrete_feature_idxs[:10])
    if args.holdout_age:
        # Train only on ages outside [holdout_min_age, holdout_max_age];
        # `+` on the boolean masks acts as a logical OR here.
        age = train_data.x[:, 0]
        age_mask = ((age < args.holdout_min_age) +
                    (age > args.holdout_max_age)).astype(bool)
        heldin_train_data = train_data.subset(age_mask)
        # REMOVE AGE FROM CTS FEATURES
        orig_cts_feature_idxs = orig_cts_feature_idxs[1:]
    else:
        heldin_train_data = train_data
        # NOTE(review): offset_idx is never read below -- confirm it is dead.
        offset_idx = 0
    print("max train age", np.max(heldin_train_data.x[:, 0]))
    # Fit the PCA on the held-in training data only, then apply it to test.
    pca = PCA(n_components=args.num_pca, whiten=True)
    print("ORIG SHAPE", heldin_train_data.x.shape)
    heldin_train_data_x_cts = pca.fit_transform(
        heldin_train_data.x[:, orig_cts_feature_idxs])
    print(pca.explained_variance_ratio_)
    print("NUM DIS", orig_discrete_feature_idxs.size)
    test_data_x_cts = pca.transform(test_data.x[:, orig_cts_feature_idxs])
    heldin_train_data.x = np.hstack([
        heldin_train_data.x[:, orig_discrete_feature_idxs],
        heldin_train_data_x_cts
    ])
    if args.holdout_age:
        # Keep the raw age column in the test matrix so the held-out band
        # can still be identified downstream.
        test_data.x = np.hstack([
            test_data.x[:, 0:1],  # age feature
            test_data.x[:, orig_discrete_feature_idxs],
            test_data_x_cts
        ])
    else:
        test_data.x = np.hstack(
            [test_data.x[:, orig_discrete_feature_idxs], test_data_x_cts])
    print('NEW TEST SHAPE', test_data.x.shape)
    print('NEW TRAIN SHAPE', heldin_train_data.x.shape)
    # Rebuild support settings on the transformed training features.
    support_sim_settings = SupportSimSettingsComplex.create_from_dataset(
        heldin_train_data.x, inflation_factor=0)
    support_sim_settings._process_feature_ranges()
    print("dataset check",
          support_sim_settings.check_dataset(heldin_train_data))
    train_data_dict["train"] = heldin_train_data
    train_data_dict["support_sim_settings"] = support_sim_settings
    heldin_train_data.num_p = heldin_train_data.x.shape[1]
    pickle_to_file(train_data_dict, args.out_train_data)
    test_data.num_p = test_data.x.shape[1]
    pickle_to_file(test_data, args.out_test_data)
    print("num obs", heldin_train_data.num_obs)
    print("num obs", train_data.num_obs)
    print("FINAL NUM FEATS", heldin_train_data.num_p)
    print("FINAL NUM FEATS", test_data.num_p)
def main(args=sys.argv[1:]):
    """Run one approval-policy simulation against pickled nature and proposer
    objects, checkpointing and finally pickling the approval history."""
    args = parse_args(args)
    logging.basicConfig(
        format="%(message)s", filename=args.log_file, level=logging.DEBUG
    )
    print(args)
    logging.info(args)
    np.random.seed(args.seed)
    nature = pickle_from_file(args.nature_file)
    logging.info("BATCH SIZES %s", nature.batch_sizes)
    proposer = pickle_from_file(args.proposer_file)
    nature.next(None)
    # NOTE(review): the returned model is never used below; presumably
    # propose_model is called for its side effect of registering the first
    # model with the proposer -- confirm.
    model = proposer.propose_model(nature.get_trial_data(0), None)
    # Default the human baseline loss to the initial model's mean test score.
    if args.human_max_loss is None:
        args.human_max_loss = np.mean(
            proposer.score_models(
                nature.create_test_data(time_t=0, num_obs=args.num_test_obs)
            )[0]
        )
    logging.info("HUMAN MAX %f", args.human_max_loss)
    nature.next(None)
    print("POLICY")
    policy = create_policy(
        args.policy_name,
        args,
        human_max_loss=args.human_max_loss,
        drift=args.human_max_loss * args.drift_scale,
        total_time=nature.total_time,
        num_experts=nature.total_time,
        batch_size=np.mean(nature.batch_sizes[1:]),
    )
    st_time = time.time()
    # Use the prefetched-predictions variant when a prefetch file is supplied.
    if args.prefetched_file is None:
        sim = Simulation(
            nature,
            proposer,
            policy,
            args.human_max_loss,
            num_test_obs=args.num_test_obs,
            holdout_last_batch=args.holdout_last_batch,
        )
    else:
        prefetched = pickle_from_file(args.prefetched_file)
        sim = SimulationPrefetched(
            nature,
            proposer,
            prefetched,
            policy,
            args.human_max_loss,
            num_test_obs=args.num_test_obs,
            holdout_last_batch=args.holdout_last_batch,
        )
    # The callback checkpoints the approval history to out_file as the
    # simulation runs; the final history is pickled again afterwards.
    sim.run(lambda approval_hist: pickle_to_file(approval_hist, args.out_file))
    logging.info(sim.approval_hist)
    print(sim.approval_hist)
    logging.info("run time %d", time.time() - st_time)
    pickle_to_file(sim.approval_hist, args.out_file)
    if args.out_nature_file is not None:
        pickle_to_file(nature.to_fixed(), args.out_nature_file)