def get_true_model_scores(seed, data_generator: DataGenerator, trial_meta: TrialMetaData, num_test_obs: int, approved_models: List):
    """Evaluate every approved model on simulated test data for each batch.

    @param seed: int seed for numpy's RNG so evaluation data is reproducible
    @param data_generator: DataGenerator used to simulate test observations
    @param trial_meta: TrialMetaData providing num_batches and score_func
    @param num_test_obs: number of test observations per batch
    @param approved_models: models to score; assumed non-empty
    @return Dict: key = endpoint, value = (num_models, num_batches) matrix
        of true model scores over time
    """
    print("Evaluating...")
    np.random.seed(seed)

    # If the support distribution is constant over time, a single (larger)
    # test set can be reused for every batch; otherwise draw fresh data
    # for each batch inside the loop below.
    is_const_dist = "constant" in data_generator.support_sim_settings.min_func_name
    if is_const_dist:
        batch_data = data_generator.create_data(num_test_obs * 2, 0)

    # Accumulate one column (list of per-model scores) per batch, then stack
    # once at the end. The original concatenated inside the loop, which is
    # O(num_batches^2) in copying cost.
    scores_over_time = {}
    for batch_index in range(trial_meta.num_batches):
        if not is_const_dist:
            batch_data = data_generator.create_data(num_test_obs, batch_index)
        true_model_scores = [
            score_model_batch(model, batch_data, batch_index, trial_meta.score_func)
            for model in approved_models]
        # All models are assumed to report the same set of endpoints.
        for k in true_model_scores[0].keys():
            scores_over_time.setdefault(k, []).append(
                [score_dict[k] for score_dict in true_model_scores])

    # Each accumulated value is (num_batches, num_models); transpose so rows
    # are models and columns are batches, matching the documented layout.
    true_model_score_dict = {
        k: np.array(columns).T for k, columns in scores_over_time.items()}
    print("Done evaluating")
    return true_model_score_dict
def main(args=sys.argv[1:]):
    """Simulate a train/test dataset and pickle both splits to the output file."""
    args = parse_args()
    print(args)

    data_gen = DataGenerator(
        args.num_p,
        getattr(data_gen_funcs, args.func_name),
        data_gen_funcs.CLASSIFICATION_DICT[args.func_name],
        snr=args.snr)

    # Draw the two splits (train first, then test) so the RNG consumption
    # order matches across runs, then bundle them for serialization.
    datasets = {
        "train": data_gen.create_data(args.n_train),
        "test": data_gen.create_data(args.n_test),
    }

    print("data_file %s" % args.out_file)
    with open(args.out_file, "wb") as f:
        pickle.dump(datasets, f)
def main(args=sys.argv[1:]):
    """Generate simulated training data and pickle it with its generator."""
    args = parse_args()
    print(args)
    np.random.seed(args.seed)

    data_gen = DataGenerator(
        sim_func_form=args.sim_func_form,
        sim_func_name=args.sim_func,
        num_p=args.num_p,
        num_classes=args.num_classes,
        noise_sd=args.sim_noise_sd,
        std_dev_x=args.std_dev_x,
        max_x=args.max_x,
    )
    train_data, support_sim_settings = data_gen.create_data(args.num_train)

    # Bundle everything downstream scripts need to reconstruct the simulation.
    output = {
        "train": train_data,
        "support_sim_settings": support_sim_settings,
        "data_gen": data_gen,
    }
    pickle_to_file(output, args.out_data_file)
def main(args=sys.argv[1:]):
    """Fit neural nets (with per-group refits) and estimate variable importance.

    Either simulates a dataset (when args.sim_func is given) or reads one from
    disk, runs `calculate_var_imports_refits` over the hyperparameter grid,
    pickles the fitted models and importance estimates, and logs summaries.
    """
    args = parse_args()
    logging.basicConfig(format="%(message)s", filename=args.log_file, level=logging.DEBUG)
    print(args)
    logging.info(args)
    # Seed numpy (data generation) and tensorflow (model fitting) together.
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    if args.sim_func is not None:
        # Create data and persist it for later inspection/reuse.
        data_gen = DataGenerator(
            func_name=args.sim_func,
            n_train=args.num_train,
            n_test=args.num_test,
            num_p=args.num_p,
            noise_sd=args.sim_noise_sd,
        )
        data = data_gen.create_data()
        pickle_to_file(data, args.data_file)
    else:
        # Read a previously generated dataset.
        data = pickle_from_file(args.input_data)

    # Hyperparameter grid; single-element lists pin the non-tuned settings.
    param_grid = [{
        'layer_sizes': args.layer_sizes,
        'ridge_param': args.ridge_params,
        'max_iters': [args.max_iters],
        'num_inits': [args.num_inits],
        'act_func': [args.act_func],
        'output_act_func': [args.output_act_func],
    }]
    # Fit neural network and calculate variable importance
    var_imports, fitted_models = calculate_var_imports_refits(
        data,
        param_grid=param_grid,
        cond_layer_sizes=args.cond_layer_sizes_separate,
        var_import_idxs=args.var_import_idx)

    # Save model
    pickle_to_file(fitted_models, args.model_file)
    # Store var import results
    pickle_to_file(var_imports, args.var_import_file)

    # Log the standardized and unstandardized estimates for each group.
    # (enumerate replaces the original `range(len(var_imports))` index loop.)
    for i, var_import in enumerate(var_imports):
        v = var_import["std-True"]
        if i == 0:
            # Full-model fit statistics are shared across groups; log once.
            logging.info("full final r2 %f (1 is best)", v["r2.full"])
            logging.info("full final r2 test %f", v["r2.test.full"])
        logging.info("small final r2 %d : %f", i, v["r2.small"])
        logging.info("small final r2 test %d : %f", i, v["r2.test.small"])
        logging.info("one step est std=True %d : %f, %s", i, v["onestep"], v["onestep.ci"])
        v_not_std = var_import["std-False"]
        logging.info("one step est std=False %d : %f, %s", i, v_not_std["onestep"], v_not_std["onestep.ci"])
def main(args=sys.argv[1:]):
    """Fit a single neural net (no per-group refit) and estimate variable importance.

    Simulates data (if args.sim_func is given) or reads it from disk, fits one
    model via `calculate_var_imports_no_refit` with a CV grid search, pickles
    the fitted model and the importance estimates, and logs per-group summaries.

    NOTE(review): `coverage` is initialized below but never appended to anywhere
    in this function, so when args.sim_func_reduced is set the final summary
    computes np.mean([]) -> nan. Confirm whether coverage collection was
    accidentally dropped.
    """
    args = parse_args()
    logging.basicConfig(format="%(message)s", filename=args.log_file, level=logging.DEBUG)
    logging.info(args)
    # This random seed thing seems to only apply to the data-generation process.
    # The initialization of the neural net doesn't seem to be affected by this
    # ... which is really annoying.
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)
    if args.sim_func is not None:
        # Create data
        data_gen = DataGenerator(
            func_name=args.sim_func,
            n_train=args.num_train,
            n_test=args.num_test,
            num_p=args.num_p,
            noise_sd=args.sim_noise_sd,
        )
        data = data_gen.create_data()
        # Write data to file
        pickle_to_file(data, args.data_file)
    else:
        # Read data
        data = pickle_from_file(args.input_data)
    # Hyperparameter grid for the CV search; single-element lists pin the
    # settings that are not being tuned.
    param_grid = [{
        'layer_sizes': args.layer_sizes,
        'ridge_param': args.ridge_params,
        'max_iters': [args.max_iters],
        'num_inits': [args.num_inits],
        'act_func': [args.act_func],
        'output_act_func': [args.output_act_func],
        'var_import_idxs': [args.var_import_idx],
        'sgd_sample_size': [args.sgd_sample_size],
        'nan_fill_config': [args.nan_fill_config],
        'missing_value_fill': [args.missing_value_fill],
    }]
    # Fit neural network and calculate variable importance
    reduced_func = getattr(
        data_gen_funcs, args.sim_func_reduced) if args.sim_func_reduced else None
    var_imports, fitted_model = calculate_var_imports_no_refit(
        data,
        param_grid=param_grid,
        cv=args.cv,
        reduced_func=reduced_func)
    # Save model
    pickle_to_file(fitted_model, args.model_file)
    # Store var import results
    pickle_to_file(var_imports, args.var_import_file)
    # Print output for each var import estimate
    coverage = []  # NOTE(review): never populated below -- see docstring
    for i, var_group in enumerate(args.var_import_idx):
        v = var_imports[i]["std-True"]
        if i == 0:
            # Full-model fit statistics are shared across groups; log once.
            logging.info("full final r2 %f (1 is best)", v["r2.full"])
            logging.info("full final r2 test %f", v["r2.test.full"])
            logging.info("full final mse train %f", v["mse.train.full"])
            logging.info("full final mse test %f", v["mse.test.full"])
        logging.info(" --- small --- %s ---- ", var_group)
        logging.info("small final r2: %f", v["r2.small"])
        logging.info("small final r2 test: %f", v["r2.test.small"])
        if v["mse.train.small"] is not None:
            logging.info("small final mse train: %f", v["mse.train.small"])
        logging.info("small final mse test: %f", v["mse.test.small"])
        logging.info("one step est std=True: %f, %s", v["onestep"], v["onestep.ci"])
        v_not_std = var_imports[i]["std-False"]
        logging.info("one step est std=False: %f, %s", v_not_std["onestep"], v_not_std["onestep.ci"])
    if args.sim_func_reduced:
        # NOTE(review): with `coverage` empty, np.mean emits a RuntimeWarning
        # and logs nan here.
        logging.info("Average coverage over all the groups: %f (%d/%d)",
                     np.mean(coverage), np.sum(coverage), len(coverage))