def create_kaggle_dir_structure(tmp, script_path, dataset_path, max_size=None, random_state=42):
    data_dir = os.path.join(tmp, "input")
    src_dir = os.path.join(tmp, "src")
    os.makedirs(data_dir, exist_ok=True)
    os.makedirs(src_dir, exist_ok=True)

    script_name = os.path.basename(script_path)
    new_script_path = os.path.join(src_dir, script_name)
    shutil.copy(script_path, new_script_path)

    # create downsampled dataset if needed
    utils.set_seed(random_state)
    dataset_path = prepare_dataset(dataset_path, max_size)

    # symlink the dataset as train.csv *and* test.csv:
    # we are only interested in extracting the pipeline,
    # not in any performance coming from this data
    train_path = os.path.join(data_dir, "train.csv")
    if os.path.exists(train_path):
        os.remove(train_path)
    os.symlink(dataset_path, train_path)

    test_path = os.path.join(data_dir, "test.csv")
    if os.path.exists(test_path):
        os.remove(test_path)
    os.symlink(dataset_path, test_path)

    return new_script_path
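
# Hedged usage sketch for create_kaggle_dir_structure: the script and
# dataset paths below are placeholders (substitute a real kernel script
# and CSV), and `prepare_dataset` / `utils.set_seed` are project helpers
# assumed to be importable at module level.
def _example_create_kaggle_dir_structure():
    import tempfile

    tmp = tempfile.mkdtemp()
    new_script = create_kaggle_dir_structure(
        tmp,
        script_path="kernels/some_kernel.py",   # placeholder script
        dataset_path="datasets/some_data.csv",  # placeholder CSV
        max_size=5000,
    )
    # resulting layout:
    #   <tmp>/input/train.csv -> (possibly downsampled) dataset
    #   <tmp>/input/test.csv  -> same file, symlinked
    #   <tmp>/src/some_kernel.py
    return new_script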

def stat_test_score_diff_improvements(df, strategies, num_comparisons=None, random_state=42):
    assert len(strategies) == 2
    df = df[df["strategy"].isin(strategies) & df["improved"]]
    strategies_present = df["strategy"].unique()
    if len(strategies_present) == 1:
        print("Single strategy", strategies_present[0])
        print("Can't compute paired t-test")
        return None, None
    pv_df = pd.pivot_table(
        df,
        index=["dataset", "id"],
        columns="strategy",
        values="score_diff",
    ).reset_index()
    # keep only (dataset, id) pairs that have a score under both strategies
    no_missing = (~pd.isnull(pv_df[strategies[0]])) & (
        ~pd.isnull(pv_df[strategies[1]]))
    pv_df = pv_df[no_missing]
    scores_0 = pv_df[strategies[0]]
    scores_1 = pv_df[strategies[1]]
    utils.set_seed(random_state)
    stat, p_value = scipy.stats.ttest_rel(scores_0, scores_1)
    if num_comparisons is not None:
        # Bonferroni correction
        p_value = p_value * num_comparisons
    return stat, p_value
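
# Hedged sketch of the input schema the paired tests expect: one row per
# (dataset, id, strategy) with an "improved" flag and a "score_diff"
# relative to the original pipeline. The strategy names and values are
# toy data, and the call assumes this file's module-level imports
# (pd, scipy, utils).
def _example_stat_test_score_diff():
    import pandas as pd

    toy = pd.DataFrame({
        "dataset": ["d1", "d1", "d2", "d2"],
        "id": [0, 0, 0, 0],
        "strategy": ["tree", "random", "tree", "random"],
        "improved": [True, True, True, True],
        "score_diff": [0.05, 0.01, 0.03, 0.02],
    })
    stat, p = stat_test_score_diff_improvements(toy, ["tree", "random"])
    return stat, p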

def main():
    args = get_args()
    pipelines_df = pd.read_pickle(args.input)
    utils.set_seed(args.seed)
    pairs = build_paired_corpus(
        pipelines_df,
        args.num_pre,
        args.num_post,
        args.k,
        sample_method=args.sample_method,
    )
    with open(args.output, "wb") as fout:
        pickle.dump(pairs, fout)

def stat_test_count_improved(
    df,
    strategies,
    num_comparisons=None,
    random_state=42,
):
    assert len(strategies) == 2
    df = df[df["strategy"].isin(strategies)]
    strategies_present = df["strategy"].unique()
    if len(strategies_present) == 1:
        print("Single strategy", strategies_present[0])
        print("Can't compute mcnemar")
        return None, None
    pv = pd.pivot_table(
        df,
        index=["dataset", "id"],
        columns="strategy",
        values="improved",
    ).reset_index()
    # count the paired (improved under strategy 0, improved under
    # strategy 1) outcomes and pivot them into a 2x2 contingency table
    pvg = pv.groupby(strategies).size()
    pvg = pvg.to_frame(name="ct").reset_index()
    pvg_pv = pd.pivot_table(
        pvg,
        index=strategies[0],
        columns=strategies[1],
        values="ct",
    )
    # McNemar's test: non-parametric, for paired binary outcomes
    utils.set_seed(random_state)
    if pd.isnull(pvg_pv.values.flatten()).any():
        print("McNemar can't handle nans")
        return None, None
    cont_table = pvg_pv.values
    if cont_table.shape != (2, 2):
        print("McNemar requires a well-formed contingency table")
        return None, None
    obj = statsmodels.stats.contingency_tables.mcnemar(cont_table)
    stat = obj.statistic
    p_value = obj.pvalue
    if num_comparisons is not None:
        # Bonferroni correction
        p_value = p_value * num_comparisons
    return stat, p_value
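
# Hedged sketch of the 2x2 contingency table McNemar's test consumes:
# rows/columns are the improved-or-not outcomes under each strategy, and
# cells count (dataset, id) pairs. The counts below are toy values.
def _example_mcnemar_table():
    import numpy as np
    import statsmodels.stats.contingency_tables

    # e.g. 10 pairs improved under neither strategy, 3 only under the
    # second, 7 only under the first, 30 under both
    cont_table = np.array([
        [10, 3],
        [7, 30],
    ])
    obj = statsmodels.stats.contingency_tables.mcnemar(cont_table)
    return obj.statistic, obj.pvalue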

def get_repair_hashes(repairer, seed, num_passes=2, num_pipelines=3):
    # note: [[]] * num_passes would alias a single inner list across all
    # passes, so every append would show up in every pass; build
    # independent lists instead
    passes = [[] for _ in range(num_passes)]
    for i in range(0, num_passes):
        print("Pass: {}".format(i))
        utils.set_seed(seed)
        num_remaining = num_pipelines
        pbar = tqdm.tqdm(total=num_pipelines)
        for p in data.pipelines:
            if num_remaining <= 0:
                break
            repaired = repairer.repair(p, data.X, data.y, bound_num_repairs=1)
            orig_md5 = pt.md5(p)
            if repaired is None:
                continue
            repaired_md5 = pt.md5(repaired)
            if orig_md5 == repaired_md5:
                continue
            passes[i].append(repaired_md5)
            num_remaining -= 1
            pbar.update(1)
        pbar.close()
    return passes
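
# Quick illustration of the aliasing pitfall noted above: multiplying a
# list of lists replicates *references* to one inner list, so appends
# show up in every slot. Runnable standalone.
def _example_list_aliasing():
    aliased = [[]] * 3
    aliased[0].append("x")
    assert aliased == [["x"], ["x"], ["x"]]  # all three share one list

    independent = [[] for _ in range(3)]
    independent[0].append("x")
    assert independent == [["x"], [], []]  # only the first was modified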

def run_single_tree(
    X_search,
    y_search,
    X_test,
    y_test,
    test_pipeline_tree,
    enumerator,
    bound_num_repaired_pipelines,
    dev_cv=3,
    bound_k=3,
    cv=5,
    scoring="f1_macro",
    random_state=42,
):
    repairer = PipelineRepairer(enumerator)

    results_summary = []
    orig_info = {
        "type": "orig",
        "graph": test_pipeline_tree,
    }
    orig_compiled = pt.to_pipeline(test_pipeline_tree)

    # TODO: this should be a param
    # should be about 5% of dataset, since search is 50%
    num_obs_search = int(X_search.shape[0] * 0.1)
    assert num_obs_search >= 1

    if isinstance(X_search, pd.DataFrame):
        X_search = X_search.values
    if isinstance(y_search, (pd.DataFrame, pd.Series)):
        y_search = y_search.values

    X_search = X_search[:num_obs_search]
    y_search = y_search[:num_obs_search]

    utils.set_seed(random_state)
    repaired = repairer.repair(
        orig_compiled,
        X_search,
        y_search,
        bound_k=bound_k,
        bound_num_repairs=bound_num_repaired_pipelines,
        scoring=scoring,
        cv=dev_cv,
        random_state=random_state,
        verbosity=1,
    )

    try:
        print("Evaluate original")
        utils.set_seed(random_state)
        orig_results = mp_utils.run(
            DEFAULT_TIMEOUT_EVAL,
            cross_validate,
            orig_compiled,
            X_test,
            y_test,
            cv=StratifiedKFold(
                cv,
                random_state=random_state,
                shuffle=True,
            ),
            scoring=scoring,
            return_estimator=True,
            return_train_score=True,
        )
        orig_info["test_scores"] = orig_results["test_score"]
        orig_info["mean_test_score"] = np.mean(orig_results["test_score"])
        orig_info["failed"] = False
        orig_info["timedout"] = False
    except mp_utils.TimeoutError:
        print("Timed out on original pipeline")
        orig_info["failed"] = True
        orig_info["timedout"] = True
        orig_info["test_scores"] = []
        orig_info["mean_test_score"] = np.nan
    except Exception as err:
        print("Failed to run original pipeline")
        print(err)
        orig_info["failed"] = True
        orig_info["timedout"] = False
        orig_info["test_scores"] = []
        orig_info["mean_test_score"] = np.nan

    if repaired is None:
        print("No repair found")
        orig_info["no_repaired_candidates"] = True
        results_summary.append(orig_info)
        return pd.DataFrame(results_summary)
    else:
        orig_info["no_repaired_candidates"] = False
        results_summary.append(orig_info)

    repair_info = {
        "type": "repair",
        "graph": pt.to_tree(repaired),
        "no_repaired_candidates": False,
    }
    try:
        print("Evaluate repaired")
        utils.set_seed(random_state)
        repaired_results = mp_utils.run(
            DEFAULT_TIMEOUT_EVAL,
            cross_validate,
            repaired,
            X_test,
            y_test,
            cv=StratifiedKFold(
                cv,
                random_state=random_state,
                shuffle=True,
            ),
            scoring=scoring,
            return_estimator=True,
            return_train_score=True,
        )
        repair_info["test_scores"] = repaired_results["test_score"]
        repair_info["mean_test_score"] = np.mean(
            repaired_results["test_score"])
        repair_info["failed"] = False
        repair_info["timedout"] = False
    except mp_utils.TimeoutError:
        print("Timed out on repaired pipeline")
        repair_info["failed"] = True
        repair_info["timedout"] = True
        repair_info["test_scores"] = []
        repair_info["mean_test_score"] = np.nan
    except Exception as err:
        print("Failed to run repaired pipeline")
        print(err)
        repair_info["test_scores"] = []
        repair_info["mean_test_score"] = np.nan
        repair_info["failed"] = True
        repair_info["timedout"] = False

    repair_info["repairer_statistics"] = repairer.statistics
    orig_info["repairer_statistics"] = None
    results_summary.append(repair_info)
    return pd.DataFrame(results_summary)
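
# mp_utils.run is a project helper; a minimal sketch of the behavior it
# is assumed to provide (run a callable with a wall-clock budget, raise
# on timeout) could use a single-worker pool as below. Illustrative
# only: the real helper may differ, and the callable plus its arguments
# must be picklable. Note this raises multiprocessing.TimeoutError,
# whereas the code above catches the project's mp_utils.TimeoutError.
def _example_run_with_timeout(timeout, fn, *args, **kwargs):
    import multiprocessing

    with multiprocessing.Pool(processes=1) as pool:
        async_result = pool.apply_async(fn, args, kwargs)
        # .get raises multiprocessing.TimeoutError if the budget expires;
        # leaving the with-block then terminates the worker process
        return async_result.get(timeout)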

def run_evaluation(
    script_paths,
    enumerator,
    bound_num_repaired_pipelines,
    scoring=None,
    max_size=5000,
    dev_fraction=0.2,
    dev_cv=3,
    bound_k=3,
    cv=5,
    random_state=42,
):
    results = []
    for ix, script_path in tqdm.tqdm(enumerate(script_paths)):
        user_script = UserScript(script_path)
        X, y = utils.get_dataset(user_script.dataset)

        # same sampling/train/test split based on dataset
        dataset_seed = get_dataset_seed(user_script.dataset)
        utils.set_seed(dataset_seed)
        if X.shape[0] > max_size:
            sample_idx, _ = train_test_split(
                np.arange(0, X.shape[0]),
                train_size=max_size,
                random_state=dataset_seed,
                stratify=y,
            )
            X = X[sample_idx]
            y = y[sample_idx]

        X_rest, X_search, y_rest, y_search = train_test_split(
            X,
            y,
            test_size=dev_fraction,
            random_state=dataset_seed,
            stratify=y,
        )
        orig_pipeline = pt.to_tree(user_script.make_pipeline())
        result = run_single_tree(
            X_search,
            y_search,
            X_rest,
            y_rest,
            orig_pipeline,
            enumerator,
            bound_num_repaired_pipelines,
            dev_cv=dev_cv,
            bound_k=bound_k,
            cv=cv,
            scoring=user_script.metric if scoring is None else scoring,
            random_state=random_state + ix,
        )
        result["script_path"] = user_script.path
        result["dataset"] = user_script.dataset
        result["metric"] = user_script.metric
        result["id"] = ix
        # doesn't matter... just adding for consistency with what we
        # output for synthetic_evaluation
        result["timestamp"] = np.random.random()
        results.append(result)
    df_results = pd.concat(results, axis=0)
    return df_results
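
# get_dataset_seed is a project helper; the behavior assumed above is a
# deterministic per-dataset seed, so sampling and splits are
# reproducible across runs. A minimal sketch of one way to derive such a
# seed from the dataset name (not necessarily the repo's actual scheme):
def _example_get_dataset_seed(dataset_name):
    import hashlib

    digest = hashlib.md5(dataset_name.encode("utf-8")).hexdigest()
    # fold the digest into the 32-bit range accepted by numpy seeding
    return int(digest, 16) % (2 ** 32)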