Code example #1
def create_kaggle_dir_structure(tmp,
                                script_path,
                                dataset_path,
                                max_size=None,
                                random_state=42):
    data_dir = os.path.join(tmp, "input")
    src_dir = os.path.join(tmp, "src")
    os.makedirs(data_dir, exist_ok=True)
    os.makedirs(src_dir, exist_ok=True)

    script_name = os.path.basename(script_path)
    new_script_path = os.path.join(src_dir, script_name)
    shutil.copy(script_path, new_script_path)

    # create downsampled dataset if needed
    utils.set_seed(random_state)
    dataset_path = prepare_dataset(dataset_path, max_size)

    # symlink the dataset as train.csv *and* test.csv
    # we are only interested in getting the pipeline
    # not any performance coming from this
    train_path = os.path.join(data_dir, "train.csv")
    if os.path.exists(train_path):
        os.remove(train_path)
    os.symlink(dataset_path, train_path)

    test_path = os.path.join(data_dir, "test.csv")
    if os.path.exists(test_path):
        os.remove(test_path)
    os.symlink(dataset_path, test_path)

    return new_script_path
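A minimal usage sketch for the function above, assuming `prepare_dataset` and `utils` from the surrounding project are importable; the script and dataset paths below are illustrative only, not taken from the source.

import tempfile

with tempfile.TemporaryDirectory() as tmp:
    # Hypothetical paths; in practice these come from the evaluation harness.
    new_script = create_kaggle_dir_structure(
        tmp,
        script_path="scripts/example_pipeline.py",
        dataset_path="data/example.csv",
        max_size=1000,
    )
    # <tmp>/src now holds a copy of the script, and <tmp>/input/train.csv and
    # <tmp>/input/test.csv are symlinks to the (possibly downsampled) dataset.
    print(new_script)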
Code example #2
def stat_test_score_diff_improvements(df,
                                      strategies,
                                      num_comparisons=None,
                                      random_state=42):
    assert len(strategies) == 2
    df = df[df["strategy"].isin(strategies) & df["improved"]]
    strategies_present = df["strategy"].unique()
    if len(strategies_present) == 1:
        print("Single strategy", strategies_present[0])
        print("Can't compute paired t-test")
        return None, None

    pv_df = pd.pivot_table(
        df,
        index=["dataset", "id"],
        columns="strategy",
        values="score_diff",
    ).reset_index()
    no_missing = (~pd.isnull(pv_df[strategies[0]])) & (
        ~pd.isnull(pv_df[strategies[1]]))
    pv_df = pv_df[no_missing]

    scores_0 = pv_df[strategies[0]]
    scores_1 = pv_df[strategies[1]]
    utils.set_seed(random_state)
    stat, p_value = scipy.stats.ttest_rel(scores_0, scores_1)
    if num_comparisons is not None:
        # Bonferroni-style correction for the number of comparisons
        p_value = p_value * num_comparisons
    return stat, p_value
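A hypothetical call sketch for the paired t-test above. The column layout (`dataset`, `id`, `strategy`, `improved`, `score_diff`) is inferred from the function body; the strategy names and values are invented purely to show the expected input shape, and `utils.set_seed` from the project is assumed to be in scope.

import pandas as pd

toy = pd.DataFrame({
    "dataset":    ["d1", "d1", "d1", "d1", "d2", "d2"],
    "id":         [0, 0, 1, 1, 0, 0],
    "strategy":   ["tree", "random", "tree", "random", "tree", "random"],
    "improved":   [True] * 6,
    "score_diff": [0.05, 0.01, 0.03, 0.02, 0.10, 0.04],
})

# num_comparisons applies the Bonferroni-style correction to the p-value.
stat, p_value = stat_test_score_diff_improvements(
    toy, strategies=["tree", "random"], num_comparisons=3)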
Code example #3
def main():
    args = get_args()
    pipelines_df = pd.read_pickle(args.input)
    utils.set_seed(args.seed)
    pairs = build_paired_corpus(
        pipelines_df,
        args.num_pre,
        args.num_post,
        args.k,
        sample_method=args.sample_method,
    )
    with open(args.output, "wb") as fout:
        pickle.dump(pairs, fout)
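`get_args` is not part of this excerpt; the sketch below is a guessed reconstruction based only on the attributes `main` reads (`input`, `output`, `seed`, `num_pre`, `num_post`, `k`, `sample_method`). Defaults, types, and help strings are assumptions.

import argparse

def get_args():
    # Hypothetical parser; only the attribute names come from main() above.
    parser = argparse.ArgumentParser(description="Build a paired pipeline corpus")
    parser.add_argument("--input", type=str, help="pickled pipelines DataFrame")
    parser.add_argument("--output", type=str, help="output pickle for pairs")
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--num_pre", type=int)
    parser.add_argument("--num_post", type=int)
    parser.add_argument("--k", type=int)
    parser.add_argument("--sample_method", type=str, default=None)
    return parser.parse_args()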
Code example #4
def stat_test_count_improved(
    df,
    strategies,
    num_comparisons=None,
    random_state=42,
):
    assert len(strategies) == 2
    df = df[df["strategy"].isin(strategies)]
    strategies_present = df["strategy"].unique()
    if len(strategies_present) == 1:
        print("Single strategy", strategies_present[0])
        print("Can't compute mcnemar")
        return None, None

    pv = pd.pivot_table(df,
                        index=["dataset", "id"],
                        columns="strategy",
                        values="improved").reset_index()
    pvg = pv.groupby(strategies).size()
    pvg = pvg.to_frame(name="ct").reset_index()
    pvg_pv = pd.pivot_table(
        pvg,
        index=strategies[0],
        columns=strategies[1],
        values="ct",
    )
    # non-parametric for paired tests
    utils.set_seed(random_state)
    if pd.isnull(pvg_pv.values.flatten()).any():
        print("McNemar can't handle nans")
        return None, None

    cont_table = pvg_pv.values
    if cont_table.shape != (2, 2):
        print("McNemar requires well formed contingency table")
        return None, None

    obj = statsmodels.stats.contingency_tables.mcnemar(cont_table)
    stat = obj.statistic
    p_value = obj.pvalue
    if num_comparisons is not None:
        p_value = p_value * num_comparisons
    return stat, p_value
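For reference, a self-contained sketch of the statsmodels McNemar call this function builds up to, on a hand-made 2x2 contingency table of improved/not-improved counts for two strategies; the counts are invented.

import numpy as np
import statsmodels.stats.contingency_tables

# Rows: strategy A improved (False, True); columns: strategy B improved (False, True).
cont_table = np.array([
    [12,  5],
    [ 9, 30],
])
res = statsmodels.stats.contingency_tables.mcnemar(cont_table)
print(res.statistic, res.pvalue)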
Code example #5
def get_repair_hashes(repairer, seed, num_passes=2, num_pipelines=3):
    passes = [[] for _ in range(num_passes)]
    for i in range(0, num_passes):
        print("Pass: {}".format(i))
        utils.set_seed(seed)
        num_remaining = num_pipelines
        pbar = tqdm.tqdm(total=num_pipelines)
        for p in data.pipelines:
            if num_remaining <= 0:
                break
            repaired = repairer.repair(p, data.X, data.y, bound_num_repairs=1)
            orig_md5 = pt.md5(p)
            if repaired is None:
                continue
            repaired_md5 = pt.md5(repaired)
            if orig_md5 == repaired_md5:
                continue
            passes[i].append(repaired_md5)
            num_remaining -= 1
            pbar.update(1)
        pbar.close()
    return passes
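Note on the `passes` initialization: `[[]] * num_passes` would create `num_passes` references to a single shared list, so every append would show up in every pass; the list comprehension used above avoids that. A quick self-contained illustration:

shared = [[]] * 2
shared[0].append("x")
print(shared)        # [['x'], ['x']] -- both slots alias one list

independent = [[] for _ in range(2)]
independent[0].append("x")
print(independent)   # [['x'], []]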
Code example #6
def run_single_tree(
    X_search,
    y_search,
    X_test,
    y_test,
    test_pipeline_tree,
    enumerator,
    bound_num_repaired_pipelines,
    dev_cv=3,
    bound_k=3,
    cv=5,
    scoring="f1_macro",
    random_state=42,
):
    repairer = PipelineRepairer(enumerator)

    results_summary = []
    orig_info = {
        "type": "orig",
        "graph": test_pipeline_tree,
    }

    orig_compiled = pt.to_pipeline(test_pipeline_tree)

    # TODO: this should be a param
    # should be about 5% of dataset, since search is 50%
    num_obs_search = int(X_search.shape[0] * 0.1)
    assert num_obs_search >= 1
    if isinstance(X_search, pd.DataFrame):
        X_search = X_search.values
    if isinstance(y_search, (pd.DataFrame, pd.Series)):
        y_search = y_search.values

    X_search = X_search[:num_obs_search]
    y_search = y_search[:num_obs_search]

    utils.set_seed(random_state)
    repaired = repairer.repair(
        orig_compiled,
        X_search,
        y_search,
        bound_k=bound_k,
        bound_num_repairs=bound_num_repaired_pipelines,
        scoring=scoring,
        cv=dev_cv,
        random_state=random_state,
        verbosity=1,
    )

    try:
        print("Evaluate original")
        utils.set_seed(random_state)
        orig_results = mp_utils.run(
            DEFAULT_TIMEOUT_EVAL,
            cross_validate,
            orig_compiled,
            X_test,
            y_test,
            cv=StratifiedKFold(
                cv,
                random_state=random_state,
                shuffle=True,
            ),
            scoring=scoring,
            return_estimator=True,
            return_train_score=True,
        )
        orig_info["test_scores"] = orig_results["test_score"]
        orig_info["mean_test_score"] = np.mean(orig_results["test_score"])
        orig_info["failed"] = False
        orig_info["timedout"] = False
    except mp_utils.TimeoutError:
        print("Timedout on original pipeline")
        orig_info["failed"] = True
        orig_info["timedout"] = True
        orig_info["test_scores"] = []
        orig_info["mean_test_score"] = np.nan
    except Exception as err:
        print("Failed to run original pipeline")
        print(err)
        orig_info["failed"] = True
        orig_info["timedout"] = False
        orig_info["test_scores"] = []
        orig_info["mean_test_score"] = np.nan

    if repaired is None:
        print("No repair found")
        orig_info["no_repaired_candidates"] = True
        results_summary.append(orig_info)
        return pd.DataFrame(results_summary)
    else:
        orig_info["no_repaired_candidates"] = False

    results_summary.append(orig_info)

    repair_info = {
        "type": "repair",
        "graph": pt.to_tree(repaired),
        "no_repaired_candidates": False,
    }
    try:
        print("Evaluate repaired")
        utils.set_seed(random_state)
        repaired_results = mp_utils.run(
            DEFAULT_TIMEOUT_EVAL,
            cross_validate,
            repaired,
            X_test,
            y_test,
            cv=StratifiedKFold(
                cv,
                random_state=random_state,
                shuffle=True,
            ),
            scoring=scoring,
            return_estimator=True,
            return_train_score=True,
        )
        repair_info["test_scores"] = repaired_results["test_score"]
        repair_info["mean_test_score"] = np.mean(
            repaired_results["test_score"])
        repair_info["failed"] = False
        repair_info["timedout"] = False
    except mp_utils.TimeoutError:
        print("Timedout on repair pipeline")
        orig_info["failed"] = True
        orig_info["timedout"] = True
        orig_info["test_scores"] = []
        orig_info["mean_test_score"] = np.nan
    except Exception as err:
        print("Failed to run repaired pipeline")
        print(err)
        repair_info["test_scores"] = []
        repair_info["mean_test_score"] = np.nan
        repair_info["failed"] = True
        repair_info["timedout"] = False

    repair_info["repairer_statistics"] = repairer.statistics
    orig_info["repairer_statistics"] = None
    results_summary.append(repair_info)
    return pd.DataFrame(results_summary)
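The evaluation above relies on scikit-learn's `cross_validate` with a seeded `StratifiedKFold`, so the original and repaired pipelines are scored on identical folds. A self-contained sketch of that pattern on a toy dataset (the estimator and data are placeholders, not part of the source):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_validate

X, y = make_classification(n_samples=200, random_state=0)
clf = LogisticRegression(max_iter=1000)

# Seeding the splitter (not just the estimator) keeps the folds identical
# across the original-vs-repaired comparison.
folds = StratifiedKFold(5, shuffle=True, random_state=42)
res = cross_validate(clf, X, y, cv=folds, scoring="f1_macro",
                     return_estimator=True, return_train_score=True)
print(np.mean(res["test_score"]))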
Code example #7
def run_evaluation(
        script_paths,
        enumerator,
        bound_num_repaired_pipelines,
        scoring=None,
        max_size=5000,
        dev_fraction=0.2,
        dev_cv=3,
        bound_k=3,
        cv=5,
        random_state=42,
):

    results = []
    for ix, script_path in tqdm.tqdm(enumerate(script_paths)):
        user_script = UserScript(script_path)

        X, y = utils.get_dataset(user_script.dataset)

        # same sampling/train/test split based on dataset
        dataset_seed = get_dataset_seed(user_script.dataset)
        utils.set_seed(dataset_seed)
        if X.shape[0] > max_size:
            sample_idx, _ = train_test_split(
                np.arange(0, X.shape[0]),
                train_size=max_size,
                random_state=dataset_seed,
                stratify=y,
            )
            X = X[sample_idx]
            y = y[sample_idx]

        X_rest, X_search, y_rest, y_search = train_test_split(
            X,
            y,
            test_size=dev_fraction,
            random_state=dataset_seed,
            stratify=y,
        )

        orig_pipeline = pt.to_tree(user_script.make_pipeline())
        result = run_single_tree(
            X_search,
            y_search,
            X_rest,
            y_rest,
            orig_pipeline,
            enumerator,
            bound_num_repaired_pipelines,
            dev_cv=dev_cv,
            bound_k=bound_k,
            cv=cv,
            scoring=user_script.metric if scoring is None else scoring,
            random_state=random_state + ix,
        )
        result["script_path"] = user_script.path
        result["dataset"] = user_script.dataset
        result["metric"] = user_script.metric
        result["id"] = ix
        # doesn't matter ... just adding for consistency
        # with what we output for synthetic_evaluation
        result["timestamp"] = np.random.random()
        results.append(result)

    df_results = pd.concat(results, axis=0)
    return df_results
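The per-dataset seeding in `run_evaluation` keeps every strategy on the same stratified downsample and the same search/rest split. A self-contained sketch of that pattern (the fixed seed stands in for `get_dataset_seed`, and the toy data replaces `utils.get_dataset`):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=10000, random_state=0)
max_size = 5000
dataset_seed = 13  # stand-in for get_dataset_seed(dataset_name)

# Deterministic, stratified downsample to at most max_size rows.
if X.shape[0] > max_size:
    sample_idx, _ = train_test_split(
        np.arange(X.shape[0]),
        train_size=max_size,
        random_state=dataset_seed,
        stratify=y,
    )
    X, y = X[sample_idx], y[sample_idx]

# Small "search" split used only for selecting repairs; the rest is for scoring.
X_rest, X_search, y_rest, y_search = train_test_split(
    X, y, test_size=0.2, random_state=dataset_seed, stratify=y)
print(X_search.shape, X_rest.shape)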