Example #1
def setup_learning_curve_point(  # pylint:disable=too-many-arguments, too-many-locals
        trainxpath, trainypath, validxpath, validypath, points, basepath):
    # Set up directories
    learning_curve_point_dir = os.path.join(
        basepath, "_".join(["model", str(int(points))]))
    make_if_not_exist(learning_curve_point_dir)

    datadir = os.path.join(learning_curve_point_dir, "data")
    make_if_not_exist(datadir)

    modeldir = os.path.join(learning_curve_point_dir, "model")
    make_if_not_exist(modeldir)

    # Select diverse subset
    x = np.load(trainxpath)
    y = np.load(trainypath)

    xvalid = np.load(validxpath)
    yvalid = np.load(validypath)

    xsummarized, ysummarized = summarize_data(x, y, points)
    xsummarized_valid, ysummarized_valid = summarize_data(
        xvalid, yvalid, points)

    np.save(os.path.join(datadir, "features"), xsummarized)
    np.save(os.path.join(datadir, "labels"), ysummarized)

    np.save(os.path.join(datadir, "features_valid"), xsummarized_valid)
    np.save(os.path.join(datadir, "labels_valid"), ysummarized_valid)

    return datadir, modeldir
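The directory helper used above is not shown in this excerpt; a minimal sketch of what make_if_not_exist is assumed to do (create the directory, including parents, if it is missing):

import os

def make_if_not_exist(path):
    """Create `path` (and any missing parent directories) if it does not exist yet."""
    os.makedirs(path, exist_ok=True)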
def main(modelpath, xtrainpath, ytrainpath, xtestpath, ytestpath, outdir):  # pylint:disable=too-many-arguments,too-many-locals
    """CLI"""
    scalerpath = os.path.join(modelpath, "scaler_0.joblib")
    assert os.path.exists(scalerpath)
    scaler = joblib.load(scalerpath)

    make_if_not_exist(outdir)

    print("loading data")
    X_train = scaler.transform(np.load(xtrainpath))
    X_test = scaler.transform(np.load(xtestpath))

    y_train = np.load(ytrainpath).astype(int)
    y_test = np.load(ytestpath).astype(int)

    models = glob(os.path.join(modelpath, "*.joblib"))

    print("now starting dask and running the actual computation")
    global cluster
    global client
    cluster = LocalCluster(memory_limit="28GB", n_workers=4)
    client = Client(cluster)

    relevant_models = [[model, joblib.load(model)] for model in models
                       if "scaler" not in model]

    bvpartial = partial(bv_decomp_wrapper,
                        xtrain=X_train,
                        ytrain=y_train,
                        xtest=X_test,
                        ytest=y_test)
    futures = client.map(bvpartial, relevant_models)

    results = client.gather(futures)

    print("finished crunching, now dumping results")

    with open(os.path.join(outdir, "bv_decomposition.pkl"), "wb") as fh:
        pickle.dump(results, fh)
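bv_decomp_wrapper is defined elsewhere in the repository; the sketch below is a hypothetical stand-in that assumes mlxtend's bias_variance_decomp and the [path, fitted model] pairs built above, not the author's exact implementation:

from mlxtend.evaluate import bias_variance_decomp

def bv_decomp_wrapper(path_and_model, xtrain, ytrain, xtest, ytest):
    """Return the model path together with its average loss, bias and variance."""
    path, estimator = path_and_model
    avg_loss, bias, var = bias_variance_decomp(
        estimator, xtrain, ytrain, xtest, ytest, loss="0-1_loss", num_rounds=20)
    return path, avg_loss, bias, var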
def setup_learning_curve_point(trainxpath, trainypath, points):
    # Set up directories
    learning_curve_point_dir = "_".join(["model", str(int(points))])
    make_if_not_exist(learning_curve_point_dir)

    datadir = os.path.join(learning_curve_point_dir, "data")
    make_if_not_exist(datadir)

    modeldir = os.path.join(learning_curve_point_dir, "model")
    make_if_not_exist(modeldir)

    # Select diverse subset
    x = np.load(trainxpath)
    y = np.load(trainypath)

    xsummarized, ysummarized = summarize_data(x, y, points)

    np.save(os.path.join(datadir, "features"), xsummarized)
    np.save(os.path.join(datadir, "labels"), ysummarized)

    return datadir, modeldir
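summarize_data ("select diverse subset") is also defined elsewhere; one plausible sketch, assuming a k-means based selection of `points` representative rows (the real helper may use a different diversity criterion):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

def summarize_data(x, y, points):
    """Return the `points` rows of x (and matching y) closest to k-means centroids."""
    kmeans = KMeans(n_clusters=int(points), random_state=0).fit(x)
    indices, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, x)
    return x[indices], y[indices]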
def main(submit):
    make_if_not_exist("learning_curves")
    for estimator in to_analyze:
        modelpath = underscore_join(["model", estimator])
        if check_if_model_exists(modelpath):
            for model in glob(os.path.join(modelpath, "*.joblib")):
                if "ensemble" in model:
                    p = Path(model)
                    modelbasename = os.path.join(
                        "learning_curves", underscore_join([estimator,
                                                            p.stem]))
                    make_if_not_exist(modelbasename)
                    for point in POINTS:
                        modelpointname = os.path.join(modelbasename,
                                                      str(point))
                        make_if_not_exist(modelpointname)
                        command = write_run_command(
                            model,
                            underscore_join(["data", estimator]),
                            os.path.join(
                                underscore_join(["houldout", estimator]),
                                "valid"),
                            underscore_join(["houldout", estimator]),
                            modelpointname,
                            point,
                        )
                        submission_name = underscore_join(
                            [estimator, Path(modelpointname).stem])
                        write_slurmfile(submission_name, command)

                        if submit:
                            subprocess.call(
                                "sbatch submit_{}.slurm".format(
                                    submission_name),
                                shell=True,
                            )
def main(  # pylint:disable=too-many-arguments, too-many-locals
    model,
    xtrainpath,
    ytrainpath,
    xtestpath,
    ytestpath,
    featurenamespath,
    outpath,
    rounds,
    points,
    use_shap,
):
    # Load the model and the data, and scale the features
    print("loading model and data")
    model = joblib.load(model)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(np.load(xtrainpath))
    y_train = np.load(ytrainpath)

    X_test = scaler.transform(np.load(xtestpath))
    y_test = np.load(ytestpath)

    X_train, y_train = summarize_data(X_train, y_train, points)
    X_test, y_test = summarize_data(X_test, y_test, points)

    # load the feature names
    feature_names = read_pickle(featurenamespath)

    permuation_importance_partial = partial(permuation_importance_wrapper,
                                            rounds=rounds,
                                            model=model)

    sets = [(X_train, y_train, "train"), (X_test, y_test, "test")]

    if not use_shap:
        print("starting permutation feature importance")
        # Run permutation feature importance for the requested number of rounds, using balanced accuracy as the metric
        with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
            results = []
            for result in executor.map(permuation_importance_partial, sets):
                results.append(result)

        make_if_not_exist(outpath)
        with open(os.path.join(outpath, "permutation_feature_importance.pkl"),
                  "wb") as fh:
            pickle.dump(results, fh)

    else:
        print("starting SHAP feature importance")
        make_if_not_exist(outpath)
        explainer = shap.KernelExplainer(
            model.predict, X_train
        )  # note that we use the training set as the background dataset to integrate out features
        shap_values = explainer.shap_values(X_test)
        shap_values_df = pd.DataFrame()
        shap_values_df["feature"] = feature_names

        for i, shap_value in enumerate(shap_values):
            # Average impact of each feature on the model output for target i:
            # mean(|SHAP value|) over the explained samples
            abs_mean_shap_values = np.mean(np.abs(shap_value), axis=0)
            expected_value = explainer.expected_value[i]
            shap_values_df["shap_value_target_{}".format(i)] = abs_mean_shap_values
            shap_values_df["expected_value_target_{}".format(i)] = expected_value

        joblib.dump(explainer, os.path.join(outpath, "shap_explainer"))
        shap_values_df.to_csv(os.path.join(outpath, "shap_df.csv"),
                              index=False)
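A short usage sketch for the artifacts written above (the output directory name is hypothetical; use whatever was passed as outpath):

import os
import pandas as pd

outpath = "feature_importance_results"  # hypothetical output directory
shap_df = pd.read_csv(os.path.join(outpath, "shap_df.csv"))
# Rank features by their mean absolute SHAP value for the first target
print(shap_df.sort_values("shap_value_target_0", ascending=False).head(10))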