Example #1
def loco_calc(experiment, cache_check=False, **kwargs):
    """Calculate LOCO values.

    Args:
        experiment (str): Experiment (e.g. 'ALL').
        cache_check (bool): If True, only check whether cached results exist (no computation).

    """
    # Operate on cached data only.
    get_experiment_split_data.check_in_store(experiment)
    X_train, X_test, y_train, y_test = get_experiment_split_data(experiment)

    loco_results = optional_client_call(
        calculate_loco,
        dict(
            rf=DaskRandomForestRegressor(**param_dict),
            X_train=X_train,
            y_train=y_train,
            X_test=X_test,
            y_test=y_test,
            leave_out=("", *selected_features[experiment]),
            local_n_jobs=(1 if (get_ncpus() < 4) else (get_ncpus() - 2)),
        ),
        cache_check=cache_check,
        add_client=True,
    )[0]

    if cache_check:
        return IN_STORE
    return loco_results
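For reference, a minimal sketch (not the project's actual caching layer) of the cache_check pattern used above, where a cached callable exposes a check_in_store method and callers return an IN_STORE sentinel instead of recomputing:

from functools import wraps

IN_STORE = object()  # hypothetical sentinel mirroring the IN_STORE marker above


def simple_cache(func):
    """Hypothetical stand-in for the project's on-disk caching decorator."""
    store = {}

    @wraps(func)
    def wrapper(*args):
        if args not in store:
            store[args] = func(*args)
        return store[args]

    def check_in_store(*args):
        # Succeed only if a result for these arguments is already cached.
        if args not in store:
            raise KeyError(f"No cached result for {args!r}")
        return IN_STORE

    wrapper.check_in_store = check_in_store
    return wrapper


@simple_cache
def slow_square(x):
    return x ** 2


slow_square(3)  # computes and caches
assert slow_square.check_in_store(3) is IN_STORE  # cache-only check, no recomputation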
Example #2
def common_get_model_scores(rf, X_test, X_train, y_test, y_train):
    """Compute train and test R2 and MSE scores for a fitted regressor."""
    rf.n_jobs = get_ncpus()
    with parallel_backend("threading", n_jobs=get_ncpus()):
        y_pred = rf.predict(X_test)
        y_train_pred = rf.predict(X_train)
    return {
        "test_r2": r2_score(y_test, y_pred),
        "test_mse": mean_squared_error(y_test, y_pred),
        "train_r2": r2_score(y_train, y_train_pred),
        "train_mse": mean_squared_error(y_train, y_train_pred),
    }
Example #3
def get_model_scores(model, X_test, X_train, y_test, y_train):
    """Compute train/test R2 and MSE scores plus the out-of-bag (OOB) R2."""
    # XXX: Get train OOB score (check Dask impl.), train CV score
    model.n_jobs = get_ncpus()

    with parallel_backend("threading", n_jobs=get_ncpus()):
        y_pred = model.predict(X_test)
        y_train_pred = model.predict(X_train)

    return {
        "test_r2": r2_score(y_test, y_pred),
        "test_mse": mean_squared_error(y_test, y_pred),
        "train_r2": r2_score(y_train, y_train_pred),
        "train_mse": mean_squared_error(y_train, y_train_pred),
        "oob_r2": model.oob_score_,
    }
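Note that the `oob_score_` attribute read above is only populated when the forest is fitted with `oob_score=True` (and bootstrapping enabled); a minimal self-contained illustration with plain scikit-learn:

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

X, y = make_regression(n_samples=200, n_features=4, random_state=0)
rf = RandomForestRegressor(n_estimators=50, oob_score=True, bootstrap=True, random_state=0)
rf.fit(X, y)
print(rf.oob_score_)  # out-of-bag R2 on the training data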
Example #4
def func():
    def save_pdp_plot_2d(model, X_train, features, n_jobs):
        model.n_jobs = n_jobs
        with parallel_backend("threading", n_jobs=n_jobs):
            pdp_interact_out = pdp.pdp_interact(
                model=model,
                dataset=X_train,
                model_features=X_train.columns,
                features=features,
                num_grid_points=[20, 20],
            )

        fig, axes = pdp.pdp_interact_plot(
            pdp_interact_out, features, x_quantile=True, figsize=(7, 8)
        )
        axes["pdp_inter_ax"].xaxis.set_tick_params(rotation=45)
        figure_saver.save_figure(fig, "__".join(features), sub_directory="pdp_2d")

    X_train, X_test, y_train, y_test = data_split_cache.load()
    results, rf = cross_val_cache.load()
    columns_list = list(combinations(X_train.columns, 2))

    index = int(os.environ["PBS_ARRAY_INDEX"])
    print("Index:", index)
    print("Columns:", columns_list[index])

    ncpus = get_ncpus()
    print("NCPUS:", ncpus)

    # Use the array index to select the desired columns.
    save_pdp_plot_2d(rf, X_train, columns_list[index], ncpus)
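A short sketch of the PBS array-job pattern used above (the feature names are illustrative): each array task selects one 2-feature combination purely via its `PBS_ARRAY_INDEX`:

import os
from itertools import combinations

feature_names = ["FAPAR", "DRY_DAY_PERIOD", "TEMPERATURE"]
columns_list = list(combinations(feature_names, 2))

# PBS sets PBS_ARRAY_INDEX per array task; default to 0 for local testing.
index = int(os.environ.get("PBS_ARRAY_INDEX", 0))
print("Selected columns:", columns_list[index])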
Example #5
def gfed4_variogram(i):
    """Plot and save a GFED4 burned-area variogram for the selected data."""
    chosen_coords, chosen_ba_data, title = get_gfed4_variogram_data(i)

    fig, ax1, ax2 = plot_variogram(
        chosen_coords,
        chosen_ba_data,
        bins=50,
        max_lag=2000,
        n_jobs=get_ncpus(),
        n_per_job=6000,
        verbose=True,
    )
    # fig.suptitle(f"{title}, {inds.shape[0]} samples (out of {valid_indices.shape[0]})")
    ax1.set_ylabel("Semivariance")
    ax2.set_ylabel("N")
    ax2.set_yscale("log")
    ax1.set_xlabel("Lag (km)")

    for ax in (ax1, ax2):
        ax.grid()

    format_label_string_with_exponent(ax1, axis="y")

    fig.align_labels()

    figure_saver.save_figure(fig, "mean_gfed4_variogram")
Example #6
def fit_combination(X, y, combination, split_index):
    """Fit a random forest on one KFold split of the given feature combination."""
    train_indices, test_indices = zip(
        *KFold(n_splits=n_splits, shuffle=True, random_state=0).split(X)
    )

    X = X[list(combination)].to_numpy()
    y = y.to_numpy()

    assert X.shape[1] == 15

    X_train = X[train_indices[split_index]]
    y_train = y[train_indices[split_index]]

    X_test = X[test_indices[split_index]]
    y_test = y[test_indices[split_index]]

    scores = {}

    with parallel_backend("threading", n_jobs=get_ncpus()):
        rf = DaskRandomForestRegressor(**param_dict)
        rf.fit(X_train, y_train)

        y_test_pred = rf.predict(X_test)
        scores[("test_score", split_index)] = {
            "r2": r2_score(y_true=y_test, y_pred=y_test_pred),
            "mse": mean_squared_error(y_true=y_test, y_pred=y_test_pred),
        }

        y_train_pred = rf.predict(X_train)
        scores[("train_score", split_index)] = {
            "r2": r2_score(y_true=y_train, y_pred=y_train_pred),
            "mse": mean_squared_error(y_true=y_train, y_pred=y_train_pred),
        }

    return scores
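The `zip(*KFold(...).split(X))` idiom above groups all train index arrays and all test index arrays together, so a single split can then be picked by position; a small self-contained illustration:

import numpy as np
from sklearn.model_selection import KFold

X_demo = np.arange(20).reshape(10, 2)
train_indices, test_indices = zip(*KFold(n_splits=5, shuffle=True, random_state=0).split(X_demo))

assert len(train_indices) == len(test_indices) == 5
print(train_indices[0], test_indices[0])  # index arrays for split 0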
Example #7
def threading_get_model_predict(*, cache_check=False, **kwargs):
    """Cached model prediction with the local threading backend."""
    kwargs["parallel_backend_call"] = (
        # Use local threading backend.
        partial(parallel_backend, "threading", n_jobs=get_ncpus()))
    if cache_check:
        return get_model_predict.check_in_store(**kwargs)
    return get_model_predict(**kwargs)
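A hypothetical sketch of how a `parallel_backend_call` argument like the one constructed above might be consumed downstream (`predict_with_backend` is illustrative, not the project's `get_model_predict`):

from functools import partial

from joblib import parallel_backend
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor


def predict_with_backend(model, X, parallel_backend_call):
    # Enter the caller-supplied joblib backend before predicting.
    with parallel_backend_call():
        return model.predict(X)


X, y = make_regression(n_samples=100, n_features=5, random_state=0)
rf = RandomForestRegressor(n_estimators=10, n_jobs=-1).fit(X, y)
preds = predict_with_backend(rf, X, partial(parallel_backend, "threading", n_jobs=2))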
Example #8
def calculate_pfi(rf, X, y):
    """Calculate the permutation feature importance (PFI)."""
    rf.n_jobs = get_ncpus()
    perm_importance = eli5.sklearn.PermutationImportance(rf, random_state=1).fit(X, y)
    return eli5.explain_weights_df(perm_importance, feature_names=list(X.columns))
Example #9
def get_client(*args, **kwargs):
    """Wrapper around wildfires.dask_cx1.get_client.

    Only tries to connect to a distributed scheduler if not running as a CX1 job. This
    is controlled by an environment variable.

    """
    if "RUNNING_AS_JOB" in os.environ:
        # Do not connect to a distributed scheduler.
        return Client(n_workers=1, threads_per_worker=get_ncpus())
    else:
        return wildfires_get_client(*args, **kwargs)
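Usage sketch under the assumptions above: with `RUNNING_AS_JOB` exported (e.g. by the job submission script), only a purely local Dask client is created. Assuming `dask.distributed` is installed, the local fallback branch can be reproduced with `os.cpu_count()` standing in for `get_ncpus()`:

import os

from dask.distributed import Client

os.environ["RUNNING_AS_JOB"] = "1"  # typically exported by the PBS job script
if "RUNNING_AS_JOB" in os.environ:
    client = Client(n_workers=1, threads_per_worker=os.cpu_count())
print(client)
client.close()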
Example #10
def fit_experiment_model(experiment, cache_check=False, **kwargs):
    """Fit (or retrieve the cached) random forest model for the given experiment."""
    if cache_check:
        get_experiment_split_data.check_in_store(experiment)
    X_train, X_test, y_train, y_test = get_experiment_split_data(experiment)

    if cache_check:
        return get_model(X_train=X_train, y_train=y_train, cache_check=True)
    model = get_model(
        X_train=X_train,
        y_train=y_train,
        parallel_backend_call=(
            # Use local threading backend - avoid the Dask backend.
            partial(parallel_backend, "threading", n_jobs=get_ncpus())),
    )
    return model
Example #11
def common_get_model(cache_dir, X_train=None, y_train=None):
    """Return a fitted random forest, loading it from the cache when possible."""
    cached = CachedResults(
        estimator_class=DaskRandomForestRegressor,
        n_splits=n_splits,
        cache_dir=cache_dir,
    )
    model = DaskRandomForestRegressor(**param_dict)
    model_key = tuple(sorted(model.get_params().items()))
    try:
        model = cached.get_estimator(model_key)
    except KeyError:
        with parallel_backend("dask"):
            model.fit(X_train, y_train)
        cached.store_estimator(model_key, model)
    model.n_jobs = get_ncpus()
    return model
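The `parallel_backend("dask")` context used above relies on the Dask joblib backend, which becomes available once a `dask.distributed` Client exists in the process; a minimal self-contained sketch:

from dask.distributed import Client
from joblib import parallel_backend
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

client = Client(n_workers=1, threads_per_worker=2)
X, y = make_regression(n_samples=200, n_features=4, random_state=0)
with parallel_backend("dask"):
    # Tree fitting is dispatched to the Dask workers via joblib.
    RandomForestRegressor(n_estimators=20, n_jobs=-1).fit(X, y)
client.close()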
Example #12
FigureSaver.debug = True
FigureSaver.directory = os.path.expanduser(
    os.path.join("~", "tmp", "fire_season_dataset_diffs"))
os.makedirs(FigureSaver.directory, exist_ok=True)

normal_coast_linewidth = 0.5
mpl.rc("figure", figsize=(14, 6))
mpl.rc("font", size=9.0)

np.random.seed(1)

n_jobs = 5
with parallel_backend("loky",
                      n_jobs=n_jobs,
                      inner_max_num_threads=math.floor(get_ncpus() / n_jobs)):
    outputs = thres_fire_season_stats(0.1)

dataset_names = [output[0] for output in outputs]
lengths = [output[3].reshape(1, *output[3].shape) for output in outputs]

# Stack the lengths into one array.
lengths = np.ma.vstack(lengths)

mean_length = np.ma.mean(lengths, axis=0)

# Mean BAs
ba_variable_names = (
    "CCI MERIS BA",
    "CCI MODIS BA",
    "GFED4 BA",
Example #13
def assign_n_jobs(model):
    """Assign `n_jobs` to the number of currently available CPUs."""
    model.n_jobs = get_ncpus()
    return model
Example #14
def plot_2d_ale(experiment, single=False, nargs=None, verbose=False, **kwargs):
    """Create and save 2D ALE interaction plots for pairs of training features."""
    exp_figure_saver = figure_saver(sub_directory=experiment.name)

    # Operate on cached data only.
    get_experiment_split_data.check_in_store(experiment)
    X_train, X_test, y_train, y_test = get_experiment_split_data(experiment)

    # Operate on cached fitted models only.
    get_model(X_train, y_train, cache_check=True)
    model = get_model(X_train, y_train)

    columns_list = list(combinations(X_train.columns, 2))

    # Deterministic sorting with FAPAR & FAPAR 1M and FAPAR & DRY_DAY_PERIOD at the
    # front since these are used in the paper.

    def get_combination_value(column_combination):
        # Handle special cases first.
        if (
            variable.FAPAR[0] in column_combination
            and variable.FAPAR[1] in column_combination
        ):
            return -1000
        elif (
            variable.FAPAR[0] in column_combination
            and variable.DRY_DAY_PERIOD[0] in column_combination
        ):
            return -999
        out = ""
        for var in column_combination:
            out += str(var.rank) + str(var.shift)
        return int(out)

    columns_list = sorted(columns_list, key=get_combination_value)

    def param_iter():
        for columns in columns_list:
            for plot_samples in [True, False]:
                yield columns, plot_samples

    if single:
        total = 1
    elif nargs:
        total = nargs
    else:
        total = 2 * len(columns_list)

    for columns, plot_samples in tqdm(
        islice(param_iter(), None, total),
        desc=f"2D ALE plotting ({experiment})",
        total=total,
        disable=not verbose,
    ):
        save_ale_2d(
            experiment=experiment,
            model=model,
            train_set=X_train,
            features=columns,
            n_jobs=get_ncpus(),
            include_first_order=True,
            plot_samples=plot_samples,
            figure_saver=exp_figure_saver,
            ale_factor_exp=plotting_configuration.ale_factor_exps.get(
                (columns[0].parent, columns[1].parent), -2
            ),
            x_factor_exp=plotting_configuration.factor_exps.get(columns[0].parent, 0),
            x_ndigits=plotting_configuration.ndigits.get(columns[0].parent, 2),
            y_factor_exp=plotting_configuration.factor_exps.get(columns[1].parent, 0),
            y_ndigits=plotting_configuration.ndigits.get(columns[1].parent, 2),
        )
        plt.close("all")
Example #15
def single_ax_multi_ale_1d(
    ax,
    feature_data,
    feature,
    xlabel=None,
    ylabel=None,
    title=None,
    verbose=False,
    x_ndigits=2,
    x_factor=1,
    x_rotation=18,
):
    """Plot first-order ALE curves for `feature` across experiments on one axis."""
    quantile_list = []
    ale_list = []

    for experiment, single_experiment_data in zip(
            tqdm(
                feature_data["experiment"],
                desc="Calculating feature ALEs",
                disable=not verbose,
            ),
            feature_data["single_experiment_data"],
    ):
        model = single_experiment_data["model"]
        X_train = single_experiment_data["X_train"]

        with parallel_backend("threading", n_jobs=get_ncpus()):
            quantiles, ale = alepython.ale.first_order_ale_quant(
                process_proxy((model, ), (get_model_predict, ))[0],
                X_train,
                feature,
                bins=20,
            )

        quantile_list.append(quantiles)
        ale_list.append(ale)

    # Construct quantiles from the individual quantiles, minimising the amount of interpolation.
    combined_quantiles = np.vstack(
        [quantiles[None] for quantiles in quantile_list])

    final_quantiles = np.mean(combined_quantiles, axis=0)

    # The chosen variable for this plot comes from the same dataset (i.e. subsets of
    # the ALL-dataset), thus the quantiles, which are purely informed by the data,
    # should match.
    assert np.allclose(final_quantiles, combined_quantiles)

    # All per-experiment quantile arrays match (asserted above), so use the combined axis length.
    mod_quantiles = np.arange(len(final_quantiles))

    for plot_kwargs, quantiles, ale in zip(feature_data["plot_kwargs"],
                                           quantile_list, ale_list):
        # Interpolate each of the quantiles relative to the accumulated final quantiles.
        ax.plot(
            np.interp(quantiles, final_quantiles, mod_quantiles),
            ale,
            **{
                "marker": "o",
                "ms": 3,
                **plot_kwargs
            },
        )

        ax.set_xticks(mod_quantiles[::2])
        ax.set_xticklabels(
            map(
                lambda x: get_float_format(
                    ndigits=x_ndigits, factor=x_factor, atol=np.inf)(x, None),
                final_quantiles[::2],
            ))
        ax.xaxis.set_tick_params(rotation=x_rotation)

        ax.grid(True)

        ax.set_xlabel(xlabel + f"({x_factor})")
        ax.set_ylabel(ylabel)

    ax.set_title(title)
Example #16
def fit_all(**rf_params):
    regr = RandomForestRegressor(**rf_params)
    # Make sure all cores are used.
    regr.n_jobs = get_ncpus()
    regr.fit(all_splits.X_train, all_splits.y_train)
    return regr
Example #17
def fit_rf_out_season(**rf_params):
    regr = RandomForestRegressor(**rf_params)
    # Make sure all cores are used.
    regr.n_jobs = get_ncpus()
    regr.fit(out_fs_splits.X_train, out_fs_splits.y_train)
    return regr
    # ... (truncated: `combinations` is built as a list comprehension over
    # product(*veg_lags), one entry per vegetation-lag combination) ...

    assert all(len(combination) == 15 for combination in combinations)

    print("Starting fitting")

    scores = dask_fit_combinations(
        DaskRandomForestRegressor(**param_dict),
        X_train,
        y_train,
        client,
        combinations,
        n_splits=n_splits,
        local_n_jobs=max(get_ncpus() - 1, 1),
        verbose=True,
        cache_dir=CACHE_DIR,
    )

    r2_test_scores = {
        key: [data["test_score"][i]["r2"] for i in data["test_score"]]
        for key, data in scores.items()
    }
    mse_test_scores = {
        key: [data["test_score"][i]["mse"] for i in data["test_score"]]
        for key, data in scores.items()
    }

    keys = np.array(list(r2_test_scores))
    mean_r2_test_scores = np.array(