*veg_lag_product,
        )
        for veg_lag_product in product(*veg_lags)
    ]

    assert all(len(combination) == 15 for combination in combinations)
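    # `product(*veg_lags)` enumerates every per-variable lag choice; e.g.
    # (hypothetically) veg_lags = [(0, 1), (0, 3)] would yield (0, 0), (0, 3),
    # (1, 0) and (1, 3). Each combination built here holds 15 elements, one
    # per selected variable.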

    args = [[], []]

    for combination in combinations:
        for i in range(n_splits):
            args[0].append(combination)
            args[1].append(i)

    args_scores = run(
        combination_fit, *args, cx1_kwargs=cx1_kwargs, return_local_args=True
    )

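    # `run` is assumed to return None once jobs have been submitted to CX1, or
    # a status dict keyed by {"present", "uncached"} when only a cache check
    # was performed; in both cases there is nothing more to do locally.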
    if args_scores is None or (
        isinstance(args_scores, dict)
        and set(args_scores)
        == {
            "present",
            "uncached",
        }
    ):
        sys.exit(0)

    # Load cached data for all combinations / splits.

    # Get training and test data for all variables.
Example #2
        chosen_ba_data,
        bins=50,
        max_lag=2000,
        n_jobs=get_ncpus(),
        n_per_job=6000,
        verbose=True,
    )
    # fig.suptitle(f"{title}, {inds.shape[0]} samples (out of {valid_indices.shape[0]})")
    ax1.set_ylabel("Semivariance")
    ax2.set_ylabel("N")
    ax2.set_yscale("log")
    ax1.set_xlabel("Lag (km)")

    for ax in (ax1, ax2):
        ax.grid()

    format_label_string_with_exponent(ax1, axis="y")

    fig.align_labels()

    figure_saver.save_figure(fig, "mean_gfed4_variogram")


def plot_mean_gfed4_variogram(*args, **kwargs):
    # Thin wrapper for `run`; any arguments are ignored and the mean
    # (index -1) variogram is plotted.
    gfed4_variogram(-1)


if __name__ == "__main__":
    cx1_kwargs = dict(ncpus=1, walltime="24:00:00", memory="10GB")
    run(plot_mean_gfed4_variogram, [None], cx1_kwargs=cx1_kwargs)
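Every example here funnels work through the same `run(...)` dispatcher, which maps a function over per-parameter argument sequences either locally or as CX1 (PBS) jobs configured by `cx1_kwargs`. The real helper belongs to this codebase and is not shown; the following is only a minimal local-only stand-in, assuming the `(args, kwargs, results)` triple that callers passing `return_local_args=True` unpack:

# Minimal, local-only stand-in for the assumed `run` dispatcher (hypothetical;
# the real helper also submits CX1 array jobs and may instead return None or a
# {"present", "uncached"} cache-status dict).
def run(func, *arg_lists, cx1_kwargs=None, return_local_args=False):
    # Each element of `arg_lists` supplies one positional parameter per job.
    results = [func(*job_args) for job_args in zip(*arg_lists)]
    if return_local_args:
        # Mirrors the `args, kwargs, results = ...` unpacking used below.
        return list(arg_lists), {}, results
    return results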
Example #3
        islice(param_iter(), None, total),
        desc=f"2D ALE plotting ({experiment})",
        total=total,
        disable=not verbose,
    ):
        save_ale_2d(
            experiment=experiment,
            model=model,
            train_set=X_train,
            features=columns,
            n_jobs=get_ncpus(),
            include_first_order=True,
            plot_samples=plot_samples,
            figure_saver=exp_figure_saver,
            ale_factor_exp=plotting_configuration.ale_factor_exps.get(
                (columns[0].parent, columns[1].parent), -2
            ),
            x_factor_exp=plotting_configuration.factor_exps.get(columns[0].parent, 0),
            x_ndigits=plotting_configuration.ndigits.get(columns[0].parent, 2),
            y_factor_exp=plotting_configuration.factor_exps.get(columns[1].parent, 0),
            y_ndigits=plotting_configuration.ndigits.get(columns[1].parent, 2),
        )
        plt.close("all")


if __name__ == "__main__":
    # Relevant if called with the command 'cx1' instead of 'local'.
    cx1_kwargs = dict(walltime="01:00:00", ncpus=1, mem="10GB")

    run(plot_2d_ale, list(Experiment), cx1_kwargs=cx1_kwargs)
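The `plotting_configuration` lookups passed to `save_ale_2d` above fall back to shared defaults via `dict.get`, so only features (or feature pairs) with explicit entries override the default exponents and digit counts. A small illustration with hypothetical table contents:

# Hypothetical contents for the lookup tables consulted above.
ale_factor_exps = {("DD", "FAPAR"): -3}  # per feature-pair overrides
assert ale_factor_exps.get(("DD", "FAPAR"), -2) == -3  # explicit entry wins
assert ale_factor_exps.get(("VOD", "LAI"), -2) == -2  # shared default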
            Experiment["15VEG_FAPAR"],
            variable.DRY_DAY_PERIOD[3],
            "(c) 15VEG_FAPAR",
        ),
        axes[1, 1]: (
            Experiment["15VEG_FAPAR_MON"],
            variable.DRY_DAY_PERIOD[3],
            "(d) 15VEG_FAPAR_MON",
        ),
    }
    for ax, (experiment, column, title) in tqdm(
        plot_spec.items(), desc="ALE plots", disable=not verbose
    ):
        plot_single_1d_ale(experiment, column, ax=ax, verbose=verbose)
        ax.set_title(title)
        gc.collect()

    for ax in axes[:, 1]:
        ax.set_ylabel("")

    fig.tight_layout()
    fig.align_labels()

    figure_saver.save_figure(fig, "15VEG_FAPAR_15VEG_FAPAR_MON_ALE_comp")


if __name__ == "__main__":
    # Relevant if called with the command 'cx1' instead of 'local'.
    cx1_kwargs = dict(walltime="24:00:00", ncpus=32, mem="60GB")
    run(plot_clim_mon_ale_comp, [None], cx1_kwargs=cx1_kwargs)
    ) = random_binary_dilation_split(**split_kwargs)

    model = optional_client_call(
        get_model,
        dict(X_train=X_train, y_train=y_train),
        cache_check=cache_check,
    )[0]

    if cache_check:
        return get_model_scores.check_in_store(model, X_test, X_train, y_test,
                                               y_train)
    return data_info, get_model_scores(model, X_test, X_train, y_test, y_train)


if __name__ == "__main__":
    cx1_kwargs = dict(walltime="04:00:00", ncpus=32, mem="60GB")
    experiments = list(Experiment)
    args = []
    for experiment in experiments:
        for structure_info, structure in structures:
            for test_frac in [0.1, 0.05, 0.01]:
                for seed in range(4):
                    args.append((experiment, structure, test_frac, seed))
    output = run(fit_random_binary_dilation,
                 *zip(*args),
                 cx1_kwargs=cx1_kwargs)

    from pprint import pprint

    pprint(output)
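The `*zip(*args)` call above transposes the list of per-job tuples into one sequence per positional parameter, which is the layout `run` expects:

# Transposition idiom used above (hypothetical values).
args = [("ALL", "s0", 0.1, 0), ("ALL", "s0", 0.1, 1)]
experiments_, structures_, fracs_, seeds_ = zip(*args)
assert seeds_ == (0, 1)  # one sequence per parameter, one entry per job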
warnings.filterwarnings(
    "ignore", 'Setting feature_perturbation = "tree_path_dependent".*')


def get_experiment_model_scores(experiment, cache_check=False, **kwargs):
    # Operate on cached data only.
    get_experiment_split_data.check_in_store(experiment)
    X_train, X_test, y_train, y_test = get_experiment_split_data(experiment)

    # Operate on cached fitted models only.
    get_model(X_train, y_train, cache_check=True)
    model = get_model(X_train, y_train)

    if cache_check:
        return get_model_scores.check_in_store(model, X_test, X_train, y_test,
                                               y_train)
    return get_model_scores(model, X_test, X_train, y_test, y_train)


if __name__ == "__main__":
    scores = {
        exp.name: vals
        for exp, vals in zip(
            list(Experiment),
            run(get_experiment_model_scores,
                list(Experiment),
                cx1_kwargs=False),
        )
    }
    print(pd.DataFrame(scores))
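Throughout these examples, cached functions such as `get_experiment_split_data`, `get_model`, and `get_model_scores` expose a `check_in_store` method (or accept `cache_check=True`) to verify that a result is already cached without recomputing it. A hypothetical sketch of that protocol, assuming hashable arguments:

# Hypothetical sketch of the caching protocol assumed above: the decorator
# attaches a `check_in_store` method that raises unless a result is cached.
import functools


def cache(func):
    store = {}

    @functools.wraps(func)
    def wrapper(*args):
        if args not in store:
            store[args] = func(*args)
        return store[args]

    def check_in_store(*args):
        if args not in store:
            raise KeyError(f"{func.__name__}{args} is not cached.")
        return "IN_STORE"  # cf. the IN_STORE sentinel in Example #10

    wrapper.check_in_store = check_in_store
    return wrapper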
Example #7
    if cache_check:
        return calculate_pfi.check_in_store(*pfi_train_args)

    return {
        "train": calculate_pfi(*pfi_train_args),
        "test": calculate_pfi(*pfi_test_args),
    }


if __name__ == "__main__":
    # Relevant if called with the command 'cx1' instead of 'local'.
    cx1_kwargs = dict(walltime="24:00:00", ncpus=32, mem="60GB")

    experiments = list(Experiment)
    args_pfi_results = run(
        pfi_calc, experiments, cx1_kwargs=cx1_kwargs, return_local_args=True
    )
    if args_pfi_results is None:
        sys.exit(0)

    args, kwargs, pfi_results = args_pfi_results

    pfi_importances = {}
    for exp, pfi_result in zip(args[0], pfi_results):
        # Join the train and test data.
        pfi_importances[exp] = (
            pfi_result["train"]
            .set_index("feature", drop=True)
            .rename({"weight": "train weight", "std": "train std"}, axis="columns")
            .join(
                pfi_result["test"]
    def df_cols_to_str(df):
        df.columns = list(map(lambda s: shorten_features(str(s)), df.columns))
        return df

    # with exp_figure_saver("corr_plot"):
    #     corr_plot(
    #         df_cols_to_str(
    #             exog_data[
    #                 list(
    #                     sort_variables(
    #                         var for var in exog_data.columns if var.shift <= 9
    #                     )
    #                 )
    #             ]
    #         ),
    #         fig_kwargs={"figsize": (12, 8)},
    #     )
    #     plt.grid(False)

    with exp_figure_saver(f"{experiment.name}_corr_plot_full"):
        corr_plot(
            df_cols_to_str(exog_data[list(sort_variables(exog_data.columns))]),
            rotation=70,
            fig_kwargs={"figsize": (8.2, 6.3)},
        )
        plt.grid(False)


if __name__ == "__main__":
    run(correlation_plot, list(Experiment), cx1_kwargs=False)
        X_train, X_test, y_train, y_test = get_experiment_split_data(
            experiment)
        for kind in ["train", "test"]:
            if kind == "train":
                X = X_train
            elif kind == "test":
                X = X_test
            else:
                raise ValueError(f"Unknown kind '{kind}'.")
            N = get_shap_params(X)["max_index"] + 1
            indices = np.arange(N)
            args[0].extend([experiment] * N)
            args[1].extend(indices)
            args[2].extend([kind] * N)

    raw_shap_data = run(shap_values, *args, cx1_kwargs=cx1_kwargs)

    if raw_shap_data is None:
        if run_experiments:
            # Experiments were submitted as CX1 jobs.
            sys.exit(0)
        # Otherwise, experiments were already present as a fully cached value.

    if isinstance(raw_shap_data, dict) and set(raw_shap_data) == {
        "present",
        "uncached",
    }:
        # Checking was performed.
        print("Full cache present for:", end="")
        pprint(
            set(
Example #10
            y_test=y_test,
            leave_out=("", *selected_features[experiment]),
            local_n_jobs=(1 if (get_ncpus() < 4) else (get_ncpus() - 2)),
        ),
        cache_check=cache_check,
        add_client=True,
    )[0]

    if cache_check:
        return IN_STORE
    return loco_results


if __name__ == "__main__":
    args_loco_results = run(
        loco_calc, list(Experiment), cx1_kwargs=False, return_local_args=True
    )

    if args_loco_results is None:
        sys.exit(0)

    args, kwargs, loco_results = args_loco_results

    vis_data = {}
    for experiment, exp_results in zip(args[0], loco_results):
        for leave_out, results in exp_results.items():
            vis_data[(experiment, leave_out)] = results

    combined_df = pd.DataFrame(vis_data).T
    combined_df.index.names = ["experiment", "feature"]
    combined_df.rename(
        get_experiment_split_data.check_in_store(experiment)
    X_train, X_test, y_train, y_test = get_experiment_split_data(experiment)

    if cache_check:
        return get_model(X_train=X_train, y_train=y_train, cache_check=True)
    model = get_model(
        X_train=X_train,
        y_train=y_train,
        parallel_backend_call=(
            # Use local threading backend - avoid the Dask backend.
            partial(parallel_backend, "threading", n_jobs=get_ncpus())),
    )
    return model


if __name__ == "__main__":
    cx1_kwargs = dict(walltime="24:00:00", ncpus=32, mem="60GB")
    args_models = run(
        fit_experiment_model,
        list(Experiment),
        cx1_kwargs=cx1_kwargs,
        return_local_args=True,
    )

    if args_models is None:
        sys.exit(0)

    args, kwargs, models = args_models

    models = {exp: fitted_model for exp, fitted_model in zip(args[0], models)}
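The `parallel_backend_call` argument above presumably wraps the fit in joblib's threading backend rather than the globally registered Dask backend; the standard joblib idiom is:

# Standard joblib idiom assumed by `parallel_backend_call` above.
from joblib import parallel_backend

with parallel_backend("threading", n_jobs=4):
    # Any joblib-parallel work here (e.g. a random forest fit) now runs on
    # threads instead of the default (Dask) backend.
    pass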
warnings.filterwarnings("ignore", ".*Collapsing a non-contiguous coordinate.*")
warnings.filterwarnings("ignore", ".*DEFAULT_SPHERICAL_EARTH_RADIUS.*")
warnings.filterwarnings("ignore", ".*guessing contiguous bounds.*")

warnings.filterwarnings(
    "ignore", 'Setting feature_perturbation = "tree_path_dependent".*'
)


def get_experiment_data(experiment, cache_check=False, **kwargs):
    if cache_check:
        get_experiment_split_data.check_in_store(experiment)
    X_train, X_test, y_train, y_test = get_experiment_split_data(experiment)
    return X_train, X_test, y_train, y_test


if __name__ == "__main__":
    cx1_kwargs = dict(walltime="04:00:00", ncpus=32, mem="60GB")
    experiments = list(Experiment)
    experiment_data = dict(
        zip(
            experiments,
            run(get_experiment_data, experiments, cx1_kwargs=cx1_kwargs),
        )
    )
    for (experiment, (X_train, X_test, y_train, y_test)) in experiment_data.items():
        print(f"{experiment} → {y_train.name}")
        pprint(X_train.columns)
        print()
    # Prevents memory buildup over repeated calls.
    gc.collect()

    return (data_info, hold_out_y, predicted_y)


if __name__ == "__main__":
    # For 40 estimators, ~25 minutes per fit operation.
    cx1_kwargs = dict(walltime="24:00:00", ncpus=1, mem="5GB")
    experiments = list(Experiment)

    max_rad = 50

    # Batches of 1000s (x8 rads) submitted as separate CX1 array jobs due to job size limitations.
    for seeds in [
        range(1000),
        range(1000, 2000),
        range(2000, 3000),
        range(3000, 4000),
    ]:
        args = [[], [], [], []]
        for experiment in experiments:
            for radius in np.linspace(0, max_rad, 8):
                for seed in seeds:
                    args[0].append(experiment)
                    args[1].append(radius)
                    args[2].append(max_rad)
                    args[3].append(seed)

        results = run(fit_buffered_loo_sample, *args, cx1_kwargs=cx1_kwargs)
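        # Each batch therefore spans len(experiments) x 8 radii x 1000 seeds
        # jobs; with, say, 12 experiments that is 96,000 fits per array job.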
    cmd_args = get_parsers()["parser"].parse_args()

    if cmd_args.experiment is not None:
        chosen_experiments = [
            exp
            for exp in experiments
            if exp in tuple(Experiment[exp] for exp in cmd_args.experiment)
        ]
    else:
        chosen_experiments = experiments.copy()

    chosen_experiments = chosen_experiments[: 1 if cmd_args.single else None]

    for experiment in tqdm(
        chosen_experiments,
        desc="Preparing ALE 1D arguments",
        disable=not cmd_args.verbose,
    ):
        # Operate on cached data / models only.
        get_experiment_split_data.check_in_store(experiment)
        X_train, X_test, y_train, y_test = get_experiment_split_data(experiment)

        get_model(X_train, y_train, cache_check=True)

        for column in X_train.columns:
            args[0].append(experiment)
            # Append (not extend) so each job receives a single column.
            args[1].append(column)

    run(plot_1d_ale, *args, cx1_kwargs=cx1_kwargs)
Example #15
                lag_m = ""
            ax.set_ylabel(f"ALE{lag_m} ({y_factor_str} BA)")
        for ax in axes.flatten():
            ax.set_xlabel("")

        for ax, var in zip(axes[-1], comp_vars):
            assert x_factor_exp == 0
            ax.set_xlabel(
                f"{shorten_features(str(var))} ({variable.units[var]})")

        for ax, title in zip(axes.flatten(), ascii_lowercase):
            ax.text(0.5, 1.05, f"({title})", transform=ax.transAxes)

        margin = 0.4

        for ax in axes.ravel():
            ax.set_xlim(-margin, 20 + margin)

        fig.tight_layout(h_pad=0.4)
        fig.align_labels()

        figure_saver.save_figure(
            fig,
            f"{'__'.join(map(shorten_features, map(str, comp_vars)))}_ale_comp",
            sub_directory="ale_comp",
        )


if __name__ == "__main__":
    run(multi_model_ale_plot, [None], cx1_kwargs=False)
Example #16
        mew=1,
        clip_on=False,
    )
    ax1.plot([0, 1], [0, 0], transform=ax1.transAxes, **kwargs)
    ax2.plot([0, 1], [1, 1], transform=ax2.transAxes, **kwargs)

    for ax in (ax1, ax2):
        ax.set_xticks(list(range(len(experiments))))

    figure_saver.save_figure(fig, "model_comp_scores")


if __name__ == "__main__":
    experiment_groups = (
        (
            Experiment.ALL,
            Experiment.TOP15,
            Experiment.CURR,
            Experiment["15VEG_FAPAR"],
            Experiment["15VEG_LAI"],
            Experiment["15VEG_SIF"],
            Experiment["15VEG_VOD"],
            Experiment.CURRDD_FAPAR,
            Experiment.CURRDD_LAI,
            Experiment.CURRDD_SIF,
            Experiment.CURRDD_VOD,
            Experiment.BEST15,
        ),
    )
    run(plot_score_groups, experiment_groups, cx1_kwargs=False)
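The `transform=ax.transAxes` plots in Example #16 draw the conventional broken-axis cut markers along the facing edges of the two panels; the standard matplotlib recipe looks like this:

# Standard matplotlib broken-axis recipe, as used for the cut markers above.
import matplotlib.pyplot as plt

fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
cut_kwargs = dict(
    marker=[(-1, -0.5), (1, 0.5)],  # short diagonal stroke as a marker path
    markersize=12,
    linestyle="none",
    color="k",
    mec="k",
    mew=1,
    clip_on=False,  # allow drawing on the axes border itself
)
ax1.plot([0, 1], [0, 0], transform=ax1.transAxes, **cut_kwargs)  # bottom edge
ax2.plot([0, 1], [1, 1], transform=ax2.transAxes, **cut_kwargs)  # top edge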
    u_pre = threading_get_model_predict(
        X_train=X_train,
        y_train=y_train,
        predict_X=X_test,
    )

    obs_pred_diff_cube = get_obs_pred_diff_cube(y_val, u_pre, master_mask)

    with map_figure_saver(sub_directory=experiment.name)(
            f"{experiment.name}_obs_pred_comp", sub_directory="predictions"):
        disc_cube_plot(
            obs_pred_diff_cube,
            fig=plt.figure(figsize=(5.1, 2.3)),
            cmap="BrBG",
            cmap_midpoint=0,
            cmap_symmetric=False,
            bin_edges=[-0.01, -0.001, -1e-4, 0, 0.001, 0.01, 0.02],
            extend="both",
            cbar_format=get_sci_format(ndigits=0),
            cbar_pad=0.025,
            cbar_label="Ob. - Pr.",
            **get_aux0_aux1_kwargs(y_val, master_mask),
            loc=(0.83, 0.14),
            height=0.055,
            aspect=1,
            spacing=0.06 * 0.2,
        )


if __name__ == "__main__":
    run(plot_obs_pred_comp, list(Experiment), cx1_kwargs=False)
Example #18
    # Plot the legend in between the two axes.
    axes[1].legend(
        loc="center",
        ncol=5,
        bbox_to_anchor=(
            np.mean(
                [
                    axes[0].get_position().xmax,
                    axes[1].get_position().xmin,
                ]
            ),
            0.932,
        ),
        bbox_transform=fig.transFigure,
        handletextpad=0.25,
        columnspacing=0.5,
    )

    exp_figure_saver.save_figure(
        fig,
        f'{experiment.name}_{"__".join(map(shorten_features, map(str, features)))}_ale_shifts',
        sub_directory="multi_ale",
        transparent=False,
    )


if __name__ == "__main__":
    # Relevant if called with the command 'cx1' instead of 'local'.
    cx1_kwargs = dict(walltime="24:00:00", ncpus=32, mem="60GB")
    run(plot_multi_ale, list(Experiment), cx1_kwargs=cx1_kwargs)
def plot_ba(experiment, **kwargs):
    # Operate on cached data only.
    get_experiment_split_data.check_in_store(experiment)
    X_train, X_test, y_train, y_test = get_experiment_split_data(experiment)

    # Operate on cached data only.
    get_endog_exog_mask.check_in_store(experiment)
    master_mask = get_endog_exog_mask(experiment)[2]

    check_master_masks(master_mask)

    # Operate on cached fitted models only.
    get_model(X_train, y_train, cache_check=True)

    predicted_test = threading_get_model_predict(
        X_train=X_train,
        y_train=y_train,
        predict_X=X_test,
    )

    ba_plotting(
        *get_ba_plotting_data(predicted_test, y_test, master_mask),
        figure_saver=map_figure_saver(sub_directory=experiment.name),
        **get_aux0_aux1_kwargs(y_test, master_mask),
        filename=f"{experiment.name}_ba_prediction",
    )


if __name__ == "__main__":
    run(plot_ba, list(Experiment), cx1_kwargs=False)
Example #20
mpl.rc_file(Path(__file__).resolve().parent.parent / "matplotlibrc")

loguru_logger.enable("alepython")
loguru_logger.remove()
loguru_logger.add(sys.stderr, level="WARNING")

logger = logging.getLogger(__name__)
enable_logging(level="WARNING")

warnings.filterwarnings("ignore", ".*Collapsing a non-contiguous coordinate.*")
warnings.filterwarnings("ignore", ".*DEFAULT_SPHERICAL_EARTH_RADIUS.*")
warnings.filterwarnings("ignore", ".*guessing contiguous bounds.*")

warnings.filterwarnings(
    "ignore", 'Setting feature_perturbation = "tree_path_dependent".*')


def calling_cached(x):
    return cached_example_function(x)


if __name__ == "__main__":
    # Relevant if called with the command 'cx1' instead of 'local'.
    cx1_kwargs = dict(walltime="01:00:00", ncpus=1, mem="1GB")

    # This works both with single jobs...
    run(calling_cached, (1,), cx1_kwargs=cx1_kwargs)

    # ... and array jobs.
    run(calling_cached, (2, 3, 4), cx1_kwargs=cx1_kwargs)
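Under the assumed dispatcher, a one-element argument sequence becomes a single CX1 job while a longer sequence becomes an array job with one task per element; with the local backend both calls reduce to a plain map:

# Local-backend equivalents of the two `run` calls above (assumed semantics):
#   [calling_cached(1)]                      # single job
#   [calling_cached(x) for x in (2, 3, 4)]   # array job, one task per element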