def loco_calc(experiment, cache_check=False, **kwargs):
    """Calculate LOCO values.

    Args:
        experiment (str): Experiment (e.g. 'ALL').
        cache_check (bool): Forwarded to `optional_client_call`. If true,
            `IN_STORE` is returned instead of the LOCO results.
        **kwargs: Accepted but not used by this function.

    Returns:
        `IN_STORE` if `cache_check` is true, otherwise the first element of
        the value returned by `optional_client_call`.

    """
    # Operate on cached data only.
    get_experiment_split_data.check_in_store(experiment)
    X_train, X_test, y_train, y_test = get_experiment_split_data(experiment)

    # Use a single job on small machines; otherwise keep two CPUs spare.
    ncpus = get_ncpus()
    local_n_jobs = ncpus - 2 if ncpus >= 4 else 1

    loco_kwargs = {
        "rf": DaskRandomForestRegressor(**param_dict),
        "X_train": X_train,
        "y_train": y_train,
        "X_test": X_test,
        "y_test": y_test,
        # The empty string is always the first `leave_out` entry.
        "leave_out": ("", *selected_features[experiment]),
        "local_n_jobs": local_n_jobs,
    }
    call_output = optional_client_call(
        calculate_loco,
        loco_kwargs,
        cache_check=cache_check,
        add_client=True,
    )
    loco_results = call_output[0]

    return IN_STORE if cache_check else loco_results
def common_get_model_scores(rf, X_test, X_train, y_test, y_train):
    """Return R2 and MSE scores of `rf` on the test and train sets.

    `rf.n_jobs` is set to `get_ncpus()` as a side effect, and predictions are
    made within a joblib "threading" backend using the same number of jobs.

    Returns:
        dict: Scores under the keys 'test_r2', 'test_mse', 'train_r2', and
        'train_mse'.

    """
    ncpus = get_ncpus()
    rf.n_jobs = ncpus

    with parallel_backend("threading", n_jobs=ncpus):
        test_predictions = rf.predict(X_test)
        train_predictions = rf.predict(X_train)

    scores = {}
    scores["test_r2"] = r2_score(y_test, test_predictions)
    scores["test_mse"] = mean_squared_error(y_test, test_predictions)
    scores["train_r2"] = r2_score(y_train, train_predictions)
    scores["train_mse"] = mean_squared_error(y_train, train_predictions)
    return scores
def get_model_scores(model, X_test, X_train, y_test, y_train):
    """Return R2 / MSE scores on the test and train sets plus the OOB score.

    `model.n_jobs` is set to `get_ncpus()` as a side effect, and predictions
    are made within a joblib "threading" backend using the same number of jobs.

    Returns:
        dict: Scores under the keys 'test_r2', 'test_mse', 'train_r2',
        'train_mse', and 'oob_r2' (taken from `model.oob_score_`).

    """
    # XXX: Get train OOB score (check Dask impl.), train CV score
    ncpus = get_ncpus()
    model.n_jobs = ncpus

    with parallel_backend("threading", n_jobs=ncpus):
        test_predictions = model.predict(X_test)
        train_predictions = model.predict(X_train)

    scores = {}
    scores["test_r2"] = r2_score(y_test, test_predictions)
    scores["test_mse"] = mean_squared_error(y_test, test_predictions)
    scores["train_r2"] = r2_score(y_train, train_predictions)
    scores["train_mse"] = mean_squared_error(y_train, train_predictions)
    scores["oob_r2"] = model.oob_score_
    return scores
def func():
    """Save the 2D PDP plot for the column pair chosen by `PBS_ARRAY_INDEX`.

    The cached data split and cross-validation results are loaded, and the
    integer in the `PBS_ARRAY_INDEX` environment variable indexes into the
    list of all 2-column combinations of the training columns.

    """

    def save_pdp_plot_2d(model, X_train, features, n_jobs):
        """Compute and save the 2D PDP interaction plot for `features`."""
        model.n_jobs = n_jobs
        with parallel_backend("threading", n_jobs=n_jobs):
            interaction = pdp.pdp_interact(
                model=model,
                dataset=X_train,
                model_features=X_train.columns,
                features=features,
                num_grid_points=[20, 20],
            )

        fig, axes = pdp.pdp_interact_plot(
            interaction, features, x_quantile=True, figsize=(7, 8)
        )
        axes["pdp_inter_ax"].xaxis.set_tick_params(rotation=45)
        figure_name = "__".join(features)
        figure_saver.save_figure(fig, figure_name, sub_directory="pdp_2d")

    X_train, X_test, y_train, y_test = data_split_cache.load()
    results, rf = cross_val_cache.load()

    column_pairs = list(combinations(X_train.columns, 2))

    # Use the array index to select the desired columns.
    index = int(os.environ["PBS_ARRAY_INDEX"])
    print("Index:", index)
    chosen_pair = column_pairs[index]
    print("Columns:", chosen_pair)

    ncpus = get_ncpus()
    print("NCPUS:", ncpus)

    save_pdp_plot_2d(rf, X_train, chosen_pair, ncpus)
def gfed4_variogram(i):
    """Plot and save the GFED4 variogram for the data selected by `i`.

    The data comes from `get_gfed4_variogram_data(i)` and the figure is saved
    under the name 'mean_gfed4_variogram'.

    """
    coords, ba_data, title = get_gfed4_variogram_data(i)

    variogram_kwargs = {
        "bins": 50,
        "max_lag": 2000,
        "n_jobs": get_ncpus(),
        "n_per_job": 6000,
        "verbose": True,
    }
    fig, ax1, ax2 = plot_variogram(coords, ba_data, **variogram_kwargs)

    # NOTE: `title` is currently unused - the figure suptitle is disabled.

    ax1.set_ylabel("Semivariance")
    ax2.set_ylabel("N")
    ax2.set_yscale("log")
    ax1.set_xlabel("Lag (km)")

    ax1.grid()
    ax2.grid()

    format_label_string_with_exponent(ax1, axis="y")
    fig.align_labels()

    figure_saver.save_figure(fig, "mean_gfed4_variogram")
def fit_combination(X, y, combination, split_index):
    """Fit a random forest on one K-fold split of the selected columns.

    Args:
        X: Data from which the `combination` columns are selected.
        y: Target data.
        combination: Iterable of 15 column names to select from `X`.
        split_index (int): Index of the K-fold split (shuffled, with
            `random_state=0`) to train and test on.

    Returns:
        dict: Maps `("test_score", split_index)` and
        `("train_score", split_index)` to dicts with 'r2' and 'mse' entries.

    """

    def compute_metrics(truth, predicted):
        # R2 and MSE for a single set of predictions.
        return {
            "r2": r2_score(y_true=truth, y_pred=predicted),
            "mse": mean_squared_error(y_true=truth, y_pred=predicted),
        }

    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    # Transpose the (train, test) pairs into per-kind index tuples.
    train_indices, test_indices = zip(*splitter.split(X))

    features = X[list(combination)].to_numpy()
    targets = y.to_numpy()

    assert features.shape[1] == 15

    train_rows = train_indices[split_index]
    test_rows = test_indices[split_index]

    X_train, y_train = features[train_rows], targets[train_rows]
    X_test, y_test = features[test_rows], targets[test_rows]

    scores = {}
    with parallel_backend("threading", n_jobs=get_ncpus()):
        forest = DaskRandomForestRegressor(**param_dict)
        forest.fit(X_train, y_train)

        test_predictions = forest.predict(X_test)
        scores[("test_score", split_index)] = compute_metrics(
            y_test, test_predictions
        )

        train_predictions = forest.predict(X_train)
        scores[("train_score", split_index)] = compute_metrics(
            y_train, train_predictions
        )

    return scores
def threading_get_model_predict(*, cache_check=False, **kwargs):
    """Cached model prediction with the local threading backend.

    Args:
        cache_check (bool): If true, return the result of
            `get_model_predict.check_in_store` instead of predicting.
        **kwargs: Passed on to `get_model_predict` (or its `check_in_store`).
            Any 'parallel_backend_call' entry is overwritten.

    """
    # Use local threading backend.
    threading_backend = partial(parallel_backend, "threading", n_jobs=get_ncpus())
    kwargs["parallel_backend_call"] = threading_backend

    target = get_model_predict.check_in_store if cache_check else get_model_predict
    return target(**kwargs)
def calculate_pfi(rf, X, y):
    """Calculate the PFI.

    `rf.n_jobs` is set to `get_ncpus()` as a side effect. The permutation
    importance uses `random_state=1`, and the returned weights are labelled
    with the columns of `X`.

    """
    rf.n_jobs = get_ncpus()

    importance_estimator = eli5.sklearn.PermutationImportance(rf, random_state=1)
    fitted_importance = importance_estimator.fit(X, y)

    feature_names = list(X.columns)
    return eli5.explain_weights_df(fitted_importance, feature_names=feature_names)
def get_client(*args, **kwargs):
    """Wrapper around wildfires.dask_cx1.get_client.

    Only tries to connect to a distributed scheduler if not running as a CX1
    job. This is controlled by the presence of the `RUNNING_AS_JOB`
    environment variable, in which case a single-worker `Client` with
    `get_ncpus()` threads is returned and the given arguments are ignored.

    """
    if "RUNNING_AS_JOB" not in os.environ:
        return wildfires_get_client(*args, **kwargs)

    # Do not connect to a distributed scheduler.
    return Client(n_workers=1, threads_per_worker=get_ncpus())
def fit_experiment_model(experiment, cache_check=False, **kwargs):
    """Get the model for the training data of the given experiment.

    Args:
        experiment: Experiment passed to `get_experiment_split_data`.
        cache_check (bool): If true, the split data is first checked with
            `check_in_store`, and the result of `get_model(...,
            cache_check=True)` is returned.
        **kwargs: Accepted but not used by this function.

    Returns:
        The value returned by `get_model`.

    """
    if cache_check:
        get_experiment_split_data.check_in_store(experiment)

    # Only the training portions of the split are needed here.
    X_train, _, y_train, _ = get_experiment_split_data(experiment)
    training_data = {"X_train": X_train, "y_train": y_train}

    if cache_check:
        return get_model(**training_data, cache_check=True)

    # Use local threading backend - avoid the Dask backend.
    threading_backend = partial(parallel_backend, "threading", n_jobs=get_ncpus())
    return get_model(**training_data, parallel_backend_call=threading_backend)
def common_get_model(cache_dir, X_train=None, y_train=None):
    """Return a fitted random forest, fitting and caching it if necessary.

    The cache key is built from the sorted parameters of a
    `DaskRandomForestRegressor(**param_dict)`. If no estimator is stored under
    this key, the model is fitted using the "dask" joblib backend and stored.

    Args:
        cache_dir: Cache directory passed to `CachedResults`.
        X_train: Training data. Only needed if the model is not yet cached.
        y_train: Training targets. Only needed if the model is not yet cached.

    Returns:
        The fitted model, with `n_jobs` set to `get_ncpus()`.

    Raises:
        ValueError: If the model is not cached and `X_train` or `y_train` is
            None.

    """
    cached = CachedResults(
        estimator_class=DaskRandomForestRegressor,
        n_splits=n_splits,
        cache_dir=cache_dir,
    )
    model = DaskRandomForestRegressor(**param_dict)
    # The key is computed before `n_jobs` is modified below.
    model_key = tuple(sorted(model.get_params().items()))
    try:
        model = cached.get_estimator(model_key)
    except KeyError as err:
        # Fail early with a clear message instead of fitting on `None`.
        if X_train is None or y_train is None:
            raise ValueError(
                "Model is not cached - X_train and y_train are required to fit it."
            ) from err
        with parallel_backend("dask"):
            model.fit(X_train, y_train)
        cached.store_estimator(model_key, model)

    model.n_jobs = get_ncpus()
    return model
FigureSaver.debug = True FigureSaver.directory = os.path.expanduser( os.path.join("~", "tmp", "fire_season_dataset_diffs")) os.makedirs(FigureSaver.directory, exist_ok=True) normal_coast_linewidth = 0.5 mpl.rc("figure", figsize=(14, 6)) mpl.rc("font", size=9.0) np.random.seed(1) n_jobs = 5 with parallel_backend("loky", n_jobs=n_jobs, inner_max_num_threads=math.floor(get_ncpus() / n_jobs)): outputs = thres_fire_season_stats(0.1) dataset_names = [output[0] for output in outputs] lengths = [output[3].reshape(1, *output[3].shape) for output in outputs] # Stack the lengths into one array. lengths = np.ma.vstack(lengths) mean_length = np.ma.mean(lengths, axis=0) # Mean BAs ba_variable_names = ( "CCI MERIS BA", "CCI MODIS BA", "GFED4 BA",
def assign_n_jobs(model):
    """Set `model.n_jobs` to the value of `get_ncpus()`.

    The model is modified in place and also returned.

    """
    available_cpus = get_ncpus()
    model.n_jobs = available_cpus
    return model
def plot_2d_ale(experiment, single=False, nargs=None, verbose=False, **kwargs):
    """Save 2D ALE plots for pairs of training columns of an experiment.

    Each 2-column combination of the training columns is plotted twice, with
    `plot_samples` set to True and then False. Only cached split data and
    cached fitted models are used.

    Args:
        experiment: Experiment. Its `name` is used as the figure
            sub-directory.
        single (bool): If true, only the first plot is produced.
        nargs (int or None): If truthy (and `single` is false), the number of
            plots to produce. Otherwise all plots are produced.
        verbose (bool): If true, show a progress bar.
        **kwargs: Accepted but not used by this function.

    """
    exp_figure_saver = figure_saver(sub_directory=experiment.name)

    # Operate on cached data only.
    get_experiment_split_data.check_in_store(experiment)
    X_train, X_test, y_train, y_test = get_experiment_split_data(experiment)

    # Operate on cached fitted models only.
    get_model(X_train, y_train, cache_check=True)
    model = get_model(X_train, y_train)

    def pair_sort_key(pair):
        # Deterministic sorting with FAPAR & FAPAR 1M and FAPAR & DRY_DAY_PERIOD
        # at the front since these are used in the paper.
        if variable.FAPAR[0] in pair and variable.FAPAR[1] in pair:
            return -1000
        if variable.FAPAR[0] in pair and variable.DRY_DAY_PERIOD[0] in pair:
            return -999
        # Otherwise, concatenate the rank and shift digits of both variables.
        digits = "".join(str(var.rank) + str(var.shift) for var in pair)
        return int(digits)

    column_pairs = sorted(combinations(X_train.columns, 2), key=pair_sort_key)

    # Lazily yields (columns, plot_samples), samples first for each pair.
    plot_params = (
        (pair, with_samples)
        for pair in column_pairs
        for with_samples in (True, False)
    )

    total = 1 if single else (nargs or 2 * len(column_pairs))

    progress = tqdm(
        islice(plot_params, total),
        desc=f"2D ALE plotting ({experiment})",
        total=total,
        disable=not verbose,
    )
    for columns, plot_samples in progress:
        x_parent = columns[0].parent
        y_parent = columns[1].parent
        save_ale_2d(
            experiment=experiment,
            model=model,
            train_set=X_train,
            features=columns,
            n_jobs=get_ncpus(),
            include_first_order=True,
            plot_samples=plot_samples,
            figure_saver=exp_figure_saver,
            ale_factor_exp=plotting_configuration.ale_factor_exps.get(
                (x_parent, y_parent), -2
            ),
            x_factor_exp=plotting_configuration.factor_exps.get(x_parent, 0),
            x_ndigits=plotting_configuration.ndigits.get(x_parent, 2),
            y_factor_exp=plotting_configuration.factor_exps.get(y_parent, 0),
            y_ndigits=plotting_configuration.ndigits.get(y_parent, 2),
        )
        plt.close("all")
def single_ax_multi_ale_1d(
    ax,
    feature_data,
    feature,
    xlabel=None,
    ylabel=None,
    title=None,
    verbose=False,
    x_ndigits=2,
    x_factor=1,
    x_rotation=18,
):
    """Plot the first-order ALEs of one feature for several experiments.

    Args:
        ax: Axes to plot onto.
        feature_data: Mapping with the keys 'experiment',
            'single_experiment_data' (items providing 'model' and 'X_train'),
            and 'plot_kwargs' (one dict of plotting kwargs per experiment).
        feature: Feature to calculate the ALEs for.
        xlabel (str or None): x-axis label. If given, `(x_factor)` is
            appended to it.
        ylabel (str or None): y-axis label.
        title (str or None): Axes title.
        verbose (bool): If true, show a progress bar.
        x_ndigits (int): Passed to `get_float_format` for the x tick labels.
        x_factor: Passed to `get_float_format` for the x tick labels.
        x_rotation: Rotation of the x tick labels.

    Raises:
        AssertionError: If the quantiles differ between experiments.

    """
    quantile_list = []
    ale_list = []

    experiments = tqdm(
        feature_data["experiment"],
        desc="Calculating feature ALEs",
        disable=not verbose,
    )
    # The experiments themselves are only iterated over to drive the progress bar.
    for _, single_experiment_data in zip(
        experiments, feature_data["single_experiment_data"]
    ):
        model = single_experiment_data["model"]
        X_train = single_experiment_data["X_train"]

        with parallel_backend("threading", n_jobs=get_ncpus()):
            quantiles, ale = alepython.ale.first_order_ale_quant(
                process_proxy((model,), (get_model_predict,))[0],
                X_train,
                feature,
                bins=20,
            )

        quantile_list.append(quantiles)
        ale_list.append(ale)

    # Construct quantiles from the individual quantiles, minimising the amount of
    # interpolation.
    combined_quantiles = np.vstack([quantiles[None] for quantiles in quantile_list])
    final_quantiles = np.mean(combined_quantiles, axis=0)

    # The chosen variable for this plot comes from the same dataset (i.e. subsets of
    # the ALL-dataset), thus the quantiles, which are purely informed by the data,
    # should match.
    assert np.allclose(final_quantiles, combined_quantiles)

    # Evenly spaced x positions, one per quantile. Derived from the combined
    # quantiles rather than from a variable left over from the loop above.
    mod_quantiles = np.arange(len(final_quantiles))

    for plot_kwargs, quantiles, ale in zip(
        feature_data["plot_kwargs"], quantile_list, ale_list
    ):
        # Interpolate each of the quantiles relative to the accumulated final
        # quantiles.
        ax.plot(
            np.interp(quantiles, final_quantiles, mod_quantiles),
            ale,
            **{"marker": "o", "ms": 3, **plot_kwargs},
        )

    ax.set_xticks(mod_quantiles[::2])
    ax.set_xticklabels(
        [
            get_float_format(ndigits=x_ndigits, factor=x_factor, atol=np.inf)(
                value, None
            )
            for value in final_quantiles[::2]
        ]
    )
    ax.xaxis.set_tick_params(rotation=x_rotation)

    ax.grid(True)

    # Only append the factor if there is a label to append it to - the default
    # `xlabel=None` previously caused a TypeError here.
    if xlabel is not None:
        xlabel = xlabel + f"({x_factor})"
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
def fit_all(**rf_params):
    """Fit a random forest on the training data in `all_splits`.

    Args:
        **rf_params: Parameters for `RandomForestRegressor`. Any `n_jobs`
            given is overridden by `get_ncpus()`.

    Returns:
        The fitted regressor.

    """
    forest = RandomForestRegressor(**rf_params)
    # Make sure all cores are used.
    ncpus = get_ncpus()
    forest.n_jobs = ncpus
    forest.fit(all_splits.X_train, all_splits.y_train)
    return forest
def fit_rf_out_season(**rf_params):
    """Fit a random forest on the training data in `out_fs_splits`.

    Args:
        **rf_params: Parameters for `RandomForestRegressor`. Any `n_jobs`
            given is overridden by `get_ncpus()`.

    Returns:
        The fitted regressor.

    """
    forest = RandomForestRegressor(**rf_params)
    # Make sure all cores are used.
    ncpus = get_ncpus()
    forest.n_jobs = ncpus
    forest.fit(out_fs_splits.X_train, out_fs_splits.y_train)
    return forest
) for veg_lag_product in product(*veg_lags) ] assert all(len(combination) == 15 for combination in combinations) print("Starting fitting") scores = dask_fit_combinations( DaskRandomForestRegressor(**param_dict), X_train, y_train, client, combinations, n_splits=n_splits, local_n_jobs=max(get_ncpus() - 1, 1), verbose=True, cache_dir=CACHE_DIR, ) r2_test_scores = { key: [data["test_score"][i]["r2"] for i in data["test_score"]] for key, data in scores.items() } mse_test_scores = { key: [data["test_score"][i]["mse"] for i in data["test_score"]] for key, data in scores.items() } keys = np.array(list(r2_test_scores)) mean_r2_test_scores = np.array(