ale_results = [] for i, model_name in enumerate(model_set): parameters['model_name'] = model_name calibrated_pipeline = _load_model(**parameters) model = calibrated_pipeline.calibrated_classifiers_[0].base_estimator path = '/work/mflora/ML_DATA/permutation_importance/' fnames = [ join( path, f'permutation_importance_{model_name}_{target}_{time}_training_norm_aupdc{drop_opt}.pkl' ) ] perm_imp_results = load_pickle(fnames) myInterpreter = InterpretToolkit(model=[None]) myInterpreter.set_results(perm_imp_results, option='permutation_importance') important_vars = myInterpreter.get_important_vars( perm_imp_results, multipass=True, combine=False, )[model_name][:9] important_vars = ['srh_0to1_ens_mean_spatial_mean'] examples_transformed, target_values_transformed = just_transforms( model, examples, target_values) myInterpreter = InterpretToolkit(model=[model.steps[-1][1]], model_names=[model_name], examples=examples_transformed,
""" if resample == 'default': resample = resample_dict[time][target][model_name] else: if resample == 'under': model_names = model_name + '_under' else: model_names = model_name resample_method = resample_dict[time][target][model_name] return join( path, f'perm_based_interaction_results_{model_name}_{target}_{time}{drop_opt}{resample_method}.nc' ) myInterpreter = InterpretToolkit(examples=examples, targets=target_values) results = [] for target in targets: fnames = [ get_fnames(model_name, target, time, drop_opt) for model_name in ml_models ] results.append(myInterpreter.load_results(fnames)) fname = '/work/mflora/ML_DATA/INPUT_DATA/20180501/PROBABILITY_OBJECTS_20180501-2330_10.nc' ds = xr.open_dataset(fname) features = [var for var in list(ds.data_vars) if 'matched' not in var ] + ['Run Date'] readable_feature_names = { feature: to_readable_names([feature])[0]
columns=original_feature_names) if model_name == "LogisticRegression": normalize_method = 'standard' if normalize_method != None: unnormalize = UnNormalize(model.steps[1][1], feature_names) feature_values = unnormalize._full_inverse_transform( new_examples_subset) else: unnormalize = None feature_values = new_examples_subset.values myInterpreter = InterpretToolkit(model=[model.steps[-1][1]], model_names=[model_name], examples=original_examples_subset, targets=target_subset, feature_names=feature_names) background_dataset = shap.sample(examples, 200) shap_values, bias = myInterpreter.calc_shap( background_dataset=background_dataset) data = { 'shap_values': shap_values, 'bias': bias, 'examples': original_examples_subset, 'targets': target_subset, 'feature_names': feature_names, 'feature_values': feature_values }
shear_u, shear_v, cin, srh_constant=150.) stp_examples = examples_transformed[stp > 2] indices = stp_examples.index.values stp_target_values = target_values_transformed[indices] stp_examples = stp_examples.values print(stp_examples.shape) base_clf = model.steps[-1][1] myInterpreter = InterpretToolkit(model=[base_clf], model_names=[model_name], examples=stp_examples, targets=stp_target_values, feature_names=feature_names) out_dict = myInterpreter.calc_contributions( method='shap', data_for_shap=examples_transformed, performance_based=True, n_examples=250, shap_sample_size=200) results.append(out_dict) myInterpreter.model_names = model_set results = merge_nested_dict(results) myInterpreter.set_results(results, option='contributions')
'target': target, 'resample': resample_method, 'normalize': normalize_method, 'imputer': imputer_method, 'drop_opt': drop_opt, 'model_name': model_name } parameters['model_name'] = model_name calibrated_pipeline = _load_model(**parameters)['model'] model_names = [ model_name ] #[model_name+'_under'] if resample == 'under' model_names = [model_name] myInterpreter = InterpretToolkit( models=calibrated_pipeline, model_names=model_names, ) fnames = join( ale_path, f'pd_results_{model_name}_{target}_{time}{drop_opt}{resample_method}.nc' ) pd_1d = myInterpreter.load_results(fnames=fnames) fnames = join( ale_path, f'pd_2d_results_{model_name}_{target}_{time}{drop_opt}{resample_method}.nc' ) pd_2d = myInterpreter.load_results(fnames=fnames) # Load the permutation important results from the saved pickle file
# Load the training data once; the same frame is reused for every model.
print('First load of the data...')
examples, target_values = _load_train_data(**parameters)

# The feature list is every column of `examples` except 'Run Date'.
feature_names = list(examples.columns)
feature_names.remove('Run Date')

ale_results = []
for i, model_name in enumerate(model_set):
    # Point the shared parameters at this model before loading it.
    parameters['model_name'] = model_name
    calibrated_pipeline = _load_model(**parameters)
    model = calibrated_pipeline.calibrated_classifiers_[0].base_estimator

    examples_transformed, target_values_transformed = just_transforms(
        model, examples, target_values)

    # Only the last step of the pipeline is handed to the toolkit, together
    # with the data returned by just_transforms.
    myInterpreter = InterpretToolkit(
        model=[model.steps[-1][1]],
        model_names=[model_name],
        examples=examples_transformed,
        targets=target_values_transformed,
        feature_names=feature_names,
    )

    result_dict = myInterpreter.calc_ale(
        features=feature_names,
        nbootstrap=1,
        subsample=0.6,
        njobs=40,
        nbins=30,
    )

    # One np.std(..., ddof=1) value per feature, in feature_names order.
    ale_std = [
        np.std(result_dict[f][model_name]['values'], ddof=1)
        for f in feature_names
    ]
print(display_feature_names) display_feature_names = _fix_long_names(display_feature_names) feature_units = {f: get_units(f) for f in important_vars} ale_results = [] for i, model_name in enumerate(model_set): parameters['model_name'] = model_name calibrated_pipeline = _load_model(**parameters) model = calibrated_pipeline.calibrated_classifiers_[0].base_estimator examples_transformed, target_values_transformed = just_transforms( model, examples, target_values) myInterpreter = InterpretToolkit(model=[model.steps[-1][1]], model_names=[model_name], examples=examples_transformed, targets=target_values_transformed, feature_names=feature_names) njobs = 1 if model_name == 'XGBoost' else len(important_vars) if normalize_method != None: unnormalize_func = UnNormalize(model.steps[1][1], feature_names) else: unnormalize_func = None result_dict = myInterpreter.calc_ale( features=important_vars, nbootstrap=100, subsample=1.0, njobs=njobs,
f'ale_2d_results_{model_name}_{target}_{time}{drop_opt}{resample_method}.nc' ) ######################################## fname = '/work/mflora/ML_DATA/INPUT_DATA/20180501/PROBABILITY_OBJECTS_20180501-2330_10.nc' ds = xr.open_dataset(fname) features = [var for var in list(ds.data_vars) if 'matched' not in var ] + ['Run Date'] ds.close() display_feature_names = {f: to_readable_names([f])[0] for f in features} display_feature_names = _fix_long_names(display_feature_names) ###feature_units = {f: get_units(f)for f in features} myInterpreter = InterpretToolkit() fnames = [get_fnames(m, target, time, drop_opt) for m in model_names] results = myInterpreter.load_results(fnames=fnames) feature_names = results[ 'ale_variance_interactions_rankings__LogisticRegression'].values feature_names = feature_names[:3] fnames = [get_2d_ale(m, target, time, drop_opt) for m in model_names] ale_data = myInterpreter.load_results(fnames=fnames) feature_names = [tuple(f.split('__')) for f in feature_names] print(feature_names) fig, axes = myInterpreter.plot_ale(
models = [calibrated_pipeline] if resample_method == 'under': examples_transformed, target_values_transformed = rus.fit_resample( examples, target_values) else: examples_transformed = examples target_values_transformed = target_values else: models = [model.steps[-1][1]] examples_transformed, target_values_transformed = just_transforms( model, examples, target_values) feature_names.remove('Run Date') myInterpreter = InterpretToolkit(models=models, model_names=model_names, examples=examples_transformed, targets=target_values_transformed, feature_names=feature_names) results = myInterpreter.calc_ice( features=feature_names, n_bootstrap=1, subsample=200, n_jobs=njobs, n_bins=35, ) results_fname = join( ale_path, f'ice_results_{model_name}_{resample_method}_{target}_{time}{drop_opt}{calibrate}.nc' )
path = '/work/mflora/ML_DATA/SHAP_VALUES'

# Visit every (model, target, time, drop option) combination.
for combo in itertools.product(model_set, target_set, time_set, drop_opt_set):
    model_name, target, time, drop_opt = combo

    # Read the pickled dictionary saved for this combination.
    fname = join(
        path,
        f'shap_values_{model_name}_{target}_{time}{drop_opt}.pkl',
    )
    with open(fname, 'rb') as pkl_file:
        data = pickle.load(pkl_file)

    important_vars = load_important_vars(target, time, drop_opt)

    # No model object is passed (model=[None]); the toolkit is built from
    # the examples, targets and feature names stored in the pickle.
    myInterpreter = InterpretToolkit(
        model=[None],
        model_names=[model_name],
        examples=data['examples'],
        targets=data['targets'],
        feature_names=data['feature_names'],
        model_output='probability',
    )

    # Map each feature to its readable name, pass the mapping through
    # _fix_long_names, then flatten it to a list in feature_names order.
    feature_names = data['feature_names']
    display_feature_names = {
        f: to_readable_names([f])[0] for f in feature_names
    }
    display_feature_names = _fix_long_names(display_feature_names)
    display_feature_names = [display_feature_names[f] for f in feature_names]

    # Predictions are the constant `climo` plus the SHAP values summed
    # along axis 1; `diff` is targets minus those predictions.
    targets = data['targets']
    climo = 0.5
    predictions = climo + np.sum(data['shap_values'], axis=1)
    diff = targets - predictions

    data1 = {
        'targets': targets,
        'predictions': predictions,
        'diff': diff,
    }
    df = pd.DataFrame(data1)
num_vars_to_plot = 15 figsize = (12, 8) #(8,5) def get_fnames(model_name, target, time, mode, metric, drop_opt, resample=''): """ """ return join( path, f'permutation_importance_{model_name}_{target}_{time}_{mode}_{metric}{drop_opt}{resample}.nc' ) ylabels = ['Severe Wind'] myInterpreter = InterpretToolkit() results = [] fnames = [ get_fnames('RandomForest', 'severe_wind', time, mode, metric, drop_opt, r) for r in ['under', ''] ] myInterpreter.load_results(fnames) fname = '/work/mflora/ML_DATA/INPUT_DATA/20180501/PROBABILITY_OBJECTS_20180501-2330_10.nc' ds = xr.open_dataset(fname) features = [var for var in list(ds.data_vars) if 'matched' not in var ] + ['Run Date'] readable_feature_names = { feature: to_readable_names([feature])[0]
########################################
combos = pipeline_set.pipeline_set
drop_opt = ''
imputer_method = 'simple'
ale_path = '/work/mflora/ML_DATA/ALE_RESULTS'

# Reference point for the elapsed-time calculation after the loop.
start_time = datetime.datetime.now()

for combo in combos:
    model_name, target, resample_method, normalize_method, time = combo

    # Load the previously saved ALE results for this combination.
    results_fname = join(
        ale_path,
        f'ale_results_{model_name}_{target}_{time}{drop_opt}{resample_method}.nc',
    )
    myInterpreter = InterpretToolkit()
    myInterpreter.load_results(results_fname)

    results = myInterpreter.calc_ale_variance(model_names=model_name)

    # Any "_under" in the model name is removed for the output file name;
    # the resample method is written as its own field instead.
    save_fname = join(
        ale_path,
        f'ale_var_results_{model_name.replace("_under", "")}_{resample_method}_{target}_{time}{drop_opt}.nc',
    )
    print(f'Saving {save_fname}...')
    myInterpreter.save_results(fname=save_fname, data=results)

# Elapsed time since start_time, split into whole hours and minutes.
duration = datetime.datetime.now() - start_time
seconds = duration.total_seconds()
hours = seconds // 3600
minutes = (seconds % 3600) // 60
# NOTE(review): the branch below is commented out in the original script, so
# `models`, `model_names`, `examples` and `targets` must already be defined
# before this point. Kept as a comment for reference.
#if calibrate != '' :
#    models=[calibrated_pipeline]
#    if resample == 'under':
#        examples_transformed, target_values_transformed = rus.fit_resample(examples, target_values)
#    else:
#        examples_transformed = examples
#        target_values_transformed = target_values
#else:
#    models=[model.steps[-1][1]]
#    examples_transformed, target_values_transformed = just_transforms(model, examples, target_values)
#    feature_names.remove('Run Date')
myInterpreter = InterpretToolkit(
    models=models,
    model_names=model_names,
    examples=examples,
    targets=targets,
)

results = myInterpreter.calc_permutation_importance(
    n_vars=n_vars,
    evaluation_fn=metric,
    subsample=subsample,
    n_jobs=njobs,
    n_bootstrap=nbootstrap,
    verbose=True,
    perm_method=method,
)

# The output file name encodes every setting of this run.
results_fname = join(
    perm_imp_path,
    f'permutation_importance_{model_name}_{target}_{time}_{data_mode}_{metric}{drop_opt}{resample}{calibrate}{method}.nc',
)
print(f'Saving {results_fname}...')
myInterpreter.save_results(fname=results_fname, data=results)

# Elapsed time since start_time, split into whole hours and minutes.
duration = datetime.datetime.now() - start_time
seconds = duration.total_seconds()
hours = seconds // 3600
minutes = (seconds % 3600) // 60
""" usage: stdbuf -oL python compute_ale.py 2 > & log_compute_ale & """

########################################
#       USER-DEFINED PARAMETERS        #
########################################
combos = pipeline_set.pipeline_set
drop_opt = ''
imputer_method = 'simple'
ale_path = '/work/mflora/ML_DATA/ALE_RESULTS'

start_time = datetime.datetime.now()

for combo in combos:
    model_name, target, resample_method, normalize_method, time = combo

    # Load the saved 2D ALE results for this combination.
    results_fname = join(
        ale_path,
        f'ale_2d_results_{model_name}_{target}_{time}{drop_opt}{resample_method}.nc',
    )
    myInterpreter = InterpretToolkit()
    results_2d = myInterpreter.load_results(results_fname)

    # Disabled in the original script:
    #if resample_method == 'under':
    #    model_name +='_under'

    # Read the important variables for this target/time from their pickle
    # file, then form every unordered pair of them.
    with open(f'IMPORTANT_FEATURES_ALL_MODELS_{target}_{time}.pkl',
              'rb') as pkl_file:
        important_vars = pickle.load(pkl_file)
    features = list(itertools.combinations(important_vars, r=2))

    results = myInterpreter.calc_ale_variance(
        ale_data=results_2d,
        features=features,
        interaction=True,
    )

    # results_fname is reused: it now names the output file. Any "_under"
    # in the model name is removed and the resample method is written as
    # its own field.
    results_fname = join(
        ale_path,
        f'ale_interaction_results_{model_name.replace("_under", "")}_{resample_method}_{target}_{time}{drop_opt}.nc',
    )
    print(f'Saving {results_fname}...')
return { 'singlepass_scores': np.array(singlepass_scores), 'singlepass_ranking': np.array(singlepass_ranking), 'multipass_scores': np.array(multipass_scores), 'multipass_ranking': np.array(multipass_ranking), 'original_score': original_score } #aupdc = make_scorer(score_func=average_precision_score, greater_is_better=True,needs_proba=True,) #results = permutation_importance(rf, X, y, scorer=aupdc, n_vars=10, n_jobs=50) from mintpy.mintpy.main.interpret_toolkit import InterpretToolkit myInterpreter = InterpretToolkit(model=[rf], model_names=['Random Forest'], examples=X, targets=y) start_time = datetime.datetime.now() results = myInterpreter.calc_permutation_importance(n_vars=10, evaluation_fn='auprc', subsample=1.0, njobs=50, nbootstrap=10) duration = datetime.datetime.now() - start_time seconds = duration.total_seconds() hours = seconds // 3600 minutes = (seconds % 3600) // 60 seconds = seconds % 60
calibrate = 'calibrated' ######################################## feature_names = get_top_features(model_name, target, time, 'training', 'norm_aupdc', drop_opt, resample='') display_feature_names = {f: to_readable_names([f])[0] for f in feature_names} display_feature_names = _fix_long_names(display_feature_names) feature_units = {f: get_units(f) for f in feature_names} unnormalize_func = None myInterpreter = InterpretToolkit() fnames = get_fnames(model_name, target, time, resample_method, drop_opt, calibrate) myInterpreter.load_results(fnames=fnames) ice_fnames = get_ice_fnames(model_name, target, time, resample_method, drop_opt, calibrate) ice_dict = myInterpreter.load_results(fnames=ice_fnames) fig, axes = myInterpreter.plot_ale( features=feature_names[:10], display_feature_names=display_feature_names, display_units=feature_units, title=f'{plt_config.title_dict[target]} {time.replace("_", " ").title()}', unnormalize=unnormalize_func, ice_curves=ice_dict,
] #['XGBoost', 'LogisticRegression', 'RandomForest'] target = 'severe_hail' time = 'first_hour' drop_opt = '' #feature_names = ['lcl_ml_ens_mean_spatial_mean', # 'shear_u_0to1_ens_mean_spatial_mean'] feature_names = [ 'hailcast_time_max_ens_mean_of_90th', 'w_up_time_max_ens_mean_of_90th', 'uh_2to5_time_max_ens_mean_of_90th', 'shear_v_0to6_ens_mean_spatial_mean', 'cape_ml_ens_mean_spatial_mean', 'temperature_700mb_ens_mean_spatial_mean', 'major_axis_length', 'divergence_10m_time_min_ens_mean_of_10th' ] myInterpreter = InterpretToolkit() fnames = [get_fnames(m, target, time, drop_opt) for m in model_names] print(fnames) results = myInterpreter.load_results(fnames=fnames) data1 = results['w_up_time_max_ens_std_of_90th__LogisticRegression__ale'] data2 = results[ 'temperature_700mb_ens_mean_spatial_mean__LogisticRegression__ale'] print( f'700 mb Temp ALE variance: {np.mean(np.std(results[f"temperature_700mb_ens_mean_spatial_mean__LogisticRegression__ale"].values, ddof=1, axis=1)): .5f}' ) #########################################
import itertools

targets = ['tornado', 'severe_wind', 'severe_hail']
times = ['first_hour', 'second_hour']
drop_opt = '_manual_drop_time_max_spatial_mean'  #'_drop_high_corr_pred'
model_set = ['RandomForest', 'XGBoost', 'LogisticRegression']

# Directory holding the pickled permutation-importance results.
path = '/work/mflora/ML_DATA/permutation_importance/'

for pair in itertools.product(targets, times):
    target, time = pair

    # One results file per model for this target/time pair.
    fnames = [
        join(
            path,
            f'permutation_importance_{model_name}_{target}_{time}_training_norm_aupdc{drop_opt}.pkl',
        )
        for model_name in model_set
    ]
    perm_imp_results = load_pickle(fnames)

    # No model object is passed (model=[None]); the toolkit is given the
    # loaded results through set_results instead.
    myInterpreter = InterpretToolkit(model=[None])
    myInterpreter.set_results(perm_imp_results,
                              option='permutation_importance')

    important_vars = myInterpreter.get_important_vars(
        perm_imp_results,
        multipass=True,
        combine=True,
        nvars=9,
    )

    # Write the selected variables to a pickle in the working directory.
    fname = f'important_vars_all_models_{target}_{time}_{drop_opt}.pkl'
    with open(fname, 'wb') as pkl_file:
        pickle.dump(important_vars, pkl_file)
'imputer': imputer_method, 'drop_opt': drop_opt, 'model_name': model_name } print('First load of the data...') examples, target_values = _load_train_data(**parameters) parameters['model_name'] = model_name calibrated_pipeline = _load_model(**parameters)['model'] model_names = [ model_name ] #[model_name+'_under'] if resample == 'under' model_names = [model_name] myInterpreter = InterpretToolkit( models=[calibrated_pipeline], model_names=model_names, examples=examples, targets=target_values, ) # Load the permutation important results from the saved pickle file with open(f'IMPORTANT_FEATURES_ALL_MODELS_{target}_{time}.pkl', 'rb') as pkl_file: important_vars = pickle.load(pkl_file) features = list(itertools.combinations(important_vars, r=2)) results = myInterpreter.calc_ale( features=features, n_bootstrap=n_bootstrap, subsample=subsample, n_jobs=njobs, n_bins=n_bins,
'imputer': imputer_method, 'drop_opt': drop_opt, 'model_name': model_name } print('First load of the data...') examples, target_values = _load_train_data(**parameters) parameters['model_name'] = model_name calibrated_pipeline = _load_model(**parameters)['model'] model_names = [ model_name ] #[model_name+'_under'] if resample == 'under' model_names = [model_name] myInterpreter = InterpretToolkit( models=[calibrated_pipeline], model_names=model_names, examples=examples, targets=target_values, ) # Load the permutation important results from the saved pickle file with open(f'IMPORTANT_FEATURES_ALL_MODELS_{target}_{time}.pkl', 'rb') as pkl_file: important_vars = pickle.load(pkl_file) features = list(itertools.combinations(important_vars, r=2)) results = myInterpreter.calc_interaction_rankings( features=features, evaluation_fn=norm_aupdc, n_bootstrap=n_bootstrap, subsample=subsample, n_jobs=njobs,
model, examples, target_values) rus = RandomUnderSampler(random_state=42, sampling_strategy={ 0: 500, 1: 500 }) examples_subset, target_subset = rus.fit_resample( examples_transformed, target_values_transformed, ) #examples_subset = shap.sample(examples_transformed, 1000) myInterpreter = InterpretToolkit(model=[model.steps[-1][1]], model_names=[model_name], examples=examples_subset, targets=target_subset, feature_names=feature_names) display_feature_names = { f: to_readable_names([f])[0] for f in feature_names } display_feature_names = _fix_long_names(display_feature_names) feature_units = {f: get_units(f) for f in feature_names} date_subset = date_col[:len(examples_subset)].reshape( len(examples_subset), 1) examples_subset = np.concatenate((examples_subset, date_subset), axis=1) examples_subset = pd.DataFrame(examples_subset,
def get_fnames( model_name, target, time, drop_opt, ): """ """ resample = resample_dict[time][target][model_name] return join( path, f'ale_var_results_{model_name}_{resample}_{target}_{time}{drop_opt}.nc' ) myInterpreter = InterpretToolkit(examples=examples, targets=targets) results = [] for target in targets: fnames = [ get_fnames(model_name, target, time, drop_opt) for model_name in ml_models ] results.append(myInterpreter.load_results(fnames)) readable_feature_names = { feature: to_readable_names([feature])[0] for feature in feature_names } feature_colors = { feature: to_readable_names([feature])[1] for feature in feature_names