'cape_ml_ens_mean_spatial_mean',
        'shear_v_0to1_ens_mean_spatial_mean',
        'hailcast_time_max_ens_mean_of_90th',
        'major_axis_length',
        'uh_2to5_time_max_ens_mean_of_90th',
        'cin_ml_ens_std_spatial_mean',
        'minor_axis_length',
        'shear_v_0to6_ens_mean_spatial_mean',
        'lcl_ml_ens_mean_spatial_mean',
        'w_up_time_max_ens_mean_of_90th']
    '''
    data['X']['mid_level_lapse_rate_ens_mean_spatial_mean'] = data['X']['mid_level_lapse_rate_ens_mean_spatial_mean'] / 2.67765 
    X['mid_level_lapse_rate_ens_mean_spatial_mean'] = X['mid_level_lapse_rate_ens_mean_spatial_mean'] / 2.67765

    fname = join(perm_path, f'permutation_importance_all_models_{target}_{time}_training_{metric}{drop_opt}{perm_method}.nc')
    explainer = InterpretToolkit(X=data['X'],y=data['targets'],estimator_output='probability',) 
    perm_results = explainer.load(fname)
    #important_vars = perm_results['multipass_rankings__LogisticRegression'].values[:12]
    #important_vars = ['low_level_lapse_rate_ens_mean_spatial_mean']

    important_vars = ['mid_level_lapse_rate_ens_mean_spatial_mean']
    all_vars = perm_results['singlepass_rankings__LogisticRegression'].values

    display_feature_names = {f: to_readable_names([f])[0] for f in all_vars} 
    #display_feature_names = _fix_long_names(display_feature_names)
    feature_units = {f: get_units(f)for f in all_vars}

    if option == 'interaction':
        interaction_index = 'auto'
        y = None
    elif option == 'targets':
Beispiel #2
0
# Compute and save the interaction-strength (IAS) score for every
# (time, target) combination, using previously-computed ALE curves.
times = ['first_hour']
target_list = ['tornado']

for time, target in itertools.product(times, target_list):
    train_params = {'time': time, 'target': target, 'drop_opt': drop_opt}
    X, y = _load_train_data(**train_params)

    estimators = load_models(time, target, drop_opt, model_names)
    explainer = InterpretToolkit(estimators=estimators,
                                 estimator_names=model_names,
                                 X=X,
                                 y=y)

    # e.g. ale_results_all_models_tornado_first_hourL1_based_feature_selection_aggressive.nc
    ale_fname = join(ale_path,
                     f'ale_results_all_models_{target}_{time}{drop_opt}.nc')
    ale_curves = explainer.load(fnames=ale_fname)

    results = explainer.interaction_strength(ale_curves,
                                             n_bootstrap=10,
                                             subsample=0.1)
    print(results)

    save_fname = join(ale_path,
                      f'ias_score_all_models_{target}_{time}{drop_opt}.nc')
    explainer.save(fname=save_fname, data=results)
Beispiel #3
0
'lcl_ml_ens_mean_spatial_mean',
'major_axis_length',
'cape_ml_ens_mean_spatial_mean',
'geopotential_height_500mb_ens_mean_spatial_mean',
'minor_axis_length',
'wz_0to2_time_max_ens_mean_of_90th',
'bouyancy_time_min_ens_mean_spatial_mean',
'shear_v_0to1_ens_mean_spatial_mean',
'uh_0to2_time_max_ens_std_spatial_mean']
########################################
# Plot previously-computed ALE curves for every feature and save the
# resulting figure to disk.
print('First load of the data...')

# Human-readable names and units for each feature, built in one pass.
display_feature_names = {}
feature_units = {}
for f in feature_names:
    display_feature_names[f] = to_readable_names([f])[0]
    feature_units[f] = get_units(f)

explainer = InterpretToolkit()
data = explainer.load(fnames=get_fnames(target, time, drop_opt))

plot_title = f'{plt_config.title_dict[target]} {time.replace("_", " ").title()}'
fig, axes = explainer.plot_ale(
    data,
    features=feature_names,
    display_feature_names=display_feature_names,
    display_units=feature_units,
    title=plot_title,
    hspace=.75,
)

fname = f'ale_{target}_{time}_{drop_opt}.png'
base_plot.save_figure(fig=fig, fname=fname)
    )
    #if exists(save_fname):
    #    print(f'{save_fname} already exists!')
    #    continue
    parameters = {
        'time': time,
        'target': target,
        'drop_opt': drop_opt,
    }
    X, y, info = _load_train_data(return_info=True, **parameters)

    estimators = load_models(time, target, drop_opt, model_names)
    # Subsample time indices to reduce autocorrelations
    X_subset, y_subset = get_independent_samples(X, y, info)
    explainer = InterpretToolkit(estimators=estimators,
                                 estimator_names=model_names,
                                 X=X_subset.copy(),
                                 y=y_subset.copy())
    background_dataset = shap.sample(X, 100)

    results = explainer.local_contributions(
        method='shap',
        background_dataset=background_dataset,
        performance_based=True,
        n_samples=n_samples)

    results = explainer.save(fname=save_fname, data=results)

    duration = datetime.datetime.now() - start_time
    seconds = duration.total_seconds()
    hours = seconds // 3600
    minutes = (seconds % 3600) // 60
Beispiel #5
0
):
    """
    """
    resample_method = resample_dict[time][target][model_name]
    return join(path,
                f'shap_values_{model_name}_{target}_{time}{drop_opt}.pkl')


# Load pre-computed SHAP values from disk and render a SHAP summary plot.
with open(get_fnames(model_name, target, time, drop_opt), 'rb') as pkl_file:
    data = pickle.load(pkl_file)

shap_values = data['shap_values']
bias = data['bias']

# Drop the "matched" target columns; keep 'Run Date' as a feature.
features = [var for var in data['X'].columns if 'matched' not in var]
features.append('Run Date')

display_feature_names = {var: to_readable_names([var])[0] for var in features}

myInterpreter = InterpretToolkit(X=data['X'])

fig = myInterpreter.plot_shap(
    shap_values=shap_values,
    plot_type='summary',
    display_feature_names=display_feature_names,
)

fname = f'shap_summary_{model_name}_{target}_{time}_{drop_opt}.png'
base_plot.save_figure(fig=fig, fname=fname)
Beispiel #6
0
    parameters = {
        'time': time,
        'target': target,
        'drop_opt': drop_opt,
    }
    results_fname = join(
        ale_path, f'pd_1d_results_all_models_{target}_{time}{drop_opt}.nc')
    #if exists(results_fname):
    #    print(f'{results_fname} already exist!')
    #    continue

    X, y = _load_train_data(**parameters)
    estimators = load_models(time, target, drop_opt, model_names)

    explainer = InterpretToolkit(estimators=estimators,
                                 estimator_names=model_names,
                                 X=X,
                                 y=y)
    results = explainer.pd(
        features='all',
        n_bootstrap=n_bootstrap,
        subsample=subsample,
        n_jobs=njobs,
        n_bins=n_bins,
    )

    print(f'Saving {results_fname}...')
    explainer.save(fname=results_fname, data=results)

duration = datetime.datetime.now() - start_time
seconds = duration.total_seconds()
hours = seconds // 3600
Beispiel #7
0
    sns.ecdfplot(
        ax=ax,
        data=df,
        x=var,
        hue=target,
        legend=False,
    )


# Load the most important variables
# Rankings come from a previously-saved permutation-importance netCDF file.
path = '/work/mflora/ML_DATA/permutation_importance'
perm_imp_fname = join(
    path,
    f'permutation_importance_all_models_{target}_{time}_training_{metric}{drop_opt}{perm_method}.nc'
)
explainer = InterpretToolkit()
perm_imp_results = explainer.load(perm_imp_fname)
# `mode` selects the ranking flavor (presumably 'singlepass'/'multipass',
# matching keys used elsewhere in this file) — TODO confirm.
important_vars = perm_imp_results[
    f'{mode}_rankings__LogisticRegression'].values

# Keep only the top-ranked n_vars features.
important_vars = important_vars[:n_vars]

# Convert to pretty feature names
# Each display name is suffixed with its units, e.g. "CAPE (J/kg)".
readable_feature_names = {
    feature: to_readable_names([feature])[0] + f' ({get_units(feature)})'
    for feature in important_vars
}

parameters = {
    'time': time,
    'target': target,
Beispiel #8
0
    save_fname= join(path, f'shap_values_{model_names[0]}_{target}_{time}{drop_opt}.pkl')
    #if exists(save_fname):
    #    print(f'{save_fname} already exists!')
    #    continue
    parameters = {'time' : time, 'target' : target,'drop_opt' : drop_opt,}
    X,y, info = _load_train_data(return_info=True, **parameters)

    estimators = load_models(time,target,drop_opt,model_names)

    # Randomly 5000 samples from the training dataset
    indices = np.random.choice(len(X), size=5000, replace=False)
    X_subset = X.iloc[indices,:].reset_index(drop=True)
    y_subset = y[indices]

    estimators = load_models(time,target,drop_opt,model_names)
    explainer = InterpretToolkit(estimators=estimators,estimator_names=model_names,X=X_subset.copy())
    background_dataset = shap.sample(X, 100)
    
    results = explainer.shap(background_dataset=background_dataset) 
    
    shap_values, bias = results[model_names[0]]
    data = {
            'shap_values': shap_values,
            'bias' : bias, 
            'X' : X_subset,
            'targets' : y_subset,
            } 

    with open(save_fname, 'wb') as pkl_file:
        pickle.dump(data, pkl_file)
Beispiel #9
0
# Load the full training set, split it into high-/low-STP subsamples,
# and compute permutation importance separately for each subsample.
X, y, info = _load_train_data(return_info=True, **parameters)

X_subsample = []
y_subsample = []
for idxs in indices_tuple:
    X_subsample.append(X.iloc[idxs, :].reset_index(drop=True))
    y_subsample.append(y[idxs])

for mode, X, y in zip(['high_STP', 'low_STP'], X_subsample, y_subsample):
    start_time = datetime.datetime.now()
    # BUGFIX: the original `10 if len(X.columns) else len(X.columns)`
    # returned 10 whenever ANY columns existed (and 0 otherwise), so it
    # never actually capped n_vars at the available column count.
    n_vars = min(10, len(X.columns))

    # Load the models
    estimators = load_models(time, target, drop_opt, model_names)
    explainer = InterpretToolkit(estimators=estimators,
                                 estimator_names=model_names,
                                 X=X,
                                 y=y)

    # Compute the importance
    results = explainer.permutation_importance(n_vars=n_vars,
                                               evaluation_fn=metric,
                                               subsample=subsample,
                                               n_jobs=n_jobs,
                                               n_bootstrap=n_bootstrap,
                                               verbose=verbose,
                                               direction=direction)

    # NOTE(review): results_fname is built here but the save call is not
    # visible in this fragment — confirm the results are written out.
    results_fname = join(
        perm_imp_path,
        f'permutation_importance_{mode}_{target}_{time}_{data_mode}_{metric}{drop_opt}{direction}.nc'
    )
}
# Load the training data plus metadata (info holds per-example run dates).
X, y, info = _load_train_data(return_info=True, **parameters)
dates = info['Run Date'].values

# Fixed seed so the background sample is reproducible.
random_state = np.random.RandomState(35)
# NOTE(review): `choice` defaults to replace=True, so the 100 background
# rows may contain duplicates — confirm that is acceptable here.
random_idxs = random_state.choice(len(X), size=100)
background_dataset = X.iloc[random_idxs, :]

n_samples = 5

save_fname = join(
    path,
    f'shap_values_performance_{model_names[0]}_{target}_{time}{drop_opt}.pkl')
estimators = load_models(time, target, drop_opt, model_names)
explainer = InterpretToolkit(estimators=estimators,
                             estimator_names=model_names,
                             X=X.copy(),
                             y=np.copy(y))

# Sanity check: print the maximum predicted probability on the training set.
predictions = estimators[0].predict_proba(X)[:, 1]
print(np.max(predictions))

# Sanity check: print the test-set probabilities in descending order.
X_test, y_test, _ = _load_test_data(return_info=True, **parameters)
_predictions = estimators[0].predict_proba(X_test)[:, 1]
print(np.sort(_predictions)[::-1])

performance_dict = get_indices_based_on_performance(
    estimator=estimators[0],
    X=X,
    y=y,
    n_samples=n_samples,
    estimator_output='probability',
    p_values = []
    for n in range(n_vars):
        p_value = permutation_test(multipass_scores[n,:],
                           scores_to_compare_against[n,:],
                           method='approximate',
                           num_rounds=1000,
                           seed=0)
        p_values.append(p_value)
        if p_value > 0.05:
            print('Probably the same distribution\n')
        else:
            print('Probably different distributions\n')
    p_values = np.array(p_values)>0.05
    return p_values

def get_fnames(target, time, mode, metric, drop_opt, perm_method, resample=''):
    """Build the netCDF filename for a saved permutation-importance result.

    Relies on the module-level ``path`` and ``atype`` globals.
    """
    return join(path, f'permutation_importance_{atype}_{target}_{time}_{mode}_{metric}{drop_opt}{perm_method}{resample}.nc')

# Load the permutation-importance results for every target, then run the
# significance test on the first set of results.
explainer = InterpretToolkit()

results = [
    explainer.load(get_fnames(target, time, mode, metric, drop_opt,
                              perm_method, resample))
    for target in targets
]

p_values  = get_p_values(results[0], ml_models[0], n_vars=10)

for (target, time, mode, metric, perm_method)


    else:
        return join(shap_path, f'shap_values_performance_{model_name}_{target}_{time}{drop_opt}.pkl')

# Experiment configuration.
model_names = ['LogisticRegression']
# NOTE(review): target flips to 'tornado' whenever a mode is set — confirm.
target = 'severe_hail' if mode is None else 'tornado'
time = 'first_hour'
drop_opt = 'L1_based_feature_selection_with_manual'
# Keys into the performance-based contribution results.
perf_keys = ["Best Hits",
             "Worst False Alarms", 
             "Worst Misses",
            ]
metric = 'norm_aupdc'
perm_method = 'backward'

########################################
# Load the per-model results as a dataframe and build readable
# feature names/units from its feature list.
explainer = InterpretToolkit()
fnames = [get_fnames(m, target, time, drop_opt, mode) for m in model_names]
dframe = explainer.load(fnames=fnames, dtype='dataframe')

feature_names = dframe.attrs['feature_names']
display_feature_names = {f: to_readable_names([f])[0] for f in feature_names}
#display_feature_names = _fix_long_names(display_feature_names)
feature_units = {f: get_units(f)for f in feature_names}

# Top-12 multipass permutation-importance features for the logistic model.
fname = join(perm_path, f'permutation_importance_all_models_{target}_{time}_training_{metric}{drop_opt}{perm_method}.nc')
perm_results = explainer.load(fname)
important_vars = perm_results['multipass_rankings__LogisticRegression'].values[:12]

#important_vars=feature_names
#important_vars.remove('Run Date')
if 'Initialization Time' in important_vars:
# Configuration for the plotting script below.
ml_models = ['LogisticRegression']
drop_opt = 'L1_based_feature_selection_with_manual' 
perm_method = 'backward'

num_vars_to_plot=10
figsize = (6,6)

# NOTE(review): target is hard-coded to 'severe_hail' here — confirm intended.
parameters = {'time' : time,'target' : 'severe_hail','drop_opt' : drop_opt}

X,y = _load_train_data(**parameters)
feature_names = list(X.columns)

def get_fnames(target, time, mode, metric, drop_opt, perm_method, resample=''):
    """Build the netCDF filename for a saved permutation-importance result.

    Relies on the module-level ``path`` and ``atype`` globals.
    """
    return join(path, f'permutation_importance_{atype}_{target}_{time}_{mode}_{metric}{drop_opt}{perm_method}{resample}.nc')

explainer = InterpretToolkit() #X=X,y=y)

# Load the permutation-importance results for every target.
results =[]
for target in targets:
    fname = get_fnames(target, time, mode, metric, drop_opt, perm_method, resample)
    results.append(explainer.load(fname))

# NOTE(review): 'estimator_output' elsewhere in this file holds
# 'probability'; storing a model-name list here looks suspect — confirm.
results[0].attrs['estimator_output'] = ['LogisticRegression']

# Pull the feature list from a sample WoFS input file; drop the
# "matched" target columns and append 'Run Date'.
fname = '/work/mflora/ML_DATA/INPUT_DATA/20180501/PROBABILITY_OBJECTS_20180501-2330_10.nc'
ds = xr.open_dataset(fname)
features = [ var for var in list(ds.data_vars) if 'matched' not in var] + ['Run Date']

readable_feature_names = {feature: to_readable_names([feature])[0] for feature in features}
feature_colors = {feature: to_readable_names([feature])[1] for feature in features}
Beispiel #14
0
def _load_test_data(base_vars_to_drop=base_vars_to_drop,
                    return_info=None,
                    **parameters):
    """
    Load the testing dataframe for a given lead time and target.

    Parameters
    ----------
    base_vars_to_drop : list
        Columns that are always dropped (defaults to the module-level
        ``base_vars_to_drop``).
    return_info : bool, optional
        If truthy, also return the metadata ('info') dataframe.
    **parameters
        Expects 'time', 'target', and 'drop_opt'; 'model_name' is optional
        and only used by some drop options.

    Returns
    -------
    (examples, target_values) or, when ``return_info`` is truthy,
    (examples, target_values, info).
    """
    io = IO()
    time = parameters['time']
    target = parameters['target']
    drop_opt = parameters['drop_opt']
    model_name = parameters.get('model_name', None)

    # Each drop option maps to a pickled list of columns (produced by an
    # earlier correlation- or L1-based feature-selection step) that is
    # appended to the baseline drop list.
    path = '/home/monte.flora/wofs_ml/interpret/correlation_filtering_results/'
    if drop_opt == '_drop_high_corr_pred':
        fname = f'correlated_features_to_drop_{time}_{target}.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop

    elif drop_opt == '_drop_0.8_corr_pred':
        fname = f'correlated_features_to_drop_{time}_{target}_0.8.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop

    elif drop_opt == '_manual_drop_0.9_corr':
        fname = f'correlated_features_to_drop_{time}_{target}_0.9_manual_drop_time_max_spatial_mean.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop

        # Additionally drop the time-max spatial-mean features.
        fname = f'time_max_spatial_mean_features.pkl'
        with open(join(path, fname), 'rb') as fp:
            add_columns_to_drop = pickle.load(fp)

        vars_to_drop += add_columns_to_drop

    elif drop_opt == '_manual_drop_0.8_corr':
        fname = f'correlated_features_to_drop_{time}_{target}_0.8_manual_drop_time_max_spatial_mean.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop

        # Additionally drop the time-max spatial-mean features.
        fname = f'time_max_spatial_mean_features.pkl'
        with open(join(path, fname), 'rb') as fp:
            add_columns_to_drop = pickle.load(fp)

        vars_to_drop += add_columns_to_drop

    elif '_manual_drop_time_max_spatial_mean' in drop_opt:
        fname = f'time_max_spatial_mean_features.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop

    elif drop_opt == '_drop_irrelevant_features':
        # Model-specific drop list; requires 'model_name' in parameters.
        fname = f'irrelevant_features_to_drop_{time}_{target}_{model_name}.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop

    elif drop_opt == '_drop_object_morph_pred':
        # Drop the object-morphology predictors.
        object_pred = ['area', 'minor_axis_length', 'major_axis_length']
        vars_to_drop = base_vars_to_drop + object_pred

    elif 'L1_based_feature_selection' in drop_opt and 'manual' not in drop_opt and 'aggres' not in drop_opt:
        # Plain L1-based selection (the guard explicitly excludes the
        # 'manual' and 'aggressive' variants handled below).
        path = '/home/monte.flora/wofs_ml/interpret/L1_based_features'
        fname = f'L1_based_features_to_drop_{time}_{target}.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = list(pickle.load(fp))
        # 'Run Date' is deliberately removed from the drop list (it is
        # appended to feature lists elsewhere in this file).
        if 'Run Date' in columns_to_drop:
            columns_to_drop.remove('Run Date')
        vars_to_drop = base_vars_to_drop + columns_to_drop

    elif 'L1_based_feature_selection_aggressive' in drop_opt:
        path = '/home/monte.flora/wofs_ml/interpret/L1_based_features'
        # NOTE(review): 'aggresive' (sic) — presumably matches the saved
        # filename on disk; verify before "fixing" the spelling.
        fname = f'L1_based_features_to_drop_{time}_{target}aggresive.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = list(pickle.load(fp))
        if 'Run Date' in columns_to_drop:
            columns_to_drop.remove('Run Date')
        vars_to_drop = base_vars_to_drop + columns_to_drop

    elif 'L1_based_feature_selection_with_manual' in drop_opt:
        path1 = '/home/monte.flora/wofs_ml/interpret/L1_based_features'
        fname = f'L1_based_features_to_drop_{time}_{target}_manual_drop_time_max_spatial_mean.pkl'
        with open(join(path1, fname), 'rb') as fp:
            columns_to_drop1 = list(pickle.load(fp))
        if 'Run Date' in columns_to_drop1:
            columns_to_drop1.remove('Run Date')

        fname = f'time_max_spatial_mean_features.pkl'
        # NOTE(review): reads from the correlation-filtering `path`, not
        # `path1` — confirm this is intentional.
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop2 = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop1 + columns_to_drop2

    else:
        # Unrecognized drop options (including '_only_important_pred',
        # handled after loading) fall back to the baseline drop list.
        vars_to_drop = base_vars_to_drop

    # LOAD DATA
    print(f'Loading {time} {target} data...(from _load_test_data)')
    fname = join(config.ML_DATA_STORAGE_PATH,
                 f'{time}_testing_matched_to_{target}_0km_dataset.pkl')
    test_data = io.load_dataframe(fname=fname,
                                  target_vars=[
                                      'matched_to_tornado_0km',
                                      'matched_to_severe_hail_0km',
                                      'matched_to_severe_wind_0km'
                                  ],
                                  vars_to_drop=vars_to_drop)

    examples = test_data['examples']
    target_values = test_data[f'matched_to_{target}_0km'].values
    # Optionally restrict the examples to the most important predictors
    # from a prior permutation-importance run.
    if drop_opt == '_only_important_pred':
        path = '/work/mflora/ML_DATA/permutation_importance/'
        if 'Log' in model_name:
            tag = '_drop_high_corr_pred'
        else:
            tag = ''
        fname = join(
            path,
            f'permutation_importance_{model_name}_{target}_{time}_training_norm_aupdc{tag}.pkl'
        )
        perm_imp_results = load_pickle([fname])
        myInterpreter = InterpretToolkit(model=[None])
        myInterpreter.set_results(perm_imp_results,
                                  option='permutation_importance')
        important_vars = myInterpreter.get_important_vars(perm_imp_results,
                                                          multipass=True)
        important_vars += ['Run Date']

        examples = examples[important_vars]

    if return_info:
        info = test_data['info']
        return examples, target_values, info
    else:
        return examples, target_values
    # NOTE(review): unreachable — both branches above return, and `data`
    # is never defined in this function.
    return data


# Convert raw SHAP values into a feature-importance ranking and plot it.
# Drop the "matched" target columns; keep 'Run Date' as a feature.
features = [var for var in data['X'].columns if 'matched' not in var]
features.append('Run Date')

# Display-name and color lookups, built in a single pass.
display_feature_names = {}
feature_colors = {}
for feature in features:
    display_feature_names[feature] = to_readable_names([feature])[0]
    feature_colors[feature] = to_readable_names([feature])[1]

explainer = InterpretToolkit(estimator_names=model_name,
                             estimator_output='probability')
results = shap_values_to_importance(shap_values,
                                    estimator_name=model_name,
                                    X=data['X'])

# Column label for the importance metric shown on the plot.
columns = [r'$\sigma$(SHAP)']

fig = explainer.plot_importance(data=results,
                                method='shap',
                                display_feature_names=display_feature_names,
                                feature_colors=feature_colors,
                                columns=columns)

fname = f'shap_importance_{model_name}_{target}_{time}_{drop_opt}.png'
base_plot.save_figure(fig=fig, fname=fname)