Example No. 1
0
        'lcl_ml_ens_mean_spatial_mean',
        'w_up_time_max_ens_mean_of_90th']
    '''
    # NOTE(review): the same lapse-rate column is rescaled in both data['X'] and
    # X below — confirm these are distinct objects; if X aliases data['X'] the
    # column is divided by 2.67765 twice.
    # NOTE(review): 2.67765 looks like a unit-conversion factor for the mid-level
    # lapse rate — TODO confirm the source units.
    data['X']['mid_level_lapse_rate_ens_mean_spatial_mean'] = data['X']['mid_level_lapse_rate_ens_mean_spatial_mean'] / 2.67765 
    X['mid_level_lapse_rate_ens_mean_spatial_mean'] = X['mid_level_lapse_rate_ens_mean_spatial_mean'] / 2.67765

    # Load pre-computed permutation-importance results for this target/lead time.
    fname = join(perm_path, f'permutation_importance_all_models_{target}_{time}_training_{metric}{drop_opt}{perm_method}.nc')
    explainer = InterpretToolkit(X=data['X'],y=data['targets'],estimator_output='probability',) 
    perm_results = explainer.load(fname)
    #important_vars = perm_results['multipass_rankings__LogisticRegression'].values[:12]
    #important_vars = ['low_level_lapse_rate_ens_mean_spatial_mean']

    # Only the mid-level lapse rate is examined in this configuration; the full
    # single-pass ranking is still loaded to build the label/units lookups below.
    important_vars = ['mid_level_lapse_rate_ens_mean_spatial_mean']
    all_vars = perm_results['singlepass_rankings__LogisticRegression'].values

    # Map raw feature names to human-readable labels and units for plotting.
    display_feature_names = {f: to_readable_names([f])[0] for f in all_vars} 
    #display_feature_names = _fix_long_names(display_feature_names)
    feature_units = {f: get_units(f)for f in all_vars}

    # Choose how dependence plots are colored/overlaid:
    #   'interaction'            -> automatic interaction coloring, no target overlay
    #   'targets'                -> overlay observed targets, no interaction coloring
    #   'interaction_and_target' -> both
    #   anything else            -> same as 'targets'
    if option == 'interaction':
        interaction_index = 'auto'
        y = None
    elif option == 'targets':
        interaction_index=None
        y =data['targets']
    elif option == 'interaction_and_target':
        interaction_index = 'auto'
        y =data['targets']
    else:
        interaction_index=None
        y =data['targets']
    # NOTE(review): the closing tokens below belong to a call whose opening
    # parenthesis is not visible in this fragment.
                             })
    # Undersample to rebalance the classes before the interpretation step
    # (rus is presumably an imbalanced-learn RandomUnderSampler — TODO confirm).
    examples_subset, target_subset = rus.fit_resample(
        examples_transformed,
        target_values_transformed,
    )

    #examples_subset = shap.sample(examples_transformed, 1000)

    # Wrap the final fitted estimator for interpretation; model.steps[-1][1] is
    # the last step of what is presumably an sklearn Pipeline — TODO confirm.
    myInterpreter = InterpretToolkit(model=[model.steps[-1][1]],
                                     model_names=[model_name],
                                     examples=examples_subset,
                                     targets=target_subset,
                                     feature_names=feature_names)

    # Human-readable labels and units for plot annotation.
    display_feature_names = {
        f: to_readable_names([f])[0]
        for f in feature_names
    }
    display_feature_names = _fix_long_names(display_feature_names)
    feature_units = {f: get_units(f) for f in feature_names}

    # Re-attach the run date as a trailing column so examples can be grouped by
    # date downstream; only the first len(examples_subset) dates are kept.
    date_subset = date_col[:len(examples_subset)].reshape(
        len(examples_subset), 1)

    examples_subset = np.concatenate((examples_subset, date_subset), axis=1)
    examples_subset = pd.DataFrame(examples_subset,
                                   columns=original_feature_names)

    # If the pipeline normalized the data, invert the scaling to recover
    # physical feature values.
    # NOTE(review): feature_values is only bound when normalize_method is not
    # None — confirm downstream code handles the other case.
    if normalize_method != None:
        unnormalize = UnNormalize(model.steps[1][1], feature_names)
        feature_values = unnormalize._full_inverse_transform(examples_subset)
    print('First load of the data...')
    # Load the training examples and targets; 'Run Date' is excluded from the
    # predictor set.
    examples, target_values = _load_train_data(**parameters)
    feature_names = list(examples.columns)
    feature_names.remove('Run Date')

    # Fixed set of environment features to compute ALE curves for.
    important_vars = [
        'cape_ml_ens_mean_spatial_mean',
        'cin_ml_ens_mean_spatial_mean',
        'lcl_ml_ens_mean_spatial_mean',
        'shear_v_0to6_ens_mean_spatial_mean',
        'srh_0to3_ens_mean_spatial_mean',
    ]

    # {'w_up_time_max_ens_mean_of_90th': 'Updraft  ($\\mu_e$ of P$_{90}$ of max$_t$)', 'uh_2to5_time_max_ens_mean_of_90th': '2-5 km UH  ($\\mu_e$ of P$_{90}$ of max$_t$)', 'cape_ml_ens_mean_spatial_mean': 'ML CAPE  ($\\mu_e$)'}
    # Keep only the text before '(' and drop the trailing character (the space
    # preceding the parenthesized units).
    display_feature_names = {
        f: to_readable_names([f])[0].split('(')[0][:-1]
        for f in important_vars
    }

    print(display_feature_names)

    display_feature_names = _fix_long_names(display_feature_names)
    feature_units = {f: get_units(f) for f in important_vars}

    # Compute results per model: load each calibrated pipeline and unwrap the
    # base estimator from the CalibratedClassifierCV-style wrapper.
    ale_results = []
    for i, model_name in enumerate(model_set):
        parameters['model_name'] = model_name
        calibrated_pipeline = _load_model(**parameters)
        model = calibrated_pipeline.calibrated_classifiers_[0].base_estimator

        # NOTE(review): this call is truncated in the fragment — its argument
        # list continues outside the visible source.
        examples_transformed, target_values_transformed = just_transforms(
Example No. 4
0
# Load previously saved interpretation results, one file set per target.
myInterpreter = InterpretToolkit(examples=examples, targets=target_values)
results = []
for target in targets:
    fnames = [
        get_fnames(model_name, target, time, drop_opt)
        for model_name in ml_models
    ]
    results.append(myInterpreter.load_results(fnames))

# Derive the feature list from a sample input file: every non-'matched'
# variable plus the 'Run Date' metadata column.
# NOTE(review): ds is never closed in this fragment — confirm whether a
# ds.close() was dropped when the snippet was extracted.
fname = '/work/mflora/ML_DATA/INPUT_DATA/20180501/PROBABILITY_OBJECTS_20180501-2330_10.nc'
ds = xr.open_dataset(fname)
features = [var for var in list(ds.data_vars) if 'matched' not in var
            ] + ['Run Date']

# Human-readable labels and per-feature colors for plotting.
readable_feature_names = {
    feature: to_readable_names([feature])[0]
    for feature in features
}
feature_colors = {
    feature: to_readable_names([feature])[1]
    for feature in features
}

# Build labels for 2-D feature pairs; keys use the 'a__b' convention and map
# to 'A & B'. display_feature_names2 covers the reversed 'b__a' key order.
adict = readable_feature_names
display_feature_names1 = {
    f'{f[0]}__{f[1]}': f'{adict[f[0]]} & {adict[f[1]]}'
    for f in list(itertools.combinations(features, r=2))
}

# NOTE(review): this dict literal is truncated in the fragment — it continues
# outside the visible source.
display_feature_names2 = {
    f'{f[1]}__{f[0]}': f'{adict[f[0]]} & {adict[f[1]]}'
Example No. 5
0
# Load the most important variables
path = '/work/mflora/ML_DATA/permutation_importance'
perm_imp_fname = join(
    path,
    f'permutation_importance_all_models_{target}_{time}_training_{metric}{drop_opt}{perm_method}.nc'
)
explainer = InterpretToolkit()
perm_imp_results = explainer.load(perm_imp_fname)
# mode selects the ranking flavor (e.g. singlepass vs multipass — TODO confirm
# the valid values against the saved dataset's variable names).
important_vars = perm_imp_results[
    f'{mode}_rankings__LogisticRegression'].values

# Keep only the top n_vars features.
important_vars = important_vars[:n_vars]

# Convert to pretty feature names
readable_feature_names = {
    feature: to_readable_names([feature])[0] + f' ({get_units(feature)})'
    for feature in important_vars
}

# Reload the training data for the same target/lead-time configuration.
parameters = {
    'time': time,
    'target': target,
    'drop_opt': drop_opt,
}
X, y = _load_train_data(**parameters)

# One panel per important variable, up to 4 columns, shared y-axis.
# NOTE(review): this call is truncated in the fragment — its argument list
# continues outside the visible source.
n_panels = len(important_vars)
fig, axes = base_plt.create_subplots(n_panels,
                                     figsize=(10, 6),
                                     sharey=True,
                                     n_columns=4,
Example No. 6
0
    # Tail of a path-builder helper (its def line is outside this fragment):
    # look up the resampling suffix for this time/target/model combination and
    # return the 2-D ALE results filename under ale_path.
    resample_method = resample_dict[time][target][model_name]
    return join(
        ale_path,
        f'ale_2d_results_{model_name}_{target}_{time}{drop_opt}{resample_method}.nc'
    )


########################################
# Derive the feature list from a sample input file: every non-'matched'
# variable plus the 'Run Date' metadata column.
fname = '/work/mflora/ML_DATA/INPUT_DATA/20180501/PROBABILITY_OBJECTS_20180501-2330_10.nc'
ds = xr.open_dataset(fname)
features = [var for var in list(ds.data_vars) if 'matched' not in var
            ] + ['Run Date']
ds.close()

# Human-readable labels for plotting.
display_feature_names = {f: to_readable_names([f])[0] for f in features}
display_feature_names = _fix_long_names(display_feature_names)
###feature_units = {f: get_units(f)for f in features}

# Load the ALE-variance interaction rankings and keep the top 3 feature pairs.
myInterpreter = InterpretToolkit()
fnames = [get_fnames(m, target, time, drop_opt) for m in model_names]

results = myInterpreter.load_results(fnames=fnames)
feature_names = results[
    'ale_variance_interactions_rankings__LogisticRegression'].values

feature_names = feature_names[:3]
# Load the corresponding 2-D ALE results for each model.
fnames = [get_2d_ale(m, target, time, drop_opt) for m in model_names]
ale_data = myInterpreter.load_results(fnames=fnames)

# Ranking entries use the 'a__b' pair convention; split back into tuples.
feature_names = [tuple(f.split('__')) for f in feature_names]
    # NOTE(review): stray return line — tail of a path-builder helper from a
    # different snippet; its def line is not visible in this fragment.
    return join(path, f'permutation_importance_{atype}_{target}_{time}_{mode}_{metric}{drop_opt}{perm_method}{resample}.nc')

# Load permutation-importance results, one file per target.
explainer = InterpretToolkit() #X=X,y=y)

results =[]
for target in targets:
    fname = get_fnames(target, time, mode, metric, drop_opt, perm_method, resample)
    results.append(explainer.load(fname))

# NOTE(review): 'estimator_output' is set to a model-name list here, which
# looks like a workaround (or a mix-up with 'estimators'/'model_names') —
# confirm against the InterpretToolkit version in use.
results[0].attrs['estimator_output'] = ['LogisticRegression']

# Derive the feature list from a sample input file: every non-'matched'
# variable plus the 'Run Date' metadata column.
# NOTE(review): ds is never closed in this fragment.
fname = '/work/mflora/ML_DATA/INPUT_DATA/20180501/PROBABILITY_OBJECTS_20180501-2330_10.nc'
ds = xr.open_dataset(fname)
features = [ var for var in list(ds.data_vars) if 'matched' not in var] + ['Run Date']

# Human-readable labels and per-feature colors for the importance plot.
readable_feature_names = {feature: to_readable_names([feature])[0] for feature in features}
feature_colors = {feature: to_readable_names([feature])[1] for feature in features}

# No significance markers on this plot.
p_values=None

# NOTE(review): this call is truncated in the fragment — its argument list
# continues outside the visible source.
fig = explainer.plot_importance(
                             data=results, 
                             method=method,
                             display_feature_names=readable_feature_names, 
                             feature_colors=feature_colors, 
                             num_vars_to_plot=num_vars_to_plot,
                             rows = rows,
                             columns = columns,
                             plot_correlated_features=False,
                             estimator_names = ml_models,
                             p_values =p_values,