Ejemplo n.º 1
0
    ale_results = []
    for i, model_name in enumerate(model_set):
        parameters['model_name'] = model_name
        calibrated_pipeline = _load_model(**parameters)
        model = calibrated_pipeline.calibrated_classifiers_[0].base_estimator

        path = '/work/mflora/ML_DATA/permutation_importance/'
        fnames = [
            join(
                path,
                f'permutation_importance_{model_name}_{target}_{time}_training_norm_aupdc{drop_opt}.pkl'
            )
        ]
        perm_imp_results = load_pickle(fnames)
        myInterpreter = InterpretToolkit(model=[None])
        myInterpreter.set_results(perm_imp_results,
                                  option='permutation_importance')
        important_vars = myInterpreter.get_important_vars(
            perm_imp_results,
            multipass=True,
            combine=False,
        )[model_name][:9]

        important_vars = ['srh_0to1_ens_mean_spatial_mean']

        examples_transformed, target_values_transformed = just_transforms(
            model, examples, target_values)
        myInterpreter = InterpretToolkit(model=[model.steps[-1][1]],
                                         model_names=[model_name],
                                         examples=examples_transformed,
Ejemplo n.º 2
0
    """
    if resample == 'default':
        resample = resample_dict[time][target][model_name]
    else:
        if resample == 'under':
            model_names = model_name + '_under'
        else:
            model_names = model_name
    resample_method = resample_dict[time][target][model_name]
    return join(
        path,
        f'perm_based_interaction_results_{model_name}_{target}_{time}{drop_opt}{resample_method}.nc'
    )


# Load the saved results: one entry is appended to ``results`` per target,
# each built from the files of every model in ``ml_models``.
myInterpreter = InterpretToolkit(examples=examples, targets=target_values)
results = []
for target in targets:
    fnames = [
        get_fnames(model_name, target, time, drop_opt)
        for model_name in ml_models
    ]
    results.append(myInterpreter.load_results(fnames))

# Only the variable names are read from this sample file, so open it in a
# context manager: the file handle is released as soon as the names have
# been collected instead of staying open for the rest of the script.
fname = '/work/mflora/ML_DATA/INPUT_DATA/20180501/PROBABILITY_OBJECTS_20180501-2330_10.nc'
with xr.open_dataset(fname) as ds:
    # Skip every variable whose name contains 'matched'.
    features = [var for var in ds.data_vars if 'matched' not in var]
# 'Run Date' is not a data variable of the file, so it is appended by hand.
features.append('Run Date')

readable_feature_names = {
    feature: to_readable_names([feature])[0]
Ejemplo n.º 3
0
                                       columns=original_feature_names)

    if model_name == "LogisticRegression":
        normalize_method = 'standard'

    if normalize_method != None:
        unnormalize = UnNormalize(model.steps[1][1], feature_names)
        feature_values = unnormalize._full_inverse_transform(
            new_examples_subset)
    else:
        unnormalize = None
        feature_values = new_examples_subset.values

    myInterpreter = InterpretToolkit(model=[model.steps[-1][1]],
                                     model_names=[model_name],
                                     examples=original_examples_subset,
                                     targets=target_subset,
                                     feature_names=feature_names)

    background_dataset = shap.sample(examples, 200)
    shap_values, bias = myInterpreter.calc_shap(
        background_dataset=background_dataset)

    data = {
        'shap_values': shap_values,
        'bias': bias,
        'examples': original_examples_subset,
        'targets': target_subset,
        'feature_names': feature_names,
        'feature_values': feature_values
    }
                       shear_u,
                       shear_v,
                       cin,
                       srh_constant=150.)

        stp_examples = examples_transformed[stp > 2]
        indices = stp_examples.index.values
        stp_target_values = target_values_transformed[indices]
        stp_examples = stp_examples.values

        print(stp_examples.shape)

        base_clf = model.steps[-1][1]
        myInterpreter = InterpretToolkit(model=[base_clf],
                                         model_names=[model_name],
                                         examples=stp_examples,
                                         targets=stp_target_values,
                                         feature_names=feature_names)

        out_dict = myInterpreter.calc_contributions(
            method='shap',
            data_for_shap=examples_transformed,
            performance_based=True,
            n_examples=250,
            shap_sample_size=200)

        results.append(out_dict)

    myInterpreter.model_names = model_set
    results = merge_nested_dict(results)
    myInterpreter.set_results(results, option='contributions')
Ejemplo n.º 5
0
        'target': target,
        'resample': resample_method,
        'normalize': normalize_method,
        'imputer': imputer_method,
        'drop_opt': drop_opt,
        'model_name': model_name
    }

    parameters['model_name'] = model_name
    calibrated_pipeline = _load_model(**parameters)['model']

    model_names = [
        model_name
    ]  #[model_name+'_under'] if resample == 'under' model_names = [model_name]
    myInterpreter = InterpretToolkit(
        models=calibrated_pipeline,
        model_names=model_names,
    )

    fnames = join(
        ale_path,
        f'pd_results_{model_name}_{target}_{time}{drop_opt}{resample_method}.nc'
    )
    pd_1d = myInterpreter.load_results(fnames=fnames)

    fnames = join(
        ale_path,
        f'pd_2d_results_{model_name}_{target}_{time}{drop_opt}{resample_method}.nc'
    )
    pd_2d = myInterpreter.load_results(fnames=fnames)

    # Load the permutation important results from the saved pickle file
    print('First load of the data...')
    examples, target_values = _load_train_data(**parameters)
    feature_names = list(examples.columns)
    feature_names.remove('Run Date')

    ale_results = []
    for i, model_name in enumerate(model_set):
        parameters['model_name'] = model_name
        calibrated_pipeline = _load_model(**parameters)
        model = calibrated_pipeline.calibrated_classifiers_[0].base_estimator

        examples_transformed, target_values_transformed = just_transforms(
            model, examples, target_values)
        myInterpreter = InterpretToolkit(model=[model.steps[-1][1]],
                                         model_names=[model_name],
                                         examples=examples_transformed,
                                         targets=target_values_transformed,
                                         feature_names=feature_names)

        result_dict = myInterpreter.calc_ale(
            features=feature_names,
            nbootstrap=1,
            subsample=0.6,
            njobs=40,
            nbins=30,
        )
        ale_std = []
        for f in feature_names:
            ale_std.append(np.std(result_dict[f][model_name]['values'],
                                  ddof=1))
    print(display_feature_names)

    display_feature_names = _fix_long_names(display_feature_names)
    feature_units = {f: get_units(f) for f in important_vars}

    ale_results = []
    for i, model_name in enumerate(model_set):
        parameters['model_name'] = model_name
        calibrated_pipeline = _load_model(**parameters)
        model = calibrated_pipeline.calibrated_classifiers_[0].base_estimator

        examples_transformed, target_values_transformed = just_transforms(
            model, examples, target_values)
        myInterpreter = InterpretToolkit(model=[model.steps[-1][1]],
                                         model_names=[model_name],
                                         examples=examples_transformed,
                                         targets=target_values_transformed,
                                         feature_names=feature_names)

        njobs = 1 if model_name == 'XGBoost' else len(important_vars)

        if normalize_method != None:
            unnormalize_func = UnNormalize(model.steps[1][1], feature_names)
        else:
            unnormalize_func = None

        result_dict = myInterpreter.calc_ale(
            features=important_vars,
            nbootstrap=100,
            subsample=1.0,
            njobs=njobs,
Ejemplo n.º 8
0
        f'ale_2d_results_{model_name}_{target}_{time}{drop_opt}{resample_method}.nc'
    )


########################################
# Read the variable names from one sample input file; the dataset is closed
# again as soon as the names have been collected.
fname = '/work/mflora/ML_DATA/INPUT_DATA/20180501/PROBABILITY_OBJECTS_20180501-2330_10.nc'
ds = xr.open_dataset(fname)
# Drop every variable whose name contains 'matched' and add 'Run Date'.
features = [var for var in list(ds.data_vars) if 'matched' not in var
            ] + ['Run Date']
ds.close()

# Map each raw feature name to its display name, then post-process the
# mapping with _fix_long_names.
display_feature_names = {f: to_readable_names([f])[0] for f in features}
display_feature_names = _fix_long_names(display_feature_names)
###feature_units = {f: get_units(f)for f in features}

# Load the saved results for every model in ``model_names``.
myInterpreter = InterpretToolkit()
fnames = [get_fnames(m, target, time, drop_opt) for m in model_names]

results = myInterpreter.load_results(fnames=fnames)
# Use the ranking stored under the LogisticRegression key as the feature list.
feature_names = results[
    'ale_variance_interactions_rankings__LogisticRegression'].values

# Keep only the first three entries of that ranking.
feature_names = feature_names[:3]
# Load the second-order (2D) ALE files for the same models.
fnames = [get_2d_ale(m, target, time, drop_opt) for m in model_names]
ale_data = myInterpreter.load_results(fnames=fnames)

# Each ranking entry is split on '__' into a tuple.
# NOTE(review): presumably each entry is 'featureA__featureB' - confirm.
feature_names = [tuple(f.split('__')) for f in feature_names]

print(feature_names)

fig, axes = myInterpreter.plot_ale(
Ejemplo n.º 9
0
        models = [calibrated_pipeline]
        if resample_method == 'under':
            examples_transformed, target_values_transformed = rus.fit_resample(
                examples, target_values)
        else:
            examples_transformed = examples
            target_values_transformed = target_values
    else:
        models = [model.steps[-1][1]]
        examples_transformed, target_values_transformed = just_transforms(
            model, examples, target_values)
        feature_names.remove('Run Date')

    myInterpreter = InterpretToolkit(models=models,
                                     model_names=model_names,
                                     examples=examples_transformed,
                                     targets=target_values_transformed,
                                     feature_names=feature_names)

    results = myInterpreter.calc_ice(
        features=feature_names,
        n_bootstrap=1,
        subsample=200,
        n_jobs=njobs,
        n_bins=35,
    )

    results_fname = join(
        ale_path,
        f'ice_results_{model_name}_{resample_method}_{target}_{time}{drop_opt}{calibrate}.nc'
    )
Ejemplo n.º 10
0
path = '/work/mflora/ML_DATA/SHAP_VALUES'

iterator = itertools.product(model_set, target_set, time_set, drop_opt_set)
for combo in iterator:
    model_name, target, time,  drop_opt = combo

    fname= join(path, f'shap_values_{model_name}_{target}_{time}{drop_opt}.pkl')
    with open(fname, 'rb') as pkl_file:
        data = pickle.load(pkl_file)

    important_vars = load_important_vars(target, time, drop_opt)

    myInterpreter = InterpretToolkit(model=[None],
                            model_names = [model_name],
                            examples=data['examples'],
                            targets=data['targets'],
                            feature_names=data['feature_names'],
                            model_output='probability')

    feature_names = data['feature_names']
    display_feature_names = {f: to_readable_names([f])[0] for f in feature_names} 
    display_feature_names = _fix_long_names(display_feature_names)
    display_feature_names  = [display_feature_names[f] for f in feature_names]
   
    targets = data['targets']
    climo = 0.5
    predictions = climo + np.sum(data['shap_values'], axis=1)
    
    diff = (targets-predictions)
    data1 = {'targets': targets, 'predictions': predictions, 'diff': diff}
    df = pd.DataFrame(data1)
Ejemplo n.º 11
0
num_vars_to_plot = 15
figsize = (12, 8)  #(8,5)


def get_fnames(model_name, target, time, mode, metric, drop_opt, resample=''):
    """Return the path of a permutation-importance results file.

    The file name is assembled from the arguments and joined onto the
    module-level ``path`` directory.

    Parameters
    ----------
    model_name, target, time, mode, metric :
        Name components, joined with underscores in that order.
    drop_opt :
        Suffix appended directly after ``metric`` (no separator is added).
    resample :
        Suffix appended directly after ``drop_opt``; empty by default.

    Returns
    -------
    str
        ``path`` joined with the resulting ``.nc`` file name.
    """
    basename = f'permutation_importance_{model_name}_{target}_{time}_{mode}_{metric}{drop_opt}{resample}.nc'
    return join(path, basename)


ylabels = ['Severe Wind']

myInterpreter = InterpretToolkit()
results = []

# Load the RandomForest severe-wind results for both resample suffixes
# ('under' and the empty suffix).
fnames = [
    get_fnames('RandomForest', 'severe_wind', time, mode, metric, drop_opt, r)
    for r in ['under', '']
]
myInterpreter.load_results(fnames)

# Only the variable names are read from this sample file, so open it in a
# context manager: the file handle is released as soon as the names have
# been collected instead of staying open for the rest of the script.
fname = '/work/mflora/ML_DATA/INPUT_DATA/20180501/PROBABILITY_OBJECTS_20180501-2330_10.nc'
with xr.open_dataset(fname) as ds:
    # Skip every variable whose name contains 'matched'.
    features = [var for var in ds.data_vars if 'matched' not in var]
# 'Run Date' is not a data variable of the file, so it is appended by hand.
features.append('Run Date')

readable_feature_names = {
    feature: to_readable_names([feature])[0]
Ejemplo n.º 12
0
########################################\
# Every entry of ``combos`` is unpacked below into
# (model_name, target, resample_method, normalize_method, time).
combos = pipeline_set.pipeline_set
drop_opt = ''
imputer_method = 'simple'

# Directory that holds both the ALE input files and the variance outputs.
ale_path = '/work/mflora/ML_DATA/ALE_RESULTS'

# Reference point for the elapsed-time report after the loop.
start_time = datetime.datetime.now()

for combo in combos:
    model_name, target, resample_method, normalize_method, time = combo
    # Previously saved ALE results for this combination.
    results_fname = join(
        ale_path,
        f'ale_results_{model_name}_{target}_{time}{drop_opt}{resample_method}.nc'
    )
    # A fresh toolkit per combination; the return value of load_results is
    # not kept, only the toolkit object is used afterwards.
    myInterpreter = InterpretToolkit()
    myInterpreter.load_results(results_fname)

    results = myInterpreter.calc_ale_variance(model_names=model_name)
    # Any '_under' marker is stripped from the model name in the output file
    # name; the resample method is written as its own component instead.
    save_fname = join(
        ale_path,
        f'ale_var_results_{model_name.replace("_under", "")}_{resample_method}_{target}_{time}{drop_opt}.nc'
    )

    print(f'Saving {save_fname}...')
    myInterpreter.save_results(fname=save_fname, data=results)

duration = datetime.datetime.now() - start_time
seconds = duration.total_seconds()
hours = seconds // 3600
minutes = (seconds % 3600) // 60
Ejemplo n.º 13
0
    #if calibrate != '' :
    #    models=[calibrated_pipeline]
    #    if resample == 'under':
    #        examples_transformed, target_values_transformed = rus.fit_resample(examples, target_values)
    #    else:
    #        examples_transformed = examples
    #        target_values_transformed = target_values
    #else:
    #    models=[model.steps[-1][1]]
    #    examples_transformed, target_values_transformed = just_transforms(model, examples, target_values)
    #    feature_names.remove('Run Date')

    myInterpreter = InterpretToolkit(models=models,
                                     model_names=model_names,
                                     examples=examples,
                                     targets=targets,
                                    )

    results = myInterpreter.calc_permutation_importance(n_vars=n_vars, evaluation_fn=metric, 
            subsample=subsample, n_jobs=njobs, n_bootstrap=nbootstrap, verbose=True, perm_method=method)
    
    results_fname = join(perm_imp_path, f'permutation_importance_{model_name}_{target}_{time}_{data_mode}_{metric}{drop_opt}{resample}{calibrate}{method}.nc')

    print(f'Saving {results_fname}...')
    myInterpreter.save_results(fname=results_fname, data=results)

    duration =  datetime.datetime.now() - start_time
    seconds = duration.total_seconds()
    hours = seconds // 3600
    minutes = (seconds % 3600) // 60
Ejemplo n.º 14
0
""" usage: stdbuf -oL python compute_ale.py 2 > & log_compute_ale & """

########################################
# USER-DEFINED PARAMETERS              #
########################################
combos = pipeline_set.pipeline_set

drop_opt = ''
imputer_method = 'simple'

ale_path = '/work/mflora/ML_DATA/ALE_RESULTS'
start_time = datetime.datetime.now()
for combo in combos:
    model_name, target, resample_method, normalize_method, time = combo
    results_fname = join(ale_path, f'ale_2d_results_{model_name}_{target}_{time}{drop_opt}{resample_method}.nc')
    myInterpreter = InterpretToolkit()
    results_2d = myInterpreter.load_results(results_fname)

    #if resample_method == 'under':
    #    model_name +='_under'

    # Load the permutation important results from the saved pickle file
    with open(f'IMPORTANT_FEATURES_ALL_MODELS_{target}_{time}.pkl', 'rb') as pkl_file:
        important_vars = pickle.load(pkl_file)

    features = list(itertools.combinations(important_vars, r=2))

    results = myInterpreter.calc_ale_variance(ale_data=results_2d, features=features, interaction=True) 
    results_fname = join(ale_path, f'ale_interaction_results_{model_name.replace("_under", "")}_{resample_method}_{target}_{time}{drop_opt}.nc')

    print(f'Saving {results_fname}...')
Ejemplo n.º 15
0
    return {
        'singlepass_scores': np.array(singlepass_scores),
        'singlepass_ranking': np.array(singlepass_ranking),
        'multipass_scores': np.array(multipass_scores),
        'multipass_ranking': np.array(multipass_ranking),
        'original_score': original_score
    }


#aupdc = make_scorer(score_func=average_precision_score, greater_is_better=True,needs_proba=True,)
#results = permutation_importance(rf, X, y, scorer=aupdc, n_vars=10, n_jobs=50)

from mintpy.mintpy.main.interpret_toolkit import InterpretToolkit
# Wrap the single model ``rf`` together with its data (X, y) in the toolkit.
myInterpreter = InterpretToolkit(model=[rf],
                                 model_names=['Random Forest'],
                                 examples=X,
                                 targets=y)

# Time the permutation-importance computation.
start_time = datetime.datetime.now()

results = myInterpreter.calc_permutation_importance(n_vars=10,
                                                    evaluation_fn='auprc',
                                                    subsample=1.0,
                                                    njobs=50,
                                                    nbootstrap=10)

# Break the elapsed time down into whole hours, whole minutes and the
# remaining seconds. ``seconds`` is reassigned last because the hour and
# minute values are derived from the total.
duration = datetime.datetime.now() - start_time
seconds = duration.total_seconds()
hours = seconds // 3600
minutes = (seconds % 3600) // 60
seconds = seconds % 60
Ejemplo n.º 16
0
# Suffix passed to the file-name helpers below.
calibrate = 'calibrated'

########################################
# Features to plot, taken from get_top_features for the 'training' /
# 'norm_aupdc' results with an empty resample suffix.
feature_names = get_top_features(model_name,
                                 target,
                                 time,
                                 'training',
                                 'norm_aupdc',
                                 drop_opt,
                                 resample='')
# Map each raw feature name to its display name, then post-process the
# mapping with _fix_long_names.
display_feature_names = {f: to_readable_names([f])[0] for f in feature_names}
display_feature_names = _fix_long_names(display_feature_names)
feature_units = {f: get_units(f) for f in feature_names}

# No un-normalization function is used for this plot.
unnormalize_func = None
myInterpreter = InterpretToolkit()
# First load: the return value is discarded, only the toolkit is used later.
fnames = get_fnames(model_name, target, time, resample_method, drop_opt,
                    calibrate)
myInterpreter.load_results(fnames=fnames)

# Second load: the ICE results are kept in ``ice_dict``.
# NOTE(review): both loads go through the same toolkit instance - confirm
# that the second load_results call does not replace what the first stored.
ice_fnames = get_ice_fnames(model_name, target, time, resample_method,
                            drop_opt, calibrate)
ice_dict = myInterpreter.load_results(fnames=ice_fnames)

fig, axes = myInterpreter.plot_ale(
    features=feature_names[:10],
    display_feature_names=display_feature_names,
    display_units=feature_units,
    title=f'{plt_config.title_dict[target]} {time.replace("_", " ").title()}',
    unnormalize=unnormalize_func,
    ice_curves=ice_dict,
Ejemplo n.º 17
0
               ]  #['XGBoost', 'LogisticRegression', 'RandomForest']
# Settings selecting which saved results are loaded.
target = 'severe_hail'
time = 'first_hour'
drop_opt = ''

#feature_names = ['lcl_ml_ens_mean_spatial_mean',
#       'shear_u_0to1_ens_mean_spatial_mean']

feature_names = [
    'hailcast_time_max_ens_mean_of_90th', 'w_up_time_max_ens_mean_of_90th',
    'uh_2to5_time_max_ens_mean_of_90th', 'shear_v_0to6_ens_mean_spatial_mean',
    'cape_ml_ens_mean_spatial_mean', 'temperature_700mb_ens_mean_spatial_mean',
    'major_axis_length', 'divergence_10m_time_min_ens_mean_of_10th'
]

# Load the saved results for every model in ``model_names``.
myInterpreter = InterpretToolkit()
fnames = [get_fnames(m, target, time, drop_opt) for m in model_names]

print(fnames)

results = myInterpreter.load_results(fnames=fnames)

# Two LogisticRegression ALE entries are looked up by key.
# NOTE(review): the first key uses 'ens_std_of_90th', while feature_names
# above lists 'w_up_time_max_ens_mean_of_90th' - confirm the key is intended.
data1 = results['w_up_time_max_ens_std_of_90th__LogisticRegression__ale']
data2 = results[
    'temperature_700mb_ens_mean_spatial_mean__LogisticRegression__ale']

# Prints the mean over the per-row sample standard deviations (ddof=1,
# axis=1) of the 700 mb temperature ALE values.
# NOTE(review): the label says "variance" but np.std is what is computed.
print(
    f'700 mb Temp ALE variance: {np.mean(np.std(results[f"temperature_700mb_ens_mean_spatial_mean__LogisticRegression__ale"].values, ddof=1, axis=1)): .5f}'
)

#########################################
Ejemplo n.º 18
0
import itertools

# Experiment grid: every (target, time) pair is processed once, and each
# pass reads the permutation-importance results of all models in model_set.
targets = ['tornado', 'severe_wind', 'severe_hail']
times = ['first_hour', 'second_hour']
drop_opt = '_manual_drop_time_max_spatial_mean'  #'_drop_high_corr_pred'
model_set = ['RandomForest', 'XGBoost', 'LogisticRegression']

iterator = itertools.product(targets, times)

for target, time in iterator:
    path = '/work/mflora/ML_DATA/permutation_importance/'

    # One results pickle per model for the current target/time pair.
    fnames = [
        join(path, f'permutation_importance_{model_name}_{target}_{time}_training_norm_aupdc{drop_opt}.pkl')
        for model_name in model_set
    ]
    perm_imp_results = load_pickle(fnames)

    # No fitted model is passed; the toolkit only receives the loaded results.
    myInterpreter = InterpretToolkit(model=[None])
    myInterpreter.set_results(
        perm_imp_results,
        option='permutation_importance',
    )
    important_vars = myInterpreter.get_important_vars(
        perm_imp_results,
        multipass=True,
        combine=True,
        nvars=9,
    )

    # Written to the current working directory.
    # NOTE(review): drop_opt already starts with '_', so the file name has a
    # double underscore before it - confirm readers expect that name.
    fname = f'important_vars_all_models_{target}_{time}_{drop_opt}.pkl'
    with open(fname, 'wb') as pkl_file:
        pickle.dump(important_vars, pkl_file)
Ejemplo n.º 19
0
        'imputer': imputer_method,
        'drop_opt': drop_opt,
        'model_name': model_name
    }

    print('First load of the data...')
    examples, target_values = _load_train_data(**parameters)

    parameters['model_name'] = model_name
    calibrated_pipeline = _load_model(**parameters)['model']
    model_names = [
        model_name
    ]  #[model_name+'_under'] if resample == 'under' model_names = [model_name]
    myInterpreter = InterpretToolkit(
        models=[calibrated_pipeline],
        model_names=model_names,
        examples=examples,
        targets=target_values,
    )

    # Load the permutation important results from the saved pickle file
    with open(f'IMPORTANT_FEATURES_ALL_MODELS_{target}_{time}.pkl',
              'rb') as pkl_file:
        important_vars = pickle.load(pkl_file)

    features = list(itertools.combinations(important_vars, r=2))
    results = myInterpreter.calc_ale(
        features=features,
        n_bootstrap=n_bootstrap,
        subsample=subsample,
        n_jobs=njobs,
        n_bins=n_bins,
Ejemplo n.º 20
0
        'imputer': imputer_method,
        'drop_opt': drop_opt,
        'model_name': model_name
    }

    print('First load of the data...')
    examples, target_values = _load_train_data(**parameters)

    parameters['model_name'] = model_name
    calibrated_pipeline = _load_model(**parameters)['model']
    model_names = [
        model_name
    ]  #[model_name+'_under'] if resample == 'under' model_names = [model_name]
    myInterpreter = InterpretToolkit(
        models=[calibrated_pipeline],
        model_names=model_names,
        examples=examples,
        targets=target_values,
    )

    # Load the permutation important results from the saved pickle file
    with open(f'IMPORTANT_FEATURES_ALL_MODELS_{target}_{time}.pkl',
              'rb') as pkl_file:
        important_vars = pickle.load(pkl_file)

    features = list(itertools.combinations(important_vars, r=2))
    results = myInterpreter.calc_interaction_rankings(
        features=features,
        evaluation_fn=norm_aupdc,
        n_bootstrap=n_bootstrap,
        subsample=subsample,
        n_jobs=njobs,
Ejemplo n.º 21
0
        model, examples, target_values)
    rus = RandomUnderSampler(random_state=42,
                             sampling_strategy={
                                 0: 500,
                                 1: 500
                             })
    examples_subset, target_subset = rus.fit_resample(
        examples_transformed,
        target_values_transformed,
    )

    #examples_subset = shap.sample(examples_transformed, 1000)

    myInterpreter = InterpretToolkit(model=[model.steps[-1][1]],
                                     model_names=[model_name],
                                     examples=examples_subset,
                                     targets=target_subset,
                                     feature_names=feature_names)

    display_feature_names = {
        f: to_readable_names([f])[0]
        for f in feature_names
    }
    display_feature_names = _fix_long_names(display_feature_names)
    feature_units = {f: get_units(f) for f in feature_names}

    date_subset = date_col[:len(examples_subset)].reshape(
        len(examples_subset), 1)

    examples_subset = np.concatenate((examples_subset, date_subset), axis=1)
    examples_subset = pd.DataFrame(examples_subset,
Ejemplo n.º 22
0
def get_fnames(model_name, target, time, drop_opt):
    """Return the path of an ALE-variance results file.

    The resample label is looked up in the module-level ``resample_dict``
    as ``resample_dict[time][target][model_name]`` and becomes part of the
    file name, which is joined onto the module-level ``path`` directory.

    Parameters
    ----------
    model_name, target, time :
        Name components; also the lookup keys into ``resample_dict``.
    drop_opt :
        Suffix appended directly after ``time`` (no separator is added).

    Returns
    -------
    str
        ``path`` joined with the resulting ``.nc`` file name.

    Raises
    ------
    KeyError
        If ``resample_dict`` has no entry for the given combination
        (when it is a plain nested dict).
    """
    resample = resample_dict[time][target][model_name]
    basename = f'ale_var_results_{model_name}_{resample}_{target}_{time}{drop_opt}.nc'
    return join(path, basename)


# NOTE(review): ``targets`` is passed to the toolkit as its targets and is
# also iterated below as the list of target names - confirm this is intended
# (a similar script passes ``target_values`` to the toolkit instead).
myInterpreter = InterpretToolkit(examples=examples, targets=targets)
# One entry is appended per target, in iteration order.
results = []
for target in targets:
    # One results file per model for the current target.
    fnames = [
        get_fnames(model_name, target, time, drop_opt)
        for model_name in ml_models
    ]
    results.append(myInterpreter.load_results(fnames))

readable_feature_names = {
    feature: to_readable_names([feature])[0]
    for feature in feature_names
}
feature_colors = {
    feature: to_readable_names([feature])[1]
    for feature in feature_names