Example #1
import itertools
from os.path import join

drop_opt = 'L1_based_feature_selection_aggressive'
imputer_method = 'simple'
model_names = ['LogisticRegression']
time_set = ['first_hour']
target_set = ['tornado']

iterator = itertools.product(time_set, target_set)

for combo in iterator:
    time, target = combo
    parameters = {
        'time': time,
        'target': target,
        'drop_opt': drop_opt,
    }
    X, y = _load_train_data(**parameters)
    estimators = load_models(time, target, drop_opt, model_names)
    explainer = InterpretToolkit(estimators=estimators,
                                 estimator_names=model_names,
                                 X=X,
                                 y=y)

    # ale_results_all_models_tornado_first_hourL1_based_feature_selection_aggressive.nc
    fnames = join(ale_path,
                  f'ale_results_all_models_{target}_{time}{drop_opt}.nc')
    ale = explainer.load(fnames=fnames)
    results = explainer.interaction_strength(ale,
                                             n_bootstrap=10,
                                             subsample=0.1)

    print(results)
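Example #1 relies on helpers that are not defined in the snippet (_load_train_data, load_models, InterpretToolkit, ale_path). For orientation only, a minimal sketch of what a loader like load_models could look like, modeled on the file-loading pattern in Example #6 below; the simplified filename pattern and the use of config.ML_MODEL_SAVE_PATH are assumptions:

from os.path import join
import joblib
from wofs.util import config

def load_models(time, target, drop_opt, model_names):
    # hypothetical loader: read previously fitted models from disk,
    # one per model name (filename pattern here is simplified/assumed)
    estimators = []
    for model_name in model_names:
        fname = f'{model_name}_{time}_{target}{drop_opt}.pkl'
        estimators.append(joblib.load(join(config.ML_MODEL_SAVE_PATH, fname))['model'])
    return estimators
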
Example #2
target_set = ['severe_wind'] #['tornado', 'severe_hail', 'severe_wind']
drop_opt_set = ['L1_based_feature_selection_with_manual']
########################################

option = 'interaction_and_target'
path = '/work/mflora/ML_DATA/SHAP_VALUES'
metric = 'norm_aupdc'
perm_method = 'backward'
perm_path = '/work/mflora/ML_DATA/permutation_importance'

iterator = itertools.product(model_set, target_set, time_set, drop_opt_set)
for combo in iterator:
    model_name, target, time, drop_opt = combo

    parameters = {'time': time, 'target': target, 'drop_opt': drop_opt}
    X, y_full = _load_train_data(**parameters)

    X[f'matched_to_{target}_0km'] = y_full

    fname = join(path, f'shap_values_{model_name}_{target}_{time}{drop_opt}.pkl')
    with open(fname, 'rb') as pkl_file:
        data = pickle.load(pkl_file)

    '''
    important_vars = ['comp_dz_time_max_ens_mean_of_90th',
        'wz_0to2_time_max_ens_mean_of_90th',
        'cape_ml_ens_mean_spatial_mean',
        'shear_v_0to1_ens_mean_spatial_mean',
        'hailcast_time_max_ens_mean_of_90th',
        'major_axis_length',
        'uh_2to5_time_max_ens_mean_of_90th',
Example #3
iterator = itertools.product(model_set, time_set, pipeline_set, drop_opt_set)
for combo in iterator:
    model_name, time, pair, drop_opt = combo
    target, resample_method, normalize_method = pair
    parameters = {
        'time': time,
        'target': target,
        'resample': resample_method,
        'normalize': normalize_method,
        'imputer': imputer_method,
        'drop_opt': drop_opt,
        'model_name': model_name
    }
    print('First load of the data...')
    examples, target_values = _load_train_data(**parameters)
    feature_names = list(examples.columns)
    original_feature_names = copy.copy(feature_names)
    date_col = examples['Run Date'].values
    feature_names.remove('Run Date')

    #important_vars = load_important_vars(target, time, drop_opt)

    important_vars = [
        'mid_level_lapse_rate_ens_mean_spatial_mean',
    ]

    calibrated_pipeline = _load_model(**parameters)
    model = calibrated_pipeline.calibrated_classifiers_[0].base_estimator

    examples_transformed, target_values_transformed = just_transforms(
Example #4
for combo in combos:
    start_time = datetime.datetime.now()
    target, time = combo
    save_fname = join(
        path,
        f'shap_values_performance_{model_names[0]}_{target}_{time}{drop_opt}.pkl'
    )
    #if exists(save_fname):
    #    print(f'{save_fname} already exists!')
    #    continue
    parameters = {
        'time': time,
        'target': target,
        'drop_opt': drop_opt,
    }
    X, y, info = _load_train_data(return_info=True, **parameters)

    estimators = load_models(time, target, drop_opt, model_names)
    # Subsample time indices to reduce autocorrelations
    X_subset, y_subset = get_independent_samples(X, y, info)
    explainer = InterpretToolkit(estimators=estimators,
                                 estimator_names=model_names,
                                 X=X_subset.copy(),
                                 y=y_subset.copy())
    background_dataset = shap.sample(X, 100)

    results = explainer.local_contributions(
        method='shap',
        background_dataset=background_dataset,
        performance_based=True,
        n_samples=n_samples)
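get_independent_samples is not defined in any of these snippets; the inline comment only says it subsamples time indices to reduce autocorrelations. A minimal sketch under that assumption, keeping one randomly chosen time step per run date (treating info as a DataFrame-like object with a 'Run Date' column is an assumption):

import numpy as np
import pandas as pd

def get_independent_samples(X, y, info, date_col='Run Date', seed=42):
    # hypothetical helper: keep one randomly chosen row per run date so that
    # consecutive, temporally autocorrelated samples are not all retained
    rng = np.random.RandomState(seed)
    dates = pd.Series(np.asarray(info[date_col]))
    keep = dates.groupby(dates).sample(n=1, random_state=rng).index.to_numpy()
    return X.iloc[keep], np.asarray(y)[keep]
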
Example #5
targets = ['tornado', 'severe_hail', 'severe_wind']
drop_opt = 'aggresive'  # 'aggresive' (sic) must match the filename pattern below; alternative: '_manual_drop_time_max_spatial_mean'
count = 83 if 'manual' in drop_opt else 113

path = '/home/monte.flora/wofs_ml/interpret/L1_based_features'

parameters = {
    'time': 'first_hour',
    'target': 'tornado',
    'resample': None,
    'normalize': None,
    'imputer': None,
    'drop_opt': drop_opt,
    'model_name': None
}
examples, target_data, info = _load_train_data(**parameters, return_info=True)
feature_names = examples.columns.to_list()

for pair in itertools.product(times, targets):
    time, target = pair
    fname = f'L1_based_features_to_drop_{time}_{target}{drop_opt}.pkl'

    with open(join(path, fname), 'rb') as pkl_file:
        dropped_features = pickle.load(pkl_file)

    print(
        f'{time} {target}...Num of Features: {count - len(dropped_features)}')
    print(Diff(feature_names, dropped_features))

path1 = '/home/monte.flora/wofs_ml/interpret/correlation_filtering_results/'
fname = 'time_max_spatial_mean_features.pkl'
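The Diff helper printed in Example #5 is also undefined here; presumably it returns the entries of the first list that are absent from the second, i.e. the features that survive the drop list. A minimal sketch under that assumption:

def Diff(list1, list2):
    # hypothetical helper: items of list1 not present in list2
    return [item for item in list1 if item not in list2]
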
Example #6
    for model_name in ['RandomForest', 'XGBoost', 'LogisticRegression']:
        normalize_method = 'standard' if model_name == "LogisticRegression" else None
        resample_method = pipeline_set.resample_dict[time][target][model_name]
            
        model_fname = f'{model_name}_{time}_{target}_{resample_method}_{normalize_method}_{imputer_method}{drop_opt}{feature_selection_method}.pkl'
        clfs.append(joblib.load(join(config.ML_MODEL_SAVE_PATH, model_fname))['model'])
            
    return clfs

iterator = itertools.product(time_set, target_set)
for combo in iterator:
    time, target = combo

    # LOAD DATA
    parameters = {'time': time, 'target': target, 'drop_opt': drop_opt, 'model_name': None}
    X, y, info = _load_train_data(**parameters, return_info=True)
    fit_estimators = load_models(time, target, drop_opt)

    feature_names = X.columns.to_list()
    date_col_idx = feature_names.index('Run Date')
    cv = DateBasedCV(n_splits=5, date_col_idx=date_col_idx, y=y)

    stack_clf = StackingClassifier(
        estimators=fit_estimators,
        cv=cv,
        n_jobs=1,
    )
    stack_clf.fit(X, y)

    fname = f'StackedClassifier_{time}_{target}_simple{drop_opt}None.pkl'
    model_dict = {
Example #7
import matplotlib.pyplot as plt

from wofs.util import config
from wofs_ml.common.load_results import _load_train_data, _load_test_data, _load_model, just_transforms

parameters = {
    'time': 'first_hour',
    'target': 'tornado',
    'resample': 'under',
    'normalize': 'standard',
    'imputer': 'simple',
    'drop_opt': '',
    'model_name': None
}

print('First load of the data...')
train_examples, _ = _load_train_data(**parameters)
test_examples, _ = _load_test_data(**parameters)

predictor = ['lcl_ml_ens_mean_spatial_mean']

train_examples = train_examples[predictor].values
test_examples = test_examples[predictor].values

fig = plt.figure(figsize=(6, 6), dpi=300)
plt.hist(train_examples,
         bins='auto',
         color='lightgreen',
         alpha=0.8,
         rwidth=0.85,
         log=True)
plt.hist(test_examples,
Example #8
    print(len(stp_indices))
    print(len(non_stp_indices))

    return stp_indices, non_stp_indices

parameters = {
    'time': time,
    'target': target,
    'resample': None,
    'normalize': None,
    'imputer': imputer_method,
    'drop_opt': drop_opt,
    'model_name': None
}

examples, target_values, info = _load_train_data(return_info=True, **parameters)

# Subsample time indices to reduce autocorrelations 
#examples, target_values = get_independent_samples(examples, target_values, info)

random_state = np.random.RandomState(35)
random_idxs = random_state.choice(len(examples), size=100)
background_dataset = examples.iloc[random_idxs, :]
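# note: Example #4 above builds an equivalent background set with shap.sample;
# a one-line alternative, assuming the shap package used there is available:
#   import shap
#   background_dataset = shap.sample(examples, 100)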

# Get STP < 0.9 and > 2.0 indices
indices_tuple = get_stp(examples)

random_state = np.random.RandomState(35)
examples_subsample = []
targets_subsample = []
for idxs in indices_tuple: