# Compute ALE-based interaction strength for each (time, target) combination.
import itertools
from os.path import join

# _load_train_data, load_models, InterpretToolkit, and ale_path are assumed
# to be imported/defined earlier in the project.
drop_opt = 'L1_based_feature_selection_aggressive'
imputer_method = 'simple'
model_names = ['LogisticRegression']
time_set = ['first_hour']
target_set = ['tornado']

iterator = itertools.product(time_set, target_set)
for combo in iterator:
    time, target = combo
    parameters = {
        'time': time,
        'target': target,
        'drop_opt': drop_opt,
    }
    X, y = _load_train_data(**parameters)
    estimators = load_models(time, target, drop_opt, model_names)
    explainer = InterpretToolkit(estimators=estimators,
                                 estimator_names=model_names,
                                 X=X,
                                 y=y)
    # e.g., ale_results_all_models_tornado_first_hourL1_based_feature_selection_aggressive.nc
    fnames = join(ale_path, f'ale_results_all_models_{target}_{time}{drop_opt}.nc')
    ale = explainer.load(fnames=fnames)
    results = explainer.interaction_strength(ale, n_bootstrap=10, subsample=0.1)
    print(results)
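For context, n_bootstrap=10 with subsample=0.1 implies a resampling loop along the following lines. This is a minimal sketch, not InterpretToolkit's actual implementation; bootstrap_statistic and score_fn are hypothetical stand-ins for the interaction-strength computation.

import numpy as np

def bootstrap_statistic(X, y, score_fn, n_bootstrap=10, subsample=0.1, seed=42):
    """Score n_bootstrap random subsamples and summarize the spread."""
    rng = np.random.RandomState(seed)
    y = np.asarray(y)
    n = max(1, int(subsample * len(X)))
    scores = []
    for _ in range(n_bootstrap):
        # Each replicate scores a random 10% of the rows; the spread across
        # replicates gives a cheap uncertainty estimate.
        idx = rng.choice(len(X), size=n, replace=True)
        scores.append(score_fn(X.iloc[idx], y[idx]))
    return np.mean(scores), np.std(scores)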
# Load saved SHAP values for each (model, target, time, drop_opt) combination.
import itertools
import pickle
from os.path import join

# model_set, time_set, and _load_train_data are assumed defined earlier.
target_set = ['severe_wind']  # ['tornado', 'severe_hail', 'severe_wind']
drop_opt_set = ['L1_based_feature_selection_with_manual']
########################################
option = 'interaction_and_target'
path = '/work/mflora/ML_DATA/SHAP_VALUES'
metric = 'norm_aupdc'
perm_method = 'backward'
perm_path = '/work/mflora/ML_DATA/permutation_importance'

iterator = itertools.product(model_set, target_set, time_set, drop_opt_set)
for combo in iterator:
    model_name, target, time, drop_opt = combo
    parameters = {'time': time, 'target': target, 'drop_opt': drop_opt}
    X, y_full = _load_train_data(**parameters)
    X[f'matched_to_{target}_0km'] = y_full
    fname = join(path, f'shap_values_{model_name}_{target}_{time}{drop_opt}.pkl')
    with open(fname, 'rb') as pkl_file:
        data = pickle.load(pkl_file)
    # Commented-out, hand-picked variable list (truncated in the source):
    '''
    important_vars = [
        'comp_dz_time_max_ens_mean_of_90th',
        'wz_0to2_time_max_ens_mean_of_90th',
        'cape_ml_ens_mean_spatial_mean',
        'shear_v_0to1_ens_mean_spatial_mean',
        'hailcast_time_max_ens_mean_of_90th',
        'major_axis_length',
        'uh_2to5_time_max_ens_mean_of_90th',
        ...
    '''
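If the pickled data holds an (n_samples, n_features) array of SHAP values aligned with the model's input columns (an assumption; the file's actual structure is not shown here), a quick ranking by mean absolute contribution looks like:

import numpy as np
import pandas as pd

# Hypothetical: treat `data` as an (n_samples, n_features) SHAP array whose
# columns align with the model inputs (the appended match column excluded).
feature_cols = X.columns.drop(f'matched_to_{target}_0km')
shap_df = pd.DataFrame(np.asarray(data), columns=feature_cols)
ranking = shap_df.abs().mean().sort_values(ascending=False)
print(ranking.head(10))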
# Apply a fitted pipeline's preprocessing steps to the training data.
import copy
import itertools

# model_set, time_set, pipeline_set, drop_opt_set, imputer_method, and the
# project helpers (_load_train_data, _load_model, just_transforms) are
# assumed defined/imported earlier.
iterator = itertools.product(model_set, time_set, pipeline_set, drop_opt_set)
for combo in iterator:
    model_name, time, pair, drop_opt = combo
    target, resample_method, normalize_method = pair
    parameters = {
        'time': time,
        'target': target,
        'resample': resample_method,
        'normalize': normalize_method,
        'imputer': imputer_method,
        'drop_opt': drop_opt,
        'model_name': model_name
    }
    print('First load of the data...')
    examples, target_values = _load_train_data(**parameters)
    feature_names = list(examples.columns)
    original_feature_names = copy.copy(feature_names)
    date_col = examples['Run Date'].values
    feature_names.remove('Run Date')
    # important_vars = load_important_vars(target, time, drop_opt)
    important_vars = [
        'mid_level_lapse_rate_ens_mean_spatial_mean',
    ]
    calibrated_pipeline = _load_model(**parameters)
    model = calibrated_pipeline.calibrated_classifiers_[0].base_estimator
    # The call below was truncated in the source; the arguments are assumed.
    examples_transformed, target_values_transformed = just_transforms(
        calibrated_pipeline, examples[feature_names], target_values)
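just_transforms is not shown here, but the name suggests applying a pipeline's transform steps without its final estimator; in scikit-learn terms that can be sketched as follows (a sketch only; the project's actual signature may differ):

def apply_transforms_only(pipeline, X):
    """Run every pipeline step except the final estimator, i.e. what a
    helper like `just_transforms` typically does."""
    Xt = X
    for _, step in pipeline.steps[:-1]:
        Xt = step.transform(Xt)
    return Xt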
# Performance-based SHAP contributions on a temporally thinned subsample.
import datetime
import shap
from os.path import exists, join

# combos, path, model_names, drop_opt, n_samples, and the project helpers
# (_load_train_data, load_models, get_independent_samples, InterpretToolkit)
# are assumed defined/imported earlier.
for combo in combos:
    start_time = datetime.datetime.now()
    target, time = combo
    save_fname = join(
        path,
        f'shap_values_performance_{model_names[0]}_{target}_{time}{drop_opt}.pkl'
    )
    # if exists(save_fname):
    #     print(f'{save_fname} already exists!')
    #     continue
    parameters = {
        'time': time,
        'target': target,
        'drop_opt': drop_opt,
    }
    X, y, info = _load_train_data(return_info=True, **parameters)
    estimators = load_models(time, target, drop_opt, model_names)
    # Subsample time indices to reduce autocorrelations
    X_subset, y_subset = get_independent_samples(X, y, info)
    explainer = InterpretToolkit(estimators=estimators,
                                 estimator_names=model_names,
                                 X=X_subset.copy(),
                                 y=y_subset.copy())
    background_dataset = shap.sample(X, 100)
    results = explainer.local_contributions(
        method='shap',
        background_dataset=background_dataset,
        performance_based=True,
        n_samples=n_samples)
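get_independent_samples is a project helper; per the comment, it thins samples along time so highly autocorrelated rows are not all kept. A minimal sketch of that idea, assuming per-row dates are available (the function name and stride parameter are hypothetical):

import numpy as np

def independent_samples_sketch(X, y, dates, stride=3):
    """Keep rows from every `stride`-th unique date to reduce temporal
    autocorrelation (hypothetical; the project's helper may differ)."""
    dates = np.asarray(dates)
    keep = np.isin(dates, np.unique(dates)[::stride])
    return X[keep], np.asarray(y)[keep]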
# Report how many features survive L1-based selection for each time/target.
import itertools
import pickle
from os.path import join

# times, Diff, and _load_train_data are assumed defined earlier.
targets = ['tornado', 'severe_hail', 'severe_wind']
drop_opt = 'aggresive'  # (sic, matches on-disk filenames) '_manual_drop_time_max_spatial_mean'
count = 83 if 'manual' in drop_opt else 113
path = '/home/monte.flora/wofs_ml/interpret/L1_based_features'
parameters = {
    'time': 'first_hour',
    'target': 'tornado',
    'resample': None,
    'normalize': None,
    'imputer': None,
    'drop_opt': drop_opt,
    'model_name': None
}
examples, target_data, info = _load_train_data(**parameters, return_info=True)
feature_names = examples.columns.to_list()

for pair in itertools.product(times, targets):
    time, target = pair
    fname = f'L1_based_features_to_drop_{time}_{target}{drop_opt}.pkl'
    with open(join(path, fname), 'rb') as pkl_file:
        dropped_features = pickle.load(pkl_file)
    print(f'{time} {target}...Num of Features: {count - len(dropped_features)}')
    print(Diff(feature_names, dropped_features))

path1 = '/home/monte.flora/wofs_ml/interpret/correlation_filtering_results/'
fname = 'time_max_spatial_mean_features.pkl'
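Diff is not defined in this excerpt; a typical order-preserving list difference, which matches how it is called above, looks like:

def Diff(li1, li2):
    """Elements of li1 that are not in li2 (assumed behavior)."""
    s2 = set(li2)
    return [x for x in li1 if x not in s2]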
# Fit and save a stacked classifier on top of the fitted base estimators.
import itertools
import joblib
from os.path import join

# pipeline_set, imputer_method, feature_selection_method, drop_opt, time_set,
# target_set, config, StackingClassifier, DateBasedCV, and _load_train_data
# are assumed defined/imported earlier.
def load_models(time, target, drop_opt):
    # The def header and `clfs = []` were truncated in the source; they are
    # inferred from the `return clfs` statement and the call below.
    clfs = []
    for model_name in ['RandomForest', 'XGBoost', 'LogisticRegression']:
        normalize_method = 'standard' if model_name == 'LogisticRegression' else None
        resample_method = pipeline_set.resample_dict[time][target][model_name]
        model_fname = (f'{model_name}_{time}_{target}_{resample_method}_'
                       f'{normalize_method}_{imputer_method}{drop_opt}'
                       f'{feature_selection_method}.pkl')
        clfs.append(joblib.load(join(config.ML_MODEL_SAVE_PATH, model_fname))['model'])
    return clfs

iterator = itertools.product(time_set, target_set)
for combo in iterator:
    time, target = combo
    # LOAD DATA
    parameters = {'time': time,
                  'target': target,
                  'drop_opt': drop_opt,
                  'model_name': None}
    X, y, info = _load_train_data(**parameters, return_info=True)
    fit_estimators = load_models(time, target, drop_opt)
    feature_names = X.columns.to_list()
    date_col_idx = feature_names.index('Run Date')
    cv = DateBasedCV(n_splits=5, date_col_idx=date_col_idx, y=y)
    stack_clf = StackingClassifier(
        estimators=fit_estimators,
        cv=cv,
        n_jobs=1,
    )
    stack_clf.fit(X, y)
    fname = f'StackedClassifier_{time}_{target}_simple{drop_opt}None.pkl'
    # Dict contents truncated in the source; the 'model' entry is assumed.
    model_dict = {'model': stack_clf}
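DateBasedCV keeps all samples that share a run date in the same fold, so forecasts from one date never appear in both the training and validation splits. A minimal sketch of that idea (a hypothetical class, not the project's implementation):

import numpy as np

class DateGroupedCVSketch:
    """Date-grouped CV: rows sharing a run date land in the same fold."""
    def __init__(self, n_splits, date_col_idx):
        self.n_splits = n_splits
        self.date_col_idx = date_col_idx

    def split(self, X, y=None, groups=None):
        # Partition the unique dates into folds; rows follow their date.
        dates = np.asarray(X)[:, self.date_col_idx]
        for fold_dates in np.array_split(np.unique(dates), self.n_splits):
            test = np.isin(dates, fold_dates)
            yield np.where(~test)[0], np.where(test)[0]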
# Compare the train/test distributions of a single predictor.
import matplotlib.pyplot as plt

from wofs.util import config
from wofs_ml.common.load_results import (_load_train_data, _load_test_data,
                                         _load_model, just_transforms)

parameters = {
    'time': 'first_hour',
    'target': 'tornado',
    'resample': 'under',
    'normalize': 'standard',
    'imputer': 'simple',
    'drop_opt': '',
    'model_name': None
}
print('First load of the data...')
train_examples, _ = _load_train_data(**parameters)
test_examples, _ = _load_test_data(**parameters)

predictor = ['lcl_ml_ens_mean_spatial_mean']
train_examples = train_examples[predictor].values
test_examples = test_examples[predictor].values

fig = plt.figure(figsize=(6, 6), dpi=300)
plt.hist(train_examples,
         bins='auto',
         color='lightgreen',
         alpha=0.8,
         rwidth=0.85,
         log=True)
# The second call was truncated in the source; the keyword arguments below
# are assumed to mirror the first histogram with a contrasting color.
plt.hist(test_examples,
         bins='auto',
         color='lightblue',
         alpha=0.8,
         rwidth=0.85,
         log=True)
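One caveat with two separate bins='auto' calls: each histogram gets its own bin edges, which makes the bars hard to compare directly. Computing shared edges from the pooled data first avoids that:

import numpy as np

# One set of bin edges for both histograms so the bars line up;
# pass bins=edges to both plt.hist calls above.
edges = np.histogram_bin_edges(
    np.concatenate([train_examples.ravel(), test_examples.ravel()]),
    bins='auto')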
    # Tail of the get_stp helper (its def header precedes this excerpt).
    print(len(stp_indices))
    print(len(non_stp_indices))
    return stp_indices, non_stp_indices

import numpy as np

# time, target, imputer_method, drop_opt, and _load_train_data are assumed
# defined/imported earlier.
parameters = {
    'time': time,
    'target': target,
    'resample': None,
    'normalize': None,
    'imputer': imputer_method,
    'drop_opt': drop_opt,
    'model_name': None
}
examples, target_values, info = _load_train_data(return_info=True, **parameters)
# Subsample time indices to reduce autocorrelations
# examples, target_values = get_independent_samples(examples, target_values, info)
random_state = np.random.RandomState(35)
random_idxs = random_state.choice(len(examples), size=100)
background_dataset = examples.iloc[random_idxs, :]

# Get STP < 0.9 and > 2.0 indices
indices_tuple = get_stp(examples)
random_state = np.random.RandomState(35)
examples_subsample = []
targets_subsample = []
for idxs in indices_tuple:
    # Loop body truncated in the source; a plausible completion draws a
    # random subset from each STP regime and collects examples and targets.
    sub = random_state.choice(idxs, size=min(len(idxs), 100), replace=False)
    examples_subsample.append(examples.iloc[sub, :])
    targets_subsample.append(np.asarray(target_values)[sub])
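Only the tail of get_stp is visible above. Based on the comment, it splits rows by the significant tornado parameter (STP). A sketch under that assumption (the column name is hypothetical):

import numpy as np

def get_stp_sketch(examples, col='stp_ens_mean_spatial_mean'):
    """Return (STP > 2.0, STP < 0.9) row indices, per the thresholds in
    the comment above (hypothetical column name and structure)."""
    stp = examples[col].values
    stp_indices = np.where(stp > 2.0)[0]
    non_stp_indices = np.where(stp < 0.9)[0]
    print(len(stp_indices))
    print(len(non_stp_indices))
    return stp_indices, non_stp_indices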