import joblib
from os.path import join

from wofs.util import config
from wofs_ml.io.io import IO


def _load_model(**parameters):
    """ Load a saved ML model. """
    io = IO()
    time = parameters['time']
    target = parameters['target']
    resample_method = parameters['resample']
    normalize_method = parameters['normalize']
    imputer_method = parameters['imputer']
    drop_opt = parameters['drop_opt']
    model_name = parameters['model_name']
    feature_selection_method = parameters.get('feature_selection_method', '')

    if model_name == "LogisticRegression":
        print(f'{model_name} is being loaded with "standard" normalization')
        normalize_method = 'standard'

    save_fname = (f'{model_name}_{time}_{target}_{resample_method}_'
                  f'{normalize_method}_{imputer_method}{drop_opt}{feature_selection_method}.pkl')
    print(f'Loading {save_fname}...')
    model = joblib.load(join(config.ML_MODEL_SAVE_PATH, save_fname))

    return model
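# A minimal usage sketch for _load_model. The parameter values below are
# illustrative assumptions (drawn from the options used elsewhere in this
# repository), not a prescribed configuration.
example_params = {
    'time': 'first_hour',
    'target': 'severe_hail',
    'resample': None,
    'normalize': None,
    'imputer': 'simple',
    'drop_opt': '',
    'model_name': 'RandomForest',
}
model = _load_model(**example_params)
print(type(model))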
import os
os.environ['KMP_WARNINGS'] = '0'

""" usage: stdbuf -oL python evaluate_models.py 2 >& log_evaluate_models & """

from sklearn.metrics import roc_auc_score, average_precision_score
from wofs_ml.io.io import IO
# NOTE: Metrics is assumed to be imported from the project's evaluation utilities.

#######################################################
# This script contains a pipeline method for training ML models.
# The pipeline includes imputation, normalization, resampling,
# and calibration, all with a date-based cross-validation
# train/test split. Hyperparameter tuning is performed by
# a random search method. The final model is saved,
# as well as the cross-validation results, which contain
# training and testing evaluations for multiple metrics.
#######################################################
# Iterates over model type, target type, and resampling method.
io = IO()

########################################
#        USER-DEFINED PARAMETERS       #
########################################
model_set = ['RandomForest']  # , 'LogisticRegression', 'XGBoost']
target_set = ['severe_hail']  # ['tornado', 'severe_hail', 'severe_wind']
time_set = ['first_hour']  # , 'second_hour']
resampling_method_set = [None]  # ['under', None]
normalize_method_set = [None]  # ['standard', 'robust', None]
imputer_method = 'simple'
n_iter = 1000
#######################################
metrics = [
    roc_auc_score,
    average_precision_score,
    Metrics.performance_curve,
from wofs_ml.preprocess.preprocess import CorrelationFilter
from wofs_ml.io.io import IO, base_vars_to_drop
from wofs.util import config
import itertools
import pickle
from os.path import join

io = IO()
time_set = ['first_hour', 'second_hour']
target_set = ['tornado', 'severe_hail', 'severe_wind']
cc_value = 0.8
drop_opt = '_manual_drop_time_max_spatial_mean'
filter_obj = CorrelationFilter()

iterator = itertools.product(time_set, target_set)
path = '/home/monte.flora/wofs_ml/interpret/correlation_filtering_results/'

for combo in iterator:
    time, target = combo
    if drop_opt == '_manual_drop_time_max_spatial_mean':
        path = '/home/monte.flora/wofs_ml/interpret/correlation_filtering_results/'
        fname = 'time_max_spatial_mean_features.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop
from wofs_ml.io.io import IO
from wofs.util import config
from os.path import join
from glob import glob

# Save the individual netCDF files for each date, time, and
# forecast time index into a single pandas.DataFrame,
# which can be used for ML.
io = IO()
file_start = 'PROBABILITY_OBJECTS'
file_end = '.nc'
times = ['second_hour']
times_dict = {
    'first_hour': ['*[0][0-9]', '*[1][0-2]'],
    'second_hour': ['*[1][3-9]', '*[2][0-4]'],
}

for time in times_dict.keys():
    fname_strs = [f"{file_start}{re}{file_end}" for re in times_dict[time]]
    nc_file_paths = []
    for fname_str in fname_strs:
        # glob() returns a list per date, so this builds a list of per-date path lists.
        nc_file_paths.extend([
            glob(join(config.ML_INPUT_PATH, str(date), fname_str))
            for date in config.ml_dates
        ])
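# A hedged sketch of the concatenation step the header comment describes:
# flatten the nested per-date path lists and stack each file's table into a
# single pandas.DataFrame. Reading with xarray and the helper name
# `paths_to_dataframe` are illustrative assumptions, not the project's
# actual saving routine.
import pandas as pd
import xarray as xr

def paths_to_dataframe(nested_paths):
    """Concatenate per-date netCDF object files into one DataFrame."""
    flat_paths = [p for per_date in nested_paths for p in per_date]
    dfs = []
    for nc_path in flat_paths:
        with xr.open_dataset(nc_path) as ds:
            dfs.append(ds.to_dataframe().reset_index())
    return pd.concat(dfs, ignore_index=True)

# e.g., df = paths_to_dataframe(nc_file_paths)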
import pickle
from os.path import join

from wofs_ml.io.io import IO, base_vars_to_drop
from wofs.util import config
# NOTE: load_pickle and InterpretToolkit are assumed to be imported elsewhere in this module.


def _load_test_data(base_vars_to_drop=base_vars_to_drop, return_info=None, **parameters):
    """ Load the test data. """
    io = IO()
    time = parameters['time']
    target = parameters['target']
    drop_opt = parameters['drop_opt']
    model_name = parameters.get('model_name', None)

    path = '/home/monte.flora/wofs_ml/interpret/correlation_filtering_results/'
    if drop_opt == '_drop_high_corr_pred':
        fname = f'correlated_features_to_drop_{time}_{target}.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop

    elif drop_opt == '_drop_0.8_corr_pred':
        fname = f'correlated_features_to_drop_{time}_{target}_0.8.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop

    elif drop_opt == '_manual_drop_0.9_corr':
        fname = f'correlated_features_to_drop_{time}_{target}_0.9_manual_drop_time_max_spatial_mean.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop
        fname = 'time_max_spatial_mean_features.pkl'
        with open(join(path, fname), 'rb') as fp:
            add_columns_to_drop = pickle.load(fp)
        vars_to_drop += add_columns_to_drop

    elif drop_opt == '_manual_drop_0.8_corr':
        fname = f'correlated_features_to_drop_{time}_{target}_0.8_manual_drop_time_max_spatial_mean.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop
        fname = 'time_max_spatial_mean_features.pkl'
        with open(join(path, fname), 'rb') as fp:
            add_columns_to_drop = pickle.load(fp)
        vars_to_drop += add_columns_to_drop

    elif '_manual_drop_time_max_spatial_mean' in drop_opt:
        fname = 'time_max_spatial_mean_features.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop

    elif drop_opt == '_drop_irrelevant_features':
        fname = f'irrelevant_features_to_drop_{time}_{target}_{model_name}.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop

    elif drop_opt == '_drop_object_morph_pred':
        object_pred = ['area', 'minor_axis_length', 'major_axis_length']
        vars_to_drop = base_vars_to_drop + object_pred

    elif ('L1_based_feature_selection' in drop_opt
          and 'manual' not in drop_opt
          and 'aggres' not in drop_opt):
        path = '/home/monte.flora/wofs_ml/interpret/L1_based_features'
        fname = f'L1_based_features_to_drop_{time}_{target}.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = list(pickle.load(fp))
        if 'Run Date' in columns_to_drop:
            columns_to_drop.remove('Run Date')
        vars_to_drop = base_vars_to_drop + columns_to_drop

    elif 'L1_based_feature_selection_aggressive' in drop_opt:
        path = '/home/monte.flora/wofs_ml/interpret/L1_based_features'
        fname = f'L1_based_features_to_drop_{time}_{target}aggresive.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = list(pickle.load(fp))
        if 'Run Date' in columns_to_drop:
            columns_to_drop.remove('Run Date')
        vars_to_drop = base_vars_to_drop + columns_to_drop

    elif 'L1_based_feature_selection_with_manual' in drop_opt:
        path1 = '/home/monte.flora/wofs_ml/interpret/L1_based_features'
        fname = f'L1_based_features_to_drop_{time}_{target}_manual_drop_time_max_spatial_mean.pkl'
        with open(join(path1, fname), 'rb') as fp:
            columns_to_drop1 = list(pickle.load(fp))
        if 'Run Date' in columns_to_drop1:
            columns_to_drop1.remove('Run Date')
        fname = 'time_max_spatial_mean_features.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop2 = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop1 + columns_to_drop2

    else:
        vars_to_drop = base_vars_to_drop

    # LOAD DATA
    print(f'Loading {time} {target} data...(from _load_test_data)')
    fname = join(config.ML_DATA_STORAGE_PATH,
                 f'{time}_testing_matched_to_{target}_0km_dataset.pkl')
    test_data = io.load_dataframe(
        fname=fname,
        target_vars=[
            'matched_to_tornado_0km',
            'matched_to_severe_hail_0km',
            'matched_to_severe_wind_0km',
        ],
        vars_to_drop=vars_to_drop,
    )

    examples = test_data['examples']
    target_values = test_data[f'matched_to_{target}_0km'].values

    if drop_opt == '_only_important_pred':
        path = '/work/mflora/ML_DATA/permutation_importance/'
        tag = '_drop_high_corr_pred' if 'Log' in model_name else ''
        fname = join(
            path,
            f'permutation_importance_{model_name}_{target}_{time}_training_norm_aupdc{tag}.pkl'
        )
        perm_imp_results = load_pickle([fname])
        myInterpreter = InterpretToolkit(model=[None])
        myInterpreter.set_results(perm_imp_results, option='permutation_importance')
        important_vars = myInterpreter.get_important_vars(perm_imp_results, multipass=True)
        important_vars += ['Run Date']
        examples = examples[important_vars]

    if return_info:
        info = test_data['info']
        return examples, target_values, info
    else:
        return examples, target_values
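# A minimal usage sketch for _load_test_data. The parameter values are
# illustrative assumptions (drawn from the options used elsewhere in this
# repository); an empty drop_opt falls through to the base variable list.
examples, target_values = _load_test_data(
    time='first_hour',
    target='tornado',
    drop_opt='',
    model_name='RandomForest',
)
print(examples.shape, target_values.shape)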
from cross_validation_generator import DateBasedCV
from wofs.util import config
from wofs_ml.io.io import IO, vars_to_drop
import pandas as pd
from os.path import join

# Iterates over model type, target type, and resampling method.
io = IO()
time = 'first_hour'
fname = join(config.ML_DATA_STORAGE_PATH, f'{time}_training_dataset.pkl')
target_vars = ['matched_to_tornado_0km']

data = io.load_dataframe(fname=fname,
                         target_vars=target_vars,
                         vars_to_drop=vars_to_drop)
examples = data['examples']
targets = data['matched_to_tornado_0km']

feature_names = examples.columns.to_list()
date_col_idx = feature_names.index('Run Date')
cv = DateBasedCV(n_splits=5, date_col_idx=date_col_idx, y=targets, verbose=True)
X = examples.to_numpy()
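# A hedged sketch of consuming the splitter above. It assumes DateBasedCV
# follows the scikit-learn splitter interface (a split(X) method yielding
# train/test index arrays); adapt the loop if the local class differs.
# The RandomForestClassifier and the metric choice are illustrative only.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score

# Drop the 'Run Date' column (used only for date-based splitting) before fitting.
X_features = np.delete(X, date_col_idx, axis=1).astype(float)
y = np.asarray(targets)

for fold, (train_idx, test_idx) in enumerate(cv.split(X)):
    clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf.fit(X_features[train_idx], y[train_idx])
    probs = clf.predict_proba(X_features[test_idx])[:, 1]
    print(f'Fold {fold}: average precision = {average_precision_score(y[test_idx], probs):.3f}')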