def execute_fold_parallel(participants_fold: pd.Series, fold: int, cuda_device: str,
                          hyper_parameters_tune_mode: bool=False, model_nums_list: list=None,
                          reversed_order: bool=False, bert_hc_exp: bool=False):
    """
    This function gets the train-val-test split of the participants (for this fold) and runs all the models we want
    to compare --> it trains them using the train data and evaluates them using the validation data
    :param participants_fold: the train-val-test split of the participants (for this fold)
    :param fold: the fold number
    :param cuda_device: the number of the cuda device, if using it
    :param hyper_parameters_tune_mode: after finding good data - run hyper-parameter tuning
    :param model_nums_list: list of models to run
    :param reversed_order: whether to run with a reversed order of the features in the causal graph
    :param bert_hc_exp: whether we run the BERT_HC experiment (textual features are created by BERT fine-tuning)
    :return:
    """
    # get the train, test, validation participant codes for this fold
    os.environ["CUDA_VISIBLE_DEVICES"] = cuda_device
    fold_split_dict = dict()
    for data_set in ['train', 'test', 'validation']:
        fold_split_dict[data_set] = participants_fold.loc[participants_fold == data_set].index.tolist()
    # models_to_compare should have for each row:
    # model_num, model_type, model_name, function_to_run, data_file_name, hyper_parameters
    # (strings of all parameters for the running function as dict: {'parameter_name': parameter_value})
    models_to_compare = pd.read_excel(os.path.join(base_directory, 'models_info.xlsx'),
                                      sheet_name='table_to_load', skiprows=[0])
    fold_dir = utils.set_folder(f'fold_{fold}', run_dir)
    excel_models_results = utils.set_folder(folder_name='excel_models_results', father_folder_path=fold_dir)
    # for test
    print(f'test_dir: {test_dir}')
    test_fold_dir = utils.set_folder(f'fold_{fold}', test_dir)
    excel_test_models_results = utils.set_folder(folder_name='excel_best_models_results',
                                                 father_folder_path=test_fold_dir)
    test_participants_fold = pd.read_csv(os.path.join(data_directory, pair_folds_file_name))
    test_participants_fold.index = test_participants_fold.pair_id
    test_table_writer = pd.ExcelWriter(os.path.join(excel_test_models_results,
                                                    f'Results_test_data_best_models.xlsx'), engine='xlsxwriter')
    path = f"{REVIEWS_FEATURES_DATASETS_DIR}/experiment_manage.csv"
    experiment_manage_df = pd.read_csv(path)
    bert_models = experiment_manage_df.exp_name.values.tolist()
    table_writer = None
    log_file_name = os.path.join(fold_dir, f'LogFile_fold_{fold}.log')
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=log_file_name, level=logging.DEBUG,
                        format='%(asctime)s: %(levelname)s %(message)s', datefmt='%H:%M:%S')
    if model_nums_list is not None:
        all_model_nums = model_nums_list
    else:
        all_model_nums = list(set(models_to_compare.model_num))
    all_models_results = pd.DataFrame()
    all_models_prediction_results = pd.DataFrame()
    if bert_hc_exp:
        if reversed_order:
            bert_models = reversed(list(enumerate(bert_models)))
        else:
            bert_models = enumerate(bert_models)
    else:
        bert_models = enumerate([''])
    for feature_num, bert_feature in bert_models:
        for model_num in all_model_nums:  # compare all versions of each model type
            num_iterates = 1
            model_type_versions = models_to_compare.loc[models_to_compare.model_num == model_num]
            model_num = f'{model_num}_{feature_num}'
            model_num_results_path = os.path.join(excel_models_results, f'model_num_results_{model_num}.pkl')
            if not os.path.isfile(model_num_results_path):
                model_num_results = pd.DataFrame(columns=['model_num', 'model_name', 'model_type',
                                                          'hyper_parameters_str', 'data_file_name', 'RMSE',
                                                          'Raisha', 'Round'])
                joblib.dump(model_num_results, model_num_results_path)
            for index, row in model_type_versions.iterrows():  # iterate over all the models to compare
                # get all model parameters
                model_type = row['model_type']
                model_name = row['model_name']
                function_to_run = row['function_to_run']
                data_file_name = row['data_file_name']
                test_data_file_name = row['test_data_file_name']
                if bert_hc_exp:
                    model_name = f'{model_name}_{bert_feature}'
                    data_file_name = data_file_name.replace(
                        'bert_embedding', f'bert_embedding_for_feature_{bert_feature}')
                    test_data_file_name = test_data_file_name.replace(
                        'bert_embedding', f'bert_embedding_for_feature_{bert_feature}')
                hyper_parameters_str = row['hyper_parameters']
                # get hyper parameters as dict
                if type(hyper_parameters_str) == str:
                    hyper_parameters_dict = json.loads(hyper_parameters_str)
                else:
                    hyper_parameters_dict = None
                if hyper_parameters_dict is not None and 'features_max_size' in hyper_parameters_dict.keys():
                    if int(hyper_parameters_dict['features_max_size']) > 1000:
                        continue
                if outer_is_debug:
                    hyper_parameters_dict['num_epochs'] = 2
                else:
                    hyper_parameters_dict['num_epochs'] = 100
                # skip if the test prediction was already done
                predict_folder = os.path.join(test_dir, f'fold_{fold}',
                                              f'{model_num}_{model_type}_{model_name}_'
                                              f'{hyper_parameters_dict["num_epochs"]}_epochs_fold_num_{fold}')
                if os.path.isdir(predict_folder):
                    continue
                # each function needs to receive: model_num, fold, fold_dir, model_type, model_name, data_file_name,
                # fold_split_dict, table_writer, data_directory, hyper_parameters_dict.
                # During running it needs to write the predictions to the table_writer and save the trained model
                # with the name model_name_model_num to the fold_dir.
                # It needs to return a dict with the final results over the evaluation data: {measure_name: measure}
                if hyper_parameters_tune_mode:
                    if 'LSTM' in model_type or 'Transformer' in model_type:
                        if 'LSTM' in model_type and 'use_transformer' not in model_type:
                            gridsearch = lstm_gridsearch_params
                        else:  # for Transformer models and LSTM_use_transformer models
                            gridsearch = transformer_gridsearch_params
                        for i, parameters_dict in enumerate(gridsearch):
                            if outer_is_debug and i > 1:
                                continue
                            new_hyper_parameters_dict = copy.deepcopy(hyper_parameters_dict)
                            new_hyper_parameters_dict.update(parameters_dict)
                            if 'linear' in model_type and 'lstm_hidden_dim' in new_hyper_parameters_dict:
                                new_hyper_parameters_dict['linear_hidden_dim'] = \
                                    int(0.5 * int(new_hyper_parameters_dict['lstm_hidden_dim']))
                            if '_avg_turn' in model_type:
                                for inner_i, inner_parameters_dict in enumerate(avg_turn_gridsearch_params):
                                    if outer_is_debug and inner_i > 1:
                                        continue
                                    new_hyper_parameters_dict.update(inner_parameters_dict)
                                    new_model_name = f'{model_name}'
                                    new_model_num = f'{model_num}_{i}_{inner_i}'
                                    if os.path.isfile(os.path.join(
                                            excel_models_results,
                                            f'Results_fold_{fold}_model_{new_model_num}.xlsx')):
                                        continue
                                    all_models_results = execute_create_fit_predict_eval_model(
                                        function_to_run, new_model_num, fold, fold_dir, model_type, new_model_name,
                                        data_file_name, fold_split_dict, table_writer, new_hyper_parameters_dict,
                                        excel_models_results, all_models_results, model_num_results_path)
                            else:
                                new_model_name = f'{model_name}'
                                new_model_num = f'{model_num}_{i}'
                                if os.path.isfile(os.path.join(
                                        excel_models_results, f'Results_fold_{fold}_model_{new_model_num}.xlsx')):
                                    continue
                                all_models_results = execute_create_fit_predict_eval_model(
                                    function_to_run, new_model_num, fold, fold_dir, model_type, new_model_name,
                                    data_file_name, fold_split_dict, table_writer, new_hyper_parameters_dict,
                                    excel_models_results, all_models_results, model_num_results_path)
                    elif ('SVM' in model_type and 'XGBoost' not in model_name) or 'Baseline' in model_type:
                        if 'baseline' in model_name or 'Baseline' in model_type:
                            svm_gridsearch_params_inner = [{}]
                        else:
                            svm_gridsearch_params_inner = svm_gridsearch_params
                        if 'EWG' in model_name:
                            num_iterates = 5
                        for i, parameters_dict in enumerate(svm_gridsearch_params_inner):
                            if outer_is_debug and i > 1:
                                continue
                            new_hyper_parameters_dict = copy.deepcopy(hyper_parameters_dict)
                            new_hyper_parameters_dict.update(parameters_dict)
                            new_model_name = f'{model_name}'
                            new_model_num = f'{model_num}_{i}'
                            if os.path.isfile(os.path.join(
                                    excel_models_results, f'Results_fold_{fold}_model_{new_model_num}.xlsx')):
                                continue
                            all_models_results = execute_create_fit_predict_eval_model(
                                function_to_run, new_model_num, fold, fold_dir, model_type, new_model_name,
                                data_file_name, fold_split_dict, table_writer, new_hyper_parameters_dict,
                                excel_models_results, all_models_results, model_num_results_path,
                                num_iterates=num_iterates)
                    elif 'XGBoost' in model_name:
                        for i, parameters_dict in enumerate(xgboost_gridsearch_params):
                            if outer_is_debug and i > 1:
                                continue
                            new_hyper_parameters_dict = copy.deepcopy(hyper_parameters_dict)
                            new_hyper_parameters_dict.update(parameters_dict)
                            new_model_name = f'{model_name}'
                            new_model_num = f'{model_num}_{i}'
                            if os.path.isfile(os.path.join(
                                    excel_models_results, f'Results_fold_{fold}_model_{new_model_num}.xlsx')):
                                continue
                            all_models_results = execute_create_fit_predict_eval_model(
                                function_to_run, new_model_num, fold, fold_dir, model_type, new_model_name,
                                data_file_name, fold_split_dict, table_writer, new_hyper_parameters_dict,
                                excel_models_results, all_models_results, model_num_results_path,
                                num_iterates=num_iterates)
                    else:
                        print('Model type must be LSTM-kind, Transformer-kind, SVM-kind or XGBoost')
                    # select the best hyper-parameters set for this model based on the RMSE
                    model_num_results = joblib.load(model_num_results_path)
                    if model_num_results.empty:
                        continue
                    argmin_index = model_num_results.RMSE.argmin()
                    best_model = model_num_results.iloc[argmin_index]
                    best_model_version_num = best_model.model_num
                    logging.info(f'Best model version for model {model_num}-{model_name} in fold {fold} is: '
                                 f'{best_model_version_num}. Start predicting over the test data')
                    print(f'Best model version for model {model_num}-{model_name} in fold {fold} is: '
                          f'{best_model_version_num}. Start predicting over the test data')
                    # predict on test data using the best version of this model
                    test_fold_split_dict = dict()
                    test_pair_ids_in_fold = test_participants_fold[f'fold_{fold}']
                    for data_set in ['train', 'test', 'validation']:
                        test_fold_split_dict[data_set] = \
                            test_pair_ids_in_fold.loc[test_pair_ids_in_fold == data_set].index.tolist()
                    hyper_parameters_str = best_model.hyper_parameters_str
                    model_folder = run_dir
                    if not os.path.exists(os.path.join(base_directory, 'logs', model_folder, f'fold_{fold}')):
                        if not os.path.exists(
                                os.path.join(base_directory, 'logs', f'{model_folder}_best', f'fold_{fold}')):
                            # the folder we need does not exist
                            print(f'fold {fold} in folder {model_folder} does not exist')
                            continue
                        else:
                            model_folder = f'{model_folder}_best'
                    # get hyper parameters as dict
                    if type(hyper_parameters_str) == str:
                        hyper_parameters_dict = json.loads(hyper_parameters_str)
                    elif type(hyper_parameters_str) == dict:
                        hyper_parameters_dict = hyper_parameters_str
                    else:
                        hyper_parameters_dict = None
                        print('no hyper parameters dict')
                    num_epochs = hyper_parameters_dict['num_epochs']
                    model_file_name = f'{best_model_version_num}_{model_type}_{model_name}_fold_{fold}.pkl'
                    if function_to_run == 'ExecuteEvalLSTM':
                        inner_model_folder = \
                            f'{best_model_version_num}_{model_type}_{model_name}_{num_epochs}_epochs_fold_num_{fold}'
                    else:
                        inner_model_folder = ''
                    trained_model_dir = os.path.join(base_directory, 'logs', model_folder, f'fold_{fold}',
                                                     inner_model_folder)
                    trained_model = joblib.load(os.path.join(trained_model_dir, model_file_name))
                    metadata_dict = {'model_num': model_num, 'model_type': model_type, 'model_name': model_name,
                                     'data_file_name': data_file_name, 'test_data_file_name': test_data_file_name,
                                     'hyper_parameters_str': hyper_parameters_dict, 'fold': fold,
                                     'best_model_version_num': best_model_version_num}
                    metadata_df = pd.DataFrame.from_dict(metadata_dict, orient='index').T
                    model_class = getattr(execute_cv_models, function_to_run)(
                        model_num, fold, test_fold_dir, model_type, model_name, data_file_name, test_fold_split_dict,
                        test_table_writer, data_directory, hyper_parameters_dict, excel_test_models_results,
                        trained_model, trained_model_dir, model_file_name, test_data_file_name, 'test')
                    model_class.load_data_create_model()
                    results_df = pd.DataFrame()
                    for i in range(num_iterates):
                        print(f'Start Test Iteration number {i}')
                        logging.info(f'Start Test Iteration number {i}')
                        model_class.predict()
                        results_dict = model_class.eval_model()
                        current_results_df = pd.DataFrame.from_dict(results_dict).T
                        results_df = pd.concat([results_df, current_results_df], sort=False)
                    results_df['raisha_round'] = results_df.index
                    results_df[['Raisha', 'Round']] = results_df.raisha_round.str.split(expand=True)
                    results_df = results_df.drop('raisha_round', axis=1)
                    results_df = results_df.groupby(by=['Raisha', 'Round']).mean()
                    results_df = results_df.reset_index()
                    results_df.index = np.zeros(shape=(results_df.shape[0],))
                    results_df = metadata_df.join(results_df)
                    all_models_prediction_results = pd.concat([all_models_prediction_results, results_df],
                                                              sort=False)
                    utils.write_to_excel(model_class.model_table_writer, 'Model results', ['Model results'],
                                         results_df)
                    model_class.model_table_writer.save()
                    # remove the folders of the non-best versions of this model
                    model_num_results = model_num_results.reset_index()
                    for remove_index, remove_row in model_num_results.iterrows():
                        if remove_row.model_num == best_model_version_num:
                            continue
                        hyper_parameters_str = remove_row.hyper_parameters_str
                        # get hyper parameters as dict
                        if type(hyper_parameters_str) == str:
                            hyper_parameters_dict = json.loads(hyper_parameters_str)
                        elif type(hyper_parameters_str) == dict:
                            hyper_parameters_dict = hyper_parameters_str
                        else:
                            hyper_parameters_dict = None
                            print('no hyper parameters dict')
                        num_epochs = hyper_parameters_dict['num_epochs']
                        inner_model_folder = f'{remove_row.model_num}_{remove_row.model_type}_' \
                                             f'{remove_row.model_name}_{num_epochs}_epochs_fold_num_{fold}'
                        # build the path in a separate variable so model_folder is not overwritten between iterations
                        folder_to_remove = os.path.join(base_directory, 'logs', model_folder, f'fold_{fold}',
                                                        inner_model_folder)
                        if os.path.exists(folder_to_remove):
                            print(f'remove {folder_to_remove}')
                            shutil.rmtree(folder_to_remove)
                        else:
                            print(f'Folder {folder_to_remove} does not exist')
                else:  # no hyper parameters
                    all_models_results = execute_create_fit_predict_eval_model(
                        function_to_run, model_num, fold, fold_dir, model_type, model_name, data_file_name,
                        fold_split_dict, table_writer, hyper_parameters_dict, excel_models_results,
                        all_models_results, model_num_results_path)
    utils.write_to_excel(table_writer, 'All models results', ['All models results'], all_models_results)
    if table_writer is not None:
        table_writer.save()
    if test_table_writer is not None:
        utils.write_to_excel(test_table_writer, 'All models results', ['All models results'],
                             all_models_prediction_results)
        test_table_writer.save()
    logging.info(f'fold {fold} finish compare models')
    print(f'fold {fold} finish compare models')
    return f'fold {fold} finish compare models'
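
# A driver for the function above is not shown in this fragment; the __main__ block below reads an is_parallel
# flag, so a typical driver fans the folds out across processes. The sketch below is a minimal, illustrative
# version of such a driver (the name _run_all_folds_sketch and the use of multiprocessing.Pool are assumptions,
# not part of the original pipeline). It assumes the pair-folds CSV read elsewhere in this file has a pair_id
# column plus fold_0..fold_5 columns labelled 'train'/'validation'/'test', and that run_dir/test_dir were
# already set by the __main__ block.
def _run_all_folds_sketch(cuda_device: str = '0', n_folds: int = 6):
    """Illustrative only: run execute_fold_parallel for every fold, one process per fold."""
    from multiprocessing import Pool

    folds = pd.read_csv(os.path.join(data_directory, pair_folds_file_name))
    folds.index = folds.pair_id
    jobs = [(folds[f'fold_{fold}'], fold, cuda_device) for fold in range(n_folds)]
    with Pool(processes=min(n_folds, os.cpu_count() or 1)) as pool:
        # starmap unpacks each (participants_fold, fold, cuda_device) tuple into execute_fold_parallel
        messages = pool.starmap(execute_fold_parallel, jobs)
    for message in messages:
        print(message)
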
    sys.argv[5] = reversed order of features: True/False
    sys.argv[6] = outer_cuda: int: 0/1
    sys.argv[7] = bert_hc experiment: True/False
    """
    # is_parallel
    is_parallel = sys.argv[1]
    if is_parallel == 'False':
        is_parallel = False
    run_dir_name = datetime.now().strftime(f'compare_prediction_models_%d_%m_%Y_%H_%M')
    test_dir_name = datetime.now().strftime(f'predict_best_models_%d_%m_%Y_%H_%M')
    if len(sys.argv) > 2:
        folder_date = sys.argv[2]
        if folder_date != 'False':
            run_dir = utils.set_folder(datetime.now().strftime(f'compare_prediction_models_{folder_date}'), 'logs')
            # for test
            test_dir = utils.set_folder(datetime.now().strftime(f'predict_best_models_{folder_date}'), 'logs')
        else:  # folder dir
            run_dir = utils.set_folder(run_dir_name, 'logs')
            # for test
            test_dir = utils.set_folder(test_dir_name, 'logs')
    else:  # folder dir
        run_dir = utils.set_folder(run_dir_name, 'logs')
        # for test
        test_dir = utils.set_folder(test_dir_name, 'logs')
    print(f'test_dir: {test_dir}')
    # is_debug
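
# The block above compares the raw sys.argv strings against 'False'. A tiny helper for that 'True'/'False'
# string convention is sketched below; it is illustrative and not part of the original script. The invocation
# comment only echoes the argument order documented in the docstring above (positions 3-4 are not shown in this
# fragment and are left as ...):
#   python <this_script>.py <is_parallel> <folder_date> ... <reversed_order> <outer_cuda> <bert_hc>
def _str_to_bool_sketch(value: str) -> bool:
    """Map the command-line convention used above ('True'/'False' strings) to a real bool."""
    return str(value).strip().lower() == 'true'
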
def execute_fold_parallel(participants_fold: pd.Series, fold: int, cuda_device: str, data_file_name: str, features_families: list, hyper_parameters_tune_mode: bool=False, test_data_file_name: str=None, id_column: str='pair_id', model_type: str='regression', features_to_remove: Union[list, str] = None): """ This function get a dict that split the participant to train-val-test (for this fold) and run all the models we want to compare --> it train them using the train data and evaluate them using the val data :param participants_fold: split the participant to train-val-test (for this fold) :param fold: the fold number :param cuda_device: the number of cuda device if using it :param hyper_parameters_tune_mode: after find good data - hyper parameter tuning :param data_file_name: the data file name :param features_families: the families of features to use :param id_column: the name of the ID column :param test_data_file_name: the test_data_file_name :param model_type: is this a regression model or a classification model :param features_to_remove: features we want to remove :return: """ # get the train, test, validation participant code for this fold os.environ["CUDA_VISIBLE_DEVICES"] = cuda_device fold_split_dict = dict() for data_set in ['train', 'test', 'validation']: fold_split_dict[data_set] = participants_fold.loc[participants_fold == data_set].index.tolist() fold_dir = utils.set_folder(f'fold_{fold}', run_dir) excel_models_results = utils.set_folder(folder_name='excel_models_results', father_folder_path=fold_dir) # for test test_fold_dir = utils.set_folder(f'fold_{fold}', test_dir) excel_test_models_results = utils.set_folder(folder_name='excel_best_models_results', father_folder_path=test_fold_dir) test_participants_fold = pd.read_csv(os.path.join(data_directory, pair_folds_file_name)) test_participants_fold.index = test_participants_fold[id_column] test_table_writer = pd.ExcelWriter(os.path.join(excel_test_models_results, f'Results_test_data_best_models.xlsx'), engine='xlsxwriter') log_file_name = os.path.join(fold_dir, f'LogFile_fold_{fold}.log') for handler in logging.root.handlers[:]: logging.root.removeHandler(handler) logging.basicConfig(filename=log_file_name, level=logging.DEBUG, format='%(asctime)s: %(levelname)s %(message)s', datefmt='%H:%M:%S', ) all_models_results = pd.DataFrame() all_results_table_writer = pd.ExcelWriter(os.path.join(excel_models_results, f'Results_fold_{fold}_all_models.xlsx'), engine='xlsxwriter') all_models_test_data_results = pd.DataFrame() best_models_paths_dict = defaultdict(str) # load data data_path = os.path.join(base_directory, 'data', 'verbal', 'models_input', data_file_name) if test_data_file_name is None: test_data_path = data_path else: test_data_path = os.path.join(base_directory, 'data', 'verbal', 'models_input', test_data_file_name) train_pair_ids = participants_fold.loc[participants_fold == 'train'].index.tolist() validation_pair_ids = participants_fold.loc[participants_fold == 'validation'].index.tolist() test_pair_ids = participants_fold.loc[participants_fold == 'test'].index.tolist() train_x, train_y, validation_x, validation_y = utils.load_data(data_path=data_path, label_name='label', features_families=features_families, test_pair_ids=validation_pair_ids, train_pair_ids=train_pair_ids, id_column=id_column, features_to_remove=features_to_remove) _, _, test_x, test_y = utils.load_data(data_path=test_data_path, label_name='label', id_column=id_column, features_families=features_families, test_pair_ids=test_pair_ids, 
features_to_remove=features_to_remove) data_features = train_x.columns.tolist() model_names = ['SVM', 'mean', 'median', 'RandomForest', 'XGBoost', 'CatBoost'] # , 'lightGBM', ''] for model_num, model_name in enumerate(model_names): model_num_results_path = os.path.join(excel_models_results, f'model_name_results_{model_name}.pkl') if not os.path.isfile(model_num_results_path): model_num_results = pd.DataFrame(columns=['model_name', 'hyper_parameters_str'] + measures[model_type][0]) joblib.dump(model_num_results, model_num_results_path) # each function need to get: model_num, fold, fold_dir, model_type, model_name, # fold_split_dict, table_writer, data_directory, hyper_parameters_dict. # During running it needs to write the predictions to the table_writer and save the trained model with # the name: model_name_model_num to the fold_dir. # it needs to return a dict with the final results over the evaluation data: {measure_name: measure} if hyper_parameters_tune_mode: greadsearch = gridsearch_params[model_name] for i, parameters_dict in enumerate(greadsearch): # if i > 0: # continue if os.path.isfile(os.path.join(excel_models_results, f'Results_fold_{fold}_model_{model_name}.xlsx')): continue new_model_num = f'{model_num}_{i}' print(f'start model {model_name} with number {new_model_num} for fold {fold}') all_models_results = execute_create_fit_predict_eval_model( model_num=new_model_num, features=data_features, train_x=train_x, train_y=train_y, test_x=validation_x, test_y=validation_y, fold=fold, fold_dir=fold_dir, model_name=model_name, excel_models_results_folder=excel_models_results, hyper_parameters_dict=parameters_dict, all_models_results=all_models_results, model_num_results_path=model_num_results_path, model_type=model_type) else: # no hyper parameters parameters_dict = default_gridsearch_params[model_name] all_models_results = execute_create_fit_predict_eval_model( model_num=model_num, features=data_features, train_x=train_x, train_y=train_y, test_x=validation_x, test_y=validation_y, fold=fold, fold_dir=fold_dir, model_name=model_name, excel_models_results_folder=excel_models_results, hyper_parameters_dict=parameters_dict, all_models_results=all_models_results, model_num_results_path=model_num_results_path, model_type=model_type) # select the best hyper-parameters set for this model based on the Accuracy model_num_results = joblib.load(model_num_results_path) if model_num_results.empty: continue # measures[model_type][0] is the measure to choose the best model if model_type == 'regression': argmax_index = model_num_results[measures[model_type][0][0]].argmin() elif model_type == 'classification': argmax_index = model_num_results[measures[model_type][0][0]].argmax() else: raise ValueError('model_type must be regression or classification') best_model = model_num_results.iloc[argmax_index] model_version_num = best_model.model_num logging.info(f'Best model version for model {model_num}-{model_name} in fold {fold} is: ' f'{model_version_num}. Start predict over test data') print(f'Best model version for model {model_num}-{model_name} in fold {fold} is: ' f'{model_version_num}. 
Start predict over test data') # predict on test data using the best version of this model hyper_parameters_str = best_model.hyper_parameters_str model_folder = run_dir if not os.path.exists(os.path.join(base_directory, 'logs', model_folder, f'fold_{fold}')): if not os.path.exists( os.path.join(base_directory, 'logs', f'{model_folder}_best', f'fold_{fold}')): # the folder we need not exists print(f'fold {fold} in folder {model_folder} is not exists') continue else: model_folder = f'{model_folder}_best' # get hyper parameters as dict if type(hyper_parameters_str) == str: hyper_parameters_dict = json.loads(hyper_parameters_str) elif type(hyper_parameters_str) == dict: hyper_parameters_dict = hyper_parameters_str else: hyper_parameters_dict = None print('no hyper parameters dict') model_file_name = f'{model_version_num}_{model_name}_fold_{fold}.pkl' trained_model_dir = os.path.join(base_directory, 'logs', model_folder, f'fold_{fold}') trained_model = joblib.load(os.path.join(trained_model_dir, model_file_name)) best_models_paths_dict[model_name] = os.path.join(trained_model_dir, model_file_name) metadata_dict = {'model_num': model_num, 'model_name': model_name, 'hyper_parameters_str': hyper_parameters_dict, 'fold': fold, 'best_model_version_num': model_version_num} metadata_df = pd.DataFrame.from_dict(metadata_dict, orient='index').T # create model class with trained_model test_model_class = predictive_models.PredictiveModel( data_features, model_name, hyper_parameters_dict, model_num, fold, fold_dir, excel_test_models_results, trained_model=trained_model, model_type=model_type) test_predictions = test_model_class.predict(test_x, test_y) results_dict = getattr(utils, measures[model_type][1])(all_predictions=test_predictions) results_df = pd.DataFrame(results_dict, index=[0]) results_df = metadata_df.join(results_df) all_models_test_data_results = pd.concat([all_models_test_data_results, results_df], sort='False') utils.write_to_excel(test_model_class.model_table_writer, 'Model results', ['Model results'], results_df) test_model_class.model_table_writer.save() utils.write_to_excel(all_results_table_writer, 'All models results', ['All models results'], all_models_results) if all_results_table_writer is not None: all_results_table_writer.save() if test_table_writer is not None: utils.write_to_excel(test_table_writer, 'All models results', ['All models results'], all_models_test_data_results) test_table_writer.save() logging.info(f'fold {fold} finish compare models') print(f'fold {fold} finish compare models') for model_type in best_models_paths_dict.keys(): if model_type not in ['RandomForest', 'XGBoost', 'CatBoost']: continue print(f'\n computing SHAP values of {model_type}') pkl_model_path = Path(best_models_paths_dict[model_type]) model = joblib.load(pkl_model_path) X_test = train_x X_train = train_x # create a file for the SHAP results to be saved at save_shap_values_path = pkl_model_path.parent.joinpath('SHAP_values_results') save_shap_values_path.mkdir(exist_ok=True) shap_obj = XAI_Methods.XAIMethods(model, X_test, X_train, 'SHAP', model_type) shap_res = shap_obj.get_shap_feature_mean_values() shap_res_save_path = save_shap_values_path.joinpath(pkl_model_path.name.replace('pkl', 'csv')) shap_res.to_csv(shap_res_save_path) return f'fold {fold} finish compare models', best_models_paths_dict
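
# XAI_Methods.XAIMethods wraps the SHAP computation used above and is defined elsewhere. For orientation only,
# the sketch below shows one way to obtain a mean-|SHAP| table for a fitted tree model
# (RandomForest/XGBoost/CatBoost) with the public shap package; it is an assumption about what
# get_shap_feature_mean_values produces, not the actual implementation.
def _mean_abs_shap_sketch(model, features_df: pd.DataFrame) -> pd.Series:
    """Mean absolute SHAP value per feature for a fitted tree model."""
    import shap
    import numpy as np

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(features_df)  # (n_samples, n_features) for regression models
    mean_abs = np.abs(shap_values).mean(axis=0)       # average magnitude of each feature's contribution
    return pd.Series(mean_abs, index=features_df.columns, name='mean_abs_shap').sort_values(ascending=False)
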
def predict_best_models(best_model_file_name: str):
    all_models_results = pd.DataFrame()
    best_models = pd.read_excel(os.path.join(base_directory, 'logs', best_model_file_name),
                                sheet_name='table_to_load')
    os.environ["CUDA_VISIBLE_DEVICES"] = '0'
    participants_fold = pd.read_csv(os.path.join(data_directory, 'pairs_folds_new_test_data.csv'))
    participants_fold.index = participants_fold.pair_id
    excel_models_results = utils.set_folder(folder_name='excel_best_models_results', father_folder_path=run_dir)
    table_writer = pd.ExcelWriter(os.path.join(excel_models_results, f'Results_test_data_best_models.xlsx'),
                                  engine='xlsxwriter')
    log_file_name = os.path.join(run_dir, f'LogFile.log')
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=log_file_name, level=logging.DEBUG,
                        format='%(asctime)s: %(levelname)s %(message)s', datefmt='%H:%M:%S')
    for fold in range(6):
        pair_ids_in_fold = participants_fold[f'fold_{fold}']
        fold_split_dict = dict()
        for data_set in ['train', 'test', 'validation']:
            fold_split_dict[data_set] = pair_ids_in_fold.loc[pair_ids_in_fold == data_set].index.tolist()
        for index, row in best_models.iterrows():
            model_name = row['model_name']
            model_name_folder = row[f'model_name_folder_fold_{fold}']
            model_num = row['model_num']
            # if model_num not in [879]:
            #     continue
            model_type = row['model_type']
            model_type_folder = row[f'model_type_folder_fold_{fold}']
            if type(model_type_folder) == float and np.isnan(model_type_folder):
                continue
            function_to_run = row['function_to_run']
            data_file_name = row['data_file_name']
            test_data_file_name = row['test_data_file_name']
            hyper_parameters_str = row[f'hyper_parameters_fold_{fold}']
            model_folder = row[f'model_folder_fold_{fold}']
            if not os.path.exists(os.path.join(base_directory, 'logs', model_folder, f'fold_{fold}')):
                if not os.path.exists(os.path.join(base_directory, 'logs', f'{model_folder}_best', f'fold_{fold}')):
                    # the folder we need does not exist
                    print(f'fold {fold} in folder {model_folder} does not exist')
                    continue
                else:
                    model_folder = f'{model_folder}_best'
            model_version_num = row[f'model_version_num_fold_{fold}']
            model_file_name = f'{model_version_num}_{model_type_folder}_{model_name_folder}_fold_{fold}.pkl'
            if function_to_run == 'ExecuteEvalLSTM':
                inner_model_folder = \
                    f'{model_version_num}_{model_type_folder}_{model_name_folder}_100_epochs_fold_num_{fold}'
            else:
                inner_model_folder = ''
            trained_model_dir = os.path.join(base_directory, 'logs', model_folder, f'fold_{fold}',
                                             inner_model_folder)
            # if torch.cuda.is_available() or function_to_run != 'ExecuteEvalLSTM':
            trained_model = joblib.load(os.path.join(trained_model_dir, model_file_name))
            # else:
            #     trained_model = torch.load(os.path.join(trained_model_dir, model_file_name),
            #                                map_location=torch.device('cpu'))
            # get hyper parameters as dict
            if type(hyper_parameters_str) == str:
                hyper_parameters_dict = json.loads(hyper_parameters_str)
            else:
                hyper_parameters_dict = None
            metadata_dict = {'model_num': model_num, 'model_type': model_type, 'model_name': model_name,
                             'data_file_name': data_file_name, 'test_data_file_name': test_data_file_name,
                             'hyper_parameters_str': hyper_parameters_dict, 'fold': fold}
            metadata_df = pd.DataFrame.from_dict(metadata_dict, orient='index').T
            model_class = getattr(execute_cv_models, function_to_run)(
                model_num, fold, run_dir, model_type, model_name, data_file_name, fold_split_dict, table_writer,
                data_directory, hyper_parameters_dict, excel_models_results, trained_model_dir=trained_model_dir,
                trained_model=trained_model, model_file_name=model_file_name,
                test_data_file_name=test_data_file_name, predict_type='test')
            model_class.load_data_create_model()
            model_class.predict()
            results_dict = model_class.eval_model()
            results_df = pd.DataFrame.from_dict(results_dict).T
            results_df['raisha_round'] = results_df.index
            results_df[['Raisha', 'Round']] = results_df.raisha_round.str.split(expand=True)
            results_df = results_df.drop('raisha_round', axis=1)
            results_df.index = np.zeros(shape=(results_df.shape[0],))
            results_df = metadata_df.join(results_df)
            all_models_results = pd.concat([all_models_results, results_df], sort=False)
            utils.write_to_excel(model_class.model_table_writer, 'Model results', ['Model results'], results_df)
            model_class.model_table_writer.save()
    utils.write_to_excel(table_writer, 'All models results', ['All models results'], all_models_results)
    table_writer.save()
    logging.info(f'Finish predict best models')
    print(f'Finish predict best models')
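
# predict_best_models reads the sheet 'table_to_load' and, per fold, the columns accessed above (model_name,
# model_num, model_type, function_to_run, data_file_name, test_data_file_name, plus model_name_folder_fold_k,
# model_type_folder_fold_k, hyper_parameters_fold_k, model_folder_fold_k and model_version_num_fold_k). The
# helper below is an illustrative pre-flight check, not part of the original code; it only lists which of
# those columns are missing from a given file.
def _check_best_models_sheet_sketch(best_model_file_name: str, n_folds: int = 6) -> list:
    """Return the columns predict_best_models expects but the 'table_to_load' sheet does not contain."""
    required = ['model_name', 'model_num', 'model_type', 'function_to_run',
                'data_file_name', 'test_data_file_name']
    per_fold = ['model_name_folder_fold_{k}', 'model_type_folder_fold_{k}', 'hyper_parameters_fold_{k}',
                'model_folder_fold_{k}', 'model_version_num_fold_{k}']
    for k in range(n_folds):
        required += [name.format(k=k) for name in per_fold]
    sheet = pd.read_excel(os.path.join(base_directory, 'logs', best_model_file_name), sheet_name='table_to_load')
    return [column for column in required if column not in sheet.columns]
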
import pandas as pd
import os
import utils
from datetime import datetime
import logging
import json
import execute_cv_models
import joblib
import numpy as np
import torch

base_directory = os.path.abspath(os.curdir)
condition = 'verbal'
data_directory = os.path.join(base_directory, 'data', condition, 'cv_framework')
run_dir = utils.set_folder(datetime.now().strftime(f'predict_best_models_%d_%m_%Y_%H_%M'), 'logs')
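
# utils.set_folder is used throughout these scripts but defined elsewhere. Judging only from how it is called
# here (a folder name, an optional parent, and a returned path that is then written into), it is assumed to
# behave roughly like the sketch below; this is an assumption, not the real utils implementation.
def _set_folder_sketch(folder_name: str, father_folder_path: str = 'logs') -> str:
    """Assumed behaviour: create <father_folder_path>/<folder_name> (relative parents under base_directory)."""
    parent = father_folder_path if os.path.isabs(str(father_folder_path)) \
        else os.path.join(base_directory, str(father_folder_path))
    folder_path = os.path.join(parent, folder_name)
    os.makedirs(folder_path, exist_ok=True)
    return folder_path
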
def execute_fold_parallel(participants_fold: pd.Series, fold: int, cuda_device: str, hyper_parameters_tune_mode: bool = False, three_losses: bool = False, leaky_relu: bool = False): """ This function get a dict that split the participant to train-val-test (for this fold) and run all the models we want to compare --> it train them using the train data and evaluate them using the val data :param participants_fold: split the participant to train-val-test (for this fold) :param fold: the fold number :param cuda_device: the number of cuda device if using it :param hyper_parameters_tune_mode: after find good data - hyper parameter tuning :param three_losses: if we want 3 losses for avg_turn models :param leaky_relu: if we wan to use leaky_relu in linear layers :return: """ # get the train, test, validation participant code for this fold os.environ["CUDA_VISIBLE_DEVICES"] = cuda_device fold_split_dict = dict() for data_set in ['train', 'test', 'validation']: fold_split_dict[data_set] = participants_fold.loc[ participants_fold == data_set].index.tolist() # models_to_compare should have for each row: # model_num, model_type, model_name, function_to_run, data_file_name, hyper_parameters # (strings of all parameters for the running function as dict: {'parameter_name': parameter_value}) models_to_compare = pd.read_excel(os.path.join( base_directory, 'models_to_hyper_parameters.xlsx'), sheet_name='table_to_load', skiprows=[0]) fold_dir = utils.set_folder(f'fold_{fold}', run_dir) excel_models_results = utils.set_folder(folder_name='excel_models_results', father_folder_path=fold_dir) # for test test_fold_dir = utils.set_folder(f'fold_{fold}', test_dir) excel_test_models_results = utils.set_folder( folder_name='excel_best_models_results', father_folder_path=test_fold_dir) test_participants_fold = pd.read_csv( os.path.join(data_directory, pair_folds_file_name)) test_participants_fold.index = test_participants_fold.pair_id test_table_writer = pd.ExcelWriter(os.path.join( excel_test_models_results, f'Results_test_data_best_models.xlsx'), engine='xlsxwriter') # table_writer = pd.ExcelWriter(os.path.join(excel_models_results, f'Results_fold_{fold}_all_models.xlsx'), # engine='xlsxwriter') table_writer = None log_file_name = os.path.join(fold_dir, f'LogFile_fold_{fold}.log') for handler in logging.root.handlers[:]: logging.root.removeHandler(handler) logging.basicConfig( filename=log_file_name, level=logging.DEBUG, format='%(asctime)s: %(levelname)s %(message)s', datefmt='%H:%M:%S', ) # all_model_types = models_to_compare.model_type.unique() # all_model_types = ['LSTM_avg', 'LSTM_avg_turn', 'Transformer_avg_turn', 'Transformer_avg', # 'LSTM_avg_turn_linear', 'Attention_avg'] # all_model_types = ['Attention_avg'] all_model_nums = list(set(models_to_compare.model_num)) # already_trained_models = list(range(15, 21)) + list(range(11)) # all_model_nums = [x for x in all_model_nums if x not in already_trained_models] all_model_nums = [78, 79, 80] + list(range(84, 87)) # all_model_nums = [23, 24, 30, 31] + list(range(54, 63)) + list(range(69, 78)) + list(range(81, 84)) +\ # list(range(163, 166)) + list(range(178, 181)) # all_model_nums = list(range(34, 38)) + [40] + list(range(192, 195)) + list(range(90, 94)) + list(range(100, 103)) all_model_nums = [36] all_models_results = pd.DataFrame() all_models_prediction_results = pd.DataFrame() for model_num in all_model_nums: # compare all versions of each model type # if model_num != 79: # continue model_type_versions = models_to_compare.loc[models_to_compare.model_num == 
model_num] model_num_results_path = os.path.join( excel_models_results, f'model_num_results_{model_num}.pkl') if not os.path.isfile(model_num_results_path): model_num_results = pd.DataFrame(columns=[ 'model_num', 'model_name', 'model_type', 'hyper_parameters_str', 'data_file_name', 'RMSE', 'Raisha', 'Round' ]) joblib.dump(model_num_results, model_num_results_path) for index, row in model_type_versions.iterrows( ): # iterate over all the models to compare # get all model parameters model_type = row['model_type'] model_name = row['model_name'] if leaky_relu: model_name = model_name + '_leaky' model_num += 600 # for 3 losses: if '_avg_turn' in model_type and three_losses: model_num += 700 model_name = row['model_name'] + '_3_losses' avg_turn_gridsearch_params_inner = [{ 'avg_loss': 1.0, 'turn_loss': 1.0, 'avg_turn_loss': 1.0 }, { 'avg_loss': 2.0, 'turn_loss': 2.0, 'avg_turn_loss': 1.0 }, { 'avg_loss': 1.0, 'turn_loss': 1.0, 'avg_turn_loss': 2.0 }] else: avg_turn_gridsearch_params_inner = avg_turn_gridsearch_params function_to_run = row['function_to_run'] data_file_name = row['data_file_name'] test_data_file_name = row['test_data_file_name'] hyper_parameters_str = row['hyper_parameters'] # get hyper parameters as dict if type(hyper_parameters_str) == str: hyper_parameters_dict = json.loads(hyper_parameters_str) else: hyper_parameters_dict = None if hyper_parameters_dict is not None and 'features_max_size' in hyper_parameters_dict.keys( ): if int(hyper_parameters_dict['features_max_size']) > 3000: continue if outer_is_debug: hyper_parameters_dict['num_epochs'] = 2 else: hyper_parameters_dict['num_epochs'] = 100 # each function need to get: model_num, fold, fold_dir, model_type, model_name, data_file_name, # fold_split_dict, table_writer, data_directory, hyper_parameters_dict. # During running it needs to write the predictions to the table_writer and save the trained model with # the name: model_name_model_num to the fold_dir. 
# it needs to return a dict with the final results over the evaluation data: {measure_name: measure} if hyper_parameters_tune_mode: if 'LSTM' in model_type or 'Transformer' in model_type: if 'LSTM' in model_type and 'use_transformer' not in model_type: greadsearch = lstm_gridsearch_params else: # for Transformer models and LSTM_use_transformer models greadsearch = transformer_gridsearch_params for i, parameters_dict in enumerate(greadsearch): # if i > 1: # continue new_hyper_parameters_dict = copy.deepcopy( hyper_parameters_dict) new_hyper_parameters_dict.update(parameters_dict) if 'linear' in model_type and 'lstm_hidden_dim' in new_hyper_parameters_dict: new_hyper_parameters_dict['linear_hidden_dim'] = \ int(0.5 * int(new_hyper_parameters_dict['lstm_hidden_dim'])) if '_avg_turn' in model_type: for inner_i, inner_parameters_dict in enumerate( avg_turn_gridsearch_params_inner): # if inner_i > 0: # break new_hyper_parameters_dict.update( inner_parameters_dict) new_model_name = f'{model_name}' new_model_num = f'{model_num}_{i}_{inner_i}' if os.path.isfile( os.path.join( excel_models_results, f'Results_fold_{fold}_model_{new_model_num}.xlsx' )): continue all_models_results = execute_create_fit_predict_eval_model( function_to_run, new_model_num, fold, fold_dir, model_type, new_model_name, data_file_name, fold_split_dict, table_writer, new_hyper_parameters_dict, excel_models_results, all_models_results, model_num_results_path) else: new_model_name = f'{model_name}' new_model_num = f'{model_num}_{i}' if os.path.isfile( os.path.join( excel_models_results, f'Results_fold_{fold}_model_{new_model_num}.xlsx' )): continue all_models_results = execute_create_fit_predict_eval_model( function_to_run, new_model_num, fold, fold_dir, model_type, new_model_name, data_file_name, fold_split_dict, table_writer, new_hyper_parameters_dict, excel_models_results, all_models_results, model_num_results_path) elif 'SVM' in model_type or 'Baseline' in model_type: num_iterates = 1 if 'baseline' in model_name or 'Baseline' in model_type: svm_gridsearch_params_inner = [{}] else: svm_gridsearch_params_inner = svm_gridsearch_params if 'stratified' in model_name: num_iterates = 5000 for i, parameters_dict in enumerate( svm_gridsearch_params_inner): # if i > 0: # continue new_hyper_parameters_dict = copy.deepcopy( hyper_parameters_dict) new_hyper_parameters_dict.update(parameters_dict) new_model_name = f'{model_name}' new_model_num = f'{model_num}_{i}' if os.path.isfile( os.path.join( excel_models_results, f'Results_fold_{fold}_model_{new_model_num}.xlsx' )): continue all_models_results = execute_create_fit_predict_eval_model( function_to_run, new_model_num, fold, fold_dir, model_type, new_model_name, data_file_name, fold_split_dict, table_writer, new_hyper_parameters_dict, excel_models_results, all_models_results, model_num_results_path, num_iterates=num_iterates) elif 'CRF' in model_type: for i, parameters_dict in enumerate(crf_gridsearch_params): # if i > 0: # continue new_hyper_parameters_dict = copy.deepcopy( hyper_parameters_dict) new_hyper_parameters_dict.update(parameters_dict) new_model_name = f'{model_name}' new_model_num = f'{model_num}_{i}' if os.path.isfile( os.path.join( excel_models_results, f'Results_fold_{fold}_model_{new_model_num}.xlsx' )): continue all_models_results = execute_create_fit_predict_eval_model( function_to_run, new_model_num, fold, fold_dir, model_type, new_model_name, data_file_name, fold_split_dict, table_writer, new_hyper_parameters_dict, excel_models_results, all_models_results, 
model_num_results_path) else: print( 'Model type must be LSTM-kind, Transformer-kind, CRF-kind or SVM-kind' ) # select the best hyper-parameters set for this model based on the RMSE model_num_results = joblib.load(model_num_results_path) if model_num_results.empty: continue argmin_index = model_num_results.RMSE.argmin() best_model = model_num_results.iloc[argmin_index] model_version_num = best_model.model_num logging.info( f'Best model version for model {model_num}-{model_name} in fold {fold} is: ' f'{model_version_num}. Start predict over test data') print( f'Best model version for model {model_num}-{model_name} in fold {fold} is: ' f'{model_version_num}. Start predict over test data') # predict on test data using the best version of this model test_fold_split_dict = dict() test_pair_ids_in_fold = test_participants_fold[f'fold_{fold}'] for data_set in ['train', 'test', 'validation']: test_fold_split_dict[data_set] = \ test_pair_ids_in_fold.loc[test_pair_ids_in_fold == data_set].index.tolist() hyper_parameters_str = best_model.hyper_parameters_str model_folder = run_dir if not os.path.exists( os.path.join(base_directory, 'logs', model_folder, f'fold_{fold}')): if not os.path.exists( os.path.join(base_directory, 'logs', f'{model_folder}_best', f'fold_{fold}')): # the folder we need not exists print( f'fold {fold} in folder {model_folder} is not exists' ) continue else: model_folder = f'{model_folder}_best' # get hyper parameters as dict if type(hyper_parameters_str) == str: hyper_parameters_dict = json.loads(hyper_parameters_str) elif type(hyper_parameters_str) == dict: hyper_parameters_dict = hyper_parameters_str else: hyper_parameters_dict = None print('no hyper parameters dict') num_epochs = hyper_parameters_dict['num_epochs'] model_file_name = f'{model_version_num}_{model_type}_{model_name}_fold_{fold}.pkl' if function_to_run == 'ExecuteEvalLSTM': inner_model_folder = \ f'{model_version_num}_{model_type}_{model_name}_{num_epochs}_epochs_fold_num_{fold}' else: inner_model_folder = '' trained_model_dir = os.path.join(base_directory, 'logs', model_folder, f'fold_{fold}', inner_model_folder) # if torch.cuda.is_available() or function_to_run != 'ExecuteEvalLSTM': trained_model = joblib.load( os.path.join(trained_model_dir, model_file_name)) # else: # trained_model = torch.load(os.path.join(trained_model_dir, model_file_name), # map_location=torch.device('cpu')) metadata_dict = { 'model_num': model_num, 'model_type': model_type, 'model_name': model_name, 'data_file_name': data_file_name, 'test_data_file_name': test_data_file_name, 'hyper_parameters_str': hyper_parameters_dict, 'fold': fold, 'best_model_version_num': model_version_num } metadata_df = pd.DataFrame.from_dict(metadata_dict, orient='index').T model_class = getattr(execute_cv_models, function_to_run)( model_num, fold, test_fold_dir, model_type, model_name, data_file_name, test_fold_split_dict, test_table_writer, data_directory, hyper_parameters_dict, excel_test_models_results, trained_model, trained_model_dir, model_file_name, test_data_file_name, 'test') model_class.load_data_create_model() model_class.predict() results_dict = model_class.eval_model() results_df = pd.DataFrame.from_dict(results_dict).T results_df['raisha_round'] = results_df.index results_df[['Raisha', 'Round' ]] = results_df.raisha_round.str.split(expand=True) results_df = results_df.drop('raisha_round', axis=1) results_df.index = np.zeros(shape=(results_df.shape[0], )) results_df = metadata_df.join(results_df) all_models_prediction_results = pd.concat( 
                    [all_models_prediction_results, results_df], sort=False)
                utils.write_to_excel(model_class.model_table_writer, 'Model results', ['Model results'], results_df)
                model_class.model_table_writer.save()
            else:  # no hyper parameters
                all_models_results = execute_create_fit_predict_eval_model(
                    function_to_run, model_num, fold, fold_dir, model_type, model_name, data_file_name,
                    fold_split_dict, table_writer, hyper_parameters_dict, excel_models_results, all_models_results,
                    model_num_results_path)
    utils.write_to_excel(table_writer, 'All models results', ['All models results'], all_models_results)
    if table_writer is not None:
        table_writer.save()
    if test_table_writer is not None:
        utils.write_to_excel(test_table_writer, 'All models results', ['All models results'],
                             all_models_prediction_results)
        test_table_writer.save()
    logging.info(f'fold {fold} finish compare models')
    print(f'fold {fold} finish compare models')
    return f'fold {fold} finish compare models'
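
# Each model number above accumulates its grid-search results in excel_models_results as
# model_num_results_<num>.pkl, with an RMSE column that drives the selection of the best version. The sketch
# below (illustrative, not part of the original pipeline) gathers those pickles for one fold into a single
# frame for offline inspection; the glob pattern mirrors the file names written above.
def _collect_fold_results_sketch(excel_models_results_dir: str) -> pd.DataFrame:
    """Concatenate all model_num_results_*.pkl files of a fold and sort them by RMSE."""
    import glob

    frames = [joblib.load(pkl_path)
              for pkl_path in sorted(glob.glob(os.path.join(excel_models_results_dir, 'model_num_results_*.pkl')))]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True, sort=False).sort_values(by='RMSE').reset_index(drop=True)
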