def cross_validate_rf(X, y, estimator, max_depth, max_features, flag=False):
    if flag:
        table = PrettyTable()
        table.field_names = ["Fold", "RF Accuracy", "RF AUC"]
        averages = np.array([0.0] * (len(table.field_names) - 1))
        for fold in range(0, N_FOLDS):
            train_index, test_index = load_split_indices(N_FOLD_CV_SPLIT_INDICIES_FP,
                                                         fold_index=fold)
            X_train, y_train = X[train_index], y[train_index]
            X_test, y_test = X[test_index], y[test_index]

            cw = dict(enumerate(class_weight.compute_class_weight('balanced',
                                                                  np.unique(y_train),
                                                                  y_train)))
            random_forest = RandomForestClassifier(n_estimators=estimator,
                                                   max_depth=max_depth,
                                                   max_features=max_features,
                                                   random_state=1,
                                                   class_weight=cw)
            random_forest = random_forest.fit(X_train, y_train)

            predicted = random_forest.predict(X_test)
            rf_accuracy = accuracy_score(y_test, predicted)
            fpr, tpr, thresholds = roc_curve(y_test, predicted)
            rf_auc = auc(fpr, tpr)

            new_row = [round(rf_accuracy, 3), round(rf_auc, 3)]
            table.add_row([fold] + new_row)
            averages += np.array(new_row) / N_FOLDS

        table.add_row(["avg"] + list(map(lambda x: round(x, 3), averages)))
        print(table)
def cross_validate_dt(X, y, max_depth, flag=False):
    if flag:
        table = PrettyTable()
        table.field_names = ["Fold", "DT Accuracy", "DT AUC"]
        averages = np.array([0.0] * (len(table.field_names) - 1))
        for fold in range(0, N_FOLDS):
            train_index, test_index = load_split_indices(N_FOLD_CV_SPLIT_INDICIES_FP,
                                                         fold_index=fold)
            X_train, y_train = X[train_index], y[train_index]
            X_test, y_test = X[test_index], y[test_index]

            cw = dict(enumerate(class_weight.compute_class_weight('balanced',
                                                                  np.unique(y_train),
                                                                  y_train)))
            decision_tree = DecisionTreeClassifier(random_state=1,
                                                   class_weight=cw,
                                                   max_depth=max_depth)
            decision_tree = decision_tree.fit(X_train, y_train)

            predicted = decision_tree.predict(X_test)
            dt_accuracy = accuracy_score(y_test, predicted)
            fpr, tpr, thresholds = roc_curve(y_test, predicted)
            dt_auc = auc(fpr, tpr)

            new_row = [round(dt_accuracy, 3), round(dt_auc, 3)]
            table.add_row([fold] + new_row)
            averages += np.array(new_row) / N_FOLDS

        table.add_row(["avg"] + list(map(lambda x: round(x, 3), averages)))
        print(table)
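# Hypothetical usage sketch (not taken from the original pipeline): the helper below and
# the hyperparameter values it passes are illustrative assumptions, showing how the two
# tree baselines above might be cross-validated with verbose table output.
def run_tree_baselines_example(X, y):
    cross_validate_dt(X, y, max_depth=5, flag=True)
    cross_validate_rf(X, y, estimator=20, max_depth=5, max_features='sqrt', flag=True)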
def cross_validate_combined_rem_d_rem_t(combine_rules_flag=False, DT=True, evaluate_rules_flag=False):
    if combine_rules_flag:
        for fold in range(0, N_FOLDS):
            with open(n_fold_rules_fp(fold), 'rb') as rules_file:
                rules_dnn = pickle.load(rules_file)

            if DT:
                combined_rules_file_path = n_fold_rules_DT_COMB_fp(fold)
                with open(n_fold_rules_DT_fp(fold), 'rb') as rules_file:
                    rules_tree = pickle.load(rules_file)
            else:
                combined_rules_file_path = n_fold_rules_RF_COMB_fp(fold)
                with open(n_fold_rules_RF_fp(fold), 'rb') as rules_file:
                    rules_tree = pickle.load(rules_file)

            combined_rules = Ruleset(rules_dnn).combine_ruleset(Ruleset(rules_tree))

            # Save combined rules
            print('Saving fold %d/%d rules combined...' % (fold, N_FOLDS), end='', flush=True)
            with open(combined_rules_file_path, 'wb') as rules_file:
                pickle.dump(combined_rules, rules_file)
            print('done')

    if evaluate_rules_flag:
        for fold in range(0, N_FOLDS):
            train_index, test_index = load_split_indices(N_FOLD_CV_SPLIT_INDICIES_FP,
                                                         fold_index=fold)
            X_test = np.load(N_FOLD_CV_SPLIT_X_test_data_FP(fold))
            y_test = np.load(N_FOLD_CV_SPLIT_y_test_data_FP(fold))

            # NOTE: the evaluation branch reads the DT-combined rules regardless of the DT flag
            combined_rules_file_path = n_fold_rules_DT_COMB_fp(fold)
            print('Loading extracted rules from disk for fold %d/%d...' % (fold, N_FOLDS),
                  end='', flush=True)
            with open(combined_rules_file_path, 'rb') as rules_file:
                rules = pickle.load(rules_file)
            print('done')

            # Save labels to labels.csv:
            # label - True data labels
            label_data = {'id': test_index, 'true_labels': y_test}
            # label - Rule extraction labels
            rule_predictions = predict(rules, X_test)
            label_data['rule_labels'] = rule_predictions
            pd.DataFrame(data=label_data).to_csv(LABEL_FP, index=False)

            # Evaluate rules
            print('Evaluating rules extracted from fold %d/%d...' % (fold, N_FOLDS),
                  end='', flush=True)
            re_results = evaluate_tree_rules(rules, LABEL_FP)
            print('done')

            # Initialise empty results file
            if fold == 0:
                pd.DataFrame(data=[], columns=['fold']).to_csv(N_FOLD_RESULTS_DT_COMB_FP, index=False)

            results_df = pd.read_csv(N_FOLD_RESULTS_DT_COMB_FP)
            row_index = fold
            results_df.loc[row_index, 'fold'] = fold
            results_df.loc[row_index, 're_acc'] = re_results['acc']
            results_df.loc[row_index, 're_auc'] = re_results['auct']
            results_df.loc[row_index, 'rules_num'] = sum(re_results['n_rules_per_class'])

            avg_rule_length = np.array(re_results['av_n_terms_per_rule'])
            avg_rule_length *= np.array(re_results['n_rules_per_class'])
            avg_rule_length = sum(avg_rule_length)
            avg_rule_length /= sum(re_results['n_rules_per_class'])
            results_df.loc[row_index, 'rules_av_len'] = avg_rule_length

            if fold == N_FOLDS - 1:
                results_df.iloc[:, 1:] = results_df.round(3)
                results_df.loc[N_FOLDS, "fold"] = "average"
                results_df.iloc[N_FOLDS, 1:] = results_df.mean().round(3)
                results_df = results_df[["fold", "re_acc", "re_auc", "rules_num", "rules_av_len"]]

            results_df.to_csv(N_FOLD_RESULTS_DT_COMB_FP, index=False)
def cross_validate_rem_d(extract_rules_flag=False, evaluate_rules_flag=False):
    # Extract rules from the model trained on each fold
    if extract_rules_flag:
        for fold in range(0, N_FOLDS):
            # Path to extracted rules for this fold
            extracted_rules_file_path = n_fold_rules_fp(fold)
            # Path to the neural network model for this fold
            model_file_path = n_fold_model_fp(fold)

            X_train = np.load(N_FOLD_CV_SPLIT_X_train_data_FP(fold))
            X_test = np.load(N_FOLD_CV_SPLIT_X_test_data_FP(fold))
            y_train = np.load(N_FOLD_CV_SPLIT_y_train_data_FP(fold))
            y_test = np.load(N_FOLD_CV_SPLIT_y_test_data_FP(fold))

            # Extract rules
            nn_accuracy, nn_auc, rules, re_time, re_memory = dnn_re.run(X_train, y_train,
                                                                        X_test, y_test,
                                                                        model_file_path)

            # Save rules extracted
            print('Saving fold %d/%d rules extracted...' % (fold, N_FOLDS), end='', flush=True)
            with open(extracted_rules_file_path, 'wb') as rules_file:
                pickle.dump(rules, rules_file)
            print('done')

            # Save rule extraction time and memory usage
            print('Saving fold %d/%d results...' % (fold, N_FOLDS), end='', flush=True)
            # Initialise empty results file
            if fold == 0:
                pd.DataFrame(data=[], columns=['fold']).to_csv(N_FOLD_RESULTS_FP, index=False)

            results_df = pd.read_csv(N_FOLD_RESULTS_FP)
            row_index = fold
            results_df.loc[row_index, 'fold'] = fold
            results_df.loc[row_index, 'nn_acc'] = nn_accuracy
            results_df.loc[row_index, 'nn_auc'] = nn_auc
            results_df.loc[row_index, 're_time (sec)'] = re_time
            results_df.loc[row_index, 're_memory (MB)'] = re_memory
            results_df.to_csv(N_FOLD_RESULTS_FP, index=False)
            print('done')

    # Compute cross-validated results
    if evaluate_rules_flag:
        for fold in range(0, N_FOLDS):
            # Get train and test data folds
            train_index, test_index = load_split_indices(N_FOLD_CV_SPLIT_INDICIES_FP,
                                                         fold_index=fold)
            X_test = np.load(N_FOLD_CV_SPLIT_X_test_data_FP(fold))
            y_test = np.load(N_FOLD_CV_SPLIT_y_test_data_FP(fold))
            # Path to the neural network model for this fold
            model_file_path = n_fold_model_fp(fold)

            # Load extracted rules from disk
            print('Loading extracted rules from disk for fold %d/%d...' % (fold, N_FOLDS),
                  end='', flush=True)
            with open(n_fold_rules_fp(fold), 'rb') as rules_file:
                rules = pickle.load(rules_file)
            print('done')

            # Save labels to labels.csv:
            # label - True data labels
            label_data = {'id': test_index, 'true_labels': y_test}
            # label - Neural network data labels. Use NN to predict X_test
            nn_model = load_model(model_file_path)
            nn_predictions = np.argmax(nn_model.predict(X_test), axis=1)
            label_data['nn_labels'] = nn_predictions
            # label_data['nn_labels'] = nn_model.predict(X_test)
            # label - Rule extraction labels
            rule_predictions = predict(rules, X_test)
            label_data['rule_%s_labels' % RULE_EXTRACTOR.mode] = rule_predictions
            pd.DataFrame(data=label_data).to_csv(LABEL_FP, index=False)

            # Evaluate rules
            print('Evaluating rules extracted from fold %d/%d...' % (fold, N_FOLDS),
                  end='', flush=True)
            re_results = evaluate(rules, LABEL_FP)
            print('done')

            # Save rule extraction evaluation results
            row_index = fold
            results_df = pd.read_csv(N_FOLD_RESULTS_FP)
            results_df.loc[row_index, 're_acc'] = re_results['acc']
            results_df.loc[row_index, 're_auc'] = re_results['aucr']
            results_df.loc[row_index, 're_fid'] = re_results['fid']
            results_df.loc[row_index, 'rules_num'] = sum(re_results['n_rules_per_class'])

            avg_rule_length = np.array(re_results['av_n_terms_per_rule'])
            avg_rule_length *= np.array(re_results['n_rules_per_class'])
            avg_rule_length = sum(avg_rule_length)
            avg_rule_length /= sum(re_results['n_rules_per_class'])
            results_df.loc[row_index, 'rules_av_len'] = avg_rule_length

            if fold == N_FOLDS - 1:
                results_df.iloc[:, 1:] = results_df.round(3)
                results_df.loc[N_FOLDS, "fold"] = "average"
                results_df.iloc[N_FOLDS, 1:] = results_df.mean().round(3)
                results_df = results_df[["fold", "nn_acc", "nn_auc", "re_acc", "re_auc", "re_fid",
                                         "re_time (sec)", "re_memory (MB)", "rules_num",
                                         "rules_av_len"]]

            results_df.to_csv(N_FOLD_RESULTS_FP, index=False)
def cross_validated_rem_t(X, y, extract_evaluate_rules_flag=False, DT=False):
    if extract_evaluate_rules_flag:
        for fold in range(0, N_FOLDS):
            train_index, test_index = load_split_indices(N_FOLD_CV_SPLIT_INDICIES_FP,
                                                         fold_index=fold)
            X_train, y_train = X[train_index], y[train_index]
            X_test, y_test = X[test_index], y[test_index]

            # Map feature names to column ids for the tree rule extractor
            main_df = pd.read_csv(DATA_FP)
            main_df = main_df.drop([DATASET_INFO.target_col], axis=1)
            feat_list = list(main_df.columns)
            feature_names_to_id_map = dict(zip(feat_list, range(len(feat_list))))
            # for key in feature_names_to_id_map:
            #     feature_names_to_id_map[key] += 1000

            max_depth = None
            n_estimators = 20

            if DT:
                extracted_rules_file_path = n_fold_rules_DT_fp(fold)
                tree_accuracy, tree_auc, rules = tree_re.run_dt(X_train, y_train, X_test, y_test,
                                                                feature_names_to_id_map,
                                                                DATASET_INFO.output_classes,
                                                                max_depth)
                N_FOLD_RESULTS_tree_FP = N_FOLD_RESULTS_DT_FP
            else:
                extracted_rules_file_path = n_fold_rules_RF_fp(fold)
                tree_accuracy, tree_auc, rules = tree_re.run_rf(X_train, y_train, X_test, y_test,
                                                                feature_names_to_id_map,
                                                                DATASET_INFO.output_classes,
                                                                n_estimators, max_depth)
                N_FOLD_RESULTS_tree_FP = N_FOLD_RESULTS_RF_FP

            # Save rules extracted
            print('Saving fold %d/%d rules extracted...' % (fold, N_FOLDS), end='', flush=True)
            with open(extracted_rules_file_path, 'wb') as rules_file:
                pickle.dump(rules, rules_file)
            print('done')

            # Save labels to labels.csv:
            # label - True data labels
            label_data = {'id': test_index, 'true_labels': y_test}
            # label - Rule extraction labels
            rule_predictions = predict(rules, X_test)
            label_data['rule_labels'] = rule_predictions
            pd.DataFrame(data=label_data).to_csv(LABEL_FP, index=False)

            print('Evaluating rules extracted from fold %d/%d...' % (fold, N_FOLDS),
                  end='', flush=True)
            re_results = evaluate_tree_rules(rules, LABEL_FP)
            print('done')

            # Initialise empty results file
            if fold == 0:
                pd.DataFrame(data=[], columns=['fold']).to_csv(N_FOLD_RESULTS_tree_FP, index=False)

            results_df = pd.read_csv(N_FOLD_RESULTS_tree_FP)
            row_index = fold
            results_df.loc[row_index, 'fold'] = fold
            results_df.loc[row_index, 're_acc'] = re_results['acc']
            results_df.loc[row_index, 're_auc'] = re_results['auct']
            results_df.loc[row_index, 'rules_num'] = sum(re_results['n_rules_per_class'])

            avg_rule_length = np.array(re_results['av_n_terms_per_rule'])
            avg_rule_length *= np.array(re_results['n_rules_per_class'])
            avg_rule_length = sum(avg_rule_length)
            avg_rule_length /= sum(re_results['n_rules_per_class'])
            results_df.loc[row_index, 'rules_av_len'] = avg_rule_length

            if fold == N_FOLDS - 1:
                results_df.iloc[:, 1:] = results_df.round(3)
                results_df.loc[N_FOLDS, "fold"] = "average"
                results_df.iloc[N_FOLDS, 1:] = results_df.mean().round(3)
                results_df = results_df[["fold", "re_acc", "re_auc", "rules_num", "rules_av_len"]]

            results_df.to_csv(N_FOLD_RESULTS_tree_FP, index=False)
def cross_validate_rem_d_ranking_elimination(rank_rules_flag=False, rule_elimination=False,
                                             percentage=0, evaluate_rules_flag=False):
    if rank_rules_flag:
        for fold in range(0, N_FOLDS):
            X_train = np.load(N_FOLD_CV_SPLIT_X_train_data_FP(fold))
            y_train = np.load(N_FOLD_CV_SPLIT_y_train_data_FP(fold))

            extracted_rules_file_path = n_fold_rules_fp(fold)
            with open(extracted_rules_file_path, 'rb') as rules_file:
                rules = pickle.load(rules_file)

            # Score every rule on the training data
            for rule in rules:
                rank_rule_scores(rule, X_train, y_train, use_rl=True)

            clear_file(extracted_rules_file_path)
            print('Saving fold %d/%d rules after scoring...' % (fold, N_FOLDS), end='', flush=True)
            with open(extracted_rules_file_path, 'wb') as rules_file:
                pickle.dump(rules, rules_file)

    if rule_elimination:
        for fold in range(0, N_FOLDS):
            extracted_rules_file_path = n_fold_rules_fp(fold)
            remaining_rules = eliminate_rules(extracted_rules_file_path, percentage)

            # Save remaining rules
            print('Saving fold %d/%d remaining rules ...' % (fold, N_FOLDS), end='', flush=True)
            with open(n_fold_rules_fp_remaining(N_FOLD_RULES_REMAINING_DP, fold)(percentage),
                      'wb') as rules_file:
                pickle.dump(remaining_rules, rules_file)
            print('done')

    if evaluate_rules_flag:
        for fold in range(0, N_FOLDS):
            train_index, test_index = load_split_indices(N_FOLD_CV_SPLIT_INDICIES_FP,
                                                         fold_index=fold)
            X_test = np.load(N_FOLD_CV_SPLIT_X_test_data_FP(fold))
            y_test = np.load(N_FOLD_CV_SPLIT_y_test_data_FP(fold))
            model_file_path = n_fold_model_fp(fold)

            print('Loading extracted rules from disk for fold %d/%d...' % (fold, N_FOLDS),
                  end='', flush=True)
            with open(n_fold_rules_fp_remaining(N_FOLD_RULES_REMAINING_DP, fold)(percentage),
                      'rb') as rules_file:
                rules = pickle.load(rules_file)
            print('done')

            # Initialise empty results file
            if fold == 0:
                pd.DataFrame(data=[], columns=['fold']).to_csv(
                    N_FOLD_RESULTS_FP_REMAINING(percentage), index=False)

            results_df = pd.read_csv(N_FOLD_RESULTS_FP_REMAINING(percentage))
            row_index = fold
            results_df.loc[row_index, 'fold'] = fold

            # Save labels to labels.csv
            label_data = {'id': test_index, 'true_labels': y_test}
            nn_model = load_model(model_file_path)
            nn_predictions = np.argmax(nn_model.predict(X_test), axis=1)
            label_data['nn_labels'] = nn_predictions
            rule_predictions = predict(rules, X_test)
            label_data['rule_%s_labels' % RULE_EXTRACTOR.mode] = rule_predictions
            pd.DataFrame(data=label_data).to_csv(LABEL_FP, index=False)

            # Evaluate rules
            print('Evaluating rules remaining from fold %d/%d...' % (fold, N_FOLDS),
                  end='', flush=True)
            re_results = evaluate(rules, LABEL_FP)
            print('done')

            # Save rule extraction evaluation results
            row_index = fold
            results_df.loc[row_index, 're_acc'] = re_results['acc']
            results_df.loc[row_index, 're_auc'] = re_results['aucr']
            results_df.loc[row_index, 're_fid'] = re_results['fid']
            results_df.loc[row_index, 'rules_num'] = sum(re_results['n_rules_per_class'])

            avg_rule_length = np.array(re_results['av_n_terms_per_rule'])
            avg_rule_length *= np.array(re_results['n_rules_per_class'])
            avg_rule_length = sum(avg_rule_length)
            avg_rule_length /= sum(re_results['n_rules_per_class'])
            results_df.loc[row_index, 'rules_av_len'] = avg_rule_length

            if fold == N_FOLDS - 1:
                results_df.iloc[:, 1:] = results_df.round(3)
                results_df.loc[N_FOLDS, "fold"] = "average"
                results_df.iloc[N_FOLDS, 1:] = results_df.mean().round(3)
                results_df = results_df[["fold", "re_acc", "re_auc", "rules_num", "rules_av_len"]]

            results_df.to_csv(N_FOLD_RESULTS_FP_REMAINING(percentage), index=False)
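# Hypothetical driver sketch (an assumption, not part of the original code): rank the
# extracted REM-D rules once, then sweep a few elimination percentages to observe how
# accuracy and fidelity change as lower-ranked rules are dropped.
def run_rule_elimination_sweep_example(percentages=(10, 25, 50)):
    cross_validate_rem_d_ranking_elimination(rank_rules_flag=True)
    for pct in percentages:
        cross_validate_rem_d_ranking_elimination(rule_elimination=True,
                                                 percentage=pct,
                                                 evaluate_rules_flag=True)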
def run(X, y, hyperparameters):
    train_index, test_index = split_data.load_split_indices(file_path=NN_INIT_SPLIT_INDICES_FP)
    # Split data
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    # Save information about nn initialisation
    if not os.path.exists(NN_INIT_RE_RESULTS_FP):
        pd.DataFrame(data=[], columns=['run']).to_csv(NN_INIT_RE_RESULTS_FP, index=False)

    # Path to trained neural network
    model_file_path = TEMP_DIR + 'model.h5'

    # Smallest ruleset i.e. total number of rules
    smallest_ruleset_size = float('inf')
    smallest_ruleset_acc = 0
    best_init_index = 0

    for i in range(0, 5):
        print('Testing initialisation %d' % i)

        # Build and train nn and store it in temp/
        build_and_train_model(X_train, y_train, X_test, y_test, **hyperparameters,
                              model_file_path=model_file_path)

        # Extract rules
        nn_accuracy, nn_AUC, rules, re_time, re_memory = dnn_re.run(X, y, train_index, test_index,
                                                                    model_file_path)

        # Save labels to labels.csv:
        # label - True data labels
        label_data = {'id': test_index, 'true_labels': y_test}
        # label - Neural network data labels. Use NN to predict X_test
        nn_model = tf.keras.models.load_model(model_file_path)
        nn_predictions = np.argmax(nn_model.predict(X_test), axis=1)
        label_data['nn_labels'] = nn_predictions
        # label - Rule extraction labels
        rule_predictions = predict(rules, X_test)
        label_data['rule_%s_labels' % RULE_EXTRACTOR.mode] = rule_predictions
        pd.DataFrame(data=label_data).to_csv(LABEL_FP, index=False)

        # Save rule extraction time and memory usage
        results_df = pd.read_csv(NN_INIT_RE_RESULTS_FP)
        results_df.loc[i, 'run'] = i
        results_df.loc[i, 're_time (sec)'] = re_time

        re_results = evaluate(rules, LABEL_FP)
        results_df.loc[i, 'nn_acc'] = nn_accuracy
        results_df.loc[i, 're_acc'] = re_results['acc']
        results_df.loc[i, 're_fid'] = re_results['fid']
        results_df.loc[i, 'rules_num'] = sum(re_results['n_rules_per_class'])
        results_df = results_df.round(3)
        results_df = results_df[["run", "nn_acc", "re_acc", "re_fid", "re_time (sec)", "rules_num"]]
        results_df.to_csv(NN_INIT_RE_RESULTS_FP, index=False)

        # If this initialisation extracts a smaller ruleset - save it
        ruleset_size = sum(re_results['n_rules_per_class'])
        if (ruleset_size < smallest_ruleset_size) \
                or (ruleset_size == smallest_ruleset_size and re_results['acc'] > smallest_ruleset_acc):
            smallest_ruleset_size = ruleset_size
            smallest_ruleset_acc = re_results['acc']
            best_init_index = i

            # Save initialisation as best_initialisation.h5
            tf.keras.models.load_model(TEMP_DIR + 'initialisation.h5').save(BEST_NN_INIT_FP)

    print('Found neural network with the best initialisation. (%d)' % best_init_index)
    print('==================================================================================================')
def run(X, y, split_data_flag=False, grid_search_flag=False, find_best_initialisation_flag=False,
        generate_fold_data_flag=False):
    """
    Args:
        split_data_flag: Split data. Only do this once!
        grid_search_flag: Grid search to find the best neural network hyperparameters.
        find_best_initialisation_flag: Find the best neural network initialisation.
        generate_fold_data_flag: Generate neural networks for each data fold.
    """
    print(N_FOLDS)

    # 1. Split data into train and test. Only do this once
    if split_data_flag:
        print('Splitting data. WARNING: only do this once!')
        split_data.train_test_split(X=X, y=y, test_size=0.2)
        split_data.stratified_k_fold(X=X, y=y, n_folds=N_FOLDS)

    # 2. Grid search over neural network hyperparameters to find the optimal configuration
    if grid_search_flag:
        print('Performing grid search over hyperparameters. WARNING: this is very expensive')
        scaler = MinMaxScaler()
        X_scaled = scaler.fit_transform(X)
        grid_search(X=X_scaled, y=y)

    # TODO change this to read best grid search hyperparameters from disk
    nn_hyperparameters = OrderedDict(batch_size=BATCH_SIZE, epochs=EPOCHS, layer_1=LAYER_1,
                                     layer_2=LAYER_2)

    # 3. Initialise 5 neural networks using 1 train test split
    #    Pick the initialisation that yields the smallest ruleset
    if find_best_initialisation_flag:
        find_best_nn_initialisation.run(X, y, nn_hyperparameters)

    # 4. Build a neural network for each fold using the best initialisation found above
    if generate_fold_data_flag:
        for fold in range(0, N_FOLDS):
            print('Training model %d/%d' % (fold, N_FOLDS))

            # Split data using precomputed split indices
            train_index, test_index = load_split_indices(N_FOLD_CV_SPLIT_INDICIES_FP,
                                                         fold_index=fold)
            X_train, y_train = X[train_index], y[train_index]
            X_test, y_test = X[test_index], y[test_index]

            scaler = MinMaxScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
            # X_train_GE = scaler.fit_transform(X_train[:, :1000])
            # X_test_GE = scaler.transform(X_test[:, :1000])
            # X_train_ClinP = X_train[:, 1000:1013]
            # X_test_ClinP = X_test[:, 1000:1013]
            # X_train = np.concatenate((X_train_GE, X_train_ClinP), axis=1)
            # X_test = np.concatenate((X_test_GE, X_test_ClinP), axis=1)

            X_train_path = N_FOLD_CV_SPLIT_X_train_data_FP(fold)
            y_train_path = N_FOLD_CV_SPLIT_y_train_data_FP(fold)
            X_test_path = N_FOLD_CV_SPLIT_X_test_data_FP(fold)
            y_test_path = N_FOLD_CV_SPLIT_y_test_data_FP(fold)

            # Saving scaled data
            np.save(X_train_path, X_train)
            np.save(y_train_path, y_train)
            np.save(X_test_path, X_test)
            np.save(y_test_path, y_test)

            # Model to be stored in <dataset name>\cross_validation\<n>_folds\trained_models\
            model_file_path = n_fold_model_fp(fold)
            build_and_train_model(X_train, y_train, X_test, y_test, **nn_hyperparameters,
                                  model_file_path=model_file_path,
                                  with_best_initilisation_flag=False)

    # Remove files from temp/
    clean_up()
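# Hypothetical entry point (an assumption for illustration; the original module may be
# invoked differently). Data loading mirrors the pattern used in cross_validated_rem_t,
# where the target column is dropped from the dataframe read from DATA_FP.
if __name__ == '__main__':
    data_df = pd.read_csv(DATA_FP)
    y = data_df[DATASET_INFO.target_col].values
    X = data_df.drop([DATASET_INFO.target_col], axis=1).values
    run(X, y,
        split_data_flag=False,
        grid_search_flag=False,
        find_best_initialisation_flag=False,
        generate_fold_data_flag=True)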