def cross_validate_rf(X, y, estimator, max_depth, max_features, flag=False):
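    """Run N_FOLDS-fold cross-validation of a random forest on the precomputed split indices.

    Trains a class-weighted RandomForestClassifier per fold and prints a PrettyTable of
    per-fold accuracy and AUC plus their averages. Does nothing unless flag=True.
    """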
    if flag:
        table = PrettyTable()
        table.field_names = ["Fold", "RF Accuracy", "RF AUC"]
        averages = np.array([0.0] * (len(table.field_names) - 1))

        for fold in range(0, N_FOLDS):
            train_index, test_index = load_split_indices(N_FOLD_CV_SPLIT_INDICIES_FP, fold_index=fold)
            X_train, y_train = X[train_index], y[train_index]
            X_test, y_test = X[test_index], y[test_index]

            cw = dict(enumerate(class_weight.compute_class_weight(
                'balanced', classes=np.unique(y_train), y=y_train)))
            random_forest = RandomForestClassifier(n_estimators=estimator, max_depth=max_depth,
                                                   max_features=max_features, random_state=1, class_weight=cw)
            random_forest = random_forest.fit(X_train, y_train)
            predicted = random_forest.predict(X_test)

            rf_accuracy = accuracy_score(y_test, predicted)
            fpr, tpr, thresholds = roc_curve(y_test, predicted)
            rf_auc = auc(fpr, tpr)

            new_row = [
                round(rf_accuracy, 3),
                round(rf_auc, 3)]
            table.add_row([fold] + new_row)
            averages += np.array(new_row) / N_FOLDS

        table.add_row(
            ["avg"] +
            list(map(lambda x: round(x, 3), averages))
        )
        print(table)
def cross_validate_dt(X, y, max_depth, flag=False):
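    """Run N_FOLDS-fold cross-validation of a class-weighted decision tree and print a
    PrettyTable of per-fold accuracy and AUC plus their averages. Does nothing unless flag=True.
    """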
    if flag:
        table = PrettyTable()
        table.field_names = ["Fold", "DT Accuracy", 'DT AUC']
        averages = np.array([0.0] * (len(table.field_names) - 1))

        for fold in range(0, N_FOLDS):
            train_index, test_index = load_split_indices(N_FOLD_CV_SPLIT_INDICIES_FP, fold_index=fold)
            X_train, y_train = X[train_index], y[train_index]
            X_test, y_test = X[test_index], y[test_index]

            cw = dict(enumerate(class_weight.compute_class_weight(
                'balanced', classes=np.unique(y_train), y=y_train)))
            decision_tree = DecisionTreeClassifier(random_state=1, class_weight=cw, max_depth=max_depth)
            decision_tree = decision_tree.fit(X_train, y_train)
            predicted = decision_tree.predict(X_test)

            dt_accuracy = accuracy_score(y_test, predicted)
            fpr, tpr, thresholds = roc_curve(y_test, predicted)
            dt_auc = auc(fpr, tpr)

            new_row = [
                round(dt_accuracy, 3),
                round(dt_auc, 3)]
            table.add_row([fold] + new_row)
            averages += np.array(new_row) / N_FOLDS

        table.add_row(
            ["avg"] +
            list(map(lambda x: round(x, 3), averages))
        )
        print(table)
def cross_validate_combined_rem_d_rem_t(combine_rules_flag=False, DT=True, evaluate_rules_flag=False):
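    """Combine REM-D rules with REM-T rules per fold and evaluate the result.

    combine_rules_flag: merge each fold's DNN ruleset with its tree ruleset and pickle it.
    DT: combine with decision-tree rules if True, random-forest rules otherwise.
    evaluate_rules_flag: evaluate the combined DNN + decision-tree rulesets and write
    per-fold accuracy, AUC and rule statistics to CSV.
    """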
    if combine_rules_flag:
        for fold in range(0, N_FOLDS):
            with open(n_fold_rules_fp(fold), 'rb') as rules_file:
                rules_dnn = pickle.load(rules_file)

            if DT:
                combined_rules_file_path = n_fold_rules_DT_COMB_fp(fold)
                with open(n_fold_rules_DT_fp(fold), 'rb') as rules_file:
                    rules_tree = pickle.load(rules_file)
            else:
                combined_rules_file_path = n_fold_rules_RF_COMB_fp(fold)
                with open(n_fold_rules_RF_fp(fold), 'rb') as rules_file:
                    rules_tree = pickle.load(rules_file)

            combined_rules = Ruleset(rules_dnn).combine_ruleset(Ruleset(rules_tree))

            # Save rules combined
            print('Saving fold %d/%d rules combined...' % (fold, N_FOLDS), end='', flush=True)
            with open(combined_rules_file_path, 'wb') as rules_file:
                pickle.dump(combined_rules, rules_file)
            print('done')


    if evaluate_rules_flag:
        for fold in range(0, N_FOLDS):
            train_index, test_index = load_split_indices(N_FOLD_CV_SPLIT_INDICIES_FP, fold_index=fold)
            X_test = np.load(N_FOLD_CV_SPLIT_X_test_data_FP(fold))
            y_test = np.load(N_FOLD_CV_SPLIT_y_test_data_FP(fold))

            combined_rules_file_path = n_fold_rules_DT_COMB_fp(fold)

            print('Loading extracted rules from disk for fold %d/%d...' % (fold, N_FOLDS), end='', flush=True)
            with open(combined_rules_file_path, 'rb') as rules_file:
                rules = pickle.load(rules_file)
            print('done')

            # Save labels to labels.csv:
            # label - True data labels
            label_data = {'id': test_index,
                          'true_labels': y_test}

            # label - Rule extraction labels
            rule_predictions = predict(rules, X_test)
            label_data['rule_labels'] = rule_predictions
            pd.DataFrame(data=label_data).to_csv(LABEL_FP, index=False)

            # Evaluate rules
            print('Evaluating rules extracted from fold %d/%d...' % (fold, N_FOLDS), end='', flush=True)
            re_results = evaluate_tree_rules(rules, LABEL_FP)
            print('done')


            # Initialise empty results file
            if fold == 0:
                pd.DataFrame(data=[], columns=['fold']).to_csv(N_FOLD_RESULTS_DT_COMB_FP, index=False)

            results_df = pd.read_csv(N_FOLD_RESULTS_DT_COMB_FP)
            row_index = fold
            results_df.loc[row_index, 'fold'] = fold
            results_df.loc[row_index, 're_acc'] = re_results['acc']
            results_df.loc[row_index, 're_auc'] = re_results['auct']
            results_df.loc[row_index, 'rules_num'] = sum(re_results['n_rules_per_class'])
            avg_rule_length = np.array(re_results['av_n_terms_per_rule'])
            avg_rule_length *= np.array(re_results['n_rules_per_class'])
            avg_rule_length = sum(avg_rule_length)
            avg_rule_length /= sum(re_results['n_rules_per_class'])
            results_df.loc[row_index, 'rules_av_len'] = avg_rule_length

            if fold == N_FOLDS - 1:
                results_df.iloc[:, 1:] = results_df.round(3)
                results_df.loc[N_FOLDS, "fold"] = "average"
                results_df.iloc[N_FOLDS, 1:] = results_df.mean(numeric_only=True).round(3)

            results_df = results_df[["fold", "re_acc", "re_auc", "rules_num", "rules_av_len"]]

            results_df.to_csv(N_FOLD_RESULTS_DT_COMB_FP, index=False)
def cross_validate_rem_d(extract_rules_flag=False, evaluate_rules_flag=False):
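    """Cross-validate REM-D rule extraction from the per-fold neural networks.

    extract_rules_flag: run dnn_re on each fold's model, pickle the extracted rules and
    record extraction time and memory usage.
    evaluate_rules_flag: evaluate the extracted rules against the true and network labels
    and append accuracy, AUC, fidelity and rule statistics to the results CSV.
    """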
    # Extract rules from model from each fold
    if extract_rules_flag:
        for fold in range(0, N_FOLDS):
            # Path to extracted rules from that fold
            extracted_rules_file_path = n_fold_rules_fp(fold)

            # Path to neural network model for this fold
            model_file_path = n_fold_model_fp(fold)

            X_train = np.load(N_FOLD_CV_SPLIT_X_train_data_FP(fold))
            X_test = np.load(N_FOLD_CV_SPLIT_X_test_data_FP(fold))
            y_train = np.load(N_FOLD_CV_SPLIT_y_train_data_FP(fold))
            y_test = np.load(N_FOLD_CV_SPLIT_y_test_data_FP(fold))

            # Extract rules
            nn_accuracy, nn_auc, rules, re_time, re_memory = dnn_re.run(
                X_train, y_train, X_test, y_test, model_file_path)

            # Save rules extracted
            print('Saving fold %d/%d rules extracted...' % (fold, N_FOLDS), end='', flush=True)
            with open(extracted_rules_file_path, 'wb') as rules_file:
                pickle.dump(rules, rules_file)
            print('done')

            # Save rule extraction time and memory usage
            print('Saving fold %d/%d results...' % (fold, N_FOLDS), end='', flush=True)
            # Initialise empty results file
            if fold == 0:
                pd.DataFrame(data=[], columns=['fold']).to_csv(N_FOLD_RESULTS_FP, index=False)

            results_df = pd.read_csv(N_FOLD_RESULTS_FP)
            row_index = fold
            results_df.loc[row_index, 'fold'] = fold
            results_df.loc[row_index, 'nn_acc'] = nn_accuracy
            results_df.loc[row_index, 'nn_auc'] = nn_auc
            results_df.loc[row_index, 're_time (sec)'] = re_time
            results_df.loc[row_index, 're_memory (MB)'] = re_memory
            results_df.to_csv(N_FOLD_RESULTS_FP, index=False)
            print('done')

    # Compute cross-validated results
    if evaluate_rules_flag:
        for fold in range(0, N_FOLDS):
            # Get train and test data folds
            train_index, test_index = load_split_indices(N_FOLD_CV_SPLIT_INDICIES_FP, fold_index=fold)

            X_test = np.load(N_FOLD_CV_SPLIT_X_test_data_FP(fold))
            y_test = np.load(N_FOLD_CV_SPLIT_y_test_data_FP(fold))

            # Path to neural network model for this fold
            model_file_path = n_fold_model_fp(fold)

            # Load extracted rules from disk
            print('Loading extracted rules from disk for fold %d/%d...' % (fold, N_FOLDS), end='', flush=True)
            with open(n_fold_rules_fp(fold), 'rb') as rules_file:
                rules = pickle.load(rules_file)
            print('done')

            # Save labels to labels.csv:
            # label - True data labels
            label_data = {'id': test_index,
                          'true_labels': y_test}
            # label - Neural network data labels. Use NN to predict X_test
            nn_model = load_model(model_file_path)

            nn_predictions = np.argmax(nn_model.predict(X_test), axis=1)
            label_data['nn_labels'] = nn_predictions
            # label_data['nn_labels'] = nn_model.predict(X_test)


            # label - Rule extraction labels
            rule_predictions = predict(rules, X_test)
            label_data['rule_%s_labels' % RULE_EXTRACTOR.mode] = rule_predictions
            pd.DataFrame(data=label_data).to_csv(LABEL_FP, index=False)

            # Evaluate rules
            print('Evaluating rules extracted from fold %d/%d...' % (fold, N_FOLDS), end='', flush=True)
            re_results = evaluate(rules, LABEL_FP)
            print('done')

            # Save rule extraction evaluation results
            row_index = fold
            results_df = pd.read_csv(N_FOLD_RESULTS_FP)
            results_df.loc[row_index, 're_acc'] = re_results['acc']
            results_df.loc[row_index, 're_auc'] = re_results['aucr']
            results_df.loc[row_index, 're_fid'] = re_results['fid']
            results_df.loc[row_index, 'rules_num'] = sum(re_results['n_rules_per_class'])
            avg_rule_length = np.array(re_results['av_n_terms_per_rule'])
            avg_rule_length *= np.array(re_results['n_rules_per_class'])
            avg_rule_length = sum(avg_rule_length)
            avg_rule_length /= sum(re_results['n_rules_per_class'])
            results_df.loc[row_index, 'rules_av_len'] = avg_rule_length

            if fold == N_FOLDS - 1:
                results_df.iloc[:, 1:] = results_df.round(3)
                results_df.loc[N_FOLDS, "fold"] = "average"
                results_df.iloc[N_FOLDS, 1:] = results_df.mean(numeric_only=True).round(3)

            results_df = results_df[
                ["fold", "nn_acc", "nn_auc", "re_acc", "re_auc", "re_fid", "re_time (sec)", "re_memory (MB)", "rules_num",
                 "rules_av_len"]]

            results_df.to_csv(N_FOLD_RESULTS_FP, index=False)
def cross_validated_rem_t(X, y, extract_evaluate_rules_flag=False, DT=False):
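    """Cross-validate REM-T: extract rules from a decision tree (DT=True) or a 20-tree
    random forest (DT=False) trained on each fold, evaluate them on the fold's test data,
    and write per-fold accuracy, AUC and rule statistics to CSV.
    """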
    if extract_evaluate_rules_flag:
        for fold in range(0, N_FOLDS):
            train_index, test_index = load_split_indices(N_FOLD_CV_SPLIT_INDICIES_FP, fold_index=fold)
            X_train, y_train = X[train_index], y[train_index]
            X_test, y_test = X[test_index], y[test_index]


            main_df = pd.read_csv(DATA_FP)
            main_df = main_df.drop([DATASET_INFO.target_col], axis=1)

            feat_list = list(main_df.columns)
            feature_names_to_id_map = dict(zip(feat_list, range(len(feat_list))))
            # for key in feature_names_to_id_map:
            #     feature_names_to_id_map[key] += 1000

            max_depth = None
            n_estimators = 20

            if DT:
                extracted_rules_file_path = n_fold_rules_DT_fp(fold)
                # tree_auc avoids shadowing sklearn's auc() used elsewhere in this module
                accuracy, tree_auc, rules = tree_re.run_dt(
                    X_train, y_train, X_test, y_test, feature_names_to_id_map,
                    DATASET_INFO.output_classes, max_depth)
                N_FOLD_RESULTS_tree_FP = N_FOLD_RESULTS_DT_FP
            else:
                extracted_rules_file_path = n_fold_rules_RF_fp(fold)
                accuracy, tree_auc, rules = tree_re.run_rf(
                    X_train, y_train, X_test, y_test, feature_names_to_id_map,
                    DATASET_INFO.output_classes, n_estimators, max_depth)
                N_FOLD_RESULTS_tree_FP = N_FOLD_RESULTS_RF_FP


            # Save rules extracted
            print('Saving fold %d/%d rules extracted...' % (fold, N_FOLDS), end='', flush=True)
            with open(extracted_rules_file_path, 'wb') as rules_file:
                pickle.dump(rules, rules_file)
            print('done')

            # Save labels to labels.csv:
            # label - True data labels
            label_data = {'id': test_index,
                          'true_labels': y_test}

            # label - Rule extraction labels
            rule_predictions = predict(rules, X_test)
            label_data['rule_labels'] = rule_predictions
            pd.DataFrame(data=label_data).to_csv(LABEL_FP, index=False)

            print('Evaluating rules extracted from fold %d/%d...' % (fold, N_FOLDS), end='', flush=True)
            re_results = evaluate_tree_rules(rules, LABEL_FP)
            print('done')

            # Initialise empty results file
            if fold == 0:
                pd.DataFrame(data=[], columns=['fold']).to_csv(N_FOLD_RESULTS_tree_FP, index=False)

            results_df = pd.read_csv(N_FOLD_RESULTS_tree_FP)
            row_index = fold
            results_df.loc[row_index, 'fold'] = fold
            results_df.loc[row_index, 're_acc'] = re_results['acc']
            results_df.loc[row_index, 're_auc'] = re_results['auct']
            results_df.loc[row_index, 'rules_num'] = sum(re_results['n_rules_per_class'])
            avg_rule_length = np.array(re_results['av_n_terms_per_rule'])
            avg_rule_length *= np.array(re_results['n_rules_per_class'])
            avg_rule_length = sum(avg_rule_length)
            avg_rule_length /= sum(re_results['n_rules_per_class'])
            results_df.loc[row_index, 'rules_av_len'] = avg_rule_length


            if fold == N_FOLDS - 1:
                results_df.iloc[:, 1:] = results_df.round(3)
                results_df.loc[N_FOLDS, "fold"] = "average"
                results_df.iloc[N_FOLDS, 1:] = results_df.mean(numeric_only=True).round(3)


            results_df = results_df[["fold",  "re_acc", "re_auc", "rules_num", "rules_av_len"]]

            results_df.to_csv(N_FOLD_RESULTS_tree_FP, index=False)
def cross_validate_rem_d_ranking_elimination(rank_rules_flag=False, rule_elimination=False, percentage=0,
                                             evaluate_rules_flag=False):
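    """Rank, prune and re-evaluate REM-D rules across folds.

    rank_rules_flag: score each extracted rule on the fold's training data and re-pickle the ruleset.
    rule_elimination: call eliminate_rules to discard a `percentage` of the ranked rules and
    pickle the remaining ones.
    evaluate_rules_flag: evaluate the remaining rules and write per-fold results to CSV.
    """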
    if rank_rules_flag:
        for fold in range(0, N_FOLDS):
            X_train = np.load(N_FOLD_CV_SPLIT_X_train_data_FP(fold))
            y_train = np.load(N_FOLD_CV_SPLIT_y_train_data_FP(fold))

            extracted_rules_file_path = n_fold_rules_fp(fold)

            with open(extracted_rules_file_path, 'rb') as rules_file:
                rules = pickle.load(rules_file)

            for rule in rules:
                rank_rule_scores(rule, X_train, y_train, use_rl=True)

            clear_file(extracted_rules_file_path)
            print('Saving fold %d/%d rules after scoring...' % (fold, N_FOLDS), end='', flush=True)
            with open(extracted_rules_file_path, 'wb') as rules_file:
                pickle.dump(rules, rules_file)
            print('done')

    if rule_elimination:
        for fold in range(0, N_FOLDS):
            extracted_rules_file_path = n_fold_rules_fp(fold)
            remaining_rules = eliminate_rules(extracted_rules_file_path, percentage)

            # Save remaining rules
            print('Saving fold %d/%d remaining rules ...' % (fold, N_FOLDS), end='', flush=True)
            with open(n_fold_rules_fp_remaining(N_FOLD_RULES_REMAINING_DP, fold)(percentage), 'wb') as rules_file:
                pickle.dump(remaining_rules, rules_file)
            print('done')

    if evaluate_rules_flag:
        for fold in range(0, N_FOLDS):
            train_index, test_index = load_split_indices(N_FOLD_CV_SPLIT_INDICIES_FP, fold_index=fold)

            X_test = np.load(N_FOLD_CV_SPLIT_X_test_data_FP(fold))
            y_test = np.load(N_FOLD_CV_SPLIT_y_test_data_FP(fold))

            model_file_path = n_fold_model_fp(fold)

            print('Loading extracted rules from disk for fold %d/%d...' % (fold, N_FOLDS), end='', flush=True)
            with open(n_fold_rules_fp_remaining(N_FOLD_RULES_REMAINING_DP, fold)(percentage), 'rb') as rules_file:
                rules = pickle.load(rules_file)
            print('done')

            # Initialise empty results file
            if fold == 0:
                pd.DataFrame(data=[], columns=['fold']).to_csv(N_FOLD_RESULTS_FP_REMAINING(percentage), index=False)

            results_df = pd.read_csv(N_FOLD_RESULTS_FP_REMAINING(percentage))
            row_index = fold
            results_df.loc[row_index, 'fold'] = fold

            label_data = {'id': test_index,
                          'true_labels': y_test}
            nn_model = load_model(model_file_path)
            nn_predictions = np.argmax(nn_model.predict(X_test), axis=1)
            label_data['nn_labels'] = nn_predictions
            rule_predictions = predict(rules, X_test)
            label_data['rule_%s_labels' % RULE_EXTRACTOR.mode] = rule_predictions
            pd.DataFrame(data=label_data).to_csv(LABEL_FP, index=False)

            # Evaluate rules
            print('Evaluating rules remaining from fold %d/%d...' % (fold, N_FOLDS), end='', flush=True)
            re_results = evaluate(rules, LABEL_FP)
            print('done')

            # Save rule extraction evaluation results
            row_index = fold
            results_df.loc[row_index, 're_acc'] = re_results['acc']
            results_df.loc[row_index, 're_auc'] = re_results['aucr']
            results_df.loc[row_index, 're_fid'] = re_results['fid']
            results_df.loc[row_index, 'rules_num'] = sum(re_results['n_rules_per_class'])
            avg_rule_length = np.array(re_results['av_n_terms_per_rule'])
            avg_rule_length *= np.array(re_results['n_rules_per_class'])
            avg_rule_length = sum(avg_rule_length)
            avg_rule_length /= sum(re_results['n_rules_per_class'])
            results_df.loc[row_index, 'rules_av_len'] = avg_rule_length

            if fold == N_FOLDS - 1:
                results_df.iloc[:, 1:] = results_df.round(3)
                results_df.loc[N_FOLDS, "fold"] = "average"
                results_df.iloc[N_FOLDS, 1:] = results_df.mean(numeric_only=True).round(3)


            results_df = results_df[["fold", "re_acc", "re_auc", "rules_num", "rules_av_len"]]

            results_df.to_csv(N_FOLD_RESULTS_FP_REMAINING(percentage), index=False)
def run(X, y, hyperparameters):
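    """Train five neural networks with different initialisations on one train/test split,
    extract rules from each, and keep the initialisation that yields the smallest
    (and, on ties, most accurate) ruleset as best_initialisation.h5.
    """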
    train_index, test_index = split_data.load_split_indices(
        file_path=NN_INIT_SPLIT_INDICES_FP)

    # Split data
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    # Save information about nn initialisation
    if not os.path.exists(NN_INIT_RE_RESULTS_FP):
        pd.DataFrame(data=[], columns=['run']).to_csv(NN_INIT_RE_RESULTS_FP,
                                                      index=False)

    # Path to trained neural network
    model_file_path = TEMP_DIR + 'model.h5'

    # Smallest ruleset i.e. total number of rules
    smallest_ruleset_size = float('inf')
    smallest_ruleset_acc = 0
    best_init_index = 0

    for i in range(0, 5):
        print('Testing initialisation %d' % i)

        # Build and train nn put it in temp/
        build_and_train_model(X_train,
                              y_train,
                              X_test,
                              y_test,
                              **hyperparameters,
                              model_file_path=model_file_path)

        # Extract rules
        nn_accuracy, nn_AUC, rules, re_time, re_memory = dnn_re.run(
            X, y, train_index, test_index, model_file_path)

        # Save labels to labels.csv:
        # label - True data labels
        label_data = {'id': test_index, 'true_labels': y_test}
        # label - Neural network data labels. Use NN to predict X_test
        nn_model = tf.keras.models.load_model(model_file_path)
        nn_predictions = np.argmax(nn_model.predict(X_test), axis=1)
        label_data['nn_labels'] = nn_predictions
        # label - Rule extraction labels
        rule_predictions = predict(rules, X_test)
        label_data['rule_%s_labels' % RULE_EXTRACTOR.mode] = rule_predictions
        pd.DataFrame(data=label_data).to_csv(LABEL_FP, index=False)

        # Save rule extraction time and memory usage
        results_df = pd.read_csv(NN_INIT_RE_RESULTS_FP)
        results_df.loc[i, 'run'] = i
        results_df.loc[i, 're_time (sec)'] = re_time

        re_results = evaluate(rules, LABEL_FP)
        results_df.loc[i, 'nn_acc'] = nn_accuracy
        results_df.loc[i, 're_acc'] = re_results['acc']
        results_df.loc[i, 're_fid'] = re_results['fid']
        results_df.loc[i, 'rules_num'] = sum(re_results['n_rules_per_class'])

        results_df = results_df.round(3)

        results_df = results_df[[
            "run", "nn_acc", "re_acc", "re_fid", "re_time (sec)", "rules_num"
        ]]

        results_df.to_csv(NN_INIT_RE_RESULTS_FP, index=False)

        # If this initialisation extracts a smaller ruleset - save it
        ruleset_size = sum(re_results['n_rules_per_class'])
        if (ruleset_size < smallest_ruleset_size) \
                or (ruleset_size == smallest_ruleset_size and re_results['acc'] > smallest_ruleset_acc):
            smallest_ruleset_size = ruleset_size
            smallest_ruleset_acc = re_results['acc']
            best_init_index = i

            # Save initialisation as best_initialisation.h5
            tf.keras.models.load_model(TEMP_DIR + 'initialisation.h5').save(
                BEST_NN_INIT_FP)

    print('Found neural network with the best initialisation. (%d)' %
          best_init_index)
    print(
        '=================================================================================================='
    )
def run(X,
        y,
        split_data_flag=False,
        grid_search_flag=False,
        find_best_initialisation_flag=False,
        generate_fold_data_flag=False):
    """Top-level cross-validation pipeline driver.

    Args:
        split_data_flag: Split data into train/test and stratified k folds. Only do this once!
        grid_search_flag: Grid search to find the best neural network hyperparameters.
        find_best_initialisation_flag: Find the best neural network initialisation.
        generate_fold_data_flag: Save scaled fold data and train a neural network for each fold.
    """
    print(N_FOLDS)
    # 1. Split data into train and test. Only do this once
    if split_data_flag:
        print('Splitting data. WARNING: only do this once!')
        split_data.train_test_split(X=X, y=y, test_size=0.2)
        split_data.stratified_k_fold(X=X, y=y, n_folds=N_FOLDS)

    # 2. Grid search to find the optimal neural network hyperparameters
    if grid_search_flag:
        print('Performing grid search over hyperparameters. WARNING: this is very expensive')
        scaler = MinMaxScaler()
        X_scaled = scaler.fit_transform(X)
        grid_search(X=X_scaled, y=y)

    # TODO change this to read best grid search hyperparameters from disk
    nn_hyperparameters = OrderedDict(batch_size=BATCH_SIZE,
                                     epochs=EPOCHS,
                                     layer_1=LAYER_1,
                                     layer_2=LAYER_2)

    # 3. Initialise 5 neural networks using 1 train test split
    # Pick initialisation that yields the smallest ruleset
    if find_best_initialisation_flag:
        find_best_nn_initialisation.run(X, y, nn_hyperparameters)

    # 4. Build neural network for each fold using best initialisation found above
    if generate_fold_data_flag:
        for fold in range(0, N_FOLDS):
            print('Training model %d/%d' % (fold, N_FOLDS))

            # Split data using precomputed split indices
            train_index, test_index = load_split_indices(
                N_FOLD_CV_SPLIT_INDICIES_FP, fold_index=fold)
            X_train, y_train = X[train_index], y[train_index]
            X_test, y_test = X[test_index], y[test_index]

            scaler = MinMaxScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            # X_train_GE = scaler.fit_transform(X_train[:, :1000])
            # X_test_GE = scaler.transform(X_test[:, :1000])
            # X_train_ClinP = X_train[:, 1000:1013]
            # X_test_ClinP = X_test[:, 1000:1013]
            # X_train = np.concatenate((X_train_GE, X_train_ClinP), axis=1)
            # X_test = np.concatenate((X_test_GE, X_test_ClinP), axis=1)

            X_train_path = N_FOLD_CV_SPLIT_X_train_data_FP(fold)
            y_train_path = N_FOLD_CV_SPLIT_y_train_data_FP(fold)
            X_test_path = N_FOLD_CV_SPLIT_X_test_data_FP(fold)
            y_test_path = N_FOLD_CV_SPLIT_y_test_data_FP(fold)

            # Saving scaled data
            np.save(X_train_path, X_train)
            np.save(y_train_path, y_train)
            np.save(X_test_path, X_test)
            np.save(y_test_path, y_test)

            # Model to be stored in <dataset name>\cross_validation\<n>_folds\trained_models\
            model_file_path = n_fold_model_fp(fold)
            build_and_train_model(X_train,
                                  y_train,
                                  X_test,
                                  y_test,
                                  **nn_hyperparameters,
                                  model_file_path=model_file_path,
                                  with_best_initilisation_flag=False)
    # Remove files from temp/
    clean_up()
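

# ---------------------------------------------------------------------------
# Minimal usage sketch (an assumption, not part of the original repository).
# It shows one possible driver that reads the dataset from DATA_FP, separates
# the target column named by DATASET_INFO.target_col (assumed to hold numeric
# class labels), and exercises the pipeline flags one stage at a time.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    main_df = pd.read_csv(DATA_FP)
    y = main_df[DATASET_INFO.target_col].values
    X = main_df.drop([DATASET_INFO.target_col], axis=1).values

    # 1. Split the data and train one neural network per fold (only needed once).
    run(X, y, split_data_flag=True, generate_fold_data_flag=True)

    # 2. Extract and evaluate REM-D rules for every fold.
    cross_validate_rem_d(extract_rules_flag=True, evaluate_rules_flag=True)

    # 3. Extract and evaluate REM-T (decision-tree) rules, then combine them with REM-D.
    cross_validated_rem_t(X, y, extract_evaluate_rules_flag=True, DT=True)
    cross_validate_combined_rem_d_rem_t(combine_rules_flag=True, DT=True,
                                        evaluate_rules_flag=True)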