with open( TARGET_DIRECTORY + cnn_model + '_' + mode + '_kfold_training_logs_' + class_subset + '.csv', 'a') as out_stream: out_stream.write( 'Seed,Threshold,Fold,Best Epoch,Training Accuracy,Test Accuracy,Training Accuracy - Test Accuacy,Train Loss,Test Loss,Training Loss - Test Loss,MAE,AUC,Trainable Parameters\n' ) for SEED in seeds: np.random.seed(SEED) idx = np.arange(len(X__)) np.random.shuffle(idx) # randomize index X, Y, subject_groups = X__[idx], Y__[idx], np.array(subject_groups_)[idx] subject_groups = subject_groups.tolist() folds_indices = split_kfoldcv_sbj(Y.argmax(1), subject_groups, folds, SEED) fold_count = 0 for train_index, val_index in folds_indices: # for each fold if 'best_model_seed_' + str(SEED) + '_' + str( class_subset) + '_' + cnn_model + '_' + mode + '_' + str( threshold) + '_fold_' + str( fold_count) + '.h5' in os.listdir(TARGET_DIRECTORY + 'best_model/'): print('SEED_' + str(SEED) + '_fold_' + str(fold_count) + ' done, skipping it....') fold_count += 1 continue
def generate_corr_matrix(X__, seeds, folds):
    """
    Generate and store one thresholded correlation matrix per (seed, fold).

    All seeds and folds are done sequentially in this function
    (multiprocessing takes up too much memory). Only the training split of
    each fold is used to generate the matrices.

    For every seed the dataset is shuffled after ``np.random.seed(seed)``,
    split into folds by ``split_kfoldcv_sbj``, and for each fold whose output
    file does not exist yet:
      1. the shuffled samples are flattened with ``corr_mx_flatten``,
      2. ``np.corrcoef`` is computed on the transposed training rows, i.e.
         between the columns of the flattened data,
      3. absolute values are taken and entries below
         ``SPARSE_THRESHOLD_CORR`` are set to 0,
      4. the result is written to
         ``../data/corr_matrix/<class_subset>/corr_matrix_seed<SEED>_fold_<k>.hdf5``
         as the HDF5 dataset ``'corr_matrix'``.
    Folds whose output file already exists are skipped, so an interrupted
    run can be resumed.

    Besides its arguments, the function reads the module-level names
    ``class_subset``, ``Y__``, ``subject_groups_`` and
    ``SPARSE_THRESHOLD_CORR``.

    Inputs:
    - X__: Numpy array of matrices containing the dataset (training set)
    - seeds: list of seed numbers to use
    - folds: number of folds (int)

    Returns nothing; the output is the set of files written to disk.
    """
    target_directory = '../data/corr_matrix/' + class_subset + '/'
    mkdir(target_directory)

    for SEED in seeds:
        np.random.seed(SEED)
        idx = np.arange(len(X__))
        np.random.shuffle(idx)  # randomize index
        X, Y = X__[idx], Y__[idx]
        subject_groups = np.array(subject_groups_)[idx].tolist()
        folds_indices = split_kfoldcv_sbj(Y.argmax(1), subject_groups, folds,
                                          SEED)

        # The flattened data is identical for every fold of a seed, so it is
        # computed at most once per seed, and only if some fold is missing.
        # NOTE(review): assumes corr_mx_flatten is a pure function of X.
        X_flat = None

        # Only the training indices are needed; the validation indices of
        # each fold are deliberately ignored.
        for fold_count, (train_index, _val_index) in enumerate(folds_indices):
            matrix_name = ("corr_matrix_seed" + str(SEED) + "_fold_"
                           + str(fold_count))
            matrix_path = target_directory + matrix_name + ".hdf5"

            if os.path.exists(matrix_path):
                print(matrix_name
                      + " has already been generated, skipping it...")
                continue

            print(matrix_name + ".hdf5 not found!")
            if X_flat is None:
                X_flat = corr_mx_flatten(X)

            # Transpose so that np.corrcoef treats each column of the
            # flattened training data as one variable.
            corr_matrix = np.corrcoef(X_flat[train_index].T)
            print('Correlation matrix generated for seed ' + str(SEED)
                  + ' fold ' + str(fold_count))

            # Keep only the magnitude and zero out weak correlations.
            corr_matrix = np.absolute(corr_matrix)
            corr_matrix[corr_matrix < SPARSE_THRESHOLD_CORR] = 0
            print("Number of non-zero elements in corr_matrix: "
                  + str(np.count_nonzero(corr_matrix)))

            # The context manager closes the file even if the write fails.
            with h5py.File(matrix_path, "w") as g:
                g.create_dataset('corr_matrix', data=corr_matrix)
            print("Wrote corr_matrix " + matrix_name + " to "
                  + target_directory)