def generate_corr_matrix(X__, seeds, folds):
    """
    Generate and save one thresholded correlation matrix per (seed, fold).

    All seeds and folds are done in this function (multiprocessing takes up
    too much memory). Only the training split of each fold is used to
    generate the matrices.

    For every seed the dataset is shuffled with that seed and split into
    folds by ``split_kfoldcv_sbj``. For each fold whose output file does not
    exist yet, the absolute correlation matrix of the flattened training rows
    is computed (``np.corrcoef`` on the transposed training data), entries
    below ``SPARSE_THRESHOLD_CORR`` are set to 0, and the result is written to
    ``../data/corr_matrix/<class_subset>/corr_matrix_seed<SEED>_fold_<k>.hdf5``
    as the dataset ``corr_matrix``. Folds whose file already exists are
    skipped, so an interrupted run can be resumed.

    Besides its arguments, this function reads the module-level names
    ``class_subset``, ``Y__``, ``subject_groups_`` and
    ``SPARSE_THRESHOLD_CORR``.

    Inputs:
    - X__: Numpy array of matrices containing the dataset (training set)
    - seeds: list of seed numbers to use
    - folds: number of folds (int)

    Returns:
    - None. The results are the files written to disk.
    """
    target_dir = '../data/corr_matrix/' + class_subset + '/'
    mkdir(target_dir)

    for seed in seeds:
        # Seed the global NumPy RNG so the shuffle is reproducible per seed.
        np.random.seed(seed)
        idx = np.arange(len(X__))
        np.random.shuffle(idx)  # randomize index

        X = X__[idx]
        Y = Y__[idx]
        subject_groups = np.array(subject_groups_)[idx].tolist()

        # Y.argmax(1) turns each row of Y into a single class index
        # (presumably Y is one-hot encoded -- confirm against the caller).
        folds_indices = split_kfoldcv_sbj(Y.argmax(1), subject_groups,
                                          folds, seed)

        # The flattened data does not depend on the fold, so build it at most
        # once per seed, and only if some fold actually needs generating.
        # NOTE(review): assumes corr_mx_flatten is a pure function of X.
        X_flat = None

        for fold_count, (train_index, _val_index) in enumerate(folds_indices):
            tag = "corr_matrix_seed" + str(seed) + "_fold_" + str(fold_count)
            out_path = target_dir + tag + ".hdf5"

            if os.path.exists(out_path):
                print(tag + " has already been generated, skipping it...")
                continue

            print(tag + ".hdf5 not found!")

            if X_flat is None:
                X_flat = corr_mx_flatten(X)

            # Only the training rows are needed; the validation split is
            # deliberately not materialised to keep memory usage down.
            X_train = X_flat[train_index]

            # Transpose so that variables are the columns of X_train.
            corr_matrix = np.corrcoef(X_train.T)
            print('Correlation matrix generated for seed ' + str(seed) +
                  ' fold ' + str(fold_count))

            # Keep only the magnitude and sparsify weak correlations.
            corr_matrix = np.absolute(corr_matrix)
            corr_matrix[corr_matrix < SPARSE_THRESHOLD_CORR] = 0
            print("Number of non-zero elements in corr_matrix: " +
                  str(np.count_nonzero(corr_matrix)))

            # The context manager closes the file even if the write fails.
            with h5py.File(out_path, "w") as g:
                g.create_dataset('corr_matrix', data=corr_matrix)

            print("Wrote corr_matrix " + tag + " to " + target_dir)
print('SEED_' + str(SEED) + '_fold_' + str(fold_count) + ' done, skipping it....') fold_count += 1 continue print('************** Start of Fold:', fold_count, '**********************') thresholds = np.array([1.0, float(threshold)]) mask = np.ones(34716) for ix, i in enumerate(thresholds): if ix == 0: model_initial_file = TARGET_DIRECTORY + 'initial_model/' + 'initial_' + cnn_model + '_' + str( SEED) + '_' + str(class_subset) + '_' + str(i) + '.h5' X_ = corr_mx_flatten(X) if cnn_model == 'funcNetFFN_2L': _ = funcNetFFN_2L(X_.shape[1], hidden_layer_size, dropout, batch_size, model_initial_file) elif cnn_model == 'funcNetFFN_3L': _ = funcNetFFN_3L(X_.shape[1], hidden_layer_size, dropout, batch_size, model_initial_file) print('Shape of input data', X_.shape) else: threshold_1 = thresholds[ix - 1] threshold_2 = thresholds[ix] prev_threshold = thresholds[ix - 1] prev_best_model = TARGET_DIRECTORY + 'best_model/' + 'best_model_seed_' + str( SEED) + '_' + str(
folds_indices = split_kfoldcv_sbj(Y.argmax(1), subject_groups, folds, SEED) fold_count = 0 for train_index, val_index in folds_indices: # for each fold if os.path.exists(TARGET_DIRECTORY + "eigv_seed" + str(SEED) + "_fold_" + str(fold_count) + ".csv"): print("eigv_seed" + str(SEED) + "_fold_" + str(fold_count) + " has already been generated, skipping it...") else: print("eigv_seed" + str(SEED) + "_fold_" + str(fold_count) + ".csv not found!") X_ = corr_mx_flatten(X) X_train, Y_train = X_[train_index], Y[train_index] X_val, Y_val = X_[val_index], Y[val_index] list_of_multiproc_input.append((X_train, fold_count, SEED)) fold_count += 1 with Pool(processes=MULTIPROC_BATCH_SIZE) as pool: result = pool.starmap_async(generate_eigenvalues_elbow_and_clusters, list_of_multiproc_input) result = result.get() for eigv, elbow, clusters, seed_number, fold_number in result: