def generate_corr_matrix(X__, seeds, folds):
    """
    Generate one thresholded correlation matrix per (seed, fold) pair.

    All seeds and folds are done sequentially in this function
    (multiprocessing takes up too much memory).
    Only the training split of each fold is used to generate the matrices.

    For every seed the samples are shuffled and split into folds with
    split_kfoldcv_sbj. For each fold whose output file does not exist yet,
    the absolute correlation matrix of the flattened training features is
    computed, entries below SPARSE_THRESHOLD_CORR are set to 0 and the result
    is written to
    '../data/corr_matrix/<class_subset>/corr_matrix_seed<SEED>_fold_<k>.hdf5'
    as the dataset 'corr_matrix'. Folds whose file already exists are skipped.

    Besides its arguments, the function reads the module-level names Y__,
    subject_groups_, class_subset and SPARSE_THRESHOLD_CORR.

    Inputs:
    - X__: Numpy array of matrices containing the dataset (training set)
    - seeds: list of seed numbers to use
    - folds: number of folds (int)

    Returns:
    - None (the matrices are written to disk)
    """

    TARGET_DIRECTORY = '../data/corr_matrix/' + class_subset + '/'
    mkdir(TARGET_DIRECTORY)

    for SEED in seeds:

        # Seed NumPy's global RNG so the shuffle is reproducible per seed.
        np.random.seed(SEED)
        idx = np.arange(len(X__))
        np.random.shuffle(idx)  # randomize index

        X, Y = X__[idx], Y__[idx]
        subject_groups = np.array(subject_groups_)[idx].tolist()

        folds_indices = split_kfoldcv_sbj(Y.argmax(1), subject_groups, folds,
                                          SEED)

        # X does not change between folds, so flatten it at most once per
        # seed, and only if at least one fold actually has to be generated.
        X_flat = None

        for fold_count, (train_index, _val_index) in enumerate(folds_indices):

            matrix_name = ("corr_matrix_seed" + str(SEED) + "_fold_" +
                           str(fold_count))
            matrix_path = TARGET_DIRECTORY + matrix_name + ".hdf5"

            if os.path.exists(matrix_path):
                print(matrix_name +
                      " has already been generated, skipping it...")
                continue

            print(matrix_name + ".hdf5 not found!")

            if X_flat is None:
                X_flat = corr_mx_flatten(X)

            # Rows of the transposed training data are treated as variables
            # by np.corrcoef, so this correlates the columns of X_flat.
            corr_matrix = np.corrcoef(X_flat[train_index].T)
            print('Correlation matrix generated for seed ' + str(SEED) +
                  ' fold ' + str(fold_count))

            # Take the absolute value in place to avoid allocating a second
            # full-size matrix, then zero out the weak correlations.
            np.absolute(corr_matrix, out=corr_matrix)
            corr_matrix[corr_matrix < SPARSE_THRESHOLD_CORR] = 0
            print("Number of non-zero elements in corr_matrix: " +
                  str(np.count_nonzero(corr_matrix)))

            # The context manager closes the file even if the write fails.
            with h5py.File(matrix_path, "w") as g:
                g.create_dataset('corr_matrix', data=corr_matrix)
            print("Wrote corr_matrix " + matrix_name + " to " +
                  TARGET_DIRECTORY)
# Example no. 2
# 0
            print('SEED_' + str(SEED) + '_fold_' + str(fold_count) +
                  ' done, skipping it....')
            fold_count += 1
            continue

        print('************** Start of Fold:', fold_count,
              '**********************')
        thresholds = np.array([1.0, float(threshold)])
        mask = np.ones(34716)

        for ix, i in enumerate(thresholds):

            if ix == 0:
                model_initial_file = TARGET_DIRECTORY + 'initial_model/' + 'initial_' + cnn_model + '_' + str(
                    SEED) + '_' + str(class_subset) + '_' + str(i) + '.h5'
                X_ = corr_mx_flatten(X)
                if cnn_model == 'funcNetFFN_2L':
                    _ = funcNetFFN_2L(X_.shape[1], hidden_layer_size, dropout,
                                      batch_size, model_initial_file)
                elif cnn_model == 'funcNetFFN_3L':
                    _ = funcNetFFN_3L(X_.shape[1], hidden_layer_size, dropout,
                                      batch_size, model_initial_file)

                print('Shape of input data', X_.shape)

            else:
                threshold_1 = thresholds[ix - 1]
                threshold_2 = thresholds[ix]
                prev_threshold = thresholds[ix - 1]
                prev_best_model = TARGET_DIRECTORY + 'best_model/' + 'best_model_seed_' + str(
                    SEED) + '_' + str(
    # Split the samples into folds. The first argument passed to the splitter
    # is the argmax along axis 1 of Y (presumably Y is one-hot encoded, so
    # this is the class index per sample -- TODO confirm).
    folds_indices = split_kfoldcv_sbj(Y.argmax(1), subject_groups, folds, SEED)

    # Running fold number, starting at 0; it is part of the output file name
    # and of the tuple queued for the worker.
    fold_count = 0

    for train_index, val_index in folds_indices:  # for each fold

        # A fold is considered done when its eigenvalue CSV already exists in
        # TARGET_DIRECTORY; in that case nothing is queued for it.
        if os.path.exists(TARGET_DIRECTORY + "eigv_seed" + str(SEED) +
                          "_fold_" + str(fold_count) + ".csv"):
            print("eigv_seed" + str(SEED) + "_fold_" + str(fold_count) +
                  " has already been generated, skipping it...")

        else:
            print("eigv_seed" + str(SEED) + "_fold_" + str(fold_count) +
                  ".csv not found!")

            # NOTE(review): X is not modified in this loop, so this call is
            # repeated with the same argument for every missing fold.
            X_ = corr_mx_flatten(X)

            # Only X_train is used below; Y_train, X_val and Y_val are
            # computed but not referenced again in the visible code.
            X_train, Y_train = X_[train_index], Y[train_index]
            X_val, Y_val = X_[val_index], Y[val_index]

            # Queue one (training data, fold number, seed) tuple per missing
            # fold for later processing.
            list_of_multiproc_input.append((X_train, fold_count, SEED))

        # Advance the counter for skipped and queued folds alike.
        fold_count += 1

# Run generate_eigenvalues_elbow_and_clusters once per queued tuple on a pool
# of MULTIPROC_BATCH_SIZE worker processes. Each tuple is unpacked into
# positional arguments, and .get() blocks until every call has returned.
with Pool(processes=MULTIPROC_BATCH_SIZE) as pool:
    result = pool.starmap_async(
        generate_eigenvalues_elbow_and_clusters,
        list_of_multiproc_input,
    ).get()

for eigv, elbow, clusters, seed_number, fold_number in result: