Example #1
import numpy as np

# utils.chunks and get_all_subjects are project-level helpers defined elsewhere in the repo.
def get_cv_fold(fold, dataset="HCP"):
    '''
    A train/validate/test split is needed for best-model selection and for training the combined net.
    :return: (train_subjects, validate_subjects, test_subjects)
    '''

    #For CV
    if fold == 0:
        train, validate, test = [0, 1, 2], [3], [4]
        # train, validate, test = [0, 1, 2, 3, 4], [3], [4]
    elif fold == 1:
        train, validate, test = [1, 2, 3], [4], [0]
    elif fold == 2:
        train, validate, test = [2, 3, 4], [0], [1]
    elif fold == 3:
        train, validate, test = [3, 4, 0], [1], [2]
    elif fold == 4:
        train, validate, test = [4, 0, 1], [2], [3]
    else:
        raise ValueError("fold must be in [0, 4]")

    subjects = get_all_subjects(dataset)

    if dataset.startswith("HCP"):
        # subjects = list(utils.chunks(subjects[:100], 10))   # 10 folds
        subjects = list(utils.chunks(subjects, 21))   # 5 folds of 21 subjects each
        # 5-fold CV is fine: the score is only ~1 percentage point worse than with 10 folds
        # (80 vs 60 training subjects), and 10-fold CV is impractical here.
    elif dataset.startswith("Schizo"):
        # 410 subjects
        subjects = list(utils.chunks(subjects, 82))  # 5 folds of 82 subjects each
    else:
        raise ValueError("Invalid dataset name")

    subjects = np.array(subjects)
    return list(subjects[train].flatten()), list(subjects[validate].flatten()), list(subjects[test].flatten())
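utils.chunks is only called, never shown, in these examples. The sketch below is a hypothetical implementation matching how it is used (split a list into consecutive fixed-size chunks), together with a small illustration of how the fold indices above select subjects; the dummy subject IDs are made up.

import numpy as np

# Hypothetical sketch of utils.chunks: split a sequence into consecutive chunks of
# length n (the last chunk may be shorter).
def chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

# Illustration with dummy IDs: 105 subjects -> 5 chunks of 21 subjects, indexed 0-4,
# which the fold logic above rotates into train/validate/test.
dummy_subjects = ["subj_%03d" % i for i in range(105)]
folds = np.array(list(chunks(dummy_subjects, 21)))   # shape (5, 21)
train, validate, test = [1, 2, 3], [4], [0]          # fold == 1
train_subjects = list(folds[train].flatten())        # 63 training subjects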
Example #2
import numpy as np

def get_cv_fold(fold, dataset="HCP"):
    if dataset == "HCP_all":
        subjects = get_all_subjects(dataset)
        cut_point = int(len(subjects) * 0.9)
        return subjects[:cut_point], subjects[cut_point:], ["599671", "599469"]
    elif dataset == "biobank_20k":
        subjects = get_all_subjects(dataset)
        cut_point = int(len(subjects) * 0.9)
        return subjects[:cut_point], subjects[cut_point:], ["1000013", "1000013"]
    else:
        if fold == 0:
            train, validate, test = [0, 1, 2], [3], [4]
        elif fold == 1:
            train, validate, test = [1, 2, 3], [4], [0]
        elif fold == 2:
            train, validate, test = [2, 3, 4], [0], [1]
        elif fold == 3:
            train, validate, test = [3, 4, 0], [1], [2]
        elif fold == 4:
            train, validate, test = [4, 0, 1], [2], [3]
        else:
            raise ValueError("fold must be in [0, 4]")

        subjects = get_all_subjects(dataset)

        if dataset.startswith("HCP"):
            subjects = list(utils.chunks(subjects, 21))   # 5 folds of 21 subjects each
            # 5-fold CV is fine: the score is only ~1 percentage point worse than with 10 folds
            # (80 vs 60 training subjects), and 10-fold CV is impractical here.
        elif dataset.startswith("Schizo"):
            # ~410 subjects
            subjects = list(utils.chunks(subjects, 82))  # 5 folds of 82 subjects each
        else:
            raise ValueError("Invalid dataset name")

        subjects = np.array(subjects)
        return list(subjects[train].flatten()), list(subjects[validate].flatten()), list(subjects[test].flatten())
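The difference from Example #1 is the extra branch for the large datasets ("HCP_all" and "biobank_20k"): instead of rotating folds, the subject list is cut 90/10 into train and validation, with a small hard-coded test set. A minimal sketch of that arithmetic with made-up subject IDs:

# Dummy illustration of the 90/10 split path above (subject IDs are placeholders;
# the real function returns hard-coded test IDs).
dummy_subjects = ["subj_%05d" % i for i in range(1000)]
cut_point = int(len(dummy_subjects) * 0.9)      # 900
train_subjects = dummy_subjects[:cut_point]     # first 90% for training
validate_subjects = dummy_subjects[cut_point:]  # remaining 10% for validation
test_subjects = ["subj_00000", "subj_00001"]    # fixed, hand-picked test subjects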
Example #3
import multiprocessing

import psutil

# utils.chunks, utils.flatten and compress_fibers_worker_shared_mem are project-level
# helpers defined elsewhere in the repo.
def compress_streamlines(streamlines, error_threshold=0.1, nr_cpus=-1):
    if nr_cpus == -1:
        nr_processes = psutil.cpu_count()
    else:
        nr_processes = nr_cpus
    number_streamlines = len(streamlines)

    if nr_processes >= number_streamlines:
        nr_processes = number_streamlines - 1
        if nr_processes < 1:
            nr_processes = 1

    chunk_size = int(number_streamlines / nr_processes)

    if chunk_size < 1:
        # logging.warning("\nReturning early because chunk_size=0")
        return streamlines
    fiber_batches = list(utils.chunks(streamlines, chunk_size))

    global _COMPRESSION_ERROR_THRESHOLD
    global _FIBER_BATCHES
    _COMPRESSION_ERROR_THRESHOLD = error_threshold
    _FIBER_BATCHES = fiber_batches

    # logging.debug("Main program using: {} GB".format(round(utils.mem_usage(print_usage=False), 3)))
    pool = multiprocessing.Pool(processes=nr_processes)

    # Do not pass the data to the workers directly (that would double the memory needed);
    # pass only an index into the module-level (shared) fiber batches instead.
    # This keeps memory close to the single-threaded version (only the main process holds
    # the data, the workers add almost nothing) and is also faster (roughly 20-30%).
    # Without it, memory becomes a problem when processing the raw tracking output
    # (>10 GB on disk, >20 GB in memory).
    result = pool.map(compress_fibers_worker_shared_mem,
                      range(0, len(fiber_batches)))

    pool.close()
    pool.join()

    streamlines_c = utils.flatten(result)
    return streamlines_c
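The worker passed to pool.map is defined elsewhere in the project; the sketch below is a hypothetical version that only illustrates the pattern the comments describe: each child process receives a batch index and reads the actual data from the module-level globals. Note that this relies on the processes being started with the "fork" start method (the default on Linux); with "spawn" the globals would not be inherited.

# Hypothetical worker matching the pool.map call above: look up the batch by index in
# the module-level globals instead of receiving the data through pickling.
def compress_fibers_worker_shared_mem(idx):
    batch = _FIBER_BATCHES[idx]                # inherited from the parent process via fork
    threshold = _COMPRESSION_ERROR_THRESHOLD
    compressed = []
    for streamline in batch:
        # Placeholder for the actual per-streamline compression (e.g. dipy offers a
        # compress_streamlines function); here the streamline is passed through unchanged.
        compressed.append(streamline)
    return compressed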