import multiprocessing

import numpy as np

# Note: utils (providing chunks() and flatten()) and get_all_subjects() are assumed to be
# imported at module level from the surrounding package.


def get_cv_fold(fold, dataset="HCP"):
    '''
    We need a train-validate-test split because of best-model selection and because of
    training of the combined net.

    :return:
    '''
    # For CV
    if fold == 0:
        train, validate, test = [0, 1, 2], [3], [4]
        # train, validate, test = [0, 1, 2, 3, 4], [3], [4]
    elif fold == 1:
        train, validate, test = [1, 2, 3], [4], [0]
    elif fold == 2:
        train, validate, test = [2, 3, 4], [0], [1]
    elif fold == 3:
        train, validate, test = [3, 4, 0], [1], [2]
    elif fold == 4:
        train, validate, test = [4, 0, 1], [2], [3]

    subjects = get_all_subjects(dataset)

    if dataset.startswith("HCP"):
        # subjects = list(utils.chunks(subjects[:100], 10))  # 10 folds
        subjects = list(utils.chunks(subjects, 21))  # 5 folds of 21 subjects each
        # => 5-fold CV is ok: the score is only 1 %-point worse than with 10 folds (80 vs 60 train subjects);
        #    10-fold CV is impractical.
    elif dataset.startswith("Schizo"):
        # 410 subjects
        subjects = list(utils.chunks(subjects, 82))  # 5 folds of 82 subjects each
    else:
        raise ValueError("Invalid dataset name")

    subjects = np.array(subjects)
    return list(subjects[train].flatten()), list(subjects[validate].flatten()), list(subjects[test].flatten())
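# Illustrative sketch (an assumption, not the package's own implementation) of the chunking
# behaviour relied on above: utils.chunks() is taken to split the subject list into consecutive,
# fixed-size pieces, e.g. 105 HCP subjects -> 5 folds of 21 subjects each.
def _chunks_sketch(lst, chunk_size):
    for i in range(0, len(lst), chunk_size):
        yield lst[i:i + chunk_size]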
def get_cv_fold(fold, dataset="HCP"):
    if dataset == "HCP_all":
        subjects = get_all_subjects(dataset)
        cut_point = int(len(subjects) * 0.9)
        return subjects[:cut_point], subjects[cut_point:], ["599671", "599469"]
    elif dataset == "biobank_20k":
        subjects = get_all_subjects(dataset)
        cut_point = int(len(subjects) * 0.9)
        return subjects[:cut_point], subjects[cut_point:], ["1000013", "1000013"]
    else:
        if fold == 0:
            train, validate, test = [0, 1, 2], [3], [4]
        elif fold == 1:
            train, validate, test = [1, 2, 3], [4], [0]
        elif fold == 2:
            train, validate, test = [2, 3, 4], [0], [1]
        elif fold == 3:
            train, validate, test = [3, 4, 0], [1], [2]
        elif fold == 4:
            train, validate, test = [4, 0, 1], [2], [3]

        subjects = get_all_subjects(dataset)

        if dataset.startswith("HCP"):
            subjects = list(utils.chunks(subjects, 21))  # 5 folds of 21 subjects each
            # 5-fold CV is ok: the score is only 1 %-point worse than with 10 folds (80 vs 60 train subjects);
            # 10-fold CV is impractical.
        elif dataset.startswith("Schizo"):
            # ~410 subjects
            subjects = list(utils.chunks(subjects, 82))  # 5 folds of 82 subjects each
        else:
            raise ValueError("Invalid dataset name")

        subjects = np.array(subjects)
        return list(subjects[train].flatten()), list(subjects[validate].flatten()), list(subjects[test].flatten())
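# Usage sketch (hypothetical helper; assumes get_all_subjects() can resolve the requested
# dataset in this environment). Shows how the five CV folds rotate through the subject chunks.
def _print_cv_fold_sizes(dataset="HCP"):
    for fold in range(5):
        train, validate, test = get_cv_fold(fold, dataset=dataset)
        print("fold {}: {} train / {} validate / {} test subjects".format(
            fold, len(train), len(validate), len(test)))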
def compress_streamlines(streamlines, error_threshold=0.1, nr_cpus=-1):
    import psutil

    if nr_cpus == -1:
        nr_processes = psutil.cpu_count()
    else:
        nr_processes = nr_cpus

    number_streamlines = len(streamlines)
    if nr_processes >= number_streamlines:
        nr_processes = number_streamlines - 1
    if nr_processes < 1:
        nr_processes = 1

    chunk_size = int(number_streamlines / nr_processes)
    if chunk_size < 1:
        # logging.warning("\nReturning early because chunk_size=0")
        return streamlines
    fiber_batches = list(utils.chunks(streamlines, chunk_size))

    # The worker reads the batches and the threshold via these module-level globals
    # instead of receiving them as arguments.
    global _COMPRESSION_ERROR_THRESHOLD
    global _FIBER_BATCHES
    _COMPRESSION_ERROR_THRESHOLD = error_threshold
    _FIBER_BATCHES = fiber_batches

    # logging.debug("Main program using: {} GB".format(round(utils.mem_usage(print_usage=False), 3)))

    pool = multiprocessing.Pool(processes=nr_processes)

    # Do not pass the data to the workers (that would double the amount of memory needed); pass only the index
    # of each batch in the shared module-level state. This needs only as much memory as the single-threaded
    # version (only the main process holds the data, the workers almost none) and is also faster (around 20-30%).
    # Needed because otherwise there are memory problems when processing the raw tracking output
    # (>10 GB on disk and >20 GB in memory).
    result = pool.map(compress_fibers_worker_shared_mem, range(0, len(fiber_batches)))

    pool.close()
    pool.join()

    streamlines_c = utils.flatten(result)
    return streamlines_c
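# Usage sketch (hypothetical helper; assumes streamlines is a list of Nx3 coordinate arrays,
# e.g. the raw tracking output loaded elsewhere, and that the worker
# compress_fibers_worker_shared_mem is defined in this module). Compresses with the default
# error threshold of 0.1 using all available CPUs.
def _compress_tracking_output_sketch(streamlines):
    return compress_streamlines(streamlines, error_threshold=0.1, nr_cpus=-1)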