def convert_to_np(dataset_dir, be_nice, create_new_dataset_dict, create_metadata,
                  split_npz_to_slices, create_dataset_splits, debug):
    # Paths
    output_dir = os.path.join(dataset_dir, 'preprocessed')
    output_dir_pred = os.path.join(dataset_dir, 'predictionset_preprocessed')

    # Validate inputs and ensure the output dirs are empty (prevent mixing with old files)
    rmtree(output_dir, ignore_errors=True)
    rmtree(output_dir_pred, ignore_errors=True)
    os.makedirs(output_dir)
    os.makedirs(output_dir_pred)

    # Only use free resources on the machine (don't block the machine)
    if be_nice:
        os.nice(18)

    # Load dataset information (location of training data, labels, test data, ...)
    if create_new_dataset_dict:
        dataset_dict = create_dataset_dict(dataset_dir)
        print("Dataset dict created, continuing with np-conversion")
    else:
        with open(os.path.join(dataset_dir, "dataset.pkl"), 'rb') as f:
            dataset_dict = pickle.load(f)

    # Prepare data for parallel processing
    params = []
    filename_pattern = r"^(.+\/)*(.+)\.(.+)\.(.+)$"
    filename_prog = re.compile(filename_pattern)
    for i in range(len(dataset_dict['training'])):
        # Select image and corresponding label
        img = dataset_dict['training'][i]['image'].replace('./', '')
        lab = dataset_dict['training'][i]['label'].replace('./', '')
        _, image_path, image_filename, _, _, _ = filename_prog.split(img)
        output_file_path = os.path.join(output_dir, image_filename)
        params += [(img, lab, dataset_dir, output_file_path, debug)]

    for i in range(len(dataset_dict.get('test', []))):
        pred = dataset_dict['test'][i].replace('./', '')
        _, image_path, image_filename, _, _, _ = filename_prog.split(pred)
        output_file_path = os.path.join(output_dir_pred, image_filename)
        params += [(pred, None, dataset_dir, output_file_path, debug)]

    # Convert files in parallel
    # convert_to_np_worker(params[0][0], params[0][1], params[0][2], params[0][3], params[0][4])
    parallel_progbar(convert_to_np_worker, params, starmap=True)

    # Execute next steps
    if create_metadata:
        create_meta(dataset_dir, debug=debug)
    if split_npz_to_slices:
        split_npz_into_slices(dataset_dir, debug=debug)
    if create_dataset_splits:
        create_splits(dataset_dir)
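def _example_preprocessing_run(dataset_dir='./my_dataset'):
    """Illustrative sketch (not part of the original pipeline): run the full
    preprocessing chain on a hypothetical dataset directory. The path and the
    flag values are assumptions; the keyword names mirror convert_to_np above."""
    convert_to_np(
        dataset_dir=dataset_dir,
        be_nice=True,                  # lower process priority via os.nice(18)
        create_new_dataset_dict=True,  # build dataset.pkl instead of loading it
        create_metadata=True,          # extract image shapes into image_meta.pkl
        split_npz_to_slices=True,      # write per-slice .npz files
        create_dataset_splits=True,    # write splits.pkl
        debug=False,
    )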
def create_meta_dirs(image_dir, img_file_suffix, be_nice, debug):
    # Only use free resources on the machine (don't block the machine)
    if be_nice:
        os.nice(18)

    print("Extracting meta-data from images in {}".format(image_dir))

    # Create a list of images to process
    image_files = dict()
    files = []
    for f in os.listdir(image_dir):
        if f.endswith(img_file_suffix):
            files += [(os.path.join(image_dir, f), debug)]

    # Extract the image shapes in parallel
    image_shapes = parallel_progbar(extract_shape_from_image, files, starmap=True)

    # Store the results in the format:
    #   image_files[file] = shape
    for img in image_shapes:
        npz_file, npz_file_shape = img
        image_files[npz_file] = npz_file_shape

    dataset_size = len(image_files)
    print("Found {} images for meta collection".format(dataset_size))

    with open(os.path.join(image_dir, 'image_meta.pkl'), 'wb') as f:
        pickle.dump(image_files, f)
def split_npz_into_slices_dirs(be_nice, image_dir, output_dir, debug):
    # Validate inputs and ensure the output dir is empty (prevent mixing with old files)
    rmtree(output_dir, ignore_errors=True)
    os.makedirs(output_dir)

    print("Splitting npz files in {}".format(image_dir))

    # Copy the meta-data file from the dataset to the new output dir
    copyfile(os.path.join(image_dir, "image_meta.pkl"),
             os.path.join(output_dir, "image_meta.pkl"))

    # Only use free resources on the machine (don't block the machine)
    if be_nice:
        os.nice(18)

    # Create a list of images to process
    images = []
    for image in os.listdir(image_dir):
        if image.endswith(".npz"):
            images += [(image_dir, image, output_dir, debug)]

    # Split the image slices in parallel
    empty_slices = parallel_progbar(split_npz_worker, images, starmap=True)

    # Record which slices contain only empty labels, keyed by volume name
    empty_slices_dict = {}
    for volume_name, slice_ids in empty_slices:
        empty_slices_dict[volume_name] = slice_ids

    with open(os.path.join(image_dir, "empty_labels.pkl"), 'wb') as empty_slices_file:
        pickle.dump(empty_slices_dict, empty_slices_file)
def combine_splits(dataset_dir, be_nice=True, debug=False):
    # Paths
    source_dir = os.path.join(dataset_dir, 'predicted_segmentations_slices')
    output_dir = os.path.join(dataset_dir, 'predicted_segmentations')

    # Only use free resources on the machine (don't block the machine)
    if be_nice:
        os.nice(18)

    # Validate inputs and ensure the output dir is empty (prevent mixing with old files)
    rmtree(output_dir, ignore_errors=True)
    os.makedirs(output_dir)

    # Get all 3D volumes
    volumes = {}
    volume_metadata = {}
    filename_pattern = r"^(.*)(_slice_)([0-9]*)"
    filename_prog = re.compile(filename_pattern)
    for v in os.listdir(source_dir):
        if v.endswith("nii.gz"):
            _, volume_name, _, slice_id, _ = filename_prog.split(v)
            if volume_name in volumes:
                volumes[volume_name] += [(v, slice_id)]
            else:
                # Copy the meta-data file for each volume to the new output dir
                meta_file = os.path.join(source_dir, "{}.pkl".format(volume_name))
                copy(meta_file, output_dir)
                with open(meta_file, 'rb') as f:
                    meta_data = pickle.load(f)
                # Create new volume entry
                volumes[volume_name] = [(v, slice_id)]
                volume_metadata[volume_name] = meta_data

    # Combine the slices of each volume in parallel
    params = []
    for volume_name in volumes.keys():
        params += [(volume_name, volumes[volume_name], volume_metadata[volume_name],
                    source_dir, output_dir, debug)]
    parallel_progbar(combine_splits_worker, params, starmap=True)
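# Illustrative note (the example filename is an assumption, not taken from the
# original code): the slice-name regex above splits a prediction file such as
# "case_0001_slice_042.nii.gz" into
#   ['', 'case_0001', '_slice_', '042', '.nii.gz'],
# i.e. volume_name = 'case_0001' and slice_id = '042'.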
def test_parallel_progbar_overhead(self):
    def mapper(i):
        return i**2

    n = list(range(100000))

    toc = tic()
    list_comp = [i**2 for i in n]
    time_lc = toc('List comprehension')

    par = parallel_progbar(mapper, n)
    time_par = toc('Parallel progbar')

    print("{}s / {}s = {}x slowdown".format(time_par, time_lc, time_par / time_lc))
    self.assertSequenceEqual(par, list_comp)
def create_splits(dataset_dir, prediction_volume_numbers=None, image_dir=None,
                  img_file_suffix=".npz", num_splitsets=5):
    print("Welcome to the dataset split tool. It will split your data into "
          "training, validation and test sets.")

    # Dataset with images to create the splits for
    if not image_dir:
        image_dir = os.path.join(dataset_dir, 'preprocessed')

    # Keep the following volume numbers for prediction
    # (only used if no separate, unlabeled image dir is available)
    if prediction_volume_numbers is None:
        prediction_volume_numbers = [1]

    # Extract all available image files
    image_files = []
    for file in os.listdir(image_dir):
        if file.endswith(img_file_suffix):
            image_files += [file]
    dataset_size = len(image_files)
    dataset_size_remaining = dataset_size

    # Get prediction slices:
    # check whether a separate, unlabeled test set is available
    unlabeled_testset_dir = os.path.join(dataset_dir, "predictionset_preprocessed")
    prediction_set = []
    if os.path.exists(unlabeled_testset_dir):
        print("Separate unlabeled image_dir found.")
        for file in os.listdir(unlabeled_testset_dir):
            if file.endswith(img_file_suffix):
                prediction_set += [file]
        prediction_set = np.asarray(prediction_set)
    else:
        prediction_set = get_volume_slices(
            image_files, volume_numbers_list=prediction_volume_numbers)
        # Remove the prediction volumes from the labeled dataset
        for image in prediction_set:
            image_files.remove(image)
        dataset_size_remaining -= len(prediction_set)

    trainset_size = int(dataset_size_remaining * 0.7)
    valset_size = int((dataset_size_remaining - trainset_size) / 2)
    testset_size = int(dataset_size_remaining - trainset_size - valset_size)
    # Manual dataset size definition:
    # trainset_size = 50
    # valset_size = 10
    # testset_size = 10

    print("Found {} labeled slices for learning".format(dataset_size))
    print("Added {} image-volumes to training set".format(trainset_size))
    print("Added {} image-volumes to validation set".format(valset_size))
    print("Added {} image-volumes to test set".format(testset_size))
    print("Added {} image-volumes to prediction set".format(len(prediction_set)))
    print()
    print("Creating dataset splits ...")

    splits = [(trainset_size, valset_size, testset_size, image_files.copy())] * num_splitsets

    # Create the splits in parallel
    splits_dict = parallel_progbar(create_splits_worker, splits, starmap=True)
    for i in range(len(splits_dict)):
        splits_dict[i]['prediction'] = prediction_set

    with open(os.path.join(dataset_dir, 'splits.pkl'), 'wb') as f:
        pickle.dump(splits_dict, f)
    print("Splitfile created")
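def _load_splits(dataset_dir):
    """Illustrative helper (not part of the original code): read back the split
    definitions written by create_splits. Only the 'prediction' key is
    guaranteed by the code above; the remaining keys are produced by
    create_splits_worker."""
    with open(os.path.join(dataset_dir, 'splits.pkl'), 'rb') as f:
        return pickle.load(f)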
def to_eep(self,
           eep_params=None,
           eep_functions=None,
           metric_function=None,
           progress=True,
           nprocs=None,
           **kwargs):
    '''
    Converts the grid of evolution tracks to the EEP basis. For details on
    EEP functions, see the documentation for kiauhoku.eep.

    Parameters
    ----------
    eep_params (dict, None): contains a mapping from your grid's specific
        column names to the names used by kiauhoku's default EEP functions.
        It also contains 'eep_intervals', the number of secondary EEPs
        between each consecutive pair of primary EEPs. If none are supplied,
        kiauhoku will attempt to read them from a cache directory.

    eep_functions (dict, None): if the default EEP functions won't do the
        job, you can specify your own and supply them in a dictionary.
        EEP functions must have the call signature
        function(track, eep_params), where `track` is a single track.
        If none are supplied, the default functions will be used.

    metric_function (callable, None): the metric function is how the EEP
        interpolator spaces the secondary EEPs. By default, the path length
        along the evolution track on the H-R diagram (luminosity vs. Teff)
        is used, but you can specify your own if desired. metric_function
        must have the call signature function(track, eep_params), where
        `track` is a single track. If no function is supplied, defaults to
        kiauhoku.eep._HRD_distance.

    progress (bool, True): whether or not to display a progress bar.

    nprocs (int, None): how many parallel processes to use for MultiIndex
        DataFrames. If none is specified, defaults to the number of CPUs.

    **kwargs: extra keyword arguments to pass to parallel_progbar.

    Returns
    -------
    eep_frame (StarGrid): grid of EEP-based evolution tracks.
    '''

    # The user can specify eep_params, but if none are specified,
    # search for cached params.
    if not eep_params:
        eep_params = load_eep_params(self.name)

    # If self is a MultiIndexed DataFrame, split it into individual
    # tracks, convert to EEP basis, and recombine.
    if self.is_MultiIndex():
        def eep_pool_helper(i):
            # Not strictly necessary, but makes for cleaner mapping.
            track = self.loc[i, :]
            return _eep_interpolate(track, eep_params, eep_functions, metric_function)

        # Create the index iterator and pass it to the mapping/progress function
        idx = self.index.droplevel(-1).drop_duplicates()
        eep_tracks = parallel_progbar(eep_pool_helper, idx,
                                      verbose=progress, nprocs=nprocs, **kwargs)

        # Set up the MultiIndex tuples and remove Nones
        idx_list = [(*i, j) for i, tr in zip(idx, eep_tracks)
                    if tr is not None for j in tr.index]
        eep_tracks = [tr for tr in eep_tracks if tr is not None]

        # Create MultiIndex for the EEP frame
        multiindex = pd.MultiIndex.from_tuples(idx_list, names=[*idx.names, 'eep'])

        # Put it all together
        eep_frame = pd.concat(eep_tracks, ignore_index=True)
        eep_frame.index = multiindex

    # Otherwise, a single track was passed
    else:
        eep_frame = _eep_interpolate(self, eep_params, eep_functions, metric_function)

    # Cast DataFrame to StarGrid
    eep_frame = from_pandas(eep_frame, name=self.name, eep_params=eep_params)

    return eep_frame
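# Hedged usage sketch: convert a loaded grid of raw evolution tracks to the
# EEP basis. The loader name `load_full_grid` and the grid name 'mist' are
# assumptions about the surrounding kiauhoku API, not guaranteed by this file.
#
#   import kiauhoku as kh
#   full_grid = kh.load_full_grid('mist')
#   eep_grid = full_grid.to_eep(progress=True, nprocs=4)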