def load_without_size_preprocessing(input_folder, cv_fold_num, train_test, idx):

    image_folders_list, label_folders_list, patient_id_list = get_patient_folders(input_folder, cv_fold_num)

    # ==============================
    # First, get ed_es_diff for this subject
    # ==============================
    for item in os.listdir(image_folders_list[train_test][idx // 2][:-8]):
        if 'list.txt' in item:
            text_file = open(image_folders_list[train_test][idx // 2][:-8] + item, "r")
            slice_ids_with_annotations = []
            for l in text_file.readlines():
                slice_ids_with_annotations.append(int(float(l[-25:-21])))
            text_file.close()
            for slice_id in np.unique(slice_ids_with_annotations):
                if slice_id % 20 != 0:
                    ed_es_diff = int(slice_id % 20)

    # ==============================
    # read image and label
    # ==============================
    image_ED, image_ES, px, py, pz = read_image(image_folders_list[train_test][idx // 2], ed_es_diff)
    label_ED, label_ES = read_label(label_folders_list[train_test][idx // 2], image_ED.shape, ed_es_diff)

    img_ED = image_ED.copy()
    img_ES = image_ES.copy()
    lab_ED = label_ED.copy()
    lab_ES = label_ES.copy()

    # ============
    # normalize the images to be between 0 and 1
    # ============
    img_ED = utils.normalise_image(img_ED, norm_type='div_by_max')
    img_ES = utils.normalise_image(img_ES, norm_type='div_by_max')

    # ============
    # decide if ED or ES needs to be returned (even idx -> ED, odd idx -> ES)
    # ============
    if idx % 2 == 0:
        image = img_ED
        label = lab_ED
    else:
        image = img_ES
        label = lab_ES

    return image, label
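# ----------------------------------------------------------------
# A minimal usage sketch for the loader above; the folder path and index
# values are hypothetical. Subjects are indexed in pairs: idx // 2 selects
# the patient folder, idx % 2 selects the ED (0) or ES (1) volume.
# ----------------------------------------------------------------
# image, label = load_without_size_preprocessing(input_folder='/data/cardiac/',  # hypothetical path
#                                                cv_fold_num=1,
#                                                train_test='train',
#                                                idx=3)  # subject 1, ES phase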
def load_without_size_preprocessing(preproc_folder, patient_id):

    # ==================
    # read bias corrected image and ground truth segmentation
    # ==================
    filepath_bias_corrected_nii_format = preproc_folder + 'Case' + patient_id + '_n4.nii.gz'
    filepath_seg_nii_format = preproc_folder + 'Case' + patient_id + '_segmentation.nii.gz'

    # ================================
    # read bias corrected image
    # ================================
    image = utils.load_nii(filepath_bias_corrected_nii_format)[0]

    # ================================
    # normalize the image
    # ================================
    image = utils.normalise_image(image, norm_type='div_by_max')

    # ================================
    # read the labels
    # ================================
    label = utils.load_nii(filepath_seg_nii_format)[0]

    # ================================
    # skimage io with the simpleITK plugin was used to read the images in the
    # convert_to_nii_and_correct_bias_field function. This led to the arrays
    # being read as z-x-y. Move the axes appropriately, so that the resolution
    # read above is correct for the corresponding axes.
    # ================================
    image = np.swapaxes(np.swapaxes(image, 0, 1), 1, 2)
    label = np.swapaxes(np.swapaxes(label, 0, 1), 1, 2)

    return image, label
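# ----------------------------------------------------------------
# For reference: the chained swapaxes above is equivalent to a single
# transpose that reorders a z-x-y array into x-y-z. A minimal check
# (illustrative only, with an arbitrary array shape):
# ----------------------------------------------------------------
# a = np.random.rand(3, 4, 5)  # axes: z, x, y
# assert np.array_equal(np.swapaxes(np.swapaxes(a, 0, 1), 1, 2),
#                       np.transpose(a, (1, 2, 0)))  # axes: x, y, z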
def load_without_size_preprocessing(input_folder, cv_fold_num, train_test, idx):

    file_list = get_file_list(input_folder, cv_fold_num)
    image_file = file_list[train_test][idx]

    # ============
    # read image and normalize it to be between 0 and 1
    # ============
    image_dat = utils.load_nii(image_file)
    image = image_dat[0].copy()
    image = utils.normalise_image(image, norm_type='div_by_max')

    # ============
    # read label and set RV label to 1, others to 0
    # ============
    label_file = image_file.split('_n4.nii.gz')[0] + '_gt.nii.gz'
    label_dat = utils.load_nii(label_file)
    label = label_dat[0].copy()
    label[label != 1] = 0

    return image, label
def load_without_size_preprocessing(input_folder, site_name, idx, depth):

    # ========================
    # read the filenames
    # ========================
    filenames = sorted(glob.glob(input_folder + site_name + '/*/'))

    # ==================
    # get file paths
    # ==================
    patient_name, image_path, label_path = get_image_and_label_paths(filenames[idx])

    # ============
    # read the image
    # ============
    image, _, image_hdr = utils.load_nii(image_path)
    image = np.swapaxes(image, 1, 2)  # swap axes 1 and 2 -> this allows appending along axis 2, as in other datasets

    # ==================
    # read the label file
    # ==================
    label, _, _ = utils.load_nii(label_path)
    label = np.swapaxes(label, 1, 2)  # swap axes 1 and 2 -> this allows appending along axis 2, as in other datasets
    label = utils.group_segmentation_classes(label)  # group the segmentation classes as required

    # ============
    # create a segmentation mask and use it to get rid of the skull in the image
    # ============
    label_mask = np.copy(label)
    label_mask[label > 0] = 1
    image = image * label_mask

    # ==================
    # crop out some portion of the image, which is all zeros (rough registration via visual inspection)
    # ==================
    if site_name == 'CALTECH':
        image = image[:, 80:, :]
        label = label[:, 80:, :]
    elif site_name == 'STANFORD':
        image, label = center_image_and_label(image, label)

    # ==================
    # crop volume along z axis (as there are several zeros towards the ends)
    # ==================
    image = utils.crop_or_pad_volume_to_size_along_z(image, depth)
    label = utils.crop_or_pad_volume_to_size_along_z(label, depth)

    # ==================
    # normalize the image to be between 0 and 1
    # ==================
    image = utils.normalise_image(image, norm_type='div_by_max')

    return image, label
def log(self, writer, inputs, outputs, losses):

    print('logging')
    writer.add_scalar('lr', self.lr, self.step)

    # write to tensorboard
    for loss_type, loss in losses.items():
        writer.add_scalar('{}'.format(loss_type), loss, self.step)

    for i in range(min(4, len(inputs['image']))):
        writer.add_image('image_l/{}'.format(i), normalise_image(inputs['image'][i]), self.step)
        writer.add_image('image_r/{}'.format(i), normalise_image(inputs['stereo_image'][i]), self.step)

        if inputs.get('disparity') is not None:
            writer.add_image('disp_target/{}'.format(i), normalise_image(inputs['disparity'][i]), self.step)
            warped_image = self.warp_stereo_image(inputs['stereo_image'][i].cpu(), inputs['disparity'][i].cpu())
            writer.add_image('warped_gt_image/{}'.format(i), normalise_image(warped_image), self.step)

        if inputs.get('mono_disparity') is not None:
            writer.add_image('mono_disparity/{}'.format(i), normalise_image(inputs['mono_disparity'][i]), self.step)

        if inputs.get('occlusion_mask') is not None:
            writer.add_image('occlusion_mask/{}'.format(i), normalise_image(inputs['occlusion_mask'][i]), self.step)

        writer.add_image('disp_pred/{}'.format(i), normalise_image(outputs[('disp', 0)][i]), self.step)
        warped_image = self.warp_stereo_image(inputs['stereo_image'][i].cpu(), outputs[('disp', 0)][i].cpu())
        writer.add_image('warped_image/{}'.format(i), normalise_image(warped_image), self.step)
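# ----------------------------------------------------------------
# A minimal sketch of how this logging hook might be called from a
# training loop. The trainer attributes (self.lr, self.step) and the
# batch dict keys are taken from the method above; the writer setup, the
# loader, and the process_batch method are assumptions for illustration.
# ----------------------------------------------------------------
# from torch.utils.tensorboard import SummaryWriter
# writer = SummaryWriter(log_dir='runs/stereo')        # hypothetical log dir
# for inputs in train_loader:                          # hypothetical loader
#     outputs, losses = trainer.process_batch(inputs)  # hypothetical method
#     if trainer.step % 100 == 0:
#         trainer.log(writer, inputs, outputs, losses)
#     trainer.step += 1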
def load_without_size_preprocessing(input_folder, idx, labeller):

    # ===============================
    # read all the patient folders from the base input folder
    # ===============================
    folder_list = []
    for folder in os.listdir(input_folder):
        folder_path = os.path.join(input_folder, folder)
        if os.path.isdir(folder_path) and 't2_tse_tra.nii.gz' in os.listdir(folder_path):
            if 'segmentation_' + labeller + '.nii.gz' in os.listdir(folder_path) \
                    or 'segmentation_tra_' + labeller + '.nii.gz' in os.listdir(folder_path):
                folder_list.append(folder_path)

    # ==================
    # read the image file
    # ==================
    image, _, _ = utils.load_nii(folder_list[idx] + '/t2_tse_tra_n4.nii.gz')

    # ============
    # normalize the image to be between 0 and 1
    # ============
    image = utils.normalise_image(image, norm_type='div_by_max')

    # ==================
    # read the label file
    # ==================
    if 'segmentation_' + labeller + '.nii.gz' in os.listdir(folder_list[idx]):
        label, _, _ = utils.load_nii(folder_list[idx] + '/segmentation_' + labeller + '.nii.gz')
    elif 'segmentation_tra_' + labeller + '.nii.gz' in os.listdir(folder_list[idx]):
        label, _, _ = utils.load_nii(folder_list[idx] + '/segmentation_tra_' + labeller + '.nii.gz')

    # ==================
    # remove extra label from some images
    # ==================
    label[label > 2] = 0

    return image, label

# ===============================================================
# End of file
# ===============================================================
def load_without_size_preprocessing(input_folder, cv_fold_num, train_test, idx):

    # ===============================
    # read all the patient folders from the base input folder
    # ===============================
    image_folder = os.path.join(input_folder, 'Prostate-3T')
    label_folder = os.path.join(input_folder, 'NCI_ISBI_Challenge-Prostate3T_Training_Segmentations')
    folder_list = get_patient_folders(image_folder, folder_base='Prostate3T-01', cv_fold_number=cv_fold_num)
    folder = folder_list[train_test][idx]

    # ==================
    # make a list of all dcm images for this subject
    # ==================
    lstFilesDCM = []  # create an empty list
    for dirName, subdirList, fileList in os.walk(folder):
        for filename in fileList:
            if ".dcm" in filename.lower():  # check whether the file is DICOM
                lstFilesDCM.append(os.path.join(dirName, filename))

    # ==================
    # read bias corrected image
    # ==================
    nifti_img_path = lstFilesDCM[0][:lstFilesDCM[0].rfind('/') + 1]
    image = utils.load_nii(img_path=nifti_img_path + 'img_n4.nii.gz')[0]

    # ============
    # normalize the image to be between 0 and 1
    # ============
    image = utils.normalise_image(image, norm_type='div_by_max')

    # ==================
    # read the label file
    # ==================
    label = utils.load_nii(img_path=nifti_img_path + 'lbl.nii.gz')[0]

    return image, label
def load_without_size_preprocessing(input_folder, idx, protocol, preprocessing_folder, depth):

    # ========================
    # read the filenames
    # ========================
    filenames = sorted(glob.glob(input_folder + '*.zip'))

    # ==================
    # get file paths
    # ==================
    patient_name, image_path, label_path = get_image_and_label_paths(filenames[idx], protocol, preprocessing_folder)

    # ============
    # read the image and normalize it to be between 0 and 1
    # ============
    image, _, image_hdr = utils.load_nii(image_path)
    image = np.swapaxes(image, 1, 2)  # swap axes 1 and 2 -> this allows appending along axis 2, as in other datasets
    image = utils.normalise_image(image, norm_type='div_by_max')

    # ==================
    # read the label file
    # ==================
    label, _, _ = utils.load_nii(label_path)
    label = np.swapaxes(label, 1, 2)  # swap axes 1 and 2 -> this allows appending along axis 2, as in other datasets
    label = utils.group_segmentation_classes(label)  # group the segmentation classes as required

    # ==================
    # crop volume along z axis (as there are several zeros towards the ends)
    # ==================
    image = utils.crop_or_pad_volume_to_size_along_z(image, depth)
    label = utils.crop_or_pad_volume_to_size_along_z(label, depth)

    return image, label
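# ----------------------------------------------------------------
# utils.normalise_image with norm_type='div_by_max' is used throughout
# these loaders; its actual implementation lives in the repo's utils
# module and is not shown here. A minimal sketch of the assumed
# behaviour (shift to a zero minimum, then divide by the maximum, so
# values land in [0, 1]); this is an illustration, not the repo's code.
# ----------------------------------------------------------------
def normalise_image_div_by_max_sketch(image):
    image = image - np.min(image)            # assumed: shift so the minimum is 0
    return image / (np.max(image) + 1e-10)   # assumed: small epsilon for safety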
def prepare_data(input_folder,
                 output_file,
                 idx_start,
                 idx_end,
                 protocol,
                 size,
                 depth,
                 target_resolution,
                 preprocessing_folder):

    # ========================
    # read the filenames
    # ========================
    filenames = sorted(glob.glob(input_folder + '*.zip'))
    logging.info('Number of images in the dataset: %s' % str(len(filenames)))

    # =======================
    # create a new hdf5 file
    # =======================
    hdf5_file = h5py.File(output_file, "w")

    # ===============================
    # Create datasets for images and labels
    # ===============================
    data = {}
    num_subjects = idx_end - idx_start
    data['images'] = hdf5_file.create_dataset("images", [num_subjects] + list(size), dtype=np.float32)
    data['labels'] = hdf5_file.create_dataset("labels", [num_subjects] + list(size), dtype=np.uint8)

    # ===============================
    # initialize lists
    # ===============================
    label_list = []
    image_list = []
    nx_list = []
    ny_list = []
    nz_list = []
    px_list = []
    py_list = []
    pz_list = []
    pat_names_list = []

    # ===============================
    # initialize counter
    # ===============================
    patient_counter = 0

    # ===============================
    # iterate through the requested indices
    # ===============================
    for idx in range(idx_start, idx_end):

        # ==================
        # get file paths
        # ==================
        patient_name, image_path, label_path = get_image_and_label_paths(filenames[idx], protocol, preprocessing_folder)

        # ============
        # read the image and normalize it to be between 0 and 1
        # ============
        image, _, image_hdr = utils.load_nii(image_path)
        image = np.swapaxes(image, 1, 2)  # swap axes 1 and 2 -> this allows appending along axis 2, as in other datasets

        # ==================
        # read the label file
        # ==================
        label, _, _ = utils.load_nii(label_path)
        label = np.swapaxes(label, 1, 2)  # swap axes 1 and 2 -> this allows appending along axis 2, as in other datasets
        label = utils.group_segmentation_classes(label)  # group the segmentation classes as required

        # ==================
        # collect some header info
        # ==================
        px_list.append(float(image_hdr.get_zooms()[0]))
        py_list.append(float(image_hdr.get_zooms()[2]))  # since axes 1 and 2 have been swapped
        pz_list.append(float(image_hdr.get_zooms()[1]))
        nx_list.append(image.shape[0])
        ny_list.append(image.shape[2])  # since axes 1 and 2 have been swapped
        nz_list.append(image.shape[1])
        pat_names_list.append(patient_name)

        # ==================
        # crop volume along z axis (as there are several zeros towards the ends)
        # ==================
        image = utils.crop_or_pad_volume_to_size_along_z(image, depth)
        label = utils.crop_or_pad_volume_to_size_along_z(label, depth)

        # ==================
        # normalize the image
        # ==================
        image_normalized = utils.normalise_image(image, norm_type='div_by_max')

        # ======================================================
        # rescale, crop / pad to make all images of the required size and resolution
        # ======================================================
        scale_vector = [image_hdr.get_zooms()[0] / target_resolution[0],
                        image_hdr.get_zooms()[2] / target_resolution[1],
                        image_hdr.get_zooms()[1] / target_resolution[2]]  # since axes 1 and 2 have been swapped

        image_rescaled = transform.rescale(image_normalized,
                                           scale_vector,
                                           order=1,
                                           preserve_range=True,
                                           multichannel=False,
                                           mode='constant')

        label_onehot = utils.make_onehot_(label, nlabels=15)
        label_onehot_rescaled = transform.rescale(label_onehot,
                                                  scale_vector,
                                                  order=1,
                                                  preserve_range=True,
                                                  multichannel=True,
                                                  mode='constant')
        label_rescaled = np.argmax(label_onehot_rescaled, axis=-1)

        # ==================================
        # go through each z slice, crop or pad to a constant size and then append the resized slices.
        # this will ensure that the axes get arranged in the same orientation as they were during the 2d preprocessing
        # ==================================
        image_rescaled_cropped = []
        label_rescaled_cropped = []
        for zz in range(image_rescaled.shape[2]):
            image_rescaled_cropped.append(utils.crop_or_pad_slice_to_size(image_rescaled[:, :, zz], size[1], size[2]))
            label_rescaled_cropped.append(utils.crop_or_pad_slice_to_size(label_rescaled[:, :, zz], size[1], size[2]))
        image_rescaled_cropped = np.array(image_rescaled_cropped)
        label_rescaled_cropped = np.array(label_rescaled_cropped)

        # ============
        # append to list
        # ============
        image_list.append(image_rescaled_cropped)
        label_list.append(label_rescaled_cropped)

        # ============
        # write to file
        # ============
        _write_range_to_hdf5(data, image_list, label_list, patient_counter, patient_counter + 1)
        _release_tmp_memory(image_list, label_list)

        # update counter
        patient_counter += 1

    # Write the small datasets
    hdf5_file.create_dataset('nx', data=np.asarray(nx_list, dtype=np.uint16))
    hdf5_file.create_dataset('ny', data=np.asarray(ny_list, dtype=np.uint16))
    hdf5_file.create_dataset('nz', data=np.asarray(nz_list, dtype=np.uint16))
    hdf5_file.create_dataset('px', data=np.asarray(px_list, dtype=np.float32))
    hdf5_file.create_dataset('py', data=np.asarray(py_list, dtype=np.float32))
    hdf5_file.create_dataset('pz', data=np.asarray(pz_list, dtype=np.float32))
    hdf5_file.create_dataset('patnames', data=np.asarray(pat_names_list, dtype="S10"))

    # After the write loop: close the file
    hdf5_file.close()
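# ----------------------------------------------------------------
# A minimal usage sketch for the 3D preparation routine above; all paths
# and parameter values are hypothetical and would normally come from the
# experiment configuration.
# ----------------------------------------------------------------
# prepare_data(input_folder='/data/raw/',                # hypothetical
#              output_file='/data/preproc/data_3d.hdf5', # hypothetical
#              idx_start=0,
#              idx_end=20,
#              protocol='T1',                            # hypothetical
#              size=(64, 256, 256),                      # (nz, nx, ny)
#              depth=256,
#              target_resolution=(0.7, 0.7, 0.7),        # hypothetical, mm
#              preprocessing_folder='/data/preproc/')    # hypothetical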
def prepare_data(input_folder,
                 output_file,
                 mode,
                 size,               # for 3d: (nz, nx, ny), for 2d: (nx, ny)
                 target_resolution,  # for 3d: (px, py, pz), for 2d: (px, py)
                 cv_fold_num):
    '''
    Main function that prepares a dataset from the raw challenge data to an hdf5 dataset.
    '''

    assert (mode in ['2D', '3D']), 'Unknown mode: %s' % mode
    if mode == '2D' and not len(size) == 2:
        raise AssertionError('Inadequate number of size parameters')
    if mode == '3D' and not len(size) == 3:
        raise AssertionError('Inadequate number of size parameters')
    if mode == '2D' and not len(target_resolution) == 2:
        raise AssertionError('Inadequate number of target resolution parameters')
    if mode == '3D' and not len(target_resolution) == 3:
        raise AssertionError('Inadequate number of target resolution parameters')

    # ============
    # create an empty hdf5 file
    # ============
    hdf5_file = h5py.File(output_file, "w")

    # ============
    # create empty lists for filling header info
    # ============
    diag_list = {'test': [], 'train': [], 'validation': []}
    height_list = {'test': [], 'train': [], 'validation': []}
    weight_list = {'test': [], 'train': [], 'validation': []}
    patient_id_list = {'test': [], 'train': [], 'validation': []}
    cardiac_phase_list = {'test': [], 'train': [], 'validation': []}
    nx_list = {'test': [], 'train': [], 'validation': []}
    ny_list = {'test': [], 'train': [], 'validation': []}
    nz_list = {'test': [], 'train': [], 'validation': []}
    px_list = {'test': [], 'train': [], 'validation': []}
    py_list = {'test': [], 'train': [], 'validation': []}
    pz_list = {'test': [], 'train': [], 'validation': []}
    file_list = {'test': [], 'train': [], 'validation': []}
    num_slices = {'test': 0, 'train': 0, 'validation': 0}

    # ============
    # go through all images and get header info.
    # one round of parsing is done just to get all the header info. The size
    # info is used to create empty fields for the images and labels, with the
    # required sizes. Then, another round of reading the images and labels is
    # done; these are pre-processed and written into the hdf5 file.
    # ============
    for folder in os.listdir(input_folder):
        folder_path = os.path.join(input_folder, folder)

        if os.path.isdir(folder_path):

            # ============
            # train / test / validation split
            # ============
            train_test = test_train_val_split(patient_id=int(folder[-3:]),
                                              cv_fold_number=cv_fold_num)

            infos = {}
            for line in open(os.path.join(folder_path, 'Info.cfg')):
                label, value = line.split(':')
                infos[label] = value.rstrip('\n').lstrip(' ')

            patient_id = folder.lstrip('patient')

            # ============
            # read this patient's images and collect header info
            # ============
            for file in glob.glob(os.path.join(folder_path, 'patient???_frame??_n4.nii.gz')):

                # ============
                # list with file paths
                # ============
                file_list[train_test].append(file)
                diag_list[train_test].append(diagnosis_dict[infos['Group']])
                weight_list[train_test].append(infos['Weight'])
                height_list[train_test].append(infos['Height'])
                patient_id_list[train_test].append(patient_id)

                systole_frame = int(infos['ES'])
                diastole_frame = int(infos['ED'])
                file_base = file.split('.')[0]
                frame = int(file_base.split('frame')[-1][:-3])
                if frame == systole_frame:
                    cardiac_phase_list[train_test].append(1)  # 1 == systole
                elif frame == diastole_frame:
                    cardiac_phase_list[train_test].append(2)  # 2 == diastole
                else:
                    cardiac_phase_list[train_test].append(0)  # 0 == other phase

                nifty_img = nib.load(file)
                nx_list[train_test].append(nifty_img.shape[0])
                ny_list[train_test].append(nifty_img.shape[1])
                nz_list[train_test].append(nifty_img.shape[2])
                num_slices[train_test] += nifty_img.shape[2]
                py_list[train_test].append(nifty_img.header.structarr['pixdim'][2])
                px_list[train_test].append(nifty_img.header.structarr['pixdim'][1])
                pz_list[train_test].append(nifty_img.header.structarr['pixdim'][3])

    # ============
    # writing the small datasets
    # ============
    for tt in ['test', 'train', 'validation']:
        hdf5_file.create_dataset('diagnosis_%s' % tt, data=np.asarray(diag_list[tt], dtype=np.uint8))
        hdf5_file.create_dataset('weight_%s' % tt, data=np.asarray(weight_list[tt], dtype=np.float32))
        hdf5_file.create_dataset('height_%s' % tt, data=np.asarray(height_list[tt], dtype=np.float32))
        hdf5_file.create_dataset('patient_id_%s' % tt, data=np.asarray(patient_id_list[tt], dtype=np.uint8))
        hdf5_file.create_dataset('cardiac_phase_%s' % tt, data=np.asarray(cardiac_phase_list[tt], dtype=np.uint8))
        hdf5_file.create_dataset('nz_%s' % tt, data=np.asarray(nz_list[tt], dtype=np.uint16))
        hdf5_file.create_dataset('ny_%s' % tt, data=np.asarray(ny_list[tt], dtype=np.uint16))
        hdf5_file.create_dataset('nx_%s' % tt, data=np.asarray(nx_list[tt], dtype=np.uint16))
        hdf5_file.create_dataset('py_%s' % tt, data=np.asarray(py_list[tt], dtype=np.float32))
        hdf5_file.create_dataset('px_%s' % tt, data=np.asarray(px_list[tt], dtype=np.float32))
        hdf5_file.create_dataset('pz_%s' % tt, data=np.asarray(pz_list[tt], dtype=np.float32))

    # ============
    # setting sizes according to 2d or 3d
    # ============
    if mode == '3D':  # size: [num_patients, nz, nx, ny]
        nz_max, nx, ny = size
        n_train = len(file_list['train'])  # number of patients
        n_test = len(file_list['test'])
        n_val = len(file_list['validation'])
    elif mode == '2D':  # size: [num_z_slices_across_all_patients, nx, ny]
        nx, ny = size
        n_test = num_slices['test']
        n_train = num_slices['train']
        n_val = num_slices['validation']
    else:
        raise AssertionError('Wrong mode setting. This should never happen.')

    # ============
    # creating datasets for images and labels
    # ============
    data = {}
    for tt, num_points in zip(['test', 'train', 'validation'], [n_test, n_train, n_val]):
        if num_points > 0:
            data['images_%s' % tt] = hdf5_file.create_dataset("images_%s" % tt, [num_points] + list(size), dtype=np.float32)
            data['labels_%s' % tt] = hdf5_file.create_dataset("labels_%s" % tt, [num_points] + list(size), dtype=np.uint8)

    image_list = {'test': [], 'train': [], 'validation': []}
    label_list = {'test': [], 'train': [], 'validation': []}

    for train_test in ['test', 'train', 'validation']:

        write_buffer = 0
        counter_from = 0
        patient_counter = 0

        for image_file in file_list[train_test]:

            patient_counter += 1
            logging.info('============================================')
            logging.info('Doing: %s' % image_file)

            # ============
            # read image
            # ============
            image_dat = utils.load_nii(image_file)
            image = image_dat[0].copy()

            # ============
            # normalize the image to be between 0 and 1
            # ============
            image = utils.normalise_image(image, norm_type='div_by_max')

            # ============
            # read label
            # ============
            file_base = image_file.split('_n4.nii.gz')[0]
            label_file = file_base + '_gt.nii.gz'
            label_dat = utils.load_nii(label_file)
            label = label_dat[0].copy()

            # ============
            # for the RVSC dataset (which only has labels for the RV), the RV label
            # would be set to 1 and all other labels to 0.
            # original labels: 0 background, 1 right ventricle, 2 myocardium, 3 left ventricle
            # ============
            # label[label != 1] = 0

            # ============
            # original pixel size (px, py, pz)
            # ============
            pixel_size = (image_dat[2].structarr['pixdim'][1],
                          image_dat[2].structarr['pixdim'][2],
                          image_dat[2].structarr['pixdim'][3])

            # ========================================================================
            # PROCESSING LOOP FOR 3D DATA
            # ========================================================================
            if mode == '3D':

                # rescaling ratio
                scale_vector = [pixel_size[0] / target_resolution[0],
                                pixel_size[1] / target_resolution[1],
                                pixel_size[2] / target_resolution[2]]

                # ==============================
                # rescale image and label
                # ==============================
                image_scaled = transform.rescale(image, scale_vector, order=1, preserve_range=True, multichannel=False, mode='constant')
                label_scaled = transform.rescale(label, scale_vector, order=0, preserve_range=True, multichannel=False, mode='constant')

                # ==============================
                # crop or pad along z to nz_max, the z-dimension provided in the 'size' parameter
                # ==============================
                image_scaled = utils.crop_or_pad_volume_to_size_along_z(image_scaled, nz_max)
                label_scaled = utils.crop_or_pad_volume_to_size_along_z(label_scaled, nz_max)

                image_vol = np.zeros((nx, ny, nz_max), dtype=np.float32)
                label_vol = np.zeros((nx, ny, nz_max), dtype=np.uint8)

                # ===============================
                # going through each z slice
                # ===============================
                for zz in range(nz_max):
                    image_slice = image_scaled[:, :, zz]
                    label_slice = label_scaled[:, :, zz]
                    # cropping / padding with zeros the x-y slice at this z location
                    image_slice_cropped = utils.crop_or_pad_slice_to_size(image_slice, nx, ny)
                    label_slice_cropped = utils.crop_or_pad_slice_to_size(label_slice, nx, ny)
                    image_vol[:, :, zz] = image_slice_cropped
                    label_vol[:, :, zz] = label_slice_cropped

                # ===============================
                # swap axes to maintain consistent orientation as compared to 2d pre-processing
                # ===============================
                image_vol = image_vol.swapaxes(0, 2).swapaxes(1, 2)
                label_vol = label_vol.swapaxes(0, 2).swapaxes(1, 2)

                # ===============================
                # append to list that will be written to the hdf5 file
                # ===============================
                image_list[train_test].append(image_vol)
                label_list[train_test].append(label_vol)
                write_buffer += 1

                # ===============================
                # writing the images and labels pre-processed so far to the hdf5 file
                # ===============================
                if write_buffer >= MAX_WRITE_BUFFER:
                    counter_to = counter_from + write_buffer
                    _write_range_to_hdf5(data, train_test, image_list, label_list, counter_from, counter_to)
                    _release_tmp_memory(image_list, label_list, train_test)
                    # reset stuff for next iteration
                    counter_from = counter_to
                    write_buffer = 0

            # ========================================================================
            # PROCESSING LOOP FOR SLICE-BY-SLICE 2D DATA
            # ========================================================================
            elif mode == '2D':

                scale_vector = [pixel_size[0] / target_resolution[0],
                                pixel_size[1] / target_resolution[1]]

                # ===============================
                # go through each z slice, rescale, crop and append.
                # in this process, the z axis will become the zeroth axis
                # ===============================
                for zz in range(image.shape[2]):
                    image_slice = np.squeeze(image[:, :, zz])
                    label_slice = np.squeeze(label[:, :, zz])

                    image_slice_rescaled = transform.rescale(image_slice, scale_vector, order=1, preserve_range=True, multichannel=False, mode='constant')
                    label_slice_rescaled = transform.rescale(label_slice, scale_vector, order=0, preserve_range=True, multichannel=False, mode='constant')

                    image_slice_cropped = utils.crop_or_pad_slice_to_size(image_slice_rescaled, nx, ny)
                    label_slice_cropped = utils.crop_or_pad_slice_to_size(label_slice_rescaled, nx, ny)

                    image_list[train_test].append(image_slice_cropped)
                    label_list[train_test].append(label_slice_cropped)
                    write_buffer += 1

                    # Writing needs to happen inside the loop over the slices
                    if write_buffer >= MAX_WRITE_BUFFER:
                        counter_to = counter_from + write_buffer
                        _write_range_to_hdf5(data, train_test, image_list, label_list, counter_from, counter_to)
                        _release_tmp_memory(image_list, label_list, train_test)
                        # reset stuff for next iteration
                        counter_from = counter_to
                        write_buffer = 0

        logging.info('Writing remaining data')
        counter_to = counter_from + write_buffer
        _write_range_to_hdf5(data, train_test, image_list, label_list, counter_from, counter_to)
        _release_tmp_memory(image_list, label_list, train_test)

    # After the train/test/validation loop: close the file
    hdf5_file.close()
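# ----------------------------------------------------------------
# _write_range_to_hdf5 and _release_tmp_memory are repo-internal helpers
# whose definitions are not shown in this section. The sketches below are
# assumptions inferred from the call sites above (flush the buffered
# slices into the hdf5 datasets at rows [counter_from:counter_to], then
# clear the buffers); they are illustrations, not the repo's code.
# ----------------------------------------------------------------
def _write_range_to_hdf5_sketch(data, train_test, img_list, lab_list, counter_from, counter_to):
    # assumed: convert the buffered slices to arrays and write them into
    # the corresponding rows of the pre-allocated hdf5 datasets
    img_arr = np.asarray(img_list[train_test], dtype=np.float32)
    lab_arr = np.asarray(lab_list[train_test], dtype=np.uint8)
    data['images_%s' % train_test][counter_from:counter_to, ...] = img_arr
    data['labels_%s' % train_test][counter_from:counter_to, ...] = lab_arr

def _release_tmp_memory_sketch(img_list, lab_list, train_test):
    # assumed: empty the per-split buffers and nudge the garbage collector
    import gc
    img_list[train_test].clear()
    lab_list[train_test].clear()
    gc.collect()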
def prepare_data(input_folder,
                 output_file,
                 idx_start,
                 idx_end,
                 protocol,
                 size,
                 depth,
                 target_resolution,
                 preprocessing_folder):

    # ========================
    # read the filenames
    # ========================
    folders_list = sorted(glob.glob(input_folder + '/*/'))
    logging.info('Number of images in the dataset: %s' % str(len(folders_list)))

    # =======================
    # create a hdf5 file
    # =======================
    hdf5_file = h5py.File(output_file, "w")

    # ===============================
    # Create datasets for images and labels
    # ===============================
    data = {}
    num_slices = count_slices(folders_list, idx_start, idx_end, depth)
    data['images'] = hdf5_file.create_dataset("images", [num_slices] + list(size), dtype=np.float32)
    data['labels'] = hdf5_file.create_dataset("labels", [num_slices] + list(size), dtype=np.uint8)

    # ===============================
    # initialize lists
    # ===============================
    label_list = []
    image_list = []
    nx_list = []
    ny_list = []
    nz_list = []
    px_list = []
    py_list = []
    pz_list = []
    pat_names_list = []

    # ===============================
    # initialize counters
    # ===============================
    write_buffer = 0
    counter_from = 0

    # ===============================
    # iterate through the requested indices
    # ===============================
    for idx in range(idx_start, idx_end):

        # ==================
        # get file paths
        # ==================
        patient_name, image_path, label_path = get_image_and_label_paths(folders_list[idx])

        # ============
        # read the image
        # ============
        image, _, image_hdr = utils.load_nii(image_path)
        image = np.swapaxes(image, 1, 2)  # swap axes 1 and 2 -> this allows appending along axis 2, as in other datasets

        # ==================
        # read the label file
        # ==================
        label, _, _ = utils.load_nii(label_path)
        label = np.swapaxes(label, 1, 2)  # swap axes 1 and 2 -> this allows appending along axis 2, as in other datasets
        # labels have already been grouped as required

        # ============
        # create a segmentation mask and use it to get rid of the skull in the image
        # ============
        label_mask = np.copy(label)
        label_mask[label > 0] = 1
        image = image * label_mask

        # ==================
        # crop out some portion of the image, which is all zeros (rough registration via visual inspection)
        # ==================
        image, label = center_image_and_label(image, label)

        # plt.figure(); plt.imshow(image[:, :, 50], cmap='gray'); plt.title(patient_name); plt.show(); plt.close()

        # ==================
        # crop volume along z axis (as there are several zeros towards the ends)
        # ==================
        image = utils.crop_or_pad_volume_to_size_along_z(image, depth)
        label = utils.crop_or_pad_volume_to_size_along_z(label, depth)

        # ==================
        # collect some header info
        # ==================
        px_list.append(float(image_hdr.get_zooms()[0]))
        py_list.append(float(image_hdr.get_zooms()[2]))  # since axes 1 and 2 have been swapped; this is important when dealing with pixel dimensions
        pz_list.append(float(image_hdr.get_zooms()[1]))
        nx_list.append(image.shape[0])
        ny_list.append(image.shape[1])  # since axes 1 and 2 have been swapped; however, only the final axis locations are relevant when dealing with shapes
        nz_list.append(image.shape[2])
        pat_names_list.append(patient_name)

        # ==================
        # normalize the image
        # ==================
        image_normalized = utils.normalise_image(image, norm_type='div_by_max')

        # ======================================================
        # PROCESSING LOOP FOR SLICE-BY-SLICE 2D DATA
        # ======================================================
        scale_vector = [image_hdr.get_zooms()[0] / target_resolution[0],
                        image_hdr.get_zooms()[2] / target_resolution[1]]  # since axes 1 and 2 have been swapped

        for zz in range(image.shape[2]):

            # ============
            # rescale the images and labels so that their resolution matches that of the nci dataset
            # ============
            image2d_rescaled = rescale(np.squeeze(image_normalized[:, :, zz]), scale_vector, order=1, preserve_range=True, multichannel=False, mode='constant')
            label2d_rescaled = rescale(np.squeeze(label[:, :, zz]), scale_vector, order=0, preserve_range=True, multichannel=False, mode='constant')

            # ============
            # rotate to align with other datasets (k=0 leaves the slices unchanged for this dataset)
            # ============
            image2d_rescaled_rotated = np.rot90(image2d_rescaled, k=0)
            label2d_rescaled_rotated = np.rot90(label2d_rescaled, k=0)

            # ============
            # crop or pad to make all slices the same size
            # ============
            image2d_rescaled_rotated_cropped = utils.crop_or_pad_slice_to_size(image2d_rescaled_rotated, size[0], size[1])
            label2d_rescaled_rotated_cropped = utils.crop_or_pad_slice_to_size(label2d_rescaled_rotated, size[0], size[1])

            # ============
            # append to list
            # ============
            image_list.append(image2d_rescaled_rotated_cropped)
            label_list.append(label2d_rescaled_rotated_cropped)
            write_buffer += 1

            # Writing needs to happen inside the loop over the slices
            if write_buffer >= MAX_WRITE_BUFFER:
                counter_to = counter_from + write_buffer
                _write_range_to_hdf5(data, image_list, label_list, counter_from, counter_to)
                _release_tmp_memory(image_list, label_list)
                # update counters
                counter_from = counter_to
                write_buffer = 0

    logging.info('Writing remaining data')
    counter_to = counter_from + write_buffer
    _write_range_to_hdf5(data, image_list, label_list, counter_from, counter_to)
    _release_tmp_memory(image_list, label_list)

    # Write the small datasets
    hdf5_file.create_dataset('nx', data=np.asarray(nx_list, dtype=np.uint16))
    hdf5_file.create_dataset('ny', data=np.asarray(ny_list, dtype=np.uint16))
    hdf5_file.create_dataset('nz', data=np.asarray(nz_list, dtype=np.uint16))
    hdf5_file.create_dataset('px', data=np.asarray(px_list, dtype=np.float32))
    hdf5_file.create_dataset('py', data=np.asarray(py_list, dtype=np.float32))
    hdf5_file.create_dataset('pz', data=np.asarray(pz_list, dtype=np.float32))
    hdf5_file.create_dataset('patnames', data=np.asarray(pat_names_list, dtype="S10"))

    # After the write loop: close the file
    hdf5_file.close()
def prepare_data(input_image_folder, input_mask_folder, output_file, size, target_resolution):
    '''
    Main function that prepares a dataset from the raw challenge data to an hdf5 dataset.
    '''

    hdf5_file = h5py.File(output_file, "w")

    expert_list = ['Readings_AH', 'Readings_EK', 'Readings_KC', 'Readings_KS', 'Readings_OD', 'Readings_UM']
    num_annotators = len(expert_list)

    patient_id_list = {'test': [], 'train': [], 'validation': []}
    image_file_list = {'test': [], 'train': [], 'validation': []}
    mask_file_list = {'test': [], 'train': [], 'validation': []}
    num_slices = {'test': 0, 'train': 0, 'validation': 0}

    logging.info('Counting files and parsing meta data...')
    for folder in os.listdir(input_image_folder):

        folder_path = os.path.join(input_image_folder, folder)
        if os.path.isdir(folder_path) and folder.startswith('888'):

            patient_id = int(folder.lstrip('888'))

            if patient_id == 9:
                logging.info('WARNING: Skipping case 9, because one annotation has wrong dimensions...')
                continue

            if patient_id % 5 == 0:
                train_test = 'test'
            elif patient_id % 4 == 0:
                train_test = 'validation'
            else:
                train_test = 'train'

            file_path = os.path.join(folder_path, 't2_tse_tra.nii.gz')

            annotator_mask_list = []
            for exp in expert_list:
                mask_folder = os.path.join(input_mask_folder, exp)
                file = glob.glob(os.path.join(mask_folder, '*' + str(patient_id).zfill(4) + '_*.nii.gz'))
                # for ii in range(len(file)):
                #     if 'NCI' in file[ii]:
                #         del file[ii]
                assert len(file) == 1, 'more or less than one file matches the glob pattern %s' % ('*' + str(patient_id).zfill(4) + '_*.nii.gz')
                annotator_mask_list.append(file[0])

            mask_file_list[train_test].append(annotator_mask_list)
            image_file_list[train_test].append(file_path)
            patient_id_list[train_test].append(patient_id)

            nifty_img = nib.load(file_path)
            num_slices[train_test] += nifty_img.shape[2]

    # Write the small datasets
    for tt in ['test', 'train', 'validation']:
        hdf5_file.create_dataset('patient_id_%s' % tt, data=np.asarray(patient_id_list[tt], dtype=np.uint8))

    nx, ny = size
    n_test = num_slices['test']
    n_train = num_slices['train']
    n_val = num_slices['validation']

    print('Debug: Check if sets add up to correct value:')
    print(n_train, n_val, n_test, n_train + n_val + n_test)

    # Create datasets for images and masks
    data = {}
    for tt, num_points in zip(['test', 'train', 'validation'], [n_test, n_train, n_val]):
        if num_points > 0:
            data['images_%s' % tt] = hdf5_file.create_dataset("images_%s" % tt, [num_points] + list(size), dtype=np.float32)
            data['masks_%s' % tt] = hdf5_file.create_dataset("masks_%s" % tt, [num_points] + list(size) + [num_annotators], dtype=np.uint8)

    mask_list = {'test': [], 'train': [], 'validation': []}
    img_list = {'test': [], 'train': [], 'validation': []}

    logging.info('Parsing image files')
    for train_test in ['test', 'train', 'validation']:

        write_buffer = 0
        counter_from = 0
        patient_counter = 0

        for img_file, mask_files in zip(image_file_list[train_test], mask_file_list[train_test]):

            patient_counter += 1
            logging.info('-----------------------------------------------------------')
            logging.info('Doing: %s' % img_file)

            img_dat = utils.load_nii(img_file)
            img = img_dat[0]

            masks = []
            for mf in mask_files:
                mask_dat = utils.load_nii(mf)
                masks.append(mask_dat[0])
            masks_arr = np.asarray(masks)  # annotator, size_x, size_y, size_z
            masks_arr = masks_arr.transpose((1, 2, 3, 0))  # size_x, size_y, size_z, annotator

            img = utils.normalise_image(img)

            pixel_size = (img_dat[2].structarr['pixdim'][1],
                          img_dat[2].structarr['pixdim'][2],
                          img_dat[2].structarr['pixdim'][3])
            logging.info('Pixel size:')
            logging.info(pixel_size)

            scale_vector = [pixel_size[0] / target_resolution[0],
                            pixel_size[1] / target_resolution[1]]

            for zz in range(img.shape[2]):

                slice_img = np.squeeze(img[:, :, zz])
                slice_rescaled = transform.rescale(slice_img, scale_vector, order=1, preserve_range=True, multichannel=False, mode='constant')

                slice_mask = np.squeeze(masks_arr[:, :, zz, :])
                mask_rescaled = transform.rescale(slice_mask, scale_vector, order=0, preserve_range=True, multichannel=True, mode='constant')

                slice_cropped = crop_or_pad_slice_to_size(slice_rescaled, nx, ny)
                mask_cropped = crop_or_pad_slice_to_size(mask_rescaled, nx, ny)

                # REMOVE SEMINAL VESICLES
                mask_cropped[mask_cropped == 3] = 0

                # DEBUG
                # import matplotlib.pyplot as plt
                # plt.figure(); plt.imshow(slice_img)
                # plt.figure(); plt.imshow(slice_rescaled)
                # plt.figure(); plt.imshow(slice_cropped)
                # plt.show()
                # END DEBUG

                img_list[train_test].append(slice_cropped)
                mask_list[train_test].append(mask_cropped)
                write_buffer += 1

                # Writing needs to happen inside the loop over the slices
                if write_buffer >= MAX_WRITE_BUFFER:
                    counter_to = counter_from + write_buffer
                    _write_range_to_hdf5(data, train_test, img_list, mask_list, counter_from, counter_to)
                    _release_tmp_memory(img_list, mask_list, train_test)
                    # reset stuff for next iteration
                    counter_from = counter_to
                    write_buffer = 0

        # after file loop: write the remaining data
        logging.info('Writing remaining data')
        counter_to = counter_from + write_buffer
        _write_range_to_hdf5(data, train_test, img_list, mask_list, counter_from, counter_to)
        _release_tmp_memory(img_list, mask_list, train_test)

    # After the train/test/validation loop: close the file
    hdf5_file.close()
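# ----------------------------------------------------------------
# crop_or_pad_slice_to_size is defined elsewhere in the repo; a minimal
# sketch of the assumed behaviour for a 2D slice (centre-crop dimensions
# that are too large, zero-pad ones that are too small). This is an
# illustration, not the repo's implementation.
# ----------------------------------------------------------------
def crop_or_pad_slice_to_size_sketch(slice_2d, nx, ny):
    x, y = slice_2d.shape
    x_s, y_s = (x - nx) // 2, (y - ny) // 2  # crop offsets (if positive)
    x_c, y_c = (nx - x) // 2, (ny - y) // 2  # pad offsets (if positive)
    out = np.zeros((nx, ny), dtype=slice_2d.dtype)
    out[max(x_c, 0):max(x_c, 0) + min(x, nx),
        max(y_c, 0):max(y_c, 0) + min(y, ny)] = \
        slice_2d[max(x_s, 0):max(x_s, 0) + min(x, nx),
                 max(y_s, 0):max(y_s, 0) + min(y, ny)]
    return out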
def prepare_data(input_folder,
                 output_file,
                 idx_start,
                 idx_end,
                 protocol,
                 size,
                 depth,
                 target_resolution,
                 preprocessing_folder):

    # ========================
    # read the filenames
    # ========================
    filenames = sorted(glob.glob(input_folder + '*.zip'))
    logging.info('Number of images in the dataset: %s' % str(len(filenames)))

    # =======================
    # create a new hdf5 file
    # =======================
    hdf5_file = h5py.File(output_file, "w")

    # ===============================
    # Create datasets for images and labels.
    # the sizes of the image and label arrays are set as:
    # [(number of coronal slices per subject * number of subjects), size of coronal slices]
    # ===============================
    data = {}
    num_slices = count_slices(filenames, idx_start, idx_end, protocol, preprocessing_folder, depth)
    data['images'] = hdf5_file.create_dataset("images", [num_slices] + list(size), dtype=np.float32)
    data['labels'] = hdf5_file.create_dataset("labels", [num_slices] + list(size), dtype=np.uint8)

    # ===============================
    # initialize lists
    # ===============================
    label_list = []
    image_list = []
    nx_list = []
    ny_list = []
    nz_list = []
    px_list = []
    py_list = []
    pz_list = []
    pat_names_list = []

    # ===============================
    # initialize counters
    # ===============================
    write_buffer = 0
    counter_from = 0

    # ===============================
    # iterate through the requested indices
    # ===============================
    for idx in range(idx_start, idx_end):

        # ==================
        # get file paths
        # ==================
        patient_name, image_path, label_path = get_image_and_label_paths(filenames[idx], protocol, preprocessing_folder)

        # ============
        # read the image and normalize it to be between 0 and 1
        # ============
        image, _, image_hdr = utils.load_nii(image_path)
        image = np.swapaxes(image, 1, 2)  # swap axes 1 and 2 -> this allows appending along axis 2, as in other datasets

        # ==================
        # read the label file
        # ==================
        label, _, _ = utils.load_nii(label_path)
        label = np.swapaxes(label, 1, 2)  # swap axes 1 and 2 -> this allows appending along axis 2, as in other datasets
        label = utils.group_segmentation_classes(label)  # group the segmentation classes as required

        # ==================
        # crop volume along z axis (as there are several zeros towards the ends)
        # ==================
        image = utils.crop_or_pad_volume_to_size_along_z(image, depth)
        label = utils.crop_or_pad_volume_to_size_along_z(label, depth)

        # ==================
        # collect some header info
        # ==================
        px_list.append(float(image_hdr.get_zooms()[0]))
        py_list.append(float(image_hdr.get_zooms()[2]))  # since axes 1 and 2 have been swapped
        pz_list.append(float(image_hdr.get_zooms()[1]))
        nx_list.append(image.shape[0])
        ny_list.append(image.shape[1])  # since axes 1 and 2 have been swapped
        nz_list.append(image.shape[2])
        pat_names_list.append(patient_name)

        # ==================
        # normalize the image
        # ==================
        image_normalized = utils.normalise_image(image, norm_type='div_by_max')

        # ======================================================
        # PROCESSING LOOP FOR SLICE-BY-SLICE 2D DATA
        # ======================================================
        scale_vector = [image_hdr.get_zooms()[0] / target_resolution[0],
                        image_hdr.get_zooms()[2] / target_resolution[1]]  # since axes 1 and 2 have been swapped

        for zz in range(image.shape[2]):

            # ============
            # rescale the images and labels so that their resolution matches that of the nci dataset
            # ============
            image2d_rescaled = rescale(np.squeeze(image_normalized[:, :, zz]), scale_vector, order=1, preserve_range=True, multichannel=False, mode='constant')
            label2d_rescaled = rescale(np.squeeze(label[:, :, zz]), scale_vector, order=0, preserve_range=True, multichannel=False, mode='constant')

            # ============
            # crop or pad to make all slices the same size
            # ============
            image2d_rescaled_cropped = utils.crop_or_pad_slice_to_size(image2d_rescaled, size[0], size[1])
            label2d_rescaled_cropped = utils.crop_or_pad_slice_to_size(label2d_rescaled, size[0], size[1])

            # ============
            # append to list
            # ============
            image_list.append(image2d_rescaled_cropped)
            label_list.append(label2d_rescaled_cropped)

            # ============
            # increment counter
            # ============
            write_buffer += 1

            # ============
            # Writing needs to happen inside the loop over the slices
            # ============
            if write_buffer >= MAX_WRITE_BUFFER:
                counter_to = counter_from + write_buffer
                _write_range_to_hdf5(data, image_list, label_list, counter_from, counter_to)
                _release_tmp_memory(image_list, label_list)
                # ============
                # update counters
                # ============
                counter_from = counter_to
                write_buffer = 0

    # ============
    # write leftover data
    # ============
    logging.info('Writing remaining data')
    counter_to = counter_from + write_buffer
    _write_range_to_hdf5(data, image_list, label_list, counter_from, counter_to)
    _release_tmp_memory(image_list, label_list)

    # ============
    # Write the small datasets - image resolutions, sizes, patient ids
    # ============
    hdf5_file.create_dataset('nx', data=np.asarray(nx_list, dtype=np.uint16))
    hdf5_file.create_dataset('ny', data=np.asarray(ny_list, dtype=np.uint16))
    hdf5_file.create_dataset('nz', data=np.asarray(nz_list, dtype=np.uint16))
    hdf5_file.create_dataset('px', data=np.asarray(px_list, dtype=np.float32))
    hdf5_file.create_dataset('py', data=np.asarray(py_list, dtype=np.float32))
    hdf5_file.create_dataset('pz', data=np.asarray(pz_list, dtype=np.float32))
    hdf5_file.create_dataset('patnames', data=np.asarray(pat_names_list, dtype="S10"))

    # ============
    # close the hdf5 file
    # ============
    hdf5_file.close()
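# ----------------------------------------------------------------
# The scale_vector arithmetic above: each rescale factor is the ratio of
# the original pixel size to the target resolution. For example, with a
# (hypothetical) in-plane pixel size of 0.7 mm and a target resolution of
# 1.4 mm, the factor is 0.7 / 1.4 = 0.5, i.e. each slice is downsampled
# to half its original width and height; conversely, a factor above 1
# upsamples the slice.
# ----------------------------------------------------------------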
def prepare_data(input_folder, output_file, mode, size, target_resolution, cv_fold_num):
    '''
    Main function that prepares a dataset from the raw challenge data to an hdf5 dataset.
    '''

    hdf5_file = h5py.File(output_file, "w")

    cardiac_phase_list = {'test': [], 'train': [], 'validation': []}
    nx_list = {'test': [], 'train': [], 'validation': []}
    ny_list = {'test': [], 'train': [], 'validation': []}
    nz_list = {'test': [], 'train': [], 'validation': []}
    px_list = {'test': [], 'train': [], 'validation': []}
    py_list = {'test': [], 'train': [], 'validation': []}
    pz_list = {'test': [], 'train': [], 'validation': []}
    num_slices = {'test': 0, 'train': 0, 'validation': 0}

    # ==============================
    # read all images and save header info
    # ==============================
    image_folders_list, label_folders_list, patient_id_list = get_patient_folders(input_folder, cv_fold_num)

    for tt in ['train', 'test', 'validation']:
        for sub_id in range(len(image_folders_list[tt])):
            image_details = get_image_details(image_folders_list[tt][sub_id])
            for _ in range(2):  # append details for 2 volumes - ED and ES
                px_list[tt].append(image_details[0])
                py_list[tt].append(image_details[1])
                pz_list[tt].append(image_details[2])
                nx_list[tt].append(image_details[3])
                ny_list[tt].append(image_details[4])
                nz_list[tt].append(image_details[5])
                num_slices[tt] += image_details[5]
            cardiac_phase_list[tt].append(1)  # 'ES'
            cardiac_phase_list[tt].append(2)  # 'ED'

    # ==============================
    # Write the small datasets
    # ==============================
    for tt in ['test', 'train', 'validation']:
        hdf5_file.create_dataset('patient_id_%s' % tt, data=np.asarray(patient_id_list[tt], dtype=np.uint8))
        hdf5_file.create_dataset('cardiac_phase_%s' % tt, data=np.asarray(cardiac_phase_list[tt], dtype=np.uint8))
        hdf5_file.create_dataset('nz_%s' % tt, data=np.asarray(nz_list[tt], dtype=np.uint16))
        hdf5_file.create_dataset('ny_%s' % tt, data=np.asarray(ny_list[tt], dtype=np.uint16))
        hdf5_file.create_dataset('nx_%s' % tt, data=np.asarray(nx_list[tt], dtype=np.uint16))
        hdf5_file.create_dataset('py_%s' % tt, data=np.asarray(py_list[tt], dtype=np.float32))
        hdf5_file.create_dataset('px_%s' % tt, data=np.asarray(px_list[tt], dtype=np.float32))
        hdf5_file.create_dataset('pz_%s' % tt, data=np.asarray(pz_list[tt], dtype=np.float32))

    # ==============================
    # set dimensions for the hdf5 file
    # ==============================
    nx, ny = size
    n_test = num_slices['test']
    n_train = num_slices['train']
    n_val = num_slices['validation']

    # ==============================
    # Create datasets for images and labels
    # ==============================
    data = {}
    for tt, num_points in zip(['test', 'train', 'validation'], [n_test, n_train, n_val]):
        if num_points > 0:
            data['images_%s' % tt] = hdf5_file.create_dataset("images_%s" % tt, [num_points] + list(size), dtype=np.float32)
            data['labels_%s' % tt] = hdf5_file.create_dataset("labels_%s" % tt, [num_points] + list(size), dtype=np.uint8)

    label_list = {'test': [], 'train': [], 'validation': []}
    image_list = {'test': [], 'train': [], 'validation': []}

    # ==============================
    # read each image and label
    # ==============================
    logging.info('Parsing image files')
    for train_test in ['test', 'train', 'validation']:

        write_buffer = 0
        counter_from = 0
        patient_counter = 0

        for sub_num in range(len(image_folders_list[train_test])):

            patient_counter += 1
            logging.info('============================================')
            logging.info('Doing: %s' % image_folders_list[train_test][sub_num])

            # ==============================
            # First, get ed_es_diff for this subject
            # ==============================
            for item in os.listdir(image_folders_list[train_test][sub_num][:-8]):
                if 'list.txt' in item:
                    text_file = open(image_folders_list[train_test][sub_num][:-8] + item, "r")
                    slice_ids_with_annotations = []
                    for l in text_file.readlines():
                        slice_ids_with_annotations.append(int(float(l[-25:-21])))
                    text_file.close()
                    for slice_id in np.unique(slice_ids_with_annotations):
                        if slice_id % 20 != 0:
                            ed_es_diff = int(slice_id % 20)

            # ==============================
            # read image and label
            # ==============================
            image_ED, image_ES, px, py, pz = read_image(image_folders_list[train_test][sub_num], ed_es_diff, nifti_available=False)
            label_ED, label_ES = read_label(label_folders_list[train_test][sub_num], image_ED.shape, ed_es_diff, nifti_available=False)

            img_ED = image_ED.copy()
            img_ES = image_ES.copy()
            lab_ED = label_ED.copy()
            lab_ES = label_ES.copy()

            # ============
            # normalize the images to be between 0 and 1
            # ============
            img_ED = utils.normalise_image(img_ED, norm_type='div_by_max')
            img_ES = utils.normalise_image(img_ES, norm_type='div_by_max')

            pixel_size = (px, py, pz)
            scale_vector = [pixel_size[0] / target_resolution[0],
                            pixel_size[1] / target_resolution[1]]

            # ============
            # rescale and write to hdf5 the ED image and label
            # ============
            for zz in range(img_ED.shape[2]):

                slice_rescaled_ED = transform.rescale(np.squeeze(img_ED[:, :, zz]), scale_vector, order=1, preserve_range=True, multichannel=False, mode='constant')
                label_rescaled_ED = transform.rescale(np.squeeze(lab_ED[:, :, zz]), scale_vector, order=0, preserve_range=True, multichannel=False, mode='constant')

                slice_cropped_ED = utils.crop_or_pad_slice_to_size(slice_rescaled_ED, nx, ny)
                label_cropped_ED = utils.crop_or_pad_slice_to_size(label_rescaled_ED, nx, ny)

                # ============
                # rotation by 90 degrees
                # ============
                slice_cropped_ED = np.rot90(slice_cropped_ED, k=-1)
                label_cropped_ED = np.rot90(label_cropped_ED, k=-1)

                image_list[train_test].append(slice_cropped_ED)
                label_list[train_test].append(label_cropped_ED)
                write_buffer += 1

                # Writing needs to happen inside the loop over the slices
                if write_buffer >= MAX_WRITE_BUFFER:
                    counter_to = counter_from + write_buffer
                    _write_range_to_hdf5(data, train_test, image_list, label_list, counter_from, counter_to)
                    _release_tmp_memory(image_list, label_list, train_test)
                    # reset stuff for next iteration
                    counter_from = counter_to
                    write_buffer = 0

            # ============
            # rescale and write to hdf5 the ES image and label
            # ============
            for zz in range(img_ES.shape[2]):

                slice_rescaled_ES = transform.rescale(np.squeeze(img_ES[:, :, zz]), scale_vector, order=1, preserve_range=True, multichannel=False, mode='constant')
                label_rescaled_ES = transform.rescale(np.squeeze(lab_ES[:, :, zz]), scale_vector, order=0, preserve_range=True, multichannel=False, mode='constant')

                slice_cropped_ES = utils.crop_or_pad_slice_to_size(slice_rescaled_ES, nx, ny)
                label_cropped_ES = utils.crop_or_pad_slice_to_size(label_rescaled_ES, nx, ny)

                # ============
                # rotation by 90 degrees
                # ============
                slice_cropped_ES = np.rot90(slice_cropped_ES, k=-1)
                label_cropped_ES = np.rot90(label_cropped_ES, k=-1)

                image_list[train_test].append(slice_cropped_ES)
                label_list[train_test].append(label_cropped_ES)
                write_buffer += 1

                # Writing needs to happen inside the loop over the slices
                if write_buffer >= MAX_WRITE_BUFFER:
                    counter_to = counter_from + write_buffer
                    _write_range_to_hdf5(data, train_test, image_list, label_list, counter_from, counter_to)
                    _release_tmp_memory(image_list, label_list, train_test)
                    # reset stuff for next iteration
                    counter_from = counter_to
                    write_buffer = 0

        # ============
        # write remaining data
        # ============
        logging.info('Writing remaining data')
        counter_to = counter_from + write_buffer
        _write_range_to_hdf5(data, train_test, image_list, label_list, counter_from, counter_to)
        _release_tmp_memory(image_list, label_list, train_test)

    # After the train/test/validation loop: close the file
    hdf5_file.close()
def prepare_data(input_folder, output_filepath, idx_start, idx_end, size, target_resolution, labeller):

    # ===============================
    # create a hdf5 file
    # ===============================
    hdf5_file = h5py.File(output_filepath, "w")

    # ===============================
    # read all the patient folders from the base input folder
    # ===============================
    folder_list = []
    for folder in os.listdir(input_folder):
        folder_path = os.path.join(input_folder, folder)
        if os.path.isdir(folder_path) and 't2_tse_tra.nii.gz' in os.listdir(folder_path):
            if 'segmentation_' + labeller + '.nii.gz' in os.listdir(folder_path) \
                    or 'segmentation_tra_' + labeller + '.nii.gz' in os.listdir(folder_path):
                folder_list.append(folder_path)

    # ===============================
    # Create datasets for images and labels
    # ===============================
    data = {}
    num_slices = count_slices(folder_list, idx_start, idx_end)
    data['images'] = hdf5_file.create_dataset("images", [num_slices] + list(size), dtype=np.float32)
    data['labels'] = hdf5_file.create_dataset("labels", [num_slices] + list(size), dtype=np.float32)

    # ===============================
    # initialize lists
    # ===============================
    label_list = []
    image_list = []
    nx_list = []
    ny_list = []
    nz_list = []
    px_list = []
    py_list = []
    pz_list = []
    pat_names_list = []

    # ===============================
    # initialize counters
    # ===============================
    write_buffer = 0
    counter_from = 0
    patient_counter = 0

    # ===============================
    # iterate through the requested indices
    # ===============================
    for idx in range(idx_start, idx_end):

        patient_counter = patient_counter + 1

        # ==================
        # read the image file
        # ==================
        image, _, image_hdr = utils.load_nii(folder_list[idx] + '/t2_tse_tra_n4.nii.gz')

        # ============
        # normalize the image to be between 0 and 1
        # ============
        image_normalized = utils.normalise_image(image, norm_type='div_by_max')

        # ==================
        # collect some header info
        # ==================
        px_list.append(float(image_hdr.get_zooms()[0]))
        py_list.append(float(image_hdr.get_zooms()[1]))
        pz_list.append(float(image_hdr.get_zooms()[2]))
        nx_list.append(image.shape[0])
        ny_list.append(image.shape[1])
        nz_list.append(image.shape[2])
        pat_names_list.append(folder_list[idx][folder_list[idx].rfind('/') + 1:])

        # ==================
        # read the label file
        # ==================
        if 'segmentation_' + labeller + '.nii.gz' in os.listdir(folder_list[idx]):
            label, _, _ = utils.load_nii(folder_list[idx] + '/segmentation_' + labeller + '.nii.gz')
        elif 'segmentation_tra_' + labeller + '.nii.gz' in os.listdir(folder_list[idx]):
            label, _, _ = utils.load_nii(folder_list[idx] + '/segmentation_tra_' + labeller + '.nii.gz')

        # ==================
        # remove extra label from some images
        # ==================
        label[label > 2] = 0
        print(np.unique(label))

        # ======================================================
        # PROCESSING LOOP FOR SLICE-BY-SLICE 2D DATA
        # ======================================================
        scale_vector = [image_hdr.get_zooms()[0] / target_resolution[0],
                        image_hdr.get_zooms()[1] / target_resolution[1]]

        for zz in range(image.shape[2]):

            # ============
            # rescale the images and labels so that their resolution matches that of the nci dataset
            # ============
            image2d_rescaled = rescale(np.squeeze(image_normalized[:, :, zz]), scale_vector, order=1, preserve_range=True, multichannel=False, mode='constant')
            label2d_rescaled = rescale(np.squeeze(label[:, :, zz]), scale_vector, order=0, preserve_range=True, multichannel=False, mode='constant')

            # ============
            # rotate the images and labels so that their orientation matches that of the nci dataset
            # ============
            image2d_rescaled_rotated = np.rot90(image2d_rescaled, k=3)
            label2d_rescaled_rotated = np.rot90(label2d_rescaled, k=3)

            # ============
            # crop or pad to make all slices the same size
            # ============
            image2d_rescaled_rotated_cropped = crop_or_pad_slice_to_size(image2d_rescaled_rotated, size[0], size[1])
            label2d_rescaled_rotated_cropped = crop_or_pad_slice_to_size(label2d_rescaled_rotated, size[0], size[1])

            image_list.append(image2d_rescaled_rotated_cropped)
            label_list.append(label2d_rescaled_rotated_cropped)
            write_buffer += 1

            # Writing needs to happen inside the loop over the slices
            if write_buffer >= MAX_WRITE_BUFFER:
                counter_to = counter_from + write_buffer
                _write_range_to_hdf5(data, image_list, label_list, counter_from, counter_to)
                _release_tmp_memory(image_list, label_list)
                # update counters
                counter_from = counter_to
                write_buffer = 0

    logging.info('Writing remaining data')
    counter_to = counter_from + write_buffer
    _write_range_to_hdf5(data, image_list, label_list, counter_from, counter_to)
    _release_tmp_memory(image_list, label_list)

    # Write the small datasets
    hdf5_file.create_dataset('nx', data=np.asarray(nx_list, dtype=np.uint16))
    hdf5_file.create_dataset('ny', data=np.asarray(ny_list, dtype=np.uint16))
    hdf5_file.create_dataset('nz', data=np.asarray(nz_list, dtype=np.uint16))
    hdf5_file.create_dataset('px', data=np.asarray(px_list, dtype=np.float32))
    hdf5_file.create_dataset('py', data=np.asarray(py_list, dtype=np.float32))
    hdf5_file.create_dataset('pz', data=np.asarray(pz_list, dtype=np.float32))
    hdf5_file.create_dataset('patnames', data=np.asarray(pat_names_list, dtype="S10"))

    # After the write loop: close the file
    hdf5_file.close()
def prepare_data(input_folder, output_file, size, target_resolution, labels_list, rescale_to_one, offset=None, image_postfix='.nii.gz'): ''' Main function that prepares a dataset from the raw challenge data to an hdf5 dataset ''' csv_summary_file = os.path.join(input_folder, 'summary_alldata.csv') summary = pd.read_csv(csv_summary_file) # Use only cases that have imaging data (obs) summary = summary.loc[summary['image_exists'] == True] # Don't use images with unknown diagnosis summary = summary.loc[~(summary['diagnosis_3cat'] == 'unknown')] # Get list of unique rids rids = summary.rid.unique() # Get initial diagnosis for rough stratification diagnoses = [] for rid in rids: diagnoses.append( summary.loc[summary['rid'] == rid]['diagnosis_3cat'].values[0]) train_and_val_rids, test_rids, train_and_val_diagnoses, _ = train_test_split( rids, diagnoses, test_size=0.2, stratify=diagnoses) train_rids, val_rids = train_test_split( train_and_val_rids, test_size=0.2, stratify=train_and_val_diagnoses) print(len(train_rids), len(test_rids), len(val_rids)) # n_images_train = len(summary.loc[summary['rid'].isin(train_rids)]) # n_images_test = len(summary.loc[summary['rid'].isin(test_rids)]) # n_images_val = len(summary.loc[summary['rid'].isin(val_rids)]) hdf5_file = h5py.File(output_file, "w") diag_list = {'test': [], 'train': [], 'val': []} weight_list = {'test': [], 'train': [], 'val': []} age_list = {'test': [], 'train': [], 'val': []} gender_list = {'test': [], 'train': [], 'val': []} rid_list = {'test': [], 'train': [], 'val': []} viscode_list = {'test': [], 'train': [], 'val': []} adas13_list = {'test': [], 'train': [], 'val': []} mmse_list = {'test': [], 'train': [], 'val': []} field_strength_list = {'test': [], 'train': [], 'val': []} file_list = {'test': [], 'train': [], 'val': []} logging.info('Counting files and parsing meta data...') for train_test, set_rids in zip(['train', 'test', 'val'], [train_rids, test_rids, val_rids]): for ii, row in summary.iterrows(): rid = row['rid'] if rid not in set_rids: continue diagnosis_str = row['diagnosis_3cat'] diagnosis = diagnosis_dict[diagnosis_str] if diagnosis not in labels_list: continue rid_list[train_test].append(rid) diag_list[train_test].append(diagnosis) viscode = row['viscode'] viscode_list[train_test].append(viscode_dict[viscode]) weight_list[train_test].append(row['weight']) age_list[train_test].append(row['age']) gender_list[train_test].append(gender_dict[row['gender']]) adas13_list[train_test].append(fix_nan_and_unknown( row['adas13'], target_data_format=np.float32)) mmse_list[train_test].append(fix_nan_and_unknown( row['mmse'], target_data_format=np.uint8)) field_strength = row['field_strength'] field_strength_list[train_test].append(field_strength) phase = row['phase'] file_name = 'rid_%s/%s_%sT_%s_rid%s_%s%s' % (str(rid).zfill(4), phase.lower(), str(field_strength), diagnosis_str, str(rid).zfill(4), viscode, image_postfix) file_list[train_test].append(os.path.join(input_folder, file_name)) # Write the small datasets for tt in ['test', 'train', 'val']: hdf5_file.create_dataset( 'rid_%s' % tt, data=np.asarray(rid_list[tt], dtype=np.uint16)) hdf5_file.create_dataset('viscode_%s' % tt, data=np.asarray( viscode_list[tt], dtype=np.uint8)) hdf5_file.create_dataset( 'diagnosis_%s' % tt, data=np.asarray(diag_list[tt], dtype=np.uint8)) hdf5_file.create_dataset('age_%s' % tt, data=np.asarray( age_list[tt], dtype=np.float32)) hdf5_file.create_dataset('weight_%s' % tt, data=np.asarray( weight_list[tt], dtype=np.float32)) 
hdf5_file.create_dataset('gender_%s' % tt, data=np.asarray( gender_list[tt], dtype=np.uint8)) hdf5_file.create_dataset('adas13_%s' % tt, data=np.asarray( adas13_list[tt], dtype=np.float32)) hdf5_file.create_dataset( 'mmse_%s' % tt, data=np.asarray(mmse_list[tt], dtype=np.uint8)) hdf5_file.create_dataset('field_strength_%s' % tt, data=np.asarray( field_strength_list[tt], dtype=np.float16)) n_train = len(file_list['train']) n_test = len(file_list['test']) n_val = len(file_list['val']) # assert n_train == n_images_train, 'Mismatch in data sizes, %d not == %d' % (n_train, n_images_train) # assert n_test == n_images_test, 'Mismatch in data sizes, %d not == %d' % (n_test, n_images_test) # assert n_val == n_images_val, 'Mismatch in data sizes, %d not == %d' % (n_val, n_images_val) # Create datasets for images and masks data = {} for tt, num_points in zip(['test', 'train', 'val'], [n_test, n_train, n_val]): data['images_%s' % tt] = hdf5_file.create_dataset( "images_%s" % tt, [num_points] + list(size), dtype=np.float32) img_list = {'test': [], 'train': [], 'val': []} logging.info('Parsing image files') for train_test in ['test', 'train', 'val']: write_buffer = 0 counter_from = 0 for file in file_list[train_test]: logging.info( '-----------------------------------------------------------') logging.info('Doing: %s' % file) img_dat = utils.load_nii(file) img = img_dat[0].copy() pixel_size = (img_dat[2].structarr['pixdim'][1], img_dat[2].structarr['pixdim'][2], img_dat[2].structarr['pixdim'][3]) logging.info('Pixel size:') logging.info(pixel_size) scale_vector = [pixel_size[0] / target_resolution[0], pixel_size[1] / target_resolution[1], pixel_size[2] / target_resolution[2]] img_scaled = transform.rescale(img, scale_vector, order=1, preserve_range=True, multichannel=False, mode='constant') img_resized = crop_or_pad_slice_to_size( img_scaled, size, offset=offset) if rescale_to_one: img_resized = utils.map_image_to_intensity_range( img_resized, -1, 1, percentiles=5) else: img_resized = utils.normalise_image(img_resized) ### DEBUGGING ############################################ # utils.create_and_save_nii(img_resized, 'debug.nii.gz') # exit() ######################################################### img_list[train_test].append(img_resized) write_buffer += 1 if write_buffer >= MAX_WRITE_BUFFER: counter_to = counter_from + write_buffer _write_range_to_hdf5( data, train_test, img_list, counter_from, counter_to) _release_tmp_memory(img_list, train_test) # reset stuff for next iteration counter_from = counter_to write_buffer = 0 # after file loop: Write the remaining data logging.info('Writing remaining data') counter_to = counter_from + write_buffer _write_range_to_hdf5(data, train_test, img_list, counter_from, counter_to) _release_tmp_memory(img_list, train_test) # After test train loop: hdf5_file.close()
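# ===================================================================
# NOTE: fix_nan_and_unknown (used above for adas13/mmse) is not shown in
# this file. A hypothetical sketch of what it likely does: map missing or
# 'unknown' meta-data entries to a sentinel so they can be stored in the
# fixed-dtype hdf5 arrays. The sentinel value (-1) is an assumption.
# ===================================================================
import numpy as np

def fix_nan_and_unknown(value, target_data_format=np.float32, nan_value=-1):
    if isinstance(value, str) and value.strip().lower() == 'unknown':
        return target_data_format(nan_value)  # assumed sentinel for missing entries
    if isinstance(value, float) and np.isnan(value):
        return target_data_format(nan_value)
    return target_data_format(value)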
def prepare_data(input_folder, output_file, mode, size, target_resolution): ''' Main function that prepares a dataset from the raw challenge data to an hdf5 dataset ''' assert (mode in ['2D', '3D']), 'Unknown mode: %s' % mode if mode == '2D' and not len(size) == 2: raise AssertionError('Inadequate number of size parameters') if mode == '3D' and not len(size) == 3: raise AssertionError('Inadequate number of size parameters') if mode == '2D' and not len(target_resolution) == 2: raise AssertionError( 'Inadequate number of target resolution parameters') if mode == '3D' and not len(target_resolution) == 3: raise AssertionError( 'Inadequate number of target resolution parameters') hdf5_file = h5py.File(output_file, "w") diag_list = {'test': [], 'train': [], 'validation': []} height_list = {'test': [], 'train': [], 'validation': []} weight_list = {'test': [], 'train': [], 'validation': []} patient_id_list = {'test': [], 'train': [], 'validation': []} cardiac_phase_list = {'test': [], 'train': [], 'validation': []} file_list = {'test': [], 'train': [], 'validation': []} num_slices = {'test': 0, 'train': 0, 'validation': 0} logging.info('Counting files and parsing meta data...') for folder in os.listdir(input_folder): folder_path = os.path.join(input_folder, folder) if os.path.isdir(folder_path): if int(folder[-3:]) % 5 == 0: train_test = 'test' elif int(folder[-3:]) % 4 == 0: train_test = 'validation' else: train_test = 'train' infos = {} for line in open(os.path.join(folder_path, 'Info.cfg')): label, value = line.split(':') infos[label] = value.rstrip('\n').lstrip(' ') patient_id = folder.lstrip('patient') for file in glob.glob( os.path.join(folder_path, 'patient???_frame??.nii.gz')): file_list[train_test].append(file) # diag_list[train_test].append(diagnosis_to_int(infos['Group'])) diag_list[train_test].append(diagnosis_dict[infos['Group']]) weight_list[train_test].append(infos['Weight']) height_list[train_test].append(infos['Height']) patient_id_list[train_test].append(patient_id) systole_frame = int(infos['ES']) diastole_frame = int(infos['ED']) file_base = file.split('.')[0] frame = int(file_base.split('frame')[-1]) if frame == systole_frame: cardiac_phase_list[train_test].append(1) # 1 == systole elif frame == diastole_frame: cardiac_phase_list[train_test].append(2) # 2 == diastole else: cardiac_phase_list[train_test].append( 0) # 0 means other phase nifty_img = nib.load(file) num_slices[train_test] += nifty_img.shape[2] # Write the small datasets for tt in ['test', 'train', 'validation']: hdf5_file.create_dataset('diagnosis_%s' % tt, data=np.asarray(diag_list[tt], dtype=np.uint8)) hdf5_file.create_dataset('weight_%s' % tt, data=np.asarray(weight_list[tt], dtype=np.float32)) hdf5_file.create_dataset('height_%s' % tt, data=np.asarray(height_list[tt], dtype=np.float32)) hdf5_file.create_dataset('patient_id_%s' % tt, data=np.asarray(patient_id_list[tt], dtype=np.uint8)) hdf5_file.create_dataset('cardiac_phase_%s' % tt, data=np.asarray(cardiac_phase_list[tt], dtype=np.uint8)) if mode == '3D': nx, ny, nz_max = size n_train = len(file_list['train']) n_test = len(file_list['test']) n_val = len(file_list['validation']) elif mode == '2D': nx, ny = size n_test = num_slices['test'] n_train = num_slices['train'] n_val = num_slices['validation'] else: raise AssertionError('Wrong mode setting. 
This should never happen.') # print('Debug: Check if sets add up to correct value:') # print(n_train, n_val, n_test, n_train + n_val + n_test) # Create datasets for images and masks data = {} for tt, num_points in zip(['test', 'train', 'validation'], [n_test, n_train, n_val]): if num_points > 0: data['images_%s' % tt] = hdf5_file.create_dataset( "images_%s" % tt, [num_points] + list(size), dtype=np.float32) data['masks_%s' % tt] = hdf5_file.create_dataset( "masks_%s" % tt, [num_points] + list(size), dtype=np.uint8) mask_list = {'test': [], 'train': [], 'validation': []} img_list = {'test': [], 'train': [], 'validation': []} logging.info('Parsing image files') for train_test in ['test', 'train', 'validation']: write_buffer = 0 counter_from = 0 full_mask_list = [] patient_counter = 0 for file in file_list[train_test]: patient_counter += 1 logging.info( '-----------------------------------------------------------') logging.info('Doing: %s' % file) file_base = file.split('.nii.gz')[0] file_mask = file_base + '_gt.nii.gz' # patient_id = int(file_base.split('/')[-1].lstrip('patient').split('_')[0]) img_dat = utils.load_nii(file) mask_dat = utils.load_nii(file_mask) img = img_dat[0].copy() mask = mask_dat[0].copy() img = utils.normalise_image(img) pixel_size = (img_dat[2].structarr['pixdim'][1], img_dat[2].structarr['pixdim'][2], img_dat[2].structarr['pixdim'][3]) logging.info('Pixel size:') logging.info(pixel_size) ### PROCESSING LOOP FOR 3D DATA ################################ if mode == '3D': scale_vector = [ pixel_size[0] / target_resolution[0], pixel_size[1] / target_resolution[1], pixel_size[2] / target_resolution[2] ] img_scaled = transform.rescale(img, scale_vector, order=1, preserve_range=True, multichannel=False, mode='constant') mask_scaled = transform.rescale(mask, scale_vector, order=0, preserve_range=True, multichannel=False, mode='constant') slice_vol = np.zeros((nx, ny, nz_max), dtype=np.float32) mask_vol = np.zeros((nx, ny, nz_max), dtype=np.uint8) nz_curr = img_scaled.shape[2] stack_from = (nz_max - nz_curr) // 2 if stack_from < 0: raise AssertionError( 'nz_max is too small for the chosen through plane resolution. Consider changing' 'the size or the target resolution in the through-plane.' 
                    )

                for zz in range(nz_curr):
                    slice_rescaled = img_scaled[:, :, zz]
                    mask_rescaled = mask_scaled[:, :, zz]
                    slice_cropped = crop_or_pad_slice_to_size(slice_rescaled, nx, ny)
                    mask_cropped = crop_or_pad_slice_to_size(mask_rescaled, nx, ny)
                    slice_vol[:, :, stack_from] = slice_cropped
                    mask_vol[:, :, stack_from] = mask_cropped
                    stack_from += 1

                img_list[train_test].append(slice_vol)
                mask_list[train_test].append(mask_vol)
                write_buffer += 1

                if write_buffer >= MAX_WRITE_BUFFER:
                    counter_to = counter_from + write_buffer
                    _write_range_to_hdf5(data, train_test, img_list, mask_list,
                                         counter_from, counter_to)
                    _release_tmp_memory(img_list, mask_list, train_test)

                    # reset stuff for next iteration
                    counter_from = counter_to
                    write_buffer = 0

            ### PROCESSING LOOP FOR SLICE-BY-SLICE 2D DATA ###################
            elif mode == '2D':

                scale_vector = [pixel_size[0] / target_resolution[0],
                                pixel_size[1] / target_resolution[1]]

                for zz in range(img.shape[2]):

                    slice_img = np.squeeze(img[:, :, zz])
                    slice_rescaled = transform.rescale(slice_img,
                                                       scale_vector,
                                                       order=1,
                                                       preserve_range=True,
                                                       multichannel=False,
                                                       mode='constant')

                    slice_mask = np.squeeze(mask[:, :, zz])
                    mask_rescaled = transform.rescale(slice_mask,
                                                      scale_vector,
                                                      order=0,
                                                      preserve_range=True,
                                                      multichannel=False,
                                                      mode='constant')

                    slice_cropped = crop_or_pad_slice_to_size(slice_rescaled, nx, ny)
                    mask_cropped = crop_or_pad_slice_to_size(mask_rescaled, nx, ny)

                    img_list[train_test].append(slice_cropped)
                    mask_list[train_test].append(mask_cropped)

                    write_buffer += 1

                    # Writing needs to happen inside the loop over the slices
                    if write_buffer >= MAX_WRITE_BUFFER:
                        counter_to = counter_from + write_buffer
                        _write_range_to_hdf5(data, train_test, img_list, mask_list,
                                             counter_from, counter_to)
                        _release_tmp_memory(img_list, mask_list, train_test)

                        # reset stuff for next iteration
                        counter_from = counter_to
                        write_buffer = 0

        # after file loop: Write the remaining data
        logging.info('Writing remaining data')
        counter_to = counter_from + write_buffer
        _write_range_to_hdf5(data, train_test, img_list, mask_list, counter_from,
                             counter_to)
        _release_tmp_memory(img_list, mask_list, train_test)

    # After test train loop:
    hdf5_file.close()
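# ===================================================================
# NOTE: crop_or_pad_slice_to_size is used by several prepare_data variants
# but defined elsewhere. A minimal sketch of the assumed 2D behaviour,
# reconstructed from the centre-crop / zero-pad arithmetic that appears
# explicitly in the evaluation code below (the ADNI variant above calls a
# different, 3D flavour with an offset argument):
# ===================================================================
import numpy as np

def crop_or_pad_slice_to_size(slice_2d, nx, ny):
    x, y = slice_2d.shape
    x_s = (x - nx) // 2
    y_s = (y - ny) // 2
    x_c = (nx - x) // 2
    y_c = (ny - y) // 2

    if x > nx and y > ny:  # crop in both dimensions
        return slice_2d[x_s:x_s + nx, y_s:y_s + ny]

    out = np.zeros((nx, ny), dtype=slice_2d.dtype)
    if x <= nx and y > ny:  # pad x, crop y
        out[x_c:x_c + x, :] = slice_2d[:, y_s:y_s + ny]
    elif x > nx and y <= ny:  # crop x, pad y
        out[:, y_c:y_c + y] = slice_2d[x_s:x_s + nx, :]
    else:  # pad both
        out[x_c:x_c + x, y_c:y_c + y] = slice_2d
    return out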
def main(input_folder, output_folder, model_path, exp_config, do_postprocessing=False, gt_exists=True): # Get Data data_loader = data_switch(exp_config.data_identifier) data = data_loader(exp_config) # Make and restore vagan model segmenter_model = segmenter( exp_config=exp_config, data=data, fixed_batch_size=1) # CRF model requires fixed batch size segmenter_model.load_weights(model_path, type='best_dice') total_time = 0 total_volumes = 0 dice_list = [] assd_list = [] hd_list = [] for folder in os.listdir(input_folder): folder_path = os.path.join(input_folder, folder) if os.path.isdir(folder_path): infos = {} for line in open(os.path.join(folder_path, 'Info.cfg')): label, value = line.split(':') infos[label] = value.rstrip('\n').lstrip(' ') patient_id = folder.lstrip('patient') if not int(patient_id) % 5 == 0: continue ED_frame = int(infos['ED']) ES_frame = int(infos['ES']) for file in glob.glob( os.path.join(folder_path, 'patient???_frame??.nii.gz')): logging.info(' ----- Doing image: -------------------------') logging.info('Doing: %s' % file) logging.info(' --------------------------------------------') file_base = file.split('.nii.gz')[0] frame = int(file_base.split('frame')[-1]) img, img_affine, img_header = utils.load_nii(file) img = utils.normalise_image(img) zooms = img_header.get_zooms() if gt_exists: file_mask = file_base + '_gt.nii.gz' mask, mask_affine, mask_header = utils.load_nii(file_mask) start_time = time.time() if exp_config.dimensionality_mode == '2D': pixel_size = (img_header.structarr['pixdim'][1], img_header.structarr['pixdim'][2]) scale_vector = (pixel_size[0] / exp_config.target_resolution[0], pixel_size[1] / exp_config.target_resolution[1]) predictions = [] nx, ny = exp_config.image_size for zz in range(img.shape[2]): slice_img = np.squeeze(img[:, :, zz]) slice_rescaled = transform.rescale(slice_img, scale_vector, order=1, preserve_range=True, multichannel=False, mode='constant') x, y = slice_rescaled.shape x_s = (x - nx) // 2 y_s = (y - ny) // 2 x_c = (nx - x) // 2 y_c = (ny - y) // 2 # Crop section of image for prediction if x > nx and y > ny: slice_cropped = slice_rescaled[x_s:x_s + nx, y_s:y_s + ny] else: slice_cropped = np.zeros((nx, ny)) if x <= nx and y > ny: slice_cropped[x_c:x_c + x, :] = slice_rescaled[:, y_s:y_s + ny] elif x > nx and y <= ny: slice_cropped[:, y_c:y_c + y] = slice_rescaled[x_s:x_s + nx, :] else: slice_cropped[x_c:x_c + x, y_c:y_c + y] = slice_rescaled[:, :] # GET PREDICTION network_input = np.float32( np.tile(np.reshape(slice_cropped, (nx, ny, 1)), (1, 1, 1, 1))) mask_out, softmax = segmenter_model.predict( network_input) prediction_cropped = np.squeeze(softmax[0, ...]) # ASSEMBLE BACK THE SLICES slice_predictions = np.zeros( (x, y, exp_config.nlabels)) # insert cropped region into original image again if x > nx and y > ny: slice_predictions[x_s:x_s + nx, y_s:y_s + ny, :] = prediction_cropped else: if x <= nx and y > ny: slice_predictions[:, y_s:y_s + ny, :] = prediction_cropped[ x_c:x_c + x, :, :] elif x > nx and y <= ny: slice_predictions[ x_s:x_s + nx, :, :] = prediction_cropped[:, y_c:y_c + y, :] else: slice_predictions[:, :, :] = prediction_cropped[ x_c:x_c + x, y_c:y_c + y, :] # RESCALING ON THE LOGITS if gt_exists: prediction = transform.resize( slice_predictions, (mask.shape[0], mask.shape[1], exp_config.nlabels), order=1, preserve_range=True, mode='constant') else: # This can occasionally lead to wrong volume size, therefore if gt_exists # we use the gt mask size for resizing. 
prediction = transform.rescale( slice_predictions, (1.0 / scale_vector[0], 1.0 / scale_vector[1], 1), order=1, preserve_range=True, multichannel=False, mode='constant') prediction = np.uint8(np.argmax(prediction, axis=-1)) # import matplotlib.pyplot as plt # fig = plt.Figure() # for ii in range(3): # plt.subplot(1, 3, ii + 1) # plt.imshow(np.squeeze(prediction)) # plt.show() predictions.append(prediction) prediction_arr = np.transpose( np.asarray(predictions, dtype=np.uint8), (1, 2, 0)) elif exp_config.dimensionality_mode == '3D': nx, ny, nz = exp_config.image_size pixel_size = (img_header.structarr['pixdim'][1], img_header.structarr['pixdim'][2], img_header.structarr['pixdim'][3]) scale_vector = (pixel_size[0] / exp_config.target_resolution[0], pixel_size[1] / exp_config.target_resolution[1], pixel_size[2] / exp_config.target_resolution[2]) vol_scaled = transform.rescale(img, scale_vector, order=1, preserve_range=True, multichannel=False, mode='constant') nz_max = exp_config.image_size[2] slice_vol = np.zeros((nx, ny, nz_max), dtype=np.float32) nz_curr = vol_scaled.shape[2] stack_from = (nz_max - nz_curr) // 2 stack_counter = stack_from x, y, z = vol_scaled.shape x_s = (x - nx) // 2 y_s = (y - ny) // 2 x_c = (nx - x) // 2 y_c = (ny - y) // 2 for zz in range(nz_curr): slice_rescaled = vol_scaled[:, :, zz] if x > nx and y > ny: slice_cropped = slice_rescaled[x_s:x_s + nx, y_s:y_s + ny] else: slice_cropped = np.zeros((nx, ny)) if x <= nx and y > ny: slice_cropped[x_c:x_c + x, :] = slice_rescaled[:, y_s:y_s + ny] elif x > nx and y <= ny: slice_cropped[:, y_c:y_c + y] = slice_rescaled[x_s:x_s + nx, :] else: slice_cropped[x_c:x_c + x, y_c:y_c + y] = slice_rescaled[:, :] slice_vol[:, :, stack_counter] = slice_cropped stack_counter += 1 stack_to = stack_counter network_input = np.float32( np.reshape(slice_vol, (1, nx, ny, nz_max, 1))) start_time = time.time() mask_out, softmax = segmenter_model.predict(network_input) logging.info('Classified 3D: %f secs' % (time.time() - start_time)) prediction_nzs = mask_out[0, :, :, stack_from: stack_to] # non-zero-slices if not prediction_nzs.shape[2] == nz_curr: raise ValueError('sizes mismatch') # ASSEMBLE BACK THE SLICES prediction_scaled = np.zeros( vol_scaled.shape) # last dim is for logits classes # insert cropped region into original image again if x > nx and y > ny: prediction_scaled[x_s:x_s + nx, y_s:y_s + ny, :] = prediction_nzs else: if x <= nx and y > ny: prediction_scaled[:, y_s:y_s + ny, :] = prediction_nzs[x_c:x_c + x, :, :] elif x > nx and y <= ny: prediction_scaled[ x_s:x_s + nx, :, :] = prediction_nzs[:, y_c:y_c + y, :] else: prediction_scaled[:, :, :] = prediction_nzs[ x_c:x_c + x, y_c:y_c + y, :] logging.info('Prediction_scaled mean %f' % (np.mean(prediction_scaled))) prediction = transform.resize( prediction_scaled, (mask.shape[0], mask.shape[1], mask.shape[2], 1), order=1, preserve_range=True, mode='constant') prediction = np.argmax(prediction, axis=-1) prediction_arr = np.asarray(prediction, dtype=np.uint8) # This is the same for 2D and 3D again if do_postprocessing: prediction_arr = utils.keep_largest_connected_components( prediction_arr) elapsed_time = time.time() - start_time total_time += elapsed_time total_volumes += 1 logging.info('Evaluation of volume took %f secs.' % elapsed_time) if frame == ED_frame: frame_suffix = '_ED' elif frame == ES_frame: frame_suffix = '_ES' else: raise ValueError( 'Frame doesnt correspond to ED or ES. 
frame = %d, ED = %d, ES = %d' % (frame, ED_frame, ES_frame)) # Save prediced mask out_file_name = os.path.join( output_folder, 'prediction', 'patient' + patient_id + frame_suffix + '.nii.gz') if gt_exists: out_affine = mask_affine out_header = mask_header else: out_affine = img_affine out_header = img_header logging.info('saving to: %s' % out_file_name) utils.save_nii(out_file_name, prediction_arr, out_affine, out_header) # Save image data to the same folder for convenience image_file_name = os.path.join( output_folder, 'image', 'patient' + patient_id + frame_suffix + '.nii.gz') logging.info('saving to: %s' % image_file_name) utils.save_nii(image_file_name, img, out_affine, out_header) if gt_exists: # Save GT image gt_file_name = os.path.join( output_folder, 'ground_truth', 'patient' + patient_id + frame_suffix + '.nii.gz') logging.info('saving to: %s' % gt_file_name) utils.save_nii(gt_file_name, mask, out_affine, out_header) # Save difference mask between predictions and ground truth difference_mask = np.where( np.abs(prediction_arr - mask) > 0, [1], [0]) difference_mask = np.asarray(difference_mask, dtype=np.uint8) diff_file_name = os.path.join( output_folder, 'difference', 'patient' + patient_id + frame_suffix + '.nii.gz') logging.info('saving to: %s' % diff_file_name) utils.save_nii(diff_file_name, difference_mask, out_affine, out_header) # calculate metrics y_ = prediction_arr y = mask per_lbl_dice = [] per_lbl_assd = [] per_lbl_hd = [] for lbl in [3, 1, 2]: #range(exp_config.nlabels): binary_pred = (y_ == lbl) * 1 binary_gt = (y == lbl) * 1 if np.sum(binary_gt) == 0 and np.sum(binary_pred) == 0: per_lbl_dice.append(1) per_lbl_assd.append(0) per_lbl_hd.append(0) elif np.sum(binary_pred) > 0 and np.sum( binary_gt) == 0 or np.sum( binary_pred) == 0 and np.sum(binary_gt) > 0: logging.warning( 'Structure missing in either GT (x)or prediction. ASSD and HD will not be accurate.' ) per_lbl_dice.append(0) per_lbl_assd.append(1) per_lbl_hd.append(1) else: per_lbl_dice.append(dc(binary_pred, binary_gt)) per_lbl_assd.append( assd(binary_pred, binary_gt, voxelspacing=zooms)) per_lbl_hd.append( hd(binary_pred, binary_gt, voxelspacing=zooms)) dice_list.append(per_lbl_dice) assd_list.append(per_lbl_assd) hd_list.append(per_lbl_hd) logging.info('Average time per volume: %f' % (total_time / total_volumes)) dice_arr = np.asarray(dice_list) assd_arr = np.asarray(assd_list) hd_arr = np.asarray(hd_list) mean_per_lbl_dice = dice_arr.mean(axis=0) mean_per_lbl_assd = assd_arr.mean(axis=0) mean_per_lbl_hd = hd_arr.mean(axis=0) logging.info('Dice') logging.info(mean_per_lbl_dice) logging.info(np.mean(mean_per_lbl_dice)) logging.info('ASSD') logging.info(mean_per_lbl_assd) logging.info(np.mean(mean_per_lbl_assd)) logging.info('HD') logging.info(mean_per_lbl_hd) logging.info(np.mean(mean_per_lbl_hd))
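# ===================================================================
# NOTE: dc, assd and hd above are assumed to be the binary metrics from
# medpy. A minimal usage sketch on synthetic masks (hypothetical data,
# just to illustrate the voxelspacing argument):
# ===================================================================
import numpy as np
from medpy.metric.binary import dc, assd, hd

def _demo_metrics():
    pred = np.zeros((16, 16, 16), dtype=np.uint8)
    gt = np.zeros((16, 16, 16), dtype=np.uint8)
    pred[4:10, 4:10, 4:10] = 1
    gt[5:11, 5:11, 5:11] = 1
    spacing = (1.0, 1.0, 1.0)  # mm per voxel; makes ASSD/HD distances metric
    print('dice: %f' % dc(pred, gt))
    print('assd: %f' % assd(pred, gt, voxelspacing=spacing))
    print('hd:   %f' % hd(pred, gt, voxelspacing=spacing))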
def prepare_data(input_folder, preproc_folder, protocol, idx_start, idx_end): images = [] affines = [] patnames = [] masks = [] # ======================== # read the filenames # ======================== filenames = sorted(glob.glob(input_folder + '*.zip')) logging.info('Number of images in the dataset: %s' % str(len(filenames))) # ======================== # iterate through the requested indices # ======================== for idx in range(idx_start, idx_end): logging.info( '============================================================') # ======================== # get the file name for this subject # ======================== filename = filenames[idx] # ======================== # define how much of the image can be cropped out as it consists of zeros # ======================== x_start = 18 x_end = -18 y_start = 28 y_end = -27 z_start = 2 z_end = -34 # original images are 260 * 311 * 260 # cropping them down to 224 * 256 * 224 # ======================== # read the contents inside the top-level subject directory # ======================== with zipfile.ZipFile(filename, 'r') as zfile: # ======================== # search for the relevant files # ======================== for name in zfile.namelist(): # ======================== # search for files inside the T1w directory # ======================== if re.search(r'\/T1w/', name) != None: # ======================== # search for .gz files inside the T1w directory # ======================== if re.search(r'\.gz$', name) != None: # ======================== # get the protocol image # ======================== if re.search(protocol + 'acpc_dc_restore_brain', name) != None: logging.info('reading image: %s' % name) _filepath = zfile.extract( name, sys_config.preproc_folder_hcp ) # extract the image filepath _patname = name[:name.find( '/')] # extract the patient name _img_data, _img_affine, _img_header = utils.load_nii( _filepath) # read the 3d image _img_data = _img_data[ x_start:x_end, y_start:y_end, z_start: z_end] # discard some pixels as they are always zero. _img_data = utils.normalise_image( _img_data, norm_type='div_by_max' ) # normalise the image (volume wise) savepath = sys_config.preproc_folder_hcp + _patname + '/preprocessed_image' + protocol + '.nii' # save the pre-processed image utils.save_nii(savepath, _img_data, _img_affine, _img_header) images.append( _img_data ) # append to the list of all images, affines and patient names affines.append(_img_affine) patnames.append(_patname) # ======================== # get the segmentation mask # ======================== if re.search( 'aparc.aseg', name ) != None: # segmentation mask with ~100 classes if re.search('T1wDividedByT2w_', name) == None: logging.info('reading mask: %s' % name) _segpath = zfile.extract( name, sys_config.preproc_folder_hcp ) # extract the segmentation mask _patname = name[:name.find( '/')] # extract the patient name _seg_data, _seg_affine, _seg_header = utils.load_nii( _segpath) # read the segmentation mask _seg_data = _seg_data[ x_start:x_end, y_start:y_end, z_start: z_end] # discard some pixels as they are always zero. 
                                _seg_data = utils.group_segmentation_classes(_seg_data)  # group the segmentation classes as required

                                # save the pre-processed segmentation ground truth
                                savepath = sys_config.preproc_folder_hcp + _patname + '/preprocessed_gt15.nii'
                                utils.save_nii(savepath, _seg_data, _seg_affine, _seg_header)

                                # append to the list of all masks
                                masks.append(_seg_data)

    # ========================
    # convert the lists to arrays
    # ========================
    images = np.array(images)
    affines = np.array(affines)
    patnames = np.array(patnames)
    masks = np.array(masks, dtype='uint8')

    # ========================
    # merge along the y-axis to get a stack of x-z slices, for the images as well as the masks
    # ========================
    images = images.swapaxes(1, 2)
    images = images.reshape(-1, images.shape[2], images.shape[3])
    masks = masks.swapaxes(1, 2)
    masks = masks.reshape(-1, masks.shape[2], masks.shape[3])

    # ========================
    # save the processed images and masks so that they can be directly read the next time
    # make appropriate filenames according to the requested indices of training, validation and test images
    # ========================
    logging.info('Saving pre-processed files...')
    config_details = '%sfrom%dto%d_' % (protocol, idx_start, idx_end)
    filepath_images = preproc_folder + config_details + 'images_2d.npy'
    filepath_masks = preproc_folder + config_details + 'annotations15_2d.npy'
    filepath_affine = preproc_folder + config_details + 'affines.npy'
    filepath_patnames = preproc_folder + config_details + 'patnames.npy'
    np.save(filepath_images, images)
    np.save(filepath_masks, masks)
    np.save(filepath_affine, affines)
    np.save(filepath_patnames, patnames)

    return images, masks, affines, patnames
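# ===================================================================
# NOTE: a small worked example of the swapaxes/reshape above: for two
# volumes of shape (224, 256, 224), swapping axes 1 and 2 and collapsing
# the first two axes yields one stack of 2 * 256 x-z slices.
# ===================================================================
import numpy as np

vols = np.zeros((2, 224, 256, 224), dtype=np.float32)
slices = vols.swapaxes(1, 2).reshape(-1, vols.shape[1], vols.shape[3])
assert slices.shape == (2 * 256, 224, 224)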
def prepare_data(input_folder, output_file, size, target_resolution, cv_fold_num): # ======================= # ======================= image_folder = os.path.join(input_folder, 'Prostate-3T') mask_folder = os.path.join(input_folder, 'NCI_ISBI_Challenge-Prostate3T_Training_Segmentations') # ======================= # ======================= hdf5_file = h5py.File(output_file, "w") # ======================= # ======================= logging.info('Counting files and parsing meta data...') folder_list = get_patient_folders(image_folder, folder_base='Prostate3T-01', cv_fold_number = cv_fold_num) num_slices = count_slices(image_folder, folder_base='Prostate3T-01', cv_fold_number = cv_fold_num) nx, ny = size n_test = num_slices['test'] n_train = num_slices['train'] n_val = num_slices['validation'] # ======================= # ======================= print('Debug: Check if sets add up to correct value:') print(n_train, n_val, n_test, n_train + n_val + n_test) # ======================= # Create datasets for images and masks # ======================= data = {} for tt, num_points in zip(['test', 'train', 'validation'], [n_test, n_train, n_val]): if num_points > 0: data['images_%s' % tt] = hdf5_file.create_dataset("images_%s" % tt, [num_points] + list(size), dtype=np.float32) data['masks_%s' % tt] = hdf5_file.create_dataset("masks_%s" % tt, [num_points] + list(size), dtype=np.uint8) mask_list = {'test': [], 'train': [], 'validation': []} img_list = {'test': [], 'train': [], 'validation': []} nx_list = {'test': [], 'train': [], 'validation': []} ny_list = {'test': [], 'train': [], 'validation': []} nz_list = {'test': [], 'train': [], 'validation': []} px_list = {'test': [], 'train': [], 'validation': []} py_list = {'test': [], 'train': [], 'validation': []} pz_list = {'test': [], 'train': [], 'validation': []} pat_names_list = {'test': [], 'train': [], 'validation': []} # ======================= # ======================= logging.info('Parsing image files') for train_test in ['test', 'train', 'validation']: write_buffer = 0 counter_from = 0 patient_counter = 0 for folder in folder_list[train_test]: patient_counter += 1 logging.info('================================') logging.info('Doing: %s' % folder) pat_names_list[train_test].append(str(folder.split('-')[-1])) lstFilesDCM = [] # create an empty list for dirName, subdirList, fileList in os.walk(folder): # fileList.sort() for filename in fileList: if ".dcm" in filename.lower(): # check whether the file's DICOM lstFilesDCM.append(os.path.join(dirName, filename)) # Get ref file RefDs = dicom.read_file(lstFilesDCM[0]) # Load dimensions based on the number of rows, columns, and slices (along the Z axis) ConstPixelDims = (int(RefDs.Rows), int(RefDs.Columns), len(lstFilesDCM)) # Load spacing values (in mm) pixel_size = (float(RefDs.PixelSpacing[0]), float(RefDs.PixelSpacing[1]), float(RefDs.SliceThickness)) px_list[train_test].append(float(RefDs.PixelSpacing[0])) py_list[train_test].append(float(RefDs.PixelSpacing[1])) pz_list[train_test].append(float(RefDs.SliceThickness)) print('PixelDims') print(ConstPixelDims) print('PixelSpacing') print(pixel_size) # The array is sized based on 'ConstPixelDims' img = np.zeros(ConstPixelDims, dtype=RefDs.pixel_array.dtype) # loop through all the DICOM files for filenameDCM in lstFilesDCM: # read the file ds = dicom.read_file(filenameDCM) # ====== # store the raw image data # img[:, :, lstFilesDCM.index(filenameDCM)] = ds.pixel_array # index number field is not set correctly! # instead instance number is the slice number. 
# ====== img[:, :, ds.InstanceNumber - 1] = ds.pixel_array # ================================ # save as nifti, this sets the affine transformation as an identity matrix # ================================ nifti_img_path = lstFilesDCM[0][:lstFilesDCM[0].rfind('/')+1] utils.save_nii(img_path = nifti_img_path + 'img.nii.gz', data = img, affine = np.eye(4)) # ================================ # do bias field correction # ================================ input_img = nifti_img_path + 'img.nii.gz' output_img = nifti_img_path + 'img_n4.nii.gz' subprocess.call(["/usr/bmicnas01/data-biwi-01/bmicdatasets/Sharing/N4_th", input_img, output_img]) # ================================ # read bias corrected image # ================================ img = utils.load_nii(img_path = nifti_img_path + 'img_n4.nii.gz')[0] # ================================ # normalize the image # ================================ img = utils.normalise_image(img, norm_type='div_by_max') # ================================ # read the labels # ================================ mask_path = os.path.join(mask_folder, folder.split('/')[-1] + '.nrrd') mask, options = nrrd.read(mask_path) # fix swap axis mask = np.swapaxes(mask, 0, 1) # ================================ # save as nifti, this sets the affine transformation as an identity matrix # ================================ utils.save_nii(img_path = nifti_img_path + 'lbl.nii.gz', data = mask, affine = np.eye(4)) nx_list[train_test].append(mask.shape[0]) ny_list[train_test].append(mask.shape[1]) nz_list[train_test].append(mask.shape[2]) print('mask.shape') print(mask.shape) print('img.shape') print(img.shape) ### PROCESSING LOOP FOR SLICE-BY-SLICE 2D DATA ################### scale_vector = [pixel_size[0] / target_resolution[0], pixel_size[1] / target_resolution[1]] for zz in range(img.shape[2]): slice_img = np.squeeze(img[:, :, zz]) slice_rescaled = transform.rescale(slice_img, scale_vector, order=1, preserve_range=True, multichannel=False, mode = 'constant') slice_mask = np.squeeze(mask[:, :, zz]) mask_rescaled = transform.rescale(slice_mask, scale_vector, order=0, preserve_range=True, multichannel=False, mode='constant') slice_cropped = utils.crop_or_pad_slice_to_size(slice_rescaled, nx, ny) mask_cropped = utils.crop_or_pad_slice_to_size(mask_rescaled, nx, ny) img_list[train_test].append(slice_cropped) mask_list[train_test].append(mask_cropped) write_buffer += 1 # Writing needs to happen inside the loop over the slices if write_buffer >= MAX_WRITE_BUFFER: counter_to = counter_from + write_buffer _write_range_to_hdf5(data, train_test, img_list, mask_list, counter_from, counter_to) _release_tmp_memory(img_list, mask_list, train_test) # reset stuff for next iteration counter_from = counter_to write_buffer = 0 logging.info('Writing remaining data') counter_to = counter_from + write_buffer _write_range_to_hdf5(data, train_test, img_list, mask_list, counter_from, counter_to) _release_tmp_memory(img_list, mask_list, train_test) # Write the small datasets for tt in ['test', 'train', 'validation']: hdf5_file.create_dataset('nx_%s' % tt, data=np.asarray(nx_list[tt], dtype=np.uint16)) hdf5_file.create_dataset('ny_%s' % tt, data=np.asarray(ny_list[tt], dtype=np.uint16)) hdf5_file.create_dataset('nz_%s' % tt, data=np.asarray(nz_list[tt], dtype=np.uint16)) hdf5_file.create_dataset('px_%s' % tt, data=np.asarray(px_list[tt], dtype=np.float32)) hdf5_file.create_dataset('py_%s' % tt, data=np.asarray(py_list[tt], dtype=np.float32)) hdf5_file.create_dataset('pz_%s' % tt, data=np.asarray(pz_list[tt], 
dtype=np.float32)) hdf5_file.create_dataset('patnames_%s' % tt, data=np.asarray(pat_names_list[tt], dtype="S10")) # After test train loop: hdf5_file.close()
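# ===================================================================
# NOTE: the bias-field correction above shells out to a site-specific N4
# binary at a hardcoded path. A rough SimpleITK equivalent is sketched
# below (an assumed substitute, not the pipeline's actual tool):
# ===================================================================
import SimpleITK as sitk

def n4_bias_correct(input_img_path, output_img_path):
    img = sitk.ReadImage(input_img_path, sitk.sitkFloat32)
    mask = sitk.OtsuThreshold(img, 0, 1, 200)  # crude foreground mask
    corrected = sitk.N4BiasFieldCorrection(img, mask)
    sitk.WriteImage(corrected, output_img_path)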
def prepare_data(input_folder, preproc_folder, # bias corrected images will be saved here already output_file, size, target_resolution, cv_fold_num): # ======================= # create the hdf5 file where everything will be written # ======================= hdf5_file = h5py.File(output_file, "w") # ======================= # read all the images and count the number of slices along the append axis (the one with the lowest resolution) # ======================= logging.info('Counting files and parsing meta data...') # using the bias corrected images in the preproc folder for this step num_slices, patient_ids_list = count_slices_and_patient_ids_list(preproc_folder, cv_fold_number = cv_fold_num) # ======================= # set the number of slices according to what has been found from the previous function # ======================= nx, ny = size n_test = num_slices['test'] n_train = num_slices['train'] n_val = num_slices['validation'] # ======================= # Create datasets for images and masks # ======================= data = {} for tt, num_points in zip(['test', 'train', 'validation'], [n_test, n_train, n_val]): if num_points > 0: data['images_%s' % tt] = hdf5_file.create_dataset("images_%s" % tt, [num_points] + list(size), dtype=np.float32) data['masks_%s' % tt] = hdf5_file.create_dataset("masks_%s" % tt, [num_points] + list(size), dtype=np.uint8) mask_list = {'test': [], 'train': [], 'validation': []} img_list = {'test': [], 'train': [], 'validation': []} nx_list = {'test': [], 'train': [], 'validation': []} ny_list = {'test': [], 'train': [], 'validation': []} nz_list = {'test': [], 'train': [], 'validation': []} px_list = {'test': [], 'train': [], 'validation': []} py_list = {'test': [], 'train': [], 'validation': []} pz_list = {'test': [], 'train': [], 'validation': []} pat_names_list = {'test': [], 'train': [], 'validation': []} # ======================= # read data of each subject, preprocess it and write to the hdf5 file # ======================= logging.info('Parsing image files') for train_test in ['test', 'train', 'validation']: write_buffer = 0 counter_from = 0 patient_counter = 0 for patient_id in patient_ids_list[train_test]: filepath_orig_mhd_format = input_folder + 'Case' + patient_id + '.mhd' filepath_orig_nii_format = preproc_folder + 'Case' + patient_id + '.nii.gz' filepath_bias_corrected_nii_format = preproc_folder + 'Case' + patient_id + '_n4.nii.gz' filepath_seg_nii_format = preproc_folder + 'Case' + patient_id + '_segmentation.nii.gz' patient_counter += 1 pat_names_list[train_test].append('case' + patient_id) logging.info('================================') logging.info('Doing: %s' % filepath_orig_mhd_format) # ================================ # read the original mhd image, in order to extract pixel resolution information # ================================ img_mhd = sitk.ReadImage(filepath_orig_mhd_format) pixel_size = img_mhd.GetSpacing() px_list[train_test].append(float(pixel_size[0])) py_list[train_test].append(float(pixel_size[1])) pz_list[train_test].append(float(pixel_size[2])) # ================================ # read bias corrected image # ================================ img = utils.load_nii(filepath_bias_corrected_nii_format)[0] # ================================ # normalize the image # ================================ img = utils.normalise_image(img, norm_type='div_by_max') # ================================ # read the labels # ================================ mask = utils.load_nii(filepath_seg_nii_format)[0] # ================================ # skimage io 
            # with the SimpleITK plugin was used to read the images in the convert_to_nii_and_correct_bias_field function.
            # this led to the arrays being read as z-x-y
            # move the axes appropriately, so that the resolution read above is correct for the corresponding axes.
            # ================================
            img = np.swapaxes(np.swapaxes(img, 0, 1), 1, 2)
            mask = np.swapaxes(np.swapaxes(mask, 0, 1), 1, 2)

            # ================================
            # record the dimensions now
            # ================================
            nx_list[train_test].append(mask.shape[0])
            ny_list[train_test].append(mask.shape[1])
            nz_list[train_test].append(mask.shape[2])

            print('mask.shape')
            print(mask.shape)
            print('img.shape')
            print(img.shape)

            ### PROCESSING LOOP FOR SLICE-BY-SLICE 2D DATA ###################
            scale_vector = [pixel_size[0] / target_resolution[0],
                            pixel_size[1] / target_resolution[1]]

            for zz in range(img.shape[2]):

                slice_img = np.squeeze(img[:, :, zz])
                slice_rescaled = transform.rescale(slice_img,
                                                   scale_vector,
                                                   order=1,
                                                   preserve_range=True,
                                                   multichannel=False,
                                                   mode='constant')

                slice_mask = np.squeeze(mask[:, :, zz])
                mask_rescaled = transform.rescale(slice_mask,
                                                  scale_vector,
                                                  order=0,
                                                  preserve_range=True,
                                                  multichannel=False,
                                                  mode='constant')

                slice_cropped = utils.crop_or_pad_slice_to_size(slice_rescaled, nx, ny)
                mask_cropped = utils.crop_or_pad_slice_to_size(mask_rescaled, nx, ny)

                img_list[train_test].append(slice_cropped)
                mask_list[train_test].append(mask_cropped)

                write_buffer += 1

                # Writing needs to happen inside the loop over the slices
                if write_buffer >= MAX_WRITE_BUFFER:
                    counter_to = counter_from + write_buffer
                    _write_range_to_hdf5(data, train_test, img_list, mask_list,
                                         counter_from, counter_to)
                    _release_tmp_memory(img_list, mask_list, train_test)

                    # reset stuff for next iteration
                    counter_from = counter_to
                    write_buffer = 0

        logging.info('Writing remaining data')
        counter_to = counter_from + write_buffer
        _write_range_to_hdf5(data, train_test, img_list, mask_list, counter_from,
                             counter_to)
        _release_tmp_memory(img_list, mask_list, train_test)

    # Write the small datasets
    for tt in ['test', 'train', 'validation']:
        hdf5_file.create_dataset('nx_%s' % tt, data=np.asarray(nx_list[tt], dtype=np.uint16))
        hdf5_file.create_dataset('ny_%s' % tt, data=np.asarray(ny_list[tt], dtype=np.uint16))
        hdf5_file.create_dataset('nz_%s' % tt, data=np.asarray(nz_list[tt], dtype=np.uint16))
        hdf5_file.create_dataset('px_%s' % tt, data=np.asarray(px_list[tt], dtype=np.float32))
        hdf5_file.create_dataset('py_%s' % tt, data=np.asarray(py_list[tt], dtype=np.float32))
        hdf5_file.create_dataset('pz_%s' % tt, data=np.asarray(pz_list[tt], dtype=np.float32))
        hdf5_file.create_dataset('patnames_%s' % tt, data=np.asarray(pat_names_list[tt], dtype="S10"))

    # After test train loop:
    hdf5_file.close()
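# ===================================================================
# NOTE: a small worked example of the axis reordering above: volumes read
# as z-x-y are brought to x-y-z with two swaps, so that the mhd spacing
# (px, py, pz) lines up with the array axes.
# ===================================================================
import numpy as np

vol_zxy = np.zeros((30, 320, 320))
vol_xyz = np.swapaxes(np.swapaxes(vol_zxy, 0, 1), 1, 2)
assert vol_xyz.shape == (320, 320, 30)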
def prepare_data(input_folder, preproc_folder, idx_start, idx_end, bias_correction): images = [] affines = [] patnames = [] masks = [] # read the foldernames foldernames = sorted(glob.glob(input_folder + '*/')) logging.info('Number of images in the dataset: %s' % str(len(foldernames))) # iterate through all indices for idx in range(len(foldernames)): # only consider images within the indices requested if (idx < idx_start) or (idx >= idx_end): logging.info('skipping subject: %d' % idx) continue # get the file name for this subject foldername = foldernames[idx] # extract the patient name _patname = foldername[foldername[:-1].rfind('/') + 1:-1] if _patname == 'A00033264': # this subject has images of a different size continue # ==================================================== # search for the segmentation file # ==================================================== name = foldername + 'orig_labels_aligned_with_true_image.nii.gz' # segmentation mask with ~100 classes logging.info('==============================================') logging.info('reading segmentation mask: %s' % name) # read the segmentation mask _seg_data, _seg_affine, _seg_header = utils.load_nii(name) # group the segmentation classes as required _seg_data = utils.group_segmentation_classes(_seg_data) # ==================================================== # read the image file # ==================================================== if bias_correction is True: name = foldername + 'MPRAGE_n4.nii' # read the original image else: name = foldername + 'MPRAGE.nii' # read the original image # ==================================================== # bias correction before reading the image file (optional) # ==================================================== # read the image logging.info('reading image: %s' % name) _img_data, _img_affine, _img_header = utils.load_nii(name) # _img_header.get_zooms() = (1.0, 1.0, 1.0) # ============ # create a segmentation mask and use it to get rid of the skull in the image # ============ seg_mask = np.copy(_seg_data) seg_mask[_seg_data > 0] = 1 img_masked = _img_data * seg_mask # normalise the image _img_data = utils.normalise_image(img_masked, norm_type='div_by_max') # ============ # rescale the image and the segmentation mask so that their pixel size in mm matches that of the hcp images # ============ img_rescaled = rescale(image=_img_data, scale=10 / 7, order=1, preserve_range=True, multichannel=False) seg_rescaled = rescale(image=_seg_data, scale=10 / 7, order=0, preserve_range=True, multichannel=False) # ============ # A lot of the periphery is just zeros, so get rid of some of it # ============ # define how much of the image can be cropped out as it consists of zeros x_start = 13 x_end = -14 y_start = 55 y_end = -55 z_start = 55 + 16 + 50 z_end = -55 - 16 + 50 # original images are 176 * 256 * 256 # rescaling them makes them 251 * 366 * 366 # cropping them down to 224 * 256 * 224 img_rescaled = img_rescaled[x_start:x_end, y_start:y_end, z_start:z_end] seg_rescaled = seg_rescaled[x_start:x_end, y_start:y_end, z_start:z_end] # save the pre-processed segmentation ground truth utils.makefolder(preproc_folder + _patname) utils.save_nii(preproc_folder + _patname + '/preprocessed_gt15.nii', seg_rescaled, _seg_affine) if bias_correction is True: utils.save_nii( preproc_folder + _patname + '/preprocessed_image_n4.nii', img_rescaled, _img_affine) else: utils.save_nii( preproc_folder + _patname + '/preprocessed_image.nii', img_rescaled, _img_affine) # append to lists images.append(img_rescaled) 
        affines.append(_img_affine)
        patnames.append(_patname)
        masks.append(seg_rescaled)

    # convert the lists to arrays
    images = np.array(images)
    affines = np.array(affines)
    patnames = np.array(patnames)
    masks = np.array(masks, dtype='uint8')

    # ========================
    # merge along the y-axis to get a stack of x-z slices, for the images as well as the masks
    # ========================
    images = images.swapaxes(1, 2)
    images = images.reshape(-1, images.shape[2], images.shape[3])
    masks = masks.swapaxes(1, 2)
    masks = masks.reshape(-1, masks.shape[2], masks.shape[3])

    # save the processed images and masks so that they can be directly read the next time
    # make appropriate filenames according to the requested indices of training, validation and test images
    logging.info('Saving pre-processed files...')
    config_details = 'from%dto%d_' % (idx_start, idx_end)

    if bias_correction:
        filepath_images = preproc_folder + config_details + 'images_2d_bias_corrected.npy'
    else:
        filepath_images = preproc_folder + config_details + 'images_2d.npy'

    filepath_masks = preproc_folder + config_details + 'annotations15_2d.npy'
    filepath_affine = preproc_folder + config_details + 'affines.npy'
    filepath_patnames = preproc_folder + config_details + 'patnames.npy'

    np.save(filepath_images, images)
    np.save(filepath_masks, masks)
    np.save(filepath_affine, affines)
    np.save(filepath_patnames, patnames)

    return images, masks, affines, patnames
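# ===================================================================
# NOTE: utils.makefolder (called above) is defined elsewhere; its assumed
# behaviour is the usual create-if-missing idiom:
# ===================================================================
import os

def makefolder(folder):
    # create the directory if it does not exist yet;
    # return True only if it was newly created
    if not os.path.exists(folder):
        os.makedirs(folder)
        return True
    return False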
def prepare_data(input_folder, output_file, idx_start, idx_end, protocol, size, target_resolution, preprocessing_folder): # ======================== # read the filenames # ======================== filenames = sorted(glob.glob(input_folder + '*.zip')) logging.info('Number of images in the dataset: %s' % str(len(filenames))) # ======================= # create a hdf5 file # ======================= # hdf5_file = h5py.File(output_file, "w") # # # =============================== # # Create datasets for images and labels # # =============================== # data = {} # num_subjects = idx_end - idx_start # # data['images'] = hdf5_file.create_dataset("images", [num_subjects] + list(size), dtype=np.float32) # data['labels'] = hdf5_file.create_dataset("labels", [num_subjects] + list(size), dtype=np.uint8) # # # =============================== # initialize lists # =============================== label_list = [] image_list = [] nx_list = [] ny_list = [] nz_list = [] px_list = [] py_list = [] pz_list = [] pat_names_list = [] # =============================== # initiate counter # =============================== patient_counter = 0 # =============================== # iterate through the requested indices # =============================== for idx in range(idx_start, idx_end): logging.info('Volume {} of {}...'.format(idx, idx_end)) # ================== # get file paths # ================== patient_name, image_path, label_path = get_image_and_label_paths( filenames[idx], protocol, preprocessing_folder) # ============ # read the image and normalize it to be between 0 and 1 # ============ image, _, image_hdr = utils.load_nii(image_path) # ================== # read the label file # ================== label, _, _ = utils.load_nii(label_path) label = utils.group_segmentation_classes( label) # group the segmentation classes as required # # ================== # # collect some header info. 
        # # ==================
        # px_list.append(float(image_hdr.get_zooms()[0]))
        # py_list.append(float(image_hdr.get_zooms()[1]))
        # pz_list.append(float(image_hdr.get_zooms()[2]))
        # nx_list.append(image.shape[0])
        # ny_list.append(image.shape[1])
        # nz_list.append(image.shape[2])
        # pat_names_list.append(patient_name)

        # ==================
        # crop volume along all axes from the ends (as there are several zeros towards the ends)
        # ==================
        image = utils.crop_or_pad_volume_to_size_along_x(image, 256)
        label = utils.crop_or_pad_volume_to_size_along_x(label, 256)
        image = utils.crop_or_pad_volume_to_size_along_y(image, 256)
        label = utils.crop_or_pad_volume_to_size_along_y(label, 256)
        image = utils.crop_or_pad_volume_to_size_along_z(image, 256)
        label = utils.crop_or_pad_volume_to_size_along_z(label, 256)

        # ==================
        # normalize the image
        # ==================
        image_normalized = utils.normalise_image(image, norm_type='div_by_max')

        # ======================================================
        # rescale, crop / pad to make all images of the required size and resolution
        # ======================================================
        scale_vector = [image_hdr.get_zooms()[0] / target_resolution[0],
                        image_hdr.get_zooms()[1] / target_resolution[1],
                        image_hdr.get_zooms()[2] / target_resolution[2]]

        image_rescaled = transform.rescale(image_normalized,
                                           scale_vector,
                                           order=1,
                                           preserve_range=True,
                                           multichannel=False,
                                           mode='constant')

        # label_onehot = utils.make_onehot(label, nlabels=15)
        #
        # label_onehot_rescaled = transform.rescale(label_onehot,
        #                                           scale_vector,
        #                                           order=1,
        #                                           preserve_range=True,
        #                                           multichannel=True,
        #                                           mode='constant')
        #
        # label_rescaled = np.argmax(label_onehot_rescaled, axis=-1)
        #
        # # ============
        # # the images and labels have been rescaled to the desired resolution.
        # # write them to the hdf5 file now.
        # # ============
        # image_list.append(image_rescaled)
        # label_list.append(label_rescaled)

        # ============
        # write the rescaled volume to file, slice by slice, as jpeg images.
        # note: this assumes the rescaled volume has at least size[1] slices along axis 1.
        # ============
        volume_dir = os.path.join(preprocessing_folder, 'volume_{:06d}'.format(idx))
        os.makedirs(volume_dir, exist_ok=True)

        for i in range(size[1]):
            slice_path = os.path.join(volume_dir, 'slice_{:06d}.jpeg'.format(i))
            slice_2d = image_rescaled[:, i, :] * 255  # [0, 1] -> [0, 255]
            slice_img = Image.fromarray(slice_2d.astype(np.uint8))  # avoid clobbering the volume variables
            slice_img.save(slice_path)
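# ===================================================================
# NOTE: a round-trip sketch for the jpeg export above: reading one slice
# back and mapping it to [0, 1] again (path layout as written above;
# jpeg compression is lossy, so the recovered intensities are approximate).
# ===================================================================
import os
import numpy as np
from PIL import Image

def load_slice(preprocessing_folder, vol_idx, slice_idx):
    slice_path = os.path.join(preprocessing_folder,
                              'volume_{:06d}'.format(vol_idx),
                              'slice_{:06d}.jpeg'.format(slice_idx))
    return np.asarray(Image.open(slice_path), dtype=np.float32) / 255.0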
def prepare_data(input_folder, output_file, mode, size, target_resolution): ''' Main function that prepares a dataset from the raw challenge data to an hdf5 dataset ''' assert (mode in ['2D', '3D']), 'Unknown mode: %s' % mode if mode == '2D' and not len(size) == 2: raise AssertionError('Inadequate number of size parameters') if mode == '3D' and not len(size) == 3: raise AssertionError('Inadequate number of size parameters') if mode == '2D' and not len(target_resolution) == 2: raise AssertionError( 'Inadequate number of target resolution parameters') if mode == '3D' and not len(target_resolution) == 3: raise AssertionError( 'Inadequate number of target resolution parameters') image_folder = os.path.join(input_folder, 'Prostate-3T') mask_folder = os.path.join( input_folder, 'NCI_ISBI_Challenge-Prostate3T_Training_Segmentations') hdf5_file = h5py.File(output_file, "w") logging.info('Counting files and parsing meta data...') folder_list = get_patient_folders(image_folder, folder_base='Prostate3T-01') if mode == '3D': nx, ny, nz_max = size n_train = len(folder_list['train']) n_test = len(folder_list['test']) n_val = len(folder_list['validation']) elif mode == '2D': num_slices = count_slices(image_folder, folder_base='Prostate3T-01') nx, ny = size n_test = num_slices['test'] n_train = num_slices['train'] n_val = num_slices['validation'] else: raise AssertionError('Wrong mode setting. This should never happen.') print('Debug: Check if sets add up to correct value:') print(n_train, n_val, n_test, n_train + n_val + n_test) # Create datasets for images and masks data = {} for tt, num_points in zip(['test', 'train', 'validation'], [n_test, n_train, n_val]): if num_points > 0: data['images_%s' % tt] = hdf5_file.create_dataset( "images_%s" % tt, [num_points] + list(size), dtype=np.float32) data['masks_%s' % tt] = hdf5_file.create_dataset( "masks_%s" % tt, [num_points] + list(size), dtype=np.uint8) mask_list = {'test': [], 'train': [], 'validation': []} img_list = {'test': [], 'train': [], 'validation': []} logging.info('Parsing image files') for train_test in ['test', 'train', 'validation']: write_buffer = 0 counter_from = 0 patient_counter = 0 for folder in folder_list[train_test]: patient_counter += 1 logging.info( '-----------------------------------------------------------') logging.info('Doing: %s' % folder) lstFilesDCM = [] # create an empty list for dirName, subdirList, fileList in os.walk(folder): # fileList.sort() for filename in fileList: if ".dcm" in filename.lower( ): # check whether the file's DICOM lstFilesDCM.append(os.path.join(dirName, filename)) # Get ref file RefDs = dicom.read_file(lstFilesDCM[0]) # Load dimensions based on the number of rows, columns, and slices (along the Z axis) ConstPixelDims = (int(RefDs.Rows), int(RefDs.Columns), len(lstFilesDCM)) # Load spacing values (in mm) pixel_size = (float(RefDs.PixelSpacing[0]), float(RefDs.PixelSpacing[1]), float(RefDs.SliceThickness)) # print("pixel spacing 0,1; slice thickness ",ConstPixelSpacing) print('PixelDims') print(ConstPixelDims) print('PixelSpacing') print(pixel_size) # The array is sized based on 'ConstPixelDims' img = np.zeros(ConstPixelDims, dtype=RefDs.pixel_array.dtype) # loop through all the DICOM files for filenameDCM in lstFilesDCM: # read the file ds = dicom.read_file(filenameDCM) # store the raw image data # img[:, :, lstFilesDCM.index(filenameDCM)] = ds.pixel_array # index number field is not set correctly ! instead instance no is the slice no ! 
img[:, :, ds.InstanceNumber - 1] = ds.pixel_array img = utils.normalise_image(img) mask_path = os.path.join(mask_folder, folder.split('/')[-1] + '.nrrd') mask, options = nrrd.read(mask_path) # fix swap axis mask = np.swapaxes(mask, 0, 1) print('mask.shape') print(mask.shape) print('img.shape') print(img.shape) ### PROCESSING LOOP FOR SLICE-BY-SLICE 3D DATA ################### if mode == '3D': scale_vector = [ pixel_size[0] / target_resolution[0], pixel_size[1] / target_resolution[1], pixel_size[2] / target_resolution[2] ] img_scaled = transform.rescale(img, scale_vector, order=1, preserve_range=True, multichannel=False, mode='constant') mask_scaled = transform.rescale(mask, scale_vector, order=0, preserve_range=True, multichannel=False, mode='constant') slice_vol = np.zeros((nx, ny, nz_max), dtype=np.float32) mask_vol = np.zeros((nx, ny, nz_max), dtype=np.uint8) nz_curr = img_scaled.shape[2] stack_from = (nz_max - nz_curr) // 2 if stack_from < 0: raise AssertionError( 'nz_max is too small for the chosen through plane resolution. Consider changing' 'the size or the target resolution in the through-plane.' ) for zz in range(nz_curr): slice_rescaled = img_scaled[:, :, zz] mask_rescaled = mask_scaled[:, :, zz] slice_cropped = crop_or_pad_slice_to_size( slice_rescaled, nx, ny) mask_cropped = crop_or_pad_slice_to_size( mask_rescaled, nx, ny) slice_vol[:, :, stack_from] = slice_cropped mask_vol[:, :, stack_from] = mask_cropped stack_from += 1 img_list[train_test].append(slice_vol) mask_list[train_test].append(mask_vol) write_buffer += 1 if write_buffer >= MAX_WRITE_BUFFER: counter_to = counter_from + write_buffer _write_range_to_hdf5(data, train_test, img_list, mask_list, counter_from, counter_to) _release_tmp_memory(img_list, mask_list, train_test) # reset stuff for next iteration counter_from = counter_to write_buffer = 0 ### PROCESSING LOOP FOR SLICE-BY-SLICE 2D DATA ################### elif mode == '2D': scale_vector = [ pixel_size[0] / target_resolution[0], pixel_size[1] / target_resolution[1] ] for zz in range(img.shape[2]): slice_img = np.squeeze(img[:, :, zz]) slice_rescaled = transform.rescale(slice_img, scale_vector, order=1, preserve_range=True, multichannel=False, mode='constant') slice_mask = np.squeeze(mask[:, :, zz]) mask_rescaled = transform.rescale(slice_mask, scale_vector, order=0, preserve_range=True, multichannel=False, mode='constant') slice_cropped = crop_or_pad_slice_to_size( slice_rescaled, nx, ny) mask_cropped = crop_or_pad_slice_to_size( mask_rescaled, nx, ny) img_list[train_test].append(slice_cropped) mask_list[train_test].append(mask_cropped) write_buffer += 1 # Writing needs to happen inside the loop over the slices if write_buffer >= MAX_WRITE_BUFFER: counter_to = counter_from + write_buffer _write_range_to_hdf5(data, train_test, img_list, mask_list, counter_from, counter_to) _release_tmp_memory(img_list, mask_list, train_test) # reset stuff for next iteration counter_from = counter_to write_buffer = 0 logging.info('Writing remaining data') counter_to = counter_from + write_buffer _write_range_to_hdf5(data, train_test, img_list, mask_list, counter_from, counter_to) _release_tmp_memory(img_list, mask_list, train_test) # After test train loop: hdf5_file.close()
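# ===================================================================
# NOTE: count_slices (used by the two prostate variants above) is defined
# elsewhere. A hypothetical reconstruction: count the .dcm files in each
# patient folder per split so the hdf5 datasets can be pre-allocated. The
# modulo-based split mirrors the ACDC variant; the real rule may differ.
# ===================================================================
import glob
import os

def count_slices(image_folder, folder_base):
    num_slices = {'train': 0, 'test': 0, 'validation': 0}
    for folder in sorted(glob.glob(os.path.join(image_folder, folder_base + '*'))):
        case_id = int(folder[-2:])  # assumed: folder names end in a case number
        if case_id % 5 == 0:
            train_test = 'test'
        elif case_id % 4 == 0:
            train_test = 'validation'
        else:
            train_test = 'train'
        for _, _, files in os.walk(folder):
            num_slices[train_test] += sum(1 for f in files if f.lower().endswith('.dcm'))
    return num_slices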