from typing import Union


def add_noise_to_channel(ct_dataset: Union[str, np.ndarray], channel_index=5, outfile=None):
    """Add sparse salt noise (p=0.002) to a single channel of the CT inputs."""
    if isinstance(ct_dataset, str):
        data_dir = os.path.dirname(ct_dataset)
        file_name = os.path.basename(ct_dataset)
        data = dl.load_saved_data(data_dir, file_name)
        clinical_inputs, ct_inputs, ct_lesion_GT, mri_inputs, mri_lesion_GT, masks, ids, params = data
        if masks.size <= 1:
            masks = None
    else:
        ct_inputs = ct_dataset

    noise = np.random.choice([0, 1],
                             ct_inputs[..., channel_index].shape,
                             p=[0.998, 0.002])
    # Copy so the caller's array is not modified in place
    noised_ct_inputs = ct_inputs.copy()
    noised_ct_inputs[..., channel_index] += noise
    # Voxels that were already 1 and received noise are clipped back to 1
    noised_ct_inputs[..., channel_index][noised_ct_inputs[..., channel_index] == 2] = 1

    if isinstance(ct_dataset, str):
        if outfile is None:
            outfile = os.path.basename(ct_dataset).split('.')[0] + f'_noisy{channel_index}.npz'
        params = params.item()
        dataset = (clinical_inputs, noised_ct_inputs, ct_lesion_GT, mri_inputs,
                   mri_lesion_GT, masks, ids, params)
        dl.save_dataset(dataset, os.path.dirname(ct_dataset), out_file_name=outfile)

    return noised_ct_inputs
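
# Usage sketch (path is hypothetical; channel 5 is assumed to hold a binary
# map, e.g. a one-hot-encoded core channel):
#   noised = add_noise_to_channel('/data/working/data_set.npz', channel_index=5)
# This writes 'data_set_noisy5.npz' next to the input file.
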
def pad_dataset_to_shape(dataset_path: str, shape: tuple):
    """
    Pad a dataset to a desired shape (equal padding on both sides)
    :param dataset_path: path to dataset
    :param shape: tuple, desired shape
    :return: saves new padded dataset to same location with "padded_" prefix
    """
    data_dir = os.path.dirname(dataset_path)
    filename = os.path.basename(dataset_path)
    (clinical_inputs, ct_inputs, ct_lesion_GT, mri_inputs, mri_lesion_GT,
     brain_masks, ids, params) = load_saved_data(data_dir, filename)

    padded_ct_inputs = np.array(
        [pad_to_shape(ct_input, shape) for ct_input in ct_inputs])
    padded_ct_lesion_GT = np.array(
        [pad_to_shape(lesion_GT, shape) for lesion_GT in ct_lesion_GT])
    padded_brain_masks = np.array(
        [pad_to_shape(brain_mask, shape) for brain_mask in brain_masks])

    if len(mri_inputs) == 0:
        padded_mri_inputs = []
    else:
        padded_mri_inputs = np.array(
            [pad_to_shape(mri_input, shape) for mri_input in mri_inputs])
    if len(mri_lesion_GT) == 0:
        padded_mri_lesion_GT = []
    else:
        padded_mri_lesion_GT = np.array(
            [pad_to_shape(lesion_GT, shape) for lesion_GT in mri_lesion_GT])

    dataset = (clinical_inputs, padded_ct_inputs, padded_ct_lesion_GT,
               padded_mri_inputs, padded_mri_lesion_GT, padded_brain_masks,
               ids, params)
    save_dataset(dataset, data_dir, 'padded_' + filename)
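
# pad_to_shape is assumed to pad symmetrically with zeros, along the lines of
# this sketch (shapes are illustrative only):
#   vol = np.zeros((76, 88, 70))
#   target = (96, 96, 96)
#   pad = [((t - s) // 2, (t - s) - (t - s) // 2) for s, t in zip(vol.shape, target)]
#   padded = np.pad(vol, pad)  # -> shape (96, 96, 96)
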
def crop_to_minimal(data_dir, filename='data_set.npz', n_c=4):
    """
    Crop all images of a dataset to a common minimal shape and save the new dataset
    :param data_dir: dir containing dataset file
    :param filename: dataset filename
    :param n_c: number of channels in CT image
    :return:
    """
    print('Cropping to Minimal Common shape')
    print('WARNING: MRI files are not cropped for now.')
    minimal_common_shape, crop_offsets = find_minimal_common_shape(data_dir, filename)

    temp_dir = tempfile.mkdtemp()
    print('Using temporary dir', temp_dir)

    (clinical_inputs, ct_inputs, ct_lesion_GT, mri_inputs, mri_lesion_GT,
     brain_masks, ids, params) = load_saved_data(data_dir, filename)

    for subj_index, subj_id in enumerate(ids):
        print('Processing', subj_index, subj_id)

        # Crop to the minimal zero shape found above
        # (_crop_img_to is a private nilearn helper and may change between versions)
        ct_input_img = nib.Nifti1Image(ct_inputs[subj_index], np.eye(4))
        cropped_ct_input_img = nilimg.image._crop_img_to(ct_input_img, crop_offsets[subj_index])
        ct_lesion_img = nib.Nifti1Image(ct_lesion_GT[subj_index], np.eye(4))
        cropped_ct_lesion_img = nilimg.image._crop_img_to(ct_lesion_img, crop_offsets[subj_index])
        mask_img = nib.Nifti1Image(brain_masks[subj_index].astype(int), np.eye(4))
        cropped_mask_img = nilimg.image._crop_img_to(mask_img, crop_offsets[subj_index])

        # Pad to the minimal common shape (found from max shapes of individual subjects)
        padded_ct_input = pad_to_shape(cropped_ct_input_img.get_fdata(), minimal_common_shape)
        padded_ct_lesion = pad_to_shape(cropped_ct_lesion_img.get_fdata(), minimal_common_shape)
        padded_brain_mask = pad_to_shape(cropped_mask_img.get_fdata(), minimal_common_shape)

        padded_ct_input_img = nib.Nifti1Image(padded_ct_input, np.eye(4))
        padded_ct_lesion_img = nib.Nifti1Image(padded_ct_lesion, np.eye(4))
        padded_brain_mask_img = nib.Nifti1Image(padded_brain_mask, np.eye(4))
        nib.save(padded_ct_input_img, os.path.join(temp_dir, f'{subj_id}_padded_ct_input.nii'))
        nib.save(padded_ct_lesion_img, os.path.join(temp_dir, f'{subj_id}_padded_ct_lesion.nii'))
        nib.save(padded_brain_mask_img, os.path.join(temp_dir, f'{subj_id}_padded_brain_mask.nii'))

    print('Processing done. Now loading files.')
    group_to_file(minimal_common_shape, data_dir, temp_dir, filename, n_c)
    shutil.rmtree(temp_dir)
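
# Usage sketch (directory is hypothetical): crop every subject to its non-zero
# bounding box, pad all crops to the minimal common shape, and regroup the
# per-subject NIfTI files into a new dataset file.
#   crop_to_minimal('/data/working', filename='data_set.npz', n_c=4)
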
def downsample_dataset(data_dir, scale_factor: float, filename='data_set.npz'):
    (clinical_inputs, ct_inputs, ct_lesion_GT, mri_inputs, mri_lesion_GT,
     brain_masks, ids, params) = load_saved_data(data_dir, filename)

    final_ct_inputs = []
    final_ct_lesion_GT = []
    final_brain_masks = []
    for subj_idx, subj_id in enumerate(ids):
        print('Scaling', subj_idx, subj_id)
        # The trailing 1 leaves the channel dimension untouched
        final_ct_inputs.append(
            rescale(ct_inputs[subj_idx], (scale_factor, scale_factor, scale_factor, 1)))
        final_ct_lesion_GT.append(
            rescale(ct_lesion_GT[subj_idx], (scale_factor, scale_factor, scale_factor)))
        final_brain_masks.append(
            rescale(brain_masks[subj_idx], (scale_factor, scale_factor, scale_factor)))

    dataset = (clinical_inputs, np.array(final_ct_inputs),
               np.array(final_ct_lesion_GT), mri_inputs, mri_lesion_GT,
               np.array(final_brain_masks), ids, params)
    save_dataset(dataset, data_dir, f'scale{scale_factor}_' + filename)
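
# Usage sketch: halve the spatial resolution of all volumes; the result is
# saved as 'scale0.5_data_set.npz' in the same directory.
#   downsample_dataset('/data/working', scale_factor=0.5)
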
def downsample_dataset_to_shape(data_dir, target_shape: tuple, filename='data_set.npz', n_c_ct=4):
    (clinical_inputs, ct_inputs, ct_lesion_GT, mri_inputs, mri_lesion_GT,
     brain_masks, ids, params) = load_saved_data(data_dir, filename)

    final_ct_inputs = []
    final_ct_lesion_GT = []
    final_brain_masks = []
    for subj_idx, subj_id in enumerate(ids):
        print('Scaling', subj_idx, subj_id)
        final_ct_inputs.append(resize(ct_inputs[subj_idx], target_shape + (n_c_ct, )))
        final_ct_lesion_GT.append(resize(ct_lesion_GT[subj_idx], target_shape))
        final_brain_masks.append(resize(brain_masks[subj_idx], target_shape))

    dataset = (clinical_inputs, np.array(final_ct_inputs),
               np.array(final_ct_lesion_GT), mri_inputs, mri_lesion_GT,
               np.array(final_brain_masks), ids, params)
    save_dataset(
        dataset, data_dir,
        f'shape{target_shape[0]}x{target_shape[1]}x{target_shape[2]}_' + filename)
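
# Usage sketch: resample every volume onto a fixed grid, regardless of the
# original shape; saved as 'shape64x64x64_data_set.npz'.
#   downsample_dataset_to_shape('/data/working', (64, 64, 64))
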
def add_penumbra_map(ct_dataset: Union[str, np.ndarray], masks=None, tmax_channel=0,
                     outfile=None, one_hot_encode=False):
    if isinstance(ct_dataset, str):
        data_dir = os.path.dirname(ct_dataset)
        file_name = os.path.basename(ct_dataset)
        data = dl.load_saved_data(data_dir, file_name)
        clinical_inputs, ct_inputs, ct_lesion_GT, mri_inputs, mri_lesion_GT, masks, ids, params = data
        if masks.size <= 1:
            masks = None
    else:
        ct_inputs = ct_dataset

    n_subj, n_x, n_y, n_z, n_c = ct_inputs.shape

    # Penumbra: voxels with Tmax > 6 s
    penumbra_mask = np.zeros((n_subj, n_x, n_y, n_z))
    penumbra_mask[ct_inputs[..., tmax_channel] > 6] = 1

    if masks is not None:
        # Restrict to defined prior mask
        restr_penumbra_mask = penumbra_mask * masks
    else:
        restr_penumbra_mask = penumbra_mask
    restr_penumbra_mask = np.expand_dims(restr_penumbra_mask, axis=-1)

    if one_hot_encode:
        class_0_core = 1 - restr_penumbra_mask
        class_1_core = restr_penumbra_mask
        restr_penumbra_mask = np.concatenate((class_0_core, class_1_core), axis=-1)

    ct_inputs = np.concatenate((ct_inputs, restr_penumbra_mask), axis=-1)

    if isinstance(ct_dataset, str):
        if outfile is None:
            if one_hot_encode:
                outfile = os.path.basename(ct_dataset).split('.')[0] + '_with_one_hot_encoded_core_penumbra.npz'
            else:
                outfile = os.path.basename(ct_dataset).split('.')[0] + '_with_penumbra.npz'
        params = params.item()
        if one_hot_encode:
            params['ct_sequences'].append('penumbra_Tmax_6_class0')
            params['ct_sequences'].append('penumbra_Tmax_6_class1')
        else:
            params['ct_sequences'].append('penumbra_Tmax_6')
        dataset = (clinical_inputs, ct_inputs, ct_lesion_GT, mri_inputs,
                   mri_lesion_GT, masks, ids, params)
        dl.save_dataset(dataset, os.path.dirname(ct_dataset), out_file_name=outfile)

    return ct_inputs, restr_penumbra_mask
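
# The penumbra map is a simple Tmax > 6 s threshold; the one-hot variant stacks
# the background and penumbra classes as two channels. Equivalent sketch on the
# raw arrays (illustrative only):
#   tmax = ct_inputs[..., tmax_channel]                    # (n_subj, x, y, z)
#   penumbra = (tmax > 6).astype(float)
#   one_hot = np.stack((1 - penumbra, penumbra), axis=-1)  # 2 channels
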
def filter_for_clinical_param(dataset_path, clinical_path, clinical_parameter,
                              id_parameter='anonymised_id'):
    data_dir = os.path.dirname(dataset_path)
    file_name = os.path.basename(dataset_path)
    data = dl.load_saved_data(data_dir, file_name)
    clinical_inputs, ct_inputs, ct_lesion_GT, mri_inputs, mri_lesion_GT, masks, ids, params = data
    clinical_df = pd.read_excel(clinical_path)
    known_ids = set(clinical_df[id_parameter])

    indices_to_remove = []
    for idx, subj_id in enumerate(ids):
        if subj_id not in known_ids:
            print(f'{subj_id} not found in clinical database. Will be removed.')
            indices_to_remove.append(idx)
            continue
        if math.isnan(clinical_df.loc[clinical_df[id_parameter] == subj_id,
                                      clinical_parameter].iloc[0]):
            print(f'{subj_id} has no parameter "{clinical_parameter}" in clinical database. Will be removed.')
            indices_to_remove.append(idx)

    ct_inputs = np.delete(ct_inputs, indices_to_remove, axis=0)
    masks = np.delete(masks, indices_to_remove, axis=0)
    ids = np.delete(ids, indices_to_remove, axis=0)
    assert ct_inputs.shape[0] == masks.shape[0]
    assert ct_inputs.shape[0] == ids.shape[0]

    if not ct_lesion_GT.size <= 1:
        ct_lesion_GT = np.delete(ct_lesion_GT, indices_to_remove, axis=0)
        assert ct_inputs.shape[0] == ct_lesion_GT.shape[0]

    if not len(mri_inputs) == 0:
        mri_inputs = np.delete(mri_inputs, indices_to_remove, axis=0)
        mri_lesion_GT = np.delete(mri_lesion_GT, indices_to_remove, axis=0)

    outfile = os.path.basename(dataset_path).split('.')[0] + f'_with_{clinical_parameter}.npz'
    dataset = (clinical_inputs, ct_inputs, ct_lesion_GT, mri_inputs,
               mri_lesion_GT, masks, ids, params)
    dl.save_dataset(dataset, os.path.dirname(dataset_path), out_file_name=outfile)
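
# Usage sketch (paths and the column name 'age' are hypothetical): remove all
# subjects that are absent from the clinical spreadsheet or have a NaN value
# for the requested parameter.
#   filter_for_clinical_param('/data/working/data_set.npz',
#                             '/data/clinical/clinical_db.xlsx', 'age')
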
def binarize_lesions(data_dir, filename='data_set.npz', threshold=0.1):
    (clinical_inputs, ct_inputs, ct_lesion_GT, mri_inputs, mri_lesion_GT,
     brain_masks, ids, params) = load_saved_data(data_dir, filename)

    # Note: this thresholds the brain masks, not the lesion maps
    brain_masks[brain_masks < threshold] = 0
    brain_masks[brain_masks >= threshold] = 1
    brain_masks = brain_masks.astype(int)

    np.savez_compressed(os.path.join(data_dir, 'bin_' + filename),
                        params=params, ids=ids,
                        clinical_inputs=clinical_inputs,
                        ct_inputs=ct_inputs, ct_lesion_GT=ct_lesion_GT,
                        mri_inputs=mri_inputs, mri_lesion_GT=mri_lesion_GT,
                        brain_masks=brain_masks)
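
# Usage sketch: re-binarize masks that became fractional (e.g. after resizing
# or rescaling); the output is saved as 'bin_data_set.npz'.
#   binarize_lesions('/data/working', threshold=0.1)
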
def mirror_dataset_images(dataset_path: str):
    data_dir = os.path.dirname(dataset_path)
    file_name = os.path.basename(dataset_path)
    data = dl.load_saved_data(data_dir, file_name)
    clinical_inputs, ct_inputs, ct_lesion_GT, mri_inputs, mri_lesion_GT, masks, ids, params = data

    # flip along x axis
    flipped_ct_inputs = np.flip(ct_inputs, axis=1)
    flipped_ct_lesion_GT = np.flip(ct_lesion_GT, axis=1)
    flipped_masks = np.flip(masks, axis=1)
    flipped_ids = ['flipped_' + subj_id for subj_id in ids]

    augmented_ct_inputs = np.concatenate((ct_inputs, flipped_ct_inputs), axis=0)
    augmented_ct_lesion_GT = np.concatenate((ct_lesion_GT, flipped_ct_lesion_GT), axis=0)
    augmented_masks = np.concatenate((masks, flipped_masks), axis=0)
    augmented_ids = np.concatenate((ids, flipped_ids), axis=0)

    if len(mri_inputs) == 0:
        augmented_mri_inputs = []
        augmented_mri_lesion_GT = []
    else:
        flipped_mri_inputs = np.flip(mri_inputs, axis=1)
        flipped_mri_lesion_GT = np.flip(mri_lesion_GT, axis=1)
        augmented_mri_inputs = np.concatenate((mri_inputs, flipped_mri_inputs), axis=0)
        augmented_mri_lesion_GT = np.concatenate((mri_lesion_GT, flipped_mri_lesion_GT), axis=0)

    augmented_dataset = (clinical_inputs, augmented_ct_inputs,
                         augmented_ct_lesion_GT, augmented_mri_inputs,
                         augmented_mri_lesion_GT, augmented_masks,
                         augmented_ids, params)
    save_dataset(augmented_dataset, data_dir, 'flipped_' + file_name)
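
# Usage sketch: doubles the dataset by appending left-right mirrored copies of
# all images; mirrored subjects get a 'flipped_' id prefix.
#   mirror_dataset_images('/data/working/data_set.npz')
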
parser.add_argument('data_path')
parser.add_argument('-c', '--channel_names', nargs='+',
                    help='Names of the CT channels to display',
                    required=False, default=None)
parser.add_argument('-o', '--output_dir',
                    help='Directory to save output',
                    required=False, default=None)
args = parser.parse_args()

(clinical_inputs, ct_inputs, ct_lesion_GT, mri_inputs, mri_lesion_GT, masks,
 ids, params) = dl.load_saved_data(os.path.dirname(args.data_path),
                                   os.path.basename(args.data_path))

if args.channel_names is None:
    args.channel_names = params.item()['ct_sequences']
if args.output_dir is None:
    args.output_dir = os.path.dirname(args.data_path)

outfile = os.path.basename(args.data_path).split('.')[0] + '_dataset_visualisation'
visualize_dataset(ct_inputs, args.channel_names, ids, args.output_dir,
                  gt_data=ct_lesion_GT, save_name=outfile)
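
# Example invocation (script name and paths are hypothetical):
#   python visualise_dataset.py /data/working/data_set.npz -c Tmax CBF MTT CBV -o /tmp/vis
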
def dataset_train_test_split(dataset_path, test_size=0.33, random_state=42,
                             stratification_data=None, stratify_var=None,
                             shuffle=True):
    data_dir = os.path.dirname(dataset_path)
    file_name = os.path.basename(dataset_path)
    data = dl.load_saved_data(data_dir, file_name)
    clinical_inputs, ct_inputs, ct_lesion_GT, mri_inputs, mri_lesion_GT, masks, ids, params = data
    params = params.item()

    all_indices = list(range(len(ids)))
    if stratification_data is not None:
        stratification_df = pd.read_excel(stratification_data)
        stratification_labels = [
            stratification_df.loc[stratification_df['anonymised_id'] == subj_id,
                                  stratify_var].iloc[0] for subj_id in ids
        ]
        print(f'Initial positive label distribution: '
              f'{sum(stratification_labels) / len(stratification_labels)}')
        train_indices, test_indices = train_test_split(
            all_indices, test_size=test_size, random_state=random_state,
            shuffle=shuffle, stratify=stratification_labels)
    else:
        train_indices, test_indices = train_test_split(
            all_indices, test_size=test_size, random_state=random_state,
            shuffle=shuffle)

    def split_data(data, train_indices, test_indices):
        # Non-empty arrays are indexed; empty placeholders are passed through
        if len(data.shape) > 0 and data.size > 0:
            train_data = data[train_indices]
            test_data = data[test_indices]
        else:
            train_data = data
            test_data = data
        return train_data, test_data

    train_clinical, test_clinical = split_data(clinical_inputs, train_indices, test_indices)
    train_ct_inputs, test_ct_inputs = split_data(ct_inputs, train_indices, test_indices)
    train_ct_lesion_GT, test_ct_lesion_GT = split_data(ct_lesion_GT, train_indices, test_indices)
    train_mri_inputs, test_mri_inputs = split_data(mri_inputs, train_indices, test_indices)
    train_mri_lesion_GT, test_mri_lesion_GT = split_data(mri_lesion_GT, train_indices, test_indices)
    train_brain_masks, test_brain_masks = split_data(masks, train_indices, test_indices)
    train_ids, test_ids = split_data(ids, train_indices, test_indices)

    if stratification_data is not None:
        train_labels = np.array(stratification_labels)[train_indices]
        print(f'Train positive label distribution: {sum(train_labels) / len(train_labels)}')
    train_dataset = (train_clinical, train_ct_inputs, train_ct_lesion_GT,
                     train_mri_inputs, train_mri_lesion_GT, train_brain_masks,
                     train_ids, params)

    if stratification_data is not None:
        test_labels = np.array(stratification_labels)[test_indices]
        print(f'Test positive label distribution: {sum(test_labels) / len(test_labels)}')
    test_dataset = (test_clinical, test_ct_inputs, test_ct_lesion_GT,
                    test_mri_inputs, test_mri_lesion_GT, test_brain_masks,
                    test_ids, params)

    train_outfile = 'train_' + file_name
    test_outfile = 'test_' + file_name
    dl.save_dataset(train_dataset, data_dir, out_file_name=train_outfile)
    dl.save_dataset(test_dataset, data_dir, out_file_name=test_outfile)
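
# Usage sketch ('outcomes.xlsx' and the column 'mrs02' are hypothetical): a
# stratified 2:1 split that writes 'train_<file>' and 'test_<file>' in place.
#   dataset_train_test_split('/data/working/data_set.npz', test_size=0.33,
#                            stratification_data='/data/clinical/outcomes.xlsx',
#                            stratify_var='mrs02')
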
def join_datasets(dataset1_path, dataset2_path, outfile=None):
    data_dir1 = os.path.dirname(dataset1_path)
    file_name1 = os.path.basename(dataset1_path)
    data_dir2 = os.path.dirname(dataset2_path)
    file_name2 = os.path.basename(dataset2_path)
    data1 = dl.load_saved_data(data_dir1, file_name1)
    data2 = dl.load_saved_data(data_dir2, file_name2)
    clinical_inputs1, ct_inputs1, ct_lesion_GT1, mri_inputs1, mri_lesion_GT1, masks1, ids1, params1 = data1
    clinical_inputs2, ct_inputs2, ct_lesion_GT2, mri_inputs2, mri_lesion_GT2, masks2, ids2, params2 = data2

    out_ids = np.append(ids1, ids2, axis=0)
    assert not contains_duplicates(out_ids), 'Joined dataset would contain duplicated ids. Aborting.'
    assert ct_inputs2.shape[-1] == ct_inputs1.shape[-1], 'Datasets do not have the same number of channels. Aborting.'
    out_ct_inputs = np.append(ct_inputs1, ct_inputs2, axis=0)

    if ct_lesion_GT1.size <= 1 or ct_lesion_GT2.size <= 1:
        print('Ignoring ct ground truth, as at least one dataset has no entries.')
        out_ct_lesion_GT = np.array([])
    else:
        out_ct_lesion_GT = np.append(ct_lesion_GT1, ct_lesion_GT2, axis=0)

    if clinical_inputs1.size <= 1 or clinical_inputs2.size <= 1:
        print('Ignoring clinical input, as at least one dataset has no entries.')
        out_clinical_inputs = np.array([])
    else:
        out_clinical_inputs = np.append(clinical_inputs1, clinical_inputs2, axis=0)

    if len(mri_inputs1) == 0 or len(mri_inputs2) == 0:
        print('Ignoring mri input, as at least one dataset has no entries.')
        out_mri_inputs = np.array([])
        out_mri_lesion_GT = np.array([])
    else:
        assert mri_inputs1.shape[-1] == mri_inputs2.shape[-1], 'Datasets do not have the same number of channels. Aborting.'
        out_mri_inputs = np.append(mri_inputs1, mri_inputs2, axis=0)
        out_mri_lesion_GT = np.append(mri_lesion_GT1, mri_lesion_GT2, axis=0)

    out_masks = np.append(masks1, masks2, axis=0)
    # params should stay the same
    out_params = params1

    print('Saving new dataset with: ', out_params)
    print('Ids:', out_ids.shape)
    print('Clinical:', out_clinical_inputs.shape)
    print('CT in:', out_ct_inputs.shape)
    print('CT gt:', out_ct_lesion_GT.shape)
    print('MRI in:', out_mri_inputs.shape)
    print('MRI gt:', out_mri_lesion_GT.shape)
    print('masks:', out_masks.shape)

    if outfile is None:
        outfile = 'joined_dataset.npz'
    dataset = (out_clinical_inputs, out_ct_inputs, out_ct_lesion_GT,
               out_mri_inputs, out_mri_lesion_GT, out_masks, out_ids, out_params)
    dl.save_dataset(dataset, os.path.dirname(dataset1_path), out_file_name=outfile)
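
# Usage sketch (paths are hypothetical): both datasets must have disjoint ids
# and the same number of CT channels; the output lands next to the first one.
#   join_datasets('/data/siteA/data_set.npz', '/data/siteB/data_set.npz',
#                 outfile='joined_dataset.npz')
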
def add_core_map(ct_dataset: Union[str, np.ndarray], masks=None, cbf_channel=1,
                 ncct_channel=4, outfile=None, one_hot_encode_core=False,
                 dilation_dimension=3):
    if int(dilation_dimension) == 2:
        dilation_structure = dilation_structure_2d
    elif int(dilation_dimension) == 3:
        dilation_structure = dilation_structure_3d
    else:
        raise NotImplementedError

    if isinstance(ct_dataset, str):
        data_dir = os.path.dirname(ct_dataset)
        file_name = os.path.basename(ct_dataset)
        data = dl.load_saved_data(data_dir, file_name)
        clinical_inputs, ct_inputs, ct_lesion_GT, mri_inputs, mri_lesion_GT, masks, ids, params = data
        if masks.size <= 1:
            masks = None
    else:
        ct_inputs = ct_dataset

    n_subj, n_x, n_y, n_z, n_c = ct_inputs.shape

    # Create CSF mask
    low_bounded_ncct = ct_inputs[..., ncct_channel][ct_inputs[..., ncct_channel] > 0]
    up_and_low_bounded_ncct = low_bounded_ncct[low_bounded_ncct < 100]
    # threshold = 20
    threshold = np.percentile(up_and_low_bounded_ncct, 5)
    csf_mask = gaussian_smoothing(ct_inputs[..., ncct_channel, None], kernel_width=3) < threshold
    enlarged_csf_mask = np.array(
        [ndimage.binary_dilation(csf_mask[idx, ..., 0], structure=dilation_structure(2))
         for idx in range(csf_mask.shape[0])])
    inv_csf_mask = 1 - enlarged_csf_mask

    # Create skull mask
    brain_mask = np.array(
        [ct_brain_extraction(ct_inputs[subj, ..., ncct_channel], fsl_path='/usr/local/fsl/bin')[0]
         for subj in range(n_subj)])
    not_brain_mask = 1 - brain_mask
    # enlarge slightly
    enlarged_not_brain_mask = np.array(
        [ndimage.binary_dilation(not_brain_mask[subj], dilation_structure(3))
         for subj in range(n_subj)])
    inv_skull_mask = 1 - enlarged_not_brain_mask

    # Create major vessel mask
    threshold = np.percentile(ct_inputs[..., cbf_channel], 99)
    vessel_mask = ct_inputs[..., cbf_channel] > threshold
    enlarged_vessel_mask = np.array(
        [ndimage.binary_dilation(vessel_mask[idx], structure=dilation_structure(2))
         for idx in range(vessel_mask.shape[0])])
    vessel_mask = enlarged_vessel_mask
    inv_vessel_mask = 1 - vessel_mask

    # Create core mask: relative CBF below 0.38 of the contralateral median
    smooth_rCBF = normalise_by_contralateral_median(
        gaussian_smoothing(ct_inputs[..., cbf_channel, None], kernel_width=2))
    smooth_core_masks = smooth_rCBF < 0.38
    corr_csf_core_masks = smooth_core_masks * inv_csf_mask[..., None]
    corr_vx_core_masks = corr_csf_core_masks * inv_vessel_mask[..., None]
    corr_skull_core_masks = corr_vx_core_masks * inv_skull_mask[..., None]

    if masks is not None:
        # Restrict to defined prior mask
        restr_core = corr_skull_core_masks * masks[..., None]
    else:
        restr_core = corr_skull_core_masks

    if one_hot_encode_core:
        class_0_core = 1 - restr_core
        class_1_core = restr_core
        restr_core = np.concatenate((class_0_core, class_1_core), axis=-1)

    ct_inputs = np.concatenate((ct_inputs, restr_core), axis=-1)

    if isinstance(ct_dataset, str):
        if outfile is None:
            if one_hot_encode_core:
                outfile = os.path.basename(ct_dataset).split('.')[0] + '_with_one_hot_encoded_core.npz'
            else:
                outfile = os.path.basename(ct_dataset).split('.')[0] + '_with_core.npz'
        params = params.item()
        if one_hot_encode_core:
            params['ct_sequences'].append('core_rCBF_0.38_class0')
            params['ct_sequences'].append('core_rCBF_0.38_class1')
        else:
            params['ct_sequences'].append('core_rCBF_0.38')
        dataset = (clinical_inputs, ct_inputs, ct_lesion_GT, mri_inputs,
                   mri_lesion_GT, masks, ids, params)
        dl.save_dataset(dataset, os.path.dirname(ct_dataset), out_file_name=outfile)

    return ct_inputs, restr_core
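
# The core estimate follows a relative-CBF threshold rule: after smoothing and
# normalising by the contralateral median, voxels with rCBF < 0.38 are labelled
# core, then cleaned of CSF, skull and large-vessel voxels. Sketch of the
# central step (illustrative only):
#   rcbf = normalise_by_contralateral_median(
#       gaussian_smoothing(ct_inputs[..., cbf_channel, None], kernel_width=2))
#   core = (rcbf < 0.38).astype(float)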