def prepare_data(input_folder, output_file, mode, size, target_resolution):
    '''
    Main function that prepares a dataset from the raw challenge data to an hdf5 dataset
    '''

    assert (mode in ['2D', '3D']), 'Unknown mode: %s' % mode
    if mode == '2D' and not len(size) == 2:
        raise AssertionError('Inadequate number of size parameters')
    if mode == '3D' and not len(size) == 3:
        raise AssertionError('Inadequate number of size parameters')
    if mode == '2D' and not len(target_resolution) == 2:
        raise AssertionError(
            'Inadequate number of target resolution parameters')
    if mode == '3D' and not len(target_resolution) == 3:
        raise AssertionError(
            'Inadequate number of target resolution parameters')

    hdf5_file = h5py.File(output_file, "w")

    diag_list = {'test': [], 'train': []}
    height_list = {'test': [], 'train': []}
    weight_list = {'test': [], 'train': []}
    patient_id_list = {'test': [], 'train': []}
    cardiac_phase_list = {'test': [], 'train': []}

    file_list = {'test': [], 'train': []}
    num_slices = {'test': 0, 'train': 0}

    logging.info('Counting files and parsing meta data...')

    for folder in os.listdir(input_folder):

        folder_path = os.path.join(input_folder, folder)

        if os.path.isdir(folder_path):

            train_test = 'test' if (int(folder[-3:]) % 5 == 0) else 'train'
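            # (patients whose numeric ID is divisible by 5 are held out for testing,
            #  i.e. roughly an 80/20 train/test split)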

            infos = {}
            for line in open(os.path.join(folder_path, 'Info.cfg')):
                label, value = line.split(':')
                infos[label] = value.rstrip('\n').lstrip(' ')

            patient_id = folder.lstrip('patient')

            for file in glob.glob(
                    os.path.join(folder_path, 'patient???_frame??.nii.gz')):

                file_list[train_test].append(file)

                # diag_list[train_test].append(diagnosis_to_int(infos['Group']))
                diag_list[train_test].append(diagnosis_dict[infos['Group']])
                weight_list[train_test].append(infos['Weight'])
                height_list[train_test].append(infos['Height'])

                patient_id_list[train_test].append(patient_id)

                systole_frame = int(infos['ES'])
                diastole_frame = int(infos['ED'])

                file_base = file.split('.nii.gz')[0]
                frame = int(file_base.split('frame')[-1])
                if frame == systole_frame:
                    cardiac_phase_list[train_test].append(1)  # 1 == systole
                elif frame == diastole_frame:
                    cardiac_phase_list[train_test].append(2)  # 2 == diastole
                else:
                    cardiac_phase_list[train_test].append(
                        0)  # 0 means other phase

                nifty_img = nib.load(file)
                num_slices[train_test] += nifty_img.shape[2]

    # Write the small datasets
    for tt in ['test', 'train']:
        hdf5_file.create_dataset('diagnosis_%s' % tt,
                                 data=np.asarray(diag_list[tt],
                                                 dtype=np.uint8))
        hdf5_file.create_dataset('weight_%s' % tt,
                                 data=np.asarray(weight_list[tt],
                                                 dtype=np.float32))
        hdf5_file.create_dataset('height_%s' % tt,
                                 data=np.asarray(height_list[tt],
                                                 dtype=np.float32))
        hdf5_file.create_dataset('patient_id_%s' % tt,
                                 data=np.asarray(patient_id_list[tt],
                                                 dtype=np.uint8))
        hdf5_file.create_dataset('cardiac_phase_%s' % tt,
                                 data=np.asarray(cardiac_phase_list[tt],
                                                 dtype=np.uint8))

    if mode == '3D':
        nx, ny, nz_max = size
        n_train = len(file_list['train'])
        n_test = len(file_list['test'])

    elif mode == '2D':
        nx, ny = size
        n_test = num_slices['test']
        n_train = num_slices['train']

    else:
        raise AssertionError('Wrong mode setting. This should never happen.')

    # Create datasets for images and masks
    data = {}

    for tt, num_points in zip(['test', 'train'], [n_test, n_train]):
        data['images_%s' % tt] = hdf5_file.create_dataset(
            "images_%s" % tt, [num_points] + list(size), dtype=np.float32)
        data['masks_%s' % tt] = hdf5_file.create_dataset(
            "masks_%s" % tt, [num_points] + list(size), dtype=np.uint8)

    mask_list = {'test': [], 'train': []}
    img_list = {'test': [], 'train': []}

    logging.info('Parsing image files')

    for train_test in ['test', 'train']:

        write_buffer = 0
        counter_from = 0
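        # Slices/volumes are accumulated in memory and flushed to the hdf5 file
        # in chunks of MAX_WRITE_BUFFER to keep memory usage bounded.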

        for file in file_list[train_test]:

            logging.info(
                '-----------------------------------------------------------')
            logging.info('Doing: %s' % file)

            file_base = file.split('.nii.gz')[0]
            file_mask = file_base + '_gt.nii.gz'

            img_dat = utils.load_nii(file)
            mask_dat = utils.load_nii(file_mask)

            img = img_dat[0].copy()
            mask = mask_dat[0].copy()

            img = image_utils.normalise_image(img)

            pixel_size = (img_dat[2].structarr['pixdim'][1],
                          img_dat[2].structarr['pixdim'][2],
                          img_dat[2].structarr['pixdim'][3])
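            # pixdim[1:4] of the NIfTI header holds the voxel spacing along x, y, z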

            logging.info('Pixel size:')
            logging.info(pixel_size)

            ### PROCESSING LOOP FOR 3D DATA ################################
            if mode == '3D':

                scale_vector = [
                    pixel_size[0] / target_resolution[0],
                    pixel_size[1] / target_resolution[1],
                    pixel_size[2] / target_resolution[2]
                ]

                img_scaled = transform.rescale(img,
                                               scale_vector,
                                               order=1,
                                               preserve_range=True,
                                               multichannel=False,
                                               mode='constant')
                mask_scaled = transform.rescale(mask,
                                                scale_vector,
                                                order=0,
                                                preserve_range=True,
                                                multichannel=False,
                                                mode='constant')

                slice_vol = np.zeros((nx, ny, nz_max), dtype=np.float32)
                mask_vol = np.zeros((nx, ny, nz_max), dtype=np.uint8)

                nz_curr = img_scaled.shape[2]
                stack_from = (nz_max - nz_curr) // 2
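                # Centre the rescaled volume inside the fixed-size z-stack; any
                # remaining slices at the top and bottom stay zero-padded.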

                if stack_from < 0:
                    raise AssertionError(
                        'nz_max is too small for the chosen through plane resolution. Consider changing'
                        'the size or the target resolution in the through-plane.'
                    )

                for zz in range(nz_curr):

                    slice_rescaled = img_scaled[:, :, zz]
                    mask_rescaled = mask_scaled[:, :, zz]

                    slice_cropped = crop_or_pad_slice_to_size(
                        slice_rescaled, nx, ny)
                    mask_cropped = crop_or_pad_slice_to_size(
                        mask_rescaled, nx, ny)

                    slice_vol[:, :, stack_from] = slice_cropped
                    mask_vol[:, :, stack_from] = mask_cropped

                    stack_from += 1

                img_list[train_test].append(slice_vol)
                mask_list[train_test].append(mask_vol)

                write_buffer += 1

                if write_buffer >= MAX_WRITE_BUFFER:

                    counter_to = counter_from + write_buffer
                    _write_range_to_hdf5(data, train_test, img_list, mask_list,
                                         counter_from, counter_to)
                    _release_tmp_memory(img_list, mask_list, train_test)

                    # reset stuff for next iteration
                    counter_from = counter_to
                    write_buffer = 0

            ### PROCESSING LOOP FOR SLICE-BY-SLICE 2D DATA ###################
            elif mode == '2D':

                scale_vector = [
                    pixel_size[0] / target_resolution[0],
                    pixel_size[1] / target_resolution[1]
                ]

                for zz in range(img.shape[2]):

                    slice_img = np.squeeze(img[:, :, zz])
                    slice_rescaled = transform.rescale(slice_img,
                                                       scale_vector,
                                                       order=1,
                                                       preserve_range=True,
                                                       multichannel=False,
                                                       mode='constant')

                    slice_mask = np.squeeze(mask[:, :, zz])
                    mask_rescaled = transform.rescale(slice_mask,
                                                      scale_vector,
                                                      order=0,
                                                      preserve_range=True,
                                                      multichannel=False,
                                                      mode='constant')

                    slice_cropped = crop_or_pad_slice_to_size(
                        slice_rescaled, nx, ny)
                    mask_cropped = crop_or_pad_slice_to_size(
                        mask_rescaled, nx, ny)

                    img_list[train_test].append(slice_cropped)
                    mask_list[train_test].append(mask_cropped)

                    write_buffer += 1

                    # Writing needs to happen inside the loop over the slices
                    if write_buffer >= MAX_WRITE_BUFFER:

                        counter_to = counter_from + write_buffer
                        _write_range_to_hdf5(data, train_test, img_list,
                                             mask_list, counter_from,
                                             counter_to)
                        _release_tmp_memory(img_list, mask_list, train_test)

                        # reset stuff for next iteration
                        counter_from = counter_to
                        write_buffer = 0

        # after file loop: Write the remaining data

        logging.info('Writing remaining data')
        counter_to = counter_from + write_buffer

        _write_range_to_hdf5(data, train_test, img_list, mask_list,
                             counter_from, counter_to)
        _release_tmp_memory(img_list, mask_list, train_test)

    # After test train loop:
    hdf5_file.close()
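

# NOTE: crop_or_pad_slice_to_size() is used throughout these examples but its
# definition is not included in this snippet. A minimal sketch of the 2D variant
# called above, assuming numpy is imported as np and that the helper centre-crops
# or zero-pads a slice to (nx, ny), could look like this (illustrative only, not
# necessarily the original implementation; a later example also uses a volumetric
# variant that takes a size tuple and an offset):
def crop_or_pad_slice_to_size(slice_2d, nx, ny):
    '''Centre-crop or zero-pad a 2D slice to shape (nx, ny).'''
    x, y = slice_2d.shape

    x_s = (x - nx) // 2
    y_s = (y - ny) // 2
    x_c = (nx - x) // 2
    y_c = (ny - y) // 2

    if x > nx and y > ny:
        # input larger than target in both dimensions: crop the central region
        slice_cropped = slice_2d[x_s:x_s + nx, y_s:y_s + ny]
    else:
        # otherwise paste the (possibly cropped) input into a zero canvas
        slice_cropped = np.zeros((nx, ny), dtype=slice_2d.dtype)
        if x <= nx and y > ny:
            slice_cropped[x_c:x_c + x, :] = slice_2d[:, y_s:y_s + ny]
        elif x > nx and y <= ny:
            slice_cropped[:, y_c:y_c + y] = slice_2d[x_s:x_s + nx, :]
        else:
            slice_cropped[x_c:x_c + x, y_c:y_c + y] = slice_2d

    return slice_cropped
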
def prepare_data(input_folder, preproc_folder, idx_start, idx_end):

    images = []
    affines = []
    patnames = []
    masks = []

    # read the filenames which have segmentations available
    filenames = sorted(glob.glob(input_folder + '*_seg.nii'))
    logging.info(
        'Number of images in the dataset that have ground truth annotations: %s'
        % str(len(filenames)))

    # iterate through all indices
    for idx in range(len(filenames)):

        # only consider images within the indices requested
        if (idx < idx_start) or (idx >= idx_end):
            #logging.info('skipping subject: %d' %idx)
            continue

        logging.info('==============================================')

        # get the name of the ground truth annotation for this subject
        filename_seg = filenames[idx]
        filename_img = filename_seg[:-8] + '.nii.gz'
        _patname = filename_seg[filename_seg[:-1].rfind('/') + 1:-8]

        if _patname == 'IXI014-HH-1236-T2':  # this subject has very poor resolution - 256x256x28
            continue

        # read the image
        logging.info('reading image: %s' % _patname)
        _img_data, _img_affine, _img_header = utils.load_nii(filename_img)
        # make all the images of the same size by appending zero slices to facilitate appending
        # most images are of the size 256*256*130
        if _img_data.shape[2] != 130:
            num_zero_slices = 130 - _img_data.shape[2]
            zero_slices = np.zeros(
                (_img_data.shape[0], _img_data.shape[1], num_zero_slices))
            _img_data = np.concatenate((_img_data, zero_slices), axis=-1)
        # normalise the image
        _img_data = image_utils.normalise_image(_img_data,
                                                norm_type='div_by_max')
        # save the pre-processed image
        utils.makefolder(preproc_folder + _patname)
        savepath = preproc_folder + _patname + '/preprocessed_image.nii'
        utils.save_nii(savepath, _img_data, _img_affine)
        # append to the list of all images, affines and patient names
        images.append(_img_data)
        affines.append(_img_affine)
        patnames.append(_patname)

        # read the segmentation mask (already grouped)
        _seg_data, _seg_affine, _seg_header = utils.load_nii(filename_seg)
        # make all the images of the same size by appending zero slices to facilitate appending
        # most images are of the size 256*256*130
        if _seg_data.shape[2] != 130:
            num_zero_slices = 130 - _seg_data.shape[2]
            zero_slices = np.zeros(
                (_seg_data.shape[0], _seg_data.shape[1], num_zero_slices))
            _seg_data = np.concatenate((_seg_data, zero_slices), axis=-1)
        # save the pre-processed segmentation ground truth
        utils.makefolder(preproc_folder + _patname)
        savepath = preproc_folder + _patname + '/preprocessed_gt15.nii'
        utils.save_nii(savepath, _seg_data, _seg_affine)
        # append to the list of all masks
        masks.append(_seg_data)

    # convert the lists to arrays
    images = np.array(images)
    affines = np.array(affines)
    patnames = np.array(patnames)
    masks = np.array(masks, dtype='uint8')

    # merge along the y-axis to get a stack of x-z slices, for the images as well as the masks
    images = images.swapaxes(1, 2)
    images = images.reshape(-1, images.shape[2], images.shape[3])
    masks = masks.swapaxes(1, 2)
    masks = masks.reshape(-1, masks.shape[2], masks.shape[3])

    # save the processed images and masks so that they can be directly read the next time
    # make appropriate filenames according to the requested indices of training, validation and test images
    logging.info('Saving pre-processed files...')
    config_details = 'from%dto%d_' % (idx_start, idx_end)

    filepath_images = preproc_folder + config_details + 'images.npy'
    filepath_masks = preproc_folder + config_details + 'annotations15.npy'
    filepath_affine = preproc_folder + config_details + 'affines.npy'
    filepath_patnames = preproc_folder + config_details + 'patnames.npy'

    np.save(filepath_images, images)
    np.save(filepath_masks, masks)
    np.save(filepath_affine, affines)
    np.save(filepath_patnames, patnames)

    return images, masks, affines, patnames
def score_data(input_folder,
               output_folder,
               model_path,
               exp_config,
               do_postprocessing=False,
               gt_exists=True,
               evaluate_all=False,
               use_iter=None):

    nx, ny = exp_config.image_size[:2]
    batch_size = 1
    num_channels = exp_config.nlabels

    image_tensor_shape = [batch_size] + list(exp_config.image_size) + [1]
    images_pl = tf.placeholder(tf.float32,
                               shape=image_tensor_shape,
                               name='images')

    mask_pl, softmax_pl = model.predict(images_pl, exp_config)
    saver = tf.train.Saver()
    init = tf.global_variables_initializer()

    evaluate_test_set = not gt_exists

    with tf.Session() as sess:

        sess.run(init)

        if not use_iter:
            checkpoint_path = utils.get_latest_model_checkpoint_path(
                model_path, 'model_best_dice.ckpt')
        else:
            checkpoint_path = os.path.join(model_path,
                                           'model.ckpt-%d' % use_iter)

        saver.restore(sess, checkpoint_path)

        init_iteration = int(checkpoint_path.split('/')[-1].split('-')[-1])
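        # the global step is encoded at the end of the checkpoint filename
        # (e.g. 'model.ckpt-<step>')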

        total_time = 0
        total_volumes = 0

        for folder in os.listdir(input_folder):

            folder_path = os.path.join(input_folder, folder)

            if os.path.isdir(folder_path):

                if evaluate_test_set or evaluate_all:
                    train_test = 'test'  # always test
                else:
                    train_test = 'test' if (int(folder[-3:]) %
                                            5 == 0) else 'train'

                if train_test == 'test':

                    infos = {}
                    for line in open(os.path.join(folder_path, 'Info.cfg')):
                        label, value = line.split(':')
                        infos[label] = value.rstrip('\n').lstrip(' ')

                    patient_id = folder.lstrip('patient')
                    ED_frame = int(infos['ED'])
                    ES_frame = int(infos['ES'])

                    for file in glob.glob(
                            os.path.join(folder_path,
                                         'patient???_frame??.nii.gz')):

                        logging.info(
                            ' ----- Doing image: -------------------------')
                        logging.info('Doing: %s' % file)
                        logging.info(
                            ' --------------------------------------------')

                        file_base = file.split('.nii.gz')[0]

                        frame = int(file_base.split('frame')[-1])
                        img_dat = utils.load_nii(file)
                        img = img_dat[0].copy()
                        img = image_utils.normalise_image(img)

                        if gt_exists:
                            file_mask = file_base + '_gt.nii.gz'
                            mask_dat = utils.load_nii(file_mask)
                            mask = mask_dat[0]

                        start_time = time.time()

                        if exp_config.data_mode == '2D':

                            pixel_size = (img_dat[2].structarr['pixdim'][1],
                                          img_dat[2].structarr['pixdim'][2])
                            scale_vector = (pixel_size[0] /
                                            exp_config.target_resolution[0],
                                            pixel_size[1] /
                                            exp_config.target_resolution[1])

                            predictions = []

                            for zz in range(img.shape[2]):

                                slice_img = np.squeeze(img[:, :, zz])
                                slice_rescaled = transform.rescale(
                                    slice_img,
                                    scale_vector,
                                    order=1,
                                    preserve_range=True,
                                    multichannel=False,
                                    mode='constant')

                                x, y = slice_rescaled.shape

                                x_s = (x - nx) // 2
                                y_s = (y - ny) // 2
                                x_c = (nx - x) // 2
                                y_c = (ny - y) // 2
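                                # x_s/y_s are crop offsets used when the rescaled
                                # slice is larger than the network input; x_c/y_c
                                # are padding offsets used when it is smaller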

                                # Crop section of image for prediction
                                if x > nx and y > ny:
                                    slice_cropped = slice_rescaled[
                                        x_s:x_s + nx, y_s:y_s + ny]
                                else:
                                    slice_cropped = np.zeros((nx, ny))
                                    if x <= nx and y > ny:
                                        slice_cropped[x_c:x_c + x, :] = \
                                            slice_rescaled[:, y_s:y_s + ny]
                                    elif x > nx and y <= ny:
                                        slice_cropped[:, y_c:y_c + y] = \
                                            slice_rescaled[x_s:x_s + nx, :]
                                    else:
                                        slice_cropped[x_c:x_c + x, y_c:y_c + y] = \
                                            slice_rescaled[:, :]

                                # GET PREDICTION
                                network_input = np.float32(
                                    np.tile(
                                        np.reshape(slice_cropped, (nx, ny, 1)),
                                        (batch_size, 1, 1, 1)))
                                mask_out, logits_out = sess.run(
                                    [mask_pl, softmax_pl],
                                    feed_dict={images_pl: network_input})
                                prediction_cropped = np.squeeze(
                                    logits_out[0, ...])

                                # ASSEMBLE BACK THE SLICES
                                slice_predictions = np.zeros(
                                    (x, y, num_channels))
                                # insert cropped region into original image again
                                if x > nx and y > ny:
                                    slice_predictions[x_s:x_s + nx, y_s:y_s + ny, :] = \
                                        prediction_cropped
                                else:
                                    if x <= nx and y > ny:
                                        slice_predictions[:, y_s:y_s + ny, :] = \
                                            prediction_cropped[x_c:x_c + x, :, :]
                                    elif x > nx and y <= ny:
                                        slice_predictions[x_s:x_s + nx, :, :] = \
                                            prediction_cropped[:, y_c:y_c + y, :]
                                    else:
                                        slice_predictions[:, :, :] = \
                                            prediction_cropped[x_c:x_c + x, y_c:y_c + y, :]

                                # RESCALING ON THE LOGITS
                                if gt_exists:
                                    prediction = transform.resize(
                                        slice_predictions,
                                        (mask.shape[0], mask.shape[1],
                                         num_channels),
                                        order=1,
                                        preserve_range=True,
                                        mode='constant')
                                else:
                                    # Rescaling by the inverse scale factors can
                                    # occasionally produce a slightly wrong volume
                                    # size; when gt_exists we therefore resize to
                                    # the gt mask shape above instead.
                                    prediction = transform.rescale(
                                        slice_predictions,
                                        (1.0 / scale_vector[0],
                                         1.0 / scale_vector[1], 1),
                                        order=1,
                                        preserve_range=True,
                                        multichannel=False,
                                        mode='constant')

                                # prediction = transform.resize(slice_predictions,
                                #                               (mask.shape[0], mask.shape[1], num_channels),
                                #                               order=1,
                                #                               preserve_range=True,
                                #                               mode='constant')

                                prediction = np.uint8(
                                    np.argmax(prediction, axis=-1))
                                predictions.append(prediction)

                            prediction_arr = np.transpose(
                                np.asarray(predictions, dtype=np.uint8),
                                (1, 2, 0))

                        elif exp_config.data_mode == '3D':

                            pixel_size = (img_dat[2].structarr['pixdim'][1],
                                          img_dat[2].structarr['pixdim'][2],
                                          img_dat[2].structarr['pixdim'][3])

                            scale_vector = (pixel_size[0] /
                                            exp_config.target_resolution[0],
                                            pixel_size[1] /
                                            exp_config.target_resolution[1],
                                            pixel_size[2] /
                                            exp_config.target_resolution[2])

                            vol_scaled = transform.rescale(img,
                                                           scale_vector,
                                                           order=1,
                                                           preserve_range=True,
                                                           multichannel=False,
                                                           mode='constant')

                            nz_max = exp_config.image_size[2]
                            slice_vol = np.zeros((nx, ny, nz_max),
                                                 dtype=np.float32)

                            nz_curr = vol_scaled.shape[2]
                            stack_from = (nz_max - nz_curr) // 2
                            stack_counter = stack_from

                            x, y, z = vol_scaled.shape

                            x_s = (x - nx) // 2
                            y_s = (y - ny) // 2
                            x_c = (nx - x) // 2
                            y_c = (ny - y) // 2

                            for zz in range(nz_curr):

                                slice_rescaled = vol_scaled[:, :, zz]

                                if x > nx and y > ny:
                                    slice_cropped = slice_rescaled[
                                        x_s:x_s + nx, y_s:y_s + ny]
                                else:
                                    slice_cropped = np.zeros((nx, ny))
                                    if x <= nx and y > ny:
                                        slice_cropped[x_c:x_c + x, :] = \
                                            slice_rescaled[:, y_s:y_s + ny]
                                    elif x > nx and y <= ny:
                                        slice_cropped[:, y_c:y_c + y] = \
                                            slice_rescaled[x_s:x_s + nx, :]
                                    else:
                                        slice_cropped[x_c:x_c + x, y_c:y_c + y] = \
                                            slice_rescaled[:, :]

                                slice_vol[:, :, stack_counter] = slice_cropped
                                stack_counter += 1

                            stack_to = stack_counter

                            network_input = np.float32(
                                np.reshape(slice_vol, (1, nx, ny, nz_max, 1)))
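                            # reshape adds the batch and channel dimensions
                            # expected by the images placeholder (see
                            # image_tensor_shape above)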

                            start_time = time.time()
                            mask_out, logits_out = sess.run(
                                [mask_pl, softmax_pl],
                                feed_dict={images_pl: network_input})

                            logging.info('Classified 3D: %f secs' %
                                         (time.time() - start_time))

                            prediction_nzs = logits_out[0, :, :,
                                                        stack_from:stack_to,
                                                        ...]  # non-zero-slices

                            if not prediction_nzs.shape[2] == nz_curr:
                                raise ValueError('sizes mismatch')

                            # ASSEMBLE BACK THE SLICES
                            prediction_scaled = np.zeros(
                                list(vol_scaled.shape) +
                                [num_channels
                                 ])  # last dim is for logits classes

                            # insert cropped region into original image again
                            if x > nx and y > ny:
                                prediction_scaled[x_s:x_s + nx, y_s:y_s + ny, :, :] = \
                                    prediction_nzs
                            else:
                                if x <= nx and y > ny:
                                    prediction_scaled[:, y_s:y_s + ny, :, :] = \
                                        prediction_nzs[x_c:x_c + x, :, :, :]
                                elif x > nx and y <= ny:
                                    prediction_scaled[x_s:x_s + nx, :, :, :] = \
                                        prediction_nzs[:, y_c:y_c + y, :, :]
                                else:
                                    prediction_scaled[:, :, :, :] = \
                                        prediction_nzs[x_c:x_c + x, y_c:y_c + y, :, :]

                            logging.info('Prediction_scaled mean %f' %
                                         (np.mean(prediction_scaled)))

                            prediction = transform.resize(
                                prediction_scaled,
                                (mask.shape[0], mask.shape[1], mask.shape[2],
                                 num_channels),
                                order=1,
                                preserve_range=True,
                                mode='constant')
                            prediction = np.argmax(prediction, axis=-1)
                            prediction_arr = np.asarray(prediction,
                                                        dtype=np.uint8)

                        # This is the same for 2D and 3D again
                        if do_postprocessing:
                            prediction_arr = image_utils.keep_largest_connected_components(
                                prediction_arr)

                        elapsed_time = time.time() - start_time
                        total_time += elapsed_time
                        total_volumes += 1

                        logging.info('Evaluation of volume took %f secs.' %
                                     elapsed_time)

                        if frame == ED_frame:
                            frame_suffix = '_ED'
                        elif frame == ES_frame:
                            frame_suffix = '_ES'
                        else:
                            raise ValueError(
                                'Frame does not correspond to ED or ES. frame = %d, ED = %d, ES = %d'
                                % (frame, ED_frame, ES_frame))

                        # Save predicted mask
                        out_file_name = os.path.join(
                            output_folder, 'prediction',
                            'patient' + patient_id + frame_suffix + '.nii.gz')
                        if gt_exists:
                            out_affine = mask_dat[1]
                            out_header = mask_dat[2]
                        else:
                            out_affine = img_dat[1]
                            out_header = img_dat[2]

                        logging.info('saving to: %s' % out_file_name)
                        utils.save_nii(out_file_name, prediction_arr,
                                       out_affine, out_header)

                        # Save image data to the same folder for convenience
                        image_file_name = os.path.join(
                            output_folder, 'image',
                            'patient' + patient_id + frame_suffix + '.nii.gz')
                        logging.info('saving to: %s' % image_file_name)
                        utils.save_nii(image_file_name, img_dat[0], out_affine,
                                       out_header)

                        if gt_exists:

                            # Save GT image
                            gt_file_name = os.path.join(
                                output_folder, 'ground_truth', 'patient' +
                                patient_id + frame_suffix + '.nii.gz')
                            logging.info('saving to: %s' % gt_file_name)
                            utils.save_nii(gt_file_name, mask, out_affine,
                                           out_header)

                            # Save difference mask between predictions and ground truth
                            difference_mask = np.where(
                                np.abs(prediction_arr - mask) > 0, [1], [0])
                            difference_mask = np.asarray(difference_mask,
                                                         dtype=np.uint8)
                            diff_file_name = os.path.join(
                                output_folder, 'difference', 'patient' +
                                patient_id + frame_suffix + '.nii.gz')
                            logging.info('saving to: %s' % diff_file_name)
                            utils.save_nii(diff_file_name, difference_mask,
                                           out_affine, out_header)

        logging.info('Average time per volume: %f' %
                     (total_time / total_volumes))

    return init_iteration
def prepare_data(input_folder, output_file, size, target_resolution, labels_list, rescale_to_one, offset=None, image_postfix='.nii.gz'):

    '''
    Main function that prepares a dataset from the raw challenge data to an hdf5 dataset
    '''

    csv_summary_file = os.path.join(input_folder, 'summary_alldata.csv')

    summary = pd.read_csv(csv_summary_file)
    summary = summary.loc[summary['image_exists'] == True]
    summary = summary.loc[~(summary['diagnosis_3cat'] == 'unknown')]  # Don't use images with unknown diagnosis

    # Get list of unique rids
    rids = summary.rid.unique()

    # Get initial diagnosis for rough stratification
    diagnoses = []
    for rid in rids:
        diagnoses.append(summary.loc[summary['rid'] == rid]['diagnosis_3cat'].values[0])

    train_and_val_rids, test_rids, train_and_val_diagnoses, _ = train_test_split(rids, diagnoses, test_size=0.2, stratify=diagnoses)
    train_rids, val_rids = train_test_split(train_and_val_rids, test_size=0.2, stratify=train_and_val_diagnoses)
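    # Two successive 80/20 splits give roughly 64% train, 16% validation and 20%
    # test subjects, stratified by the baseline diagnosis.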

    print(len(train_rids), len(test_rids), len(val_rids))

    # n_images_train = len(summary.loc[summary['rid'].isin(train_rids)])
    # n_images_test = len(summary.loc[summary['rid'].isin(test_rids)])
    # n_images_val = len(summary.loc[summary['rid'].isin(val_rids)])

    hdf5_file = h5py.File(output_file, "w")

    diag_list = {'test': [], 'train': [], 'val': []}
    weight_list = {'test': [], 'train': [], 'val': []}
    age_list = {'test': [], 'train': [], 'val': []}
    gender_list = {'test': [], 'train': [], 'val': []}
    rid_list = {'test': [], 'train': [], 'val': []}
    viscode_list = {'test': [], 'train': [], 'val': []}
    adas13_list = {'test': [], 'train': [], 'val': []}
    mmse_list = {'test': [], 'train': [], 'val': []}
    field_strength_list = {'test': [], 'train': [], 'val': []}

    file_list = {'test': [], 'train': [], 'val': []}

    logging.info('Counting files and parsing meta data...')

    for train_test, set_rids in zip(['train', 'test', 'val'], [train_rids, test_rids, val_rids]):

        for ii, row in summary.iterrows():

            rid = row['rid']
            if rid not in set_rids:
                continue

            diagnosis_str = row['diagnosis_3cat']
            diagnosis = diagnosis_dict[diagnosis_str]
            if diagnosis not in labels_list:
                continue

            rid_list[train_test].append(rid)
            diag_list[train_test].append(diagnosis)

            viscode = row['viscode']
            viscode_list[train_test].append(viscode_dict[viscode])
            weight_list[train_test].append(row['weight'])
            age_list[train_test].append(row['age'])
            gender_list[train_test].append(gender_dict[row['gender']])
            adas13_list[train_test].append(fix_nan_and_unknown(row['adas13'], target_data_format=np.float32))
            mmse_list[train_test].append(fix_nan_and_unknown(row['mmse'], target_data_format=np.uint8))

            field_strength = row['field_strength']
            field_strength_list[train_test].append(field_strength)

            phase = row['phase']

            file_name = 'rid_%s/%s_%sT_%s_rid%s_%s%s' % (str(rid).zfill(4),
                                                         phase.lower(),
                                                         str(field_strength),
                                                         diagnosis_str,
                                                         str(rid).zfill(4),
                                                         viscode,
                                                         image_postfix)
            file_list[train_test].append(os.path.join(input_folder, file_name))


    # Write the small datasets
    for tt in ['test', 'train', 'val']:

        hdf5_file.create_dataset('rid_%s' % tt, data=np.asarray(rid_list[tt], dtype=np.uint16))
        hdf5_file.create_dataset('viscode_%s' % tt, data=np.asarray(viscode_list[tt], dtype=np.uint8))
        hdf5_file.create_dataset('diagnosis_%s' % tt, data=np.asarray(diag_list[tt], dtype=np.uint8))
        hdf5_file.create_dataset('age_%s' % tt, data=np.asarray(age_list[tt], dtype=np.float32))
        hdf5_file.create_dataset('weight_%s' % tt, data=np.asarray(weight_list[tt], dtype=np.float32))
        hdf5_file.create_dataset('gender_%s' % tt, data=np.asarray(gender_list[tt], dtype=np.uint8))
        hdf5_file.create_dataset('adas13_%s' % tt, data=np.asarray(adas13_list[tt], dtype=np.float32))
        hdf5_file.create_dataset('mmse_%s' % tt, data=np.asarray(mmse_list[tt], dtype=np.uint8))
        hdf5_file.create_dataset('field_strength_%s' % tt, data=np.asarray(field_strength_list[tt], dtype=np.float16))


    n_train = len(file_list['train'])
    n_test = len(file_list['test'])
    n_val = len(file_list['val'])

    # assert n_train == n_images_train, 'Mismatch in data sizes, %d not == %d' % (n_train, n_images_train)
    # assert n_test == n_images_test, 'Mismatch in data sizes, %d not == %d' % (n_test, n_images_test)
    # assert n_val == n_images_val, 'Mismatch in data sizes, %d not == %d' % (n_val, n_images_val)

    # Create datasets for images and masks
    data = {}
    for tt, num_points in zip(['test', 'train', 'val'], [n_test, n_train, n_val]):
        data['images_%s' % tt] = hdf5_file.create_dataset("images_%s" % tt, [num_points] + list(size), dtype=np.float32)

    img_list = {'test': [], 'train': [], 'val': []}

    logging.info('Parsing image files')

    for train_test in ['test', 'train', 'val']:

        write_buffer = 0
        counter_from = 0

        for file in file_list[train_test]:

            logging.info('-----------------------------------------------------------')
            logging.info('Doing: %s' % file)

            img_dat = utils.load_nii(file)
            img = img_dat[0].copy()

            pixel_size = (img_dat[2].structarr['pixdim'][1],
                          img_dat[2].structarr['pixdim'][2],
                          img_dat[2].structarr['pixdim'][3])

            logging.info('Pixel size:')
            logging.info(pixel_size)


            scale_vector = [pixel_size[0] / target_resolution[0],
                            pixel_size[1] / target_resolution[1],
                            pixel_size[2] / target_resolution[2]]

            img_scaled = transform.rescale(img,
                                           scale_vector,
                                           order=1,
                                           preserve_range=True,
                                           multichannel=False,
                                           mode='constant')

            img_resized = crop_or_pad_slice_to_size(img_scaled, size, offset=offset)

            if rescale_to_one:
                img_resized = image_utils.map_image_to_intensity_range(img_resized, -1, 1)
            else:
                img_resized = image_utils.normalise_image(img_resized)


            ### DEBUGGING ############################################
            # utils.create_and_save_nii(img_resized, 'debug.nii.gz')
            # exit()
            #########################################################

            img_list[train_test].append(img_resized)

            write_buffer += 1

            if write_buffer >= MAX_WRITE_BUFFER:

                counter_to = counter_from + write_buffer
                _write_range_to_hdf5(data, train_test, img_list, counter_from, counter_to)
                _release_tmp_memory(img_list, train_test)

                # reset stuff for next iteration
                counter_from = counter_to
                write_buffer = 0



        # after file loop: Write the remaining data

        logging.info('Writing remaining data')
        counter_to = counter_from + write_buffer

        _write_range_to_hdf5(data, train_test, img_list, counter_from, counter_to)
        _release_tmp_memory(img_list, train_test)


    # After test train loop:
    hdf5_file.close()
def prepare_data(input_folder,
                 output_file,
                 size,
                 target_resolution,
                 labels_list,
                 rescale_to_one,
                 image_postfix='.nii.gz'):
    '''
    Main function that prepares a dataset from the raw challenge data to an hdf5 dataset
    '''

    csv_summary_file = os.path.join(input_folder, 'summary_screening.csv')

    summary = pd.read_csv(csv_summary_file)
    summary = summary.loc[summary['image_exists'] == True]

    train_and_val_cases, test_cases = train_test_split(
        summary, test_size=0.2, stratify=summary['diagnosis_3cat'])
    train_cases, val_cases = train_test_split(
        train_and_val_cases,
        test_size=0.2,
        stratify=train_and_val_cases['diagnosis_3cat'])

    hdf5_file = h5py.File(output_file, "w")

    diag_list = {'test': [], 'train': [], 'val': []}
    weight_list = {'test': [], 'train': [], 'val': []}
    age_list = {'test': [], 'train': [], 'val': []}
    gender_list = {'test': [], 'train': [], 'val': []}
    rid_list = {'test': [], 'train': [], 'val': []}
    confidence_list = {'test': [], 'train': [], 'val': []}
    adas13_list = {'test': [], 'train': [], 'val': []}
    mmse_list = {'test': [], 'train': [], 'val': []}
    field_strength_list = {'test': [], 'train': [], 'val': []}

    file_list = {'test': [], 'train': [], 'val': []}

    logging.info('Counting files and parsing meta data...')

    for train_test, sum_df in zip(['train', 'test', 'val'],
                                  [train_cases, test_cases, val_cases]):

        for ii, row in sum_df.iterrows():

            diagnosis_str = row['diagnosis_3cat']
            diagnosis = diagnosis_dict[diagnosis_str]

            if diagnosis not in labels_list:
                continue

            diag_list[train_test].append(diagnosis)

            rid = row['rid']
            rid_list[train_test].append(rid)

            confidence = fix_nan_and_unknown(np.float16,
                                             row['confidence'],
                                             nan_val=255,
                                             unknown_val=254)
            confidence_list[train_test].append(confidence)
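            # NaN / unknown confidence values appear to be encoded with the
            # sentinel codes 255 and 254 so that they still fit into the uint8
            # dataset written below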

            weight_list[train_test].append(row['weight'])
            age_list[train_test].append(row['age'])
            gender_list[train_test].append(gender_dict[row['gender']])
            adas13_list[train_test].append(row['adas13'])
            mmse_list[train_test].append(row['mmse'])

            field_strength = row['field_strength']
            field_strength_list[train_test].append(field_strength)

            phase = row['phase']

            file_name = '%s_%sT_%s_rid%s%s' % (
                phase.lower(), str(field_strength), diagnosis_str,
                str(rid).zfill(4), image_postfix)
            file_list[train_test].append(os.path.join(input_folder, file_name))

    # Write the small datasets
    for tt in ['test', 'train', 'val']:

        hdf5_file.create_dataset('rid_%s' % tt,
                                 data=np.asarray(rid_list[tt],
                                                 dtype=np.uint16))
        hdf5_file.create_dataset('confidence_%s' % tt,
                                 data=np.asarray(confidence_list[tt],
                                                 dtype=np.uint8))
        hdf5_file.create_dataset('diagnosis_%s' % tt,
                                 data=np.asarray(diag_list[tt],
                                                 dtype=np.uint8))
        hdf5_file.create_dataset('age_%s' % tt,
                                 data=np.asarray(age_list[tt],
                                                 dtype=np.float32))
        hdf5_file.create_dataset('weight_%s' % tt,
                                 data=np.asarray(weight_list[tt],
                                                 dtype=np.float32))
        hdf5_file.create_dataset('gender_%s' % tt,
                                 data=np.asarray(gender_list[tt],
                                                 dtype=np.uint8))
        hdf5_file.create_dataset('adas13_%s' % tt,
                                 data=np.asarray(adas13_list[tt],
                                                 dtype=np.float32))
        hdf5_file.create_dataset('mmse_%s' % tt,
                                 data=np.asarray(mmse_list[tt],
                                                 dtype=np.uint8))
        hdf5_file.create_dataset('field_strength_%s' % tt,
                                 data=np.asarray(field_strength_list[tt],
                                                 dtype=np.float16))

    n_train = len(file_list['train'])
    n_test = len(file_list['test'])
    n_val = len(file_list['val'])

    # Create datasets for images and masks
    data = {}
    for tt, num_points in zip(['test', 'train', 'val'],
                              [n_test, n_train, n_val]):
        data['images_%s' % tt] = hdf5_file.create_dataset(
            "images_%s" % tt, [num_points] + list(size), dtype=np.float32)

    img_list = {'test': [], 'train': [], 'val': []}

    logging.info('Parsing image files')

    for train_test in ['test', 'train', 'val']:

        write_buffer = 0
        counter_from = 0

        for file in file_list[train_test]:

            logging.info(
                '-----------------------------------------------------------')
            logging.info('Doing: %s' % file)

            img_dat = utils.load_nii(file)
            img = img_dat[0].copy()

            pixel_size = (img_dat[2].structarr['pixdim'][1],
                          img_dat[2].structarr['pixdim'][2],
                          img_dat[2].structarr['pixdim'][3])

            logging.info('Pixel size:')
            logging.info(pixel_size)

            scale_vector = [
                pixel_size[0] / target_resolution[0],
                pixel_size[1] / target_resolution[1],
                pixel_size[2] / target_resolution[2]
            ]

            img_scaled = transform.rescale(img,
                                           scale_vector,
                                           order=1,
                                           preserve_range=True,
                                           multichannel=False,
                                           mode='constant')

            if rescale_to_one:
                img_scaled = image_utils.map_image_to_intensity_range(
                    img_scaled, -1, 1)
            else:
                img_scaled = image_utils.normalise_image(img_scaled)

            img_resized = crop_or_pad_slice_to_size(img_scaled, size)
            img_list[train_test].append(img_resized)

            write_buffer += 1

            if write_buffer >= MAX_WRITE_BUFFER:

                counter_to = counter_from + write_buffer
                _write_range_to_hdf5(data, train_test, img_list, counter_from,
                                     counter_to)
                _release_tmp_memory(img_list, train_test)

                # reset stuff for next iteration
                counter_from = counter_to
                write_buffer = 0

        # after file loop: Write the remaining data

        logging.info('Writing remaining data')
        counter_to = counter_from + write_buffer

        _write_range_to_hdf5(data, train_test, img_list, counter_from,
                             counter_to)
        _release_tmp_memory(img_list, train_test)

    # After test train loop:
    hdf5_file.close()
def prepare_data(input_folder,
                 output_file,
                 mode,
                 size,
                 target_resolution,
                 split_test_train=True):
    '''
    Main function that prepares a dataset from the raw challenge data to an hdf5 dataset
    '''
    assert (mode in ['2D']), 'Unknown mode: %s' % mode
    if mode == '2D' and not len(size) == 2:
        raise AssertionError('Inadequate number of size parameters')
    if mode == '2D' and not len(target_resolution) == 2:
        raise AssertionError(
            'Inadequate number of target resolution parameters')

    hdf5_file = h5py.File(output_file, "w")
    file_list = {'test': [], 'train': []}
    num_slices = {'test': 0, 'train': 0}

    logging.info('Counting files and parsing meta data...')

    cptImage = 0
    for file in glob.glob(os.path.join(input_folder, '*_image.nii.gz')):
        if split_test_train:
            # every fifth image goes to the test set, aiming for an 80/20 split
            train_test = 'test' if (cptImage % 5 == 1) else 'train'
        else:
            train_test = 'train'

        file_list[train_test].append(file)
        cptImage += 1

        nifty_img = nib.load(file)
        num_slices[train_test] += nifty_img.shape[2]

    # Write the small datasets
    nx, ny = size
    n_test = num_slices['test']
    n_train = num_slices['train']

    # Create datasets for images and masks
    data = {}
    for tt, num_points in zip(['test', 'train'], [n_test, n_train]):

        if num_points > 0:
            data['images_%s' % tt] = hdf5_file.create_dataset(
                "images_%s" % tt, [num_points] + list(size), dtype=np.float32)
            data['masks_%s' % tt] = hdf5_file.create_dataset(
                "masks_%s" % tt, [num_points] + list(size), dtype=np.uint8)

    mask_list = {'test': [], 'train': []}
    img_list = {'test': [], 'train': []}

    logging.info('Parsing image files')

    train_test_range = ['test', 'train'] if split_test_train else ['train']
    logging.info("split_test_train : ")
    logging.info(split_test_train)
    for train_test in train_test_range:

        write_buffer = 0
        counter_from = 0

        for file in file_list[train_test]:
            logging.info(
                '-----------------------------------------------------------')
            logging.info('Doing: %s' % file)

            file_base = file.split('.nii.gz')[0]
            file_mask = file_base + '_gt.nii.gz'

            img_dat = utils.load_nii(file)
            mask_dat = utils.load_nii(file_mask)

            img = img_dat[0].copy()
            mask = mask_dat[0].copy()

            img = image_utils.normalise_image(img)

            pixel_size = (img_dat[2].structarr['pixdim'][1],
                          img_dat[2].structarr['pixdim'][2],
                          img_dat[2].structarr['pixdim'][3])

            logging.info('Pixel size:')
            logging.info(pixel_size)

            ### PROCESSING LOOP FOR SLICE-BY-SLICE 2D DATA ###################
            scale_vector = [
                pixel_size[0] / target_resolution[0],
                pixel_size[1] / target_resolution[1]
            ]

            for zz in range(img.shape[2]):

                slice_img = np.squeeze(img[:, :, zz])
                slice_rescaled = transform.rescale(
                    slice_img,
                    scale_vector,
                    order=1,
                    preserve_range=True,
                    #multichannel=False,
                    mode='constant')

                slice_mask = np.squeeze(mask[:, :, zz])
                mask_rescaled = transform.rescale(
                    slice_mask,
                    scale_vector,
                    order=0,
                    preserve_range=True,
                    #multichannel=False,
                    mode='constant')

                slice_cropped = crop_or_pad_slice_to_size(
                    slice_rescaled, nx, ny)
                mask_cropped = crop_or_pad_slice_to_size(mask_rescaled, nx, ny)

                img_list[train_test].append(slice_cropped)
                mask_list[train_test].append(mask_cropped)

                write_buffer += 1

                # Writing needs to happen inside the loop over the slices
                if write_buffer >= MAX_WRITE_BUFFER:

                    counter_to = counter_from + write_buffer
                    _write_range_to_hdf5(data, train_test, img_list, mask_list,
                                         counter_from, counter_to)
                    _release_tmp_memory(img_list, mask_list, train_test)

                    # reset stuff for next iteration
                    counter_from = counter_to
                    write_buffer = 0

        # after file loop: Write the remaining data

        logging.info('Writing remaining data')
        counter_to = counter_from + write_buffer
        _write_range_to_hdf5(data, train_test, img_list, mask_list,
                             counter_from, counter_to)
        _release_tmp_memory(img_list, mask_list, train_test)

    # After test train loop:
    hdf5_file.close()
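

# NOTE: _write_range_to_hdf5() and _release_tmp_memory() are the chunked-writing
# helpers called in the functions above but not shown in this snippet. A minimal
# sketch of the image+mask variant, assuming numpy is imported as np (the examples
# that process images only would use an analogous version without the mask
# arguments; illustrative only, not necessarily the original implementation):
def _write_range_to_hdf5(hdf5_data, train_test, img_list, mask_list,
                         counter_from, counter_to):
    '''Write the buffered slices/volumes into the preallocated hdf5 datasets.'''
    img_arr = np.asarray(img_list[train_test], dtype=np.float32)
    mask_arr = np.asarray(mask_list[train_test], dtype=np.uint8)

    hdf5_data['images_%s' % train_test][counter_from:counter_to, ...] = img_arr
    hdf5_data['masks_%s' % train_test][counter_from:counter_to, ...] = mask_arr


def _release_tmp_memory(img_list, mask_list, train_test):
    '''Clear the in-memory buffers so the next chunk starts from an empty list.'''
    img_list[train_test].clear()
    mask_list[train_test].clear()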