Example #1
0
    def extract_patches(self, h5db, new_folder):
        """
        Disabled stub: report that OpenSlide is required and return None.

        The method prints a notice and returns immediately, so no patch is
        extracted and neither argument is used. The previous implementation
        is kept below the early return for reference only; it never runs.

        Parameters
        ----------
        h5db :
            Unused while the method is disabled. In the disabled code it is
            the HDF5 database handed to ``self.store``.
        new_folder :
            Unused while the method is disabled. In the disabled code it is
            the directory where the PNG overview images are saved.

        Returns
        -------
        None
        """
        # Call-style print: the former `print '...'` statement is a
        # SyntaxError on Python 3; the output for a single string is the same.
        print('OpenSlide needed to extract patches.')
        return None

        # ------------------------------------------------------------------
        # UNREACHABLE: everything below is the disabled implementation.
        # NOTE(review): before re-enabling it, be aware that
        #   * the 'camelyon16' branch reads `slide` and `rgb_im`, whose
        #     assignments are commented out in that branch;
        #   * the other branch still contains a `pdb.set_trace()` breakpoint.
        # ------------------------------------------------------------------
        '''
        for centre in self.centres:
            print('[cnn][patch_extraction] Selected Centre: ', centre)
            # each centre may have more than one annotation XML file, so here we retrieve
            # a list of all the XMLs related to the current centre
            annotation_list = np.sort(self.get_annotation_list(centre, self.xml_source_fld))
            # for each XML file in the annotation list
            # we want to extract tumor and normal patches
            for xml_file in annotation_list:
                files_counter +=1 # variable to shape the final data vector
        '''
        print('[debug] ', self.name)
        print('[debug] ', self.settings)

        self.set_files_counter(self.count_annotation_files())

        print('[dataset] {0} [extract_patches] {1} total annotation files.'.
              format(self.name, self.files_counter))

        for centre in self.centres:
            annotation_list = self.get_annotation_list(centre)
            for xml_file in annotation_list:
                slide_path = self.get_wsi_path(centre, xml_file)
                xml_path = os.path.join(self.xml_source_fld, xml_file)
                # retrieving the information about the file analysed.
                #   info is a dictionary with the following keys:
                #   info['centre'], current centre number
                #   info['patient'], current patient number
                #   info['node'], current WSI node
                info = self.get_info(xml_path, centre)
                #functions.setDBHierarchy(h5db, self.settings,info)
                if info['patient'] == '008_Mask.tif':
                    continue
                if xml_path != None:  ## add check slide is open and ok
                    # preprocess takes the WSI path, and the slide_level and returns the
                    # the WSI openslide obj, the tumor annotation mask, the WSI image
                    # and the tumor contours

                    if self.name == 'camelyon16':
                        print('import openslides')
                        #slide = openslide.OpenSlide(slide_path)
                        #rgb_im = np.array(slide.read_region((0,0),7,slide.level_dimensions[7]))
                        #mask_file = xml_path+'Tumor_{}_Mask.tif'.format(info['patient'])
                        #import pdb; pdb.set_trace()
                        annotations = np.asarray(
                            openslide.OpenSlide(xml_path).read_region(
                                (0, 0), 7, slide.level_dimensions[7]))
                        annotations_mask = annotations[:, :, 0]
                        #import pdb; pdb.set_trace()
                        im_contour = rgb_im

                    else:
                        import pdb
                        pdb.set_trace()
                        slide, annotations_mask, rgb_im, im_contour = functions.preprocess(
                            slide_path,
                            xml_path,
                            slide_level=self.settings['slide_level'])

                    tum_patch_list, tum_patch_point = integral.patch_sampling_using_integral(
                        slide, self.settings['slide_level'], annotations_mask,
                        self.settings['patch_size'],
                        self.settings['n_samples'])
                    # conversion of the lists to np arrays
                    tum_patch_array = np.asarray(tum_patch_list)
                    #import pdb; pdb.set_trace()
                    tum_locations = np.array(tum_patch_point)
                    # storage in the HDF5 db
                    self.store(h5db, info, tum_patch_array, tum_locations,
                               'tumor')

                    # reverting the tumor mask to find normal tissue and extract patches
                    #    Note :
                    #    normal_mask = tissue mask(morp_im) - tumor mask(annotations_mask)

                    ##### restart from here ##

                    morp_im = functions.get_morp_im(rgb_im)
                    normal_im = morp_im - annotations_mask  ## np.min(normal_im) := -1.0
                    normal_im = normal_im == 1.0
                    normal_im = (normal_im).astype(int)
                    # sampling normal patches with uniform distribution
                    nor_patch_list, nor_patch_point = integral.patch_sampling_using_integral(
                        slide, self.settings['slide_level'], normal_im,
                        self.settings['patch_size'],
                        self.settings['n_samples'])
                    nor_patch_array = np.asarray(nor_patch_list)
                    normal_patches_locations = np.array(nor_patch_point)
                    # storing the normal patches and their locations
                    self.store(h5db, info, nor_patch_array, nor_patch_point,
                               'normal')
                    ''' Visualisation '''

                    # plotting the tumor locations in the XML file
                    # Drawing the normal patches sampling points
                    # tumor_locations.png shows the tumor patches locations in red
                    # and the normal patches locations in green
                    tumor_locations_im = rgb_im
                    plt.figure()
                    plt.imshow(tumor_locations_im)
                    for p_x, p_y in normal_patches_locations:
                        plt.scatter(p_y, p_x, c='g')
                        #cv2.circle(tumor_locations_im,(p_y,p_x),30,(0,255,0),10)
                    for p_x, p_y in tum_locations:
                        plt.scatter(p_y, p_x, c='r')
                        #cv2.circle(tumor_locations_im,(p_y,p_x),30,(255,0,0), 10)
                    print(
                        '[cnn][patch_extraction] Saving tumor locations image')
                    plt.savefig(
                        os.path.join(
                            new_folder,
                            'level{}_centre{}_patient{}_node{}_tumor_locations.png'
                            .format(self.settings['slide_level'],
                                    info['centre'], info['patient'],
                                    info['node'])))
                    plt.close()
                    #print('Saving tumor locations image')
                    #plt.savefig('tumor_locations_patient0{}_node{}'.format(info['patient'], info['node']))

                    print(
                        '[cnn][patch_extraction] Saving annotation mask and normal tissue mask'
                    )
                    plt.figure()
                    plt.imshow(annotations_mask)
                    plt.savefig(
                        os.path.join(
                            new_folder,
                            'level{}_centre{}_patient{}_node{}_annotation_mask.png'
                            .format(self.settings['slide_level'],
                                    info['centre'], info['patient'],
                                    info['node'])))
                    plt.close()

                    plt.figure()
                    plt.imshow(normal_im)
                    plt.savefig(
                        os.path.join(
                            new_folder,
                            'level{}_centre{}_patient{}_node{}_normal_tissue_mask.png'
                            .format(self.settings['slide_level'],
                                    info['centre'], info['patient'],
                                    info['node'])))
                    plt.close()
                    plt.close('all')

                    self.tum_counter += len(tum_patch_array)
                    self.nor_counter += len(nor_patch_array)
                    #self.nor_counter = 0
        return
Example #2
0
    def extract_patches(self):
        """
        Sample normal and tumor patches for every patient of every centre.

        For each patient returned by ``self.get_patients(centre)``:

        1. the results directory is obtained from ``self.make_patient_dir``
           and a fresh H5 DB ``<results dir>/<self.h5db_bname>.h5`` is opened
           in ``'w'`` mode;
        2. the slide is preprocessed and the normal-tissue mask is derived as
           ``get_morp_im(rgb_im) - annotations_mask == 1``;
        3. the annotation mask and the normal-tissue mask are saved as images;
        4. normal patches, then tumor patches, are sampled in batches with
           ``integral.patch_sampling`` and every non-empty batch is stored
           with ``self.store_patient``;
        5. when ``settings['method'] == 'random'``, a scatter image of the
           sampled locations (normal in green, tumor in red) is saved.

        A patient whose results directory or H5 DB cannot be set up is logged,
        counted as an error and skipped. Any other exception propagates to
        the caller, but the patient's H5 DB is closed first.

        Side effects: ``self.nor_counter`` and ``self.tum_counter`` grow by
        the number of stored patches; ``self.report['errors']`` and
        ``self.report['warnings']`` are set on completion.

        Returns
        -------
        None
        """
        errors = 0
        warnings = 0  # never incremented in this method; reported as-is
        settings = self.config['settings']

        # Settings forwarded as keyword options to integral.patch_sampling.
        sampling_keys = (
            'area_overlap',
            'bad_batch_size',
            'gray_threshold',
            'margin_width_x',
            'margin_width_y',
            'method',
            'n_samples',
            'patch_size',
            'slide_level',
            'white_level',
            'white_threshold',
            'white_threshold_incr',
            'white_threshold_max',
        )

        for centre in self.centres:
            for patient in self.get_patients(centre):
                self.logger.info('processing patient: {}'.format(patient))

                slide_path = self.get_wsi_path(centre, patient)
                xml_path = self.get_annotation_path(centre, patient)
                info = self.get_info(centre, patient)

                pat_res_dir = self.make_patient_dir(info)
                if not pat_res_dir:
                    self.logger.error(
                        "patient {}: problems with results dir...".format(
                            patient))
                    errors += 1
                    continue

                h5db_path = os.path.join(pat_res_dir, self.h5db_bname + '.h5')
                try:
                    h5db = hd.File(h5db_path, 'w')
                except Exception as e:
                    self.logger.error(
                        "patient {}: can't open my H5 DB '{}': {} ".format(
                            patient, h5db_path, e))
                    errors += 1
                    continue

                # From here on the DB is open: make sure it is closed even if
                # preprocessing, sampling or plotting raises.
                try:
                    slide, annotations_mask, rgb_im, _ = preprocess(
                        slide_path,
                        xml_path,
                        slide_level=settings['slide_level'])

                    # reverting the tumor mask to find normal tissue:
                    #    normal_mask = tissue mask (morp_im)
                    #                  - tumor mask (annotations_mask)
                    # the difference can be negative, hence the explicit
                    # `== 1.0` instead of a truthiness test
                    morp_im = get_morp_im(rgb_im)
                    normal_im = ((morp_im - annotations_mask) == 1.0).astype(int)

                    # masks are the same for any sample batch, so they are
                    # saved once per patient
                    # [TO-DO] make switchable from config/CL
                    plt.figure()
                    plt.imshow(annotations_mask)
                    img_file = self.get_image_fname(pat_res_dir,
                                                    'annotation_mask', info)
                    plt.savefig(img_file)
                    plt.close()
                    self.logger.info(
                        'patient {}: Annotation mask image saved to: {}'.format(
                            patient, img_file))

                    plt.figure()
                    plt.imshow(normal_im)
                    img_file = self.get_image_fname(pat_res_dir,
                                                    'normal_tissue_mask', info)
                    plt.savefig(img_file)
                    plt.close()
                    self.logger.info(
                        'patient {}: Normal tissue mask image saved to: {}'.format(
                            patient, img_file))

                    opts = {key: settings[key] for key in sampling_keys}

                    # batch sample & store -- keep it small to avoid OOM!  In
                    # "linear" sampling mode more batches might be needed, so
                    # each run returns the extracted patches and the last
                    # index; loop until no patches come out.
                    #
                    # The tumor annotation mask is usually (much) smaller than
                    # the normal tissue mask, so the two cases need a
                    # different number of batches and are looped separately.
                    # N.B. In 'random' sampling mode just one batch is done.
                    #
                    # [TO-DO] store info in _per-patient_ H5 DB
                    bcnt_t, bcnt_n = 0, 0
                    last_idx_t = last_idx_n = -1

                    # Reset per patient: the scatter image below reads this,
                    # and it must neither be unbound nor carry over the
                    # previous patient's points when no normal patch is found.
                    normal_patches_locations = []

                    if settings['window']:
                        self.logger.info(
                            "patient {}: restricting nonzero points range to {}%, {}%"
                            .format(patient, settings['window'][0],
                                    settings['window'][1]))
                    nzx_n, nzy_n = integral.nonzero_range(normal_im,
                                                          settings['window'])

                    # *** Warning! *** The tumor loop depends on the normal
                    # loop through "normal_patches_locations" (scatter image).

                    # normal tissue
                    while True:
                        self.logger.info(
                            "patient {}: >>> [normal] starting batch {}".format(
                                patient, bcnt_n))

                        opts['start_idx'] = last_idx_n + 1
                        nor_patch_list, nor_patch_point, last_idx_n = integral.patch_sampling(
                            slide, normal_im, nzx_n, nzy_n, **opts)
                        if not (nor_patch_point and nor_patch_list):
                            self.logger.info(
                                'patient {}: batch {}: no (more) normal patches'.
                                format(patient, bcnt_n))
                            break

                        nor_patch_array = np.asarray(nor_patch_list)
                        normal_patches_locations = np.array(nor_patch_point)
                        # NOTE(review): the raw point list is stored here,
                        # whereas the tumor case stores the ndarray -- kept
                        # as in the original; confirm against store_patient.
                        self.store_patient(info, nor_patch_array,
                                           nor_patch_point, 'normal', h5db,
                                           bcnt_n)

                        self.nor_counter += len(nor_patch_array)

                        self.logger.info(
                            "patient {}: <<< [normal] done batch {}".format(
                                patient, bcnt_n))

                        if last_idx_n is None:
                            # in 'random' method, this tells us that we're
                            # done sampling
                            break

                        bcnt_n += 1
                    # {end-while}

                    # TO-DO: batch runs should be better encapsulated
                    # (aux fun/method)...

                    # tumor masks are usually too small for windowed sampling,
                    # so take the full range
                    nzx_t, nzy_t = integral.nonzero_range(annotations_mask, [])
                    while True:
                        self.logger.info(
                            "patient {}: >>> [tumor] starting batch {}".format(
                                patient, bcnt_t))

                        opts['start_idx'] = last_idx_t + 1
                        tum_patch_list, tum_patch_point, last_idx_t = integral.patch_sampling(
                            slide, annotations_mask, nzx_t, nzy_t, **opts)
                        if not (tum_patch_list and tum_patch_point):
                            self.logger.info(
                                'patient {}: batch {}: no (more) tumor patches'.
                                format(patient, bcnt_t))
                            break

                        tum_patch_array = np.asarray(tum_patch_list)
                        tum_locations = np.array(tum_patch_point)
                        self.store_patient(info, tum_patch_array,
                                           tum_locations, 'tumor', h5db,
                                           bcnt_t)

                        if opts['method'] == 'random':
                            if bcnt_n != bcnt_t:
                                self.logger.error(
                                    "[BUG] Can't make scatter image(s): batch count mismatch"
                                )
                                errors += 1
                            else:
                                # scatter image over the slide picture: tumor
                                # patch locations in red, normal patch
                                # locations (from the normal run above) in
                                # green
                                plt.figure()
                                plt.imshow(rgb_im)
                                for p_x, p_y in normal_patches_locations:
                                    plt.scatter(p_y, p_x, c='g')
                                for p_x, p_y in tum_locations:
                                    plt.scatter(p_y, p_x, c='r')

                                img_file = self.get_image_fname(
                                    pat_res_dir, 'tumor_locations', info,
                                    bcnt_t)
                                plt.savefig(img_file)
                                plt.close()
                                self.logger.info(
                                    'patient {}: batch {}: tumor locations image saved to: {}'
                                    .format(patient, bcnt_t, img_file))

                        self.tum_counter += len(tum_patch_array)

                        self.logger.info(
                            "patient {}: <<< [tumor] done batch {}".format(
                                patient, bcnt_t))

                        if last_idx_t is None:
                            # in 'random' method, this tells us that we're
                            # done sampling
                            break

                        bcnt_t += 1
                    # {end-while}
                finally:
                    h5db.close()

                self.logger.info(
                    "patient {}: processed in {} (normal) + {} (tumor) batches"
                    .format(patient, bcnt_n, bcnt_t))
                self.logger.info("patient {}: data saved to H5 DB: {}".format(
                    patient, h5db_path))
            # {end-for-patient}
        # {end-for-centre}

        self.report['errors'] = errors
        self.report['warnings'] = warnings