def extract_patches(self, h5db, new_folder):
    """Report that patch extraction is unavailable and extract nothing.

    This variant of the method is a disabled stub: it prints a notice that
    OpenSlide is required and returns immediately. Both arguments are
    accepted only so that existing callers keep working; neither is read.

    Args:
        h5db: HDF5 database handle supplied by the caller (unused).
        new_folder: Output folder supplied by the caller (unused).

    Returns:
        None, always.
    """
    # NOTE: the legacy OpenSlide-based pipeline that used to follow the
    # early return could never execute (and relied on pdb breakpoints and
    # on names that were never assigned), so it is no longer carried here.
    # The call form of print behaves the same on Python 2 and Python 3 for
    # a single string argument.
    print('OpenSlide needed to extract patches.')
    return None
def extract_patches(self):
    """Extract normal and tumor patches for every patient of every centre.

    For each patient returned by ``self.get_patients(centre)``:

    * a results directory is obtained from ``self.make_patient_dir``;
    * a fresh HDF5 file ``<results dir>/<self.h5db_bname>.h5`` is opened in
      write mode;
    * mask computation, batch sampling, storage and image output are
      delegated to ``self._extract_patient_patches``.

    A patient whose results directory is missing, or whose HDF5 file cannot
    be opened, is logged, counted as an error and skipped. Any other
    exception propagates to the caller, but the patient's HDF5 file is
    closed first.

    Side effects: ``self.nor_counter`` and ``self.tum_counter`` grow by the
    number of stored patches, and ``self.report['errors']`` /
    ``self.report['warnings']`` are set once all patients are processed.

    Returns:
        None.
    """
    errors = 0
    warnings = 0  # nothing is counted as a warning yet; kept for the report
    settings = self.config['settings']

    for centre in self.centres:
        for patient in self.get_patients(centre):
            self.logger.info('processing patient: {}'.format(patient))
            slide_path = self.get_wsi_path(centre, patient)
            xml_path = self.get_annotation_path(centre, patient)
            info = self.get_info(centre, patient)

            pat_res_dir = self.make_patient_dir(info)
            if not pat_res_dir:
                self.logger.error(
                    "patient {}: problems with results dir...".format(
                        patient))
                errors += 1
                continue

            h5db_path = os.path.join(pat_res_dir, self.h5db_bname + '.h5')
            try:
                h5db = hd.File(h5db_path, 'w')
            except Exception as e:
                self.logger.error(
                    "patient {}: can't open my H5 DB '{}': {} ".format(
                        patient, h5db_path, e))
                errors += 1
                continue

            # Close the DB even when preprocessing, sampling or plotting
            # fails, so a crash does not leave the file handle open.
            try:
                patient_errors, bcnt_n, bcnt_t = self._extract_patient_patches(
                    patient, info, pat_res_dir, h5db, slide_path, xml_path,
                    settings)
            finally:
                h5db.close()

            errors += patient_errors
            self.logger.info(
                "patient {}: processed in {} (normal) + {} (tumor) batches"
                .format(patient, bcnt_n, bcnt_t))
            self.logger.info("patient {}: data saved to H5 DB: {}".format(
                patient, h5db_path))

    self.report['errors'] = errors
    self.report['warnings'] = warnings


def _extract_patient_patches(self, patient, info, pat_res_dir, h5db,
                             slide_path, xml_path, settings):
    """Sample, store and plot the patches of a single patient.

    The tumor mask is usually (much) smaller than the normal tissue mask, so
    the two need a different number of batches. They are therefore sampled
    in two independent batch loops, normal tissue first. Batches are kept
    small by the sampler options to avoid running out of memory.

    Args:
        patient: Patient identifier, used in log messages only.
        info: Value returned by ``self.get_info``; passed through to
            ``self.get_image_fname`` and ``self.store_patient``.
        pat_res_dir: Directory receiving the images of this patient.
        h5db: Open HDF5 file, passed through to ``self.store_patient``.
            It is not closed here.
        slide_path: Path of the whole-slide image, given to ``preprocess``.
        xml_path: Path of the annotation file, given to ``preprocess``.
        settings: The ``settings`` mapping of the configuration.

    Returns:
        tuple: ``(errors, bcnt_n, bcnt_t)`` -- the number of errors logged
        here, and the final normal and tumor batch counters.
    """
    errors = 0

    slide, annotations_mask, rgb_im, _ = preprocess(
        slide_path, xml_path, slide_level=settings['slide_level'])

    # normal mask = tissue mask (morp_im) - tumor mask (annotations_mask).
    # The difference can drop to -1.0, so keep only the pixels that are
    # exactly 1.
    morp_im = get_morp_im(rgb_im)
    normal_im = ((morp_im - annotations_mask) == 1.0).astype(int)

    # The masks are the same for every sample batch, so save them once.
    # TODO: make switchable from config/CL
    for mask, tag, label in (
            (annotations_mask, 'annotation_mask', 'Annotation mask'),
            (normal_im, 'normal_tissue_mask', 'Normal tissue mask')):
        plt.figure()
        plt.imshow(mask)
        img_file = self.get_image_fname(pat_res_dir, tag, info)
        plt.savefig(img_file)
        plt.close()
        self.logger.info(
            'patient {}: {} image saved to: {}'.format(
                patient, label, img_file))

    # Settings forwarded verbatim to the sampler as keyword arguments.
    opts = {
        key: settings[key]
        for key in (
            'area_overlap',
            'bad_batch_size',
            'gray_threshold',
            'margin_width_x',
            'margin_width_y',
            'method',
            'n_samples',
            'patch_size',
            'slide_level',
            'white_level',
            'white_threshold',
            'white_threshold_incr',
            'white_threshold_max',
        )
    }

    # ---- normal tissue ----
    if settings['window']:
        self.logger.info(
            "patient {}: restricting nonzero points range to {}%, {}%"
            .format(patient, settings['window'][0],
                    settings['window'][1]))
    nzx_n, nzy_n = integral.nonzero_range(normal_im, settings['window'])

    bcnt_n = 0
    last_idx_n = -1
    # Locations of the last non-empty normal batch; stays None when no
    # normal patch was sampled, so the scatter plot below can skip them
    # instead of failing on an unbound name.
    normal_patches_locations = None
    while True:
        self.logger.info(
            "patient {}: >>> [normal] starting batch {}".format(
                patient, bcnt_n))
        opts['start_idx'] = last_idx_n + 1
        nor_patch_list, nor_patch_point, last_idx_n = integral.patch_sampling(
            slide, normal_im, nzx_n, nzy_n, **opts)
        if not (nor_patch_point and nor_patch_list):
            self.logger.info(
                'patient {}: batch {}: no (more) normal patches'.
                format(patient, bcnt_n))
            break

        nor_patch_array = np.asarray(nor_patch_list)
        normal_patches_locations = np.array(nor_patch_point)
        self.store_patient(info, nor_patch_array, nor_patch_point,
                           'normal', h5db, bcnt_n)
        self.nor_counter += len(nor_patch_array)
        self.logger.info(
            "patient {}: <<< [normal] done batch {}".format(
                patient, bcnt_n))
        if last_idx_n is None:
            # in 'random' method, this tells us that we're done sampling
            break
        bcnt_n += 1

    # ---- tumor ----
    # Tumor masks are usually too small for windowed sampling, so take the
    # full range.
    nzx_t, nzy_t = integral.nonzero_range(annotations_mask, [])

    bcnt_t = 0
    last_idx_t = -1
    while True:
        self.logger.info(
            "patient {}: >>> [tumor] starting batch {}".format(
                patient, bcnt_t))
        opts['start_idx'] = last_idx_t + 1
        tum_patch_list, tum_patch_point, last_idx_t = integral.patch_sampling(
            slide, annotations_mask, nzx_t, nzy_t, **opts)
        if not (tum_patch_list and tum_patch_point):
            self.logger.info(
                'patient {}: batch {}: no (more) tumor patches'.
                format(patient, bcnt_t))
            break

        tum_patch_array = np.asarray(tum_patch_list)
        tum_locations = np.array(tum_patch_point)
        self.store_patient(info, tum_patch_array, tum_locations,
                           'tumor', h5db, bcnt_t)

        if opts['method'] == 'random':
            if bcnt_n != bcnt_t:
                self.logger.error(
                    "[BUG] Can't make scatter image(s): batch count mismatch"
                )
                errors += 1
            else:
                # Sampling points drawn over the slide image: normal
                # patches in green, tumor patches in red. Each location
                # is an (x, y) pair plotted as (y, x); one scatter call
                # per point set replaces one call per point.
                plt.figure()
                plt.imshow(rgb_im)
                if normal_patches_locations is not None:
                    plt.scatter(normal_patches_locations[:, 1],
                                normal_patches_locations[:, 0], c='g')
                plt.scatter(tum_locations[:, 1], tum_locations[:, 0],
                            c='r')
                img_file = self.get_image_fname(
                    pat_res_dir, 'tumor_locations', info, bcnt_t)
                plt.savefig(img_file)
                plt.close()
                self.logger.info(
                    'patient {}: batch {}: tumor locations image saved to: {}'
                    .format(patient, bcnt_t, img_file))

        self.tum_counter += len(tum_patch_array)
        self.logger.info(
            "patient {}: <<< [tumor] done batch {}".format(
                patient, bcnt_t))
        if last_idx_t is None:
            # in 'random' method, this tells us that we're done sampling
            break
        bcnt_t += 1

    return errors, bcnt_n, bcnt_t