def __call__(self, img):
    """Randomly apply Vahadane stain augmentation to an image.

    A single draw from ``random.random()`` is compared with ``self.prob``.
    When the draw is not below ``self.prob`` the input is returned
    untouched. Otherwise the image is converted to a uint8 array,
    luminosity-standardized, used to fit a fresh ``StainAugmentor`` and one
    augmented sample is returned as a PIL image.

    Side effects: on the augmenting path the standardized array and the
    fitted augmentor are kept on the instance as ``self.to_augment`` and
    ``self.augmentor``.

    Args:
        img: image accepted by ``np.array`` (presumably a PIL image --
            TODO confirm against the caller).

    Returns:
        The original ``img`` object, or a new ``PIL.Image`` holding the
        augmented pixels.
    """
    # Guard clause: skip augmentation for this sample.
    if not random.random() < self.prob:
        return img

    pixels = np.array(img).astype('uint8')
    self.to_augment = staintools.LuminosityStandardizer.standardize(pixels)

    # A new augmentor is fitted for every augmented image.
    self.augmentor = staintools.StainAugmentor(
        method='vahadane', sigma1=0.2, sigma2=0.2)
    self.augmentor.fit(self.to_augment)

    sample = self.augmentor.pop()
    return Image.fromarray(sample.astype('uint8'))
def save_segmentation(i, max_row, max_col, loc, seg, home_dir, count,
                      fileNames, augment=0, tile_size=224, blank_ratio=0.5):
    """Write the usable tiles of one tile-grid row to disk as JPEG files.

    For every column ``j`` of row ``i`` the tile ``seg[i, j]`` is saved only
    when it is exactly ``tile_size`` x ``tile_size`` pixels and the fraction
    of pixels whose first channel exceeds 220 is below ``blank_ratio``.
    Files go to ``<home_dir>/<count>/`` and are named
    ``<stem>_<count>_<loc values joined by "_">.jpg``; augmented copies are
    prefixed with ``aug_<index>_``.

    Args:
        i (int): row index into ``seg`` / ``loc``.
        max_row (int): unused; kept for interface compatibility.
        max_col (int): number of columns to iterate over.
        loc: 2-D indexable; ``loc[i, j]`` is an iterable of values that are
            stringified into the file name.
        seg: 2-D indexable; ``seg[i, j]`` is the tile image array
            (indexed as ``tile[:, :, 0]``, so it must have three axes).
        home_dir (str): root output directory.
        count: sub-directory name and file-name component (stringified).
        fileNames (str): source file name; its extension is stripped to
            build the output stem.
        augment (int): number of stain-augmented copies to write per tile.
        tile_size (int): required tile height and width in pixels.
        blank_ratio (float): tiles whose bright-pixel fraction is greater
            than or equal to this value are skipped.

    Returns:
        None
    """
    output_dir = os.path.join(home_dir, str(count))
    # splitext keeps the whole name when there is no extension, whereas
    # slicing at rfind(".") == -1 would silently drop the last character.
    stem = os.path.splitext(fileNames)[0]

    for j in range(max_col):
        tile = seg[i, j]

        # Explicit comparison of both dimensions. The previous
        # `a == size & b == size` parsed as `a == (size & b) == size` and
        # let wrong-width tiles through (e.g. 225 when size is 224).
        if tuple(np.shape(tile)[:2]) != (tile_size, tile_size):
            continue

        # Fraction of pixels whose first channel is brighter than 220.
        bright_fraction = np.mean(tile[:, :, 0] > 220)
        if bright_fraction >= blank_ratio:
            continue

        os.makedirs(output_dir, exist_ok=True)
        tile_name = "{}_{}_{}.jpg".format(
            stem, count, "_".join(map(str, loc[i, j])))
        cv2.imwrite(os.path.join(output_dir, tile_name), tile)

        if augment > 0:
            augmentor = staintools.StainAugmentor(
                method='vahadane', sigma1=0.2, sigma2=0.2)
            augmentor.fit(tile)
            for index in range(augment):
                augmented_image = augmentor.pop()
                cv2.imwrite(
                    os.path.join(
                        output_dir, "aug_{}_{}".format(index, tile_name)),
                    augmented_image)
def __init__(self, data_path, transform_args, metadata_csv, split,
             num_classes=2,
             resize_shape=(DEFAULT_PATCH_SIZE, DEFAULT_PATCH_SIZE),
             max_patches=None, tasks_to='tcga', is_training=False,
             filtered=True, toy=False, normalize=False, transform=None):
    """Set up a TCGA patch dataset backed by one HDF5 file per split.

    The data directory is expected to be organized as follows:

        data_path
            slide_list.pkl
            train.hdf5
            val.hdf5
            test.hdf5
            metadata.csv

    Args:
        data_path (str): path to the data directory.
        transform_args (args): arguments forwarded to the base class.
        metadata_csv (str): metadata CSV file name; it is joined onto
            ``self.data_dir``.
        split (str): split name; it is interpolated into the HDF5 file
            name as ``"<split>.hdf5"`` and is not validated here.
        num_classes (int): number of unique labels.
        resize_shape (tuple): shape to resize the inputs to.
        max_patches (int): max number of patches per slide; stored as
            ``self.max_patches_per_slide``.
        tasks_to (str): task sequence name; forwarded to the base class
            and to ``self._get_label_dict``.
        is_training (bool): whether the model is in training mode.
        filtered (bool): stored as ``self.filtered``.
        toy (bool): accepted but currently ignored (see note in the body).
        normalize (bool): stored as ``self.normalize``.
        transform: stored as ``self.transform``.
    """
    super().__init__(data_path, transform_args, split, is_training,
                     'tcga', tasks_to)

    self.split = split
    self.is_training = is_training
    self.data_path = data_path
    hdf5_name = "{}.hdf5".format(split)
    self.hdf5_path = os.path.join(self.data_path, hdf5_name)

    # self.data_dir is not assigned in this method; presumably the base
    # class sets it -- TODO confirm.
    self.metadata_path = os.path.join(self.data_dir, metadata_csv)
    print("metadata_path: {}".format(self.metadata_path))
    self.metadata = pd.read_csv(self.metadata_path)
    print("hdf5 path: {}".format(self.hdf5_path))

    # NOTE(review): the `toy` argument is not used; the flag is hard-coded
    # to True. Confirm whether this should be `self.toy = toy`.
    self.toy = True
    self.filtered = filtered

    # The top-level keys of the HDF5 file are the slides that have data.
    with h5py.File(self.hdf5_path, "r") as db:
        self.valid_slides = list(db)
    self.slide_list = self.metadata[COL_TCGA_SLIDE_ID]
    num_valid = len(self.valid_slides)
    print("Num valid slides {}".format(num_valid))

    self.num_classes = num_classes
    self.resize_shape = resize_shape
    self.max_patches_per_slide = max_patches

    # These helpers run after the attributes above have been assigned.
    self.patch_list = self._get_patch_list()
    print("Patch list shape: {}".format(self.patch_list.shape))
    self.label_dict = self._get_label_dict(tasks_to)
    self.labels = self._get_labels()
    self._set_class_weights(self.labels)

    self.transform = transform
    self.normalize = normalize

    # tools for patch normalization
    self.standardizer = staintools.BrightnessStandardizer()
    self.color_normalizer = staintools.ReinhardColorNormalizer()
    self.normalizer_with_constants = transforms.Compose(
        [transforms.Normalize(mean=TCGA_MEAN, std=TCGA_STD)])
    self.ToTensor = transforms.Compose([transforms.ToTensor()])

    # tools for image augmentation
    self.stain_augmentor = staintools.StainAugmentor(
        method='vahadane', sigma1=0.2, sigma2=0.2)
# Normalize to stain of first image: fit on i1, then transform i2..i5 in order
normalizer = staintools.StainNormalizer(method=METHOD)
normalizer.fit(i1)
i2_normalized, i3_normalized, i4_normalized, i5_normalized = [
    normalizer.transform(image) for image in (i2, i3, i4, i5)
]

# Plot the target followed by the four normalized images
images = [i1, i2_normalized, i3_normalized, i4_normalized, i5_normalized]
titles = ["Target"] + ["Stain normalized"] * 4
staintools.plot_image_list(
    images,
    width=5,
    title_list=titles,
    save_name=RESULTS_DIR + 'stain-normalized-images.png',
    show=0,
)

# ==================
# Stain augmentation
# ==================

# Augment the first image: fit once, then draw ten samples
augmentor = staintools.StainAugmentor(method=METHOD, sigma1=0.4, sigma2=0.4)
augmentor.fit(i1)
augmented_images = [augmentor.pop() for _ in range(10)]

# Plot the ten augmented samples
titles = ["Augmented"] * 10
staintools.plot_image_list(
    augmented_images,
    width=5,
    title_list=titles,
    save_name=RESULTS_DIR + 'stain-augmented-images.png',
    show=0,
)
def split_into_tiles(home_dir, fileNames, img_mat, count, normalizer,
                     blank_ratio=0.5, tile_size=224, overlapping=0.25,
                     augment=0, thread=1):
    """Stain-normalize an image, cut it into sliding-window tiles and save them.

    The image is first passed through ``normalizer.transform``. A window of
    ``tile_size + 1`` pixels is then moved over it with a stride of
    ``(tile_size + 1) * (1 - overlapping)``; the first pixel of each window
    is skipped, so a full window yields ``tile_size`` pixels per side.
    Windows that run past the image border are clipped to the border.

    A tile is written only when it is exactly ``tile_size`` x ``tile_size``
    and the fraction of its pixels whose first channel exceeds 220 is below
    ``blank_ratio``. Files go to ``<home_dir>/<count>/`` and are named
    ``<stem>_<count>_<top>_<bottom>_<left>_<right>.jpg``; each of the
    ``augment`` stain-augmented copies is prefixed with ``aug_<index>_``.

    Args:
        home_dir (str): root output directory.
        fileNames (str): source file name; its extension is stripped to
            build the output stem.
        img_mat: image array accepted by ``normalizer.transform``.
        count: sub-directory name and file-name component (stringified).
        normalizer: object exposing ``transform(image)`` that returns a
            three-axis array (height, width, channels).
        blank_ratio (float): tiles whose bright-pixel fraction is greater
            than or equal to this value are skipped.
        tile_size (int): tile height and width in pixels.
        overlapping (float): fraction of the window shared by two
            consecutive windows.
        augment (int): number of stain-augmented copies to write per tile.
        thread (int): when equal to 1, a textual progress bar is printed
            after every tile row.

    Returns:
        None
    """
    img = normalizer.transform(img_mat)  # normalize
    h, w, _ = img.shape

    # Window is tile_size + 1 because slices start one pixel into it.
    window = tile_size + 1
    stride = window * (1 - overlapping)

    # Number of window positions per axis, rounded up so the border is
    # covered; clamped at zero so an image much smaller than one window
    # produces no tiles instead of a negative grid size.
    max_row = max(int(np.ceil((h - window) / stride + 1)), 0)
    max_col = max(int(np.ceil((w - window) / stride + 1)), 0)

    output_dir = os.path.join(home_dir, str(count))
    # splitext keeps the whole name when there is no extension, whereas
    # slicing at rfind(".") == -1 would silently drop the last character.
    stem = os.path.splitext(fileNames)[0]

    if thread == 1:
        print("Segmentation Progress:")
    else:
        print("Begin Segmentation")

    for i in range(max_row):
        sys.stdout.write('\r')

        # Row extent is clipped against the image height. The previous
        # branch logic compared it against the width, which stretched
        # valid tiles of non-square images down to the image bottom.
        top = int(i * stride + 1)
        bottom = min(int(window + i * stride), h)

        for j in range(max_col):
            # Column extent is clipped against the image width.
            left = int(j * stride + 1)
            right = min(int(window + j * stride), w)
            tile = img[top:bottom, left:right, :]

            # Only full-size tiles are kept; clipped border tiles are
            # dropped. Both dimensions are compared explicitly because
            # `a == size & b == size` parses as `a == (size & b) == size`.
            if tile.shape[0] != tile_size or tile.shape[1] != tile_size:
                continue

            # Fraction of pixels whose first channel is brighter than 220.
            if np.mean(tile[:, :, 0] > 220) >= blank_ratio:
                continue

            os.makedirs(output_dir, exist_ok=True)
            tile_name = "{}_{}_{}.jpg".format(
                stem, count, "_".join(map(str, (top, bottom, left, right))))
            cv2.imwrite(os.path.join(output_dir, tile_name), tile)

            # augment
            if augment > 0:
                augmentor = staintools.StainAugmentor(
                    method='vahadane', sigma1=0.2, sigma2=0.2)
                augmentor.fit(tile)
                for index in range(augment):
                    augmented_image = augmentor.pop()
                    # Same name as the plain tile plus an "aug_<index>_"
                    # prefix (a missing "+" used to fuse the count with
                    # the coordinates and double the underscores).
                    cv2.imwrite(
                        os.path.join(
                            output_dir,
                            "aug_{}_{}".format(index, tile_name)),
                        augmented_image)

        if thread == 1:
            # True division: floor division kept the bar at 0% until the
            # final row.
            done = float(i + 1) / max_row
            sys.stdout.write(
                "[%-20s] %d%%\n" % ('=' * int(20 * done), 100 * done))
            sys.stdout.flush()