def cls_weights(pth=args.train_image_pth, ignore_index=None, ignore_cls=False, ignore_seg=False):
    ''' given gt.npy, calculates the class distribution of the images and returns its inverse '''

    metadata_pth = '{}/gt.npy'.format(pth)
    metadata = ufs.fetch_metadata(metadata_pth)

    numsamples_cls = np.zeros((args.num_classes, ), dtype=np.int64)
    numsamples_seg = np.zeros((args.num_classes, ), dtype=np.int64)

    for _, item in metadata.items():
        for _, subitem in item.items():
            if not ignore_seg and isinstance(subitem['label'], str):
                lbl = Image.open(subitem['label'])
                lbl = np.array(lbl)
                n = np.bincount(lbl.reshape(-1), minlength=args.num_classes)
                numsamples_seg += n
            if not ignore_cls and isinstance(subitem['label'], int):
                numsamples_cls[int(subitem['label'])] += 1

    if ignore_index is not None:
        numsamples_seg[ignore_index] = 0
        numsamples_cls[ignore_index] = 0

    ratios_cls = numsamples_cls / (args.epsilon + numsamples_cls.sum())
    ratios_seg = numsamples_seg / (args.epsilon + numsamples_seg.sum())

    ' find classes with sample count > 0 '
    nonzero_cls = np.nonzero(numsamples_cls)
    nonzero_seg = np.nonzero(numsamples_seg)

    ' inverse ratios (i.e. weights) '
    ratios_cls = 1.0 / ratios_cls[nonzero_cls]
    ratios_seg = 1.0 / ratios_seg[nonzero_seg]

    ' placeholder for class weights '
    cls_weights_cls = np.zeros((args.num_classes, ))
    cls_weights_seg = np.zeros((args.num_classes, ))

    ' normalize max weight to 1 (only if any class was observed) '
    if ratios_cls.size > 0:
        ratios_cls /= (args.epsilon + ratios_cls.max())
        cls_weights_cls[nonzero_cls] = ratios_cls
    if ratios_seg.size > 0:
        ratios_seg /= (args.epsilon + ratios_seg.max())
        cls_weights_seg[nonzero_seg] = ratios_seg

    return cls_weights_cls, cls_weights_seg
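
' --- usage sketch, not from the original source --- '
# A minimal sketch of wiring the inverse-frequency weights above into
# weighted losses; assumes PyTorch is available. ignore_index=0 is an
# illustrative choice, not a value taken from this code.
def _cls_weights_usage_example():
    import torch
    w_cls, w_seg = cls_weights(ignore_index=0)
    criterion_cls = torch.nn.CrossEntropyLoss(
        weight=torch.from_numpy(w_cls).float())
    criterion_seg = torch.nn.CrossEntropyLoss(
        weight=torch.from_numpy(w_seg).float(), ignore_index=0)
    return criterion_cls, criterion_seg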
def cls_ratios_hr(pth=args.train_hr_image_pth, ignore_index=None):
    import utils.dataset_hr as ds

    metadata_pth = ufs.fix_path('../{}/gt.npy'.format(pth))
    metadata = ufs.fetch_metadata(metadata_pth)

    ' build the datalist '
    cls = np.zeros(args.num_classes)
    for filename in metadata:
        first_region_id = list(metadata[filename].keys())[0]
        first_sub_region_id = list(
            metadata[filename][first_region_id].keys())[0]
        pth = metadata[filename][first_region_id][first_sub_region_id]['wsipath']
        pth = ufs.fix_path(pth)
        _wsi_ = openslide.OpenSlide(pth)

        params = {
            'iw': _wsi_.level_dimensions[0][0],
            'ih': _wsi_.level_dimensions[0][1],
            'tile_w': ds.HR_PATCH_W,
            'tile_h': ds.HR_PATCH_H,
            'scan_level': ds.HR_SCAN_LEVEL
        }
        params = DotDict(params)

        for conncomp in metadata[filename]:
            for region_id in metadata[filename][conncomp]:
                region_obj = metadata[filename][conncomp][region_id].copy()

                ' which points are valid for this patch size / scan level combo? '
                region_obj['cnt_xy'], num_cnt_pts = regiontools.map_points(
                    region_obj['cnt_xy'], params)
                region_obj['perim_xy'], num_perim_pts = regiontools.map_points(
                    region_obj['perim_xy'], params)

                if num_cnt_pts >= ds.HR_NUM_CNT_SAMPLES and \
                        num_perim_pts >= ds.HR_NUM_PERIM_SAMPLES:
                    cls[region_obj['label']] += 1

    print(cls, cls.sum())
    cls = cls / cls.sum()

    return cls
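
' --- usage sketch, not from the original source --- '
# The ratios above can drive oversampling of rare region labels via a
# torch WeightedRandomSampler; assumes PyTorch and a datalist whose items
# carry a 'label' key, mirroring the loop above. Names are illustrative.
def _hr_sampler_example(datalist):
    import torch
    ratios = cls_ratios_hr()
    per_item = [1.0 / max(ratios[item['label']], 1e-8) for item in datalist]
    return torch.utils.data.WeightedRandomSampler(
        per_item, num_samples=len(per_item), replacement=True)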
def nextpow2(x):  # helper name assumed; its def line was missing here
    ' smallest power of two >= x '
    x = int(x)
    return 1 << (x - 1).bit_length()


def ispow2(x):
    x = int(x)
    return x > 0 and (x & (x - 1)) == 0


args.raw_train_pth = 'data/sunnybrook/WSI'

ufs.make_folder('../' + args.train_image_pth, True)
wsipaths = glob.glob('../{}/*.svs'.format(args.raw_train_pth))

' check if metadata gt.npy already exists to append to it '
metadata_pth = '../{}/gt.npy'.format(args.train_image_pth)
metadata = ufs.fetch_metadata(metadata_pth)

pwhs = {
    np.maximum(args.tile_w, args.tile_h): 0
}
wsipaths = sorted(wsipaths)

patch_id = 0
num_iters = 1  # each iter randomizes the centers of objects
for _ in range(num_iters):
    for wsipath in tqdm(wsipaths):

        ' read scan and get metadata '
        scan = openslide.OpenSlide(wsipath)
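
' --- sanity checks, not from the original source --- '
# Quick illustrations of the power-of-two helpers above: nextpow2 rounds
# up to the nearest power of two and ispow2 tests for an exact power of two.
assert nextpow2(5) == 8 and nextpow2(8) == 8
assert ispow2(8) and not ispow2(6)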
    #args.patch_folder = '/home/ozan/Downloads/breastpathq/datasets (copy)/validation'
    #args.label_csv_path = '/home/ozan/Downloads/breastpathq/datasets (copy)/val_labels.csv'
    savepath = args.val_image_pth
else:
    args.patch_folder = '/home/ozan/Downloads/breastpathq/datasets/train'
    args.label_csv_path = '/home/ozan/Downloads/breastpathq/datasets/train_labels.csv'
    #args.patch_folder = '/home/ozan/Downloads/breastpathq/datasets (copy)/train'
    #args.label_csv_path = '/home/ozan/Downloads/breastpathq/datasets (copy)/train_labels.csv'
    savepath = args.train_image_pth

if __name__ == '__main__':

    ' train '
    ufs.make_folder('../' + savepath, is_spie)
    metadata_pth_train = '../{}/gt.npy'.format(savepath)
    metadata = ufs.fetch_metadata(metadata_pth_train)

    raw_gt = {}
    cc = []
    with open('{}'.format(args.label_csv_path)) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        next(csv_reader)  # skip the header row
        for row in csv_reader:
            image_id = int(row[0])
            region_id = int(row[1])
            cellularity = float(row[2])
            if image_id not in raw_gt:
                raw_gt[image_id] = {}
            raw_gt[image_id][region_id] = cellularity
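
    ' --- illustrative note, not from the original source --- '
    # raw_gt now maps image_id -> region_id -> cellularity, e.g.
    #   {99: {1: 0.3, 2: 0.85}, 100: {4: 0.0}}   (made-up values)
    # and can be flattened into (image_id, region_id, cellularity) rows
    # if a tabular view is needed; `flat_gt` is a hypothetical name.
    flat_gt = [(i, r, c) for i, regions in raw_gt.items()
               for r, c in regions.items()]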
def __init__(self, pth, eval, remove_white, duplicate_dataset):

    self.base_path = Path(__file__).parent
    metadata_pth = (self.base_path / '../{}/gt.npy'.format(pth)).resolve().as_posix()
    metadata = ufs.fetch_metadata(metadata_pth)

    '''
    dataset structure:

    the dataset is comprised of patches + wsi regions.

    patches:
        metadata['P'] indicates where all the patches are.

    wsi:
        0. metadata[filename / svs file name]
        1. m[f][connected component id]
        2. m[f][c][region within the connected component]

    at level 1 we have the connected component as given in the gt mask;
    at this level m[f][c][0] always points to the large region.
    if the region is large enough, we then split it into smaller
    sub-regions at m[f][c][>=1].
    '''

    ' build the datalist '
    self.datalist = []
    cls = np.zeros(args.num_classes)

    ' build patch portion of ds '
    if 'P' in metadata:
        P = copy.deepcopy(metadata['P'][0])
        del metadata['P']
        P_dims = {}
        for key in P:
            d = P[key]['dimensions']
            if d not in P_dims:
                params = {
                    'num_center_points': HR_NUM_CNT_SAMPLES,
                    'num_perim_points': HR_NUM_PERIM_SAMPLES,
                    'scan_level': HR_SCAN_LEVEL,
                    'tile_w': HR_PATCH_W,
                    'tile_h': HR_PATCH_H,
                    'dimensions': d
                }
                params = preprocessing.DotDict(params)
                P_dims[d] = regiontools.get_key_points_for_patch(params)
            item = {**P[key], **P_dims[d]}
            self.datalist.append(item)
            cls[item['label']] += 1

    ' build wsi regions portion '
    self.wsis = {}
    for filename in metadata:
        first_region_id = list(metadata[filename].keys())[0]
        first_sub_region_id = list(
            metadata[filename][first_region_id].keys())[0]
        pth = metadata[filename][first_region_id][first_sub_region_id]['wsipath']
        pth = (self.base_path / pth).resolve().as_posix()
        self.wsis[pth] = openslide.OpenSlide(pth)

        if remove_white:
            ' get low res. nuclei image / foreground mask '
            scan = self.wsis[pth]
            x, y = scan.level_dimensions[-1]
            mask = scan.read_region((0, 0), scan.level_count - 1, (x, y)).convert('RGB')
            mask = mask.resize((x // 4, y // 4))
            mask = preprocessing.find_nuclei(mask)
            mask = Image.fromarray(mask.astype(np.uint8)).resize((x, y))
            mask = np.asarray(mask)

        params = {
            'iw': self.wsis[pth].level_dimensions[0][0],
            'ih': self.wsis[pth].level_dimensions[0][1],
            'tile_w': HR_PATCH_W,
            'tile_h': HR_PATCH_H,
            'scan_level': metadata[filename][first_region_id][first_sub_region_id]['scan_level']
        }
        params = preprocessing.DotDict(params)

        for conncomp in metadata[filename]:
            for region_id in metadata[filename][conncomp]:
                region_obj = metadata[filename][conncomp][region_id].copy()

                if remove_white:
                    ' given points, remove patches that are only white '
                    region_obj['cnt_xy'], num_cnt_pts = regiontools.remove_white_region(
                        mask, region_obj['cnt_xy'], params)
                    region_obj['perim_xy'], num_perim_pts = regiontools.remove_white_region(
                        mask, region_obj['perim_xy'], params)

                ' which points are valid for this patch size / scan level combo? '
                region_obj['cnt_xy'], num_cnt_pts = regiontools.map_points(
                    region_obj['cnt_xy'], params)
                region_obj['perim_xy'], num_perim_pts = regiontools.map_points(
                    region_obj['perim_xy'], params)

                if num_cnt_pts >= HR_NUM_CNT_SAMPLES and \
                        num_perim_pts >= HR_NUM_PERIM_SAMPLES:
                    self.datalist.append(region_obj)
                    cls[region_obj['label']] += 1

    self.eval = eval

    '''cls[0] += cls[1]
    cls[1] = cls[2]
    cls[2] = cls[3]
    cls[3] = 0'''

    print(cls)
    cls = cls / cls.sum()
    print(cls)
    if not self.eval:
        args.cls_ratios = cls

    ' augmentation settings '
    self.image_aug = preprocessing.standard_augmentor(self.eval)

    if not self.eval:
        from itertools import chain
        self.datalist = list(
            chain(*[[i] * duplicate_dataset for i in self.datalist]))
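
' --- usage sketch, not from the original source --- '
# Illustrative construction of this dataset. The class name `Dataset` and
# the helper name `_example_loader` are assumptions; the constructor
# arguments mirror the __init__ signature above, and PyTorch is assumed.
# `duplicate_dataset` repeats each item so one "epoch" covers the short
# datalist several times.
def _example_loader(pth, batch_size=4):
    import torch
    import utils.dataset_hr as ds_mod
    dataset = ds_mod.Dataset(  # class name assumed
        pth, eval=False, remove_white=True, duplicate_dataset=4)
    return torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=True)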