def __init__(self, root, split="train", transforms=None, augmentations=False, in_channels=3, mask_dilation_size=1, no_gt=False, fold_num=0, num_folds=1, seed=1234, no_load_images=False, only_non_empty=False): self.root = root self.split = split self.transforms = transforms self.augmentations = augmentations self.in_channels = in_channels self.mask_dilation_size = mask_dilation_size self.no_gt = no_gt self.n_classes = 2 self.no_load_images = no_load_images self.only_non_empty = only_non_empty self.files = {} self.files_empty = {} self.files_non_empty = {} if self.split != 'test': ##train_df = pd.read_csv(os.path.join(self.root, self.split.replace('val', 'train') + '-rle_unique.csv'), index_col=0) train_df = pd.read_csv(os.path.join(self.root, 'stage_2_' + self.split.replace('val', 'train') + '_unique.csv'), index_col=0) self.train_labels = train_df.to_dict('index') else: ##test_df = pd.read_csv(os.path.join(self.root, 'sample_submission.csv'), index_col=0) test_df = pd.read_csv(os.path.join(self.root, 'stage_2_sample_submission.csv'), index_col=0) self.test_labels = test_df.to_dict('index') ##root_img_path = os.path.join(self.root, 'dicom-images-' + self.split.replace('val', 'train')) root_img_path = os.path.join(self.root, 'stage_{}_images'.format(1 if self.split != 'test' else 2)) fs = recursive_glob(rootdir=root_img_path, suffix='.dcm') if self.split != 'test': fs = [f for f in fs if os.path.basename(f)[:-4] in self.train_labels] N = len(fs) start_idx = N * fold_num // num_folds end_idx = N * (fold_num + 1) // num_folds print('{:5s}: {:2d}/{:2d} [{:6d}, {:6d}] - {:6d}'.format(self.split, fold_num, num_folds, start_idx, end_idx, N)) self.files[self.split] = [] self.files_empty[self.split] = [] self.files_non_empty[self.split] = [] torch.manual_seed(seed) rp = torch.randperm(N).tolist() for i in range(N): f = fs[rp[i]] lbl = self.train_labels[os.path.basename(f)[:-4]]['EncodedPixels'] if ((i >= start_idx and i < end_idx) and self.split == 'val') or (not (i >= start_idx and i < end_idx) and self.split == 'train') or (num_folds == 1): self.__append_files__(f, lbl) else: fs = [f for f in fs if os.path.basename(f)[:-4] in self.test_labels] self.files[self.split] = fs ##self.clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(16,16)) if not self.files[split]: raise Exception("No files for split=[%s] found in %s" % (self.split, self.root)) else: print("Found %d %s images" % (len(self.files[self.split]), self.split))
def __init__(self, root, split="train", version="simplified", is_transform=True, img_size=(256, 256), augmentations=None, no_gt=False, train_fold_num=0, num_train_folds=10, num_val=300, seed=1234): self.root = root self.split = split self.version = version self.is_transform = is_transform self.augmentations = augmentations self.no_gt = no_gt self.n_classes = 340 self.img_size = img_size if isinstance(img_size, tuple) else (img_size, img_size) self.mean_rgb = [0.485, 0.456, 0.406] self.std_rgb = [0.229, 0.224, 0.225] # torchvision pretrained image transform self.files = {} if self.split == 'test': csv_path = os.path.join( self.root, '{}_{}.csv'.format(self.split, self.version)) test_df = pd.read_csv(csv_path, usecols=['key_id', 'drawing'])[['key_id', 'drawing']] test_dict = test_df.to_dict('index') gc.collect() self.files[split] = test_dict fs = recursive_glob(rootdir=os.path.join( self.root, '{}_{}'.format('train', self.version)), suffix='.csv') else: torch.manual_seed(seed) csv_path = os.path.join( self.root, '{}_{}'.format(self.split.replace('val', 'train'), self.version)) suffix = '_{}_{}_{}.pkl'.format( self.split, train_fold_num, num_train_folds ) if self.split == 'train' else '_{}_{}.pkl'.format( self.split, num_val) fs = recursive_glob(rootdir=csv_path, suffix=suffix) if len(fs) == 0: suffix = '.csv' fs = recursive_glob(rootdir=csv_path, suffix=suffix) train_dicts = {} accum = 0 for i, f in enumerate(fs): if suffix == '.csv': train_df = pd.read_csv( f, usecols=[ 'key_id', 'drawing', 'recognized', 'word' ])[['key_id', 'drawing', 'recognized', 'word']] train_dict = train_df.to_dict('index') num_train = (len(train_dict) - num_val) // num_train_folds print('({:3d}) {:25s}: {:7d}'.format( i, os.path.basename(f).split('.')[0].split('_') [0].replace(' ', '_'), len(train_dict))) else: with open(f, 'rb') as pkl_f: train_dict = pickle.load(pkl_f) if num_train_folds > 0 and suffix == '.csv': #""" class_name = os.path.basename(f).split('.')[0] rp = torch.randperm(len(train_dict)).tolist() total_num_train = len(train_dict) - num_val for fold_num in range(num_train_folds): start_index = fold_num * num_train end_index = ( fold_num + 1 ) * num_train if fold_num < num_train_folds - 1 else total_num_train selected_train_indices = rp[start_index:end_index] part_train_dict = { j: train_dict[key_index] for j, key_index in enumerate( selected_train_indices) } part_train_pkl_filename = os.path.join( csv_path, '{}_{}_{}_{}.pkl'.format(class_name, 'train', fold_num, num_train_folds)) with open(part_train_pkl_filename, 'wb') as pkl_f: pickle.dump(part_train_dict, pkl_f, protocol=pickle.HIGHEST_PROTOCOL) selected_val_indices = rp[-num_val:] part_val_dict = { j: train_dict[key_index] for j, key_index in enumerate(selected_val_indices) } part_val_pkl_filename = os.path.join( csv_path, '{}_{}_{}.pkl'.format(class_name, 'val', num_val)) with open(part_val_pkl_filename, 'wb') as pkl_f: pickle.dump(part_val_dict, pkl_f, protocol=pickle.HIGHEST_PROTOCOL) #""" train_dict_len = num_train else: train_dict_len = len(train_dict) train_dict = { accum + j: train_dict[j] for j in range(train_dict_len) } train_dicts.update(train_dict) accum = accum + train_dict_len gc.collect() self.files[split] = train_dicts self.class_num2name = [ os.path.basename(f).split('.')[0].split('_')[0].replace(' ', '_') for f in fs ] self.ignore_index = -1 self.class_name2num = dict( zip(self.class_num2name, range(self.n_classes))) if not self.files[split]: raise Exception("No files for split=[%s] found in %s" % (split, csv_path)) print("Found %d 
%s images" % (len(self.files[split]), split))
def __init__(self, root, split="train", is_transform=True, img_size=(512, 512), augmentations=None, no_gt=False, use_external=False, fold_num=0, num_folds=1, seed=1234): self.root = root self.split = split self.is_transform = is_transform self.augmentations = augmentations self.no_gt = no_gt self.n_classes = 28 self.img_size = img_size if isinstance(img_size, tuple) else (img_size, img_size) self.mean_gbry = [0.0526, 0.0547, 0.0804, 0.0827] self.std_gbry = [0.1122, 0.1560, 0.1496, 0.1497] self.files = {} #""" root_img_path = os.path.join(self.root, self.split.replace('val', 'train')) fs = recursive_glob(rootdir=root_img_path, suffix='.png') uni_fs = np.unique(np.array([os.path.join(self.split.replace('val', 'train'), '_'.join(os.path.basename(f).split('_')[:-1])) for f in fs])) self.files[split] = uni_fs.tolist() if self.split != 'test' and use_external: root_img_path = os.path.join(self.root, 'external') fs = recursive_glob(rootdir=root_img_path, suffix='.png') uni_fs = np.unique(np.array([os.path.join('external', '_'.join(os.path.basename(f).split('_')[:-1])) for f in fs])) self.files[split] = self.files[split] + uni_fs.tolist() #""" list_root = 'data_list' if not os.path.exists(list_root): os.mkdir(list_root) list_filename = os.path.join(list_root, 'list_{}_{}-{}'.format(self.split, fold_num, num_folds)) if self.split != 'test' else os.path.join(list_root, 'list_{}'.format(self.split)) if not os.path.exists(list_filename): N = len(self.files[split]) if self.split == 'test': with open(list_filename, 'w') as f_test: for i in range(N): f_test.write(self.files[split][i] + '\n') else: torch.manual_seed(seed) rp = torch.randperm(N).tolist() start_idx = N * fold_num // num_folds end_idx = N * (fold_num + 1) // num_folds print('{:5s}: {:2d}/{:2d} [{:6d}, {:6d}] - {:6d}'.format(self.split, fold_num, num_folds, start_idx, end_idx, N)) f_train = open(list_filename.replace('val', 'train'), 'w') f_val = open(list_filename.replace('train', 'val'), 'w') for i in range(N): if i >= start_idx and i < end_idx: f_val.write(self.files[split][rp[i]] + '\n') else: f_train.write(self.files[split][rp[i]] + '\n') f_train.close() f_val.close() else: with open(list_filename, 'r') as f: self.files[split] = f.read().splitlines() train_df = pd.read_csv(os.path.join(self.root, 'train.csv'), index_col=0) self.train_labels = train_df.to_dict('index') self.class_num_samples_train = torch.zeros(self.n_classes, dtype=torch.float, device=torch.device('cuda')) for i, img_id in enumerate(self.train_labels): lbl_str = self.train_labels[img_id]['Target'] for l in lbl_str.split(): self.class_num_samples_train[int(l)] += 1 if use_external: external_df = pd.read_csv(os.path.join(self.root, 'HPAv18RGBY_wodpl.csv' if self.split == 'test' else 'HPAv18RGBY_WithoutUncertain_wodpl.csv'), index_col=0) external_labels = external_df.to_dict('index') class_num_samples_external = torch.zeros(self.n_classes, dtype=torch.float, device=torch.device('cuda')) for i, img_id in enumerate(external_labels): lbl_str = external_labels[img_id]['Target'] for l in lbl_str.split(): class_num_samples_external[int(l)] += 1 self.train_labels.update(external_labels) self.class_num_samples = self.class_num_samples_train + class_num_samples_external self.loss_weights = ((self.class_num_samples.sum() - self.class_num_samples) / self.class_num_samples).log() self.class_names = ['Nucleoplasm', 'Nuclear membrane', 'Nucleoli', 'Nucleoli fibrillar center', 'Nuclear speckles', 'Nuclear bodies', 'Endoplasmic reticulum', 'Golgi apparatus', 'Peroxisomes', 'Endosomes', 
'Lysosomes', 'Intermediate filaments', 'Actin filaments', 'Focal adhesion sites', 'Microtubules', 'Microtubule ends', 'Cytokinetic bridge', 'Mitotic spindle', 'Microtubule organizing center', 'Centrosome', 'Lipid droplets', 'Plasma membrane', 'Cell junctions', 'Mitochondria', 'Aggresome', 'Cytosol', 'Cytoplasmic bodies', 'Rods & rings',] if not self.files[split]: raise Exception("No files for split=[%s] found in %s" % (split, self.root)) print("Found %d %s images" % (len(self.files[split]), split))
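
# --- Usage sketch (not part of the loader itself) ---------------------------
# Shows one way the class statistics computed above could be consumed; the
# class name `HPALoader` and the data path are assumptions for illustration.
# loss_weights holds log((total label count - per-class count) / per-class count),
# a rarity-based weight; exponentiating it recovers that ratio, which could for
# example be passed to BCEWithLogitsLoss as a per-class pos_weight.
import torch
from torch.utils.data import DataLoader

root = '/path/to/human-protein-atlas'  # hypothetical data location

train_ds = HPALoader(root, split='train', fold_num=0, num_folds=5, use_external=True)
val_ds = HPALoader(root, split='val', fold_num=0, num_folds=5, use_external=True)

criterion = torch.nn.BCEWithLogitsLoss(pos_weight=train_ds.loss_weights.exp())

train_loader = DataLoader(train_ds, batch_size=16, shuffle=True, num_workers=4)
val_loader = DataLoader(val_ds, batch_size=16, shuffle=False, num_workers=4)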