Example #1
    # Requires (not shown in this excerpt): os, pandas as pd, torch,
    # and a recursive_glob(rootdir, suffix) helper.
    def __init__(self, root, split="train",
                 transforms=None, augmentations=False, in_channels=3, mask_dilation_size=1,
                 no_gt=False, fold_num=0, num_folds=1, seed=1234, no_load_images=False, only_non_empty=False):
        self.root = root
        self.split = split
        self.transforms = transforms
        self.augmentations = augmentations
        self.in_channels = in_channels
        self.mask_dilation_size = mask_dilation_size
        self.no_gt = no_gt
        self.n_classes = 2
        self.no_load_images = no_load_images
        self.only_non_empty = only_non_empty
        self.files = {}
        self.files_empty = {}
        self.files_non_empty = {}

        if self.split != 'test':
            ##train_df = pd.read_csv(os.path.join(self.root, self.split.replace('val', 'train') + '-rle_unique.csv'), index_col=0)
            train_df = pd.read_csv(os.path.join(self.root, 'stage_2_' + self.split.replace('val', 'train') + '_unique.csv'), index_col=0)
            self.train_labels = train_df.to_dict('index')
        else:
            ##test_df = pd.read_csv(os.path.join(self.root, 'sample_submission.csv'), index_col=0)
            test_df = pd.read_csv(os.path.join(self.root, 'stage_2_sample_submission.csv'), index_col=0)
            self.test_labels = test_df.to_dict('index')

        ##root_img_path = os.path.join(self.root, 'dicom-images-' + self.split.replace('val', 'train'))
        root_img_path = os.path.join(self.root, 'stage_{}_images'.format(1 if self.split != 'test' else 2))
        fs = recursive_glob(rootdir=root_img_path, suffix='.dcm')

        if self.split != 'test':
            fs = [f for f in fs if os.path.basename(f)[:-4] in self.train_labels]
            N = len(fs)
            start_idx = N * fold_num // num_folds
            end_idx = N * (fold_num + 1) // num_folds
            print('{:5s}: {:2d}/{:2d} [{:6d}, {:6d}] - {:6d}'.format(self.split, fold_num, num_folds, start_idx, end_idx, N))
            self.files[self.split] = []
            self.files_empty[self.split] = []
            self.files_non_empty[self.split] = []
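            # Fixing the seed makes torch.randperm reproducible, so train and
            # val instances built with the same seed agree on the fold split.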
            torch.manual_seed(seed)
            rp = torch.randperm(N).tolist()
            for i in range(N):
                f = fs[rp[i]]
                lbl = self.train_labels[os.path.basename(f)[:-4]]['EncodedPixels']
                in_fold = start_idx <= i < end_idx
                if (in_fold and self.split == 'val') or (not in_fold and self.split == 'train') or num_folds == 1:
                    self.__append_files__(f, lbl)
        else:
            fs = [f for f in fs if os.path.basename(f)[:-4] in self.test_labels]
            self.files[self.split] = fs

        ##self.clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(16,16))

        if not self.files[split]:
            raise Exception("No files for split=[%s] found in %s" % (self.split, self.root))
        else:
            print("Found %d %s images" % (len(self.files[self.split]), self.split))
Example #2
    # Requires (not shown in this excerpt): os, gc, pickle, pandas as pd,
    # torch, and a recursive_glob(rootdir, suffix) helper.
    def __init__(self,
                 root,
                 split="train",
                 version="simplified",
                 is_transform=True,
                 img_size=(256, 256),
                 augmentations=None,
                 no_gt=False,
                 train_fold_num=0,
                 num_train_folds=10,
                 num_val=300,
                 seed=1234):

        self.root = root
        self.split = split
        self.version = version
        self.is_transform = is_transform
        self.augmentations = augmentations
        self.no_gt = no_gt
        self.n_classes = 340
        self.img_size = img_size if isinstance(img_size, tuple) else (img_size, img_size)
        # torchvision pretrained-model normalization statistics
        self.mean_rgb = [0.485, 0.456, 0.406]
        self.std_rgb = [0.229, 0.224, 0.225]
        self.files = {}

        if self.split == 'test':
            csv_path = os.path.join(
                self.root, '{}_{}.csv'.format(self.split, self.version))
            test_df = pd.read_csv(csv_path,
                                  usecols=['key_id',
                                           'drawing'])[['key_id', 'drawing']]
            test_dict = test_df.to_dict('index')
            gc.collect()
            self.files[split] = test_dict
            fs = recursive_glob(rootdir=os.path.join(
                self.root, '{}_{}'.format('train', self.version)),
                                suffix='.csv')
        else:
            torch.manual_seed(seed)
            csv_path = os.path.join(
                self.root, '{}_{}'.format(self.split.replace('val', 'train'),
                                          self.version))
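            # Prefer cached per-fold .pkl files; fall back to the raw
            # per-class CSVs (and build the caches) when none exist yet.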
            suffix = '_{}_{}_{}.pkl'.format(
                self.split, train_fold_num, num_train_folds
            ) if self.split == 'train' else '_{}_{}.pkl'.format(
                self.split, num_val)
            fs = recursive_glob(rootdir=csv_path, suffix=suffix)
            if len(fs) == 0:
                suffix = '.csv'
                fs = recursive_glob(rootdir=csv_path, suffix=suffix)
            train_dicts = {}
            accum = 0
            for i, f in enumerate(fs):
                if suffix == '.csv':
                    train_df = pd.read_csv(
                        f, usecols=[
                            'key_id', 'drawing', 'recognized', 'word'
                        ])[['key_id', 'drawing', 'recognized', 'word']]
                    train_dict = train_df.to_dict('index')
                    num_train = (len(train_dict) - num_val) // num_train_folds
                    print('({:3d}) {:25s}: {:7d}'.format(
                        i,
                        os.path.basename(f).split('.')[0].split('_')[0].replace(' ', '_'),
                        len(train_dict)))
                else:
                    with open(f, 'rb') as pkl_f:
                        train_dict = pickle.load(pkl_f)
                if num_train_folds > 0 and suffix == '.csv':
                    # First pass over the raw CSVs: shuffle once, then cache
                    # each train fold and the held-out val set as .pkl files
                    # so later runs can skip the CSV parsing above.
                    class_name = os.path.basename(f).split('.')[0]
                    rp = torch.randperm(len(train_dict)).tolist()
                    total_num_train = len(train_dict) - num_val
                    for fold_num in range(num_train_folds):
                        start_index = fold_num * num_train
                        end_index = (
                            fold_num + 1
                        ) * num_train if fold_num < num_train_folds - 1 else total_num_train
                        selected_train_indices = rp[start_index:end_index]
                        part_train_dict = {
                            j: train_dict[key_index]
                            for j, key_index in enumerate(
                                selected_train_indices)
                        }
                        part_train_pkl_filename = os.path.join(
                            csv_path,
                            '{}_{}_{}_{}.pkl'.format(class_name, 'train',
                                                     fold_num,
                                                     num_train_folds))
                        with open(part_train_pkl_filename, 'wb') as pkl_f:
                            pickle.dump(part_train_dict,
                                        pkl_f,
                                        protocol=pickle.HIGHEST_PROTOCOL)

                    selected_val_indices = rp[-num_val:]
                    part_val_dict = {
                        j: train_dict[key_index]
                        for j, key_index in enumerate(selected_val_indices)
                    }
                    part_val_pkl_filename = os.path.join(
                        csv_path,
                        '{}_{}_{}.pkl'.format(class_name, 'val', num_val))
                    with open(part_val_pkl_filename, 'wb') as pkl_f:
                        pickle.dump(part_val_dict,
                                    pkl_f,
                                    protocol=pickle.HIGHEST_PROTOCOL)
                    #"""
                    train_dict_len = num_train
                else:
                    train_dict_len = len(train_dict)
                train_dict = {
                    accum + j: train_dict[j]
                    for j in range(train_dict_len)
                }
                train_dicts.update(train_dict)
                accum = accum + train_dict_len
                gc.collect()
            self.files[split] = train_dicts

        self.class_num2name = [
            os.path.basename(f).split('.')[0].split('_')[0].replace(' ', '_')
            for f in fs
        ]
        self.ignore_index = -1

        self.class_name2num = dict(
            zip(self.class_num2name, range(self.n_classes)))

        if not self.files[split]:
            raise Exception("No files for split=[%s] found in %s" %
                            (split, csv_path))

        print("Found %d %s images" % (len(self.files[split]), split))
Example #3
    # Requires (not shown in this excerpt): os, numpy as np, pandas as pd,
    # torch, and a recursive_glob(rootdir, suffix) helper.
    def __init__(self, root, split="train", is_transform=True,
                 img_size=(512, 512), augmentations=None,
                 no_gt=False, use_external=False, fold_num=0, num_folds=1, seed=1234):

        self.root = root
        self.split = split
        self.is_transform = is_transform
        self.augmentations = augmentations
        self.no_gt = no_gt
        self.n_classes = 28
        self.img_size = img_size if isinstance(img_size, tuple) else (img_size, img_size)
        self.mean_gbry = [0.0526, 0.0547, 0.0804, 0.0827]
        self.std_gbry = [0.1122, 0.1560, 0.1496, 0.1497]
        self.files = {}

        #"""
        root_img_path = os.path.join(self.root, self.split.replace('val', 'train'))
        fs = recursive_glob(rootdir=root_img_path, suffix='.png')
        uni_fs = np.unique(np.array([os.path.join(self.split.replace('val', 'train'), '_'.join(os.path.basename(f).split('_')[:-1])) for f in fs]))
        self.files[split] = uni_fs.tolist()
        if self.split != 'test' and use_external:
            root_img_path = os.path.join(self.root, 'external')
            fs = recursive_glob(rootdir=root_img_path, suffix='.png')
            uni_fs = np.unique(np.array([os.path.join('external', '_'.join(os.path.basename(f).split('_')[:-1])) for f in fs]))
            self.files[split] = self.files[split] + uni_fs.tolist()
        #"""

        list_root = 'data_list'
        if not os.path.exists(list_root):
            os.mkdir(list_root)

        list_filename = os.path.join(list_root, 'list_{}_{}-{}'.format(self.split, fold_num, num_folds)) if self.split != 'test' else os.path.join(list_root, 'list_{}'.format(self.split))
        if not os.path.exists(list_filename):
            N = len(self.files[split])
            if self.split == 'test':
                with open(list_filename, 'w') as f_test:
                    for i in range(N):
                        f_test.write(self.files[split][i] + '\n')
            else:
                torch.manual_seed(seed)
                rp = torch.randperm(N).tolist()
                start_idx = N * fold_num // num_folds
                end_idx = N * (fold_num + 1) // num_folds
                print('{:5s}: {:2d}/{:2d} [{:6d}, {:6d}] - {:6d}'.format(self.split, fold_num, num_folds, start_idx, end_idx, N))
                with open(list_filename.replace('val', 'train'), 'w') as f_train, \
                     open(list_filename.replace('train', 'val'), 'w') as f_val:
                    for i in range(N):
                        if start_idx <= i < end_idx:
                            f_val.write(self.files[split][rp[i]] + '\n')
                        else:
                            f_train.write(self.files[split][rp[i]] + '\n')
        # Read the list back (even right after writing it) so self.files[split]
        # holds exactly this split's ids rather than the full glob.
        with open(list_filename, 'r') as f:
            self.files[split] = f.read().splitlines()

        train_df = pd.read_csv(os.path.join(self.root, 'train.csv'), index_col=0)
        self.train_labels = train_df.to_dict('index')
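        # Count per-class positives in the train labels; the counts drive the
        # loss weights computed below.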
        self.class_num_samples_train = torch.zeros(self.n_classes, dtype=torch.float, device=torch.device('cuda'))
        for img_id in self.train_labels:
            lbl_str = self.train_labels[img_id]['Target']
            for l in lbl_str.split():
                self.class_num_samples_train[int(l)] += 1
        if use_external:
            external_df = pd.read_csv(os.path.join(self.root, 'HPAv18RGBY_wodpl.csv' if self.split == 'test' else 'HPAv18RGBY_WithoutUncertain_wodpl.csv'), index_col=0)
            external_labels = external_df.to_dict('index')
            class_num_samples_external = torch.zeros(self.n_classes, dtype=torch.float, device=torch.device('cuda'))
            for img_id in external_labels:
                lbl_str = external_labels[img_id]['Target']
                for l in lbl_str.split():
                    class_num_samples_external[int(l)] += 1
            self.train_labels.update(external_labels)

        # Guarded so this line also works when use_external is False.
        self.class_num_samples = self.class_num_samples_train + (class_num_samples_external if use_external else 0)
        self.loss_weights = ((self.class_num_samples.sum() - self.class_num_samples) / self.class_num_samples).log()

        self.class_names = ['Nucleoplasm', 'Nuclear membrane', 'Nucleoli', 'Nucleoli fibrillar center', 'Nuclear speckles',
                            'Nuclear bodies', 'Endoplasmic reticulum', 'Golgi apparatus', 'Peroxisomes', 'Endosomes',
                            'Lysosomes', 'Intermediate filaments', 'Actin filaments', 'Focal adhesion sites', 'Microtubules',
                            'Microtubule ends', 'Cytokinetic bridge', 'Mitotic spindle', 'Microtubule organizing center', 'Centrosome',
                            'Lipid droplets', 'Plasma membrane', 'Cell junctions', 'Mitochondria', 'Aggresome',
                            'Cytosol', 'Cytoplasmic bodies', 'Rods & rings']

        if not self.files[split]:
            raise Exception("No files for split=[%s] found in %s" % (split, self.root))

        print("Found %d %s images" % (len(self.files[split]), split))