def __init__(self,
              csv_path: Path,
              split: str,
              labeled=True,
              rand_number=543) -> None:
     super(ChexpertDataset, self).__init__()
     set_seed(rand_number)
     self.data_path = Path(csv_path).parent
     self.annotations = pd.read_csv(csv_path).fillna(0)
     self.train_annotations = None
     self.split = split
     self.transforms = None
     self.height, self.width = 224, 224
     self.transforms = get_transforms(self.height,
                                      self.width,
                                      split=split if labeled else "val")
     if split == "train":
         assert cfg.DATA.BATCH_SIZE <= cfg.DATA.LABELED_SIZE, "Batch size must not exceed the labeled train size."
         self.annotations = self.annotations.sample(frac=1).reset_index(
             drop=True)
         if labeled:
             self.annotations = self.annotations[:cfg.DATA.LABELED_SIZE]
         else:
             self.annotations = self.annotations[
                 cfg.DATA.LABELED_SIZE:cfg.DATA.LABELED_SIZE +
                 cfg.DATA.UNLABELED_SIZE].reset_index(drop=True)
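All of the CheXpert snippets on this page rely on a project-specific get_transforms(height, width, split) helper that is not reproduced here. A minimal sketch of such a helper, assuming torchvision; the particular augmentations below are illustrative assumptions, not the original implementation:

from torchvision import transforms

def get_transforms(height: int, width: int, split: str):
    # Sketch only: light augmentation for "train", deterministic resizing
    # and ImageNet normalization otherwise (assumed behaviour).
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    if split == "train":
        return transforms.Compose([
            transforms.Resize((height, width)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    return transforms.Compose([
        transforms.Resize((height, width)),
        transforms.ToTensor(),
        normalize,
    ])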
Example #2
 def __init__(self, csv_path: Path, split: str) -> None:
     super(ChexpertDataset, self).__init__()
     self.data_path = Path(csv_path).parent
     self.annotations = pd.read_csv(csv_path).fillna(0)
     self.split = split
     self.transforms = None
     self.height, self.width = 64, 64  # 224, 224
     self.transforms = get_transforms(self.height, self.width, split)
     if split == "train":
         assert cfg.DATA.BATCH_SIZE <= cfg.DATA.LABELED_SIZE, "Batch size must not exceed the labeled train size."
         labeled_size = cfg.DATA.LABELED_SIZE
         self.normal = self.annotations[
             (self.annotations['Atelectasis'] == 0)
             & (self.annotations['Cardiomegaly'] == 0) &
             (self.annotations['Consolidation'] == 0) &
             (self.annotations['Edema'] == 0) &
             (self.annotations['Pleural Effusion'] == 0)]
         self.abnormal = self.annotations[
             (self.annotations['Atelectasis'] != 0) |
             (self.annotations['Cardiomegaly'] != 0) |
             (self.annotations['Consolidation'] != 0) |
             (self.annotations['Edema'] != 0) |
             (self.annotations['Pleural Effusion'] != 0)]
         # self.normal = self.normal.sample(n=int(labeled_size / 2)).reset_index(drop=True)
         self.normal = self.normal.sample(n=int(labeled_size / 2))
         self.abnormal = self.abnormal.sample(n=int(labeled_size / 2))
         normal_indices = self.normal.index
         abnormal_indices = self.abnormal.index
         self.annotations = self.annotations.drop(normal_indices).drop(
             abnormal_indices)
         # Balanced labeled split: half normal, half abnormal studies.
         self.train_annotations = pd.concat([self.normal, self.abnormal],
                                            ignore_index=True)
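The drop-then-concatenate pattern above keeps the balanced labeled subset and the remaining pool disjoint by removing the sampled rows from the pool by index. The same idea on a toy DataFrame (the column values here are made up for the demonstration):

import pandas as pd

toy = pd.DataFrame({"Edema": [0, 1, 0, 1, 0, 1], "path": list("abcdef")})
normal = toy[toy["Edema"] == 0].sample(n=2, random_state=0)
abnormal = toy[toy["Edema"] != 0].sample(n=2, random_state=0)
# Sampled rows keep their original indices, so dropping them from the pool
# guarantees the labeled split and the pool never overlap.
pool = toy.drop(normal.index).drop(abnormal.index)
labeled = pd.concat([normal, abnormal], ignore_index=True)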
Example #3
 def __init__(self, config, phase='train'):
     self.df = pd.read_csv(config.dataset[phase].csv_path)
     self.phase = phase
     self.transforms = get_transforms(config)
     self.augmentations = get_augmentations(
         config) if 'train' in phase else None
     self.config = config
Example #4
 def __init__(self, csv_path: Path,
              shuffled_annotations: pd.DataFrame) -> None:
     # shuffled_annotations are remaining annotations not used in labeled
     super(ChexpertDatasetUnlabeled, self).__init__()
     self.data_path = Path(csv_path).parent
     unlabeled_size = cfg.DATA.UNLABELED_SIZE
     self.annotations = shuffled_annotations[:unlabeled_size].reset_index(
         drop=True)
     self.height, self.width = 64, 64  # 224, 224
     self.transforms = get_transforms(self.height, self.width, 'train')
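The constructor above only prepares self.annotations and self.transforms. A plausible __len__/__getitem__ pair for this unlabeled variant, assuming the CheXpert-style CSV exposes a "Path" column with image paths relative to the directory holding the CSV (an assumption about the data layout):

from PIL import Image

def __len__(self):
    return len(self.annotations)

def __getitem__(self, idx):
    # Load one unlabeled study and return only the transformed image tensor.
    row = self.annotations.iloc[idx]
    image = Image.open(self.data_path / row["Path"]).convert("RGB")
    return self.transforms(image)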
Example #5
    def __init__(self, csv_path: Path, shuffled_annotations: pd.DataFrame, model) -> None:
        super(ChexpertDatasetUnlabeled, self).__init__()
        self.data_path = Path(csv_path).parent
        labeled_size = cfg.DATA.LABELED_SIZE
        unlabeled_size = cfg.DATA.UNLABELED_SIZE
        self.labeled = shuffled_annotations[:labeled_size]
        self.annotations = shuffled_annotations[labeled_size:labeled_size + unlabeled_size].reset_index(drop=True)

        self.S = []
        self.height, self.width = 224, 224
        self.transforms = get_transforms(self.height, self.width, 'train')
        self.model = model

        self.assign_nearest()
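assign_nearest itself is not shown on this page. Given that the constructor stores a trained model, the labeled slice, and an empty list self.S, one plausible reading is that each unlabeled image is matched to its nearest labeled neighbour in the model's embedding space. A purely hypothetical sketch of that idea; the helper self._embed and everything besides the attribute names are assumptions:

import torch

def assign_nearest(self):
    # Hypothetical: embed labeled and unlabeled rows with the stored model
    # and keep, per unlabeled sample, the index of the closest labeled one.
    # Assumes self._embed(df) returns an (N, D) feature tensor for df.
    self.model.eval()
    with torch.no_grad():
        labeled_feats = self._embed(self.labeled)
        unlabeled_feats = self._embed(self.annotations)
        distances = torch.cdist(unlabeled_feats, labeled_feats)
        self.S = distances.argmin(dim=1).tolist()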
Example #6
 def __init__(self, csv_path: Path, split: str) -> None:
     super(ChexpertDataset, self).__init__()
     self.data_path = Path(csv_path).parent
     self.annotations = pd.read_csv(csv_path).fillna(0)
     self.train_annotations = None
     self.split = split
     self.transforms = None
     self.height, self.width = 64, 64  # 224, 224
     self.transforms = get_transforms(self.height, self.width, split)
     if split == "train":
         assert cfg.DATA.BATCH_SIZE <= cfg.DATA.LABELED_SIZE, "Batch size must not exceed the labeled train size."
         self.annotations = self.annotations.sample(frac=1).reset_index(
             drop=True)
         self.train_annotations = self.annotations[:cfg.DATA.LABELED_SIZE]
Example #7
def get_data_loader(configs):
    data_transforms = get_transforms(resize_size=256, crop_size=224)

    # build dataset
    train_dataset = datasets.ImageFolder(os.path.join(configs.data_path,
                                                      'train'),
                                         transform=data_transforms['train'])
    determin_train_dataset = datasets.ImageFolder(
        os.path.join(configs.data_path, 'train'),
        transform=data_transforms['val'])
    val_dataset = datasets.ImageFolder(os.path.join(configs.data_path, 'val'),
                                       transform=data_transforms['val'])
    test_datasets = {
        'test' + str(i):
        datasets.ImageFolder(os.path.join(configs.data_path, 'test'),
                             transform=data_transforms["test" + str(i)])
        for i in range(10)
    }

    # build dataloader
    train_loader = DataLoader(train_dataset,
                              batch_size=configs.batch_size,
                              shuffle=True,
                              num_workers=configs.num_workers,
                              pin_memory=True)
    determin_train_loader = DataLoader(determin_train_dataset,
                                       batch_size=configs.batch_size,
                                       shuffle=False,
                                       num_workers=configs.num_workers,
                                       pin_memory=True)
    val_loader = DataLoader(val_dataset,
                            batch_size=configs.batch_size,
                            shuffle=False,
                            num_workers=configs.num_workers,
                            pin_memory=True)
    test_loaders = {
        'test' + str(i): DataLoader(test_datasets["test" + str(i)],
                                    batch_size=4,
                                    shuffle=False,
                                    num_workers=configs.num_workers)
        for i in range(10)
    }

    return train_loader, determin_train_loader, val_loader, test_loaders
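A minimal way to drive get_data_loader, assuming a namespace-style configs object with the attributes the function reads (data_path, batch_size, num_workers) and an ImageFolder-compatible directory layout under data_path:

from types import SimpleNamespace

configs = SimpleNamespace(data_path="./data", batch_size=32, num_workers=4)
train_loader, determin_train_loader, val_loader, test_loaders = get_data_loader(configs)
for images, targets in train_loader:
    # First batch of augmented training images and their ImageFolder labels.
    break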
Example #8
    def __init__(self,
                 csv_path: Path,
                 unlabeled_pseudo: str,
                 split="train",
                 imb_type="exp",
                 imb_factor=0.01,
                 unlabel_imb_factor=1,
                 rand_number=543):
        super(SemiSupervisedImbalanceChexpert, self).__init__()
        # unlabeled
        set_seed(rand_number)
        self.data_path = Path(csv_path).parent
        self.annotations = pd.read_csv(csv_path).fillna(0)
        self.split = split
        self.height, self.width = 224, 224
        self.transforms = get_transforms(self.height, self.width, split)
        self.assign_labels()

        if split == "train":
            assert cfg.DATA.BATCH_SIZE <= cfg.DATA.LABELED_SIZE, "Batch size must not exceed the labeled train size."
            self.labeled_annotations = self.annotations.sample(
                frac=1).reset_index(drop=True)
            self.labeled_annotations = self.labeled_annotations[
                :cfg.DATA.LABELED_SIZE]
            self.unlabeled_annotations = self.annotations[
                cfg.DATA.LABELED_SIZE:cfg.DATA.LABELED_SIZE +
                cfg.DATA.UNLABELED_SIZE].reset_index(drop=True)

        self.cls_num = 32
        self.unlabel_size_factor = 5
        self.unlabeled_pseudo = unlabeled_pseudo  # pseudo-labels using model trained on imbalanced data
        self.imb_factor = imb_factor
        self.unlabel_imb_factor = unlabel_imb_factor
        self.num_per_cls_dict = dict()
        img_num_list = self.get_img_num_per_cls(self.cls_num, imb_type,
                                                imb_factor)
        img_num_list_unlabeled = self.get_img_num_per_cls_unlabeled(
            self.cls_num, img_num_list, unlabel_imb_factor)
        self.gen_imbalanced_data(img_num_list, img_num_list_unlabeled)
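get_img_num_per_cls is not reproduced on this page. The usual "exp" profile in long-tailed benchmarks decays the per-class count geometrically from the largest class down to a fraction imb_factor of it; a sketch under the assumption that this code follows that convention:

def get_img_num_per_cls(self, cls_num, imb_type, imb_factor):
    # Assumed convention: for imb_type == "exp", class i keeps roughly
    # img_max * imb_factor ** (i / (cls_num - 1)) samples, so the rarest
    # class ends up with about img_max * imb_factor samples.
    img_max = len(self.annotations) / cls_num
    img_num_per_cls = []
    if imb_type == "exp":
        for cls_idx in range(cls_num):
            num = img_max * (imb_factor ** (cls_idx / (cls_num - 1.0)))
            img_num_per_cls.append(int(num))
    else:
        img_num_per_cls.extend([int(img_max)] * cls_num)
    return img_num_per_cls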
Example #9
    def __init__(self,
                 csv_path: Path,
                 split: str,
                 imb_type="exp",
                 imb_factor=0.01,
                 rand_number=543):
        super(ImbalanceChexpert, self).__init__()
        set_seed(rand_number)
        self.data_path = Path(csv_path).parent
        self.annotations = pd.read_csv(csv_path).fillna(0)
        self.split = split
        self.height, self.width = 224, 224
        self.transforms = get_transforms(self.height, self.width, split)
        self.assign_labels()

        if split == "train":
            assert cfg.DATA.BATCH_SIZE <= cfg.DATA.LABELED_SIZE, "Batch size must not exceed the labeled train size."
            self.annotations = self.annotations.sample(frac=1).reset_index(
                drop=True)
            self.annotations = self.annotations[
                :cfg.DATA.LABELED_SIZE]  # split to be "labelled data"

        img_num_list = self.get_img_num_per_cls(32, imb_type, imb_factor)
        self.gen_imbalanced_data(img_num_list)
Example #10
# target_column has unique values in set -1, 0, 1
# -1 corresponds to the unlabeled data
df = pd.read_csv(args.csv)
labeled = df[df[args.target_column] > -1]
if args.ssl:
    print("Semi-supervised learning model is on...")
    unlabeled = df[df[args.target_column] == -1]

# weights to initialize bias of FC layer of classifier
weight = labeled.groupby(args.target_column).count()["path"] / labeled.shape[0]
weight = torch.Tensor(weight.values).log()

train_labeled, test_labeled = train_test_split(labeled, test_size=args.test_size, stratify=labeled[args.target_column], random_state=args.random_state)

train_transform, valid_transform = get_transforms(img_size=args.image_size)
train_labeled_loader, valid_labeled_loader = get_loaders(
    train_labeled,
    test_labeled,
    train_transform,
    valid_transform,
    target_column=args.target_column,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
    shuffle=True
)

if args.ssl:
    dataset_unlabeled = ImageDataset(unlabeled, train_transform, target_column=None)

loss = LabelSmoothingLoss(num_classes=2, smoothing=0.2, weight=None)
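The comment above notes that weight is meant to initialize the bias of the classifier's final fully connected layer; the tensor holds the log class frequencies, so copying it into the bias starts the network at the empirical class prior. A sketch of that step with a hypothetical stand-in head (the real classifier is not shown in the snippet):

import torch
import torch.nn as nn

classifier_head = nn.Linear(512, 2)  # hypothetical stand-in for the model's FC layer
with torch.no_grad():
    classifier_head.bias.copy_(weight)  # weight holds the log class frequencies, shape (2,)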
Example #11
    parser.add_argument("--check_point",
                        type=str,
                        default=None,
                        help="position of check_point")
    parser.add_argument("--dataset",
                        type=str,
                        default="data/custom/test.txt",
                        help="position of dataset")
    parser.add_argument(
        "--n_cpu",
        type=int,
        default=8,
        help="number of cpu threads to use during batch generation")
    parser.add_argument("--img_size",
                        type=int,
                        default=224,
                        help="size of each image dimension")
    opt = parser.parse_args()
    print(opt)
    # load weights
    model = torch.load(opt.check_point)
    data_transforms = get_transforms()
    # load data
    dataset = ListDataSet("test", opt.dataset, data_transforms["test"])
    testdata = DataLoader(dataset,
                          batch_size=opt.batch_size,
                          shuffle=False,
                          num_workers=opt.n_cpu)
    print(dataset.__len__())
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    evaluation(model, device, testdata)
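evaluation is not part of the snippet; a minimal loop with the same (model, device, testdata) signature could look like the following, purely as an illustration:

import torch

def evaluation(model, device, testdata):
    # Illustrative sketch: report top-1 accuracy over the test DataLoader.
    model.to(device)
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in testdata:
            images, labels = images.to(device), labels.to(device)
            preds = model(images).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    print(f"accuracy: {correct / total:.4f}")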
Example #12
    def __init__(self, cfg):
        super(ModelTaskLightning, self).__init__()
        assert cfg.lr_scheduler == 'poly', 'This implementation relies on an unconventional usage of _LRScheduler class'
        self.cfg = cfg

        dataset_class = resolve_dataset_class(cfg.dataset)

        self.dataset_train = dataset_class(
            cfg.datasets_dir, SPLIT_TRAIN, download=cfg.dataset_download, integrity_check=False
        )
        self.dataset_valid = dataset_class(
            cfg.datasets_dir, SPLIT_VALID, download=cfg.dataset_download, integrity_check=False
        )

        print('Number of samples in training split:', len(self.dataset_train))
        print('Number of samples in validation split:', len(self.dataset_valid))

        self.semseg_num_classes = self.dataset_train.num_classes
        self.semseg_ignore_class = self.dataset_train.ignore_label
        self.semseg_class_names = self.dataset_train.semseg_class_names

        model_class = resolve_network_model(cfg.model_name)
        self.net = model_class(cfg, self.semseg_num_classes)

        self.transforms_train = get_transforms(
            semseg_ignore_class=self.semseg_ignore_class,
            geom_scale_min=cfg.aug_geom_scale_min,
            geom_scale_max=cfg.aug_geom_scale_max,
            geom_tilt_max_deg=cfg.aug_geom_tilt_max_deg,
            geom_wiggle_max_ratio=cfg.aug_geom_wiggle_max_ratio,
            geom_reflect=cfg.aug_geom_reflect,
            crop_random=cfg.aug_input_crop_size,
            rgb_zero_mean_status=True,
            rgb_mean=self.dataset_train.rgb_mean,
            rgb_stddev=self.dataset_train.rgb_stddev,
            stroke_width=cfg.aug_semseg_weak_stroke_width,
        )

        self.transforms_valid = get_transforms(
            semseg_ignore_class=self.semseg_ignore_class,
            crop_for_passable=self.net.bottleneck_stride if not cfg.aug_geom_validation_center_crop_sts else 0,
            crop_center=cfg.aug_geom_validation_center_crop_size if cfg.aug_geom_validation_center_crop_sts else 0,
            rgb_zero_mean_status=True,
            rgb_mean=self.dataset_train.rgb_mean,
            rgb_stddev=self.dataset_train.rgb_stddev,
            stroke_width=cfg.aug_semseg_weak_stroke_width,
        )

        self.dataset_train.set_transforms(self.transforms_train)
        self.dataset_valid.set_transforms(self.transforms_valid)

        class_weights = None
        if cfg.loss_cross_entropy_class_weights_sts:
            class_weights = class_weights_from_histogram(self.dataset_train.semseg_class_histogram)

        self.loss_ce = torch.nn.CrossEntropyLoss(class_weights, ignore_index=self.semseg_ignore_class)

        self.loss_gatedcrf = None
        if cfg.loss_gatedcrf_sts:
            self.loss_gatedcrf = ModelLossSemsegGatedCRF()

        self.poly_lr_sched = None
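The assertion at the top of this constructor requires a "poly" learning-rate schedule, and self.poly_lr_sched is only initialized to None here. The polynomial decay that name usually refers to can be built with a LambdaLR; a sketch under that assumption, with the step count and power as hypothetical parameters:

import torch

def build_poly_lr(optimizer, max_steps, power=0.9):
    # lr(step) = base_lr * (1 - step / max_steps) ** power, the common "poly"
    # schedule in semantic segmentation (assumed, not the original code).
    return torch.optim.lr_scheduler.LambdaLR(
        optimizer, lambda step: max(0.0, 1.0 - step / max_steps) ** power)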