Example #1
def get_train_dataloader(df: pd.DataFrame, configs: dict):
    mean = (configs['mean']['r'], configs['mean']['g'], configs['mean']['b'])
    std = (configs['std']['r'], configs['std']['g'], configs['std']['b'])

    transforms = get_train_augmentations(configs['image_size'],
                                         mean=mean,
                                         std=std)

    # An optional face detector; fall back to None when the key is absent.
    face_detector = configs.get('face_detector')
    dataset = Dataset(df,
                      configs['path_root'],
                      transforms,
                      face_detector=face_detector)
    if configs['use_balance_sampler']:
        labels = list(df.target.values)
        sampler = BalanceClassSampler(labels, mode="upsampling")
        shuffle = False
    else:
        sampler = None
        shuffle = True

    dataloader = DataLoader(
        dataset,
        batch_size=configs['batch_size'],
        num_workers=configs['num_workers_train'],
        sampler=sampler,
        shuffle=shuffle,
    )
    return dataloader
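The loader above passes a sampler and therefore keeps shuffle=False, since PyTorch's DataLoader rejects shuffle=True together with a custom sampler. A minimal, self-contained sketch of the same idea, assuming only PyTorch and Catalyst are installed; the toy labels and tensor dataset are invented for illustration:

import torch
from torch.utils.data import DataLoader, TensorDataset
from catalyst.data import BalanceClassSampler

# Toy, heavily imbalanced labels: 90 samples of class 0, 10 of class 1.
labels = [0] * 90 + [1] * 10
features = torch.randn(len(labels), 4)
dataset = TensorDataset(features, torch.tensor(labels))

# "upsampling" repeats minority-class indices until the classes are balanced.
sampler = BalanceClassSampler(labels, mode="upsampling")
loader = DataLoader(dataset, batch_size=16, sampler=sampler, shuffle=False)

for batch_features, batch_labels in loader:
    pass  # each epoch now yields roughly balanced class counts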
Example #2
def test_balance_class_sampler_with_prefetch():
    train_data = MNIST(os.getcwd(),
                       train=True,
                       download=True,
                       transform=ToTensor())
    train_labels = train_data.targets.cpu().numpy().tolist()
    train_sampler = BalanceClassSampler(train_labels, mode=5000)
    valid_data = MNIST(os.getcwd(),
                       train=False,
                       download=True,
                       transform=ToTensor())

    loaders = {
        "train": DataLoader(train_data, sampler=train_sampler, batch_size=32),
        "valid": DataLoader(valid_data, batch_size=32),
    }
    loaders = {k: BatchPrefetchLoaderWrapper(v) for k, v in loaders.items()}

    model = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10))
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.02)

    runner = dl.SupervisedRunner()
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        loaders=loaders,
        num_epochs=1,
        logdir="./logs",
        valid_loader="valid",
        valid_metric="loss",
        minimize_valid_metric=True,
        verbose=True,
    )
Example #3
def test_balance_class_sampler():
    train_data = MNIST(DATA_ROOT, train=True)
    train_labels = train_data.targets.cpu().numpy().tolist()
    train_sampler = BalanceClassSampler(train_labels, mode=5000)
    valid_data = MNIST(DATA_ROOT, train=False)

    loaders = {
        "train": DataLoader(train_data, sampler=train_sampler, batch_size=32),
        "valid": DataLoader(valid_data, batch_size=32),
    }

    model = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10))
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.02)

    runner = dl.SupervisedRunner()
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        loaders=loaders,
        num_epochs=1,
        logdir="./logs",
        valid_loader="valid",
        valid_metric="loss",
        minimize_valid_metric=True,
        verbose=True,
    )
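In the two tests above, mode is the integer 5000 rather than a strategy string; with Catalyst's BalanceClassSampler an integer mode is treated as the number of samples drawn per class each epoch (an assumption based on the usage here, not guaranteed for every library version). A small sketch to check the resulting epoch length on toy labels:

from catalyst.data import BalanceClassSampler

# Three classes with 3, 1 and 1 samples respectively.
labels = [0, 0, 0, 1, 2]
sampler = BalanceClassSampler(labels, mode=4)

# If an integer mode means "samples per class", one epoch should contain
# 4 * 3 = 12 indices; printing avoids asserting version-specific behaviour.
print(len(sampler))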
Example #4
def make_train_and_validation_data_loaders(
    hyper_parameters: Dict,
    validation_fold_number: int = 0,
) -> Tuple[DataLoader, DataLoader]:
    # Load a DataFrame with the files and targets.
    data_set = load_imagenet_data_paths()

    # Split the data set into folds.
    data_set = add_fold_to_data_set(data_set, n_splits=24)

    # Create train and validation data sets.
    train_data_set = DCTDataSet(
        kinds=data_set[data_set["fold"] != validation_fold_number].kind.values,
        files=data_set[data_set["fold"] != validation_fold_number].file.values,
        labels=data_set[
            data_set["fold"] != validation_fold_number].label.values,
        is_training=True,
    )
    validation_data_set = DCTDataSet(
        kinds=data_set[data_set["fold"] == validation_fold_number].kind.values,
        files=data_set[data_set["fold"] == validation_fold_number].file.values,
        labels=data_set[data_set["fold"] ==
                        validation_fold_number].label.values,
        is_training=False,
    )

    # Create train and validation data loaders.
    train_data_loader = DataLoader(
        train_data_set,
        sampler=BalanceClassSampler(labels=train_data_set.get_labels(),
                                    mode="downsampling"),
        batch_size=hyper_parameters["batch_size"],
        shuffle=False,
        num_workers=hyper_parameters["training_workers"],
        pin_memory=True,
        drop_last=True,
        collate_fn=collate_fn,
    )
    validation_data_loader = DataLoader(
        validation_data_set,
        batch_size=hyper_parameters["batch_size"],
        shuffle=False,
        num_workers=hyper_parameters["training_workers"],
        pin_memory=True,
        drop_last=True,
        collate_fn=collate_fn,
    )

    return train_data_loader, validation_data_loader
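Here the sampler runs in "downsampling" mode, which limits every class to roughly the size of the smallest one, so class balance comes entirely from the sampler while shuffle stays False. A minimal sketch of the effect, assuming only Catalyst is installed; the toy labels are invented for illustration:

from collections import Counter
from catalyst.data import BalanceClassSampler

# 100 samples of class 0 versus 10 of class 1.
labels = [0] * 100 + [1] * 10
sampler = BalanceClassSampler(labels, mode="downsampling")

# The indices drawn for one epoch should cover both classes about equally,
# each capped near the minority-class count.
epoch_indices = list(iter(sampler))
print(Counter(labels[i] for i in epoch_indices))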
Example #5
def make_train_and_validation_data_loaders(
    hyper_parameters: Dict,
    validation_fold_number: int = 0,
) -> Tuple[DataLoader, DataLoader]:
    # Define a set of image augmentations.
    # augmentations_train = Compose([VerticalFlip(p=0), HorizontalFlip(p=1)], p=1,)
    # augmentations_validation = Compose([], p=1)
    augmentations_train = None
    augmentations_validation = None

    # Load a DataFrame with the files and targets.
    train_data, val_data = load_cifar_data_paths()

    # Create train and validation data sets.
    train_data_set = DCTDataSet(
        kinds=train_data.kind.values,
        files=train_data.file.values,
        labels=train_data.label.values,
        transforms=augmentations_train,
    )
    validation_data_set = DCTDataSet(
        kinds=val_data.kind.values,
        files=val_data.file.values,
        labels=val_data.label.values,
    )

    # Create train and validation data loaders.
    train_data_loader = DataLoader(
        train_data_set,
        sampler=BalanceClassSampler(labels=train_data_set.get_labels(),
                                    mode="downsampling"),
        batch_size=hyper_parameters["batch_size"],
        shuffle=False,
        num_workers=hyper_parameters["training_workers"],
        pin_memory=False,
        drop_last=True,
    )
    validation_data_loader = DataLoader(
        validation_data_set,
        batch_size=hyper_parameters["batch_size"],
        shuffle=False,
        num_workers=hyper_parameters["training_workers"],
        pin_memory=False,
        drop_last=True,
    )

    return train_data_loader, validation_data_loader
Example #6
    def get_datasets(
        self,
        stage: str,
        datapath: str = None,
        in_csv: str = None,
        in_csv_train: str = None,
        in_csv_valid: str = None,
        in_csv_infer: str = None,
        train_folds: str = None,
        valid_folds: str = None,
        tag2class: str = None,
        class_column: str = None,
        tag_column: str = None,
        folds_seed: int = 42,
        n_folds: int = 5,
        one_hot_classes: int = None,
        balance_strategy: str = "upsampling",
    ):
        datasets = collections.OrderedDict()
        tag2class = safitty.load(tag2class) if tag2class is not None else None

        df, df_train, df_valid, df_infer = read_csv_data(
            in_csv=in_csv,
            in_csv_train=in_csv_train,
            in_csv_valid=in_csv_valid,
            in_csv_infer=in_csv_infer,
            train_folds=train_folds,
            valid_folds=valid_folds,
            tag2class=tag2class,
            class_column=class_column,
            tag_column=tag_column,
            seed=folds_seed,
            n_folds=n_folds,
        )

        open_fn = [
            ImageReader(
                input_key="filepath", output_key="image", rootpath=datapath
            )
        ]

        if stage.startswith('infer'):
            open_fn.append(ScalarReader(
                input_key="filepath",
                output_key="filepath",
                default_value=-1,
                dtype=str,
            ))
        else:
            open_fn.append(ScalarReader(
                input_key="class",
                output_key="targets",
                default_value=-1,
                dtype=np.int64,
            ))

            if one_hot_classes:
                open_fn.append(
                    ScalarReader(
                        input_key="class",
                        output_key="targets_one_hot",
                        default_value=-1,
                        dtype=np.int64,
                        one_hot_classes=one_hot_classes,
                    )
                )

        open_fn = ReaderCompose(readers=open_fn)

        for source, mode in zip(
            (df_train, df_valid, df_infer), ("train", "valid", "infer")
        ):
            if source is not None and len(source) > 0:
                dataset = ListDataset(
                    source,
                    open_fn=open_fn,
                    dict_transform=self.get_transforms(
                        stage=stage, dataset=mode
                    ),
                )
                if mode == "train":
                    labels = [x["class"] for x in source]
                    sampler = BalanceClassSampler(
                        labels, mode=balance_strategy
                    )
                    dataset = {"dataset": dataset, "sampler": sampler}
                datasets[mode] = dataset

        if stage == 'infer':
            datasets['infer'] = datasets['valid']
            del datasets['valid']
            if 'train' in datasets:
                del datasets['train']


        return datasets
Example #7
def make_train_and_validation_data_loaders(
    hyper_parameters: Dict,
) -> Tuple[DataLoader, DataLoader]:
    input_data_type = hyper_parameters["input_data_type"]
    validation_fold_number = hyper_parameters["validation_fold_number"]

    if input_data_type == "RGB":
        data_set_class = ColourDataSet
        # Define a set of image augmentations.
        augmentations_train = Compose(
            [
                VerticalFlip(p=0.5),
                HorizontalFlip(p=0.5),
                RandomRotate90(p=0.5),
                Normalize(p=1),
                ToTensorV2(),
            ],
            p=1,
        )
        augmentations_validation = Compose([Normalize(p=1), ToTensorV2()], p=1)
    elif input_data_type == "YCbCr":
        data_set_class = ColourDataSet
        # Define a set of image augmentations.
        augmentations_train = Compose(
            [
                VerticalFlip(p=0.5),
                HorizontalFlip(p=0.5),
                RandomRotate90(p=0.5),
                ToTensorV2(),
            ],
            p=1,
        )
        augmentations_validation = Compose([ToTensorV2()], p=1)
        # augmentations_train = None
        # augmentations_validation = None
    elif input_data_type == "DCT":
        data_set_class = DCTDataSet
        # Define a set of image augmentations.
        # augmentations_train = Compose([VerticalFlip(p=0), HorizontalFlip(p=1)], p=1,)
        # augmentations_validation = Compose([], p=1)
        augmentations_train = None
        augmentations_validation = None
    else:
        raise ValueError(
            f"Invalid input data type provided: {input_data_type}"
        )

    # Load a DataFrame with the files and targets.
    data_set = load_data(n_classes=hyper_parameters["n_classes"])

    # Split the data set into folds.
    data_set = add_fold_to_data_set(data_set)

    # Create train and validation data sets.
    train_data_set = data_set_class(
        kinds=data_set[data_set["fold"] != validation_fold_number].kind.values,
        image_names=data_set[
            data_set["fold"] != validation_fold_number
        ].image_name.values,
        labels=data_set[
            data_set["fold"] != validation_fold_number
        ].label.values,
        n_classes=hyper_parameters["n_classes"],
        transforms=augmentations_train,
        colour_space=input_data_type,
        use_quality_factor=hyper_parameters["use_quality_factor"],
        separate_classes_by_quality_factor=hyper_parameters[
            "separate_classes_by_quality_factor"
        ],
    )
    validation_data_set = data_set_class(
        kinds=data_set[data_set["fold"] == validation_fold_number].kind.values,
        image_names=data_set[
            data_set["fold"] == validation_fold_number
        ].image_name.values,
        labels=data_set[
            data_set["fold"] == validation_fold_number
        ].label.values,
        n_classes=hyper_parameters["n_classes"],
        transforms=augmentations_validation,
        colour_space=input_data_type,
        use_quality_factor=hyper_parameters["use_quality_factor"],
        separate_classes_by_quality_factor=hyper_parameters[
            "separate_classes_by_quality_factor"
        ],
    )

    # Create train and validation data loaders.
    train_data_loader = DataLoader(
        train_data_set,
        sampler=BalanceClassSampler(
            labels=train_data_set.get_labels(), mode="downsampling"
        ),
        batch_size=int(
            hyper_parameters["batch_size"] * len(hyper_parameters["devices"])
        ),
        shuffle=False,
        num_workers=hyper_parameters["training_workers"],
        pin_memory=False,
        drop_last=True,
    )
    validation_data_loader = DataLoader(
        validation_data_set,
        batch_size=int(
            hyper_parameters["batch_size"] * len(hyper_parameters["devices"])
        ),
        shuffle=False,
        num_workers=hyper_parameters["training_workers"],
        pin_memory=False,
        drop_last=True,
    )

    return train_data_loader, validation_data_loader