def get_train_dataloader(df: pd.DataFrame, configs: dict):
    # Normalization statistics and augmentations for the training set.
    mean = (configs['mean']['r'], configs['mean']['g'], configs['mean']['b'])
    std = (configs['std']['r'], configs['std']['g'], configs['std']['b'])
    transforms = get_train_augmentations(configs['image_size'], mean=mean, std=std)

    # The face detector is optional; fall back to None when it is not configured.
    try:
        face_detector = configs['face_detector']
    except KeyError:
        face_detector = None

    dataset = Dataset(df, configs['path_root'], transforms, face_detector=face_detector)

    # DataLoader does not allow a sampler together with shuffle=True,
    # so shuffling is disabled when the balance sampler is used.
    if configs['use_balance_sampler']:
        labels = list(df.target.values)
        sampler = BalanceClassSampler(labels, mode="upsampling")
        shuffle = False
    else:
        sampler = None
        shuffle = True

    dataloader = DataLoader(
        dataset,
        batch_size=configs['batch_size'],
        num_workers=configs['num_workers_train'],
        sampler=sampler,
        shuffle=shuffle,
    )
    return dataloader
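# Hedged usage sketch (not from the original project): the key names below mirror
# the lookups inside get_train_dataloader; the concrete values, the CSV path, and
# the `train_df` DataFrame are illustrative assumptions only.
example_configs = {
    'mean': {'r': 0.485, 'g': 0.456, 'b': 0.406},
    'std': {'r': 0.229, 'g': 0.224, 'b': 0.225},
    'image_size': 224,
    'path_root': './data/train',
    'use_balance_sampler': True,
    'batch_size': 32,
    'num_workers_train': 4,
    # 'face_detector' is optional and may be omitted entirely.
}
train_df = pd.read_csv('./data/train_labels.csv')  # hypothetical file with a `target` column
train_loader = get_train_dataloader(train_df, example_configs)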
def test_balance_class_sampler_with_prefetch():
    train_data = MNIST(os.getcwd(), train=True, download=True, transform=ToTensor())
    train_labels = train_data.targets.cpu().numpy().tolist()
    train_sampler = BalanceClassSampler(train_labels, mode=5000)
    valid_data = MNIST(os.getcwd(), train=False, download=True, transform=ToTensor())

    loaders = {
        "train": DataLoader(train_data, sampler=train_sampler, batch_size=32),
        "valid": DataLoader(valid_data, batch_size=32),
    }
    loaders = {k: BatchPrefetchLoaderWrapper(v) for k, v in loaders.items()}

    model = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10))
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.02)

    runner = dl.SupervisedRunner()
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        loaders=loaders,
        num_epochs=1,
        logdir="./logs",
        valid_loader="valid",
        valid_metric="loss",
        minimize_valid_metric=True,
        verbose=True,
    )
def test_balance_class_sampler():
    train_data = MNIST(DATA_ROOT, train=True)
    train_labels = train_data.targets.cpu().numpy().tolist()
    train_sampler = BalanceClassSampler(train_labels, mode=5000)
    valid_data = MNIST(DATA_ROOT, train=False)

    loaders = {
        "train": DataLoader(train_data, sampler=train_sampler, batch_size=32),
        "valid": DataLoader(valid_data, batch_size=32),
    }

    model = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10))
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.02)

    runner = dl.SupervisedRunner()
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        loaders=loaders,
        num_epochs=1,
        logdir="./logs",
        valid_loader="valid",
        valid_metric="loss",
        minimize_valid_metric=True,
        verbose=True,
    )
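# Hedged sketch of what the `mode` argument controls, assuming Catalyst's
# BalanceClassSampler API: an integer mode (as in the tests above) resamples every
# class to that many examples per epoch, while "upsampling"/"downsampling" resample
# every class to the size of the largest/smallest class. The toy labels are made up.
from catalyst.data import BalanceClassSampler

toy_labels = [0] * 100 + [1] * 10  # heavily imbalanced two-class toy labels
print(len(BalanceClassSampler(toy_labels, mode="upsampling")))    # 200 = 2 classes * 100
print(len(BalanceClassSampler(toy_labels, mode="downsampling")))  # 20  = 2 classes * 10
print(len(BalanceClassSampler(toy_labels, mode=5000)))            # 10000 = 2 classes * 5000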
def make_train_and_validation_data_loaders(
    hyper_parameters: Dict,
    validation_fold_number: int = 0,
) -> Tuple[DataLoader, DataLoader]:
    # Load a DataFrame with the files and targets.
    data_set = load_imagenet_data_paths()

    # Split the data set into folds.
    data_set = add_fold_to_data_set(data_set, n_splits=24)

    # Create train and validation data sets.
    train_data_set = DCTDataSet(
        kinds=data_set[data_set["fold"] != validation_fold_number].kind.values,
        files=data_set[data_set["fold"] != validation_fold_number].file.values,
        labels=data_set[data_set["fold"] != validation_fold_number].label.values,
        is_training=True,
    )
    validation_data_set = DCTDataSet(
        kinds=data_set[data_set["fold"] == validation_fold_number].kind.values,
        files=data_set[data_set["fold"] == validation_fold_number].file.values,
        labels=data_set[data_set["fold"] == validation_fold_number].label.values,
        is_training=False,
    )

    # Create train and validation data loaders.
    train_data_loader = DataLoader(
        train_data_set,
        sampler=BalanceClassSampler(labels=train_data_set.get_labels(), mode="downsampling"),
        batch_size=hyper_parameters["batch_size"],
        shuffle=False,
        num_workers=hyper_parameters["training_workers"],
        pin_memory=True,
        drop_last=True,
        collate_fn=collate_fn,
    )
    validation_data_loader = DataLoader(
        validation_data_set,
        batch_size=hyper_parameters["batch_size"],
        shuffle=False,
        num_workers=hyper_parameters["training_workers"],
        pin_memory=True,
        drop_last=True,
        collate_fn=collate_fn,
    )

    return train_data_loader, validation_data_loader
def make_train_and_validation_data_loaders(
    hyper_parameters: Dict,
    validation_fold_number: int = 0,
) -> Tuple[DataLoader, DataLoader]:
    # Define a set of image augmentations.
    # augmentations_train = Compose([VerticalFlip(p=0), HorizontalFlip(p=1)], p=1,)
    # augmentations_validation = Compose([], p=1)
    augmentations_train = None
    augmentations_validation = None

    # Load a DataFrame with the files and targets.
    train_data, val_data = load_cifar_data_paths()

    # Create train and validation data sets.
    train_data_set = DCTDataSet(
        kinds=train_data.kind.values,
        files=train_data.file.values,
        labels=train_data.label.values,
        transforms=augmentations_train,
    )
    validation_data_set = DCTDataSet(
        kinds=val_data.kind.values,
        files=val_data.file.values,
        labels=val_data.label.values,
    )

    # Create train and validation data loaders.
    train_data_loader = DataLoader(
        train_data_set,
        sampler=BalanceClassSampler(labels=train_data_set.get_labels(), mode="downsampling"),
        batch_size=hyper_parameters["batch_size"],
        shuffle=False,
        num_workers=hyper_parameters["training_workers"],
        pin_memory=False,
        drop_last=True,
    )
    validation_data_loader = DataLoader(
        validation_data_set,
        batch_size=hyper_parameters["batch_size"],
        shuffle=False,
        num_workers=hyper_parameters["training_workers"],
        pin_memory=False,
        drop_last=True,
    )

    return train_data_loader, validation_data_loader
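# Hedged usage sketch (not from the original project): only the dictionary key names
# are taken from the hyper_parameters lookups above; the values and the loop are assumptions.
hyper_parameters = {"batch_size": 64, "training_workers": 8}
train_loader, validation_loader = make_train_and_validation_data_loaders(hyper_parameters)

for batch in train_loader:  # classes arrive balanced via the downsampling sampler
    pass  # forward/backward pass would go here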
def get_datasets(
    self,
    stage: str,
    datapath: str = None,
    in_csv: str = None,
    in_csv_train: str = None,
    in_csv_valid: str = None,
    in_csv_infer: str = None,
    train_folds: str = None,
    valid_folds: str = None,
    tag2class: str = None,
    class_column: str = None,
    tag_column: str = None,
    folds_seed: int = 42,
    n_folds: int = 5,
    one_hot_classes: int = None,
    balance_strategy: str = "upsampling",
):
    datasets = collections.OrderedDict()
    tag2class = safitty.load(tag2class) if tag2class is not None else None

    df, df_train, df_valid, df_infer = read_csv_data(
        in_csv=in_csv,
        in_csv_train=in_csv_train,
        in_csv_valid=in_csv_valid,
        in_csv_infer=in_csv_infer,
        train_folds=train_folds,
        valid_folds=valid_folds,
        tag2class=tag2class,
        class_column=class_column,
        tag_column=tag_column,
        seed=folds_seed,
        n_folds=n_folds,
    )

    open_fn = [
        ImageReader(input_key="filepath", output_key="image", rootpath=datapath)
    ]
    if stage.startswith('infer'):
        open_fn.append(ScalarReader(
            input_key="filepath",
            output_key="filepath",
            default_value=-1,
            dtype=str,
        ))
    else:
        open_fn.append(ScalarReader(
            input_key="class",
            output_key="targets",
            default_value=-1,
            dtype=np.int64,
        ))
        if one_hot_classes:
            open_fn.append(
                ScalarReader(
                    input_key="class",
                    output_key="targets_one_hot",
                    default_value=-1,
                    dtype=np.int64,
                    one_hot_classes=one_hot_classes,
                )
            )
    open_fn = ReaderCompose(readers=open_fn)

    for source, mode in zip(
        (df_train, df_valid, df_infer), ("train", "valid", "infer")
    ):
        if source is not None and len(source) > 0:
            dataset = ListDataset(
                source,
                open_fn=open_fn,
                dict_transform=self.get_transforms(stage=stage, dataset=mode),
            )
            if mode == "train":
                # Pair the train dataset with a balancing sampler; the strategy
                # ("upsampling" or "downsampling") comes from the config.
                labels = [x["class"] for x in source]
                sampler = BalanceClassSampler(labels, mode=balance_strategy)
                dataset = {"dataset": dataset, "sampler": sampler}
            datasets[mode] = dataset

    # For the inference stage, expose the validation split as "infer" and drop the rest.
    if stage == 'infer':
        datasets['infer'] = datasets['valid']
        del datasets['valid']
        if 'train' in datasets:
            del datasets['train']

    return datasets
def make_train_and_validation_data_loaders(
    hyper_parameters: Dict,
) -> Tuple[DataLoader, DataLoader]:
    input_data_type = hyper_parameters["input_data_type"]
    validation_fold_number = hyper_parameters["validation_fold_number"]

    if input_data_type == "RGB":
        data_set_class = ColourDataSet

        # Define a set of image augmentations.
        augmentations_train = Compose(
            [
                VerticalFlip(p=0.5),
                HorizontalFlip(p=0.5),
                RandomRotate90(p=0.5),
                Normalize(p=1),
                ToTensorV2(),
            ],
            p=1,
        )
        augmentations_validation = Compose([Normalize(p=1), ToTensorV2()], p=1)

    elif input_data_type == "YCbCr":
        data_set_class = ColourDataSet

        # Define a set of image augmentations.
        augmentations_train = Compose(
            [
                VerticalFlip(p=0.5),
                HorizontalFlip(p=0.5),
                RandomRotate90(p=0.5),
                ToTensorV2(),
            ],
            p=1,
        )
        augmentations_validation = Compose([ToTensorV2()], p=1)
        # augmentations_train = None
        # augmentations_validation = None

    elif input_data_type == "DCT":
        data_set_class = DCTDataSet

        # Define a set of image augmentations.
        # augmentations_train = Compose([VerticalFlip(p=0), HorizontalFlip(p=1)], p=1,)
        # augmentations_validation = Compose([], p=1)
        augmentations_train = None
        augmentations_validation = None

    else:
        raise ValueError(f"Invalid input data type provided: {input_data_type}")

    # Load a DataFrame with the files and targets.
    data_set = load_data(n_classes=hyper_parameters["n_classes"])

    # Split the data set into folds.
    data_set = add_fold_to_data_set(data_set)

    # Create train and validation data sets.
    train_data_set = data_set_class(
        kinds=data_set[data_set["fold"] != validation_fold_number].kind.values,
        image_names=data_set[data_set["fold"] != validation_fold_number].image_name.values,
        labels=data_set[data_set["fold"] != validation_fold_number].label.values,
        n_classes=hyper_parameters["n_classes"],
        transforms=augmentations_train,
        colour_space=input_data_type,
        use_quality_factor=hyper_parameters["use_quality_factor"],
        separate_classes_by_quality_factor=hyper_parameters["separate_classes_by_quality_factor"],
    )
    validation_data_set = data_set_class(
        kinds=data_set[data_set["fold"] == validation_fold_number].kind.values,
        image_names=data_set[data_set["fold"] == validation_fold_number].image_name.values,
        labels=data_set[data_set["fold"] == validation_fold_number].label.values,
        n_classes=hyper_parameters["n_classes"],
        transforms=augmentations_validation,
        colour_space=input_data_type,
        use_quality_factor=hyper_parameters["use_quality_factor"],
        separate_classes_by_quality_factor=hyper_parameters["separate_classes_by_quality_factor"],
    )

    # Create train and validation data loaders.
    train_data_loader = DataLoader(
        train_data_set,
        sampler=BalanceClassSampler(
            labels=train_data_set.get_labels(), mode="downsampling"
        ),
        batch_size=int(
            hyper_parameters["batch_size"] * len(hyper_parameters["devices"])
        ),
        shuffle=False,
        num_workers=hyper_parameters["training_workers"],
        pin_memory=False,
        drop_last=True,
    )
    validation_data_loader = DataLoader(
        validation_data_set,
        batch_size=int(
            hyper_parameters["batch_size"] * len(hyper_parameters["devices"])
        ),
        shuffle=False,
        num_workers=hyper_parameters["training_workers"],
        pin_memory=False,
        drop_last=True,
    )

    return train_data_loader, validation_data_loader
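# Hedged usage sketch: only the key names are taken from the hyper_parameters lookups
# in the function above; the concrete values are illustrative assumptions.
hyper_parameters = {
    "input_data_type": "RGB",          # one of "RGB", "YCbCr", "DCT"
    "validation_fold_number": 0,
    "n_classes": 4,
    "use_quality_factor": False,
    "separate_classes_by_quality_factor": False,
    "batch_size": 16,
    "devices": [0],                    # effective batch size is batch_size * len(devices)
    "training_workers": 8,
}
train_loader, validation_loader = make_train_and_validation_data_loaders(hyper_parameters)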