def _verify_dataset_integrity(dataset_folder, disable_dataset_integrity, enable_deep_dataset_integrity):
    """
    Verifies dataset integrity by looking at the footprint.json in the dataset folder.
    If the deep check is enabled, the program is stopped when the check does not pass.

    Parameters
    ----------
    dataset_folder : string
        Path string that points to the three folders train/val/test. Example: ~/../../data/svhn
    disable_dataset_integrity : boolean
        Flag to enable or disable verifying the dataset integrity
    enable_deep_dataset_integrity : boolean
        Flag to enable or disable verifying the dataset integrity in a deep fashion
        (check the hashes of all files)

    Returns
    -------
    None
    """
    if not disable_dataset_integrity:
        if enable_deep_dataset_integrity:
            # Deep check: abort if the per-file hash verification fails
            if not verify_integrity_deep(dataset_folder):
                sys.exit(-1)
        else:
            # Quick check based on the footprint only
            verify_integrity_quick(dataset_folder)
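# Illustrative usage sketch (not part of the original module; the path and flag values below are
# placeholder assumptions):
#
#   _verify_dataset_integrity(dataset_folder='/data/svhn',
#                             disable_dataset_integrity=False,
#                             enable_deep_dataset_integrity=True)
#
# With these flags the deep check runs and the process exits with -1 if the check does not pass.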
def set_up_dataloaders(model_expected_input_size, dataset_folder, batch_size, workers,
                       disable_dataset_integrity, enable_deep_dataset_integrity, inmem=False, **kwargs):
    """
    Set up the dataloaders for the specified datasets.

    Parameters
    ----------
    model_expected_input_size : tuple
        Specify the height and width that the model expects.
    dataset_folder : string
        Path string that points to the three folders train/val/test. Example: ~/../../data/svhn
    batch_size : int
        Number of datapoints to process at once
    workers : int
        Number of workers to use for the dataloaders
    disable_dataset_integrity : boolean
        Flag to enable or disable verifying the dataset integrity
    enable_deep_dataset_integrity : boolean
        Flag to enable or disable verifying the dataset integrity in a deep fashion
        (check the hashes of all files)
    inmem : boolean
        Flag: if False, the dataset is loaded in an online fashion i.e. only file names are stored
        and images are loaded on demand. This is slower than storing everything in memory.

    Returns
    -------
    train_loader : torch.utils.data.DataLoader
    val_loader : torch.utils.data.DataLoader
    test_loader : torch.utils.data.DataLoader
        Dataloaders for train, val and test.
    int
        Number of classes for the model.
    """
    # Recover dataset name
    dataset = os.path.basename(os.path.normpath(dataset_folder))
    logging.info('Loading {} from: {}'.format(dataset, dataset_folder))

    ###############################################################################################
    # Verify dataset integrity
    if not disable_dataset_integrity:
        if enable_deep_dataset_integrity:
            if not verify_integrity_deep(dataset_folder):
                sys.exit(-1)
        else:
            if not verify_integrity_quick(dataset_folder):
                sys.exit(-1)

    ###############################################################################################
    # Load the dataset splits as images
    try:
        logging.debug("Try to load dataset as images")
        train_ds, val_ds, test_ds = image_folder_dataset.load_dataset(dataset_folder, inmem, workers)

        # Load the analytics csv and extract mean and std
        mean, std = _load_mean_std_from_file(dataset_folder, inmem, workers)

        # Set up dataset transforms
        logging.debug('Setting up dataset transforms')
        transform = transforms.Compose([
            transforms.Resize(model_expected_input_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=mean, std=std)
        ])

        train_ds.transform = transform
        val_ds.transform = transform
        test_ds.transform = transform

        train_loader, val_loader, test_loader = _dataloaders_from_datasets(batch_size, train_ds, val_ds,
                                                                           test_ds, workers)
        logging.info("Dataset loaded as images")
        return train_loader, val_loader, test_loader, len(train_ds.classes)

    except RuntimeError:
        logging.debug("No images found in dataset folder provided")

    ###############################################################################################
    # Load the dataset splits as bidimensional data
    try:
        logging.debug("Try to load dataset as bidimensional")
        train_ds, val_ds, test_ds = bidimensional_dataset.load_dataset(dataset_folder)

        # Load the analytics csv and extract mean and std
        # TODO: update bidimensional to work with new load_mean_std functions
        mean, std = _load_mean_std_from_file(dataset_folder, inmem, workers)

        # Bring mean and std into range [0:1] from original domain
        mean = np.divide((mean - train_ds.min_coords), np.subtract(train_ds.max_coords, train_ds.min_coords))
        std = np.divide((std - train_ds.min_coords), np.subtract(train_ds.max_coords, train_ds.min_coords))

        # Set up dataset transforms
        logging.debug('Setting up dataset transforms')
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=mean, std=std)
        ])

        train_ds.transform = transform
        val_ds.transform = transform
        test_ds.transform = transform

        train_loader, val_loader, test_loader = _dataloaders_from_datasets(batch_size, train_ds, val_ds,
                                                                           test_ds, workers)
        logging.info("Dataset loaded as bidimensional data")
        return train_loader, val_loader, test_loader, len(train_ds.classes)

    except RuntimeError:
        logging.debug("No bidimensional data found in dataset folder provided")

    ###############################################################################################
    # Verify that eventually a dataset has been correctly loaded
    logging.error("No datasets have been loaded. Verify dataset folder location or dataset folder structure")
    sys.exit(-1)
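# Hedged usage sketch (assumption, not part of the original module): one way set_up_dataloaders
# could be driven from a script. The dataset path, expected input size and hyperparameters below
# are hypothetical placeholders.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    train_loader, val_loader, test_loader, num_classes = set_up_dataloaders(
        model_expected_input_size=(32, 32),   # height/width the model expects (assumed)
        dataset_folder='/data/svhn',          # hypothetical dataset location
        batch_size=64,
        workers=4,
        disable_dataset_integrity=False,
        enable_deep_dataset_integrity=False,
        inmem=False)
    logging.info("Loaded {} classes".format(num_classes))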