def test_shuffle(dataset: ILSVRCDataset) -> None:
    """Checks that the sequence reshuffles when an epoch ends.

    Also checks that each image is still paired with the same label after
    the reshuffle.

    :param dataset: the dataset.
    """
    dataset.trim_dataset(DATASET_FRACTION)
    filenames = dataset.partition[TRAIN_KEY]
    labels = dataset.get_labels(filenames, True, NUM_CLASSES)
    train_sequence = ImageDatasetSequence(
        filenames,
        y=labels,
        batch_size=BATCH_SIZE,
        image_target_size=IMAGE_TARGET_SIZE,
        batch_augment_fn=None,
        batch_format_fn=None,
        overfit_single_batch=False,
        shuffle_on_epoch_end=True)

    # Record which label goes with which image before any reshuffle. The
    # flattened pixel values are turned into a tuple so they can be used as
    # a dictionary key.
    label_by_image = {}
    for images, targets in train_sequence:
        for idx, image in enumerate(images):
            label_by_image[tuple(image.flatten())] = tuple(targets[idx])

    # Test shuffle: the first batch must differ once the epoch has ended.
    leading_batch_before = train_sequence[0]
    train_sequence.on_epoch_end()
    leading_batch_after = train_sequence[0]
    assert (leading_batch_before[0] != leading_batch_after[0]).any()

    # Test filename/label mappings: every image must map to the label that
    # was recorded for it before the reshuffle.
    for images, targets in train_sequence:
        for idx, image in enumerate(images):
            assert label_by_image[tuple(image.flatten())] == \
                tuple(targets[idx])
def get_model(dataset_args: Dict[str, Any],
              network_args: Dict[str, Any]) -> ProjectModel:
    """Builds the model from dataset and network arguments.

    Values supplied by the caller take precedence over the defaults.

    :param dataset_args: the dataset arguments; see DEFAULT_DATASET_ARGS for
        available arguments.
    :param network_args: the network arguments; see DEFAULT_NETWORK_ARGS for
        available arguments.
    :return: the model.
    :raises ValueError: if the requested architecture is not recognized.
    """
    # Layer the caller's arguments on top of the defaults.
    merged_dataset_args = {**DEFAULT_DATASET_ARGS, **dataset_args}
    merged_network_args = {**DEFAULT_NETWORK_ARGS, **network_args}
    print('Dataset args: {0}'.format(merged_dataset_args))
    print('Network args: {0}'.format(merged_network_args))

    print('Loading dataset from {0}'.format(DEFAULT_DATASET_PATH))
    dataset = ILSVRCDataset(DEFAULT_DATASET_PATH)
    fraction = merged_dataset_args['dataset_fraction']
    # Only trim when a strict subset of the data was requested.
    if fraction < 1.0:
        dataset.trim_dataset(fraction)

    num_train = dataset.partition[TRAIN_KEY].shape[0]
    print('Num training examples: {0}'.format(num_train))
    num_val = dataset.partition[VAL_KEY].shape[0]
    print('Num validation examples: {0}'.format(num_val))
    num_test = dataset.partition[TEST_KEY].shape[0]
    print('Num test examples: {0}'.format(num_test))

    architecture = merged_network_args['architecture']
    if architecture == ARCHITECTURE_MLP:
        network = MLP(merged_network_args)
    elif architecture == ARCHITECTURE_LENET:
        network = LeNet(merged_network_args)
    else:
        raise ValueError('Unrecognized architecture: {0}'.format(
            architecture))
    return ImageModel(dataset, network)
def test_trim_dataset(dataset: ILSVRCDataset) -> None:
    """Tests that the dataset is being trimmed properly.

    The trimmed dataset should be shuffled so that the classes retain the
    same approximate distribution. The train and validation splits are
    expected to be trimmed and shuffled; the test split is expected to be
    left untouched because trim_test=False is passed.

    :param dataset: the dataset.
    """
    train_size_before = dataset.partition[TRAIN_KEY].shape[0]
    val_size_before = dataset.partition[VAL_KEY].shape[0]
    test_size_before = dataset.partition[TEST_KEY].shape[0]
    # Snapshot the leading filenames with an explicit copy. A bare slice can
    # be a view onto the partition's buffer; if trimming reorders that
    # buffer in place, the "before" values would silently change along with
    # it and the comparisons below would compare the data with itself.
    train_subset_before = dataset.partition[TRAIN_KEY][:5].copy()
    val_subset_before = dataset.partition[VAL_KEY][:5].copy()
    test_subset_before = dataset.partition[TEST_KEY][:5].copy()

    dataset.trim_dataset(DATASET_FRACTION, trim_val=True, trim_test=False)

    train_size_after = dataset.partition[TRAIN_KEY].shape[0]
    val_size_after = dataset.partition[VAL_KEY].shape[0]
    test_size_after = dataset.partition[TEST_KEY].shape[0]
    train_subset_after = dataset.partition[TRAIN_KEY][:5]
    val_subset_after = dataset.partition[VAL_KEY][:5]
    test_subset_after = dataset.partition[TEST_KEY][:5]

    # Check that trimming occurred (or didn't). The trimmed sizes only need
    # to land within DELTA of the requested fraction.
    assert (train_size_before * (DATASET_FRACTION - DELTA)) < \
        train_size_after < \
        (train_size_before * (DATASET_FRACTION + DELTA))
    assert (val_size_before * (DATASET_FRACTION - DELTA)) < \
        val_size_after < \
        (val_size_before * (DATASET_FRACTION + DELTA))
    assert test_size_before == test_size_after

    # Check that the datasets were shuffled (or weren't).
    # We're just going to use the first 5 filenames to check for shuffling;
    # it's extremely unlikely that all are the same after shuffling.
    assert (train_subset_before != train_subset_after).any()
    assert (val_subset_before != val_subset_after).any()
    assert (test_subset_before == test_subset_after).all()
def test_training_reproducible() -> None:
    """Checks that a seeded training run reproduces the recorded history."""
    set_random_seed(SEED)

    dataset_args = dict(dataset_fraction=0.001)
    network_args = dict(input_shape=(128, 128, 3), num_classes=1000)
    train_args = dict(epochs=10, batch_size=32, early_stopping=True)

    dataset = ILSVRCDataset(DEFAULT_DATASET_PATH)
    dataset.trim_dataset(dataset_args['dataset_fraction'])
    model = ProjectModel(dataset, MLP(network_args))

    history = train_model.train_model(model, train_args)
    # The history is compared through its string form against the stored
    # reference value.
    observed = str(history.history)
    assert observed == SEED_HISTORY
def test_images(dataset: ILSVRCDataset) -> None:
    """Checks the shape, dtype and value range of the sequence's batches.

    :param dataset: the dataset.
    """
    dataset.trim_dataset(DATASET_FRACTION)
    filenames = dataset.partition[TRAIN_KEY]
    labels = dataset.get_labels(filenames, True, NUM_CLASSES)
    train_sequence = ImageDatasetSequence(
        filenames,
        y=labels,
        batch_size=BATCH_SIZE,
        image_target_size=IMAGE_TARGET_SIZE,
        batch_augment_fn=None,
        batch_format_fn=None,
        overfit_single_batch=False,
        shuffle_on_epoch_end=True)

    # Only the final batch may differ in length from BATCH_SIZE, and the
    # number of batches yielded must match the sequence's reported length.
    short_batch_seen = False
    batch_count = 0
    for images, targets in train_sequence:
        # If a short batch was already seen, it was not the last one.
        assert not short_batch_seen

        # Images: the first image has the target size plus 3 channels, the
        # batch is float32, and every value lies within [0, 1].
        assert images[0].shape == IMAGE_TARGET_SIZE + (3, )
        assert images.dtype == np.float32
        pixels = images.flatten()
        assert (0.0 <= pixels).all()
        assert (pixels <= 1.0).all()

        # Labels: one column per class, float32, each row summing to one.
        assert targets.shape[1] == NUM_CLASSES
        assert targets.dtype == np.float32
        for one_hot in targets:
            assert sum(one_hot) == 1

        short_batch_seen = (images.shape[0] != BATCH_SIZE
                            or targets.shape[0] != BATCH_SIZE)
        batch_count += 1
    assert batch_count == len(train_sequence)
def test_overfit_single_batch(dataset: ILSVRCDataset) -> None:
    """Checks that overfitting on a single batch always yields that batch.

    :param dataset: the dataset.
    """
    dataset.trim_dataset(DATASET_FRACTION)
    filenames = dataset.partition[TRAIN_KEY]
    labels = dataset.get_labels(filenames, True, NUM_CLASSES)

    def make_overfit_sequence(shuffle):
        # Builds a single-batch-overfit sequence; only the shuffle flag
        # differs between the two scenarios below.
        return ImageDatasetSequence(
            filenames,
            y=labels,
            batch_size=BATCH_SIZE,
            image_target_size=IMAGE_TARGET_SIZE,
            batch_augment_fn=None,
            batch_format_fn=None,
            overfit_single_batch=True,
            shuffle_on_epoch_end=shuffle)

    # Test that you can't set overfit and shuffle flags together: iterating
    # such a sequence is expected to raise a ValueError.
    train_sequence = make_overfit_sequence(True)
    with pytest.raises(ValueError):
        for _ in train_sequence:
            pass

    # Test that you always get the same batch, even after multiple epochs.
    train_sequence = make_overfit_sequence(False)
    first_epoch_batches = 0
    for images, _ in train_sequence:
        assert (images == train_sequence[0][0]).all()
        first_epoch_batches += 1

    train_sequence.on_epoch_end()

    second_epoch_batches = 0
    for images, _ in train_sequence:
        assert (images == train_sequence[0][0]).all()
        second_epoch_batches += 1

    assert first_epoch_batches == second_epoch_batches
def dataset() -> ILSVRCDataset:
    """Creates an ILSVRCDataset constructed with DEFAULT_DATASET_PATH.

    :return: the dataset.
    """
    return ILSVRCDataset(DEFAULT_DATASET_PATH)