def load_dataset_into_batches(file_dir_path: str, subset: Subset, subset_size: int, shuffle: bool = False):
    """Build a batch generator over one subset of the pets dataset.

    Each sample is run through ``ops.vectorize()`` and then cast to
    ``np.float32``.

    Args:
        file_dir_path: Directory handed to ``PetsDataset``.
        subset: Dataset split to load.
        subset_size: Passed as the second argument of ``BatchGenerator``
            (presumably the number of samples per batch -- TODO confirm).
        shuffle: Passed as the third argument of ``BatchGenerator``.

    Returns:
        The configured ``BatchGenerator``.
    """
    preprocessing = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])
    return BatchGenerator(
        PetsDataset(file_dir_path, subset),
        subset_size,
        shuffle,
        preprocessing,
    )
def main():
    """Exercise ``BatchGenerator`` on the training subset and print samples.

    Checks the number of batches for two batch sizes, the shape and dtype of
    the batches, and prints the first sample of the first batch both without
    and with shuffling.
    """
    training_set = PetsDataset("/home/helmuth/dlvc/cifar-10-batches-py", Subset.TRAINING)

    # Per-sample pipeline: flatten, cast to float32, then (x - 127.5) / 127.5.
    preprocessing = ops.chain([
        ops.vectorize(),
        ops.type_cast(np.float32),
        ops.add(-127.5),
        ops.mul(1/127.5),
    ])

    # A batch size equal to the dataset length must give exactly one batch.
    whole_set_batches = BatchGenerator(training_set, len(training_set), False)
    assert len(whole_set_batches) == 1

    # Batches of 500 samples, not shuffled.
    ordered_batches = BatchGenerator(training_set, 500, False, preprocessing)
    assert len(ordered_batches) == 16

    for position, batch in enumerate(ordered_batches, start=1):
        # The 16th (last) batch is exempt from the shape check.
        if position < 16:
            assert batch.data.shape == (500, 3072)
            assert batch.labels.shape == (500,)
        assert batch.data.dtype == np.float32
        assert np.issubdtype(batch.labels.dtype, np.integer)
        if position == 1:
            print("First batch, first sample, not shuffled")
            print(batch.data[0])

    # Same batch size, shuffling enabled.
    shuffled_batches = BatchGenerator(training_set, 500, True, preprocessing)

    # Start five fresh iterations and show the first sample of each.
    for _ in range(5):
        batch_iter = iter(shuffled_batches)
        print("First batch, first sample, shuffled")
        print(next(batch_iter).data[0])
def test_data_transformation(self):
    """Samples come out flattened to 3072 values with a float32 dtype."""
    transform = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])
    data_root = os.path.join(os.getcwd(), self._data_dir)
    training_set = PetsDataset(data_root, Subset.TRAINING)

    generator = BatchGenerator(training_set, 100, False, transform)
    self.assertEqual(len(generator), 80)

    first_batch = next(iter(generator))
    self.assertEqual(first_batch.data[0].shape, (3072, ))
    self.assertTrue(np.issubdtype(first_batch.data.dtype, np.float32))
def test_train_with_wrong_type_of_labels(self):
    """``train`` must raise ``TypeError`` when labels are a plain list."""
    transform = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])
    data_root = os.path.join(os.getcwd(), self._data_dir)
    training_set = PetsDataset(data_root, Subset.TRAINING)

    generator = BatchGenerator(training_set, 7959, False, transform)
    first_batch = next(iter(generator))

    knn = KnnClassifier(10, 3072, 2)
    # Labels are deliberately passed as a Python list.
    self.assertRaises(TypeError, knn.train, first_batch.data, [0, 1, 0])
def load_dataset(subset: Subset) -> batches.BatchGenerator:
    """Wrap one subset of the pets dataset in a generator of 128-sample batches.

    Per-sample pipeline, in order: ``hwc2chw``, add -127.5, multiply by
    1/127.5, cast to ``np.float32``. The generator's third positional
    argument is ``True`` (presumably the shuffle flag -- TODO confirm).

    Args:
        subset: Dataset split to load.

    Returns:
        The configured ``batches.BatchGenerator``.
    """
    source = PetsDataset('../data/cifar-10-batches-py', subset)
    pipeline = ops.chain([
        ops.hwc2chw(),
        ops.add(-127.5),
        ops.mul(1 / 127.5),
        ops.type_cast(np.float32),
    ])
    return batches.BatchGenerator(source, 128, True, pipeline)
def test_train_with_proper_data(self):
    """``train`` accepts a batch's data and labels.

    There is no explicit assertion; the test passes when ``train`` does not
    raise.
    """
    transform = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])
    data_root = os.path.join(os.getcwd(), self._data_dir)
    training_set = PetsDataset(data_root, Subset.TRAINING)

    generator = BatchGenerator(training_set, 7959, False, transform)
    first_batch = next(iter(generator))

    knn = KnnClassifier(10, 3072, 2)
    knn.train(first_batch.data, first_batch.label)
def test_train_wrong_vector_size_in_data(self):
    """``train`` must raise ``RuntimeError`` for vectors of the wrong width."""
    transform = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])
    data_root = os.path.join(os.getcwd(), self._data_dir)
    training_set = PetsDataset(data_root, Subset.TRAINING)

    generator = BatchGenerator(training_set, 7959, False, transform)
    first_batch = next(iter(generator))

    knn = KnnClassifier(10, 3072, 2)
    # Drop column 100 so every row is one element shorter than in the batch.
    narrowed = np.delete(first_batch.data, 100, 1)
    self.assertRaises(RuntimeError, knn.train, narrowed, first_batch.label)
def test_correctness_of_data_for_train(self):
    """Check batch counts and the leading values of the first training sample.

    A batch size of 7959 must give one batch and a batch size of 500 must
    give 16. The first five values of the first sample of the first
    (unshuffled) batch are compared against fixed reference values.
    """
    op = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])
    dataset = PetsDataset(os.path.join(os.getcwd(), self._data_dir), Subset.TRAINING)

    one_batch_gen = BatchGenerator(dataset, 7959, False, op)
    self.assertEqual(len(one_batch_gen), 1)

    many_batch_gen = BatchGenerator(dataset, 500, False, op)
    self.assertEqual(len(many_batch_gen), 16)

    reference = [116., 125., 125., 91., 101.]
    first_batch = next(iter(many_batch_gen))
    leading_values = first_batch.data[0][:len(reference)]

    # Guard against a short sample silently skipping reference values.
    self.assertEqual(len(leading_values), len(reference))
    # Plain loop: the assertions are side effects, not list elements.
    for actual, expected in zip(leading_values, reference):
        self.assertEqual(actual, expected)
def load_dataset(subset: Subset, augment=False) -> batches.BatchGenerator:
    """Wrap one subset of the pets dataset in a generator of 128-sample batches.

    Pipeline, in order: optional augmentation (``hflip`` followed by
    ``rcrop(32, 12, 'constant')``), multiply by 1/255, cast to
    ``np.float32``, per-channel ``normalize``, ``hwc2chw``.

    Args:
        subset: Dataset split to load.
        augment: When truthy, prepend the augmentation ops to the pipeline.

    Returns:
        The configured ``batches.BatchGenerator``.
    """
    dataset = PetsDataset('../data/cifar-10-batches-py', subset)

    augmentation = [ops.hflip(), ops.rcrop(32, 12, 'constant')] if augment else []

    # The statistics below were labelled "Cifar-10" by the author. The
    # ImageNet alternative noted alongside them was
    # mean (0.485, 0.456, 0.406) and std (0.229, 0.224, 0.225).
    normalisation = [
        ops.mul(1 / 255),
        ops.type_cast(np.float32),
        ops.normalize(mean=np.array([0.41477802, 0.45935813, 0.49693552]),
                      std=np.array([0.25241926, 0.24699265, 0.25279155])),
        ops.hwc2chw(),
    ]

    pipeline = ops.chain(augmentation + normalisation)
    return batches.BatchGenerator(dataset, 128, True, pipeline)
def test_predict_with_proper_data(self):
    """Train on one training batch and check predictions on a validation batch.

    ``predict`` must return one result per validation sample (204), and the
    entries of every result must sum to 1.
    """
    op = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])
    data_root = os.path.join(os.getcwd(), self._data_dir)
    dataset_training = PetsDataset(data_root, Subset.TRAINING)
    dataset_valid = PetsDataset(data_root, Subset.VALIDATION)

    batch_gen_t = BatchGenerator(dataset_training, 795, False, op)
    batch_gen_v = BatchGenerator(dataset_valid, 204, False, op)
    train_batch = next(iter(batch_gen_t))
    valid_batch = next(iter(batch_gen_v))

    classifier = KnnClassifier(10, 3072, 2)
    classifier.train(train_batch.data, train_batch.label)
    results = classifier.predict(valid_batch.data)

    self.assertEqual(len(results), 204)
    for result in results:
        # A floating-point sum need not be exactly 1.0, so compare with a
        # tolerance rather than with strict equality.
        self.assertAlmostEqual(float(np.sum(result)), 1.0, places=5)
# skip last batch if batch_idx == last_batch_idx: continue assert batch.data.shape == (500, 3072), "Batch data shape: " + str( batch.data.shape) + ", expected: (500, 3072)." assert batch.labels.shape == (500, ), "Batch labels shape: " + str( batch.labels.shape) + ", expected: (500,)." batch_idx += 1 # The data type is always np.float32 and the label type is integral (one of the np.int and np.uint variants) # Implemented: for label type np.uint8 since there is less than 256 labels batch_generator = BatchGenerator( dataset_training, 500, False, op=chain([vectorize(), type_cast(dtype=np.float32)])) for batch in batch_generator: assert batch.data.dtype == np.float32, "Batch data type: " + str( batch.data.dtype) + ", expected: np.float32." assert batch.labels.dtype == np.uint8, "Batch labels type: " + str( batch.labels.dtype) + ", expected: np.uint8." # The first sample of the first training batch returned without shuffling # has label 0 ... batch_generator = BatchGenerator( dataset_training, 500, False, op=chain([type_cast(dtype=np.float32), vectorize()])) first_sample_label_unshuffled = None
from dlvc.batches import BatchGenerator from dlvc.test import Accuracy from dlvc.datasets.pets import PetsDataset from dlvc.dataset import Subset import dlvc.ops as ops np.random.seed(0) torch.manual_seed(0) DATA_PATH = "../cifar-10-batches-py/" MODEL_PATH = "best_model.pt" train_data = PetsDataset(DATA_PATH, Subset.TRAINING) val_data = PetsDataset(DATA_PATH, Subset.VALIDATION) op = ops.chain([ ops.type_cast(np.float32), ops.add(-127.5), ops.mul(1 / 127.5), ops.hflip(), ops.rcrop(32, 4, 'constant'), ops.add_noise(), ops.hwc2chw() ]) train_batches = BatchGenerator(train_data, 128, False, op) val_batches = BatchGenerator(val_data, 128, False, op) class Net(nn.Module): def __init__(self, img_size, num_classes): super(Net, self).__init__()
# Training configuration constants.
NUM_CHANNELS = 3
BATCH_SIZE = 128
NUM_CLASSES = 2
EPOCHS = 500
lr = 0.001
# weight decay 0 in this configuration, in part 3 this is changed
wd = 0.0

# One dataset object per split.
# NOTE(review): `dir` is not assigned anywhere in this span; unless it is
# bound earlier in the file this is the builtin `dir` -- confirm.
pets_training = PetsDataset(dir, Subset.TRAINING)
pets_validation = PetsDataset(dir, Subset.VALIDATION)
pets_test = PetsDataset(dir, Subset.TEST)

# All three generators use the same per-sample pipeline: cast to float32,
# add -127.5, multiply by 1/127.5, then hwc2chw. Only the training generator
# is shuffled.
batchGenerator_training = BatchGenerator(pets_training, BATCH_SIZE, shuffle=True,
                                         op=chain([type_cast(dtype=np.float32),
                                                   add(-127.5),
                                                   mul(1 / 127.5),
                                                   hwc2chw()]))
batchGenerator_validation = BatchGenerator(pets_validation, BATCH_SIZE, shuffle=False,
                                           op=chain([type_cast(dtype=np.float32),
                                                     add(-127.5),
                                                     mul(1 / 127.5),
                                                     hwc2chw()]))
batchGenerator_test = BatchGenerator(pets_test, BATCH_SIZE, shuffle=False,
                                     op=chain([type_cast(dtype=np.float32),
                                               add(-127.5),
                                               mul(1 / 127.5),
                                               hwc2chw()]))
import numpy as np

from dlvc.dataset import Subset
from dlvc.datasets.pets import PetsDataset
from dlvc import ops, batches

# Compute the mean and standard deviation of the training images over their
# first three axes and print both.
dataset = PetsDataset('../data/cifar-10-batches-py', Subset.TRAINING)

# Scale values by 1/255 and cast to float32.
op = ops.chain([ops.mul(1 / 255), ops.type_cast(np.float32)])

# Size the batch from the dataset itself instead of hard-coding its length.
batch_generator = batches.BatchGenerator(dataset, len(dataset), True, op)

# Join the batches along the sample axis. This stays correct even if the
# generator yields more than one batch.
training_images = np.concatenate(
    [batch.data for batch in batch_generator], axis=0
).astype(np.float32, copy=False)

# Reduce over every axis except the last one.
train_mean = np.mean(training_images, axis=(0, 1, 2))
train_std = np.std(training_images, axis=(0, 1, 2))
print(train_mean, train_std)
np.random.seed(42) torch.manual_seed(42) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False # Batch size to be used BATCH_SIZE = 128 # Step 1: load the data sets (TRAIN, VALIDATION) train_data = PetsDataset("../cifar-10-batches-py", Subset.TRAINING) val_data = PetsDataset("../cifar-10-batches-py", Subset.VALIDATION) # Operations to standardize # First experiment: scale to [-1,1] op1 = ops.chain([ ops.type_cast(np.float32), ops.add(-127.5), ops.mul(1 / 127.5), ops.hwc2chw() ]) # Second experiment: scale to sample mean=0, sd=1 # calculate average training sample mean & sd op_calc = ops.chain([ops.type_cast(np.float32), ops.mean_sd()]) # using batch generator (could do it directly but I'm lazy) train_full_batch_gen = BatchGenerator(train_data, len(train_data), False, op_calc) train_full_batch = next(b for b in train_full_batch_gen) train_mean_sd = np.mean(train_full_batch.data, axis=0) # create operation to scale op2 = ops.chain([ ops.type_cast(np.float32),
def load_dataset(subset: Subset) -> batches.BatchGenerator:
    """Wrap one subset of the pets dataset in a single-batch generator.

    The batch size equals the dataset length. Each sample is run through
    ``ops.vectorize()`` and cast to ``np.float32``. The generator's third
    positional argument is ``True`` (presumably the shuffle flag -- TODO
    confirm).

    Args:
        subset: Dataset split to load.

    Returns:
        The configured ``batches.BatchGenerator``.
    """
    source = PetsDataset('../data/cifar-10-batches-py', subset)
    flatten_and_cast = ops.chain([
        ops.vectorize(),
        ops.type_cast(np.float32),
    ])
    return batches.BatchGenerator(source, len(source), True, flatten_and_cast)
import torch.nn as nn

from dlvc.datasets.pets import PetsDataset
from dlvc.models.pytorch import CnnClassifier
from dlvc.batches import BatchGenerator
from dlvc.test import Accuracy
from dlvc.dataset import Subset
import dlvc.ops as ops

# Fix NumPy's global random seed.
np.random.seed(0)

pets_train = PetsDataset("../cifar-10-batches-py/", Subset.TRAINING)

# Forward pipeline: cast to float32, add -127.5, multiply by 1/127.5, then
# hflip, rcrop(32, 4, 'constant') and add_noise, and finally hwc2chw.
op = ops.chain([
    ops.type_cast(np.float32),
    ops.add(-127.5),
    ops.mul(1 / 127.5),
    ops.hflip(),
    ops.rcrop(32, 4, 'constant'),
    ops.add_noise(),
    ops.hwc2chw()
])

# Reverse pipeline: chw2hwc, multiply by 127.5, add 127.5, cast to uint8.
# It mirrors the scaling and layout steps of `op`; hflip, rcrop and add_noise
# have no counterpart here.
reverse_op = ops.chain([
    ops.chw2hwc(),
    ops.mul(127.5),
    ops.add(127.5),
    ops.type_cast(np.uint8),
])
def set_parameter(model, freeze_parameters):
    """Optionally freeze every parameter of ``model``.

    When ``freeze_parameters`` is truthy, ``requires_grad`` is set to
    ``False`` on each item yielded by ``model.parameters()``; otherwise the
    model is left untouched.
    """
    if not freeze_parameters:
        return
    for weight in model.parameters():
        weight.requires_grad = False


# Pick the network and its matching preprocessing pipeline. Both pipelines
# start with the same float32 cast, scaling and random crop; the
# transfer-learning one additionally resizes to the size the model reports.
if USE_TRANSFER_LEARNING:
    # there are two networks to use in transfer learning "resnet" and "alexnet"
    net, input_size = initialize_transfer_learning_model(
        "resnet", NUM_CLASSES, FREEZE_CNN_PARAMETERS)
    pad_mode_for_resizing = 'constant'
    op_chain = chain([
        type_cast(dtype=np.float32),
        add(-127.5),
        mul(1 / 127.5),
        rcrop(25, 2, 'median'),
        resize(input_size, pad_mode_for_resizing),
        hwc2chw(),
    ])
else:
    net = CatDogNet()
    op_chain = chain([
        type_cast(dtype=np.float32),
        add(-127.5),
        mul(1 / 127.5),
        rcrop(25, 2, 'median'),
        hwc2chw(),
    ])
# The data and label shapes are (500, 3072) and (500,), respectively, unless for the last batch batch_generator = BatchGenerator(dataset_training, 500, shuffle=False, op=vectorize()) last_batch_idx = len(batch_generator) - 1 batch_idx = 0 for batch in batch_generator: # skip last batch if batch_idx == last_batch_idx: continue assert batch.data.shape == (500, 3072), "Batch data shape: " + str(batch.data.shape) + ", expected: (500, 3072)." assert batch.label.shape == (500,), "Batch labels shape: " + str(batch.label.shape) + ", expected: (500,)." batch_idx += 1 # The data type is always np.float32 and the label type is integral (one of the np.int and np.uint variants) # Implemented: for label type np.uint8 since there is less than 256 labels batch_generator = BatchGenerator(dataset_training, 500, False, op=chain([vectorize(), type_cast(dtype=np.float32)])) for batch in batch_generator: assert batch.data.dtype == np.float32, "Batch data type: " + str(batch.data.dtype) + ", expected: np.float32." assert batch.label.dtype == np.uint8, "Batch labels type: " + str(batch.label.dtype) + ", expected: np.uint8." # The first sample of the first training batch returned without shuffling # has label 0 ... batch_generator = BatchGenerator(dataset_training, 500, False, op=chain([type_cast(dtype=np.float32), vectorize()])) first_sample_label_unshuffled = None first_sample_data_unshuffled = None expected_label = 0 for batch in batch_generator: for label in batch.label: first_sample_label_unshuffled = label break for data in batch.data: