def main():
    """Sanity-check the three PetsDataset subsets and dump ten sample images.

    Verifies the subset sizes, the overall number of samples per class, the
    shape and dtype of the first training image and the first ten training
    labels, then writes the first ten training images to ``sample0.png`` ..
    ``sample9.png`` in the current working directory.
    """
    root = "/home/helmuth/dlvc/cifar-10-batches-py"
    training = PetsDataset(root, Subset.TRAINING)
    validation = PetsDataset(root, Subset.VALIDATION)
    test = PetsDataset(root, Subset.TEST)

    # check length of datasets
    assert len(training) == 7959
    assert len(validation) == 2041
    assert len(test) == 2000

    # count cats (label 0) and dogs (any other label) over all three subsets
    cats = 0
    dogs = 0
    for subset in (training, test, validation):
        for sample in subset:
            if sample.label == 0:
                cats += 1
            else:
                dogs += 1
    assert cats == 6000
    assert dogs == 6000

    # image layout of the first training sample
    assert training[0].data.shape == (32, 32, 3)
    assert training[0].data.dtype == np.uint8

    # labels of the first ten training samples
    first_labels = []
    for index in range(10):
        first_labels.append(training[index].label)
    assert first_labels == [0, 0, 0, 0, 1, 0, 0, 0, 0, 1]

    # write the first ten training images to disk for visual inspection
    for index in range(10):
        cv2.imwrite('sample' + str(index) + '.png', training[index].data)
def test_correctness_of_data(self):
    """Check subset sizes, image layout, first labels and channel order.

    The channel-order check does not display anything: it compares the
    summed intensity of channel 2 against channel 0 of training sample 2
    and expects channel 2 (red, if the data is BGR) to be larger.
    """
    data_root = os.path.join(os.getcwd(), self._data_dir)
    training_set = PetsDataset(data_root, Subset.TRAINING)
    validation_set = PetsDataset(data_root, Subset.VALIDATION)
    test_set = PetsDataset(data_root, Subset.TEST)

    # Number of samples in the individual data sets
    expected_sizes = (
        (training_set, 7959),
        (validation_set, 2041),
        (test_set, 2000),
    )
    for subset, expected in expected_sizes:
        self.assertEqual(len(subset), expected)

    # Image shape and type
    self.assertEqual(test_set[3].data.shape, (32, 32, 3))
    self.assertEqual(test_set[3].data.dtype, 'uint8')

    # Labels of the first 10 training samples
    first_labels = [training_set[i].label for i in range(0, 10)]
    self.assertEqual(first_labels, [0, 0, 0, 0, 1, 0, 0, 0, 0, 1])

    # OpenCV uses BGR order: index 0 is blue, index 2 is red.
    planes = cv.split(training_set[2].data)
    blue_total = np.sum(planes[0])
    red_total = np.sum(planes[2])
    self.assertTrue(red_total > blue_total)
def load_dataset_into_batches(file_dir_path: str, subset: Subset, subset_size: int, shuffle: bool = False):
    """Wrap one PetsDataset subset in a BatchGenerator.

    Args:
        file_dir_path: Directory handed to ``PetsDataset``.
        subset: Which subset to load.
        subset_size: Passed to ``BatchGenerator`` as its batch-size argument.
        shuffle: Passed through to ``BatchGenerator``.

    Returns:
        A ``BatchGenerator`` whose samples go through ``vectorize`` followed
        by a cast to ``np.float32``.
    """
    to_float_vector = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])
    pets = PetsDataset(file_dir_path, subset)
    return BatchGenerator(pets, subset_size, shuffle, to_float_vector)
def main():
    """Exercise BatchGenerator on the training subset.

    Checks the number of batches for a full-size and a 500-sample batch
    size, validates shape and dtype of the generated batches, and prints the
    first sample of the first batch for the unshuffled generator once and
    for the shuffled generator five times.
    """
    data = PetsDataset("/home/helmuth/dlvc/cifar-10-batches-py", Subset.TRAINING)

    # ops chain: vectorize, cast to float32, then shift by -127.5 and scale by 1/127.5
    op = ops.chain([
        ops.vectorize(),
        ops.type_cast(np.float32),
        ops.add(-127.5),
        ops.mul(1/127.5),
    ])

    # batch generator #1: the whole subset in a single batch
    whole_set = BatchGenerator(data, len(data), False)
    assert len(whole_set) == 1

    # batch generator #2: 500 samples per batch, unshuffled
    unshuffled = BatchGenerator(data, 500, False, op)
    assert len(unshuffled) == 16

    for position, batch in enumerate(unshuffled, start=1):
        # only the first 15 batches are checked for the full 500-sample shape
        if position < 16:
            assert batch.data.shape == (500, 3072)
            assert batch.labels.shape == (500,)
        assert batch.data.dtype == np.float32
        assert np.issubdtype(batch.labels.dtype, np.integer)
        if position == 1:
            print("First batch, first sample, not shuffled")
            print(batch.data[0])

    # batch generator #3: same batch size, shuffled
    shuffled = BatchGenerator(data, 500, True, op)

    # restart the shuffled generator 5 times and show its first sample each time
    for _ in range(5):
        batch_iter = iter(shuffled)
        print("First batch, first sample, shuffled")
        print(next(batch_iter).data[0])
def test_data_transformation(self):
    """The vectorize + float32 chain must yield flat float32 samples.

    Uses 100-sample batches of the training subset (expected: 80 batches)
    and inspects the first sample of the first batch.
    """
    pets = PetsDataset(os.path.join(os.getcwd(), self._data_dir), Subset.TRAINING)
    flatten_to_float = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])
    generator = BatchGenerator(pets, 100, False, flatten_to_float)
    self.assertEqual(len(generator), 80)

    first_batch = next(iter(generator))
    self.assertEqual(first_batch.data[0].shape, (3072, ))
    self.assertTrue(np.issubdtype(first_batch.data.dtype, np.float32))
def test_create_batch(self):
    """Unshuffled 100-sample batches start at the expected sample indices.

    The training subset must split into 80 batches; the first entry of
    ``idx`` is expected to be 9 for the first batch and 607 for the second.
    """
    pets = PetsDataset(os.path.join(os.getcwd(), self._data_dir), Subset.TRAINING)
    generator = BatchGenerator(pets, 100, False)
    self.assertEqual(len(generator), 80)

    batch_stream = iter(generator)
    for expected_first_idx in (9, 607):
        batch = next(batch_stream)
        self.assertEqual(batch.idx[0], expected_first_idx)
def test_shuffle(self):
    """Shuffled batches must not start with the unshuffled first indices.

    NOTE(review): the check is probabilistic -- a shuffle could place index 9
    (or 607) first by chance, so a rare spurious failure looks possible.
    """
    pets = PetsDataset(os.path.join(os.getcwd(), self._data_dir), Subset.TRAINING)
    generator = BatchGenerator(pets, 100, True)
    self.assertEqual(len(generator), 80)

    batch_stream = iter(generator)
    # 9 and 607 are the first indices of batch 1 and 2 without shuffling
    for unshuffled_first_idx in (9, 607):
        batch = next(batch_stream)
        self.assertFalse(batch.idx[0] == unshuffled_first_idx)
def test_train_with_proper_data(self):
    """Training a KnnClassifier on the full training batch must not raise.

    There is no explicit assertion; the test passes when ``train`` returns.
    """
    pets = PetsDataset(os.path.join(os.getcwd(), self._data_dir), Subset.TRAINING)
    flatten_to_float = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])
    # 7959 is the batch size, so the first batch holds the whole subset
    generator = BatchGenerator(pets, 7959, False, flatten_to_float)
    full_batch = next(iter(generator))

    knn = KnnClassifier(10, 3072, 2)
    knn.train(full_batch.data, full_batch.label)
def load_dataset(subset: Subset) -> batches.BatchGenerator:
    """Return a shuffled, 128-sample BatchGenerator for the given subset.

    Each sample goes through ``hwc2chw``, is shifted by -127.5, scaled by
    1/127.5 and finally cast to ``np.float32``.
    """
    pets = PetsDataset('../data/cifar-10-batches-py', subset)
    steps = [
        ops.hwc2chw(),
        ops.add(-127.5),
        ops.mul(1 / 127.5),
        ops.type_cast(np.float32),
    ]
    return batches.BatchGenerator(pets, 128, True, ops.chain(steps))
def test_train_with_wrong_type_of_labels(self):
    """``train`` must raise TypeError when labels are a plain Python list."""
    pets = PetsDataset(os.path.join(os.getcwd(), self._data_dir), Subset.TRAINING)
    flatten_to_float = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])
    generator = BatchGenerator(pets, 7959, False, flatten_to_float)
    full_batch = next(iter(generator))

    knn = KnnClassifier(10, 3072, 2)
    # resolve the arguments outside the guarded block so only train() is checked
    features = full_batch.data
    list_labels = [0, 1, 0]
    with self.assertRaises(TypeError):
        knn.train(features, list_labels)
def test_train_wrong_vector_size_in_data(self):
    """``train`` must raise RuntimeError when the feature vectors are too short.

    Column 100 is removed from the data, so each row has one element fewer
    than the 3072 the classifier was constructed with.
    """
    pets = PetsDataset(os.path.join(os.getcwd(), self._data_dir), Subset.TRAINING)
    flatten_to_float = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])
    generator = BatchGenerator(pets, 7959, False, flatten_to_float)
    full_batch = next(iter(generator))

    knn = KnnClassifier(10, 3072, 2)
    truncated = np.delete(full_batch.data, 100, 1)
    # resolve the labels outside the guarded block so only train() is checked
    labels = full_batch.label
    with self.assertRaises(RuntimeError):
        knn.train(truncated, labels)
def test_correctness_of_data_for_train(self):
    """Check batch counts and the first values of the first training sample.

    With the vectorize + float32 chain, a batch size of 7959 must give one
    batch and a batch size of 500 must give 16. The first five values of the
    first sample of the first 500-sample batch are compared against known
    reference values.
    """
    op = ops.chain([
        ops.vectorize(),
        ops.type_cast(np.float32)
    ])
    dataset = PetsDataset(os.path.join(os.getcwd(), self._data_dir), Subset.TRAINING)

    one_batch_gen = BatchGenerator(dataset, 7959, False, op)
    self.assertEqual(len(one_batch_gen), 1)

    many_batch_gen = BatchGenerator(dataset, 500, False, op)
    self.assertEqual(len(many_batch_gen), 16)

    reference = [116., 125., 125., 91., 101.]
    first_batch = next(iter(many_batch_gen))
    # Compare the slice as a whole: a sample with fewer than five values now
    # fails instead of silently skipping the missing positions.
    observed = np.asarray(first_batch.data[0][:5]).tolist()
    self.assertEqual(observed, reference)
def test_predict_with_proper_data(self):
    """Predictions for a validation batch must be one score vector per sample.

    The classifier is trained on the first 795-sample training batch and
    asked to predict the first 204-sample validation batch. Each returned
    row is expected to sum to 1.
    """
    op = ops.chain([
        ops.vectorize(),
        ops.type_cast(np.float32)
    ])
    data_root = os.path.join(os.getcwd(), self._data_dir)
    dataset_training = PetsDataset(data_root, Subset.TRAINING)
    dataset_valid = PetsDataset(data_root, Subset.VALIDATION)

    batch_gen_t = BatchGenerator(dataset_training, 795, False, op)
    batch_gen_v = BatchGenerator(dataset_valid, 204, False, op)
    train_batch = next(iter(batch_gen_t))
    valid_batch = next(iter(batch_gen_v))

    classifier = KnnClassifier(10, 3072, 2)
    classifier.train(train_batch.data, train_batch.label)
    results = classifier.predict(valid_batch.data)

    self.assertEqual(len(results), 204)
    for result in results:
        # Scores are floating point; exact equality with 1.0 can fail on
        # rounding alone, so compare with a small tolerance instead.
        self.assertAlmostEqual(float(np.sum(result)), 1.0, places=6)
def load_dataset(subset: Subset, augment=False) -> batches.BatchGenerator:
    """Return a shuffled, 128-sample BatchGenerator for the given subset.

    Args:
        subset: Which PetsDataset subset to load.
        augment: When true, ``hflip`` and ``rcrop(32, 12, 'constant')`` are
            put in front of the preprocessing steps.

    Returns:
        A ``BatchGenerator`` whose samples are scaled by 1/255, cast to
        ``np.float32``, normalized per channel and passed through ``hwc2chw``.
    """
    pets = PetsDataset('../data/cifar-10-batches-py', subset)

    pipeline = [ops.hflip(), ops.rcrop(32, 12, 'constant')] if augment else []
    pipeline.extend([
        ops.mul(1 / 255),
        ops.type_cast(np.float32),
        # Statistics labelled "Cifar-10" by the original author. The ImageNet
        # alternative was mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225].
        ops.normalize(mean=np.array([0.41477802, 0.45935813, 0.49693552]),
                      std=np.array([0.25241926, 0.24699265, 0.25279155])),
        ops.hwc2chw(),
    ])
    return batches.BatchGenerator(pets, 128, True, ops.chain(pipeline))
from dlvc.test import Accuracy from dlvc.datasets.pets import PetsDataset from dlvc.dataset import Subset import dlvc.ops as ops np.random.seed(0) torch.manual_seed(0) DATA_PATH = "../cifar-10-batches-py/" RESULTS_FILE = "results.txt" NR_EPOCHS = 100 EARLY_STOPPING = 10 CUDA = torch.cuda.is_available() train_data = PetsDataset(DATA_PATH, Subset.TRAINING) val_data = PetsDataset(DATA_PATH, Subset.VALIDATION) op_all_augmentation = ops.chain([ ops.type_cast(np.float32), ops.add(-127.5), ops.mul(1 / 127.5), ops.hflip(), ops.rcrop(32, 4, 'constant'), ops.add_noise(), ops.hwc2chw() ]) op_augmentation_crop_flip = ops.chain([ ops.type_cast(np.float32), ops.add(-127.5),
def load_dataset(subset: Subset) -> batches.BatchGenerator:
    """Return a shuffled BatchGenerator holding the whole subset in one batch.

    The batch size is ``len(dataset)``; samples go through ``vectorize`` and
    are cast to ``np.float32``.
    """
    pets = PetsDataset('../data/cifar-10-batches-py', subset)
    flatten_to_float = ops.chain([
        ops.vectorize(),
        ops.type_cast(np.float32),
    ])
    whole_set_size = len(pets)
    return batches.BatchGenerator(pets, whole_set_size, True, flatten_to_float)
from dlvc.batches import BatchGenerator from dlvc.ops import vectorize, chain, type_cast import cv2 ''' All small and quick tests go here. ''' dir = '/Users/mmatak/dev/college/DLVC/cifar-10/cifar-10-batches-py/' ########################################## # PART 1 # ########################################## # Number of samples in the individual datasets: 7959 (training), 2041 (validation), 2000 (test) dataset_test = PetsDataset(dir, Subset.TEST) assert (len(dataset_test) == TEST_SIZE ), "Number of elements in test_dataset is different than " % TEST_SIZE dataset_training = PetsDataset(dir, Subset.TRAINING) assert(len(dataset_training) == TRAINING_SIZE),\ "Number of elements in training_dataset is different than " % TRAINING_SIZE dataset_validation = PetsDataset(dir, Subset.VALIDATION) assert(len(dataset_validation) == VALIDATION_SIZE), \ "Number of elements in validation_dataset is different than " % VALIDATION_SIZE # Total number of cat and dog samples: 6000 per class def count_labels(dataset):
def test_batch_size_is_not_integer_exception(self):
    """BatchGenerator must raise TypeError for a non-integer batch size (50.5)."""
    pets = PetsDataset(os.path.join(os.getcwd(), self._data_dir), Subset.TEST)
    with self.assertRaises(TypeError):
        BatchGenerator(pets, 50.5, False)
from dlvc.dataset import Subset

# Pairs a trained model with the accuracy it reached.
TrainedModel = namedtuple('TrainedModel', ['model', 'accuracy'])

# Seed every RNG in use and pin the cuDNN flags so runs are repeatable.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Batch size to be used
BATCH_SIZE = 128

# Step 1: load the data sets (TRAIN, VALIDATION)
train_data = PetsDataset("../cifar-10-batches-py", Subset.TRAINING)
val_data = PetsDataset("../cifar-10-batches-py", Subset.VALIDATION)

# Standardization needs the training mean and sd, so run the training set
# through a chain ending in ops.mean_sd(), using one batch that spans the
# whole subset.
op_calc = ops.chain([ops.type_cast(np.float32), ops.mean_sd()])
train_full_batch_gen = BatchGenerator(train_data, len(train_data), False, op_calc)
import torch
import torch.nn as nn

from dlvc.models.pytorch import CnnClassifier
from dlvc.batches import BatchGenerator
from dlvc.test import Accuracy
from dlvc.datasets.pets import PetsDataset
from dlvc.dataset import Subset
import dlvc.ops as ops

np.random.seed(0)
torch.manual_seed(0)

DATA_PATH = "../cifar-10-batches-py/"
MODEL_PATH = "best_model.pt"

train_data = PetsDataset(DATA_PATH, Subset.TRAINING)
val_data = PetsDataset(DATA_PATH, Subset.VALIDATION)

# Training chain: cast to float32, shift by -127.5 and scale by 1/127.5, then
# the augmentation steps (hflip, rcrop, add_noise) and the HWC -> CHW switch.
op = ops.chain([
    ops.type_cast(np.float32),
    ops.add(-127.5),
    ops.mul(1 / 127.5),
    ops.hflip(),
    ops.rcrop(32, 4, 'constant'),
    ops.add_noise(),
    ops.hwc2chw()
])

# Validation chain: the same preprocessing without the augmentation steps.
# NOTE(review): assumes hflip/rcrop/add_noise are randomized augmentations;
# applying them to validation data would distort the validation scores.
val_op = ops.chain([
    ops.type_cast(np.float32),
    ops.add(-127.5),
    ops.mul(1 / 127.5),
    ops.hwc2chw()
])

train_batches = BatchGenerator(train_data, 128, False, op)
val_batches = BatchGenerator(val_data, 128, False, val_op)
def test_negative_batch_size_exception(self):
    """BatchGenerator must raise ValueError for a negative batch size (-1)."""
    pets = PetsDataset(os.path.join(os.getcwd(), self._data_dir), Subset.TEST)
    with self.assertRaises(ValueError):
        BatchGenerator(pets, -1, False)
import numpy as np
import torch
import random

from dlvc.datasets.pets import PetsDataset
from dlvc.batches import BatchGenerator
from dlvc.dataset import Subset

# Pairs a trained model with the accuracy it reached.
TrainedModel = namedtuple('TrainedModel', ['model', 'accuracy'])

# Seed every RNG in use so runs are repeatable.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Step 1: load the data sets (TRAIN, VALIDATION & TEST)
train_data = PetsDataset("../cifar-10-batches-py", Subset.TRAINING)
val_data = PetsDataset("../cifar-10-batches-py", Subset.VALIDATION)
test_data = PetsDataset("../cifar-10-batches-py", Subset.TEST)

# Preprocessing: vectorize, cast to float32, shift by -127.5, scale by 1/127.5
op = ops.chain([
    ops.vectorize(),
    ops.type_cast(np.float32),
    ops.add(-127.5),
    ops.mul(1/127.5),
])

# Step 2: one shuffled batch generator per subset, all with the same batch size
BATCH_SIZE = 512
train_batches = BatchGenerator(train_data, BATCH_SIZE, True, op)
val_batches = BatchGenerator(val_data, BATCH_SIZE, True, op)
test_batches = BatchGenerator(test_data, BATCH_SIZE, True, op)
import numpy as np

from dlvc.dataset import Subset
from dlvc.datasets.pets import PetsDataset
from dlvc import ops, batches

# Compute mean and standard deviation of the training images, reduced over
# axes 0-2 of the stacked batch data (one value per remaining axis entry).
dataset = PetsDataset('../data/cifar-10-batches-py', Subset.TRAINING)
op = ops.chain([ops.mul(1 / 255), ops.type_cast(np.float32)])

# Size the batch from the dataset instead of hard-coding 7959, so the script
# keeps working if the subset size changes.
batch_generator = batches.BatchGenerator(dataset, len(dataset), True, op)

# Join the batches along the sample axis; unlike the former
# reshape(shape[1:]), this is valid for any number of batches.
# NOTE(review): assumes batch.data is an ndarray with samples on axis 0.
training_images = np.concatenate(
    [batch.data for batch in batch_generator], axis=0
).astype(np.float32, copy=False)

train_mean = np.mean(training_images, axis=(0, 1, 2))
train_std = np.std(training_images, axis=(0, 1, 2))
print(train_mean, train_std)
from dlvc.test import Accuracy import numpy as np from dlvc.ops import * from dlvc.datasets.pets import PetsDataset from dlvc.batches import BatchGenerator from dlvc.dataset import Subset import matplotlib.pyplot as plt from mpl_toolkits import mplot3d TrainedModel = namedtuple('TrainedModel', ['model', 'accuracy']) # TODO implement steps 1-2 data_path = "" #something ending with "...\\cifar-10-batches.py" trainingDataset = PetsDataset(data_path, Subset.TRAINING) validationDataset = PetsDataset(data_path, Subset.VALIDATION) testDataset = PetsDataset(data_path, Subset.TEST) op = chain([ vectorize(), type_cast(np.float32), add(-127.5), mul(1 / 127.5), ]) bg_training = BatchGenerator(dataset=trainingDataset, num=32, shuffle=True, op=op) bg_validation = BatchGenerator(dataset=validationDataset,
from dlvc.batches import BatchGenerator
from dlvc.ops import vectorize, chain, type_cast
import time

'''
Tests in this file should test the whole pipeline.
They take more time than unit tests.
'''

# Whole-pipeline check: with k=1 and identical training and prediction data,
# kNN must reach 100% accuracy.
start = time.time()

pets = PetsDataset(
    '/Users/mmatak/dev/college/DLVC/cifar-10/cifar-10-batches-py/',
    Subset.TEST)

num_classes = 2
k = 1
# feature vectors have 32 * 32 * 3 entries
knn = KnnClassifier(k, 32 * 32 * 3, num_classes)

batchGenerator = BatchGenerator(
    pets,
    512,
    False,
    op=chain([type_cast(dtype=np.float32), vectorize()]))

# train() is called once per batch; the labels kept are those of the last batch
groundTruthLabels = None
for batch in batchGenerator:
    knn.train(batch.data, batch.label)
    groundTruthLabels = batch.label
def test_bigger_batch_then_dataset_exception(self):
    """BatchGenerator must raise ValueError for a batch size of 5000 on the test subset."""
    pets = PetsDataset(os.path.join(os.getcwd(), self._data_dir), Subset.TEST)
    with self.assertRaises(ValueError):
        BatchGenerator(pets, 5000, False)
import numpy as np
import cv2

from dlvc.datasets.pets import PetsDataset
from dlvc.models.linear import LinearClassifier
from dlvc.batches import BatchGenerator
from dlvc.test import Accuracy
from dlvc.dataset import Subset
import dlvc.ops as ops

np.random.seed(0)

pets_train = PetsDataset("../cifar-10-batches-py/", Subset.TRAINING)
pets_val = PetsDataset("../cifar-10-batches-py/", Subset.VALIDATION)

# one accuracy tracker per measurement
random_accuracy = Accuracy()
validation_accuracy = Accuracy()
train_accuracy = Accuracy()

# short summary of the training subset
print('Number of Classes = {}'.format(pets_train.num_classes()))
print('Number of Images = {}'.format(len(pets_train)))
print('First 10 Classes >>> {}'.format(pets_train.labels[:10]))

# Preprocessing: vectorize, cast to float32, shift by -127.5, scale by 1/127.5
op = ops.chain([
    ops.vectorize(),
    ops.type_cast(np.float32),
    ops.add(-127.5),
    ops.mul(1 / 127.5),
])
NUM_CHANNELS = 3 BATCH_SIZE = 128 NUM_CLASSES = 2 EPOCHS = 1000 lr = 0.001 wd = 0.00000001 EARLY_STOPPING = True EARLY_STOPPING_NUM_OF_EPOCHS = 100 USE_DROPOUT = True USE_TRANSFER_LEARNING = True FREEZE_CNN_PARAMETERS = True pets_training = PetsDataset(dir, Subset.TRAINING) pets_validation = PetsDataset(dir, Subset.VALIDATION) class CatDogNet(nn.Module): def __init__(self): super(CatDogNet, self).__init__() # First Layer 2xConv and Max pool out_Shape = (16x16x32) self.conv1_layer1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=1, padding=1) self.batch_norm1_layer1 = nn.BatchNorm2d(num_features=32) self.relu1_layer1 = nn.ReLU()
import cv2 import torch import torch.nn as nn from dlvc.datasets.pets import PetsDataset from dlvc.models.pytorch import CnnClassifier from dlvc.batches import BatchGenerator from dlvc.test import Accuracy from dlvc.dataset import Subset import dlvc.ops as ops np.random.seed(0) pets_train = PetsDataset("../cifar-10-batches-py/", Subset.TRAINING) op = ops.chain([ ops.type_cast(np.float32), ops.add(-127.5), ops.mul(1 / 127.5), ops.hflip(), ops.rcrop(32, 4, 'constant'), ops.add_noise(), ops.hwc2chw() ]) reverse_op = ops.chain([ ops.chw2hwc(), ops.mul(127.5), ops.add(127.5),