def load_dataset_into_batches(file_dir_path: str, subset: Subset, batch_size: int, shuffle: bool = False):
    op = ops.chain([
        ops.vectorize(),
        ops.type_cast(np.float32)
    ])
    dataset = PetsDataset(file_dir_path, subset)
    return BatchGenerator(dataset, batch_size, shuffle, op)
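A minimal usage sketch for this helper (the import path for Subset is an assumption based on the later examples):

from dlvc.dataset import Subset

# hypothetical call: shuffled 128-sample batches over the training subset
train_batches = load_dataset_into_batches('../data/cifar-10-batches-py', Subset.TRAINING, 128, shuffle=True)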
Example No. 2
def main():
    data = PetsDataset("/home/helmuth/dlvc/cifar-10-batches-py", Subset.TRAINING)
    # ops chain
    op = ops.chain([
        ops.vectorize(),
        ops.type_cast(np.float32),
        ops.add(-127.5),
        ops.mul(1/127.5),
    ])
    # batch generator #1
    bg1 = BatchGenerator(data, len(data), False)
    assert(len(bg1) == 1)
    # batch generator #2
    bg2 = BatchGenerator(data, 500, False, op)
    assert(len(bg2) == 16)
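    # 7959 training samples: 15 full batches of 500 + 1 final batch of 459 = 16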
    # first batch
    cnt = 0
    for batch in bg2:
        cnt += 1
        if cnt < 16:
            assert(batch.data.shape == (500, 3072))
            assert(batch.labels.shape == (500,))
        assert(batch.data.dtype == np.float32)
        assert(np.issubdtype(batch.labels.dtype, np.integer))
        if cnt == 1:
            print("First batch, first sample, not shuffled")
            print(batch.data[0])
    # batch generator #3
    bg3 = BatchGenerator(data, 500, True, op)
    # run 5 times through first sample of shuffled batch generator
    for i in range(5):
        it = iter(bg3)
        print("First batch, first sample, shuffled")
        print(next(it).data[0])
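With shuffling enabled, the sample printed in each of the five passes should differ with high probability, unlike the fixed first sample of the unshuffled bg2 above.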
Example No. 3
 def test_data_transformation(self):
     op = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])
     dataset = PetsDataset(os.path.join(os.getcwd(), self._data_dir),
                           Subset.TRAINING)
     batch_gen = BatchGenerator(dataset, 100, False, op)
     self.assertEqual(len(batch_gen), 80)
     iter_gen = iter(batch_gen)
     iter_result = next(iter_gen)
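     # 3072 = 32 * 32 * 3: vectorize() flattens each 32x32 RGB image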
     self.assertEqual(iter_result.data[0].shape, (3072, ))
     self.assertTrue(np.issubdtype(iter_result.data.dtype, np.float32))
Example No. 4
def load_dataset(subset: Subset) -> batches.BatchGenerator:
    dataset = PetsDataset('../data/cifar-10-batches-py', subset)

    op = ops.chain([
        ops.hwc2chw(),
        ops.add(-127.5),
        ops.mul(1 / 127.5),
        ops.type_cast(np.float32)
    ])

    return batches.BatchGenerator(dataset, 128, True, op)
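The add/mul pair implements the usual [-1, 1] rescaling: (x - 127.5) / 127.5 maps 0 to -1, 127.5 to 0 and 255 to 1. A quick sanity check:

import numpy as np

pixels = np.array([0.0, 127.5, 255.0], dtype=np.float32)
print((pixels - 127.5) * (1 / 127.5))  # [-1.  0.  1.]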
 def test_train_with_wrong_type_of_labels(self):
     op = ops.chain([
         ops.vectorize(),
         ops.type_cast(np.float32)
     ])
     dataset = PetsDataset(os.path.join(os.getcwd(), self._data_dir), Subset.TRAINING)
     batch_gen = BatchGenerator(dataset, 7959, False, op)
     batch_iter = iter(batch_gen)
     iter_result = next(batch_iter)
     classifier = KnnClassifier(10, 3072, 2)
     self.assertRaises(TypeError, classifier.train, iter_result.data, [0, 1, 0])
 def test_train_with_proper_data(self):
     op = ops.chain([
         ops.vectorize(),
         ops.type_cast(np.float32)
     ])
     dataset = PetsDataset(os.path.join(os.getcwd(), self._data_dir), Subset.TRAINING)
     batch_gen = BatchGenerator(dataset, 7959, False, op)
     batch_iter = iter(batch_gen)
     iter_result = next(batch_iter)
     classifier = KnnClassifier(10, 3072, 2)
     classifier.train(iter_result.data, iter_result.label)
 def test_train_wrong_vector_size_in_data(self):
     op = ops.chain([
         ops.vectorize(),
         ops.type_cast(np.float32)
     ])
     dataset = PetsDataset(os.path.join(os.getcwd(), self._data_dir), Subset.TRAINING)
     batch_gen = BatchGenerator(dataset, 7959, False, op)
     batch_iter = iter(batch_gen)
     iter_result = next(batch_iter)
     classifier = KnnClassifier(10, 3072, 2)
     changed_data = np.delete(iter_result.data, 100, 1)
     self.assertRaises(RuntimeError, classifier.train, changed_data, iter_result.label)
 def test_correctness_of_data_for_train(self):
     op = ops.chain([
         ops.vectorize(),
         ops.type_cast(np.float32)
     ])
     dataset = PetsDataset(os.path.join(os.getcwd(), self._data_dir), Subset.TRAINING)
     one_batch_gen = BatchGenerator(dataset, 7959, False, op)
     self.assertEqual(len(one_batch_gen), 1)
     many_batch_gen = BatchGenerator(dataset, 500, False, op)
     self.assertEqual(len(many_batch_gen), 16)
     reference = [116., 125., 125., 91., 101.]
     batch_iter = iter(many_batch_gen)
     first_batch = next(batch_iter)
     for i, item in enumerate(first_batch.data[0][:5]):
         self.assertEqual(item, reference[i])
Example No. 9
def load_dataset(subset: Subset, augment=False) -> batches.BatchGenerator:
    dataset = PetsDataset('../data/cifar-10-batches-py', subset)

    ops_list = []

    if augment:
        ops_list += [ops.hflip(), ops.rcrop(32, 12, 'constant')]

    ops_list += [
        ops.mul(1 / 255),
        ops.type_cast(np.float32),
        # Imagenet:
        # ops.normalize(  mean=np.array([0.485, 0.456, 0.406]),
        #                 std=np.array([0.229, 0.224, 0.225])),
        # Cifar-10:
        ops.normalize(mean=np.array([0.41477802, 0.45935813, 0.49693552]),
                      std=np.array([0.25241926, 0.24699265, 0.25279155])),
        ops.hwc2chw()
    ]

    op = ops.chain(ops_list)

    return batches.BatchGenerator(dataset, 128, True, op)
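ops.normalize is applied after the 1/255 scaling, so the mean and std above are per-channel statistics of images in [0, 1] (a later snippet in this collection computes exactly such values). A hypothetical sketch of a sample-wise normalize op, assuming ops are callables over single HWC arrays as elsewhere in these examples (not the actual dlvc implementation):

import numpy as np

def normalize(mean: np.ndarray, std: np.ndarray):
    def op(sample: np.ndarray) -> np.ndarray:
        # broadcast per-channel statistics over an HWC image
        return (sample - mean) / std
    return op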
    def test_predict_with_proper_data(self):

        op = ops.chain([
            ops.vectorize(),
            ops.type_cast(np.float32)
        ])
        dataset_training = PetsDataset(os.path.join(os.getcwd(), self._data_dir), Subset.TRAINING)
        dataset_valid = PetsDataset(os.path.join(os.getcwd(), self._data_dir), Subset.VALIDATION)

        batch_gen_t = BatchGenerator(dataset_training, 795, False, op)
        batch_gen_v = BatchGenerator(dataset_valid, 204, False, op)

        batch_iter_t = iter(batch_gen_t)
        iter_result_t = next(batch_iter_t)

        batch_iter_v = iter(batch_gen_v)
        iter_result_v = next(batch_iter_v)

        classifier = KnnClassifier(10, 3072, 2)
        classifier.train(iter_result_t.data, iter_result_t.label)
        results = classifier.predict(iter_result_v.data)
        self.assertEqual(len(results), 204)
        for result in results:
            self.assertAlmostEqual(float(np.sum(result)), 1.0, places=5)
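The loop above relies on predict() returning, per input, a vector of per-class k-NN vote shares that sums to 1. A hypothetical sketch of that behaviour (not the actual KnnClassifier implementation):

import numpy as np

def knn_predict(train_x, train_y, x, k, num_classes):
    # distances from x to every training vector
    dists = np.linalg.norm(train_x - x, axis=1)
    # labels of the k nearest neighbours
    nearest = train_y[np.argsort(dists)[:k]]
    # per-class vote shares; they sum to 1 by construction
    return np.bincount(nearest, minlength=num_classes) / k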
Example No. 11
from collections import namedtuple

import numpy as np
import torch
import torch.nn as nn

from dlvc.dataset import Subset
from dlvc.datasets.pets import PetsDataset
from dlvc.batches import BatchGenerator
import dlvc.ops as ops

np.random.seed(0)
torch.manual_seed(0)

TrainedModel = namedtuple('TrainedModel', ['model', 'accuracy'])

DATA_PATH = "../cifar-10-batches-py/"
train_data = PetsDataset(DATA_PATH, Subset.TRAINING)
val_data = PetsDataset(DATA_PATH, Subset.VALIDATION)
test_data = PetsDataset(DATA_PATH, Subset.TEST)

op = ops.chain([
    ops.type_cast(np.float32),
    ops.add(-127.5),
    ops.mul(1 / 127.5),
    ops.hwc2chw()
])

train_batches = BatchGenerator(train_data, 128, False, op)
val_batches = BatchGenerator(val_data, 128, False, op)
test_batches = BatchGenerator(test_data, 128, False, op)


class Net(nn.Module):
    def __init__(self, img_size, num_classes):
        super(Net, self).__init__()
        self.img_size = img_size

        # Instantiate the ReLU nonlinearity
        self.relu = nn.ReLU()
Example No. 12
TrainedModel = namedtuple('TrainedModel', ['model', 'accuracy'])

# initialize RNG for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Step 1: load the data sets (TRAIN, VALIDATION & TEST)
train_data = PetsDataset("../cifar-10-batches-py", Subset.TRAINING)
val_data = PetsDataset("../cifar-10-batches-py", Subset.VALIDATION)
test_data = PetsDataset("../cifar-10-batches-py", Subset.TEST)

# Operations to standardize
op = ops.chain([
    ops.vectorize(),
    ops.type_cast(np.float32),
    ops.add(-127.5),
    ops.mul(1/127.5),
])
# Step 2: Create batch generator for each
BATCH_SIZE = 512
train_batches = BatchGenerator(train_data, BATCH_SIZE, True, op)
val_batches = BatchGenerator(val_data, BATCH_SIZE, True, op)
test_batches = BatchGenerator(test_data, BATCH_SIZE, True, op)

def train_model(lr: float, momentum: float) -> TrainedModel:
    '''
    Trains a linear classifier with a given learning rate (lr) and momentum.
    Computes the accuracy on the validation set.
    Returns both the trained classifier and accuracy.
    '''
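The body of train_model is not part of this snippet; a hypothetical sketch of one possible implementation in PyTorch follows (it reuses train_batches and val_batches from above; the .label attribute and everything inside the function are assumptions):

import torch
import torch.nn as nn

def train_model_sketch(lr: float, momentum: float) -> TrainedModel:
    # hypothetical sketch: linear classifier on vectorized 32x32x3 inputs, 2 classes
    model = nn.Linear(3072, 2)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum)
    loss_fn = nn.CrossEntropyLoss()
    for _ in range(10):  # a few epochs, for illustration only
        for batch in train_batches:
            x = torch.from_numpy(batch.data)
            y = torch.from_numpy(batch.label).long()
            optimizer.zero_grad()
            loss_fn(model(x), y).backward()
            optimizer.step()
    # accuracy on the validation set
    correct = total = 0
    with torch.no_grad():
        for batch in val_batches:
            preds = model(torch.from_numpy(batch.data)).argmax(dim=1)
            correct += (preds == torch.from_numpy(batch.label).long()).sum().item()
            total += len(batch.label)
    return TrainedModel(model=model, accuracy=correct / total)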
import numpy as np

from dlvc.dataset import Subset
from dlvc.datasets.pets import PetsDataset
from dlvc import ops, batches

dataset = PetsDataset('../data/cifar-10-batches-py', Subset.TRAINING)

op = ops.chain([ops.mul(1 / 255), ops.type_cast(np.float32)])

batch_generator = batches.BatchGenerator(dataset, 7959, True, op)

training_images = []

for batch in batch_generator:
    training_images.append(batch.data)

training_images = np.array(training_images, dtype=np.float32)
training_images = training_images.reshape(training_images.shape[1:])

train_mean = np.mean(training_images, axis=(0, 1, 2))
train_std = np.std(training_images, axis=(0, 1, 2))

print(train_mean, train_std)
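The printed values are the per-channel mean and std of the 1/255-scaled training images; statistics like these are what ops.normalize consumes in Example No. 9 above.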
Example No. 14
torch.manual_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Batch size to be used
BATCH_SIZE = 128

# Step 1: load the data sets (TRAIN, VALIDATION)
train_data = PetsDataset("../cifar-10-batches-py", Subset.TRAINING)
val_data = PetsDataset("../cifar-10-batches-py", Subset.VALIDATION)

# Operations to standardize
# First experiment: scale to [-1,1]
op1 = ops.chain([
    ops.type_cast(np.float32),
    ops.add(-127.5),
    ops.mul(1 / 127.5),
    ops.hwc2chw()
])
# Second experiment: scale to sample mean=0, sd=1
# calculate average training sample mean & sd
op_calc = ops.chain([ops.type_cast(np.float32), ops.mean_sd()])
# using batch generator (could do it directly but I'm lazy)
train_full_batch_gen = BatchGenerator(train_data, len(train_data), False,
                                      op_calc)
train_full_batch = next(iter(train_full_batch_gen))
train_mean_sd = np.mean(train_full_batch.data, axis=0)
# create operation to scale
op2 = ops.chain([
    ops.type_cast(np.float32),
    ops.scale(train_mean_sd[0], train_mean_sd[1]),
    ops.hwc2chw()
])
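mean_sd and scale are custom ops not shown in these snippets; hypothetical sketches consistent with their usage here (mean_sd reduces a sample to the pair [mean, sd]; scale standardizes with precomputed statistics):

import numpy as np

def mean_sd():
    def op(sample):
        # reduce a sample to its scalar mean and standard deviation
        return np.array([np.mean(sample), np.std(sample)], dtype=np.float32)
    return op

def scale(mean, sd):
    def op(sample):
        # standardize with the precomputed statistics
        return (sample - mean) / sd
    return op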
Example No. 15
def load_dataset(subset: Subset) -> batches.BatchGenerator:
    dataset = PetsDataset('../data/cifar-10-batches-py', subset)

    op = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])

    return batches.BatchGenerator(dataset, len(dataset), True, op)
Example No. 16
DATA_PATH = "../cifar-10-batches-py/"
RESULTS_FILE = "results.txt"
NR_EPOCHS = 100
EARLY_STOPPING = 10

CUDA = torch.cuda.is_available()

train_data = PetsDataset(DATA_PATH, Subset.TRAINING)
val_data = PetsDataset(DATA_PATH, Subset.VALIDATION)

op_all_augmentation = ops.chain([
    ops.type_cast(np.float32),
    ops.add(-127.5),
    ops.mul(1 / 127.5),
    ops.hflip(),
    ops.rcrop(32, 4, 'constant'),
    ops.add_noise(),
    ops.hwc2chw()
])

op_augmentation_crop_flip = ops.chain([
    ops.type_cast(np.float32),
    ops.add(-127.5),
    ops.mul(1 / 127.5),
    ops.hflip(),
    ops.rcrop(32, 4, 'constant'),
    ops.hwc2chw()
])
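hflip, rcrop and add_noise are used without their definitions; hypothetical sketches of the first two, inferred from their names and call sites (the real dlvc implementations may differ):

import numpy as np

def hflip():
    def op(sample):
        # mirror an HWC image horizontally with probability 0.5
        return np.flip(sample, axis=1).copy() if np.random.rand() < 0.5 else sample
    return op

def rcrop(size, pad, pad_mode):
    def op(sample):
        # pad the spatial borders, then take a random size x size crop
        padded = np.pad(sample, ((pad, pad), (pad, pad), (0, 0)), mode=pad_mode)
        y = np.random.randint(0, padded.shape[0] - size + 1)
        x = np.random.randint(0, padded.shape[1] - size + 1)
        return padded[y:y + size, x:x + size]
    return op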

op_no_augmentation = ops.chain([
    ops.type_cast(np.float32),
    ops.add(-127.5),
    ops.mul(1 / 127.5),
    ops.hwc2chw()
])
Example No. 17
import numpy as np
import torch
import torch.nn as nn

from dlvc.dataset import Subset
from dlvc.datasets.pets import PetsDataset
from dlvc.batches import BatchGenerator
import dlvc.ops as ops

np.random.seed(0)
torch.manual_seed(0)

DATA_PATH = "../cifar-10-batches-py/"
MODEL_PATH = "best_model.pt"
train_data = PetsDataset(DATA_PATH, Subset.TRAINING)
val_data = PetsDataset(DATA_PATH, Subset.VALIDATION)

op = ops.chain([
    ops.type_cast(np.float32),
    ops.add(-127.5),
    ops.mul(1 / 127.5),
    ops.hflip(),
    ops.rcrop(32, 4, 'constant'),
    ops.add_noise(),
    ops.hwc2chw()
])

train_batches = BatchGenerator(train_data, 128, False, op)
val_batches = BatchGenerator(val_data, 128, False, op)


class Net(nn.Module):
    def __init__(self, img_size, num_classes):
        super(Net, self).__init__()
        self.img_size = img_size

        # Instantiate the ReLU nonlinearity
        self.relu = nn.ReLU()
Example No. 18
torch.manual_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Batch size to be used
BATCH_SIZE = 128

# Step 1: load the data sets (TRAIN, VALIDATION)
train_data = PetsDataset("../cifar-10-batches-py", Subset.TRAINING)
val_data = PetsDataset("../cifar-10-batches-py", Subset.VALIDATION)

# Operations to standardize
# scale to sample mean=0, sd=1
# calculate average training sample mean & sd
op_calc = ops.chain([
    ops.type_cast(np.float32),
    ops.mean_sd()
])
# using batch generator (could do it directly but I'm lazy)
train_full_batch_gen = BatchGenerator(
    train_data,
    len(train_data),
    False,
    op_calc)
train_full_batch = next(iter(train_full_batch_gen))
train_mean_sd = np.mean(train_full_batch.data, axis=0)
# create operation to scale
op2 = ops.chain([
    ops.type_cast(np.float32),
    ops.scale(train_mean_sd[0], train_mean_sd[1]),
    ops.hwc2chw()
])
Example No. 19
BATCH_SIZE = 128
NUM_CLASSES = 2
EPOCHS = 500
lr = 0.001
# weight decay 0 in this configuration, in part 3 this is changed
wd = 0.0

pets_training = PetsDataset(dir, Subset.TRAINING)
pets_validation = PetsDataset(dir, Subset.VALIDATION)
pets_test = PetsDataset(dir, Subset.TEST)


batchGenerator_training = BatchGenerator(pets_training, BATCH_SIZE, shuffle=True,
                                         op=chain([type_cast(dtype=np.float32),
                                                   add(-127.5),
                                                   mul(1 / 127.5),
                                                   hwc2chw()]))
batchGenerator_validation = BatchGenerator(pets_validation, BATCH_SIZE, shuffle=False,
                                         op=chain([type_cast(dtype=np.float32),
                                                   add(-127.5),
                                                   mul(1 / 127.5),
                                                   hwc2chw()]))
batchGenerator_test = BatchGenerator(pets_test, BATCH_SIZE, shuffle=False,
                                         op=chain([type_cast(dtype=np.float32),
                                                   add(-127.5),
                                                   mul(1 / 127.5),
                                                   hwc2chw()]))
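The three generators apply an identical preprocessing chain; defining it once keeps them in sync (same behaviour, just deduplicated):

shared_op = chain([type_cast(dtype=np.float32), add(-127.5), mul(1 / 127.5), hwc2chw()])
batchGenerator_training = BatchGenerator(pets_training, BATCH_SIZE, shuffle=True, op=shared_op)
batchGenerator_validation = BatchGenerator(pets_validation, BATCH_SIZE, shuffle=False, op=shared_op)
batchGenerator_test = BatchGenerator(pets_test, BATCH_SIZE, shuffle=False, op=shared_op)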


class CatDogNet(nn.Module):
    def __init__(self):
Example No. 20
wd = 0.00000001

EARLY_STOPPING = True
EARLY_STOPPING_NUM_OF_EPOCHS = 100
USE_DROPOUT = True

pets_training = PetsDataset(dir, Subset.TRAINING)
pets_validation = PetsDataset(dir, Subset.VALIDATION)

batchGenerator_training = BatchGenerator(pets_training,
                                         BATCH_SIZE,
                                         shuffle=True,
                                         op=chain([
                                             type_cast(dtype=np.float32),
                                             add(-127.5),
                                             mul(1 / 127.5),
                                             rcrop(25, 2, 'median'),
                                             hwc2chw()
                                         ]))
batchGenerator_validation = BatchGenerator(pets_validation,
                                           BATCH_SIZE,
                                           shuffle=False,
                                           op=chain([
                                               type_cast(dtype=np.float32),
                                               add(-127.5),
                                               mul(1 / 127.5),
                                               hwc2chw()
                                           ]))


class CatDogNet(nn.Module):
Example No. 21
    # skip last batch
    if batch_idx == last_batch_idx:
        continue
    assert batch.data.shape == (500, 3072), "Batch data shape: " + str(
        batch.data.shape) + ", expected: (500, 3072)."
    assert batch.labels.shape == (500, ), "Batch labels shape: " + str(
        batch.labels.shape) + ", expected: (500,)."
    batch_idx += 1

# The data type is always np.float32 and the label type is integral (one of the np.int and np.uint variants)
# Implemented: for label type np.uint8, since there are fewer than 256 label values
batch_generator = BatchGenerator(
    dataset_training,
    500,
    False,
    op=chain([vectorize(), type_cast(dtype=np.float32)]))
for batch in batch_generator:
    assert batch.data.dtype == np.float32, "Batch data type: " + str(
        batch.data.dtype) + ", expected: np.float32."
    assert batch.labels.dtype == np.uint8, "Batch labels type: " + str(
        batch.labels.dtype) + ", expected: np.uint8."

# The first sample of the first training batch returned without shuffling
# has label 0 ...
batch_generator = BatchGenerator(
    dataset_training,
    500,
    False,
    op=chain([type_cast(dtype=np.float32),
              vectorize()]))
first_sample_label_unshuffled = None
Example No. 22
    dataset = PetsDataset('../data/cifar-10-batches-py',
                          Subset.TEST)

    print('Found dataset directory')
    print(f'{len(dataset)} samples')

    test_sample = dataset[1]

    print(f'Index of test sample: {test_sample.idx}')
    print(f'Label of test sample: {test_sample.label}')
    #cv2.imshow('Sample Image', test_sample.data)
    #cv2.waitKey(0)
    #cv2.destroyAllWindows()

    op = ops.chain([
        ops.vectorize(),
        ops.type_cast(np.float32)
    ])

    # The number of batches is 1 if the batch size is set to the number of samples in the dataset
    generator = batches.BatchGenerator(dataset, len(dataset), True, op)
    print(len(generator))

    # With a batch size of 500, the number of batches is ceil(len(dataset) / 500) (16 for the 7959-sample training set)
    generator = batches.BatchGenerator(dataset, 500, True, op)
    print(len(generator))

    # The data and label shapes are (500, 3072) and (500,), respectively, except for the last batch
    # The data type is always np.float32 and the label type is integral (one of the np.int and np.uint variants)
    #generator = batches.BatchGenerator(dataset, 20, False, op)
    for batch in generator:
        pass
Example No. 23
def set_parameter(model, freeze_parameters):
    if freeze_parameters:
        for param in model.parameters():
            param.requires_grad = False


if USE_TRANSFER_LEARNING:
    # there are two networks to use in transfer learning "resnet" and "alexnet"
    net = initialize_transfer_learning_model("resnet", NUM_CLASSES,
                                             FREEZE_CNN_PARAMETERS)
    net, input_size = net
    pad_mode_for_resizing = 'constant'
    op_chain = chain([
        type_cast(dtype=np.float32),
        add(-127.5),
        mul(1 / 127.5),
        rcrop(25, 2, 'median'),
        resize(input_size, pad_mode_for_resizing),
        hwc2chw()
    ])
else:
    net = CatDogNet()
    op_chain = chain([
        type_cast(dtype=np.float32),
        add(-127.5),
        mul(1 / 127.5),
        rcrop(25, 2, 'median'),
        hwc2chw()
    ])

batchGenerator_training = BatchGenerator(pets_training,
                                         BATCH_SIZE,
Example No. 24
# The data and label shapes are (500, 3072) and (500,), respectively, except for the last batch
batch_generator = BatchGenerator(dataset_training, 500, shuffle=False, op=vectorize())
last_batch_idx = len(batch_generator) - 1
batch_idx = 0
for batch in batch_generator:
    # skip last batch
    if batch_idx == last_batch_idx:
        continue
    assert batch.data.shape == (500, 3072), "Batch data shape: " + str(batch.data.shape) + ", expected: (500, 3072)."
    assert batch.label.shape == (500,), "Batch labels shape: " + str(batch.label.shape) + ", expected: (500,)."
    batch_idx += 1

# The data type is always np.float32 and the label type is integral (one of the np.int and np.uint variants)
# Implemented: for label type np.uint8, since there are fewer than 256 label values
batch_generator = BatchGenerator(dataset_training, 500, False, op=chain([vectorize(), type_cast(dtype=np.float32)]))
for batch in batch_generator:
    assert batch.data.dtype == np.float32, "Batch data type: " + str(batch.data.dtype) + ", expected: np.float32."
    assert batch.label.dtype == np.uint8, "Batch labels type: " + str(batch.label.dtype) + ", expected: np.uint8."

# The first sample of the first training batch returned without shuffling
# has label 0 ...
batch_generator = BatchGenerator(dataset_training, 500, False, op=chain([type_cast(dtype=np.float32), vectorize()]))
first_sample_label_unshuffled = None
first_sample_data_unshuffled = None
expected_label = 0
for batch in batch_generator:
    for label in batch.label:
        first_sample_label_unshuffled = label
        break
    for data in batch.data: