def load_dataset_into_batches(file_dir_path: str, subset: Subset, subset_size: int, shuffle: bool = False):
    """Wrap a PetsDataset in a BatchGenerator that vectorizes and float32-casts samples.

    Args:
        file_dir_path: Directory handed to PetsDataset as the data location.
        subset: Which Subset of the dataset to load.
        subset_size: Forwarded to BatchGenerator as its second positional
            argument (presumably the batch size - TODO confirm).
        shuffle: Forwarded to BatchGenerator as its third positional argument.

    Returns:
        The BatchGenerator built over the requested subset.
    """
    transforms = [ops.vectorize(), ops.type_cast(np.float32)]
    pipeline = ops.chain(transforms)
    samples = PetsDataset(file_dir_path, subset)
    return BatchGenerator(samples, subset_size, shuffle, pipeline)
Ejemplo n.º 2
0
def main():
    """Exercise BatchGenerator on the training subset with asserts and sample printouts.

    Three generators are built over the same dataset: one holding everything
    in a single batch, one with 500-sample unshuffled batches (checked for
    count, shape and dtype), and one with 500-sample shuffled batches whose
    first sample is printed five times.
    """
    training_set = PetsDataset("/home/helmuth/dlvc/cifar-10-batches-py", Subset.TRAINING)

    # Per-sample pipeline: flatten, cast to float32, add -127.5, multiply by 1/127.5.
    pipeline = ops.chain([
        ops.vectorize(),
        ops.type_cast(np.float32),
        ops.add(-127.5),
        ops.mul(1 / 127.5),
    ])

    # Generator #1: the whole dataset as one batch.
    single_batch_gen = BatchGenerator(training_set, len(training_set), False)
    assert len(single_batch_gen) == 1

    # Generator #2: unshuffled batches of 500 with the pipeline applied.
    ordered_gen = BatchGenerator(training_set, 500, False, pipeline)
    assert len(ordered_gen) == 16

    for batch_no, batch in enumerate(ordered_gen, start=1):
        # Only the first 15 batches are required to be full; the 16th is exempt.
        if batch_no < 16:
            assert batch.data.shape == (500, 3072)
            assert batch.labels.shape == (500,)
        assert batch.data.dtype == np.float32
        assert np.issubdtype(batch.labels.dtype, np.integer)
        if batch_no == 1:
            print("First batch, first sample, not shuffled")
            print(batch.data[0])

    # Generator #3: same as #2 but with shuffling enabled.
    shuffled_gen = BatchGenerator(training_set, 500, True, pipeline)

    # Restart iteration five times and print the first sample each time.
    for _ in range(5):
        fresh_iter = iter(shuffled_gen)
        print("First batch, first sample, shuffled")
        print(next(fresh_iter).data[0])
Ejemplo n.º 3
0
 def test_data_transformation(self):
     """Check batch count, per-sample shape and dtype after vectorize + float32 cast."""
     data_root = os.path.join(os.getcwd(), self._data_dir)
     pipeline = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])
     training_set = PetsDataset(data_root, Subset.TRAINING)
     generator = BatchGenerator(training_set, 100, False, pipeline)

     # 80 batches are expected for a batch-size argument of 100.
     self.assertEqual(len(generator), 80)

     first_batch = next(iter(generator))
     self.assertEqual(first_batch.data[0].shape, (3072, ))
     self.assertTrue(np.issubdtype(first_batch.data.dtype, np.float32))
 def test_train_with_wrong_type_of_labels(self):
     """KnnClassifier.train must raise TypeError when labels are a plain Python list."""
     pipeline = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])
     training_set = PetsDataset(os.path.join(os.getcwd(), self._data_dir), Subset.TRAINING)
     generator = BatchGenerator(training_set, 7959, False, pipeline)
     first_batch = next(iter(generator))
     knn = KnnClassifier(10, 3072, 2)

     # Evaluate the arguments up front so only the train() call itself is guarded.
     samples = first_batch.data
     list_labels = [0, 1, 0]
     with self.assertRaises(TypeError):
         knn.train(samples, list_labels)
 def test_train_with_proper_data(self):
     """Training on the first batch's data and labels must complete without raising."""
     pipeline = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])
     training_set = PetsDataset(os.path.join(os.getcwd(), self._data_dir), Subset.TRAINING)
     generator = BatchGenerator(training_set, 7959, False, pipeline)
     first_batch = next(iter(generator))

     # No assertion: the test passes as long as train() does not raise.
     knn = KnnClassifier(10, 3072, 2)
     knn.train(first_batch.data, first_batch.label)
 def test_train_wrong_vector_size_in_data(self):
     """KnnClassifier.train must raise RuntimeError when a feature column is missing."""
     pipeline = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])
     training_set = PetsDataset(os.path.join(os.getcwd(), self._data_dir), Subset.TRAINING)
     generator = BatchGenerator(training_set, 7959, False, pipeline)
     first_batch = next(iter(generator))
     knn = KnnClassifier(10, 3072, 2)

     # Remove index 100 along axis 1 so each row is one element shorter than before.
     truncated = np.delete(first_batch.data, 100, 1)
     labels = first_batch.label
     with self.assertRaises(RuntimeError):
         knn.train(truncated, labels)
 def test_correctness_of_data_for_train(self):
     """Check batch counts and the first five values of the first training sample.

     A batch-size argument of 7959 must give one batch and 500 must give 16.
     The first sample of the first 500-sized batch must start with the
     reference values below.
     """
     op = ops.chain([
         ops.vectorize(),
         ops.type_cast(np.float32)
     ])
     dataset = PetsDataset(os.path.join(os.getcwd(), self._data_dir), Subset.TRAINING)

     one_batch_gen = BatchGenerator(dataset, 7959, False, op)
     self.assertEqual(len(one_batch_gen), 1)

     many_batch_gen = BatchGenerator(dataset, 500, False, op)
     self.assertEqual(len(many_batch_gen), 16)

     reference = [116., 125., 125., 91., 101.]
     first_batch = next(iter(many_batch_gen))
     leading_values = first_batch.data[0][:5]

     # Without this check a sample shorter than the reference would make the
     # element-wise comparison below pass on fewer values than intended.
     self.assertEqual(len(leading_values), len(reference))
     for actual, wanted in zip(leading_values, reference):
         self.assertEqual(actual, wanted)
    def test_predict_with_proper_data(self):
        """Train on the first training batch and validate predictions for the first validation batch.

        Expects one prediction per validation sample (204) and each prediction
        vector to sum to 1.
        """
        op = ops.chain([
            ops.vectorize(),
            ops.type_cast(np.float32)
        ])
        data_root = os.path.join(os.getcwd(), self._data_dir)
        dataset_training = PetsDataset(data_root, Subset.TRAINING)
        dataset_valid = PetsDataset(data_root, Subset.VALIDATION)

        batch_gen_t = BatchGenerator(dataset_training, 795, False, op)
        batch_gen_v = BatchGenerator(dataset_valid, 204, False, op)

        train_batch = next(iter(batch_gen_t))
        valid_batch = next(iter(batch_gen_v))

        classifier = KnnClassifier(10, 3072, 2)
        classifier.train(train_batch.data, train_batch.label)
        results = classifier.predict(valid_batch.data)

        self.assertEqual(len(results), 204)
        for result in results:
            # A floating-point sum may differ from 1.0 by rounding error, so
            # compare with a small absolute tolerance instead of exact equality.
            self.assertAlmostEqual(np.sum(result), 1.0, delta=1e-6)
Ejemplo n.º 9
0
expected = 1
assert num_of_batches == expected, (
    "Number of batches is " + str(num_of_batches) + ", expected: " + str(expected)
)

# A batch size of 500 must split the training set into 16 batches.
batch_generator = BatchGenerator(dataset_training, 500, False)
num_of_batches = len(batch_generator)
expected = 16
assert num_of_batches == expected, (
    "Number of batches is " + str(num_of_batches) + ", expected: " + str(expected)
)

# Every batch except the last must have data shape (500, 3072) and label shape (500,).
batch_generator = BatchGenerator(dataset_training, 500, shuffle=False, op=vectorize())
last_batch_idx = len(batch_generator) - 1
batch_idx = 0
for batch in batch_generator:
    # The counter only advances for checked batches, so it stops at the
    # last batch's index and that batch is left unchecked.
    if batch_idx != last_batch_idx:
        assert batch.data.shape == (500, 3072), (
            "Batch data shape: " + str(batch.data.shape) + ", expected: (500, 3072)."
        )
        assert batch.labels.shape == (500, ), (
            "Batch labels shape: " + str(batch.labels.shape) + ", expected: (500,)."
        )
        batch_idx += 1

# The data type is always np.float32 and the label type is integral (one of the np.int and np.uint variants).
# Implemented with label type np.uint8, since there are fewer than 256 labels.
batch_generator = BatchGenerator(
Ejemplo n.º 10
0
import numpy as np

# Directory holding the CIFAR-10 python batches. Named data_dir rather than
# dir so the builtin dir() is not shadowed in this module.
data_dir = '/Users/mmatak/dev/college/DLVC/cifar-10/cifar-10-batches-py/'

# Image geometry; the product of the three is the feature-vector length
# passed to KnnClassifier.
IMAGE_HEIGHT = 32
IMAGE_WIDTH = 32
NUM_CHANNELS = 3

NUM_CLASSES = 2

pets_training = PetsDataset(data_dir, Subset.TRAINING)
pets_validation = PetsDataset(data_dir, Subset.VALIDATION)
pets_test = PetsDataset(data_dir, Subset.TEST)

# One generator per subset, each covering its whole subset in a single batch.
# Each gets its own op chain: cast to float32, then vectorize.
batchGenerator_training = BatchGenerator(
    pets_training, len(pets_training), False,
    op=chain([type_cast(dtype=np.float32), vectorize()]))
batchGenerator_validation = BatchGenerator(
    pets_validation, len(pets_validation), False,
    op=chain([type_cast(dtype=np.float32), vectorize()]))
batchGenerator_test = BatchGenerator(
    pets_test, len(pets_test), False,
    op=chain([type_cast(dtype=np.float32), vectorize()]))

# Initial state for the grid search over k that follows.
best_accuracy = Accuracy()
best_k = -1
results = {}
knn = None

for k in range(1, 100, 40):  # grid search example
    knn = KnnClassifier(k, IMAGE_HEIGHT*IMAGE_WIDTH*NUM_CHANNELS, NUM_CLASSES)
    accuracy = Accuracy()

    # train and compute validation accuracy ...
Ejemplo n.º 11
0
# Pairs a trained model with the accuracy recorded for it.
TrainedModel = namedtuple('TrainedModel', ('model', 'accuracy'))

# Seed the Python, NumPy and PyTorch RNGs for reproducibility.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Step 1: load the data sets (TRAIN, VALIDATION & TEST) from the same directory.
train_data, val_data, test_data = (
    PetsDataset("../cifar-10-batches-py", subset)
    for subset in (Subset.TRAINING, Subset.VALIDATION, Subset.TEST)
)

# Per-sample pipeline: flatten, cast to float32, add -127.5, multiply by 1/127.5.
op = ops.chain([
    ops.vectorize(),
    ops.type_cast(np.float32),
    ops.add(-127.5),
    ops.mul(1 / 127.5),
])

# Step 2: one batch generator per data set, all sharing the same pipeline
# and created with True as the third positional argument.
BATCH_SIZE = 512
train_batches, val_batches, test_batches = (
    BatchGenerator(subset_data, BATCH_SIZE, True, op)
    for subset_data in (train_data, val_data, test_data)
)

def train_model(lr: float, momentum: float) -> TrainedModel:
    '''
    Trains a linear classifier with a given learning rate (lr) and momentum.
    Computes the accuracy on the validation set.
    Returns both the trained classifier and accuracy.
Ejemplo n.º 12
0
def load_dataset(subset: Subset) -> batches.BatchGenerator:
    """Return a BatchGenerator that serves the whole requested subset as one batch.

    Samples are vectorized and cast to float32. True is passed as the
    generator's third positional argument.

    Args:
        subset: Which Subset of the pets dataset to load.

    Returns:
        A BatchGenerator whose batch-size argument equals the dataset length.
    """
    pets = PetsDataset('../data/cifar-10-batches-py', subset)
    pipeline = ops.chain([ops.vectorize(), ops.type_cast(np.float32)])
    whole_set_size = len(pets)
    return batches.BatchGenerator(pets, whole_set_size, True, pipeline)
Ejemplo n.º 13
0
# Sanity check: kNN must have accuracy 100%.
# NOTE(review): presumably because a k=1 classifier is evaluated on the same
# samples it was trained on - the prediction step is not visible here; confirm.

# Wall-clock start time, taken before the dataset is loaded.
start = time.time()

pets = PetsDataset(
    '/Users/mmatak/dev/college/DLVC/cifar-10/cifar-10-batches-py/',
    Subset.TEST)
num_classes = 2
k = 1
# Input dimension is 32 * 32 * 3 = 3072 values per sample.
knn = KnnClassifier(k, 32 * 32 * 3, num_classes)
# Unshuffled batches of 512; each sample is cast to float32, then vectorized.
batchGenerator = BatchGenerator(pets,
                                512,
                                False,
                                op=chain(
                                    [type_cast(dtype=np.float32),
                                     vectorize()]))

# NOTE(review): train() is called once per batch and groundTruthLabels is
# overwritten on every iteration, so after the loop it holds only the last
# batch's labels. Whether train() accumulates or replaces earlier batches
# cannot be told from here - verify if the dataset spans more than one batch.
groundTruthLabels = None
for batch in batchGenerator:
    knn.train(batch.data, batch.label)
    groundTruthLabels = batch.label

# Placeholder; not assigned a real value anywhere in this visible part.
predictedLabels = None


def measure_accuracy(predictedLabels: np.ndarray,
                     groundTruthLabels: np.ndarray):
    correct = 0
    for index, trueLabel in enumerate(groundTruthLabels):
        predictedLabel = np.argmax(predictedLabels[index])
        if predictedLabel == trueLabel:
Ejemplo n.º 14
0
assert num_of_batches == expected, (
    "Number of batches is " + str(num_of_batches) + ", expected: " + str(expected)
)

# A batch size of 500 must split the training set into 16 batches ...
batch_generator = BatchGenerator(dataset_training, 500, False)
num_of_batches = len(batch_generator)
expected = 16
assert num_of_batches == expected, (
    "Number of batches is " + str(num_of_batches) + ", expected: " + str(expected)
)
# ... and the 16th (last) batch must hold 459 samples.
for batch_idx, batch in enumerate(batch_generator, start=1):
    if batch_idx == 16:
        assert len(batch.label) == 459, (
            "Num of samples in the last batch is: " + str(len(batch.label)) + ", expected: 459"
        )

# Every batch except the last must have data shape (500, 3072) and label shape (500,).
batch_generator = BatchGenerator(dataset_training, 500, shuffle=False, op=vectorize())
last_batch_idx = len(batch_generator) - 1
batch_idx = 0
for batch in batch_generator:
    # The counter only advances for checked batches, so it stops at the
    # last batch's index and that batch is left unchecked.
    if batch_idx != last_batch_idx:
        assert batch.data.shape == (500, 3072), (
            "Batch data shape: " + str(batch.data.shape) + ", expected: (500, 3072)."
        )
        assert batch.label.shape == (500,), (
            "Batch labels shape: " + str(batch.label.shape) + ", expected: (500,)."
        )
        batch_idx += 1

# The data type is always np.float32 and the label type is integral (one of the np.int and np.uint variants).
# Implemented with label type np.uint8, since there are fewer than 256 labels.
batch_generator = BatchGenerator(dataset_training, 500, False, op=chain([vectorize(), type_cast(dtype=np.float32)]))
for batch in batch_generator:
    assert batch.data.dtype == np.float32, "Batch data type: " + str(batch.data.dtype) + ", expected: np.float32."