-
Notifications
You must be signed in to change notification settings - Fork 0
/
stl10_data_loader.py
200 lines (160 loc) · 7.76 KB
/
stl10_data_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
from tensorflow.python.keras.utils import to_categorical
import numpy as np
import sys
import os
import urllib.request as urllib
import tarfile
# the dimensions of each image in the STL-10 dataset (96x96x3).
HEIGHT, WIDTH, DEPTH = 96, 96, 3
# size of a single image in bytes
SIZE = HEIGHT * WIDTH * DEPTH
# path to the directory with the data
# DATA_DIR = '/data/data2/jmena/STL-10'
DATA_DIR = 'stl10_data'
# url of the binary data
DATA_URL = 'http://ai.stanford.edu/~acoates/stl10/stl10_binary.tar.gz'
# path to the binary train file with image data
TRAIN_DATA_PATH = DATA_DIR + '/stl10_binary/train_X.bin'
# path to the binary test file with image data
TEST_DATA_PATH = DATA_DIR + '/stl10_binary/test_X.bin'
# path to the binary train file with labels
TRAIN_LABELS_PATH = DATA_DIR + '/stl10_binary/train_y.bin'
# path to the binary test file with labels
TEST_LABELS_PATH = DATA_DIR + '/stl10_binary/test_y.bin'
# path to class names file
CLASS_NAMES_PATH = DATA_DIR + '/stl10_binary/class_names.txt'
# path to the binary unlabeled file with image data
UNLABELED_DATA_PATH = DATA_DIR + '/stl10_binary/unlabeled_X.bin'
class STL10Loader(object):
def __init__(self, num_classes):
assert num_classes in [2, 10]
self.num_classes = num_classes
def read_labels(self, path_to_labels):
with open(path_to_labels, 'rb') as f:
labels = np.fromfile(f, dtype=np.uint8)
return labels
def read_all_images(self, path_to_data):
with open(path_to_data, 'rb') as f:
# read whole file in uint8 chunks
everything = np.fromfile(f, dtype=np.uint8)
# We force the data into 3x96x96 chunks, since the
# images are stored in "column-major order", meaning
# that "the first 96*96 values are the red channel,
# the next 96*96 are green, and the last are blue."
# The -1 is since the size of the pictures depends
# on the input file, and this way numpy determines
# the size on its own.
images = np.reshape(everything, (-1, DEPTH, WIDTH, HEIGHT))
# Now transpose the images into a standard image format
# readable by, for example, matplotlib.imshow
# You might want to comment this line or reverse the shuffle
# if you will use a learning algorithm like CNN, since they like
# their channels separated.
images = np.transpose(images, (0, 3, 2, 1))
return images
def download_and_extract(self):
# if the dataset already exists locally, no need to download it again.
if all((
os.path.exists(TRAIN_DATA_PATH),
os.path.exists(TRAIN_LABELS_PATH),
os.path.exists(TEST_DATA_PATH),
os.path.exists(TEST_LABELS_PATH),
)):
return
dest_directory = DATA_DIR
if not os.path.exists(dest_directory):
os.makedirs(dest_directory)
filename = DATA_URL.split('/')[-1]
filepath = os.path.join(dest_directory, filename)
if not os.path.exists(filepath):
def _progress(count, block_size, total_size):
sys.stdout.write('\rDownloading %s %.2f%%' % (filename,
float(count * block_size) / float(total_size) * 100.0))
sys.stdout.flush()
filepath, _ = urllib.urlretrieve(DATA_URL, filepath, reporthook=_progress)
print('Downloaded', filename)
tarfile.open(filepath, 'r:gz').extractall(dest_directory)
def load_dataset(self):
# download the extract the dataset.
self.download_and_extract()
# load the train and test data and labels.
x_train = self.read_all_images(TRAIN_DATA_PATH)
y_train = self.read_labels(TRAIN_LABELS_PATH)
x_test = self.read_all_images(TEST_DATA_PATH)
y_test = self.read_labels(TEST_LABELS_PATH)
# convert all images to floats in the range [0, 1]
x_train = x_train.astype('float32')
# x_train = (x_train + 53 - 127.5) / 127.5
x_train = (x_train - 10) / 255.0
x_test = x_test.astype('float32')
x_test = (x_test - 10) / 255.0
# x_test = (x_test +53 - 127.5) / 127.5
# convert the labels to be zero based.
y_train -= 1
y_test -= 1
if self.num_classes == 2:
# filter only the first 2 cifar10 classes
x_train = x_train[np.isin(y_train, [0, 2])]
y_train = y_train[np.isin(y_train, [0, 2])]
x_test = x_test[np.isin(y_test, [0, 2])]
y_test = y_test[np.isin(y_test, [0, 2])]
# create cifar10 equivalent labels
y_train_cifar10 = np.copy(y_train)
y_test_cifar10 = np.copy(y_test)
# we fix classes bird(1) and car(2) which are swapped in the two datasets
y_train_cifar10[y_train_cifar10 == 2] = 1
y_test_cifar10[y_test_cifar10 == 2] = 1
else:
# create cifar10 equivalent labels
y_train_cifar10 = np.copy(y_train)
y_test_cifar10 = np.copy(y_test)
# we fix classes bird(1) and car(2) which are swapped in the two datasets
y_train_cifar10[y_train_cifar10 == 1] = 10
y_train_cifar10[y_train_cifar10 == 2] = 1
y_train_cifar10[y_train_cifar10 == 10] = 2
y_test_cifar10[y_test_cifar10 == 1] = 10
y_test_cifar10[y_test_cifar10 == 2] = 1
y_test_cifar10[y_test_cifar10 == 10] = 2
# we fix classes horse(6) and monkey(7) which are swapped in the two datasets
# in this case, the resulting class 6, monkey, differs from CIFAR, where it
# corresponds to frogs
y_train_cifar10[y_train_cifar10 == 6] = 10
y_train_cifar10[y_train_cifar10 == 7] = 6
y_train_cifar10[y_train_cifar10 == 10] = 7
y_test_cifar10[y_test_cifar10 == 6] = 10
y_test_cifar10[y_test_cifar10 == 7] = 6
y_test_cifar10[y_test_cifar10 == 10] = 7
# convert labels to hot-one vectors.
y_train_cat = to_categorical(y_train_cifar10, self.num_classes)
y_test_cat = to_categorical(y_test_cifar10, self.num_classes)
return (x_train, y_train, y_train_cifar10, y_train_cat), (x_test, y_test, y_test_cifar10, y_test_cat)
def load_raw_dataset(self):
# download the extract the dataset.
self.download_and_extract()
# load the train and test data and labels.
x_train = self.read_all_images(TRAIN_DATA_PATH)
y_train = self.read_labels(TRAIN_LABELS_PATH)
x_test = self.read_all_images(TEST_DATA_PATH)
y_test = self.read_labels(TEST_LABELS_PATH)
# convert all images to floats in the range [0, 1]
x_train = x_train.astype('float32')
x_train = (x_train - 10) / 255.0
x_test = x_test.astype('float32')
x_test = (x_test - 10) / 255.0
# convert the labels to be zero based.
y_train -= 1
y_test -= 1
# convert labels to hot-one vectors.
y_train_cat = to_categorical(y_train, self.num_classes)
y_test_cat = to_categorical(y_test, self.num_classes)
return (x_train, y_train, y_train_cat), (x_test, y_test, y_test_cat)
def load_raw_ood_dataset(self, num_training=500, num_test=500):
# download the extract the dataset.
self.download_and_extract()
# load the train and test data and labels.
unlabeled = self.read_all_images(UNLABELED_DATA_PATH)
# convert all images to floats in the range [0, 1]
unlabeled = unlabeled.astype('float32')
unlabeled = (unlabeled - 10) / 255.0
np.random.shuffle(unlabeled)
return unlabeled[:num_training], unlabeled[num_training:num_training+num_test]