def gcn_zca(train_set, valid_set, test_set, dataset):

    # Apply gcn
    train_set = global_contrast_normalization(train_set)
    valid_set = global_contrast_normalization(valid_set)
    test_set = global_contrast_normalization(test_set)

    # Apply zca
    train_set, mean, W = zca_whitening(train_set)
    valid_set, _, _ = zca_whitening(valid_set, mean=mean, whitening=W)
    test_set, _, _ = zca_whitening(test_set, mean=mean, whitening=W)

    return train_set, valid_set, test_set
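# Hypothetical usage of the helper above (the concrete set variables and the
# dataset string are assumptions, not defined in this module):
#
#   train_set, valid_set, test_set = gcn_zca(train_set, valid_set, test_set,
#                                            dataset="cifar10")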
def load_data(self, original_scale=False):

    print("Loading data...")

    # load training data
    X, y = [], []
    count = 1
    filename = '%s/data_batch_%i' % (self.data_path, count)
    while os.path.exists(filename):
        with open(filename, 'rb') as f:
            batch = pickle.load(f)
        X.append(batch['data'])
        y.append(batch['labels'])
        count += 1
        filename = '%s/data_batch_%i' % (self.data_path, count)

    # reshape data and cast them properly
    X = np.concatenate(X).reshape(-1, 3, 32, 32).astype(np.float32)
    y = np.concatenate(y).astype(np.int32)

    # load test set
    path = '%s/test_batch' % self.data_path
    with open(path, 'rb') as f:
        batch = pickle.load(f)

    # reshape and cast test data
    X_test = batch['data'].reshape(-1, 3, 32, 32).astype(np.float32)
    y_test = np.array(batch['labels'], dtype=np.int32)

    if Cfg.ad_experiment:

        # set normal and anomalous class
        normal = eval(Cfg.cifar10_normal)
        outliers = eval(Cfg.cifar10_outlier)

        # extract normal and anomalous class
        X_norm, X_out, y_norm, y_out, _, _ = extract_norm_and_out(
            X, y, normal=normal, outlier=outliers)

        # reduce outliers to the defined fraction
        n_norm = len(y_norm)
        n_out = int(np.ceil(Cfg.out_frac * n_norm / (1 - Cfg.out_frac)))

        # shuffle to obtain random validation splits
        np.random.seed(self.seed)
        perm_norm = np.random.permutation(len(y_norm))
        perm_out = np.random.permutation(len(y_out))

        # split into training and validation set
        n_norm_split = int(Cfg.cifar10_val_frac * n_norm)
        n_out_split = int(Cfg.cifar10_val_frac * n_out)
        self._X_train = np.concatenate(
            (X_norm[perm_norm[n_norm_split:]],
             X_out[perm_out[:n_out][n_out_split:]]))
        self._y_train = np.append(y_norm[perm_norm[n_norm_split:]],
                                  y_out[perm_out[:n_out][n_out_split:]])
        self._X_val = np.concatenate(
            (X_norm[perm_norm[:n_norm_split]],
             X_out[perm_out[:n_out][:n_out_split]]))
        self._y_val = np.append(y_norm[perm_norm[:n_norm_split]],
                                y_out[perm_out[:n_out][:n_out_split]])

        # shuffle data (since batches are extracted block-wise)
        self.n_train = len(self._y_train)
        self.n_val = len(self._y_val)
        perm_train = np.random.permutation(self.n_train)
        perm_val = np.random.permutation(self.n_val)
        self._X_train = self._X_train[perm_train]
        self._y_train = self._y_train[perm_train]
        self._X_val = self._X_val[perm_val]
        self._y_val = self._y_val[perm_val]

        # Subset train set such that we only get batches of the same size
        # (floor n_train to a multiple of the batch size)
        self.n_train = (self.n_train // Cfg.batch_size) * Cfg.batch_size
        subset = np.random.choice(len(self._X_train), self.n_train,
                                  replace=False)
        self._X_train = self._X_train[subset]
        self._y_train = self._y_train[subset]

        # Adjust number of batches
        Cfg.n_batches = int(np.ceil(self.n_train * 1. / Cfg.batch_size))

        # test set
        X_norm, X_out, y_norm, y_out, idx_norm, idx_out = extract_norm_and_out(
            X_test, y_test, normal=normal, outlier=outliers)

        # store original test labels for visualisation
        yo_norm = y_test[idx_norm]
        yo_out = y_test[idx_out]
        self._yo_test = np.append(yo_norm, yo_out)

        self._X_test = np.concatenate((X_norm, X_out))
        self._y_test = np.append(y_norm, y_out)
        perm_test = np.random.permutation(len(self._y_test))
        self._X_test = self._X_test[perm_test]
        self._y_test = self._y_test[perm_test]
        self._yo_test = self._yo_test[perm_test]
        self.n_test = len(self._y_test)

    else:
        # split into training and validation sets with stored seed
        np.random.seed(self.seed)
        perm = np.random.permutation(len(X))

        self._X_train = X[perm[self.n_val:]]
        self._y_train = y[perm[self.n_val:]]
        self._X_val = X[perm[:self.n_val]]
        self._y_val = y[perm[:self.n_val]]
        self._X_test = X_test
        self._y_test = y_test

    # normalize data (if original scale should not be preserved)
    if not original_scale:

        # simple rescaling to [0,1]
        normalize_data(self._X_train, self._X_val, self._X_test,
                       scale=np.float32(255))

        # global contrast normalization
        if Cfg.gcn:
            global_contrast_normalization(self._X_train, self._X_val,
                                          self._X_test,
                                          scale=Cfg.unit_norm_used)

        # ZCA whitening
        if Cfg.zca_whitening:
            self._X_train, self._X_val, self._X_test = zca_whitening(
                self._X_train, self._X_val, self._X_test)

        # rescale to [0,1] (w.r.t. min and max in train data)
        rescale_to_unit_interval(self._X_train, self._X_val, self._X_test)

        # PCA
        if Cfg.pca:
            self._X_train, self._X_val, self._X_test = pca(
                self._X_train, self._X_val, self._X_test, 0.95)

    flush_last_line()
    print("Data loaded.")
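# NOTE: normalize_data and rescale_to_unit_interval above are imported from the
# repository's preprocessing utilities. The minimal sketch below only
# illustrates the in-place semantics assumed by the loaders (arrays are
# modified in place, with the rescaling range taken from the training data);
# it is not the repository's implementation.
def _normalize_data_sketch(X_train, X_val, X_test, scale=np.float32(255)):
    for X in (X_train, X_val, X_test):
        X /= scale  # in-place rescaling to [0, 1]


def _rescale_to_unit_interval_sketch(X_train, X_val, X_test):
    X_min, X_max = X_train.min(), X_train.max()  # range w.r.t. train data
    for X in (X_train, X_val, X_test):
        X -= X_min
        X /= (X_max - X_min)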
def load_data(self, original_scale=False):

    print("Loading data...")

    X = load_mnist_images('%strain-images-idx3-ubyte.gz' % self.data_path)
    y = load_mnist_labels('%strain-labels-idx1-ubyte.gz' % self.data_path)
    X_test = load_mnist_images('%st10k-images-idx3-ubyte.gz' % self.data_path)
    y_test = load_mnist_labels('%st10k-labels-idx1-ubyte.gz' % self.data_path)

    if Cfg.ad_experiment:

        X_norm, y_norm = get_norm_for_mnist()

        # shuffle to obtain random validation splits
        np.random.seed(self.seed)
        perm_norm = np.random.permutation(len(y_norm))
        n_norm = len(y_norm)

        # split into training and validation set
        n_norm_split = int(Cfg.mnist_val_frac * n_norm)
        self._X_train = X_norm[perm_norm[n_norm_split:]]
        self._y_train = y_norm[perm_norm[n_norm_split:]]
        self._X_val = X_norm[perm_norm[:n_norm_split]]
        self._y_val = y_norm[perm_norm[:n_norm_split]]

        # shuffle data (since batches are extracted block-wise)
        self.n_train = len(self._y_train)
        self.n_val = len(self._y_val)
        perm_train = np.random.permutation(self.n_train)
        perm_val = np.random.permutation(self.n_val)
        self._X_train = self._X_train[perm_train]
        self._y_train = self._y_train[perm_train]
        self._X_val = self._X_val[perm_val]
        self._y_val = self._y_val[perm_val]

        print("Number of data in training: " + str(self.n_train))
        print("Number of data in validation: " + str(self.n_val))

        # Subset train set such that we only get batches of the same size
        self.n_train = (self.n_train // Cfg.batch_size) * Cfg.batch_size
        subset = np.random.choice(len(self._X_train), self.n_train,
                                  replace=False)
        self._X_train = self._X_train[subset]
        self._y_train = self._y_train[subset]

        # Adjust number of batches
        Cfg.n_batches = int(np.ceil(self.n_train * 1. / Cfg.batch_size))

        # test set
        X_norm, X_out, y_norm, y_out = get_outlier_for_mnist()

        self._X_test = np.concatenate((X_norm, X_out))
        self._y_test = np.append(y_norm, y_out)
        perm_test = np.random.permutation(len(self._y_test))
        self._X_test = self._X_test[perm_test]
        self._y_test = self._y_test[perm_test]
        self.n_test = len(self._y_test)

        print("Number of outlier data in testing: " + str(np.shape(y_out)[0]))
        print("Number of normal data in testing: " + str(np.shape(y_norm)[0]))

    else:
        # split into training, validation, and test sets
        np.random.seed(self.seed)
        perm = np.random.permutation(len(X))

        self._X_train = X[perm[self.n_val:]]
        self._y_train = y[perm[self.n_val:]]
        self._X_val = X[perm[:self.n_val]]
        self._y_val = y[perm[:self.n_val]]
        self._X_test = X_test
        self._y_test = y_test

    # normalize data (if original scale should not be preserved)
    if not original_scale:

        # simple rescaling to [0,1]
        normalize_data(self._X_train, self._X_val, self._X_test,
                       scale=np.float32(255))

        # global contrast normalization
        if Cfg.gcn:
            global_contrast_normalization(self._X_train, self._X_val,
                                          self._X_test,
                                          scale=Cfg.unit_norm_used)

        # ZCA whitening
        if Cfg.zca_whitening:
            self._X_train, self._X_val, self._X_test = zca_whitening(
                self._X_train, self._X_val, self._X_test)

        # rescale to [0,1] (w.r.t. min and max in train data)
        rescale_to_unit_interval(self._X_train, self._X_val, self._X_test)

        # PCA
        if Cfg.pca:
            self._X_train, self._X_val, self._X_test = pca(
                self._X_train, self._X_val, self._X_test, 0.95)

    flush_last_line()
    print("Data loaded.")
def load_data(self, original_scale=False):

    print("Loading data...")

    # get train data
    X = readTrafficSigns(rootpath=self.data_path, which_set="train", label=14)

    # get (normal) test data
    # X_test_norm = readTrafficSigns(rootpath=self.data_path,
    #                                which_set="test", label=14)

    # sub-sample a test set of 100 normal points from the training data
    np.random.seed(self.seed)
    perm = np.random.permutation(len(X))
    X_test_norm = X[perm[:100], ...]
    self._X_train = X[perm[100:], ...]
    self.n_train = len(self._X_train)
    self._y_train = np.zeros(self.n_train, dtype=np.uint8)

    # load (adversarial) test data
    X_test_adv = np.load(self.data_path + "/Images_150.npy")
    labels_adv = np.load(self.data_path + "/Labels_150.npy")

    self._X_test = np.concatenate(
        (X_test_norm, X_test_adv[labels_adv == 1]),
        axis=0).astype(np.float32)
    self._y_test = np.concatenate(
        (np.zeros(len(X_test_norm), dtype=np.uint8),
         np.ones(int(np.sum(labels_adv)), dtype=np.uint8)), axis=0)
    self.n_test = len(self._X_test)

    # since the val set is referenced at some points, initialize empty arrays
    self._X_val = np.empty(shape=(0, 3, 32, 32), dtype=np.float32)
    self._y_val = np.empty(shape=(0,), dtype=np.uint8)

    # shuffle
    np.random.seed(self.seed)
    perm_train = np.random.permutation(self.n_train)
    perm_test = np.random.permutation(self.n_test)
    self._X_train = self._X_train[perm_train, ...]
    self._y_train = self._y_train[perm_train]
    self._X_test = self._X_test[perm_test, ...]
    self._y_test = self._y_test[perm_test]

    # Adjust number of batches
    Cfg.n_batches = int(np.ceil(self.n_train * 1. / Cfg.batch_size))

    # normalize data (if original scale should not be preserved)
    if not original_scale:

        # simple rescaling to [0,1]
        normalize_data(self._X_train, self._X_val, self._X_test,
                       scale=np.float32(255))

        # global contrast normalization
        if Cfg.gcn:
            global_contrast_normalization(self._X_train, self._X_val,
                                          self._X_test,
                                          scale=Cfg.unit_norm_used)

        # ZCA whitening
        if Cfg.zca_whitening:
            self._X_train, self._X_val, self._X_test = zca_whitening(
                self._X_train, self._X_val, self._X_test)

        # rescale to [0,1] (w.r.t. min and max in train data)
        rescale_to_unit_interval(self._X_train, self._X_val, self._X_test)

        # PCA
        if Cfg.pca:
            self._X_train, self._X_val, self._X_test = pca(
                self._X_train, self._X_val, self._X_test, 0.95)

    flush_last_line()
    print("Data loaded.")
def load_data(self, original_scale=False):

    print("Loading data...")

    X = load_mnist_images('%strain-images-idx3-ubyte.gz' % self.data_path)
    y = load_mnist_labels('%strain-labels-idx1-ubyte.gz' % self.data_path)
    X_test = load_mnist_images('%st10k-images-idx3-ubyte.gz' % self.data_path)
    y_test = load_mnist_labels('%st10k-labels-idx1-ubyte.gz' % self.data_path)

    if Cfg.ad_experiment:

        # set normal and anomalous class
        normal = []
        outliers = []

        if Cfg.mnist_normal == -1:
            normal = list(range(0, 10))
            normal.remove(Cfg.mnist_outlier)
        else:
            normal.append(Cfg.mnist_normal)

        if Cfg.mnist_outlier == -1:
            outliers = list(range(0, 10))
            outliers.remove(Cfg.mnist_normal)
        else:
            outliers.append(Cfg.mnist_outlier)

        # extract normal and anomalous class
        X_norm, X_out, y_norm, y_out = extract_norm_and_out(
            X, y, normal=normal, outlier=outliers)

        # reduce outliers to the defined fraction
        n_norm = len(y_norm)
        n_out = int(np.ceil(Cfg.out_frac * n_norm / (1 - Cfg.out_frac)))

        # shuffle to obtain random validation splits
        np.random.seed(self.seed)
        perm_norm = np.random.permutation(len(y_norm))
        perm_out = np.random.permutation(len(y_out))

        # split into training and validation set
        n_norm_split = int(Cfg.mnist_val_frac * n_norm)
        n_out_split = int(Cfg.mnist_val_frac * n_out)
        self._X_train = np.concatenate(
            (X_norm[perm_norm[n_norm_split:]],
             X_out[perm_out[:n_out][n_out_split:]]))
        self._y_train = np.append(y_norm[perm_norm[n_norm_split:]],
                                  y_out[perm_out[:n_out][n_out_split:]])
        self._X_val = np.concatenate(
            (X_norm[perm_norm[:n_norm_split]],
             X_out[perm_out[:n_out][:n_out_split]]))
        self._y_val = np.append(y_norm[perm_norm[:n_norm_split]],
                                y_out[perm_out[:n_out][:n_out_split]])

        # shuffle data (since batches are extracted block-wise)
        self.n_train = len(self._y_train)
        self.n_val = len(self._y_val)
        perm_train = np.random.permutation(self.n_train)
        perm_val = np.random.permutation(self.n_val)
        self._X_train = self._X_train[perm_train]
        self._y_train = self._y_train[perm_train]
        self._X_val = self._X_val[perm_val]
        self._y_val = self._y_val[perm_val]

        # Subset train set such that we only get batches of the same size
        self.n_train = (self.n_train // Cfg.batch_size) * Cfg.batch_size
        subset = np.random.choice(len(self._X_train), self.n_train,
                                  replace=False)
        self._X_train = self._X_train[subset]
        self._y_train = self._y_train[subset]

        # Adjust number of batches
        Cfg.n_batches = int(np.ceil(self.n_train * 1. / Cfg.batch_size))

        # test set
        X_norm, X_out, y_norm, y_out = extract_norm_and_out(
            X_test, y_test, normal=normal, outlier=outliers)

        self._X_test = np.concatenate((X_norm, X_out))
        self._y_test = np.append(y_norm, y_out)
        perm_test = np.random.permutation(len(self._y_test))
        self._X_test = self._X_test[perm_test]
        self._y_test = self._y_test[perm_test]
        self.n_test = len(self._y_test)

    else:
        # split into training, validation, and test sets
        np.random.seed(self.seed)
        perm = np.random.permutation(len(X))

        self._X_train = X[perm[self.n_val:]]
        self._y_train = y[perm[self.n_val:]]
        self._X_val = X[perm[:self.n_val]]
        self._y_val = y[perm[:self.n_val]]
        self._X_test = X_test
        self._y_test = y_test

    # normalize data (if original scale should not be preserved)
    if not original_scale:

        # simple rescaling to [0,1]
        normalize_data(self._X_train, self._X_val, self._X_test,
                       scale=np.float32(255))

        # global contrast normalization
        if Cfg.gcn:
            global_contrast_normalization(self._X_train, self._X_val,
                                          self._X_test,
                                          scale=Cfg.unit_norm_used)

        # ZCA whitening
        if Cfg.zca_whitening:
            self._X_train, self._X_val, self._X_test = zca_whitening(
                self._X_train, self._X_val, self._X_test)

        # rescale to [0,1] (w.r.t. min and max in train data)
        rescale_to_unit_interval(self._X_train, self._X_val, self._X_test)

        # PCA
        if Cfg.pca:
            self._X_train, self._X_val, self._X_test = pca(
                self._X_train, self._X_val, self._X_test, 0.95)

    flush_last_line()
    print("Data loaded.")
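# NOTE: extract_norm_and_out is defined elsewhere in the repository. The
# sketch below only illustrates the behaviour assumed by the MNIST loader
# above (split the arrays by class membership and relabel normal samples as 0
# and outliers as 1); it is not the actual implementation (which, e.g., also
# returns the selected indices for the CIFAR-10 loader).
def _extract_norm_and_out_sketch(X, y, normal, outlier):
    idx_norm = np.isin(y, normal)
    idx_out = np.isin(y, outlier)
    y_norm = np.zeros(int(idx_norm.sum()), dtype=np.int32)
    y_out = np.ones(int(idx_out.sum()), dtype=np.int32)
    return X[idx_norm], X[idx_out], y_norm, y_out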
def load_data(self, original_scale=False):

    print("Loading data...")

    # load normal and outlier data
    self._X_train = [img_to_array(load_img(Cfg.train_folder + filename))
                     for filename in os.listdir(Cfg.train_folder)][:Cfg.n_train]
    self._X_val = [img_to_array(load_img(Cfg.val_folder + filename))
                   for filename in os.listdir(Cfg.val_folder)][:Cfg.n_val]

    n_test_out = Cfg.n_test - Cfg.n_test_in
    _X_test_in = [img_to_array(load_img(Cfg.test_in_folder + filename))
                  for filename in os.listdir(Cfg.test_in_folder)][:Cfg.n_test_in]
    _X_test_out = [img_to_array(load_img(Cfg.test_out_folder + filename))
                   for filename in os.listdir(Cfg.test_out_folder)][:n_test_out]
    _y_test_in = np.zeros((Cfg.n_test_in,), dtype=np.int32)
    _y_test_out = np.ones((n_test_out,), dtype=np.int32)

    self._X_test = np.concatenate([_X_test_in, _X_test_out])
    self._y_test = np.concatenate([_y_test_in, _y_test_out])
    self.out_frac = Cfg.out_frac

    # transpose to channels first
    self._X_train = np.moveaxis(self._X_train, -1, 1)
    self._X_val = np.moveaxis(self._X_val, -1, 1)
    self._X_test = np.moveaxis(self._X_test, -1, 1)

    # cast data properly
    self._X_train = self._X_train.astype(np.float32)
    self._X_val = self._X_val.astype(np.float32)
    self._X_test = self._X_test.astype(np.float32)
    self._y_test = self._y_test.astype(np.int32)

    # train and val labels are 0, since all samples belong to the normal class
    self._y_train = np.zeros((len(self._X_train),), dtype=np.int32)
    self._y_val = np.zeros((len(self._X_val),), dtype=np.int32)

    if Cfg.ad_experiment:

        # shuffle to obtain random validation splits
        np.random.seed(self.seed)

        # shuffle data (since batches are extracted block-wise)
        self.n_train = len(self._y_train)
        self.n_val = len(self._y_val)
        perm_train = np.random.permutation(self.n_train)
        perm_val = np.random.permutation(self.n_val)
        self._X_train = self._X_train[perm_train]
        self._y_train = self._y_train[perm_train]
        self._X_val = self._X_val[perm_val]
        self._y_val = self._y_val[perm_val]
        print("Shuffled data")

        # Subset train set such that we only get batches of the same size
        assert self.n_train >= Cfg.batch_size
        self.n_train = (self.n_train // Cfg.batch_size) * Cfg.batch_size
        subset = np.random.choice(len(self._X_train), self.n_train,
                                  replace=False)
        self._X_train = self._X_train[subset]
        self._y_train = self._y_train[subset]

        # Adjust number of batches
        Cfg.n_batches = int(np.ceil(self.n_train * 1. / Cfg.batch_size))

    # normalize data (if original scale should not be preserved)
    if not original_scale:

        # simple rescaling to [0,1]
        normalize_data(self._X_train, self._X_val, self._X_test,
                       scale=np.float32(255))

        # global contrast normalization
        if Cfg.gcn:
            global_contrast_normalization(self._X_train, self._X_val,
                                          self._X_test,
                                          scale=Cfg.unit_norm_used)

        # ZCA whitening
        if Cfg.zca_whitening:
            self._X_train, self._X_val, self._X_test = zca_whitening(
                self._X_train, self._X_val, self._X_test)

        # rescale to [0,1] (w.r.t. min and max in train data)
        rescale_to_unit_interval(self._X_train, self._X_val, self._X_test)

        # PCA
        if Cfg.pca:
            self._X_train, self._X_val, self._X_test = pca(
                self._X_train, self._X_val, self._X_test, 0.95)

    flush_last_line()
    print("Max pixel value: ", np.amax(self._X_train))
    print("Data loaded.")
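# Worked example of the even-batch subsetting used in the loaders above
# (numbers are illustrative, not taken from any configuration): with
# n_train = 1050 and Cfg.batch_size = 128, (1050 // 128) * 128 = 1024 samples
# are kept, and Cfg.n_batches = ceil(1024 / 128) = 8 full batches.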
def load_data(self, original_scale=False):

    print("[INFO]: Please wait while %s data is being loaded..."
          % self.dataset_name)

    X, y = load_lhc_train_images(self.data_path)
    X_test, y_test = load_lhc_test_images(self.data_path)

    if Cfg.ad_experiment:

        # set normal and anomalous class
        normal = [1]
        outliers = [0]

        # extract normal and anomalous class
        X_norm, X_out, y_norm, y_out = extract_norm_and_out(
            X, y, normal=normal, outlier=outliers)

        # reduce outliers to the defined fraction
        n_norm = len(y_norm)
        n_out = int(np.ceil(float(Cfg.out_frac) * n_norm
                            / (1 - float(Cfg.out_frac))))

        # shuffle to obtain random validation splits
        np.random.seed(self.seed)
        perm_norm = np.random.permutation(len(y_norm))
        perm_out = np.random.permutation(len(y_out))

        # split into training and validation set
        n_norm_split = int(Cfg.lhc_val_frac * n_norm)
        n_out_split = int(Cfg.lhc_val_frac * n_out)
        self._X_train = np.concatenate(
            (X_norm[perm_norm[n_norm_split:]],
             X_out[perm_out[:n_out][n_out_split:]]))
        self._y_train = np.append(y_norm[perm_norm[n_norm_split:]],
                                  y_out[perm_out[:n_out][n_out_split:]])
        self._X_val = np.concatenate(
            (X_norm[perm_norm[:n_norm_split]],
             X_out[perm_out[:n_out][:n_out_split]]))
        self._y_val = np.append(y_norm[perm_norm[:n_norm_split]],
                                y_out[perm_out[:n_out][:n_out_split]])

        # shuffle data (since batches are extracted block-wise)
        self.n_train = len(self._y_train)
        self.n_val = len(self._y_val)
        perm_train = np.random.permutation(self.n_train)
        perm_val = np.random.permutation(self.n_val)
        self._X_train = self._X_train[perm_train]
        self._y_train = self._y_train[perm_train]
        self._X_val = self._X_val[perm_val]
        self._y_val = self._y_val[perm_val]

        # Subset train set such that we only get batches of the same size
        self.n_train = (self.n_train // Cfg.batch_size) * Cfg.batch_size
        subset = np.random.choice(len(self._X_train), self.n_train,
                                  replace=False)
        self._X_train = self._X_train[subset]
        self._y_train = self._y_train[subset]

        # Adjust number of batches
        Cfg.n_batches = int(np.ceil(self.n_train * 1. / Cfg.batch_size))

        # test set
        X_norm, X_out, y_norm, y_out = extract_norm_and_out(
            X_test, y_test, normal=normal, outlier=outliers)

        self._X_test = np.concatenate((X_norm, X_out))
        self._y_test = np.append(y_norm, y_out)
        perm_test = np.random.permutation(len(self._y_test))
        self._X_test = self._X_test[perm_test]
        self._y_test = self._y_test[perm_test]
        self.n_test = len(self._y_test)

    else:
        # split into training, validation, and test sets
        np.random.seed(self.seed)
        perm = np.random.permutation(len(X))

        self._X_train = X[perm[self.n_val:]]
        self._y_train = y[perm[self.n_val:]]
        self._X_val = X[perm[:self.n_val]]
        self._y_val = y[perm[:self.n_val]]
        self._X_test = X_test
        self._y_test = y_test

    # normalize data (if original scale should not be preserved)
    if not original_scale:

        # simple rescaling to [0,1]
        normalize_data(self._X_train, self._X_val, self._X_test,
                       scale=np.float32(255))

        # global contrast normalization
        if Cfg.gcn:
            global_contrast_normalization(self._X_train, self._X_val,
                                          self._X_test,
                                          scale=Cfg.unit_norm_used)

        # ZCA whitening
        if Cfg.zca_whitening:
            self._X_train, self._X_val, self._X_test = zca_whitening(
                self._X_train, self._X_val, self._X_test)

        # rescale to [0,1] (w.r.t. min and max in train data)
        rescale_to_unit_interval(self._X_train, self._X_val, self._X_test)

        # PCA
        if Cfg.pca:
            self._X_train, self._X_val, self._X_test = pca(
                self._X_train, self._X_val, self._X_test, 0.95)

    flush_last_line()
    print("[INFO]: Data loaded.")
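# Hypothetical end-to-end usage of one of the loaders above (the class name,
# constructor arguments, and path are assumptions, not part of this module):
#
#   loader = LHC_DataLoader(data_path="/path/to/lhc", seed=0)
#   loader.load_data(original_scale=False)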