Example #1
    def load_data(self, original_scale=False):

        print("Loading data...")

        X = load_mnist_images('%strain-images-idx3-ubyte.gz' % self.data_path)
        y = load_mnist_labels('%strain-labels-idx1-ubyte.gz' % self.data_path)
        X_test = load_mnist_images('%st10k-images-idx3-ubyte.gz' %
                                   self.data_path)
        y_test = load_mnist_labels('%st10k-labels-idx1-ubyte.gz' %
                                   self.data_path)

        if Cfg.ad_experiment:
            X_norm, y_norm = get_norm_for_mnist()

            # shuffle to obtain random validation splits
            np.random.seed(self.seed)
            perm_norm = np.random.permutation(len(y_norm))
            n_norm = len(y_norm)

            # split into training and validation set
            n_norm_split = int(Cfg.mnist_val_frac * n_norm)
            self._X_train = X_norm[perm_norm[n_norm_split:]]

            self._y_train = y_norm[perm_norm[n_norm_split:]]
            self._X_val = X_norm[perm_norm[:n_norm_split]]
            self._y_val = y_norm[perm_norm[:n_norm_split]]

            # shuffle data (since batches are extracted block-wise)
            self.n_train = len(self._y_train)
            self.n_val = len(self._y_val)
            perm_train = np.random.permutation(self.n_train)
            perm_val = np.random.permutation(self.n_val)
            self._X_train = self._X_train[perm_train]
            self._y_train = self._y_train[perm_train]
            self._X_val = self._X_val[perm_val]
            self._y_val = self._y_val[perm_val]

            print("Number of data in training: " + str(self.n_train))
            print("Number of data in validation: " + str(self.n_val))

            # Subset train set such that we only get batches of the same size
            self.n_train = (self.n_train // Cfg.batch_size) * Cfg.batch_size
            subset = np.random.choice(len(self._X_train),
                                      self.n_train,
                                      replace=False)
            self._X_train = self._X_train[subset]
            self._y_train = self._y_train[subset]

            # Adjust number of batches
            Cfg.n_batches = int(np.ceil(self.n_train * 1. / Cfg.batch_size))

            # test set
            X_norm, X_out, y_norm, y_out = get_outlier_for_mnist()
            self._X_test = np.concatenate((X_norm, X_out))
            self._y_test = np.append(y_norm, y_out)
            perm_test = np.random.permutation(len(self._y_test))
            self._X_test = self._X_test[perm_test]
            self._y_test = self._y_test[perm_test]
            self.n_test = len(self._y_test)
            print("Number of outlier data in testing: " +
                  str(np.shape(y_out)[0]))
            print("Number of normal data in testing: " +
                  str(np.shape(y_norm)[0]))

        else:
            # split into training, validation, and test sets
            np.random.seed(self.seed)
            perm = np.random.permutation(len(X))

            self._X_train = X[perm[self.n_val:]]
            self._y_train = y[perm[self.n_val:]]
            self._X_val = X[perm[:self.n_val]]
            self._y_val = y[perm[:self.n_val]]
            self._X_test = X_test
            self._y_test = y_test

        # normalize data (if original scale should not be preserved)
        if not original_scale:

            # simple rescaling to [0,1]
            normalize_data(self._X_train,
                           self._X_val,
                           self._X_test,
                           scale=np.float32(255))

            # global contrast normalization
            if Cfg.gcn:
                global_contrast_normalization(self._X_train,
                                              self._X_val,
                                              self._X_test,
                                              scale=Cfg.unit_norm_used)

            # ZCA whitening
            if Cfg.zca_whitening:
                self._X_train, self._X_val, self._X_test = zca_whitening(
                    self._X_train, self._X_val, self._X_test)

            # rescale to [0,1] (w.r.t. min and max in train data)
            rescale_to_unit_interval(self._X_train, self._X_val, self._X_test)

            # PCA
            if Cfg.pca:
                self._X_train, self._X_val, self._X_test = pca(
                    self._X_train, self._X_val, self._X_test, 0.95)

        flush_last_line()
        print("Data loaded.")
Example #2
    def load_data(self, original_scale=False):

        print("Loading data...")

        # load training data
        X, y = [], []
        count = 1
        filename = '%s/data_batch_%i' % (self.data_path, count)
        while os.path.exists(filename):
            with open(filename, 'rb') as f:
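                # note: on Python 3 these CIFAR-10 batches need
                # pickle.load(f, encoding='latin1')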
                batch = pickle.load(f)
            X.append(batch['data'])
            y.append(batch['labels'])
            count += 1
            filename = '%s/data_batch_%i' % (self.data_path, count)

        # reshape data and cast them properly
        X = np.concatenate(X).reshape(-1, 3, 32, 32).astype(np.float32)
        y = np.concatenate(y).astype(np.int32)

        # load test set
        path = '%s/test_batch' % self.data_path
        with open(path, 'rb') as f:
            batch = pickle.load(f)

        # reshaping and casting for test data
        X_test = batch['data'].reshape(-1, 3, 32, 32).astype(np.float32)
        y_test = np.array(batch['labels'], dtype=np.int32)

        if Cfg.ad_experiment:

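            # Cfg.cifar10_normal and Cfg.cifar10_outlier are presumably
            # string-encoded lists (e.g. "[1]") that eval turns into
            # lists of class labels.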
            normal = eval(Cfg.cifar10_normal)
            outliers = eval(Cfg.cifar10_outlier)

            # extract normal and anomalous class
            X_norm, X_out, y_norm, y_out, _, _ = extract_norm_and_out(
                X, y, normal=normal, outlier=outliers)

            # reduce outliers to fraction defined
            n_norm = len(y_norm)
            n_out = int(np.ceil(Cfg.out_frac * n_norm / (1 - Cfg.out_frac)))

            # shuffle to obtain random validation splits
            np.random.seed(self.seed)
            perm_norm = np.random.permutation(len(y_norm))
            perm_out = np.random.permutation(len(y_out))

            # split into training and validation set
            n_norm_split = int(Cfg.cifar10_val_frac * n_norm)
            n_out_split = int(Cfg.cifar10_val_frac * n_out)
            self._X_train = np.concatenate(
                (X_norm[perm_norm[n_norm_split:]],
                 X_out[perm_out[:n_out][n_out_split:]]))
            self._y_train = np.append(y_norm[perm_norm[n_norm_split:]],
                                      y_out[perm_out[:n_out][n_out_split:]])
            self._X_val = np.concatenate(
                (X_norm[perm_norm[:n_norm_split]],
                 X_out[perm_out[:n_out][:n_out_split]]))
            self._y_val = np.append(y_norm[perm_norm[:n_norm_split]],
                                    y_out[perm_out[:n_out][:n_out_split]])

            # shuffle data (since batches are extracted block-wise)
            self.n_train = len(self._y_train)
            self.n_val = len(self._y_val)
            perm_train = np.random.permutation(self.n_train)
            perm_val = np.random.permutation(self.n_val)
            self._X_train = self._X_train[perm_train]
            self._y_train = self._y_train[perm_train]
            self._X_val = self._X_val[perm_val]
            self._y_val = self._y_val[perm_val]

            # Subset train set such that we only get batches of the same size
            self.n_train = (self.n_train // Cfg.batch_size) * Cfg.batch_size
            subset = np.random.choice(len(self._X_train),
                                      self.n_train,
                                      replace=False)
            self._X_train = self._X_train[subset]
            self._y_train = self._y_train[subset]

            # Adjust number of batches
            Cfg.n_batches = int(np.ceil(self.n_train * 1. / Cfg.batch_size))

            # test set
            X_norm, X_out, y_norm, y_out, idx_norm, idx_out = extract_norm_and_out(
                X_test, y_test, normal=normal, outlier=outliers)

            # store original test labels for visualisation
            yo_norm = y_test[idx_norm]
            yo_out = y_test[idx_out]
            self._yo_test = np.append(yo_norm, yo_out)

            self._X_test = np.concatenate((X_norm, X_out))
            self._y_test = np.append(y_norm, y_out)
            perm_test = np.random.permutation(len(self._y_test))
            self._X_test = self._X_test[perm_test]
            self._y_test = self._y_test[perm_test]
            self._yo_test = self._yo_test[perm_test]
            self.n_test = len(self._y_test)

        else:
            # split into training and validation sets with stored seed
            np.random.seed(self.seed)
            perm = np.random.permutation(len(X))

            self._X_train = X[perm[self.n_val:]]
            self._y_train = y[perm[self.n_val:]]
            self._X_val = X[perm[:self.n_val]]
            self._y_val = y[perm[:self.n_val]]

            self._X_test = X_test
            self._y_test = y_test

        # normalize data (if original scale should not be preserved)
        if not original_scale:

            # simple rescaling to [0,1]
            normalize_data(self._X_train,
                           self._X_val,
                           self._X_test,
                           scale=np.float32(255))

            # global contrast normalization
            if Cfg.gcn:
                global_contrast_normalization(self._X_train,
                                              self._X_val,
                                              self._X_test,
                                              scale=Cfg.unit_norm_used)

            # ZCA whitening
            if Cfg.zca_whitening:
                self._X_train, self._X_val, self._X_test = zca_whitening(
                    self._X_train, self._X_val, self._X_test)

            # rescale to [0,1] (w.r.t. min and max in train data)
            rescale_to_unit_interval(self._X_train, self._X_val, self._X_test)

            # PCA
            if Cfg.pca:
                self._X_train, self._X_val, self._X_test = pca(
                    self._X_train, self._X_val, self._X_test, 0.95)

        flush_last_line()
        print("Data loaded.")
Example #3
    def load_data(self, original_scale=False):

        print("Loading data...")

        # get train data
        X = readTrafficSigns(rootpath=self.data_path,
                             which_set="train",
                             label=14)

        # get (normal) test data
        # X_test_norm = readTrafficSigns(rootpath=self.data_path, which_set="test", label=14)
        # sub-sample a normal test set of 100 points from the training images
        np.random.seed(self.seed)
        perm = np.random.permutation(len(X))
        X_test_norm = X[perm[:100], ...]
        self._X_train = X[perm[100:], ...]
        self.n_train = len(self._X_train)
        self._y_train = np.zeros(self.n_train, dtype=np.uint8)

        # load (adversarial) test data
        X_test_adv = np.load(self.data_path + "/Images_150.npy")
        labels_adv = np.load(self.data_path + "/Labels_150.npy")

        self._X_test = np.concatenate(
            (X_test_norm, X_test_adv[labels_adv == 1]),
            axis=0).astype(np.float32)
        self._y_test = np.concatenate(
            (np.zeros(len(X_test_norm), dtype=np.uint8),
             np.ones(int(np.sum(labels_adv)), dtype=np.uint8)),
            axis=0)
        self.n_test = len(self._X_test)

        # since val set is referenced at some points initialize empty np arrays
        self._X_val = np.empty(shape=(0, 3, 32, 32), dtype=np.float32)
        self._y_val = np.empty(shape=(0,), dtype=np.uint8)

        # shuffle
        np.random.seed(self.seed)
        perm_train = np.random.permutation(self.n_train)
        perm_test = np.random.permutation(self.n_test)
        self._X_train = self._X_train[perm_train, ...]
        self._y_train = self._y_train[perm_train]
        self._X_test = self._X_test[perm_test, ...]
        self._y_test = self._y_test[perm_test]

        # Adjust number of batches
        Cfg.n_batches = int(np.ceil(self.n_train * 1. / Cfg.batch_size))

        # normalize data (if original scale should not be preserved)
        if not original_scale:

            # simple rescaling to [0,1]
            normalize_data(self._X_train,
                           self._X_val,
                           self._X_test,
                           scale=np.float32(255))

            # global contrast normalization
            if Cfg.gcn:
                global_contrast_normalization(self._X_train,
                                              self._X_val,
                                              self._X_test,
                                              scale=Cfg.unit_norm_used)

            # ZCA whitening
            if Cfg.zca_whitening:
                self._X_train, self._X_val, self._X_test = zca_whitening(
                    self._X_train, self._X_val, self._X_test)

            # rescale to [0,1] (w.r.t. min and max in train data)
            rescale_to_unit_interval(self._X_train, self._X_val, self._X_test)

            # PCA
            if Cfg.pca:
                self._X_train, self._X_val, self._X_test = pca(
                    self._X_train, self._X_val, self._X_test, 0.95)

        flush_last_line()
        print("Data loaded.")
Example #4
    def load_data(self, original_scale=False):

        print("Loading data...")

        X = load_mnist_images('%strain-images-idx3-ubyte.gz' % self.data_path)
        y = load_mnist_labels('%strain-labels-idx1-ubyte.gz' % self.data_path)
        X_test = load_mnist_images('%st10k-images-idx3-ubyte.gz' %
                                   self.data_path)
        y_test = load_mnist_labels('%st10k-labels-idx1-ubyte.gz' %
                                   self.data_path)

        if Cfg.ad_experiment:

            # set normal and anomalous class
            normal = []
            outliers = []

            if Cfg.mnist_normal == -1:
                normal = list(range(0, 10))
                normal.remove(Cfg.mnist_outlier)
            else:
                normal.append(Cfg.mnist_normal)

            if Cfg.mnist_outlier == -1:
                outliers = list(range(0, 10))
                outliers.remove(Cfg.mnist_normal)
            else:
                outliers.append(Cfg.mnist_outlier)

            # extract normal and anomalous class
            X_norm, X_out, y_norm, y_out = extract_norm_and_out(
                X, y, normal=normal, outlier=outliers)

            # reduce outliers to fraction defined
            n_norm = len(y_norm)
            n_out = int(np.ceil(Cfg.out_frac * n_norm / (1 - Cfg.out_frac)))

            # shuffle to obtain random validation splits
            np.random.seed(self.seed)
            perm_norm = np.random.permutation(len(y_norm))
            perm_out = np.random.permutation(len(y_out))

            # split into training and validation set
            n_norm_split = int(Cfg.mnist_val_frac * n_norm)
            n_out_split = int(Cfg.mnist_val_frac * n_out)
            self._X_train = np.concatenate(
                (X_norm[perm_norm[n_norm_split:]],
                 X_out[perm_out[:n_out][n_out_split:]]))
            self._y_train = np.append(y_norm[perm_norm[n_norm_split:]],
                                      y_out[perm_out[:n_out][n_out_split:]])
            self._X_val = np.concatenate(
                (X_norm[perm_norm[:n_norm_split]],
                 X_out[perm_out[:n_out][:n_out_split]]))
            self._y_val = np.append(y_norm[perm_norm[:n_norm_split]],
                                    y_out[perm_out[:n_out][:n_out_split]])

            # shuffle data (since batches are extracted block-wise)
            self.n_train = len(self._y_train)
            self.n_val = len(self._y_val)
            perm_train = np.random.permutation(self.n_train)
            perm_val = np.random.permutation(self.n_val)
            self._X_train = self._X_train[perm_train]
            self._y_train = self._y_train[perm_train]
            self._X_val = self._X_val[perm_val]
            self._y_val = self._y_val[perm_val]

            # Subset train set such that we only get batches of the same size
            self.n_train = (self.n_train // Cfg.batch_size) * Cfg.batch_size
            subset = np.random.choice(len(self._X_train),
                                      self.n_train,
                                      replace=False)
            self._X_train = self._X_train[subset]
            self._y_train = self._y_train[subset]

            # Adjust number of batches
            Cfg.n_batches = int(np.ceil(self.n_train * 1. / Cfg.batch_size))

            # test set
            X_norm, X_out, y_norm, y_out = extract_norm_and_out(
                X_test, y_test, normal=normal, outlier=outliers)
            self._X_test = np.concatenate((X_norm, X_out))
            self._y_test = np.append(y_norm, y_out)
            perm_test = np.random.permutation(len(self._y_test))
            self._X_test = self._X_test[perm_test]
            self._y_test = self._y_test[perm_test]
            self.n_test = len(self._y_test)

        else:
            # split into training, validation, and test sets
            np.random.seed(self.seed)
            perm = np.random.permutation(len(X))

            self._X_train = X[perm[self.n_val:]]
            self._y_train = y[perm[self.n_val:]]
            self._X_val = X[perm[:self.n_val]]
            self._y_val = y[perm[:self.n_val]]
            self._X_test = X_test
            self._y_test = y_test

        # normalize data (if original scale should not be preserved)
        if not original_scale:

            # simple rescaling to [0,1]
            normalize_data(self._X_train,
                           self._X_val,
                           self._X_test,
                           scale=np.float32(255))

            # global contrast normalization
            if Cfg.gcn:
                global_contrast_normalization(self._X_train,
                                              self._X_val,
                                              self._X_test,
                                              scale=Cfg.unit_norm_used)

            # ZCA whitening
            if Cfg.zca_whitening:
                self._X_train, self._X_val, self._X_test = zca_whitening(
                    self._X_train, self._X_val, self._X_test)

            # rescale to [0,1] (w.r.t. min and max in train data)
            rescale_to_unit_interval(self._X_train, self._X_val, self._X_test)

            # PCA
            if Cfg.pca:
                self._X_train, self._X_val, self._X_test = pca(
                    self._X_train, self._X_val, self._X_test, 0.95)

        flush_last_line()
        print("Data loaded.")
Example #5
    def load_data(self, original_scale=False):

        print("Loading data...")

        # load normal and outlier data
        self._X_train = [img_to_array(load_img(Cfg.train_folder + filename))
                         for filename in os.listdir(Cfg.train_folder)][:Cfg.n_train]
        self._X_val = [img_to_array(load_img(Cfg.val_folder + filename))
                       for filename in os.listdir(Cfg.val_folder)][:Cfg.n_val]
        n_test_out = Cfg.n_test - Cfg.n_test_in
        _X_test_in = [img_to_array(load_img(Cfg.test_in_folder + filename))
                      for filename in os.listdir(Cfg.test_in_folder)][:Cfg.n_test_in]
        _X_test_out = [img_to_array(load_img(Cfg.test_out_folder + filename))
                       for filename in os.listdir(Cfg.test_out_folder)][:n_test_out]
        _y_test_in = np.zeros((Cfg.n_test_in,), dtype=np.int32)
        _y_test_out = np.ones((n_test_out,), dtype=np.int32)
        self._X_test = np.concatenate([_X_test_in, _X_test_out])
        self._y_test = np.concatenate([_y_test_in, _y_test_out])
        self.out_frac = Cfg.out_frac

        # transpose to channels first
        self._X_train = np.moveaxis(self._X_train, -1, 1)
        self._X_val = np.moveaxis(self._X_val, -1, 1)
        self._X_test = np.moveaxis(self._X_test, -1, 1)


        # cast data properly
        self._X_train = self._X_train.astype(np.float32)
        self._X_val = self._X_val.astype(np.float32)
        self._X_test = self._X_test.astype(np.float32)
        self._y_test = self._y_test.astype(np.int32)

        # train and val labels are 0, since all are normal class
        self._y_train = np.zeros((len(self._X_train),), dtype=np.int32)
        self._y_val = np.zeros((len(self._X_val),), dtype=np.int32)

        if Cfg.ad_experiment:
            # shuffle to obtain random validation splits
            np.random.seed(self.seed)

            # shuffle data (since batches are extracted block-wise)
            self.n_train = len(self._y_train)
            self.n_val = len(self._y_val)
            perm_train = np.random.permutation(self.n_train)
            perm_val = np.random.permutation(self.n_val)
            self._X_train = self._X_train[perm_train]
            self._y_train = self._y_train[perm_train]
            self._X_val = self._X_val[perm_val]
            self._y_val = self._y_val[perm_val]
            print("Shuffled data")

            # Subset train set such that we only get batches of the same size
            assert self.n_train >= Cfg.batch_size
            self.n_train = (self.n_train // Cfg.batch_size) * Cfg.batch_size
            subset = np.random.choice(len(self._X_train),
                                      self.n_train,
                                      replace=False)
            self._X_train = self._X_train[subset]
            self._y_train = self._y_train[subset]

            # Adjust number of batches
            Cfg.n_batches = int(np.ceil(self.n_train * 1. / Cfg.batch_size))

        # normalize data (if original scale should not be preserved)
        if not original_scale:

            # simple rescaling to [0,1]
            normalize_data(self._X_train, self._X_val, self._X_test, scale=np.float32(255))

            # global contrast normalization
            if Cfg.gcn:
                global_contrast_normalization(self._X_train, self._X_val, self._X_test, scale=Cfg.unit_norm_used)

            # ZCA whitening
            if Cfg.zca_whitening:
                self._X_train, self._X_val, self._X_test = zca_whitening(self._X_train, self._X_val, self._X_test)

            # rescale to [0,1] (w.r.t. min and max in train data)
            rescale_to_unit_interval(self._X_train, self._X_val, self._X_test)

            # PCA
            if Cfg.pca:
                self._X_train, self._X_val, self._X_test = pca(self._X_train, self._X_val, self._X_test, 0.95)

        flush_last_line()
        print("Max pixel value: ", np.amax(self._X_train))
        print("Data loaded.")
Example #6
    def load_data(self, original_scale=False):

        print("[INFO ]: ", "Please wait while ", self.dataset_name,
              " data is being loaded...")

        [X, y] = load_lhc_train_images(self.data_path)
        [X_test, y_test] = load_lhc_test_images(self.data_path)

        if Cfg.ad_experiment:

            # set normal and anomalous class
            normal = [1]
            outliers = [0]

            # extract normal and anomalous class
            X_norm, X_out, y_norm, y_out = extract_norm_and_out(
                X, y, normal=normal, outlier=outliers)

            # reduce outliers to fraction defined
            n_norm = len(y_norm)
            n_out = int(
                np.ceil(
                    float(Cfg.out_frac) * n_norm / (1 - float(Cfg.out_frac))))

            # shuffle to obtain random validation splits
            np.random.seed(self.seed)
            perm_norm = np.random.permutation(len(y_norm))
            perm_out = np.random.permutation(len(y_out))

            # split into training and validation set
            n_norm_split = int(Cfg.lhc_val_frac * n_norm)
            n_out_split = int(Cfg.lhc_val_frac * n_out)
            self._X_train = np.concatenate(
                (X_norm[perm_norm[n_norm_split:]],
                 X_out[perm_out[:n_out][n_out_split:]]))
            self._y_train = np.append(y_norm[perm_norm[n_norm_split:]],
                                      y_out[perm_out[:n_out][n_out_split:]])
            self._X_val = np.concatenate(
                (X_norm[perm_norm[:n_norm_split]],
                 X_out[perm_out[:n_out][:n_out_split]]))
            self._y_val = np.append(y_norm[perm_norm[:n_norm_split]],
                                    y_out[perm_out[:n_out][:n_out_split]])

            # shuffle data (since batches are extracted block-wise)
            self.n_train = len(self._y_train)
            self.n_val = len(self._y_val)
            perm_train = np.random.permutation(self.n_train)
            perm_val = np.random.permutation(self.n_val)
            self._X_train = self._X_train[perm_train]
            self._y_train = self._y_train[perm_train]
            self._X_val = self._X_val[perm_val]
            self._y_val = self._y_val[perm_val]

            # Subset train set such that we only get batches of the same size
            self.n_train = (self.n_train // Cfg.batch_size) * Cfg.batch_size
            subset = np.random.choice(len(self._X_train),
                                      self.n_train,
                                      replace=False)
            self._X_train = self._X_train[subset]
            self._y_train = self._y_train[subset]

            # Adjust number of batches
            Cfg.n_batches = int(np.ceil(self.n_train * 1. / Cfg.batch_size))

            # test set
            X_norm, X_out, y_norm, y_out = extract_norm_and_out(
                X_test, y_test, normal=normal, outlier=outliers)
            self._X_test = np.concatenate((X_norm, X_out))
            self._y_test = np.append(y_norm, y_out)
            perm_test = np.random.permutation(len(self._y_test))
            self._X_test = self._X_test[perm_test]
            self._y_test = self._y_test[perm_test]
            self.n_test = len(self._y_test)

        else:
            # split into training, validation, and test sets
            np.random.seed(self.seed)
            perm = np.random.permutation(len(X))

            self._X_train = X[perm[self.n_val:]]
            self._y_train = y[perm[self.n_val:]]
            self._X_val = X[perm[:self.n_val]]
            self._y_val = y[perm[:self.n_val]]
            self._X_test = X_test
            self._y_test = y_test

        # normalize data (if original scale should not be preserved)
        if not original_scale:

            # simple rescaling to [0,1]
            normalize_data(self._X_train,
                           self._X_val,
                           self._X_test,
                           scale=np.float32(255))

            # global contrast normalization
            if Cfg.gcn:
                global_contrast_normalization(self._X_train,
                                              self._X_val,
                                              self._X_test,
                                              scale=Cfg.unit_norm_used)

            # ZCA whitening
            if Cfg.zca_whitening:
                self._X_train, self._X_val, self._X_test = zca_whitening(
                    self._X_train, self._X_val, self._X_test)

            # rescale to [0,1] (w.r.t. min and max in train data)
            rescale_to_unit_interval(self._X_train, self._X_val, self._X_test)

            # PCA
            if Cfg.pca:
                self._X_train, self._X_val, self._X_test = pca(
                    self._X_train, self._X_val, self._X_test, 0.95)

        flush_last_line()
        print("[INFO] : Data loaded.")
Example #7
    def load_data(self, original_scale=False):

        print("Loading data...")

        X, y = load_mobiFall_data('%smobiFall/train_mobiFall' % self.data_path)

        X_test, y_test = load_mobiFall_data('%smobiFall/test_mobiFall' %
                                            self.data_path)

        if Cfg.ad_experiment:

            # set normal and anomalous class
            normal = eval(Cfg.mobiFall_normal)
            outliers = eval(Cfg.mobiFall_outlier)

            # extract normal and anomalous class
            X_norm, X_out, y_norm, y_out, idx_norm, idx_out = extract_norm_and_out(
                X, y, normal=normal, outlier=outliers)

            # reduce outliers to fraction defined
            n_norm = len(y_norm)
            n_out = int(np.ceil(Cfg.out_frac * n_norm / (1 - Cfg.out_frac)))

            # shuffle to obtain random validation splits
            np.random.seed(self.seed)
            perm_norm = np.random.permutation(len(y_norm))
            perm_out = np.random.permutation(len(y_out))

            # split into training and validation set
            n_norm_split = int(Cfg.mobiFall_val_frac * n_norm)
            n_out_split = int(Cfg.mobiFall_val_frac * n_out)
            self._X_train = np.concatenate(
                (X_norm[perm_norm[n_norm_split:]],
                 X_out[perm_out[:n_out][n_out_split:]]))
            self._y_train = np.append(y_norm[perm_norm[n_norm_split:]],
                                      y_out[perm_out[:n_out][n_out_split:]])
            self._X_val = np.concatenate(
                (X_norm[perm_norm[:n_norm_split]],
                 X_out[perm_out[:n_out][:n_out_split]]))
            self._y_val = np.append(y_norm[perm_norm[:n_norm_split]],
                                    y_out[perm_out[:n_out][:n_out_split]])

            # shuffle data (since batches are extracted block-wise)
            self.n_train = len(self._y_train)
            self.n_val = len(self._y_val)
            perm_train = np.random.permutation(self.n_train)
            perm_val = np.random.permutation(self.n_val)
            self._X_train = self._X_train[perm_train]
            self._y_train = self._y_train[perm_train]
            self._X_val = self._X_val[perm_val]
            self._y_val = self._y_val[perm_val]

            # Subset train set such that we only get batches of the same size
            self.n_train = (self.n_train // Cfg.batch_size) * Cfg.batch_size
            subset = np.random.choice(len(self._X_train),
                                      self.n_train,
                                      replace=False)
            self._X_train = self._X_train[subset]
            self._y_train = self._y_train[subset]

            # Adjust number of batches
            Cfg.n_batches = int(np.ceil(self.n_train * 1. / Cfg.batch_size))

            # test set
            X_norm, X_out, y_norm, y_out, idx_norm, idx_out = extract_norm_and_out(
                X_test, y_test, normal=normal, outlier=outliers)

            yo_norm = y_test[idx_norm]
            yo_out = y_test[idx_out]
            self._yo_test = np.append(yo_norm, yo_out)

            self._X_test = np.concatenate((X_norm, X_out))
            self._y_test = np.append(y_norm, y_out)
            perm_test = np.random.permutation(len(self._y_test))
            self._X_test = self._X_test[perm_test]
            self._y_test = self._y_test[perm_test]
            self._yo_test = self._yo_test[perm_test]
            self.n_test = len(self._y_test)

        else:
            # split into training, validation, and test sets
            np.random.seed(self.seed)
            perm = np.random.permutation(len(X))

            self._X_train = X[perm[self.n_val:]]
            self._y_train = y[perm[self.n_val:]]
            self._X_val = X[perm[:self.n_val]]
            self._y_val = y[perm[:self.n_val]]
            self._X_test = X_test
            self._y_test = y_test

        # normalize data (if original scale should not be preserved)
        if not original_scale:

            # rescale to [0,1] (w.r.t. min and max in train data)
            rescale_to_unit_interval(self._X_train, self._X_val, self._X_test)

            # PCA
            if Cfg.pca:
                self._X_train, self._X_val, self._X_test = pca(
                    self._X_train, self._X_val, self._X_test, 0.95)

        flush_last_line()
        print("Data loaded.")