Example #1
def load_data():
    file = os.path.join(io.get_data_root(), 'data', 'power/data.npy')
    try:
        return np.load(file)
    except FileNotFoundError:
        download_data()
        return np.load(file)
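The same try/download/retry pattern recurs in the loaders below; a minimal sketch of how it could be factored into a shared helper (the helper name is an assumption, not repository code; download_fn would be the project's download_data from Example #6):

import numpy as np

def load_or_download(path, download_fn):
    try:
        return np.load(path)
    except FileNotFoundError:
        download_fn()           # fetch and extract the archive once
        return np.load(path)    # a second failure here propagates as usual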
Example #2
def preprocess_and_save_miniboone():
    train, val, test = load_miniboone()
    splits = (('train', train), ('val', val), ('test', test))
    for name, data in splits:
        file = os.path.join(io.get_data_root(), 'data',
                            'miniboone/{}.npy'.format(name))
        np.save(file, data)
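The save loop assumes data/miniboone/ already exists (presumably created when the archive is extracted); if the snippet is reused elsewhere, a hedged variant that creates the directory first might look like this (save_split is a hypothetical helper, not repository code):

import os
import numpy as np

def save_split(data_root, name, data):
    # Create data/miniboone/ on first use, then write the split.
    out_dir = os.path.join(data_root, 'data', 'miniboone')
    os.makedirs(out_dir, exist_ok=True)
    np.save(os.path.join(out_dir, '{}.npy'.format(name)), data)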
Example #3
def load_bsds300():
    path = os.path.join(io.get_data_root(), 'data', 'BSDS300/BSDS300.hdf5')
    try:
        file = h5py.File(path, 'r')
    except FileNotFoundError:
        download_data()
        file = h5py.File(path, 'r')

    return file['train'], file['validation'], file['test']
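Note that load_bsds300 deliberately leaves the HDF5 handle open, since the returned objects are lazy h5py datasets backed by the file. A hedged alternative (the function name is an assumption) that copies the splits into memory so the file can be closed:

import h5py

def load_bsds300_arrays(path):
    with h5py.File(path, 'r') as f:
        # Slicing with [:] materialises each split as an in-memory array.
        return f['train'][:], f['validation'][:], f['test'][:]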
Example #4
def __init__(self, split='train', frac=None):
    path = os.path.join(io.get_data_root(), 'data',
                        'miniboone/{}.npy'.format(split))
    try:
        self.data = np.load(path).astype(np.float32)
    except FileNotFoundError:
        print('Preprocessing and saving Miniboone...')
        preprocess_and_save_miniboone()
        print('Done!')
        self.data = np.load(path).astype(np.float32)
    self.n, self.dim = self.data.shape
    if frac is not None:
        self.n = int(frac * self.n)
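This constructor reads like part of a dataset class; the sketch below shows how such a wrapper is typically completed, assuming a torch.utils.data.Dataset base class (class name, base class, and method bodies are assumptions, shown mainly to make the frac truncation concrete):

import numpy as np
from torch.utils.data import Dataset, DataLoader

class ArrayDataset(Dataset):
    def __init__(self, data, frac=None):
        self.data = data.astype(np.float32)
        self.n, self.dim = self.data.shape
        if frac is not None:
            self.n = int(frac * self.n)   # expose only the first frac rows

    def __len__(self):
        return self.n

    def __getitem__(self, index):
        return self.data[index]

# e.g. DataLoader(ArrayDataset(np.random.randn(1000, 43), frac=0.5),
#                 batch_size=128, shuffle=True)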
Example #5
def load_miniboone():
    def load_data(root_path):
        # NOTE: To remember how the pre-processing was done.
        # data_ = pd.read_csv(root_path, names=[str(x) for x in range(50)], delim_whitespace=True)
        # print data_.head()
        # data_ = data_.as_matrix()
        # # Remove some random outliers
        # indices = (data_[:, 0] < -100)
        # data_ = data_[~indices]
        #
        # i = 0
        # # Remove any features that have too many re-occurring real values.
        # features_to_remove = []
        # for feature in data_.T:
        #     c = Counter(feature)
        #     max_count = np.array([v for k, v in sorted(c.iteritems())])[0]
        #     if max_count > 5:
        #         features_to_remove.append(i)
        #     i += 1
        # data_ = data_[:, np.array([i for i in range(data_.shape[1]) if i not in features_to_remove])]
        # np.save("~/data_/miniboone/data_.npy", data_)

        try:
            data = np.load(root_path)
        except FileNotFoundError:
            download_data()
            data = np.load(root_path)
        N_test = int(0.1 * data.shape[0])
        data_test = data[-N_test:]
        data = data[0:-N_test]
        N_validate = int(0.1 * data.shape[0])
        data_validate = data[-N_validate:]
        data_train = data[0:-N_validate]

        return data_train, data_validate, data_test

    def load_data_normalised(root_path):
        data_train, data_validate, data_test = load_data(root_path)
        data = np.vstack((data_train, data_validate))
        mu = data.mean(axis=0)
        s = data.std(axis=0)
        data_train = (data_train - mu) / s
        data_validate = (data_validate - mu) / s
        data_test = (data_test - mu) / s

        return data_train, data_validate, data_test

    path = os.path.join(io.get_data_root(), 'data', 'miniboone/data.npy')
    return load_data_normalised(path)
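A hedged usage sketch (it assumes load_miniboone above is importable and the MiniBooNE file is in place, otherwise the loader triggers a download) that checks the joint standardisation of the train and validation splits:

import numpy as np

train, val, test = load_miniboone()
print(train.shape, val.shape, test.shape)
combined = np.vstack((train, val))
# mu and s were computed on train+val, so the combined stats should be ~0 and ~1.
print(np.allclose(combined.mean(axis=0), 0.0, atol=1e-6),
      np.allclose(combined.std(axis=0), 1.0))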
Example #6
def download_data():
    data_root = io.get_data_root()
    if not os.path.isdir(os.path.join(data_root, 'data')):
        query = (
            "> UCI, BSDS300, and MNIST data not found.\n"
            "> The zipped download is 817MB in size, and 1.6GB once unzipped.\n"
            "> The download includes CIFAR-10, although it is not used in the paper.\n"
            "> After extraction, this script will delete the zipped download.\n"
            "> You will also be able to specify whether to delete CIFAR-10.\n"
            "> Do you wish to download the data? [Y/n]")
        response = input(query)
        # Accept 'Y', 'y', or just Enter as consent, matching the [Y/n] prompt.
        if response.lower() in ['y', '']:
            download_and_extract(data_root)
        elif response.lower() == 'n':
            sys.exit()
        else:
            print('Response not understood.')
            sys.exit()
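download_and_extract is referenced but not shown here; below is a hypothetical standard-library sketch of such a helper. The URL and archive name are placeholders, not the project's actual download source, and the real helper also handles the optional CIFAR-10 deletion mentioned in the prompt.

import os
import tarfile
import urllib.request

DATA_URL = 'https://example.org/data.tar.gz'   # placeholder, not the real URL

def download_and_extract(data_root):
    archive = os.path.join(data_root, 'data.tar.gz')
    urllib.request.urlretrieve(DATA_URL, archive)   # fetch the archive
    with tarfile.open(archive) as tar:
        tar.extractall(data_root)                   # creates <data_root>/data
    os.remove(archive)                              # delete the download afterwards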
Example #7
def load_gas():
    def load_data(file):
        try:
            data = pd.read_pickle(file)
        except FileNotFoundError:
            download_data()
            data = pd.read_pickle(file)
        data.drop("Meth", axis=1, inplace=True)
        data.drop("Eth", axis=1, inplace=True)
        data.drop("Time", axis=1, inplace=True)
        return data

    def get_correlation_numbers(data):
        # For each column, count how many columns (itself included) are
        # correlated with it above 0.98; a count above 1 marks a near-duplicate.
        C = data.corr()
        A = C > 0.98
        B = A.sum(axis=1)
        return B

    def load_data_and_clean(file):
        data = load_data(file)
        B = get_correlation_numbers(data)

        # Drop one redundant column at a time until no near-duplicates remain.
        while np.any(B > 1):
            col_to_remove = np.where(B > 1)[0][0]
            col_name = data.columns[col_to_remove]
            data.drop(col_name, axis=1, inplace=True)
            B = get_correlation_numbers(data)
        data = (data - data.mean()) / data.std()

        return data.values

    def load_data_and_clean_and_split(file):
        data = load_data_and_clean(file)
        N_test = int(0.1 * data.shape[0])
        data_test = data[-N_test:]
        data_train = data[0:-N_test]
        N_validate = int(0.1 * data_train.shape[0])
        data_validate = data_train[-N_validate:]
        data_train = data_train[0:-N_validate]

        return data_train, data_validate, data_test

    file = os.path.join(io.get_data_root(), 'data', 'gas/ethylene_CO.pickle')
    return load_data_and_clean_and_split(file)
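The pruning loop in load_data_and_clean can be hard to parse at a glance; here is a hedged toy illustration (synthetic data, not the gas sensor set) of the same rule, where each column's count includes itself, so a count above 1 flags a near-duplicate:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
a = rng.normal(size=1000)
df = pd.DataFrame({'a': a,
                   'b': 2.0 * a + 1e-3 * rng.normal(size=1000),
                   'c': rng.normal(size=1000)})
counts = (df.corr() > 0.98).sum(axis=1)
print(counts.tolist())        # [2, 2, 1]: 'a' and 'b' are near-duplicates
while np.any(counts > 1):
    df = df.drop(columns=df.columns[np.where(counts > 1)[0][0]])
    counts = (df.corr() > 0.98).sum(axis=1)
print(df.columns.tolist())    # ['b', 'c']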
Example #8
def load_hepmass():
    def load_data(path):
        train_csv = os.path.join(path, "1000_train.csv")
        test_csv = os.path.join(path, "1000_test.csv")
        try:
            data_train = pd.read_csv(train_csv, index_col=False)
            data_test = pd.read_csv(test_csv, index_col=False)
        except FileNotFoundError:
            download_data()
            data_train = pd.read_csv(train_csv, index_col=False)
            data_test = pd.read_csv(test_csv, index_col=False)
        return data_train, data_test

    def load_data_no_discrete(path):
        """
        Loads the positive class examples from the first 10 percent of the dataset.
        """
        data_train, data_test = load_data(path)

        # Gets rid of any background noise examples i.e. class label 0.
        data_train = data_train[data_train[data_train.columns[0]] == 1]
        data_train = data_train.drop(data_train.columns[0], axis=1)
        data_test = data_test[data_test[data_test.columns[0]] == 1]
        data_test = data_test.drop(data_test.columns[0], axis=1)
        # The test file carries an extra trailing column, so drop it.
        data_test = data_test.drop(data_test.columns[-1], axis=1)

        return data_train, data_test

    def load_data_no_discrete_normalised(path):

        data_train, data_test = load_data_no_discrete(path)
        mu = data_train.mean()
        s = data_train.std()
        data_train = (data_train - mu) / s
        data_test = (data_test - mu) / s

        return data_train, data_test

    def load_data_no_discrete_normalised_as_array(path):

        data_train, data_test = load_data_no_discrete_normalised(path)
        data_train, data_test = data_train.values, data_test.values

        # Remove any features that have too many re-occurring real values.
        # NOTE: as written, max_count is the count of the smallest feature
        # value (first key after sorting), not the true maximum count.
        features_to_remove = []
        for i, feature in enumerate(data_train.T):
            c = Counter(feature)
            max_count = np.array([v for k, v in sorted(c.items())])[0]
            if max_count > 5:
                features_to_remove.append(i)
        data_train = data_train[:, np.array(
            [i for i in range(data_train.shape[1]) if i not in features_to_remove])]
        data_test = data_test[:, np.array(
            [i for i in range(data_test.shape[1]) if i not in features_to_remove])]

        N = data_train.shape[0]
        N_validate = int(N * 0.1)
        data_validate = data_train[-N_validate:]
        data_train = data_train[0:-N_validate]

        return data_train, data_validate, data_test

    path = os.path.join(io.get_data_root(), 'data', 'hepmass')
    return load_data_no_discrete_normalised_as_array(path)
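A hedged usage sketch, assuming the HEPMASS csv files are already under data/hepmass (otherwise load_hepmass above triggers the download prompt):

train, val, test = load_hepmass()
for name, split in (('train', train), ('val', val), ('test', test)):
    print(name, split.shape, split.dtype)
# The validation split is the last 10% of the normalised training rows, and
# all three splits use the feature set left after the pruning step above.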