import os
from collections import Counter

import h5py
import numpy as np
import pandas as pd
import torch
from scipy.io import loadmat
from skimage import color, io, transform

import utils


def _create_data(self):
    root = utils.get_data_root()
    path = os.path.join(root, 'faces', self.name + '.jpg')
    try:
        image = io.imread(path)
    except FileNotFoundError:
        raise RuntimeError('Unknown face name: {}'.format(self.name))
    image = color.rgb2gray(image)
    self.image = transform.resize(image, [512, 512])

    # Enumerate all (row, column) pixel coordinates of the resized image.
    grid = np.array([
        (x, y)
        for x in range(self.image.shape[0])
        for y in range(self.image.shape[1])
    ])

    # Rotation applied to the (row, column) coordinates; the shift below maps
    # the second coordinate back into [0, 1].
    rotation_matrix = np.array([
        [0, -1],
        [1, 0]
    ])

    # Sample pixel locations with probability proportional to pixel intensity.
    p = self.image.reshape(-1) / np.sum(self.image.reshape(-1))
    ix = np.random.choice(range(len(grid)), size=self.num_points, replace=True, p=p)
    points = grid[ix].astype(np.float32)
    points += np.random.rand(self.num_points, 2)  # dequantize
    points /= self.image.shape[0]  # scale to [0, 1]
    # assert 0 <= min(points) <= max(points) <= 1

    self.data = torch.tensor(points @ rotation_matrix).float()
    self.data[:, 1] += 1
def __init__(self, split='train', frac=None):
    path = os.path.join(utils.get_data_root(), 'power', '{}.npy'.format(split))
    self.data = np.load(path).astype(np.float32)
    self.n, self.dim = self.data.shape
    if frac is not None:
        self.n = int(frac * self.n)
def save_splits():
    train, val, test = load_power()
    splits = (('train', train), ('val', val), ('test', test))
    for name, data in splits:
        file = os.path.join(utils.get_data_root(), 'power', '{}.npy'.format(name))
        np.save(file, data)
def __init__(self, split='train', transform=None):
    self.transform = transform
    path = os.path.join(utils.get_data_root(), 'omniglot', 'omniglot.mat')
    rawdata = loadmat(path)
    if split == 'train':
        self.data = rawdata['data'].T.reshape(-1, 28, 28)
        self.targets = rawdata['target'].T
    elif split == 'test':
        self.data = rawdata['testdata'].T.reshape(-1, 28, 28)
        self.targets = rawdata['testtarget'].T
    else:
        raise ValueError('Unknown split: {}'.format(split))
def load_miniboone():
    def load_data(path):
        # NOTE: To remember how the pre-processing was done.
        # data_ = pd.read_csv(root_path, names=[str(x) for x in range(50)],
        #                     delim_whitespace=True)
        # print(data_.head())
        # data_ = data_.as_matrix()
        # # Remove some random outliers
        # indices = (data_[:, 0] < -100)
        # data_ = data_[~indices]
        #
        # i = 0
        # # Remove any features that have too many re-occurring real values.
        # features_to_remove = []
        # for feature in data_.T:
        #     c = Counter(feature)
        #     max_count = np.array([v for k, v in sorted(c.iteritems())])[0]
        #     if max_count > 5:
        #         features_to_remove.append(i)
        #     i += 1
        # data_ = data_[:, np.array([i for i in range(data_.shape[1])
        #                            if i not in features_to_remove])]
        # np.save("~/data_/miniboone/data_.npy", data_)

        data = np.load(path)
        N_test = int(0.1 * data.shape[0])
        data_test = data[-N_test:]
        data = data[0:-N_test]
        N_validate = int(0.1 * data.shape[0])
        data_validate = data[-N_validate:]
        data_train = data[0:-N_validate]

        return data_train, data_validate, data_test

    def load_data_normalised(path):
        data_train, data_validate, data_test = load_data(path)
        # Normalise all splits with statistics computed on train + validation.
        data = np.vstack((data_train, data_validate))
        mu = data.mean(axis=0)
        s = data.std(axis=0)
        data_train = (data_train - mu) / s
        data_validate = (data_validate - mu) / s
        data_test = (data_test - mu) / s

        return data_train, data_validate, data_test

    return load_data_normalised(
        path=os.path.join(utils.get_data_root(), 'miniboone', 'data.npy'))
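# A minimal sanity-check sketch: load_miniboone() normalises every split with the
# mean and std of the stacked train + validation data, so that stacked array should
# come out with (approximately) zero mean and unit std per feature. The helper name
# and the loose tolerance (to allow for floating-point accumulation) are illustrative.
def _check_miniboone_normalisation():
    data_train, data_validate, _ = load_miniboone()
    stacked = np.vstack((data_train, data_validate))
    assert np.allclose(stacked.mean(axis=0), 0.0, atol=1e-3)
    assert np.allclose(stacked.std(axis=0), 1.0, atol=1e-3)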
def load_gas():
    def load_data(file):
        data = pd.read_pickle(file)
        data.drop("Meth", axis=1, inplace=True)
        data.drop("Eth", axis=1, inplace=True)
        data.drop("Time", axis=1, inplace=True)
        return data

    def get_correlation_numbers(data):
        # For each column, count how many columns it is highly correlated with
        # (including itself).
        C = data.corr()
        A = C > 0.98
        B = A.sum(axis=1)
        return B

    def load_data_and_clean(file):
        data = load_data(file)
        B = get_correlation_numbers(data)
        # Iteratively drop columns until no pair of distinct columns has
        # correlation above 0.98, then standardise.
        while np.any(B > 1):
            col_to_remove = np.where(B > 1)[0][0]
            col_name = data.columns[col_to_remove]
            data.drop(col_name, axis=1, inplace=True)
            B = get_correlation_numbers(data)
        data = (data - data.mean()) / data.std()
        return data.values

    def load_data_and_clean_and_split(file):
        data = load_data_and_clean(file)
        N_test = int(0.1 * data.shape[0])
        data_test = data[-N_test:]
        data_train = data[0:-N_test]
        N_validate = int(0.1 * data_train.shape[0])
        data_validate = data_train[-N_validate:]
        data_train = data_train[0:-N_validate]
        return data_train, data_validate, data_test

    return load_data_and_clean_and_split(
        file=os.path.join(utils.get_data_root(), 'gas', 'ethylene_CO.pickle'))
def load_data():
    file = os.path.join(utils.get_data_root(), 'power', 'data.npy')
    return np.load(file)
def load_hepmass():
    def load_data(path):
        data_train = pd.read_csv(filepath_or_buffer=os.path.join(path, '1000_train.csv'),
                                 index_col=False)
        data_test = pd.read_csv(filepath_or_buffer=os.path.join(path, '1000_test.csv'),
                                index_col=False)
        return data_train, data_test

    def load_data_no_discrete(path):
        """Loads the positive class examples from the first 10% of the dataset."""
        data_train, data_test = load_data(path)

        # Get rid of any background noise examples, i.e. class label 0.
        data_train = data_train[data_train[data_train.columns[0]] == 1]
        data_train = data_train.drop(data_train.columns[0], axis=1)
        data_test = data_test[data_test[data_test.columns[0]] == 1]
        data_test = data_test.drop(data_test.columns[0], axis=1)
        # The raw test file is malformed, so drop its last column.
        data_test = data_test.drop(data_test.columns[-1], axis=1)

        return data_train, data_test

    def load_data_no_discrete_normalised(path):
        data_train, data_test = load_data_no_discrete(path)
        mu = data_train.mean()
        s = data_train.std()
        data_train = (data_train - mu) / s
        data_test = (data_test - mu) / s
        return data_train, data_test

    def load_data_no_discrete_normalised_as_array(path):
        data_train, data_test = load_data_no_discrete_normalised(path)
        data_train, data_test = data_train.values, data_test.values

        # Remove any features that have too many re-occurring real values.
        features_to_remove = []
        for i, feature in enumerate(data_train.T):
            c = Counter(feature)
            max_count = np.array([v for k, v in sorted(c.items())])[0]
            if max_count > 5:
                features_to_remove.append(i)
        data_train = data_train[:, np.array(
            [i for i in range(data_train.shape[1]) if i not in features_to_remove])]
        data_test = data_test[:, np.array(
            [i for i in range(data_test.shape[1]) if i not in features_to_remove])]

        N = data_train.shape[0]
        N_validate = int(N * 0.1)
        data_validate = data_train[-N_validate:]
        data_train = data_train[0:-N_validate]

        return data_train, data_validate, data_test

    return load_data_no_discrete_normalised_as_array(
        path=os.path.join(utils.get_data_root(), 'hepmass'))
def load_bsds300():
    path = os.path.join(utils.get_data_root(), 'bsds300', 'bsds300.hdf5')
    file = h5py.File(path, 'r')
    # The returned objects are h5py datasets; data is read from the file on access.
    return file['train'], file['validation'], file['test']
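# A minimal usage sketch, assuming the data files referenced above are present
# under utils.get_data_root(); it loads each dataset defined in this module and
# prints the split (or array) shapes as a quick smoke test.
if __name__ == '__main__':
    train, val, test = load_miniboone()
    print('miniboone:', train.shape, val.shape, test.shape)
    train, val, test = load_gas()
    print('gas:', train.shape, val.shape, test.shape)
    train, val, test = load_hepmass()
    print('hepmass:', train.shape, val.shape, test.shape)
    train, val, test = load_bsds300()
    print('bsds300:', train.shape, val.shape, test.shape)
    print('power (unsplit):', load_data().shape)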