def get_dataset(self, **kwargs):
    """Load the dualALOO ``N=300, D=200`` feature/label files from ``../data``.

    The feature matrix is passed through ``utils.whiten_data``; columns
    whose variance is exactly zero afterwards are dropped, and a column of
    ones is appended as the last feature.

    Args:
        **kwargs: Accepted and ignored.

    Returns:
        A ``(Dataset, None)`` pair. The ``Dataset`` receives the augmented
        feature matrix, the labels, the column count without the ones
        column, the row count, and ``classification=True``.
    """
    data_dir = '../data'
    labels = np.loadtxt(os.path.join(data_dir, 'dualALOO-N=300-D=200_Y.txt'))
    features = utils.whiten_data(
        np.loadtxt(os.path.join(data_dir, 'dualALOO-N=300-D=200_X.txt')))

    # NOTE(review): the zero-variance filter runs on the whitened matrix;
    # confirm utils.whiten_data copes with constant input columns.
    informative = np.var(features, axis=0) != 0
    features = features[:, informative]

    # The trailing column of ones is not counted in the size passed below.
    n_rows = features.shape[0]
    features = np.concatenate((features, np.ones((n_rows, 1))), axis=1)

    dataset = Dataset(features, labels, features.shape[1] - 1, n_rows,
                      classification=True)
    return dataset, None
def get_dataset(self, **kwargs):
    """Load the E2006-tfidf text files from ``../data/E2006-tfidf``.

    Reads ``X-N=full-D=last60k.txt`` and ``Y-N=full.txt``, passes the
    features through ``utils.whiten_data`` and appends a column of ones as
    the last feature.

    Args:
        **kwargs: Accepted and ignored.

    Returns:
        A ``(Dataset, None)`` pair. The ``Dataset`` receives the augmented
        feature matrix, the labels, the column count without the ones
        column, the row count, and ``classification=True``.
    """
    data_dir = '../data/E2006-tfidf'
    # A disabled alternative read 'X' and 'Y' from
    # 'X-N=full-D=last60k.h5' via h5py instead of the text files below.
    features = np.loadtxt(os.path.join(data_dir, 'X-N=full-D=last60k.txt'))
    labels = np.loadtxt(os.path.join(data_dir, 'Y-N=full.txt'))

    features = utils.whiten_data(features)

    # The trailing column of ones is not counted in the size passed below.
    n_rows = features.shape[0]
    features = np.concatenate((features, np.ones((n_rows, 1))), axis=1)

    dataset = Dataset(features, labels, features.shape[1] - 1, n_rows,
                      classification=True)
    return dataset, None
def get_dataset(self, small=False, seed=1234, **kwargs):
    """Load ``X``/``Y`` text files from ``self.filepath``.

    Zero-variance columns are removed first, then the remaining features
    are passed through ``utils.whiten_data`` and a column of ones is
    appended as the last feature.

    Args:
        small: When this is exactly ``True`` the ``X_5k_1234.txt`` /
            ``Y_5k_1234.txt`` pair is read; otherwise ``X.txt`` / ``Y.txt``.
        seed: Accepted but not used by this loader.
        **kwargs: Accepted and ignored.

    Returns:
        A ``(Dataset, None)`` pair. The ``Dataset`` receives the augmented
        feature matrix, the labels, the column count without the ones
        column, the row count, and ``classification=False``.
    """
    # Identity comparison on purpose: only the literal True selects the
    # small files.
    if small is True:
        x_name, y_name = 'X_5k_1234.txt', 'Y_5k_1234.txt'
    else:
        x_name, y_name = 'X.txt', 'Y.txt'

    labels = np.loadtxt(os.path.join(self.filepath, y_name))
    features = np.loadtxt(os.path.join(self.filepath, x_name))

    # Drop constant columns before whitening.
    varying_cols = np.flatnonzero(np.var(features, axis=0) != 0)
    features = utils.whiten_data(features[:, varying_cols])

    # The trailing column of ones is not counted in the size passed below.
    n_rows = features.shape[0]
    features = np.concatenate((features, np.ones((n_rows, 1))), axis=1)

    dataset = Dataset(features, labels, features.shape[1] - 1, n_rows,
                      classification=False)
    return dataset, None
def get_dataset(self, filepath=None, **kwargs):
    """Load the BlogFeedback ``N=20000`` text files.

    The features are passed through ``utils.whiten_data``, columns whose
    variance is exactly zero afterwards are dropped, and a column of ones
    is appended as the last feature. Rows whose label is greater than 1000
    are then removed from both the features and the labels.

    Args:
        filepath: Directory holding ``X-N=20000-D=20280.txt`` and
            ``Y-N=20000.txt``. ``None`` selects ``../data/BlogFeedback``.
        **kwargs: Accepted and ignored.

    Returns:
        A ``(Dataset, None)`` pair. The ``Dataset`` receives the filtered
        feature matrix, the filtered labels, the column count without the
        ones column, the row count, and ``classification=True``.
    """
    data_dir = '../data/BlogFeedback' if filepath is None else filepath
    targets = np.loadtxt(os.path.join(data_dir, 'Y-N=20000.txt'))
    design = utils.whiten_data(
        np.loadtxt(os.path.join(data_dir, 'X-N=20000-D=20280.txt')))

    # NOTE(review): the zero-variance filter runs on the whitened matrix;
    # confirm utils.whiten_data copes with constant input columns.
    design = design[:, np.var(design, axis=0) != 0]
    design = np.concatenate((design, np.ones((design.shape[0], 1))), axis=1)

    # Keep every row index that is not flagged as having a label > 1000.
    row_ids = np.arange(design.shape[0])
    flagged = np.where(targets > 1000)[0]
    kept_rows = row_ids[~np.isin(row_ids, flagged)]
    design = design[kept_rows]
    targets = targets[kept_rows]

    dataset = Dataset(design, targets, design.shape[1] - 1, design.shape[0],
                      classification=True)
    return dataset, None
def get_dataset(self, small=False, smallNsmallDDataset=False, smallD=False, Ntrain=None, seed=1234, **kwargs):
    """Load the cleaned feature/label text files from ``self.filepath``.

    The features are passed through ``utils.whiten_data`` and a column of
    ones is appended as the last feature.

    Args:
        small: Accepted but not used by this loader.
        smallNsmallDDataset: Accepted but not used by this loader.
        smallD: When truthy, features come from ``X_smallD.txt``;
            otherwise from ``X_clean.txt``. Labels always come from
            ``Y_clean.txt``.
        Ntrain: Accepted but not used by this loader.
        seed: Accepted but not used by this loader.
        **kwargs: Accepted and ignored.

    Returns:
        A ``(Dataset, None)`` pair. The ``Dataset`` receives the augmented
        feature matrix, the labels, the column count without the ones
        column, the row count, and ``classification=True``.
    """
    x_name = 'X_smallD.txt' if smallD else 'X_clean.txt'
    x_path = os.path.join(self.filepath, x_name)
    y_path = os.path.join(self.filepath, 'Y_clean.txt')

    labels = np.loadtxt(y_path)
    features = utils.whiten_data(np.loadtxt(x_path))

    # The trailing column of ones is not counted in the size passed below.
    n_rows = features.shape[0]
    features = np.concatenate((features, np.ones((n_rows, 1))), axis=1)

    dataset = Dataset(features, labels, features.shape[1] - 1, n_rows,
                      classification=True)
    return dataset, None
def get_dataset(self, Ntrain=16000, fpath='../data/bikeshare/hour.csv', seed=None):
    """Parse a comma-separated file into shuffled train and test ``Dataset``s.

    The first line of the file is skipped. Every following line yields one
    12-element feature vector and one integer target (the last field).
    Rows are shuffled with a fixed NumPy seed, the features are passed
    through ``utils.whiten_data``, and the result is split at ``Ntrain``.

    Args:
        Ntrain: Number of leading (shuffled) rows that form the training
            set; the remaining rows form the test set.
        fpath: Path of the CSV file to read.
        seed: Accepted but not used; the permutation is always drawn after
            ``np.random.seed(1234)``.

    Returns:
        A ``(train, test)`` tuple of ``Dataset`` objects, both created with
        ``D=12`` and ``classification=False``.

    Raises:
        ValueError: If a field used as a number cannot be parsed.
        IndexError: If a line has fewer fields than the parser indexes.
    """
    X = []
    Y = []
    # The context manager closes the file even when a row fails to parse;
    # the previous explicit close() was skipped on any exception.
    with open(fpath, 'r') as f:
        f.readline()  # skip the header line
        for line in f:
            split = line.split(',')
            Y.append(int(split[-1]))
            x = np.zeros(12)
            # NOTE(review): field meanings are not visible here; presumably
            # this follows the bikeshare hour.csv column order -- confirm.
            x[0] = split[6]
            x[1] = split[8]
            # One-hot slot chosen by field 9 (slots 2..5 for values 1..4).
            x[1 + int(split[9])] = 1
            x[6:] = [float(val) for val in split[10:16]]
            X.append(x)

    # Fixed seed keeps the shuffle (and so the split) reproducible.
    np.random.seed(1234)
    X = np.array(X)
    Y = np.array(Y)
    perm = np.random.permutation(X.shape[0])
    X = X[perm, :]
    X = utils.whiten_data(X)
    Y = Y[perm]
    return (Dataset(X[:Ntrain], Y[:Ntrain], D=12, N=Ntrain,
                    classification=False),
            Dataset(X[Ntrain:], Y[Ntrain:], D=12, N=len(X) - Ntrain,
                    classification=False))
def get_dataset(self, small=False, smallNsmallDDataset=False, Ntrain=None, seed=1234, **kwargs):
    """Load feature/label text files from ``self.filepath``, optionally subsampled.

    When ``Ntrain`` is neither ``None`` nor ``0``, every row labelled ``1``
    is kept and ``Ntrain - (number of such rows)`` rows labelled ``-1`` are
    drawn without replacement after seeding NumPy's global RNG with
    ``seed``. The features are then passed through ``utils.whiten_data``
    and a column of ones is appended as the last feature.

    Args:
        small: When exactly ``True``, read the ``parsed*_small.txt`` pair.
        smallNsmallDDataset: When exactly ``True`` (and ``small`` is not),
            read the ``parsed*_smallNsmallD.txt`` pair. Otherwise the
            ``X-N=8000-D=5408.txt`` / ``Y-N=8000.txt`` pair is read.
        Ntrain: Target number of rows after subsampling; ``None`` or ``0``
            disables subsampling.
        seed: Seed passed to ``np.random.seed`` before subsampling.
        **kwargs: Accepted and ignored.

    Returns:
        A ``(Dataset, None)`` pair. The ``Dataset`` receives the augmented
        feature matrix, the labels, the column count without the ones
        column, the row count, and ``classification=True``.
    """
    # Identity comparisons on purpose: only the literal True selects a
    # reduced file pair.
    if small is True:
        x_name, y_name = 'parsedX_small.txt', 'parsedY_small.txt'
    elif smallNsmallDDataset is True:
        x_name, y_name = 'parsedX_smallNsmallD.txt', 'parsedY_smallNsmallD.txt'
    else:
        x_name, y_name = 'X-N=8000-D=5408.txt', 'Y-N=8000.txt'

    labels = np.loadtxt(os.path.join(self.filepath, y_name))
    features = np.loadtxt(os.path.join(self.filepath, x_name))

    if Ntrain is not None and Ntrain != 0:
        np.random.seed(seed)
        pos_rows = np.where(labels == 1)[0]
        neg_rows = np.where(labels == -1)[0]
        n_neg_needed = Ntrain - pos_rows.shape[0]
        neg_sample = np.random.choice(neg_rows, n_neg_needed, replace=False)
        # Positives first, then the sampled negatives.
        chosen = np.concatenate((pos_rows, neg_sample), axis=0)
        features = features[chosen, :]
        labels = labels[chosen]

    features = utils.whiten_data(features)

    # The trailing column of ones is not counted in the size passed below.
    n_rows = features.shape[0]
    features = np.concatenate((features, np.ones((n_rows, 1))), axis=1)

    dataset = Dataset(features, labels, features.shape[1] - 1, n_rows,
                      classification=True)
    return dataset, None