Ejemplo n.º 1
0
    def get_dataset(self, **kwargs):
        """Load the dualALOO (N=300, D=200) data set from ``../data``.

        The feature matrix is passed through ``utils.whiten_data``, columns
        whose variance is exactly zero are dropped, and a trailing column of
        ones is appended.

        Args:
            **kwargs: Accepted and ignored.

        Returns:
            A ``(Dataset, None)`` tuple. The ``Dataset`` is built from the
            processed features, the labels, the feature count excluding the
            appended ones column, and the row count, with
            ``classification=True``.
        """
        data_dir = '../data'
        features_file = os.path.join(data_dir, 'dualALOO-N=300-D=200_X.txt')
        labels_file = os.path.join(data_dir, 'dualALOO-N=300-D=200_Y.txt')

        Y = np.loadtxt(labels_file)
        X = utils.whiten_data(np.loadtxt(features_file))

        # NOTE(review): the zero-variance filter runs *after* whitening here;
        # whether constant columns survive whitening as exact zeros depends on
        # utils.whiten_data -- confirm this ordering is intended.
        keep_cols = np.flatnonzero(~(np.var(X, axis=0) == 0))
        X = X[:, keep_cols]

        # Trailing constant column (not counted in the reported dimension).
        ones_col = np.ones((X.shape[0], 1))
        X = np.concatenate((X, ones_col), axis=1)

        dataset = Dataset(X, Y, X.shape[1] - 1, X.shape[0],
                          classification=True)
        return dataset, None
Ejemplo n.º 2
0
    def get_dataset(self, **kwargs):
        """Load the E2006-tfidf text files from ``../data/E2006-tfidf``.

        Features are passed through ``utils.whiten_data`` and a trailing
        column of ones is appended.

        Args:
            **kwargs: Accepted and ignored.

        Returns:
            A ``(Dataset, None)`` tuple. The ``Dataset`` is built from the
            processed features, the labels, the feature count excluding the
            appended ones column, and the row count, with
            ``classification=True``.
        """
        # An HDF5 variant ('X-N=full-D=last60k.h5', datasets 'X' and 'Y')
        # was previously read via h5py; the plain-text files are used now.
        root = '../data/E2006-tfidf'
        features_file = os.path.join(root, 'X-N=full-D=last60k.txt')
        labels_file = os.path.join(root, 'Y-N=full.txt')

        X = utils.whiten_data(np.loadtxt(features_file))
        Y = np.loadtxt(labels_file)

        # Trailing constant column (not counted in the reported dimension).
        ones_col = np.ones((X.shape[0], 1))
        X = np.concatenate((X, ones_col), axis=1)

        dataset = Dataset(X, Y, X.shape[1] - 1, X.shape[0],
                          classification=True)
        return dataset, None
Ejemplo n.º 3
0
    def get_dataset(self, small=False, seed=1234, **kwargs):
        """Load features/labels from ``self.filepath`` for a regression task.

        Columns with exactly zero variance are removed first, the remaining
        features are passed through ``utils.whiten_data``, and a trailing
        column of ones is appended.

        Args:
            small: When this is exactly ``True`` the ``*_5k_1234.txt`` files
                are read; any other value selects ``X.txt`` / ``Y.txt``.
            seed: Unused by this method.
            **kwargs: Accepted and ignored.

        Returns:
            A ``(Dataset, None)`` tuple. The ``Dataset`` is built from the
            processed features, the labels, the feature count excluding the
            appended ones column, and the row count, with
            ``classification=False``.
        """
        # Identity comparison on purpose: only the literal True selects the
        # small files, truthy non-bool values do not.
        if small is True:
            x_name, y_name = 'X_5k_1234.txt', 'Y_5k_1234.txt'
        else:
            x_name, y_name = 'X.txt', 'Y.txt'
        features_file = os.path.join(self.filepath, x_name)
        labels_file = os.path.join(self.filepath, y_name)

        Y = np.loadtxt(labels_file)
        X = np.loadtxt(features_file)

        # Drop constant columns before whitening.
        keep_cols = np.flatnonzero(~(np.var(X, axis=0) == 0))
        X = utils.whiten_data(X[:, keep_cols])

        # Trailing constant column (not counted in the reported dimension).
        ones_col = np.ones((X.shape[0], 1))
        X = np.concatenate((X, ones_col), axis=1)

        dataset = Dataset(X, Y, X.shape[1] - 1, X.shape[0],
                          classification=False)
        return dataset, None
Ejemplo n.º 4
0
    def get_dataset(self, filepath=None, **kwargs):
        """Load the BlogFeedback text files and drop rows with large labels.

        Features are passed through ``utils.whiten_data``, columns with
        exactly zero variance are removed, a trailing column of ones is
        appended, and finally every row whose label exceeds 1000 is discarded.

        Args:
            filepath: Directory containing the data files. Defaults to
                ``'../data/BlogFeedback'`` when ``None``.
            **kwargs: Accepted and ignored.

        Returns:
            A ``(Dataset, None)`` tuple. The ``Dataset`` is built from the
            processed features, the labels, the feature count excluding the
            appended ones column, and the remaining row count, with
            ``classification=True``.
        """
        if filepath is None:
            filepath = '../data/BlogFeedback'
        features_file = os.path.join(filepath, 'X-N=20000-D=20280.txt')
        labels_file = os.path.join(filepath, 'Y-N=20000.txt')

        Y = np.loadtxt(labels_file)
        X = utils.whiten_data(np.loadtxt(features_file))

        # NOTE(review): the zero-variance filter runs *after* whitening here;
        # whether constant columns survive whitening as exact zeros depends on
        # utils.whiten_data -- confirm this ordering is intended.
        keep_cols = np.flatnonzero(~(np.var(X, axis=0) == 0))
        X = X[:, keep_cols]

        # Trailing constant column (not counted in the reported dimension).
        ones_col = np.ones((X.shape[0], 1))
        X = np.concatenate((X, ones_col), axis=1)

        # Keep every row that is not flagged by "label > 1000".
        keep_rows = np.flatnonzero(~(Y > 1000))
        X, Y = X[keep_rows], Y[keep_rows]

        dataset = Dataset(X, Y, X.shape[1] - 1, X.shape[0],
                          classification=True)
        return dataset, None
Ejemplo n.º 5
0
    def get_dataset(self,
                    small=False,
                    smallNsmallDDataset=False,
                    smallD=False,
                    Ntrain=None,
                    seed=1234,
                    **kwargs):
        """Load the cleaned feature/label files found under ``self.filepath``.

        Features are passed through ``utils.whiten_data`` and a trailing
        column of ones is appended.

        Args:
            small: Unused by this method.
            smallNsmallDDataset: Unused by this method.
            smallD: When truthy, read ``X_smallD.txt`` instead of
                ``X_clean.txt``. Labels always come from ``Y_clean.txt``.
            Ntrain: Unused by this method.
            seed: Unused by this method.
            **kwargs: Accepted and ignored.

        Returns:
            A ``(Dataset, None)`` tuple. The ``Dataset`` is built from the
            processed features, the labels, the feature count excluding the
            appended ones column, and the row count, with
            ``classification=True``.
        """
        x_name = 'X_smallD.txt' if smallD else 'X_clean.txt'
        features_file = os.path.join(self.filepath, x_name)
        labels_file = os.path.join(self.filepath, 'Y_clean.txt')

        Y = np.loadtxt(labels_file)
        X = utils.whiten_data(np.loadtxt(features_file))

        # Trailing constant column (not counted in the reported dimension).
        ones_col = np.ones((X.shape[0], 1))
        X = np.concatenate((X, ones_col), axis=1)

        dataset = Dataset(X, Y, X.shape[1] - 1, X.shape[0],
                          classification=True)
        return dataset, None
Ejemplo n.º 6
0
    def get_dataset(self,
                    Ntrain=16000,
                    fpath='../data/bikeshare/hour.csv',
                    seed=None):
        """Parse a comma-separated file into a shuffled train/test split.

        The first line of the file is treated as a header and skipped. Each
        remaining non-blank line yields one 12-element feature vector and one
        integer target taken from the last field. Rows are shuffled with a
        seeded permutation, the features are passed through
        ``utils.whiten_data``, and the first ``Ntrain`` rows become the
        training set while the rest become the test set.

        Args:
            Ntrain: Number of leading (post-shuffle) rows placed in the
                training set.
            fpath: Path of the CSV file to read.
            seed: Seed for NumPy's global RNG. ``None`` keeps the historical
                fixed seed of 1234; previously this argument was ignored.

        Returns:
            A ``(train, test)`` tuple of ``Dataset`` objects, both created
            with ``D=12`` and ``classification=False``. ``N`` is the actual
            number of rows in each split.

        Raises:
            ValueError: If a field that must be numeric cannot be parsed.
            IndexError: If a row has too few fields.
        """
        num_features = 12
        features = []
        targets = []

        # The context manager closes the file even if a row fails to parse.
        with open(fpath, 'r') as f:
            f.readline()  # discard the header row
            for line in f:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                split = line.split(',')
                targets.append(int(split[-1]))
                x = np.zeros(num_features)
                x[0] = float(split[6])
                x[1] = float(split[8])
                # Indicator slot chosen by the integer in field 9.
                # NOTE(review): presumably that field is in 1..4 so slots
                # 2..5 are used -- confirm against the data file.
                x[1 + int(split[9])] = 1
                x[6:] = [float(val) for val in split[10:16]]
                features.append(x)

        # Honour the caller's seed; fall back to the original fixed value.
        np.random.seed(1234 if seed is None else seed)
        # reshape keeps the matrix 2-D even when the file has no data rows.
        X = np.array(features, dtype=float).reshape(-1, num_features)
        Y = np.array(targets)
        perm = np.random.permutation(X.shape[0])
        X = utils.whiten_data(X[perm, :])
        Y = Y[perm]

        X_train, Y_train = X[:Ntrain], Y[:Ntrain]
        X_test, Y_test = X[Ntrain:], Y[Ntrain:]
        # Report the real split sizes so N stays correct (and non-negative)
        # when Ntrain exceeds the number of rows in the file.
        return (Dataset(X_train,
                        Y_train,
                        D=num_features,
                        N=X_train.shape[0],
                        classification=False),
                Dataset(X_test,
                        Y_test,
                        D=num_features,
                        N=X_test.shape[0],
                        classification=False))
Ejemplo n.º 7
0
    def get_dataset(self,
                    small=False,
                    smallNsmallDDataset=False,
                    Ntrain=None,
                    seed=1234,
                    **kwargs):
        """Load a +1/-1 labelled data set, optionally subsampling negatives.

        When ``Ntrain`` is given and non-zero, every row labelled ``1`` is
        kept and enough rows labelled ``-1`` are drawn without replacement to
        bring the total to ``Ntrain``. Features are then passed through
        ``utils.whiten_data`` and a trailing column of ones is appended.

        Args:
            small: When exactly ``True``, read the ``parsed*_small.txt``
                files.
            smallNsmallDDataset: When exactly ``True`` (and ``small`` is not),
                read the ``parsed*_smallNsmallD.txt`` files. Otherwise the
                ``X-N=8000-D=5408.txt`` / ``Y-N=8000.txt`` files are read.
            Ntrain: Target number of rows after subsampling; ``None`` or
                ``0`` disables subsampling.
            seed: Seed passed to ``np.random.seed`` before subsampling.
            **kwargs: Accepted and ignored.

        Returns:
            A ``(Dataset, None)`` tuple. The ``Dataset`` is built from the
            processed features, the labels, the feature count excluding the
            appended ones column, and the row count, with
            ``classification=True``.
        """
        # Identity comparisons on purpose: only the literal True selects a
        # variant, truthy non-bool values fall through to the default files.
        if small is True:
            x_name, y_name = 'parsedX_small.txt', 'parsedY_small.txt'
        elif smallNsmallDDataset is True:
            x_name = 'parsedX_smallNsmallD.txt'
            y_name = 'parsedY_smallNsmallD.txt'
        else:
            x_name, y_name = 'X-N=8000-D=5408.txt', 'Y-N=8000.txt'
        features_file = os.path.join(self.filepath, x_name)
        labels_file = os.path.join(self.filepath, y_name)

        Y = np.loadtxt(labels_file)
        X = np.loadtxt(features_file)

        skip_subsample = Ntrain is None or Ntrain == 0
        if not skip_subsample:
            np.random.seed(seed)
            pos_idx = np.flatnonzero(Y == 1)
            neg_idx = np.flatnonzero(Y == -1)
            # All positives are kept; negatives fill the remaining quota.
            n_neg = Ntrain - pos_idx.shape[0]
            neg_pick = np.random.choice(neg_idx, n_neg, replace=False)

            X = np.concatenate((X[pos_idx, :], X[neg_pick, :]), axis=0)
            Y = np.concatenate((Y[pos_idx], Y[neg_pick]), axis=0)

        X = utils.whiten_data(X)
        # Trailing constant column (not counted in the reported dimension).
        ones_col = np.ones((X.shape[0], 1))
        X = np.concatenate((X, ones_col), axis=1)

        dataset = Dataset(X, Y, X.shape[1] - 1, X.shape[0],
                          classification=True)
        return dataset, None