def __init__(self, which_set='train', n_datapoints=None, fname="mnist.pkl.gz", preproc=None):
    """Load the pickled MNIST dataset and select one split.

    :param which_set: 'train', 'valid', 'test', 'salakhutdinov_train'
        (train+valid merged) or 'salakhutdinov_valid' (merged, reversed).
    :param n_datapoints: optional cap on the number of datapoints,
        forwarded to self.prepare (None = use all).
    :param fname: dataset file; a ".gz" suffix selects gzip.open.
    :param preproc: list of preprocessors passed to the base class.
    :raises ValueError: on an unknown `which_set`.
    """
    # None instead of a mutable [] default: a shared default list would be
    # the same object across every instantiation.
    if preproc is None:
        preproc = []
    super(MNIST, self).__init__(preproc)
    _logger.info("Loading MNIST data")
    fname = datapath(fname)

    # Transparently support both gzipped and plain pickle files.
    open_func = gzip.open if fname.endswith(".gz") else open
    with open_func(fname) as f:
        (train_x, train_y), (valid_x, valid_y), (test_x, test_y) = pickle.load(f)

    if which_set == 'train':
        self.X, self.Y = self.prepare(train_x, train_y, n_datapoints)
    elif which_set == 'valid':
        self.X, self.Y = self.prepare(valid_x, valid_y, n_datapoints)
    elif which_set == 'test':
        self.X, self.Y = self.prepare(test_x, test_y, n_datapoints)
    elif which_set == 'salakhutdinov_train':
        # Salakhutdinov split: train and valid merged into one training set.
        train_x = np.concatenate([train_x, valid_x])
        train_y = np.concatenate([train_y, valid_y])
        self.X, self.Y = self.prepare(train_x, train_y, n_datapoints)
    elif which_set == 'salakhutdinov_valid':
        # Reversed so that taking the first n_datapoints draws from the tail.
        train_x = np.concatenate([train_x, valid_x])[::-1]
        train_y = np.concatenate([train_y, valid_y])[::-1]
        self.X, self.Y = self.prepare(train_x, train_y, n_datapoints)
    else:
        raise ValueError("Unknown dataset %s" % which_set)

    self.n_datapoints = self.X.shape[0]
def __init__(self, which_set='train', fname="chardata.mat", shuffle_seed=123, n_used_for_validation=1345, preproc=None):
    """Load the 28x28 Omniglot dataset from a .mat file and select a split.

    :param which_set: 'train', 'valid' (last `n_used_for_validation`
        shuffled training rows) or 'test'.
    :param fname: .mat file containing 'data' and 'testdata' matrices.
    :param shuffle_seed: seed for the deterministic train/valid shuffle.
    :param n_used_for_validation: number of rows held out for validation.
    :param preproc: list of preprocessors passed to the base class.
    :raises ValueError: on an unknown `which_set`.
    """
    # None instead of a mutable [] default to avoid sharing one list
    # across all instances.
    if preproc is None:
        preproc = []
    super(Omniglot, self).__init__(preproc)

    def reshape_data(data):
        # Round-trip through (28, 28) and flatten column-major.
        # 'F' is the documented order value; the previous 'fortran'
        # spelling only worked because old numpy matched its first
        # character, and is rejected by strict validation.
        return data.reshape((-1, 28, 28)).reshape((-1, 28 * 28), order='F')

    _logger.info("Loading Omniglot data (28x28)")
    fname = datapath(fname)

    omni_raw = scipy.io.loadmat(fname)
    train_data = reshape_data(omni_raw['data'].T.astype('float32'))
    test_data = reshape_data(omni_raw['testdata'].T.astype('float32'))

    # Deterministic shuffle so the train/valid split is reproducible.
    permutation = np.random.RandomState(seed=shuffle_seed).permutation(train_data.shape[0])
    train_data = train_data[permutation]

    if which_set == 'train':
        self.X = train_data[:-n_used_for_validation]
    elif which_set == 'valid':
        self.X = train_data[-n_used_for_validation:]
    elif which_set == 'test':
        self.X = test_data
    else:
        raise ValueError("Unknown dataset %s" % which_set)

    self.n_datapoints = self.X.shape[0]
    # Dummy labels: Omniglot is used here as an unlabeled dataset.
    self.Y = np.zeros((self.n_datapoints, 2), dtype=floatX)
def __init__(self, which_set='train', n_datapoints=-1, path="caltech-silhouettes", preproc=None):
    """Load the CalTech 101 Silhouettes (28x28) dataset from .npy files.

    :param which_set: 'train', 'valid' or 'test'.
    :param n_datapoints: optional cap on the number of datapoints
        (<= 0 means use all).
    :param path: directory containing the *_data.npy / *_labels.npy files.
    :param preproc: list of preprocessors passed to the base class.
    :raises ValueError: on an unknown `which_set`.
    """
    # None instead of a mutable [] default to avoid sharing one list
    # across all instances.
    if preproc is None:
        preproc = []
    super(CalTechSilhouettes, self).__init__(preproc)
    _logger.info("Loading CalTech 101 Silhouettes data (28x28)")
    path = datapath(path)

    # Only the requested split is loaded; the original version also loaded
    # the test set unconditionally into unused locals.
    if which_set == 'train':
        X = np.load(path + "/train_data.npy")
        Y = np.load(path + "/train_labels.npy")
    elif which_set == 'valid':
        X = np.load(path + "/val_data.npy")
        Y = np.load(path + "/val_labels.npy")
    elif which_set == 'test':
        X = np.load(path + "/test_data.npy")
        Y = np.load(path + "/test_labels.npy")
    else:
        raise ValueError("Unknown dataset %s" % which_set)

    if n_datapoints > 0:
        X = X[:n_datapoints]
        Y = Y[:n_datapoints]
    else:
        n_datapoints = X.shape[0]

    X = X.astype(floatX)

    self.n_datapoints = n_datapoints
    self.X = X
    self.Y = Y
def __init__(self, which_set='train', size=48, fold=0, n_datapoints=-1, path="TFD", preproc=None):
    """Load the Toronto Face Dataset (TFD) from a .mat file.

    :param which_set: 'unlabeled', 'train', 'unlabeled+train', 'valid'
        or 'test' (selected via the fold column of data['folds']).
    :param size: image size, 48 or 96.
    :param fold: fold index, 0..4.
    :param n_datapoints: optional cap on the number of datapoints
        (<= 0 means use all).
    :param path: directory containing the TFD .mat files.
    :param preproc: list of preprocessors passed to the base class.
    :raises ValueError: on an unknown `size` or `which_set`.
    """
    # None instead of a mutable [] default to avoid sharing one list
    # across all instances.
    if preproc is None:
        preproc = []
    super(TorontoFaceDataset, self).__init__(preproc)
    # Report the actual size (the old message hard-coded 48x48).
    _logger.info("Loading Toronto Face Dataset (%dx%d)" % (size, size))

    fname = datapath(path)
    if size == 48:
        fname += "/TFD_48x48.mat"
    elif size == 96:
        fname += "/TFD_96x96.mat"
    else:
        raise ValueError("Unknown size %s. Allowed options 48 or 96." % size)

    assert 0 <= fold <= 4

    # Load dataset; fold codes: 0=unlabeled, 1=train, 2=valid, 3=test.
    data = loadmat(fname)

    if which_set == 'unlabeled':
        idx = (data['folds'][:, fold] == 0)
    elif which_set == 'train':
        idx = (data['folds'][:, fold] == 1)
    elif which_set == 'unlabeled+train':
        idx = (data['folds'][:, fold] == 0)
        idx += (data['folds'][:, fold] == 1)
    elif which_set == 'valid':
        idx = (data['folds'][:, fold] == 2)
    elif which_set == 'test':
        idx = (data['folds'][:, fold] == 3)
    else:
        raise ValueError("Unknown dataset %s" % which_set)

    X = data['images'][idx, :, :]
    # Labels are not loaded (would be data['labs_id'][idx, :]); the old
    # code still sliced an undefined Y here, raising NameError whenever
    # n_datapoints > 0.
    if n_datapoints > 0:
        X = X[:n_datapoints]
    else:
        n_datapoints = X.shape[0]

    # Normalize to 0..1
    X = (X / 255.).astype(floatX)

    # Flatten images
    X = X.reshape([n_datapoints, -1])

    self.n_datapoints = n_datapoints
    self.X = X
    self.Y = None
def __init__(self, which_set='train', size=48, fold=0, n_datapoints=-1, path="TFD", preproc=None):
    """Load the Toronto Face Dataset (TFD) from a .mat file.

    :param which_set: 'unlabeled', 'train', 'unlabeled+train', 'valid'
        or 'test' (selected via the fold column of data['folds']).
    :param size: image size, 48 or 96.
    :param fold: fold index, 0..4.
    :param n_datapoints: optional cap on the number of datapoints
        (<= 0 means use all).
    :param path: directory containing the TFD .mat files.
    :param preproc: list of preprocessors passed to the base class.
    :raises ValueError: on an unknown `size` or `which_set`.
    """
    # None instead of a mutable [] default to avoid sharing one list
    # across all instances.
    if preproc is None:
        preproc = []
    super(TorontoFaceDataset, self).__init__(preproc)
    # Report the actual size (the old message hard-coded 48x48).
    _logger.info("Loading Toronto Face Dataset (%dx%d)" % (size, size))

    fname = datapath(path)
    if size == 48:
        fname += "/TFD_48x48.mat"
    elif size == 96:
        fname += "/TFD_96x96.mat"
    else:
        raise ValueError("Unknown size %s. Allowed options 48 or 96." % size)

    assert 0 <= fold <= 4

    # Load dataset; fold codes: 0=unlabeled, 1=train, 2=valid, 3=test.
    data = loadmat(fname)

    if which_set == 'unlabeled':
        idx = (data['folds'][:, fold] == 0)
    elif which_set == 'train':
        idx = (data['folds'][:, fold] == 1)
    elif which_set == 'unlabeled+train':
        idx = (data['folds'][:, fold] == 0)
        idx += (data['folds'][:, fold] == 1)
    elif which_set == 'valid':
        idx = (data['folds'][:, fold] == 2)
    elif which_set == 'test':
        idx = (data['folds'][:, fold] == 3)
    else:
        raise ValueError("Unknown dataset %s" % which_set)

    X = data['images'][idx, :, :]
    # Labels are not loaded (would be data['labs_id'][idx, :]); the old
    # code still sliced an undefined Y here, raising NameError whenever
    # n_datapoints > 0.
    if n_datapoints > 0:
        X = X[:n_datapoints]
    else:
        n_datapoints = X.shape[0]

    # Normalize to 0..1
    X = (X / 255.).astype(floatX)

    # Flatten images
    X = X.reshape([n_datapoints, -1])

    self.n_datapoints = n_datapoints
    self.X = X
    self.Y = None
def __init__(self, which_set='train', fname="caltech101_silhouettes_28_split1.mat", preproc=None):
    """Load the CalTech 101 Silhouettes (28x28) dataset from a .mat file.

    :param which_set: 'train', 'valid' or 'test'.
    :param fname: .mat file containing train_data / val_data / test_data.
    :param preproc: list of preprocessors passed to the base class.
    :raises ValueError: on an unknown `which_set`.
    """
    # None instead of a mutable [] default to avoid sharing one list
    # across all instances.
    if preproc is None:
        preproc = []
    super(CalTech101Silhouettes, self).__init__(preproc)
    _logger.info("Loading CalTech 101 Silhouettes data (28x28)")
    fname = datapath(fname)

    # The three splits differ only in the .mat variable name.
    mat_key = {'train': 'train_data', 'valid': 'val_data', 'test': 'test_data'}
    if which_set not in mat_key:
        raise ValueError("Unknown dataset %s" % which_set)
    self.X = scipy.io.loadmat(fname)[mat_key[which_set]].astype(floatX)

    self.n_datapoints = self.X.shape[0]
    # Dummy labels: the dataset is used here as an unlabeled dataset.
    self.Y = np.zeros((self.n_datapoints, 2), dtype=floatX)
def __init__(self, data_name, which_set='train', preproc=None):
    """Load one of the binary UCI benchmark datasets from HDF5.

    :param data_name: one of 'adult', 'connect4', 'dna', 'mushrooms',
        'nips', 'ocrletters', 'rcv1', 'web'.
    :param which_set: 'train', 'valid' or 'test' (HDF5 group name).
    :param preproc: list of preprocessors passed to the base class.
    :raises ValueError: on an unknown `which_set`.
    """
    # None instead of a mutable [] default to avoid sharing one list
    # across all instances.
    if preproc is None:
        preproc = []
    super(UCIBinary, self).__init__(preproc)

    UCIdatasets = {
        'adult': 'adult.h5',
        'connect4': 'connect4.h5',
        'dna': 'dna.h5',
        'mushrooms': 'mushrooms.h5',
        'nips': 'nips.h5',
        'ocrletters': 'ocr_letters.h5',
        'rcv1': 'rcv1.h5',
        'web': 'web.h5'
    }
    assert data_name in UCIdatasets

    _logger.info("Loading %s data" % data_name)
    fname = datapath(UCIdatasets[data_name])

    # Validate before touching the file; the set name doubles as the
    # HDF5 group name.
    if which_set not in ('train', 'valid', 'test'):
        raise ValueError("Unknown dataset %s" % which_set)

    try:
        with h5py.File(fname, "r") as h5:
            self.X = np.array(h5[which_set]).astype(floatX)
    except KeyError as e:
        # Fixed: the old handler called the undefined name `logger`
        # (module uses `_logger`) and used Py2-only `except KeyError, e`.
        _logger.info("Failed to read data from %s: %s" % (fname, e))
        exit(1)

    # Consistent with the other dataset loaders in this module.
    self.n_datapoints = self.X.shape[0]