def __init__(self, npy_filename, which_set, one_hot, split):
    """
    Load one subset of an ICML07 ``.npy`` dataset.

    Parameters
    ----------
    npy_filename : str
        Filename root. The inputs are read from ``<root>_inputs.npy`` and
        the labels from ``<root>_labels.npy`` inside
        ``${PYLEARN2_DATA_PATH}/icml07data/npy``.
    which_set : str
        One of 'train', 'valid' or 'test'.
    one_hot : bool
        If true, the labels are converted to a float32 one-hot matrix with
        one row per example.
    split : sequence of 3 ints
        ``(n_train, n_valid, n_test)``. The three subsets are consecutive
        row ranges of the files, in that order.
    """
    assert which_set in ['train', 'valid', 'test']

    self.one_hot = one_hot
    self.split = split

    # Load data from .npy file
    # NOTE(review): `preprocess` presumably expands the environment
    # variable in the string -- confirm against its definition.
    npy_filename_root = os.path.join(preprocess('${PYLEARN2_DATA_PATH}'),
                                     'icml07data', 'npy', npy_filename)

    x_file = npy_filename_root + '_inputs.npy'
    y_file = npy_filename_root + '_labels.npy'
    x_file = datasetCache.cache_file(x_file)
    y_file = datasetCache.cache_file(y_file)

    data_x = np.load(x_file, mmap_mode='r')
    data_y = np.load(y_file, mmap_mode='r')

    # some sanity checks
    assert np.isfinite(data_x).all()
    assert np.isfinite(data_y).all()
    assert data_x.shape[0] == data_y.shape[0]

    if one_hot:
        # The number of classes is taken from the complete label file, not
        # from the selected subset: otherwise a subset that happens to miss
        # the highest label would get a narrower one-hot matrix than the
        # other subsets, and an empty subset would make max() raise.
        n_classes = data_y.max() + 1

    # extract the rows belonging to the requested subset
    n_train, n_valid, n_test = split
    sets = {
        'train': (0, n_train),
        'valid': (n_train, n_train + n_valid),
        'test': (n_train + n_valid, n_train + n_valid + n_test)
    }
    start, end = sets[which_set]

    data_x = data_x[start:end]
    data_y = data_y[start:end]

    if one_hot:
        n_examples = data_y.shape[0]

        data_oh = np.zeros((n_examples, n_classes), dtype='float32')
        # enumerate() avoids the Python-2-only xrange while indexing with
        # exactly the same per-row label value as data_y[i].
        for i, label in enumerate(data_y):
            data_oh[i, label] = 1.
        data_y = data_oh

    # NOTE(review): (28, 28, 1) is presumably rows x cols x channels of
    # each example -- confirm against DefaultViewConverter.
    view_converter = DefaultViewConverter((28, 28, 1))
    super(ICML07DataSet, self).__init__(
        X=data_x, y=data_y, view_converter=view_converter)
def __init__(self, npy_filename, which_set, one_hot, split):
    """
    Load one subset of an ICML07 ``.npy`` dataset.

    Parameters
    ----------
    npy_filename : str
        Filename root. The inputs are read from ``<root>_inputs.npy`` and
        the labels from ``<root>_labels.npy`` inside
        ``${PYLEARN2_DATA_PATH}/icml07data/npy``.
    which_set : str
        One of 'train', 'valid' or 'test'.
    one_hot : bool
        If true, the labels are converted to a float32 one-hot matrix with
        one row per example.
    split : sequence of 3 ints
        ``(n_train, n_valid, n_test)``. The three subsets are consecutive
        row ranges of the files, in that order.
    """
    assert which_set in ['train', 'valid', 'test']

    self.one_hot = one_hot
    self.split = split

    # Load data from .npy file
    # NOTE(review): `preprocess` presumably expands the environment
    # variable in the string -- confirm against its definition.
    npy_filename_root = os.path.join(preprocess('${PYLEARN2_DATA_PATH}'),
                                     'icml07data', 'npy', npy_filename)

    x_file = npy_filename_root + '_inputs.npy'
    y_file = npy_filename_root + '_labels.npy'
    x_file = datasetCache.cache_file(x_file)
    y_file = datasetCache.cache_file(y_file)

    data_x = np.load(x_file, mmap_mode='r')
    data_y = np.load(y_file, mmap_mode='r')

    # some sanity checks
    assert np.isfinite(data_x).all()
    assert np.isfinite(data_y).all()
    assert data_x.shape[0] == data_y.shape[0]

    if one_hot:
        # Derive the class count from the whole label file so that every
        # subset gets a one-hot matrix of the same width, even when the
        # selected subset lacks the highest label or is empty (max() on an
        # empty array raises).
        n_classes = data_y.max() + 1

    # extract the rows belonging to the requested subset
    n_train, n_valid, n_test = split
    sets = {
        'train': (0, n_train),
        'valid': (n_train, n_train + n_valid),
        'test': (n_train + n_valid, n_train + n_valid + n_test)
    }
    start, end = sets[which_set]

    data_x = data_x[start:end]
    data_y = data_y[start:end]

    if one_hot:
        n_examples = data_y.shape[0]

        data_oh = np.zeros((n_examples, n_classes), dtype='float32')
        # enumerate() replaces the Python-2-only xrange; `label` is the
        # same value the original obtained through data_y[i].
        for i, label in enumerate(data_y):
            data_oh[i, label] = 1.
        data_y = data_oh

    # NOTE(review): (28, 28, 1) is presumably rows x cols x channels of
    # each example -- confirm against DefaultViewConverter.
    view_converter = DefaultViewConverter((28, 28, 1))
    super(ICML07DataSet, self).__init__(X=data_x, y=data_y,
                                        view_converter=view_converter)
def load(cls, which_set):
    """
    Memory-map the foveated small NORB ``-dat`` array of one subset.

    Parameters
    ----------
    which_set : str
        'train' selects the training file; every other value selects the
        testing file (the value is not validated here).

    Returns
    -------
    data
        The object returned by ``np.load`` for that file, opened with
        memory-map mode 'r'.
    """
    data_root = os.getenv('PYLEARN2_DATA_PATH')
    prefix = '%s/norb_small/foveated/smallnorb-' % data_root

    if which_set == 'train':
        stem = '5x46789x9x18x6x2x96x96-training-dat'
    else:
        stem = '5x01235x9x18x6x2x96x96-testing-dat'

    # NOTE(review): cache_file presumably returns the path of a local copy
    # of the file -- confirm against datasetCache.
    npy_path = datasetCache.cache_file(prefix + stem + '.npy')

    return np.load(npy_path, mmap_mode='r')
def load(cls, which_set):
    """
    Memory-map the foveated small NORB ``-dat`` array of one subset.

    Parameters
    ----------
    which_set : str
        "train" selects the training file; every other value selects the
        testing file (the value is not validated here).

    Returns
    -------
    data
        The object returned by ``numpy.load`` for that file, opened with
        memory-map mode "r".
    """
    is_train = which_set == "train"
    if is_train:
        stem = "5x46789x9x18x6x2x96x96-training-dat"
    else:
        stem = "5x01235x9x18x6x2x96x96-testing-dat"

    prefix = "%s/norb_small/foveated/smallnorb-" % os.getenv(
        "PYLEARN2_DATA_PATH")

    # NOTE(review): cache_file presumably returns the path of a local copy
    # of the file -- confirm against datasetCache.
    npy_path = datasetCache.cache_file(prefix + stem + ".npy")

    return numpy.load(npy_path, mmap_mode="r")
def load(cls, which_set, desc):
    """
    Load one small NORB array from the ``original_npy`` directory.

    Parameters
    ----------
    which_set : str
        'train' selects the training file; every other value selects the
        testing file (the value is not validated here).
    desc : str
        One of 'dat', 'cat' or 'info'; it is used as the suffix of the
        file name (``...-<desc>.npy``).

    Returns
    -------
    data
        The object returned by ``np.load`` for the selected file.
    """
    assert desc in ['dat', 'cat', 'info']

    base = '%s/norb_small/original_npy/smallnorb-'
    base = base % os.getenv('PYLEARN2_DATA_PATH')
    if which_set == 'train':
        base += '5x46789x9x18x6x2x96x96-training'
    else:
        base += '5x01235x9x18x6x2x96x96-testing'

    fname = base + '-%s.npy' % desc
    fname = datasetCache.cache_file(fname)

    # .npy is a binary format, so the file must be opened in binary mode
    # (text mode breaks np.load on Python 3 and translates newlines on
    # Windows). The with-block also closes the handle if np.load raises.
    with open(fname, 'rb') as fp:
        data = np.load(fp)

    return data
def load(cls, which_set, desc):
    """
    Load one small NORB array from the ``original_npy`` directory.

    Parameters
    ----------
    which_set : str
        "train" selects the training file; every other value selects the
        testing file (the value is not validated here).
    desc : str
        One of "dat", "cat" or "info"; it is used as the suffix of the
        file name (``...-<desc>.npy``).

    Returns
    -------
    data
        The object returned by ``numpy.load`` for the selected file.
    """
    assert desc in ["dat", "cat", "info"]

    base = "%s/norb_small/original_npy/smallnorb-"
    base = base % os.getenv("PYLEARN2_DATA_PATH")
    if which_set == "train":
        base += "5x46789x9x18x6x2x96x96-training"
    else:
        base += "5x01235x9x18x6x2x96x96-testing"

    fname = base + "-%s.npy" % desc
    fname = datasetCache.cache_file(fname)

    # .npy is a binary format, so the file must be opened in binary mode
    # (text mode breaks numpy.load on Python 3 and translates newlines on
    # Windows). The with-block also closes the handle if numpy.load raises.
    with open(fname, "rb") as fp:
        data = numpy.load(fp)

    return data
def load(cls, which_set, filetype, subtensor):
    """
    Reads and returns a single file as a numpy array.

    Parameters
    ----------
    which_set : str
        'train' or 'test'.
    filetype : str
        'dat', 'cat' or 'info'; selects which of the three files of the
        subset is read.
    subtensor : slice or None
        If None the whole file is read. A slice with step None or 1
        restricts the read to that range of rows (first axis). Any other
        value raises NotImplementedError.

    Returns
    -------
    result : numpy.ndarray
        The tensor stored in the file (or the requested rows of it), with
        the element type and shape given by the file header.
    """
    assert which_set in ['train', 'test']
    assert filetype in ['dat', 'cat', 'info']

    def getPath(which_set):
        """Return the path of the file for `which_set` and `filetype`."""
        dirname = os.path.join(os.getenv('PYLEARN2_DATA_PATH'),
                               'norb_small/original')
        if which_set == 'train':
            instance_list = '46789'
        elif which_set == 'test':
            instance_list = '01235'

        filename = 'smallnorb-5x%sx9x18x6x2x96x96-%s-%s.mat' % \
            (instance_list, which_set + 'ing', filetype)

        return os.path.join(dirname, filename)

    def parseNORBFile(file_handle, subtensor=None, debug=False):
        """
        Load all or part of file 'file_handle' into a numpy ndarray

        :param file_handle: file from which to read. The file can be
        opened with open() (in binary mode), gzip.open() or bz2.BZ2File()
        :type file_handle: file-like object. Can be a gzip open file.

        :param subtensor: If subtensor is not None, it should be like the
        argument to numpy.ndarray.__getitem__. The following two
        expressions should return equivalent ndarray objects, but the one
        on the left may be faster and more memory efficient if the
        underlying file f is big.

        read(file_handle, subtensor) <===> read(file_handle)[*subtensor]

        Only None and slices with step None or 1 are supported, and slices
        are only supported on uncompressed files.

        :param debug: if true, the parsed header fields are logged.
        """

        def readNums(file_handle, num_type, count):
            """
            Reads `count` items of dtype `num_type` from the file and
            returns them as a 1-D numpy array.
            """
            num_bytes = count * numpy.dtype(num_type).itemsize
            string = file_handle.read(num_bytes)
            # frombuffer replaces the deprecated binary mode of
            # numpy.fromstring; the copy keeps the result writable and
            # independent of the bytes object, as fromstring's was.
            return numpy.frombuffer(string, dtype=num_type).copy()

        def readHeader(file_handle, debug=False, from_gzip=None):
            """
            Parse the header at the current position of the file.

            :param file_handle: an open file handle.
            :type file_handle: a file or gzip.GzipFile object

            :param from_gzip: bool or None
            :type from_gzip: if None determine the type of file handle.

            :returns: element type name, element size in bytes, shape
            """

            if from_gzip is None:
                from_gzip = isinstance(file_handle,
                                       (gzip.GzipFile, bz2.BZ2File))

            key_to_type = {0x1E3D4C51: ('float32', 4),
                           # what is a packed matrix?
                           # 0x1E3D4C52: ('packed matrix', 0),
                           0x1E3D4C53: ('float64', 8),
                           0x1E3D4C54: ('int32', 4),
                           0x1E3D4C55: ('uint8', 1),
                           0x1E3D4C56: ('int16', 2)}

            type_key = readNums(file_handle, 'int32', 1)[0]
            elem_type, elem_size = key_to_type[type_key]
            if debug:
                logger.debug("header's type key, type, type size: "
                             "{0} {1} {2}".format(type_key, elem_type,
                                                  elem_size))
            if elem_type == 'packed matrix':
                raise NotImplementedError('packed matrix not supported')

            num_dims = readNums(file_handle, 'int32', 1)[0]
            if debug:
                logger.debug('# of dimensions, according to header: '
                             '{0}'.format(num_dims))

            # The header always stores at least three dimension entries;
            # only the first num_dims of them are meaningful.
            if from_gzip:
                shape = readNums(file_handle,
                                 'int32',
                                 max(num_dims, 3))[:num_dims]
            else:
                shape = numpy.fromfile(file_handle,
                                       dtype='int32',
                                       count=max(num_dims, 3))[:num_dims]

            if debug:
                logger.debug('Tensor shape, as listed in header: '
                             '{0}'.format(shape))

            return elem_type, elem_size, shape

        elem_type, elem_size, shape = readHeader(file_handle, debug)
        # Offset of the first data byte, used as the origin for seeking.
        beginning = file_handle.tell()

        # Plain Python ints avoid int32 overflow and keep read()/seek()
        # arguments free of numpy scalar types.
        shape = tuple(int(dim) for dim in shape)
        num_elems = int(numpy.prod(shape, dtype='int64'))

        if isinstance(file_handle, (gzip.GzipFile, bz2.BZ2File)):
            assert subtensor is None, \
                "Subtensors on gzip files are not implemented."
            # readNums takes an element count (it multiplies by the item
            # size itself), so pass num_elems, not a byte count.
            result = readNums(file_handle,
                              elem_type,
                              num_elems).reshape(shape)
        elif subtensor is None:
            result = numpy.fromfile(file_handle,
                                    dtype=elem_type,
                                    count=num_elems).reshape(shape)
        elif isinstance(subtensor, slice):
            if subtensor.step not in (None, 1):
                raise NotImplementedError('slice with step',
                                          subtensor.step)
            # slice.indices resolves None / negative / out-of-range
            # bounds exactly like ndarray.__getitem__ would.
            start, stop, _ = subtensor.indices(shape[0])
            num_rows = max(stop - start, 0)
            bytes_per_row = \
                int(numpy.prod(shape[1:], dtype='int64')) * elem_size
            file_handle.seek(beginning + start * bytes_per_row)
            sub_shape = (num_rows,) + shape[1:]
            num_elems = int(numpy.prod(sub_shape, dtype='int64'))
            result = numpy.fromfile(file_handle,
                                    dtype=elem_type,
                                    count=num_elems).reshape(sub_shape)
        else:
            raise NotImplementedError('subtensor access not written yet:',
                                      subtensor)

        return result

    fname = getPath(which_set)
    fname = datasetCache.cache_file(fname)
    # The file is binary: open it in binary mode and make sure it is closed
    # once the array has been read.
    with open(fname, 'rb') as file_handle:
        return parseNORBFile(file_handle, subtensor)
def load(cls, which_set, filetype, subtensor):
    """
    Reads and returns a single file as a numpy array.

    Parameters
    ----------
    which_set : str
        'train' or 'test'.
    filetype : str
        'dat', 'cat' or 'info'; selects which of the three files of the
        subset is read.
    subtensor : slice or None
        If None the whole file is read. A slice with step None or 1
        restricts the read to that range of rows (first axis). Any other
        value raises NotImplementedError.

    Returns
    -------
    result : numpy.ndarray
        The tensor stored in the file (or the requested rows of it), with
        the element type and shape given by the file header.
    """
    assert which_set in ['train', 'test']
    assert filetype in ['dat', 'cat', 'info']

    def getPath(which_set):
        """Return the path of the file for `which_set` and `filetype`."""
        dirname = os.path.join(os.getenv('PYLEARN2_DATA_PATH'),
                               'norb_small/original')
        if which_set == 'train':
            instance_list = '46789'
        elif which_set == 'test':
            instance_list = '01235'

        filename = 'smallnorb-5x%sx9x18x6x2x96x96-%s-%s.mat' % \
            (instance_list, which_set + 'ing', filetype)

        return os.path.join(dirname, filename)

    def parseNORBFile(file_handle, subtensor=None, debug=False):
        """
        Load all or part of file 'file_handle' into a numpy ndarray

        :param file_handle: file from which to read. The file can be
        opened with open() (in binary mode), gzip.open() or bz2.BZ2File()
        :type file_handle: file-like object. Can be a gzip open file.

        :param subtensor: If subtensor is not None, it should be like the
        argument to numpy.ndarray.__getitem__. The following two
        expressions should return equivalent ndarray objects, but the one
        on the left may be faster and more memory efficient if the
        underlying file f is big.

        read(file_handle, subtensor) <===> read(file_handle)[*subtensor]

        Only None and slices with step None or 1 are supported, and slices
        are only supported on uncompressed files.

        :param debug: if true, the parsed header fields are logged.
        """

        def readNums(file_handle, num_type, count):
            """
            Reads `count` items of dtype `num_type` from the file and
            returns them as a 1-D numpy array.
            """
            num_bytes = count * numpy.dtype(num_type).itemsize
            string = file_handle.read(num_bytes)
            # frombuffer replaces the deprecated binary mode of
            # numpy.fromstring; the copy keeps the result writable and
            # independent of the bytes object, as fromstring's was.
            return numpy.frombuffer(string, dtype=num_type).copy()

        def readHeader(file_handle, debug=False, from_gzip=None):
            """
            Parse the header at the current position of the file.

            :param file_handle: an open file handle.
            :type file_handle: a file or gzip.GzipFile object

            :param from_gzip: bool or None
            :type from_gzip: if None determine the type of file handle.

            :returns: element type name, element size in bytes, shape
            """

            if from_gzip is None:
                from_gzip = isinstance(file_handle,
                                       (gzip.GzipFile, bz2.BZ2File))

            key_to_type = {
                0x1E3D4C51: ('float32', 4),
                # what is a packed matrix?
                # 0x1E3D4C52: ('packed matrix', 0),
                0x1E3D4C53: ('float64', 8),
                0x1E3D4C54: ('int32', 4),
                0x1E3D4C55: ('uint8', 1),
                0x1E3D4C56: ('int16', 2)
            }

            type_key = readNums(file_handle, 'int32', 1)[0]
            elem_type, elem_size = key_to_type[type_key]
            if debug:
                logger.debug("header's type key, type, type size: "
                             "{0} {1} {2}".format(type_key, elem_type,
                                                  elem_size))
            if elem_type == 'packed matrix':
                raise NotImplementedError('packed matrix not supported')

            num_dims = readNums(file_handle, 'int32', 1)[0]
            if debug:
                logger.debug('# of dimensions, according to header: '
                             '{0}'.format(num_dims))

            # The header always stores at least three dimension entries;
            # only the first num_dims of them are meaningful.
            if from_gzip:
                shape = readNums(file_handle,
                                 'int32',
                                 max(num_dims, 3))[:num_dims]
            else:
                shape = numpy.fromfile(file_handle,
                                       dtype='int32',
                                       count=max(num_dims, 3))[:num_dims]

            if debug:
                logger.debug('Tensor shape, as listed in header: '
                             '{0}'.format(shape))

            return elem_type, elem_size, shape

        elem_type, elem_size, shape = readHeader(file_handle, debug)
        # Offset of the first data byte, used as the origin for seeking.
        beginning = file_handle.tell()

        # Plain Python ints avoid int32 overflow and keep read()/seek()
        # arguments free of numpy scalar types.
        shape = tuple(int(dim) for dim in shape)
        num_elems = int(numpy.prod(shape, dtype='int64'))

        if isinstance(file_handle, (gzip.GzipFile, bz2.BZ2File)):
            assert subtensor is None, \
                "Subtensors on gzip files are not implemented."
            # readNums takes an element count (it multiplies by the item
            # size itself), so pass num_elems, not a byte count.
            result = readNums(file_handle,
                              elem_type,
                              num_elems).reshape(shape)
        elif subtensor is None:
            result = numpy.fromfile(file_handle,
                                    dtype=elem_type,
                                    count=num_elems).reshape(shape)
        elif isinstance(subtensor, slice):
            if subtensor.step not in (None, 1):
                raise NotImplementedError('slice with step',
                                          subtensor.step)
            # slice.indices resolves None / negative / out-of-range
            # bounds exactly like ndarray.__getitem__ would.
            start, stop, _ = subtensor.indices(shape[0])
            num_rows = max(stop - start, 0)
            bytes_per_row = \
                int(numpy.prod(shape[1:], dtype='int64')) * elem_size
            file_handle.seek(beginning + start * bytes_per_row)
            sub_shape = (num_rows,) + shape[1:]
            num_elems = int(numpy.prod(sub_shape, dtype='int64'))
            result = numpy.fromfile(file_handle,
                                    dtype=elem_type,
                                    count=num_elems).reshape(sub_shape)
        else:
            raise NotImplementedError('subtensor access not written yet:',
                                      subtensor)

        return result

    fname = getPath(which_set)
    fname = datasetCache.cache_file(fname)
    # The file is binary: open it in binary mode and make sure it is closed
    # once the array has been read.
    with open(fname, 'rb') as file_handle:
        return parseNORBFile(file_handle, subtensor)