Esempio n. 1
0
    def __init__(self, npy_filename, which_set, one_hot, split):
        """
        Load one split of an ICML07 dataset stored as a pair of .npy files.

        Parameters
        ----------
        npy_filename : str
            Base name of the files under
            ``${PYLEARN2_DATA_PATH}/icml07data/npy``. The suffixes
            ``_inputs.npy`` and ``_labels.npy`` are appended to it.
        which_set : str
            One of 'train', 'valid' or 'test'.
        one_hot : bool
            If true, the labels are expanded to a float32 one-hot matrix.
        split : sequence of three ints
            Number of train, valid and test examples. The three subsets are
            taken consecutively, in that order, from the start of the files.
        """
        assert which_set in ['train', 'valid', 'test']

        self.one_hot = one_hot
        self.split = split

        # Load data from .npy file
        npy_filename_root = os.path.join(preprocess('${PYLEARN2_DATA_PATH}'),
                                         'icml07data',
                                         'npy',
                                         npy_filename)

        x_file = datasetCache.cache_file(npy_filename_root + '_inputs.npy')
        y_file = datasetCache.cache_file(npy_filename_root + '_labels.npy')
        data_x = np.load(x_file, mmap_mode='r')
        data_y = np.load(y_file, mmap_mode='r')

        # some sanity checks
        assert np.isfinite(data_x).all()
        assert np.isfinite(data_y).all()
        assert data_x.shape[0] == data_y.shape[0]

        if one_hot:
            # Size the one-hot encoding from the complete label file, before
            # slicing, so that every split gets the same number of columns
            # even when a split happens not to contain the largest label
            # (and so that an empty split does not crash on max()).
            n_classes = int(data_y.max()) + 1

        # extract
        n_train, n_valid, n_test = split
        sets = {
            'train': (0, n_train),
            'valid': (n_train, n_train + n_valid),
            'test': (n_train + n_valid, n_train + n_valid + n_test)
        }
        start, end = sets[which_set]

        data_x = data_x[start:end]
        data_y = data_y[start:end]

        if one_hot:
            n_examples = data_y.shape[0]

            # Labels are used as column indices, so they must be integers
            # whatever dtype the file was saved with.
            labels = np.asarray(data_y).astype('int64')
            if labels.ndim == 1:
                labels = labels[:, np.newaxis]
            rows = np.arange(n_examples)[:, np.newaxis]

            data_oh = np.zeros((n_examples, n_classes), dtype='float32')
            # Sets data_oh[i, labels[i]] = 1 for every row in one operation.
            data_oh[rows, labels] = 1.
            data_y = data_oh

        view_converter = DefaultViewConverter((28, 28, 1))
        super(ICML07DataSet, self).__init__(
            X=data_x, y=data_y, view_converter=view_converter)
Esempio n. 2
0
    def __init__(self, npy_filename, which_set, one_hot, split):
        """
        Load one split of an ICML07 dataset stored as a pair of .npy files.

        Parameters
        ----------
        npy_filename : str
            Base name of the files under
            ``${PYLEARN2_DATA_PATH}/icml07data/npy``. The suffixes
            ``_inputs.npy`` and ``_labels.npy`` are appended to it.
        which_set : str
            One of 'train', 'valid' or 'test'.
        one_hot : bool
            If true, the labels are expanded to a float32 one-hot matrix.
        split : sequence of three ints
            Number of train, valid and test examples. The three subsets are
            taken consecutively, in that order, from the start of the files.
        """
        assert which_set in ['train', 'valid', 'test']

        self.one_hot = one_hot
        self.split = split

        # Load data from .npy file
        npy_filename_root = os.path.join(preprocess('${PYLEARN2_DATA_PATH}'),
                                         'icml07data', 'npy', npy_filename)

        x_file = datasetCache.cache_file(npy_filename_root + '_inputs.npy')
        y_file = datasetCache.cache_file(npy_filename_root + '_labels.npy')
        data_x = np.load(x_file, mmap_mode='r')
        data_y = np.load(y_file, mmap_mode='r')

        # some sanity checks
        assert np.isfinite(data_x).all()
        assert np.isfinite(data_y).all()
        assert data_x.shape[0] == data_y.shape[0]

        if one_hot:
            # Take the class count from the whole label file rather than
            # from the selected slice: otherwise a split that lacks the
            # largest label gets a narrower encoding than the others, and
            # an empty split fails on max().
            n_classes = int(data_y.max()) + 1

        # extract
        n_train, n_valid, n_test = split
        sets = {
            'train': (0, n_train),
            'valid': (n_train, n_train + n_valid),
            'test': (n_train + n_valid, n_train + n_valid + n_test)
        }
        start, end = sets[which_set]

        data_x = data_x[start:end]
        data_y = data_y[start:end]

        if one_hot:
            n_examples = data_y.shape[0]

            # Cast to integers because the labels index the columns below.
            labels = np.asarray(data_y).astype('int64')
            if labels.ndim == 1:
                labels = labels[:, np.newaxis]
            rows = np.arange(n_examples)[:, np.newaxis]

            data_oh = np.zeros((n_examples, n_classes), dtype='float32')
            # Equivalent to data_oh[i, labels[i]] = 1 for each row i.
            data_oh[rows, labels] = 1.
            data_y = data_oh

        view_converter = DefaultViewConverter((28, 28, 1))
        super(ICML07DataSet, self).__init__(X=data_x,
                                            y=data_y,
                                            view_converter=view_converter)
Esempio n. 3
0
    def load(cls, which_set):
        """
        Open the foveated small-NORB image file for one split.

        A `which_set` of 'train' selects the training file; any other value
        selects the testing file. The path is rooted at the
        PYLEARN2_DATA_PATH environment variable and is passed through
        `datasetCache.cache_file` before being handed to `np.load` with
        ``mmap_mode='r'``.
        """
        if which_set == 'train':
            stem = '5x46789x9x18x6x2x96x96-training-dat'
        else:
            stem = '5x01235x9x18x6x2x96x96-testing-dat'

        prefix = '%s/norb_small/foveated/smallnorb-' % \
            os.getenv('PYLEARN2_DATA_PATH')
        cached_path = datasetCache.cache_file(prefix + stem + '.npy')
        return np.load(cached_path, mmap_mode='r')
Esempio n. 4
0
    def load(cls, which_set):
        """
        Open the foveated small-NORB image file for one split.

        A `which_set` of "train" selects the training file; any other value
        selects the testing file. The path is rooted at the
        PYLEARN2_DATA_PATH environment variable and is passed through
        `datasetCache.cache_file` before being handed to `numpy.load` with
        ``mmap_mode="r"``.
        """
        data_root = os.getenv("PYLEARN2_DATA_PATH")
        if which_set == "train":
            stem = "5x46789x9x18x6x2x96x96-training-dat"
        else:
            stem = "5x01235x9x18x6x2x96x96-testing-dat"

        uncached = "%s/norb_small/foveated/smallnorb-" % data_root
        uncached = uncached + stem + ".npy"
        return numpy.load(datasetCache.cache_file(uncached), mmap_mode="r")
Esempio n. 5
0
    def load(cls, which_set, desc):
        """
        Load one of the small-NORB ``.npy`` files.

        Parameters
        ----------
        which_set : str
            'train' selects the training file; any other value selects the
            testing file.
        desc : str
            One of 'dat', 'cat' or 'info'; becomes the suffix of the file
            name.

        Returns
        -------
        The object returned by `np.load` for the selected file.
        """
        assert desc in ['dat', 'cat', 'info']

        base = '%s/norb_small/original_npy/smallnorb-'
        base = base % os.getenv('PYLEARN2_DATA_PATH')
        if which_set == 'train':
            base += '5x46789x9x18x6x2x96x96-training'
        else:
            base += '5x01235x9x18x6x2x96x96-testing'

        fname = base + '-%s.npy' % desc
        fname = datasetCache.cache_file(fname)
        # .npy is a binary format, so the file must be opened in binary
        # mode; the with-block closes it even if np.load raises.
        with open(fname, 'rb') as fp:
            data = np.load(fp)

        return data
Esempio n. 6
0
    def load(cls, which_set, desc):
        """
        Load one of the small-NORB ``.npy`` files.

        Parameters
        ----------
        which_set : str
            "train" selects the training file; any other value selects the
            testing file.
        desc : str
            One of "dat", "cat" or "info"; becomes the suffix of the file
            name.

        Returns
        -------
        The object returned by `numpy.load` for the selected file.
        """
        assert desc in ["dat", "cat", "info"]

        base = "%s/norb_small/original_npy/smallnorb-"
        base = base % os.getenv("PYLEARN2_DATA_PATH")
        if which_set == "train":
            base += "5x46789x9x18x6x2x96x96-training"
        else:
            base += "5x01235x9x18x6x2x96x96-testing"

        fname = base + "-%s.npy" % desc
        fname = datasetCache.cache_file(fname)
        # .npy is a binary format, so the file must be opened in binary
        # mode; the with-block closes it even if numpy.load raises.
        with open(fname, "rb") as fp:
            data = numpy.load(fp)

        return data
Esempio n. 7
0
    def load(cls, which_set, filetype, subtensor):
        """
        Read one small-NORB ``.mat`` file and return it as a numpy array.

        Parameters
        ----------
        which_set : str
            'train' or 'test'.
        filetype : str
            'dat', 'cat' or 'info'; selects which file of the set is read.
        subtensor : slice or None
            None reads the whole tensor. A slice (step None or 1) selects a
            range along the first axis; only that range is read from disk.

        Returns
        -------
        numpy.ndarray
            The tensor stored in the file, shaped as listed in its header
            (with the first dimension reduced when `subtensor` is a slice).

        Raises
        ------
        NotImplementedError
            If `subtensor` is a slice with a step, or is neither None nor a
            slice.
        """

        assert which_set in ['train', 'test']
        assert filetype in ['dat', 'cat', 'info']

        def getPath(which_set):
            """Return the path of the file for `which_set` and `filetype`."""
            dirname = os.path.join(os.getenv('PYLEARN2_DATA_PATH'),
                                   'norb_small/original')
            if which_set == 'train':
                instance_list = '46789'
            elif which_set == 'test':
                instance_list = '01235'

            filename = 'smallnorb-5x%sx9x18x6x2x96x96-%s-%s.mat' % \
                (instance_list, which_set + 'ing', filetype)

            return os.path.join(dirname, filename)

        def parseNORBFile(file_handle, subtensor=None, debug=False):
            """
            Load all or part of file 'file_handle' into a numpy ndarray.

            :param file_handle: binary file to read from. It can be opened
              with open(), gzip.open() or bz2.BZ2File().
            :type file_handle: file-like object

            :param subtensor: None to read everything, or a slice over the
              first axis. For an uncompressed file the following two
              expressions return equivalent arrays, but the one on the left
              only reads the requested rows:

              read(file_handle, subtensor) <===> read(file_handle)[subtensor]

              Slices with a step, other index types, and any subtensor on a
              gzip/bz2 handle are not supported.

            :param debug: if true, log the header fields as they are read.
            """

            def readNums(file_handle, num_type, count):
                """
                Read `count` values of dtype `num_type` from the file and
                return them as a 1-D numpy array.
                """
                num_bytes = int(count) * numpy.dtype(num_type).itemsize
                raw = file_handle.read(num_bytes)
                # frombuffer replaces the deprecated binary mode of
                # fromstring; the copy keeps the result writable and
                # independent of the bytes object, as fromstring's was.
                return numpy.frombuffer(raw, dtype=num_type).copy()

            def readHeader(file_handle, debug=False, from_gzip=None):
                """
                Read the header at the current position of the file.

                :param file_handle: an open file handle.
                :type file_handle: a file, gzip.GzipFile or bz2.BZ2File

                :param from_gzip: whether the handle is a compressed file.
                :type from_gzip: bool, or None to detect it from the type
                  of `file_handle`.

                :returns: element dtype name, element size in bytes, shape
                """

                if from_gzip is None:
                    from_gzip = isinstance(file_handle,
                                           (gzip.GzipFile, bz2.BZ2File))

                key_to_type = {0x1E3D4C51: ('float32', 4),
                               # what is a packed matrix?
                               # 0x1E3D4C52: ('packed matrix', 0),
                               0x1E3D4C53: ('float64', 8),
                               0x1E3D4C54: ('int32', 4),
                               0x1E3D4C55: ('uint8', 1),
                               0x1E3D4C56: ('int16', 2)}

                type_key = readNums(file_handle, 'int32', 1)[0]
                elem_type, elem_size = key_to_type[type_key]
                if debug:
                    logger.debug("header's type key, type, type size: "
                                 "{0} {1} {2}".format(type_key, elem_type,
                                                      elem_size))
                if elem_type == 'packed matrix':
                    raise NotImplementedError('packed matrix not supported')

                num_dims = readNums(file_handle, 'int32', 1)[0]
                if debug:
                    logger.debug('# of dimensions, according to header: '
                                 '{0}'.format(num_dims))

                # At least three shape entries are always consumed from the
                # file, even when the tensor has fewer dimensions.
                if from_gzip:
                    shape = readNums(file_handle,
                                     'int32',
                                     max(num_dims, 3))[:num_dims]
                else:
                    shape = numpy.fromfile(file_handle,
                                           dtype='int32',
                                           count=max(num_dims, 3))[:num_dims]

                if debug:
                    logger.debug('Tensor shape, as listed in header: '
                                 '{0}'.format(shape))

                return elem_type, elem_size, shape

            elem_type, elem_size, shape = readHeader(file_handle, debug)
            beginning = file_handle.tell()

            num_elems = numpy.prod(shape)

            result = None
            if isinstance(file_handle, (gzip.GzipFile, bz2.BZ2File)):
                assert subtensor is None, \
                    "Subtensors on gzip files are not implemented."
                # readNums expects an element count; it converts to bytes
                # itself, so the count must not be scaled by elem_size here.
                result = readNums(file_handle,
                                  elem_type,
                                  num_elems).reshape(shape)
            elif subtensor is None:
                result = numpy.fromfile(file_handle,
                                        dtype=elem_type,
                                        count=num_elems).reshape(shape)
            elif isinstance(subtensor, slice):
                if subtensor.step not in (None, 1):
                    raise NotImplementedError('slice with step',
                                              subtensor.step)
                # slice.indices resolves None, negative and out-of-range
                # bounds against the number of rows in the file.
                start, stop, _ = subtensor.indices(int(shape[0]))
                if start != 0:
                    bytes_per_row = int(numpy.prod(shape[1:])) * elem_size
                    file_handle.seek(beginning + start * bytes_per_row)
                shape[0] = max(stop - start, 0)
                num_elems = numpy.prod(shape)
                result = numpy.fromfile(file_handle,
                                        dtype=elem_type,
                                        count=num_elems).reshape(shape)
            else:
                raise NotImplementedError('subtensor access not written yet:',
                                          subtensor)

            return result

        fname = getPath(which_set)
        fname = datasetCache.cache_file(fname)
        # The file holds raw binary data, so open it in binary mode; the
        # with-block closes the handle once the array has been read.
        with open(fname, 'rb') as file_handle:
            return parseNORBFile(file_handle, subtensor)
Esempio n. 8
0
    def load(cls, which_set, filetype, subtensor):
        """
        Read one small-NORB ``.mat`` file and return it as a numpy array.

        Parameters
        ----------
        which_set : str
            'train' or 'test'.
        filetype : str
            'dat', 'cat' or 'info'; selects which file of the set is read.
        subtensor : slice or None
            None reads the whole tensor. A slice (step None or 1) selects a
            range along the first axis; only that range is read from disk.

        Returns
        -------
        numpy.ndarray
            The tensor stored in the file, shaped as listed in its header
            (with the first dimension reduced when `subtensor` is a slice).

        Raises
        ------
        NotImplementedError
            If `subtensor` is a slice with a step, or is neither None nor a
            slice.
        """

        assert which_set in ['train', 'test']
        assert filetype in ['dat', 'cat', 'info']

        def getPath(which_set):
            """Return the path of the file for `which_set` and `filetype`."""
            dirname = os.path.join(os.getenv('PYLEARN2_DATA_PATH'),
                                   'norb_small/original')
            if which_set == 'train':
                instance_list = '46789'
            elif which_set == 'test':
                instance_list = '01235'

            filename = 'smallnorb-5x%sx9x18x6x2x96x96-%s-%s.mat' % \
                (instance_list, which_set + 'ing', filetype)

            return os.path.join(dirname, filename)

        def parseNORBFile(file_handle, subtensor=None, debug=False):
            """
            Load all or part of file 'file_handle' into a numpy ndarray.

            :param file_handle: binary file to read from. It can be opened
              with open(), gzip.open() or bz2.BZ2File().
            :type file_handle: file-like object

            :param subtensor: None to read everything, or a slice over the
              first axis. For an uncompressed file the following two
              expressions return equivalent arrays, but the one on the left
              only reads the requested rows:

              read(file_handle, subtensor) <===> read(file_handle)[subtensor]

              Slices with a step, other index types, and any subtensor on a
              gzip/bz2 handle are not supported.

            :param debug: if true, log the header fields as they are read.
            """
            def readNums(file_handle, num_type, count):
                """
                Read `count` values of dtype `num_type` from the file and
                return them as a 1-D numpy array.
                """
                num_bytes = int(count) * numpy.dtype(num_type).itemsize
                raw = file_handle.read(num_bytes)
                # frombuffer replaces the deprecated binary mode of
                # fromstring; the copy keeps the result writable and
                # independent of the bytes object, as fromstring's was.
                return numpy.frombuffer(raw, dtype=num_type).copy()

            def readHeader(file_handle, debug=False, from_gzip=None):
                """
                Read the header at the current position of the file.

                :param file_handle: an open file handle.
                :type file_handle: a file, gzip.GzipFile or bz2.BZ2File

                :param from_gzip: whether the handle is a compressed file.
                :type from_gzip: bool, or None to detect it from the type
                  of `file_handle`.

                :returns: element dtype name, element size in bytes, shape
                """

                if from_gzip is None:
                    from_gzip = isinstance(file_handle,
                                           (gzip.GzipFile, bz2.BZ2File))

                key_to_type = {
                    0x1E3D4C51: ('float32', 4),
                    # what is a packed matrix?
                    # 0x1E3D4C52: ('packed matrix', 0),
                    0x1E3D4C53: ('float64', 8),
                    0x1E3D4C54: ('int32', 4),
                    0x1E3D4C55: ('uint8', 1),
                    0x1E3D4C56: ('int16', 2)
                }

                type_key = readNums(file_handle, 'int32', 1)[0]
                elem_type, elem_size = key_to_type[type_key]
                if debug:
                    logger.debug("header's type key, type, type size: "
                                 "{0} {1} {2}".format(type_key, elem_type,
                                                      elem_size))
                if elem_type == 'packed matrix':
                    raise NotImplementedError('packed matrix not supported')

                num_dims = readNums(file_handle, 'int32', 1)[0]
                if debug:
                    logger.debug('# of dimensions, according to header: '
                                 '{0}'.format(num_dims))

                # At least three shape entries are always consumed from the
                # file, even when the tensor has fewer dimensions.
                if from_gzip:
                    shape = readNums(file_handle, 'int32', max(num_dims,
                                                               3))[:num_dims]
                else:
                    shape = numpy.fromfile(file_handle,
                                           dtype='int32',
                                           count=max(num_dims, 3))[:num_dims]

                if debug:
                    logger.debug('Tensor shape, as listed in header: '
                                 '{0}'.format(shape))

                return elem_type, elem_size, shape

            elem_type, elem_size, shape = readHeader(file_handle, debug)
            beginning = file_handle.tell()

            num_elems = numpy.prod(shape)

            result = None
            if isinstance(file_handle, (gzip.GzipFile, bz2.BZ2File)):
                assert subtensor is None, \
                    "Subtensors on gzip files are not implemented."
                # readNums expects an element count; it converts to bytes
                # itself, so the count must not be scaled by elem_size here.
                result = readNums(file_handle, elem_type,
                                  num_elems).reshape(shape)
            elif subtensor is None:
                result = numpy.fromfile(file_handle,
                                        dtype=elem_type,
                                        count=num_elems).reshape(shape)
            elif isinstance(subtensor, slice):
                if subtensor.step not in (None, 1):
                    raise NotImplementedError('slice with step',
                                              subtensor.step)
                # slice.indices resolves None, negative and out-of-range
                # bounds against the number of rows in the file.
                start, stop, _ = subtensor.indices(int(shape[0]))
                if start != 0:
                    bytes_per_row = int(numpy.prod(shape[1:])) * elem_size
                    file_handle.seek(beginning + start * bytes_per_row)
                shape[0] = max(stop - start, 0)
                num_elems = numpy.prod(shape)
                result = numpy.fromfile(file_handle,
                                        dtype=elem_type,
                                        count=num_elems).reshape(shape)
            else:
                raise NotImplementedError('subtensor access not written yet:',
                                          subtensor)

            return result

        fname = getPath(which_set)
        fname = datasetCache.cache_file(fname)
        # The file holds raw binary data, so open it in binary mode; the
        # with-block closes the handle once the array has been read.
        with open(fname, 'rb') as file_handle:
            return parseNORBFile(file_handle, subtensor)