Example #1
    def __init__(self,
                 data=None,
                 data_specs=None,
                 rng=_default_seed,
                 preprocessor=None,
                 fit_preprocessor=False):
        # data_specs should be flat, and there should be no
        # duplicates in source, as we keep only one version
        assert is_flat_specs(data_specs)
        if isinstance(data_specs[1], tuple):
            assert sorted(set(data_specs[1])) == sorted(data_specs[1])
        space, source = data_specs
        space.np_validate(data)
        assert len(set(elem.shape[0] for elem in list(data))) <= 1
        self.data = data
        self.data_specs = data_specs
        self.num_examples = list(data)[0].shape[0]

        self.compress = False
        self.design_loc = None
        self.rng = make_np_rng(rng, which_method='random_integers')
        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('sequential')

        if preprocessor:
            preprocessor.apply(self, can_fit=fit_preprocessor)
        self.preprocessor = preprocessor
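Example #2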
    def __init__(self, data=None, data_specs=None, rng=_default_seed,
                 preprocessor=None, fit_preprocessor=False):
        # data_specs should be flat, and there should be no
        # duplicates in source, as we keep only one version
        assert is_flat_specs(data_specs)
        if isinstance(data_specs[1], tuple):
            assert sorted(set(data_specs[1])) == sorted(data_specs[1])
        space, source = data_specs
        space.np_validate(data)
        # TODO: assume that data[0] is num example => error if channel in c01b
        # assert len(set(elem.shape[0] for elem in list(data))) <= 1
        self.data = data
        self.data_specs = data_specs
        # TODO: assume that data[0] is num example => error if channel in c01b
        self.num_examples = list(data)[-1].shape[0] # TODO: list(data)[0].shape[0]

        self.compress = False
        self.design_loc = None
        self.rng = make_np_rng(rng, which_method='random_integers')
        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('sequential')

        if preprocessor:
            preprocessor.apply(self, can_fit=fit_preprocessor)
        self.preprocessor = preprocessor
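Example #3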
    def iterator(self,
                 mode=None,
                 batch_size=None,
                 num_batches=None,
                 rng=None,
                 data_specs=None,
                 return_tuple=False):
        """
        .. todo::

            WRITEME
        """
        # Build the right data_specs to query self.raw
        if data_specs is not None:
            assert is_flat_specs(data_specs)
            space, source = data_specs
            if not isinstance(source, tuple):
                source = (source, )
            if isinstance(space, CompositeSpace):
                space = tuple(space.components)
            else:
                space = (space, )

            # Put 'features' first, as this is what TransformerIterator
            # is expecting
            if 'features' not in source:
                # 'features' is not needed; get things directly from
                # the original data
                raw_data_specs = data_specs
            else:
                feature_idx = source.index('features')
                if self.space_preserving:
                    # Ask self.raw for the data in the expected space,
                    # and self.transformer will operate in that space
                    feature_input_space = space[feature_idx]
                else:
                    # We need to ask the transformer what its input space is
                    feature_input_space = self.transformer.get_input_space()

                raw_space = CompositeSpace((feature_input_space, ) +
                                           space[:feature_idx] +
                                           space[feature_idx + 1:])
                raw_source = (('features', ) + source[:feature_idx] +
                              source[feature_idx + 1:])
                raw_data_specs = (raw_space, raw_source)
        else:
            raw_data_specs = None

        raw_iterator = self.raw.iterator(mode=mode,
                                         batch_size=batch_size,
                                         num_batches=num_batches,
                                         rng=rng,
                                         data_specs=raw_data_specs,
                                         return_tuple=return_tuple)

        final_iterator = TransformerIterator(raw_iterator,
                                             self,
                                             data_specs=data_specs)

        return final_iterator
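
The reordering above can be shown with plain tuples (a minimal sketch; the strings stand in for the Space objects and source names that a real data_specs pair carries):

    # Stand-ins for (space, source) components; 'features' is moved first.
    source = ('targets', 'features', 'phones')
    space = ('targets_space', 'features_space', 'phones_space')
    feature_idx = source.index('features')
    raw_source = (('features',) + source[:feature_idx]
                  + source[feature_idx + 1:])
    raw_space = ((space[feature_idx],) + space[:feature_idx]
                 + space[feature_idx + 1:])
    print(raw_source)  # ('features', 'targets', 'phones')
    print(raw_space)   # ('features_space', 'targets_space', 'phones_space')

Example #4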
    def iterator(self, mode=None, batch_size=None, num_batches=None,
                 topo=None, targets=None, rng=None, data_specs=None,
                 return_tuple=False):
        """
        .. todo::

            WRITEME
        """
        # Build the right data_specs to query self.raw
        if data_specs is not None:
            assert is_flat_specs(data_specs)
            space, source = data_specs
            if not isinstance(source, tuple):
                source = (source,)
            if isinstance(space, CompositeSpace):
                space = tuple(space.components)
            else:
                space = (space,)

            # Put 'features' first, as this is what TransformerIterator
            # is expecting
            if 'features' not in source:
                # 'features' is not needed; get things directly from
                # the original data
                raw_data_specs = data_specs
            else:
                feature_idx = source.index('features')
                if self.space_preserving:
                    # Ask self.raw for the data in the expected space,
                    # and self.transformer will operate in that space
                    feature_input_space = space[feature_idx]
                else:
                    # We need to ask the transformer what its input space is
                    feature_input_space = self.transformer.get_input_space()

                raw_space = CompositeSpace((feature_input_space,)
                                           + space[:feature_idx]
                                           + space[feature_idx + 1:])
                raw_source = (('features',)
                              + source[:feature_idx]
                              + source[feature_idx + 1:])
                raw_data_specs = (raw_space, raw_source)
        else:
            raw_data_specs = None

        raw_iterator = self.raw.iterator(
            mode=mode, batch_size=batch_size,
            num_batches=num_batches, topo=topo, targets=targets, rng=rng,
            data_specs=raw_data_specs, return_tuple=return_tuple)

        final_iterator = TransformerIterator(raw_iterator, self,
                                             data_specs=data_specs)

        return final_iterator
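Example #5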
    def __init__(self,
                 data=None,
                 data_specs=None,
                 rng=_default_seed,
                 preprocessor=None,
                 fit_preprocessor=False):
        """
        Parameters
        ----------
        data : ndarray or tuple of ndarrays
            The data, formatted as specified in `data_specs`.
            For instance, if `data_specs` is (VectorSpace(nfeat), 'features'),
            then `data` has to be a 2-D ndarray of shape (num_examples,
            nfeat) that defines an unlabeled dataset. If `data_specs`
            is (CompositeSpace(Conv2DSpace(...), VectorSpace(1)),
            ('features', 'target')), then `data` has to be an (X, y) pair,
            with X an ndarray containing images stored in the topological
            view specified by the `Conv2DSpace`, and y a 2-D ndarray
            of width 1 containing the label or target of each example.

        data_specs : (space, source) pair
            `space` is an instance of `Space` (possibly a `CompositeSpace`)
            and `source` is a string (or tuple of strings, if `space` is a
            `CompositeSpace`) defining the format of and the labels
            associated with `data`.

        rng : object, optional
            A random number generator used for picking random
            indices into the design matrix when choosing minibatches.

        preprocessor : WRITEME

        fit_preprocessor : WRITEME
        """
        # data_specs should be flat, and there should be no
        # duplicates in source, as we keep only one version
        assert is_flat_specs(data_specs)
        if isinstance(data_specs[1], tuple):
            assert sorted(set(data_specs[1])) == sorted(data_specs[1])
        self.data = data
        self.data_specs = data_specs

        self.compress = False
        self.design_loc = None
        if hasattr(rng, 'random_integers'):
            self.rng = rng
        else:
            self.rng = np.random.RandomState(rng)
        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('sequential')

        if preprocessor:
            preprocessor.apply(self, can_fit=fit_preprocessor)
        self.preprocessor = preprocessor
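
A minimal usage sketch for a constructor like this one (hedged: `VectorSpacesDataset` is assumed here as the enclosing class, in the style of pylearn2, and `VectorSpace` comes from pylearn2.space):

    import numpy as np
    from pylearn2.space import VectorSpace

    nfeat = 20
    X = np.random.RandomState(0).randn(100, nfeat).astype('float32')
    # An unlabeled dataset: a 100 x 20 design matrix under source 'features'.
    # VectorSpacesDataset is a hypothetical stand-in for the class above.
    dataset = VectorSpacesDataset(
        data=X, data_specs=(VectorSpace(dim=nfeat), 'features'))

Example #6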
    def __init__(self, data=None, data_specs=None, rng=_default_seed,
                 preprocessor=None, fit_preprocessor=False):
        """
        Parameters
        ----------
        data : ndarray or tuple of ndarrays
            The data, formatted as specified in `data_specs`.
            For instance, if `data_specs` is (VectorSpace(nfeat), 'features'),
            then `data` has to be a 2-D ndarray of shape (num_examples,
            nfeat) that defines an unlabeled dataset. If `data_specs`
            is (CompositeSpace(Conv2DSpace(...), VectorSpace(1)),
            ('features', 'target')), then `data` has to be an (X, y) pair,
            with X an ndarray containing images stored in the topological
            view specified by the `Conv2DSpace`, and y a 2-D ndarray
            of width 1 containing the label or target of each example.

        data_specs : (space, source) pair
            `space` is an instance of `Space` (possibly a `CompositeSpace`)
            and `source` is a string (or tuple of strings, if `space` is a
            `CompositeSpace`) defining the format of and the labels
            associated with `data`.

        rng : object, optional
            A random number generator used for picking random
            indices into the design matrix when choosing minibatches.

        preprocessor : WRITEME

        fit_preprocessor : WRITEME
        """
        # data_specs should be flat, and there should be no
        # duplicates in source, as we keep only one version
        assert is_flat_specs(data_specs)
        if isinstance(data_specs[1], tuple):
            assert sorted(set(data_specs[1])) == sorted(data_specs[1])
        self.data = data
        self.data_specs = data_specs

        self.compress = False
        self.design_loc = None
        if hasattr(rng, 'random_integers'):
            self.rng = rng
        else:
            self.rng = np.random.RandomState(rng)
        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('sequential')

        if preprocessor:
            preprocessor.apply(self, can_fit=fit_preprocessor)
        self.preprocessor = preprocessor
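Example #7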
    def iterator(self, mode=None, batch_size=None, num_batches=None,
                 rng=None, data_specs=None,
                 return_tuple=False):
        """
        .. todo::

            WRITEME
        """
        # Build the right data_specs to query self.raw
        feature_idx = None
        if data_specs is not None:
            assert is_flat_specs(data_specs)
            space, source = data_specs

            if not isinstance(source, tuple):
                source = (source,)

            if isinstance(space, CompositeSpace):
                space = tuple(space.components)
            else:
                space = (space,)

            if self.transform_source not in source:
                # self.transform_source is not needed; get things
                # directly from the original data
                raw_data_specs = data_specs
            else:
                feature_idx = source.index(self.transform_source)
                feature_input_space = self.transformer.get_input_space()

                raw_space = CompositeSpace(space[:feature_idx]
                                           + (feature_input_space,)
                                           + space[feature_idx + 1:])
                raw_source = source
                raw_data_specs = (raw_space, raw_source)
        else:
            raw_data_specs = None

        raw_iterator = self.raw.iterator(
            mode=mode, batch_size=batch_size,
            num_batches=num_batches, rng=rng,
            data_specs=raw_data_specs, return_tuple=return_tuple)

        final_iterator = TransformerIterator(raw_iterator, self,
                                             feature_idx=feature_idx,
                                             data_specs=data_specs)

        return final_iterator
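Example #8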
    def __init__(self, data=None, data_specs=None, rng=_default_seed,
                 preprocessor=None, fit_preprocessor=False):
        # data_specs should be flat, and there should be no
        # duplicates in source, as we keep only one version
        assert is_flat_specs(data_specs)
        if isinstance(data_specs[1], tuple):
            assert sorted(set(data_specs[1])) == sorted(data_specs[1])
        self.data = data
        self.data_specs = data_specs

        self.compress = False
        self.design_loc = None
        self.rng = make_np_rng(rng, which_method='random_integers')
        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('sequential')

        if preprocessor:
            preprocessor.apply(self, can_fit=fit_preprocessor)
        self.preprocessor = preprocessor
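Example #9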
    def iterator(self, mode=None, batch_size=None, num_batches=None,
                 topo=None, targets=None, rng=None, data_specs=None,
                 return_tuple=False):

        # Build the right data_specs to query self.raw
        if data_specs is not None:
            assert is_flat_specs(data_specs)
            space, source = data_specs
            if not isinstance(source, tuple):
                source = (source,)
            if isinstance(space, CompositeSpace):
                space = tuple(space.components)
            else:
                space = (space,)

            # Put 'features' first, as this is what TransformerIterator
            # is expecting
            if 'features' not in source:
                # 'features' is not needed; get things directly from
                # the original data
                raw_data_specs = data_specs
            else:
                feature_idx = source.index('features')
                raw_space = CompositeSpace(
                                (self.transformer.get_input_space(),)
                                + space[:feature_idx]
                                + space[feature_idx + 1:])
                raw_source = (('features',)
                              + source[:feature_idx]
                              + source[feature_idx + 1:])
                raw_data_specs = (raw_space, raw_source)
        else:
            raw_data_specs = None

        raw_iterator = self.raw.iterator(mode=mode, batch_size=batch_size,
                num_batches=num_batches, topo=topo, targets=targets, rng=rng,
                data_specs=raw_data_specs, return_tuple=return_tuple)

        final_iterator = TransformerIterator(raw_iterator, self,
                                             data_specs=data_specs)

        return final_iterator
Example #10
    def __init__(self, dataset, subset_iterator, topo=None, targets=None,
                 data_specs=None, return_tuple=False):

        if topo is not None or targets is not None:
            if data_specs is not None:
                raise ValueError("In FiniteDatasetIterator, both "
                        "the `data_specs` argument and deprecated arguments "
                        "`topo` or `targets` were provided.",
                        (data_specs, topo, targets))

            warnings.warn("Usage of `topo` and `target` arguments are being "
                    "deprecated, and will be removed around November 7th, "
                    "2013. `data_specs` should be used instead.",
                    stacklevel=2)
            if topo is None:
                topo = False
            if targets is None:
                targets = False
            self._deprecated_interface = True
        else:
            self._deprecated_interface = False

        self._topo = topo
        self._targets = targets
        self._data_specs = data_specs
        self._dataset = dataset
        self._subset_iterator = subset_iterator
        self._return_tuple = return_tuple

        # TODO: More thought about how to handle things where this
        # fails (gigantic HDF5 files, etc.)
        if self._deprecated_interface:
            self._raw_data = self._dataset.get_design_matrix()
            if self._targets:
                self._raw_targets = self._dataset.get_targets()
                if self._raw_targets is None:
                    raise ValueError("Can't iterate with targets=True on a "
                                     "dataset object with no targets")
                self._targets_need_cast = (self._raw_targets.dtype !=
                                           np.dtype(config.floatX))
            self._needs_cast = self._raw_data.dtype != np.dtype(config.floatX)
        else:
            # Keep only the needed sources in self._raw_data.
            # Remember what source they correspond to in self._source
            assert is_flat_specs(data_specs)

            dataset_space, dataset_source = self._dataset.get_data_specs()
            assert is_flat_specs((dataset_space, dataset_source))

            # the dataset's data spec is either a single (space, source) pair,
            # or a pair of (non-nested CompositeSpace, non-nested tuple).
            # We could build a mapping and call flatten(..., return_tuple=True)
            # but simply putting spaces, sources and data in tuples is simpler.
            if not isinstance(dataset_source, tuple):
                dataset_source = (dataset_source,)

            if not isinstance(dataset_space, CompositeSpace):
                dataset_sub_spaces = (dataset_space,)
            else:
                dataset_sub_spaces = dataset_space.components
            assert len(dataset_source) == len(dataset_sub_spaces)

            all_data = self._dataset.get_data()
            if not isinstance(all_data, tuple):
                all_data = (all_data,)

            space, source = data_specs
            if not isinstance(source, tuple):
                source = (source,)
            if not isinstance(space, CompositeSpace):
                sub_spaces = (space,)
            else:
                sub_spaces = space.components
            assert len(source) == len(sub_spaces)

            self._raw_data = tuple(all_data[dataset_source.index(s)]
                                   for s in source)
            self._source = source

            self._convert = []

            for i, (so, sp) in enumerate(safe_zip(source, sub_spaces)):
                idx = dataset_source.index(so)
                dspace = dataset_sub_spaces[idx]

                # Compose the functions
                fn = None
                needs_cast = (self._raw_data[i].dtype !=
                              np.dtype(config.floatX))
                if needs_cast:
                    fn = lambda batch: np.cast[config.floatX](batch)

                needs_format = not sp == dspace
                if needs_format:
                    # "dspace" and "sp" have to be passed as parameters
                    # to lambda, in order to capture their current value,
                    # otherwise they would change in the next iteration
                    # of the loop.
                    if fn is None:
                        fn = (lambda batch, dspace=dspace, sp=sp:
                                dspace.np_format_as(batch, sp))
                    else:
                        fn = (lambda batch, dspace=dspace, sp=sp, fn_=fn:
                                dspace.np_format_as(fn_(batch), sp))

                self._convert.append(fn)
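
The recurring comment about passing "dspace" and "sp" as lambda parameters refers to Python's late-binding closures; a minimal, self-contained illustration of the pitfall and the default-argument fix:

    fns = [lambda: i for i in range(3)]
    print([f() for f in fns])    # [2, 2, 2] -- each closure sees the final i

    fns = [lambda i=i: i for i in range(3)]
    print([f() for f in fns])    # [0, 1, 2] -- defaults capture i's value now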
Example #11
    def __init__(self, dataset, subset_iterator, data_specs=None,
                 return_tuple=False, convert=None):
        self._data_specs = data_specs
        self._dataset = dataset
        self._subset_iterator = subset_iterator
        self._return_tuple = return_tuple

        # Keep only the needed sources in self._raw_data.
        # Remember what source they correspond to in self._source
        assert is_flat_specs(data_specs)

        dataset_space, dataset_source = self._dataset.get_data_specs()
        assert is_flat_specs((dataset_space, dataset_source))

        # the dataset's data spec is either a single (space, source) pair,
        # or a pair of (non-nested CompositeSpace, non-nested tuple).
        # We could build a mapping and call flatten(..., return_tuple=True)
        # but simply putting spaces, sources and data in tuples is simpler.
        if not isinstance(dataset_source, (tuple, list)):
            dataset_source = (dataset_source,)

        if not isinstance(dataset_space, CompositeSpace):
            dataset_sub_spaces = (dataset_space,)
        else:
            dataset_sub_spaces = dataset_space.components
        assert len(dataset_source) == len(dataset_sub_spaces)

        space, source = data_specs
        if not isinstance(source, tuple):
            source = (source,)
        if not isinstance(space, CompositeSpace):
            sub_spaces = (space,)
        else:
            sub_spaces = space.components
        assert len(source) == len(sub_spaces)

        # If `dataset` is incompatible with the new interface, fall back to the
        # old interface
        if not hasattr(self._dataset, 'get'):
            all_data = self._dataset.get_data()
            if not isinstance(all_data, tuple):
                all_data = (all_data,)
            raw_data = []
            for s in source:
                try:
                    raw_data.append(all_data[dataset_source.index(s)])
                except ValueError as e:
                    msg = str(e) + '\nThe dataset does not provide '\
                                   'a source with name: ' + s + '.'
                    reraise_as(ValueError(msg))
            self._raw_data = tuple(raw_data)

        self._source = source
        self._space = sub_spaces

        if convert is None:
            self._convert = [None for s in source]
        else:
            assert len(convert) == len(source)
            self._convert = convert

        for i, (so, sp) in enumerate(safe_izip(source, sub_spaces)):
            try:
                idx = dataset_source.index(so)
            except ValueError as e:
                msg = str(e) + '\nThe dataset does not provide '\
                               'a source with name: ' + so + '.'
                reraise_as(ValueError(msg))
            dspace = dataset_sub_spaces[idx]

            fn = self._convert[i]

            # If there is a fn, it is supposed to take care of the formatting,
            # and it should be an error if it does not. If there was no fn,
            # then the iterator will try to format using the generic
            # space-formatting functions.
            if fn is None:
                # "dspace" and "sp" have to be passed as parameters
                # to lambda, in order to capture their current value,
                # otherwise they would change in the next iteration
                # of the loop.
                fn = (lambda batch, dspace=dspace, sp=sp:
                      dspace.np_format_as(batch, sp))

            self._convert[i] = fn
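
A sketch of the generic space formatting such a fallback fn performs (assuming pylearn2 is available; the spaces and shapes here are illustrative only):

    import numpy as np
    from pylearn2.space import Conv2DSpace, VectorSpace

    # dspace: how the dataset stores batches; sp: what the caller requested.
    dspace = Conv2DSpace(shape=[8, 8], num_channels=1, axes=('b', 0, 1, 'c'))
    sp = VectorSpace(dim=64)
    batch = np.zeros((5, 8, 8, 1), dtype='float32')  # 5 topological examples
    flat = dspace.np_format_as(batch, sp)            # design matrix (5, 64)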
Example #12
    def __init__(self,
                 dataset,
                 subset_iterator,
                 data_specs=None,
                 return_tuple=False,
                 convert=None):
        """
        .. todo::

            WRITEME
        """

        self._data_specs = data_specs
        self._dataset = dataset
        self._subset_iterator = subset_iterator
        self._return_tuple = return_tuple

        assert is_flat_specs(data_specs)

        dataset_space, dataset_source = self._dataset.get_data_specs()
        assert is_flat_specs((dataset_space, dataset_source))

        # the dataset's data spec is either a single (space, source) pair,
        # or a pair of (non-nested CompositeSpace, non-nested tuple).
        # We could build a mapping and call flatten(..., return_tuple=True)
        # but simply putting spaces, sources and data in tuples is simpler.
        if not isinstance(dataset_source, tuple):
            dataset_source = (dataset_source, )

        if not isinstance(dataset_space, CompositeSpace):
            dataset_sub_spaces = (dataset_space, )
        else:
            dataset_sub_spaces = dataset_space.components
        assert len(dataset_source) == len(dataset_sub_spaces)

        space, source = data_specs
        if not isinstance(source, tuple):
            source = (source, )
        if not isinstance(space, CompositeSpace):
            sub_spaces = (space, )
        else:
            sub_spaces = space.components
        assert len(source) == len(sub_spaces)

        self._source = source

        if convert is None:
            self._convert = [None for s in source]
        else:
            assert len(convert) == len(source)
            self._convert = convert

        for i, (so, sp) in enumerate(safe_zip(source, sub_spaces)):
            idx = dataset_source.index(so)
            dspace = dataset_sub_spaces[idx]

            init_fn = self._convert[i]
            fn = init_fn

            # If there is an init_fn, it is supposed to take
            # care of the formatting, and it should be an error
            # if it does not. If there was no init_fn, then
            # the iterator will try to format using the generic
            # space-formatting functions.
            if init_fn is None:
                # "dspace" and "sp" have to be passed as parameters
                # to lambda, in order to capture their current value,
                # otherwise they would change in the next iteration
                # of the loop.
                if fn is None:
                    fn = (lambda batch, dspace=dspace, sp=sp:
                          dspace.np_format_as(batch, sp))
                else:
                    fn = (lambda batch, dspace=dspace, sp=sp, fn_=fn:
                          dspace.np_format_as(fn_(batch), sp))

            self._convert[i] = fn
Example #13
    def __init__(self,
                 dataset,
                 subset_iterator,
                 data_specs=None,
                 return_tuple=False,
                 convert=None):
        self._data_specs = data_specs
        self._dataset = dataset
        self._subset_iterator = subset_iterator
        self._return_tuple = return_tuple

        # Keep only the needed sources in self._raw_data.
        # Remember what source they correspond to in self._source
        assert is_flat_specs(data_specs)

        dataset_space, dataset_source = self._dataset.get_data_specs()
        assert is_flat_specs((dataset_space, dataset_source))

        # the dataset's data spec is either a single (space, source) pair,
        # or a pair of (non-nested CompositeSpace, non-nested tuple).
        # We could build a mapping and call flatten(..., return_tuple=True)
        # but simply putting spaces, sources and data in tuples is simpler.
        if not isinstance(dataset_source, (tuple, list)):
            dataset_source = (dataset_source, )

        if not isinstance(dataset_space, CompositeSpace):
            dataset_sub_spaces = (dataset_space, )
        else:
            dataset_sub_spaces = dataset_space.components
        assert len(dataset_source) == len(dataset_sub_spaces)

        space, source = data_specs
        if not isinstance(source, tuple):
            source = (source, )
        if not isinstance(space, CompositeSpace):
            sub_spaces = (space, )
        else:
            sub_spaces = space.components
        assert len(source) == len(sub_spaces)

        # If `dataset` is incompatible with the new interface, fall back to the
        # old interface
        if not hasattr(self._dataset, 'get'):
            all_data = self._dataset.get_data()
            if not isinstance(all_data, tuple):
                all_data = (all_data, )
            raw_data = []
            for s in source:
                try:
                    raw_data.append(all_data[dataset_source.index(s)])
                except ValueError as e:
                    msg = str(e) + '\nThe dataset does not provide '\
                                   'a source with name: ' + s + '.'
                    reraise_as(ValueError(msg))
            self._raw_data = tuple(raw_data)

        self._source = source
        self._space = sub_spaces

        if convert is None:
            self._convert = [None for s in source]
        else:
            assert len(convert) == len(source)
            self._convert = convert

        for i, (so, sp) in enumerate(safe_izip(source, sub_spaces)):
            try:
                idx = dataset_source.index(so)
            except ValueError as e:
                msg = str(e) + '\nThe dataset does not provide '\
                               'a source with name: ' + so + '.'
                reraise_as(ValueError(msg))
            dspace = dataset_sub_spaces[idx]

            fn = self._convert[i]

            # If there is a fn, it is supposed to take care of the formatting,
            # and it should be an error if it does not. If there was no fn,
            # then the iterator will try to format using the generic
            # space-formatting functions.
            if fn is None:
                # "dspace" and "sp" have to be passed as parameters
                # to lambda, in order to capture their current value,
                # otherwise they would change in the next iteration
                # of the loop.
                fn = (lambda batch, dspace=dspace, sp=sp:
                      dspace.np_format_as(batch, sp))

            self._convert[i] = fn
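Example #14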
    def __init__(self, dataset, subset_iterator, data_specs=None,
                 return_tuple=False, convert=None):
        self._data_specs = data_specs
        self._dataset = dataset
        self._subset_iterator = subset_iterator
        self._return_tuple = return_tuple

        assert is_flat_specs(data_specs)

        dataset_space, dataset_source = self._dataset.get_data_specs()
        assert is_flat_specs((dataset_space, dataset_source))

        if not isinstance(dataset_source, tuple):
            dataset_source = (dataset_source,)

        if not isinstance(dataset_space, CompositeSpace):
            dataset_sub_spaces = (dataset_space,)
        else:
            dataset_sub_spaces = dataset_space.components
        assert len(dataset_source) == len(dataset_sub_spaces)

        space, source = data_specs
        if not isinstance(source, tuple):
            source = (source,)
        if not isinstance(space, CompositeSpace):
            sub_spaces = (space,)
        else:
            sub_spaces = space.components
        assert len(source) == len(sub_spaces)

        self._source = source

        if convert is None:
            self._convert = [None for s in source]
        else:
            assert len(convert) == len(source)
            self._convert = convert

        for i, (so, sp) in enumerate(safe_izip(source, sub_spaces)):
            idx = dataset_source.index(so)
            dspace = dataset_sub_spaces[idx]

            init_fn = self._convert[i]
            fn = init_fn

            # If there is an init_fn, it is supposed to take
            # care of the formatting, and it should be an error
            # if it does not. If there was no init_fn, then
            # the iterator will try to format using the generic
            # space-formatting functions.
            if init_fn is None:
                # "dspace" and "sp" have to be passed as parameters
                # to lambda, in order to capture their current value,
                # otherwise they would change in the next iteration
                # of the loop.
                if fn is None:
                    fn = (lambda batch, dspace=dspace, sp=sp:
                          dspace.np_format_as(batch, sp))
                else:
                    fn = (lambda batch, dspace=dspace, sp=sp, fn_=fn:
                          dspace.np_format_as(fn_(batch), sp))

            self._convert[i] = fn
Example #15
    def __init__(self, dataset, subset_iterator, topo=None, targets=None,
                 data_specs=None, return_tuple=False):

        if topo is not None or targets is not None:
            if data_specs is not None:
                raise ValueError("In FiniteDatasetIterator, both "
                        "the `data_specs` argument and deprecated arguments "
                        "`topo` or `targets` were provided.",
                        (data_specs, topo, targets))

            warnings.warn("Usage of `topo` and `target` arguments are being "
                    "deprecated, and will be removed around November 7th, "
                    "2013. `data_specs` should be used instead.",
                    stacklevel=2)
            if topo is None:
                topo = False
            if targets is None:
                targets = False
            self._deprecated_interface = True
        else:
            self._deprecated_interface = False

        self._topo = topo
        self._targets = targets
        self._data_specs = data_specs
        self._dataset = dataset
        self._subset_iterator = subset_iterator
        self._return_tuple = return_tuple

        # TODO: More thought about how to handle things where this
        # fails (gigantic HDF5 files, etc.)
        if self._deprecated_interface:
            self._raw_data = self._dataset.get_design_matrix()
            if self._targets:
                self._raw_targets = self._dataset.get_targets()
                if self._raw_targets is None:
                    raise ValueError("Can't iterate with targets=True on a "
                                     "dataset object with no targets")
                self._targets_need_cast = (self._raw_targets.dtype !=
                                           np.dtype(config.floatX))
            self._needs_cast = self._raw_data.dtype != np.dtype(config.floatX)
        else:
            # Keep only the needed sources in self._raw_data.
            # Remember what source they correspond to in self._source
            assert is_flat_specs(data_specs)

            dataset_space, dataset_source = self._dataset.get_data_specs()
            assert is_flat_specs((dataset_space, dataset_source))

            # the dataset's data spec is either a single (space, source) pair,
            # or a pair of (non-nested CompositeSpace, non-nested tuple).
            # We could build a mapping and call flatten(..., return_tuple=True)
            # but simply putting spaces, sources and data in tuples is simpler.
            if not isinstance(dataset_source, tuple):
                dataset_source = (dataset_source,)

            if not isinstance(dataset_space, CompositeSpace):
                dataset_sub_spaces = (dataset_space,)
            else:
                dataset_sub_spaces = dataset_space.components
            assert len(dataset_source) == len(dataset_sub_spaces)

            all_data = self._dataset.get_data()
            if not isinstance(all_data, tuple):
                all_data = (all_data,)

            space, source = data_specs
            if not isinstance(source, tuple):
                source = (source,)
            if not isinstance(space, CompositeSpace):
                sub_spaces = (space,)
            else:
                sub_spaces = space.components
            assert len(source) == len(sub_spaces)

            self._raw_data = tuple(all_data[dataset_source.index(s)]
                                   for s in source)
            self._source = source

            self._convert = []

            for i, (so, sp) in enumerate(safe_zip(source, sub_spaces)):
                idx = dataset_source.index(so)
                dspace = dataset_sub_spaces[idx]

                # Compose the functions
                fn = None
                needs_cast = (self._raw_data[i].dtype !=
                              np.dtype(config.floatX))
                if needs_cast:
                    fn = lambda batch: np.cast[config.floatX](batch)

                needs_format = not sp == dspace
                if needs_format:
                    # "dspace" and "sp" have to be passed as parameters
                    # to lambda, in order to capture their current value,
                    # otherwise they would change in the next iteration
                    # of the loop.
                    if fn is None:
                        fn = (lambda batch, dspace=dspace, sp=sp:
                                dspace.np_format_as(batch, sp))
                    else:
                        # bind the current fn as a default argument too;
                        # otherwise the new lambda would call itself at
                        # batch time and recurse forever
                        fn = (lambda batch, dspace=dspace, sp=sp, fn_=fn:
                                dspace.np_format_as(fn_(batch), sp))

                self._convert.append(fn)
Example #16
    def __init__(self, dataset, subset_iterator, data_specs=None,
                 return_tuple=False, convert=None):
        self._data_specs = data_specs
        self._dataset = dataset
        self._subset_iterator = subset_iterator
        self._return_tuple = return_tuple

        # Keep only the needed sources in self._raw_data.
        # Remember what source they correspond to in self._source
        assert is_flat_specs(data_specs)

        dataset_space, dataset_source = self._dataset.get_data_specs()
        assert is_flat_specs((dataset_space, dataset_source))

        # the dataset's data spec is either a single (space, source) pair,
        # or a pair of (non-nested CompositeSpace, non-nested tuple).
        # We could build a mapping and call flatten(..., return_tuple=True)
        # but simply putting spaces, sources and data in tuples is simpler.
        if not isinstance(dataset_source, tuple):
            dataset_source = (dataset_source,)

        if not isinstance(dataset_space, CompositeSpace):
            dataset_sub_spaces = (dataset_space,)
        else:
            dataset_sub_spaces = dataset_space.components
        assert len(dataset_source) == len(dataset_sub_spaces)

        all_data = self._dataset.get_data()
        if not isinstance(all_data, tuple):
            all_data = (all_data,)

        space, source = data_specs
        if not isinstance(source, tuple):
            source = (source,)
        if not isinstance(space, CompositeSpace):
            sub_spaces = (space,)
        else:
            sub_spaces = space.components
        assert len(source) == len(sub_spaces)

        self._raw_data = tuple(all_data[dataset_source.index(s)]
                               for s in source)
        self._source = source

        if convert is None:
            self._convert = [None for s in source]
        else:
            assert len(convert) == len(source)
            self._convert = convert

        for i, (so, sp, dt) in enumerate(safe_izip(source,
                                                   sub_spaces,
                                                   self._raw_data)):
            idx = dataset_source.index(so)
            dspace = dataset_sub_spaces[idx]

            init_fn = self._convert[i]
            fn = init_fn

            # If there is an init_fn, it is supposed to take
            # care of the formatting, and it should be an error
            # if it does not. If there was no init_fn, then
            # the iterator will try to format using the generic
            # space-formatting functions.
            if init_fn is None:
                # "dspace" and "sp" have to be passed as parameters
                # to lambda, in order to capture their current value,
                # otherwise they would change in the next iteration
                # of the loop.
                if fn is None:
                    def fn(batch, dspace=dspace, sp=sp):
                        try:
                            return dspace.np_format_as(batch, sp)
                        except ValueError as e:
                            msg = str(e) + '\nMake sure that the model and '\
                                           'dataset have been initialized with '\
                                           'correct values.'
                            reraise_as(ValueError(msg))
                else:
                    fn = (lambda batch, dspace=dspace, sp=sp, fn_=fn:
                          dspace.np_format_as(fn_(batch), sp))

            self._convert[i] = fn
Example #17
    def __init__(self, dataset, subset_iterator, data_specs=None,
                 return_tuple=False, convert_fns=None):
        """
        convert_fns: function or tuple of functions, organized as
            in data_specs, to be applied to the raw batches of
            data. `None` can be used as a placeholder for the identity.
        """
        self._deprecated_interface = False
        if data_specs is None:
            raise TypeError("data_specs not provided")
        self._data_specs = data_specs
        self._dataset = dataset
        self._subset_iterator = subset_iterator
        self._return_tuple = return_tuple

        # Keep only the needed sources in self._raw_data.
        # Remember what source they correspond to in self._source
        assert is_flat_specs(data_specs)

        dataset_space, dataset_source = self._dataset.get_data_specs()
        assert is_flat_specs((dataset_space, dataset_source))

        # the dataset's data spec is either a single (space, source) pair,
        # or a pair of (non-nested CompositeSpace, non-nested tuple).
        # We could build a mapping and call flatten(..., return_tuple=True)
        # but simply putting spaces, sources and data in tuples is simpler.
        if not isinstance(dataset_source, tuple):
            dataset_source = (dataset_source,)

        if not isinstance(dataset_space, CompositeSpace):
            dataset_sub_spaces = (dataset_space,)
        else:
            dataset_sub_spaces = dataset_space.components
        assert len(dataset_source) == len(dataset_sub_spaces)

        all_data = self._dataset.get_data()
        if not isinstance(all_data, tuple):
            all_data = (all_data,)

        space, source = data_specs
        if not isinstance(source, tuple):
            source = (source,)
        if not isinstance(space, CompositeSpace):
            sub_spaces = (space,)
        else:
            sub_spaces = space.components
        assert len(source) == len(sub_spaces)

        self._raw_data = tuple(all_data[dataset_source.index(s)]
                               for s in source)
        self._source = source

        if convert_fns is None:
            self._convert = [None for s in source]
        else:
            if not isinstance(convert_fns, (list, tuple)):
                convert_fns = (convert_fns,)
            assert len(convert_fns) == len(source)
            self._convert = list(convert_fns)

        for i, (so, sp) in enumerate(safe_zip(source, sub_spaces)):
            idx = dataset_source.index(so)
            dspace = dataset_sub_spaces[idx]

            # Compose the functions
            fn = self._convert[i]
            needs_cast = not (self._raw_data[i][0].dtype == config.floatX)
            if needs_cast:
                if fn is None:
                    fn = lambda batch: np.cast[config.floatX](batch)
                else:
                    fn = (lambda batch, prev_fn=fn:
                          np.cast[config.floatX](prev_fn(batch)))

            needs_format = not sp == dspace
            if needs_format:
                # "dspace" and "sp" have to be passed as parameters
                # to lambda, in order to capture their current value,
                # otherwise they would change in the next iteration
                # of the loop.
                if fn is None:
                    fn = (lambda batch, dspace=dspace, sp=sp:
                          dspace.np_format_as(batch, sp))
                else:
                    fn = (lambda batch, dspace=dspace, sp=sp, fn_=fn:
                          dspace.np_format_as(fn_(batch), sp))

            self._convert[i] = fn
Example #18
    def __init__(self, dataset, subset_iterator, data_specs=None,
                 return_tuple=False, convert=None):
        self._data_specs = data_specs
        self._dataset = dataset
        self._subset_iterator = subset_iterator
        self._return_tuple = return_tuple

        # Keep only the needed sources in self._raw_data.
        # Remember what source they correspond to in self._source
        assert is_flat_specs(data_specs)

        dataset_space, dataset_source = self._dataset.get_data_specs()
        assert is_flat_specs((dataset_space, dataset_source))

        # the dataset's data spec is either a single (space, source) pair,
        # or a pair of (non-nested CompositeSpace, non-nested tuple).
        # We could build a mapping and call flatten(..., return_tuple=True)
        # but simply putting spaces, sources and data in tuples is simpler.
        if not isinstance(dataset_source, tuple):
            dataset_source = (dataset_source,)

        if not isinstance(dataset_space, CompositeSpace):
            dataset_sub_spaces = (dataset_space,)
        else:
            dataset_sub_spaces = dataset_space.components
        assert len(dataset_source) == len(dataset_sub_spaces)

        all_data = self._dataset.get_data()
        if not isinstance(all_data, tuple):
            all_data = (all_data,)

        space, source = data_specs
        if not isinstance(source, tuple):
            source = (source,)
        if not isinstance(space, CompositeSpace):
            sub_spaces = (space,)
        else:
            sub_spaces = space.components
        assert len(source) == len(sub_spaces)

        self._raw_data = ()
        for s in source:
            try:
                self._raw_data += (all_data[dataset_source.index(s)],)
            except ValueError as e:
                msg = str(e) + '\nThe dataset does not provide '\
                               'a source with name: ' + s + '.'
                reraise_as(ValueError(msg))

        self._source = source
        self._space = sub_spaces

        if convert is None:
            self._convert = [None for s in source]
        else:
            assert len(convert) == len(source)
            self._convert = convert

        for i, (so, sp, dt) in enumerate(safe_izip(source,
                                                   sub_spaces,
                                                   self._raw_data)):
            idx = dataset_source.index(so)
            dspace = dataset_sub_spaces[idx]

            init_fn = self._convert[i]
            fn = init_fn

            # If there is an init_fn, it is supposed to take
            # care of the formatting, and it should be an error
            # if it does not. If there was no init_fn, then
            # the iterator will try to format using the generic
            # space-formatting functions.
            if init_fn is None:
                # "dspace" and "sp" have to be passed as parameters
                # to lambda, in order to capture their current value,
                # otherwise they would change in the next iteration
                # of the loop.
                if fn is None:

                    def fn(batch, dspace=dspace, sp=sp):
                        try:
                            return dspace.np_format_as(batch, sp)
                        except ValueError as e:
                            msg = str(e) + '\nMake sure that the model and '\
                                           'dataset have been initialized with '\
                                           'correct values.'
                            reraise_as(ValueError(msg))
                else:
                    fn = (lambda batch, dspace=dspace, sp=sp, fn_=fn:
                          dspace.np_format_as(fn_(batch), sp))

            self._convert[i] = fn
Example #19
    def __init__(self,
                 dataset,
                 subset_iterator,
                 data_specs=None,
                 return_tuple=False,
                 convert_fns=None):
        """
        convert_fns: function or tuple of functions, organized as
            in data_specs, to be applied to the raw batches of
            data. `None` can be used as a placeholder for the identity.
        """
        self._deprecated_interface = False
        if data_specs is None:
            raise TypeError("data_specs not provided")
        self._data_specs = data_specs
        self._dataset = dataset
        self._subset_iterator = subset_iterator
        self._return_tuple = return_tuple

        # Keep only the needed sources in self._raw_data.
        # Remember what source they correspond to in self._source
        assert is_flat_specs(data_specs)

        dataset_space, dataset_source = self._dataset.get_data_specs()
        assert is_flat_specs((dataset_space, dataset_source))

        # the dataset's data spec is either a single (space, source) pair,
        # or a pair of (non-nested CompositeSpace, non-nested tuple).
        # We could build a mapping and call flatten(..., return_tuple=True)
        # but simply putting spaces, sources and data in tuples is simpler.
        if not isinstance(dataset_source, tuple):
            dataset_source = (dataset_source, )

        if not isinstance(dataset_space, CompositeSpace):
            dataset_sub_spaces = (dataset_space, )
        else:
            dataset_sub_spaces = dataset_space.components
        assert len(dataset_source) == len(dataset_sub_spaces)

        all_data = self._dataset.get_data()
        if not isinstance(all_data, tuple):
            all_data = (all_data, )

        space, source = data_specs
        if not isinstance(source, tuple):
            source = (source, )
        if not isinstance(space, CompositeSpace):
            sub_spaces = (space, )
        else:
            sub_spaces = space.components
        assert len(source) == len(sub_spaces)

        self._raw_data = tuple(all_data[dataset_source.index(s)]
                               for s in source)
        self._source = source

        if convert_fns is None:
            self._convert = [None for s in source]
        else:
            if not isinstance(convert_fns, (list, tuple)):
                convert_fns = (convert_fns, )
            assert len(convert_fns) == len(source)
            self._convert = list(convert_fns)

        for i, (so, sp) in enumerate(safe_zip(source, sub_spaces)):
            idx = dataset_source.index(so)
            dspace = dataset_sub_spaces[idx]

            # Compose the functions
            fn = self._convert[i]
            needs_cast = not (self._raw_data[i][0].dtype == config.floatX)
            if needs_cast:
                if fn is None:
                    fn = lambda batch: np.cast[config.floatX](batch)
                else:
                    fn = (lambda batch, prev_fn=fn:
                          np.cast[config.floatX](prev_fn(batch)))

            needs_format = not sp == dspace
            if needs_format:
                # "dspace" and "sp" have to be passed as parameters
                # to lambda, in order to capture their current value,
                # otherwise they would change in the next iteration
                # of the loop.
                if fn is None:
                    fn = (lambda batch, dspace=dspace, sp=sp:
                          dspace.np_format_as(batch, sp))
                else:
                    fn = (lambda batch, dspace=dspace, sp=sp, fn_=fn:
                          dspace.np_format_as(fn_(batch), sp))

            self._convert[i] = fn
Example #20
    def __init__(self, dataset, subset_iterator,
                 data_specs=None, return_tuple=False, convert=None):
        """
        .. todo::

            WRITEME
        """

        self._data_specs = data_specs
        self._dataset = dataset
        self._subset_iterator = subset_iterator
        self._return_tuple = return_tuple

        assert is_flat_specs(data_specs)

        dataset_space, dataset_source = self._dataset.get_data_specs()
        assert is_flat_specs((dataset_space, dataset_source))

        # the dataset's data spec is either a single (space, source) pair,
        # or a pair of (non-nested CompositeSpace, non-nested tuple).
        # We could build a mapping and call flatten(..., return_tuple=True)
        # but simply putting spaces, sources and data in tuples is simpler.
        if not isinstance(dataset_source, tuple):
            dataset_source = (dataset_source,)

        if not isinstance(dataset_space, CompositeSpace):
            dataset_sub_spaces = (dataset_space,)
        else:
            dataset_sub_spaces = dataset_space.components
        assert len(dataset_source) == len(dataset_sub_spaces)

        space, source = data_specs
        if not isinstance(source, tuple):
            source = (source,)
        if not isinstance(space, CompositeSpace):
            sub_spaces = (space,)
        else:
            sub_spaces = space.components
        assert len(source) == len(sub_spaces)

        self._source = source

        if convert is None:
            self._convert = [None for s in source]
        else:
            assert len(convert) == len(source)
            self._convert = convert

        dtypes = self._dataset.dtype_of(self._source)
        for i, (so, sp, dt) in enumerate(safe_zip(source, sub_spaces, dtypes)):
            idx = dataset_source.index(so)
            dspace = dataset_sub_spaces[idx]

            init_fn = self._convert[i]
            fn = init_fn
            # Compose the functions
            needs_cast = not (numpy.dtype(config.floatX) == dt)
            if needs_cast:
                if fn is None:
                    fn = lambda batch: numpy.cast[config.floatX](batch)
                else:
                    fn = (lambda batch, fn_=fn:
                          numpy.cast[config.floatX](fn_(batch)))

            # If there is an init_fn, it is supposed to take
            # care of the formatting, and it should be an error
            # if it does not. If there was no init_fn, then
            # the iterator will try to format using the generic
            # space-formatting functions.
            needs_format = not init_fn and not sp == dspace
            if needs_format:
                # "dspace" and "sp" have to be passed as parameters
                # to lambda, in order to capture their current value,
                # otherwise they would change in the next iteration
                # of the loop.
                if fn is None:
                    fn = (lambda batch, dspace=dspace, sp=sp:
                          dspace.np_format_as(batch, sp))
                else:
                    fn = (lambda batch, dspace=dspace, sp=sp, fn_=fn:
                          dspace.np_format_as(fn_(batch), sp))

            self._convert[i] = fn
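
Across these examples, the cast-then-format logic is built by folding one-argument functions together, with default arguments capturing the previous function; a standalone sketch of that composition pattern:

    def compose_steps(steps):
        """Compose a list of one-argument functions, applied left to right."""
        fn = None
        for step in steps:
            if fn is None:
                fn = step
            else:
                # fn_ and step are bound as defaults to freeze their values
                fn = (lambda batch, fn_=fn, step=step: step(fn_(batch)))
        return fn

    pipeline = compose_steps([lambda b: b * 2, lambda b: b + 1])
    print(pipeline(3))  # (3 * 2) + 1 = 7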