def __init__(self, data=None, data_specs=None, rng=_default_seed,
             preprocessor=None, fit_preprocessor=False):
    """
    Validate and store `data` together with its `data_specs`.

    Parameters
    ----------
    data : object
        The data to store. It is validated by the space found in
        `data_specs`, and all the elements obtained by iterating over
        it must share the same `shape[0]`.
    data_specs : tuple
        A flat (space, source) pair. When `source` is a tuple it must
        not contain duplicates.
    rng : object, optional
        Handed to `make_np_rng` to build `self.rng`.
    preprocessor : object, optional
        If truthy, its `apply` method is called on this dataset.
    fit_preprocessor : bool, optional
        Forwarded to `preprocessor.apply` as `can_fit`.
    """
    # Only one version of each source is kept, hence the requirement
    # for flat specs without duplicated source names.
    assert is_flat_specs(data_specs)
    source_names = data_specs[1]
    if isinstance(source_names, tuple):
        assert sorted(set(source_names)) == sorted(source_names)

    space, _ = data_specs
    space.np_validate(data)

    # Every element has to agree on the size of its first axis.
    first_axis_sizes = {elem.shape[0] for elem in list(data)}
    assert len(first_axis_sizes) <= 1

    self.data = data
    self.data_specs = data_specs
    self.num_examples = list(data)[0].shape[0]

    self.compress = False
    self.design_loc = None
    self.rng = make_np_rng(rng, which_method='random_integers')

    # Iterators are sequential unless told otherwise.
    self._iter_mode = resolve_iterator_class('sequential')

    # The preprocessor runs before it is recorded on the instance.
    if preprocessor:
        preprocessor.apply(self, can_fit=fit_preprocessor)
    self.preprocessor = preprocessor
def __init__(self, data=None, data_specs=None, rng=_default_seed,
             preprocessor=None, fit_preprocessor=False):
    """
    Validate and store `data` together with its `data_specs`.

    Parameters
    ----------
    data : object
        The data to store. It is validated by the space found in
        `data_specs`. `num_examples` is read from `shape[0]` of the
        last element obtained by iterating over it.
    data_specs : tuple
        A flat (space, source) pair. When `source` is a tuple it must
        not contain duplicates.
    rng : object, optional
        Handed to `make_np_rng` to build `self.rng`.
    preprocessor : object, optional
        If truthy, its `apply` method is called on this dataset.
    fit_preprocessor : bool, optional
        Forwarded to `preprocessor.apply` as `can_fit`.
    """
    # Only one version of each source is kept, hence the requirement
    # for flat specs without duplicated source names.
    assert is_flat_specs(data_specs)
    source_names = data_specs[1]
    if isinstance(source_names, tuple):
        assert sorted(set(source_names)) == sorted(source_names)

    space, _ = data_specs
    space.np_validate(data)

    # TODO: assume that data[0] is num example => error if channel in
    # c01b. For that reason the check that every element shares the
    # same shape[0] is currently disabled.
    self.data = data
    self.data_specs = data_specs

    # TODO: assume that data[0] is num example => error if channel in
    # c01b. The last element is used for now; should eventually be
    # list(data)[0].shape[0].
    self.num_examples = list(data)[-1].shape[0]

    self.compress = False
    self.design_loc = None
    self.rng = make_np_rng(rng, which_method='random_integers')

    # Iterators are sequential unless told otherwise.
    self._iter_mode = resolve_iterator_class('sequential')

    # The preprocessor runs before it is recorded on the instance.
    if preprocessor:
        preprocessor.apply(self, can_fit=fit_preprocessor)
    self.preprocessor = preprocessor
def iterator(self, mode=None, batch_size=None, num_batches=None,
             rng=None, data_specs=None, return_tuple=False):
    """
    Return a `TransformerIterator` wrapping an iterator over `self.raw`.

    Parameters
    ----------
    mode, batch_size, num_batches, rng, return_tuple
        Forwarded unchanged to `self.raw.iterator`.
    data_specs : tuple, optional
        A flat (space, source) pair describing what the returned
        iterator should provide. When None, None is also passed to
        `self.raw.iterator`.

    Returns
    -------
    TransformerIterator
        Built from the raw iterator, `self` and `data_specs`.
    """
    # Work out which data_specs to request from self.raw.
    raw_data_specs = None
    if data_specs is not None:
        assert is_flat_specs(data_specs)
        space, source = data_specs

        sources = source if isinstance(source, tuple) else (source, )
        if isinstance(space, CompositeSpace):
            spaces = tuple(space.components)
        else:
            spaces = (space, )

        if 'features' in sources:
            # TransformerIterator expects 'features' first, so move it
            # to the front of both the space and the source tuples.
            pos = sources.index('features')
            if self.space_preserving:
                # self.raw delivers the features in the requested
                # space and the transformer operates in that space.
                features_space = spaces[pos]
            else:
                # The transformer decides what its input looks like.
                features_space = self.transformer.get_input_space()

            raw_space = CompositeSpace(
                (features_space, ) + spaces[:pos] + spaces[pos + 1:])
            raw_source = ('features', ) + sources[:pos] + sources[pos + 1:]
            raw_data_specs = (raw_space, raw_source)
        else:
            # 'features' is not requested: query the raw dataset as is.
            raw_data_specs = data_specs

    raw_iterator = self.raw.iterator(
        mode=mode,
        batch_size=batch_size,
        num_batches=num_batches,
        rng=rng,
        data_specs=raw_data_specs,
        return_tuple=return_tuple)

    return TransformerIterator(raw_iterator, self, data_specs=data_specs)
def iterator(self, mode=None, batch_size=None, num_batches=None,
             topo=None, targets=None, rng=None, data_specs=None,
             return_tuple=False):
    """
    Return a `TransformerIterator` wrapping an iterator over `self.raw`.

    Parameters
    ----------
    mode, batch_size, num_batches, topo, targets, rng, return_tuple
        Forwarded unchanged to `self.raw.iterator`.
    data_specs : tuple, optional
        A flat (space, source) pair describing what the returned
        iterator should provide. When None, None is also passed to
        `self.raw.iterator`.

    Returns
    -------
    TransformerIterator
        Built from the raw iterator, `self` and `data_specs`.
    """
    # Work out which data_specs to request from self.raw.
    raw_data_specs = None
    if data_specs is not None:
        assert is_flat_specs(data_specs)
        space, source = data_specs

        sources = source if isinstance(source, tuple) else (source,)
        if isinstance(space, CompositeSpace):
            spaces = tuple(space.components)
        else:
            spaces = (space,)

        if 'features' in sources:
            # TransformerIterator expects 'features' first, so move it
            # to the front of both the space and the source tuples.
            pos = sources.index('features')
            if self.space_preserving:
                # self.raw delivers the features in the requested
                # space and the transformer operates in that space.
                features_space = spaces[pos]
            else:
                # The transformer decides what its input looks like.
                features_space = self.transformer.get_input_space()

            raw_space = CompositeSpace(
                (features_space,) + spaces[:pos] + spaces[pos + 1:])
            raw_source = ('features',) + sources[:pos] + sources[pos + 1:]
            raw_data_specs = (raw_space, raw_source)
        else:
            # 'features' is not requested: query the raw dataset as is.
            raw_data_specs = data_specs

    raw_iterator = self.raw.iterator(
        mode=mode,
        batch_size=batch_size,
        num_batches=num_batches,
        topo=topo,
        targets=targets,
        rng=rng,
        data_specs=raw_data_specs,
        return_tuple=return_tuple)

    return TransformerIterator(raw_iterator, self, data_specs=data_specs)
def __init__(self, data=None, data_specs=None, rng=_default_seed,
             preprocessor=None, fit_preprocessor=False):
    """
    Parameters
    ----------
    data : ndarray, or tuple of ndarrays
        The data, formatted as specified in `data_specs`.
        For instance, if `data_specs` is (VectorSpace(nfeat), 'features'),
        then `data` has to be a 2-d ndarray, of shape (nb examples,
        nfeat), that defines an unlabeled dataset. If `data_specs`
        is (CompositeSpace(Conv2DSpace(...), VectorSpace(1)),
        ('features', 'target')), then `data` has to be an (X, y) pair,
        with X being an ndarray containing images stored in the
        topological view specified by the `Conv2DSpace`, and y being a
        2-D ndarray of width 1, containing the labels or targets for
        each example.
    data_specs : tuple
        A (space, source) pair, where space is an instance of `Space`
        (possibly a `CompositeSpace`), and `source` is a string (or
        tuple of strings, if `space` is a `CompositeSpace`), defining
        the format and labels associated to `data`. It has to be flat,
        and a tuple `source` must not contain duplicates.
    rng : object, optional
        A random number generator used for picking random indices into
        the design matrix when choosing minibatches. Anything without
        a `random_integers` attribute is instead passed to
        `np.random.RandomState` as a seed.
    preprocessor : object, optional
        If truthy, its `apply` method is called on this dataset.
    fit_preprocessor : bool, optional
        Forwarded to `preprocessor.apply` as `can_fit`.
    """
    # Only one version of each source is kept, hence the requirement
    # for flat specs without duplicated source names.
    assert is_flat_specs(data_specs)
    source_names = data_specs[1]
    if isinstance(source_names, tuple):
        assert sorted(set(source_names)) == sorted(source_names)

    self.data = data
    self.data_specs = data_specs
    self.compress = False
    self.design_loc = None

    # Reuse `rng` when it already exposes `random_integers`; otherwise
    # build a RandomState from it.
    if not hasattr(rng, 'random_integers'):
        self.rng = np.random.RandomState(rng)
    else:
        self.rng = rng

    # Iterators are sequential unless told otherwise.
    self._iter_mode = resolve_iterator_class('sequential')

    # The preprocessor runs before it is recorded on the instance.
    if preprocessor:
        preprocessor.apply(self, can_fit=fit_preprocessor)
    self.preprocessor = preprocessor
def __init__(self, data=None, data_specs=None, rng=_default_seed,
             preprocessor=None, fit_preprocessor=False):
    """
    Parameters
    ----------
    data : ndarray, or tuple of ndarrays
        The data, formatted as specified in `data_specs`.
        For instance, if `data_specs` is (VectorSpace(nfeat), 'features'),
        then `data` has to be a 2-d ndarray, of shape (nb examples,
        nfeat), that defines an unlabeled dataset. If `data_specs`
        is (CompositeSpace(Conv2DSpace(...), VectorSpace(1)),
        ('features', 'target')), then `data` has to be an (X, y) pair,
        with X being an ndarray containing images stored in the
        topological view specified by the `Conv2DSpace`, and y being a
        2-D ndarray of width 1, containing the labels or targets for
        each example.
    data_specs : tuple
        A (space, source) pair, where space is an instance of `Space`
        (possibly a `CompositeSpace`), and `source` is a string (or
        tuple of strings, if `space` is a `CompositeSpace`), defining
        the format and labels associated to `data`. It has to be flat,
        and a tuple `source` must not contain duplicates.
    rng : object, optional
        A random number generator used for picking random indices into
        the design matrix when choosing minibatches. Anything without
        a `random_integers` attribute is instead passed to
        `np.random.RandomState` as a seed.
    preprocessor : object, optional
        If truthy, its `apply` method is called on this dataset.
    fit_preprocessor : bool, optional
        Forwarded to `preprocessor.apply` as `can_fit`.
    """
    # Only one version of each source is kept, hence the requirement
    # for flat specs without duplicated source names.
    assert is_flat_specs(data_specs)
    source_names = data_specs[1]
    if isinstance(source_names, tuple):
        assert sorted(set(source_names)) == sorted(source_names)

    self.data = data
    self.data_specs = data_specs
    self.compress = False
    self.design_loc = None

    # Reuse `rng` when it already exposes `random_integers`; otherwise
    # build a RandomState from it.
    if not hasattr(rng, 'random_integers'):
        self.rng = np.random.RandomState(rng)
    else:
        self.rng = rng

    # Iterators are sequential unless told otherwise.
    self._iter_mode = resolve_iterator_class('sequential')

    # The preprocessor runs before it is recorded on the instance.
    if preprocessor:
        preprocessor.apply(self, can_fit=fit_preprocessor)
    self.preprocessor = preprocessor
def iterator(self, mode=None, batch_size=None, num_batches=None,
             rng=None, data_specs=None, return_tuple=False):
    """
    Return a `TransformerIterator` wrapping an iterator over `self.raw`.

    Parameters
    ----------
    mode, batch_size, num_batches, rng, return_tuple
        Forwarded unchanged to `self.raw.iterator`.
    data_specs : tuple, optional
        A flat (space, source) pair describing what the returned
        iterator should provide. When None, None is also passed to
        `self.raw.iterator`.

    Returns
    -------
    TransformerIterator
        Built from the raw iterator, `self`, the position of
        `self.transform_source` in the requested sources (None when
        it is not requested) and `data_specs`.
    """
    # Work out which data_specs to request from self.raw.
    feature_idx = None
    raw_data_specs = None
    if data_specs is not None:
        assert is_flat_specs(data_specs)
        space, source = data_specs

        sources = source if isinstance(source, tuple) else (source,)
        if isinstance(space, CompositeSpace):
            spaces = tuple(space.components)
        else:
            spaces = (space,)

        if self.transform_source in sources:
            # Swap, in place, the requested space of the transformed
            # source for the input space of the transformer.
            feature_idx = sources.index(self.transform_source)
            input_space = self.transformer.get_input_space()
            raw_space = CompositeSpace(
                spaces[:feature_idx]
                + (input_space,)
                + spaces[feature_idx + 1:])
            raw_data_specs = (raw_space, sources)
        else:
            # The transformed source is not requested: query the raw
            # dataset as is.
            raw_data_specs = data_specs

    raw_iterator = self.raw.iterator(
        mode=mode,
        batch_size=batch_size,
        num_batches=num_batches,
        rng=rng,
        data_specs=raw_data_specs,
        return_tuple=return_tuple)

    return TransformerIterator(raw_iterator, self,
                               feature_idx=feature_idx,
                               data_specs=data_specs)
def __init__(self, data=None, data_specs=None, rng=_default_seed,
             preprocessor=None, fit_preprocessor=False):
    """
    Store `data` together with its `data_specs`.

    Parameters
    ----------
    data : object
        Stored as `self.data` without further checks.
    data_specs : tuple
        A flat (space, source) pair. When `source` is a tuple it must
        not contain duplicates.
    rng : object, optional
        Handed to `make_np_rng` to build `self.rng`.
    preprocessor : object, optional
        If truthy, its `apply` method is called on this dataset.
    fit_preprocessor : bool, optional
        Forwarded to `preprocessor.apply` as `can_fit`.
    """
    # Only one version of each source is kept, hence the requirement
    # for flat specs without duplicated source names.
    assert is_flat_specs(data_specs)
    source_names = data_specs[1]
    if isinstance(source_names, tuple):
        assert sorted(set(source_names)) == sorted(source_names)

    self.data = data
    self.data_specs = data_specs
    self.compress = False
    self.design_loc = None
    self.rng = make_np_rng(rng, which_method='random_integers')

    # Iterators are sequential unless told otherwise.
    self._iter_mode = resolve_iterator_class('sequential')

    # The preprocessor runs before it is recorded on the instance.
    if preprocessor:
        preprocessor.apply(self, can_fit=fit_preprocessor)
    self.preprocessor = preprocessor
def iterator(self, mode=None, batch_size=None, num_batches=None,
             topo=None, targets=None, rng=None, data_specs=None,
             return_tuple=False):
    """
    Return a `TransformerIterator` wrapping an iterator over `self.raw`.

    Parameters
    ----------
    mode, batch_size, num_batches, topo, targets, rng, return_tuple
        Forwarded unchanged to `self.raw.iterator`.
    data_specs : tuple, optional
        A flat (space, source) pair describing what the returned
        iterator should provide. When None, None is also passed to
        `self.raw.iterator`.

    Returns
    -------
    TransformerIterator
        Built from the raw iterator, `self` and `data_specs`.
    """
    # Work out which data_specs to request from self.raw.
    raw_data_specs = None
    if data_specs is not None:
        assert is_flat_specs(data_specs)
        space, source = data_specs

        sources = source if isinstance(source, tuple) else (source,)
        if isinstance(space, CompositeSpace):
            spaces = tuple(space.components)
        else:
            spaces = (space,)

        if 'features' in sources:
            # TransformerIterator expects 'features' first, in the
            # input space of the transformer.
            pos = sources.index('features')
            raw_space = CompositeSpace(
                (self.transformer.get_input_space(),)
                + spaces[:pos]
                + spaces[pos + 1:])
            raw_source = ('features',) + sources[:pos] + sources[pos + 1:]
            raw_data_specs = (raw_space, raw_source)
        else:
            # 'features' is not requested: query the raw dataset as is.
            raw_data_specs = data_specs

    raw_iterator = self.raw.iterator(
        mode=mode,
        batch_size=batch_size,
        num_batches=num_batches,
        topo=topo,
        targets=targets,
        rng=rng,
        data_specs=raw_data_specs,
        return_tuple=return_tuple)

    return TransformerIterator(raw_iterator, self, data_specs=data_specs)
def __init__(self, dataset, subset_iterator, topo=None, targets=None,
             data_specs=None, return_tuple=False):
    """
    Prepare iteration over `dataset`.

    Two mutually exclusive interfaces are supported: the deprecated
    `topo` / `targets` flags, and `data_specs`.

    Parameters
    ----------
    dataset : object
        With the deprecated interface it has to provide
        `get_design_matrix` (and `get_targets` when `targets` is
        true). Otherwise it has to provide `get_data_specs` and
        `get_data`.
    subset_iterator : object
        Stored as `self._subset_iterator`.
    topo : bool, optional
        Deprecated. Stored as `self._topo`; defaults to False when
        only `targets` is given.
    targets : bool, optional
        Deprecated. When true, the targets of the dataset are fetched
        as well. Defaults to False when only `topo` is given.
    data_specs : tuple, optional
        A flat (space, source) pair selecting the sources to iterate
        over and the spaces to format them in.
    return_tuple : bool, optional
        Stored as `self._return_tuple`.

    Raises
    ------
    ValueError
        If `data_specs` is given together with `topo` or `targets`, or
        if `targets` is true and the dataset has no targets.
    """
    if topo is not None or targets is not None:
        if data_specs is not None:
            raise ValueError("In FiniteDatasetIterator, both "
                             "the `data_specs` argument and deprecated arguments "
                             "`topo` or `targets` were provided.",
                             (data_specs, topo, targets))

        warnings.warn("Usage of `topo` and `target` arguments are being "
                      "deprecated, and will be removed around November 7th, "
                      "2013. `data_specs` should be used instead.",
                      stacklevel=2)
        # Only fill in the flag the caller left unspecified; an explicit
        # value (e.g. targets=True) must be kept.
        if topo is None:
            topo = False
        if targets is None:
            targets = False
        self._deprecated_interface = True
    else:
        self._deprecated_interface = False

    self._topo = topo
    self._targets = targets
    self._data_specs = data_specs
    self._dataset = dataset
    self._subset_iterator = subset_iterator
    self._return_tuple = return_tuple

    # TODO: More thought about how to handle things where this
    # fails (gigantic HDF5 files, etc.)
    if self._deprecated_interface:
        self._raw_data = self._dataset.get_design_matrix()
        if self._targets:
            self._raw_targets = self._dataset.get_targets()
            if self._raw_targets is None:
                raise ValueError("Can't iterate with targets=True on a "
                                 "dataset object with no targets")
            self._targets_need_cast = (
                not np.dtype(config.floatX) == self._raw_targets.dtype)
        self._needs_cast = (
            not np.dtype(config.floatX) == self._raw_data.dtype)
    else:
        # Keep only the needed sources in self._raw_data.
        # Remember what source they correspond to in self._source
        assert is_flat_specs(data_specs)

        dataset_space, dataset_source = self._dataset.get_data_specs()
        assert is_flat_specs((dataset_space, dataset_source))

        # the dataset's data spec is either a single (space, source) pair,
        # or a pair of (non-nested CompositeSpace, non-nested tuple).
        # We could build a mapping and call flatten(..., return_tuple=True)
        # but simply putting spaces, sources and data in tuples is simpler.
        if not isinstance(dataset_source, tuple):
            dataset_source = (dataset_source,)

        if not isinstance(dataset_space, CompositeSpace):
            dataset_sub_spaces = (dataset_space,)
        else:
            dataset_sub_spaces = dataset_space.components
        assert len(dataset_source) == len(dataset_sub_spaces)

        all_data = self._dataset.get_data()
        if not isinstance(all_data, tuple):
            all_data = (all_data,)

        space, source = data_specs
        if not isinstance(source, tuple):
            source = (source,)
        if not isinstance(space, CompositeSpace):
            sub_spaces = (space,)
        else:
            sub_spaces = space.components
        assert len(source) == len(sub_spaces)

        self._raw_data = tuple(all_data[dataset_source.index(s)]
                               for s in source)
        self._source = source

        # One conversion function (or None) per requested source.
        self._convert = []
        for i, (so, sp) in enumerate(safe_zip(source, sub_spaces)):
            idx = dataset_source.index(so)
            dspace = dataset_sub_spaces[idx]

            # Compose the functions
            fn = None
            needs_cast = (
                not np.dtype(config.floatX) == self._raw_data[i].dtype)
            if needs_cast:
                fn = lambda batch: numpy.cast[config.floatX](batch)

            needs_format = not sp == dspace
            if needs_format:
                # "dspace", "sp" and the previous "fn" have to be passed
                # as default parameters to the lambda, in order to
                # capture their current value, otherwise they would
                # change in the next iteration of the loop.
                if fn is None:
                    fn = (lambda batch, dspace=dspace, sp=sp:
                          dspace.np_format_as(batch, sp))
                else:
                    fn = (lambda batch, dspace=dspace, sp=sp, fn_=fn:
                          dspace.np_format_as(fn_(batch), sp))

            self._convert.append(fn)
def __init__(self, dataset, subset_iterator, data_specs=None,
             return_tuple=False, convert=None):
    """
    Prepare iteration over the sources of `dataset` named in `data_specs`.

    Parameters
    ----------
    dataset : object
        Has to provide `get_data_specs`. When it has no `get`
        attribute, `get_data` is used to fetch the raw data up front.
    subset_iterator : object
        Stored as `self._subset_iterator`.
    data_specs : tuple
        A flat (space, source) pair selecting the sources to iterate
        over and the spaces to format them in.
    return_tuple : bool, optional
        Stored as `self._return_tuple`.
    convert : sequence of callables or None, optional
        One entry per requested source. A callable is used as is and
        is expected to take care of the formatting; a None entry is
        replaced by a function that formats batches from the space of
        the dataset to the requested one. The sequence is copied, the
        argument of the caller is never modified.

    Raises
    ------
    ValueError
        If a requested source is not provided by the dataset.
    """
    self._data_specs = data_specs
    self._dataset = dataset
    self._subset_iterator = subset_iterator
    self._return_tuple = return_tuple

    # Keep only the needed sources in self._raw_data.
    # Remember what source they correspond to in self._source
    assert is_flat_specs(data_specs)

    dataset_space, dataset_source = self._dataset.get_data_specs()
    assert is_flat_specs((dataset_space, dataset_source))

    # the dataset's data spec is either a single (space, source) pair,
    # or a pair of (non-nested CompositeSpace, non-nested tuple).
    # We could build a mapping and call flatten(..., return_tuple=True)
    # but simply putting spaces, sources and data in tuples is simpler.
    if not isinstance(dataset_source, (tuple, list)):
        dataset_source = (dataset_source,)

    if not isinstance(dataset_space, CompositeSpace):
        dataset_sub_spaces = (dataset_space,)
    else:
        dataset_sub_spaces = dataset_space.components
    assert len(dataset_source) == len(dataset_sub_spaces)

    space, source = data_specs
    if not isinstance(source, tuple):
        source = (source,)
    if not isinstance(space, CompositeSpace):
        sub_spaces = (space,)
    else:
        sub_spaces = space.components
    assert len(source) == len(sub_spaces)

    # If `dataset` is incompatible with the new interface, fall back to the
    # old interface
    if not hasattr(self._dataset, 'get'):
        all_data = self._dataset.get_data()
        if not isinstance(all_data, tuple):
            all_data = (all_data,)

        raw_data = []
        for s in source:
            try:
                raw_data.append(all_data[dataset_source.index(s)])
            except ValueError as e:
                msg = (str(e) + '\nThe dataset does not provide '
                       'a source with name: ' + s + '.')
                reraise_as(ValueError(msg))
        self._raw_data = tuple(raw_data)

    self._source = source
    self._space = sub_spaces

    if convert is None:
        self._convert = [None for s in source]
    else:
        assert len(convert) == len(source)
        # Copy: entries are overwritten below, which must neither alter
        # the caller's sequence nor fail when it is a tuple.
        self._convert = list(convert)

    for i, (so, sp) in enumerate(safe_izip(source, sub_spaces)):
        try:
            idx = dataset_source.index(so)
        except ValueError as e:
            msg = (str(e) + '\nThe dataset does not provide '
                   'a source with name: ' + so + '.')
            reraise_as(ValueError(msg))
        dspace = dataset_sub_spaces[idx]

        fn = self._convert[i]
        # If there is a fn, it is supposed to take care of the formatting,
        # and it should be an error if it does not. If there was no fn,
        # then the iterator will try to format using the generic
        # space-formatting functions.
        if fn is None:
            # "dspace" and "sp" have to be passed as parameters
            # to lambda, in order to capture their current value,
            # otherwise they would change in the next iteration
            # of the loop.
            fn = (lambda batch, dspace=dspace, sp=sp:
                  dspace.np_format_as(batch, sp))

        self._convert[i] = fn
def __init__(self, dataset, subset_iterator, data_specs=None,
             return_tuple=False, convert=None):
    """
    Prepare iteration over the sources of `dataset` named in `data_specs`.

    Parameters
    ----------
    dataset : object
        Has to provide `get_data_specs`.
    subset_iterator : object
        Stored as `self._subset_iterator`.
    data_specs : tuple
        A flat (space, source) pair selecting the sources to iterate
        over and the spaces to format them in.
    return_tuple : bool, optional
        Stored as `self._return_tuple`.
    convert : sequence of callables or None, optional
        One entry per requested source. A callable is used as is and
        is expected to take care of the formatting; a None entry is
        replaced by a function that formats batches from the space of
        the dataset to the requested one. The sequence is copied, the
        argument of the caller is never modified.
    """
    self._data_specs = data_specs
    self._dataset = dataset
    self._subset_iterator = subset_iterator
    self._return_tuple = return_tuple

    assert is_flat_specs(data_specs)

    dataset_space, dataset_source = self._dataset.get_data_specs()
    assert is_flat_specs((dataset_space, dataset_source))

    # the dataset's data spec is either a single (space, source) pair,
    # or a pair of (non-nested CompositeSpace, non-nested tuple).
    # We could build a mapping and call flatten(..., return_tuple=True)
    # but simply putting spaces, sources and data in tuples is simpler.
    if not isinstance(dataset_source, tuple):
        dataset_source = (dataset_source, )

    if not isinstance(dataset_space, CompositeSpace):
        dataset_sub_spaces = (dataset_space, )
    else:
        dataset_sub_spaces = dataset_space.components
    assert len(dataset_source) == len(dataset_sub_spaces)

    space, source = data_specs
    if not isinstance(source, tuple):
        source = (source, )
    if not isinstance(space, CompositeSpace):
        sub_spaces = (space, )
    else:
        sub_spaces = space.components
    assert len(source) == len(sub_spaces)

    self._source = source

    if convert is None:
        self._convert = [None for s in source]
    else:
        assert len(convert) == len(source)
        # Copy: entries are overwritten below, which must neither alter
        # the caller's sequence nor fail when it is a tuple.
        self._convert = list(convert)

    for i, (so, sp) in enumerate(safe_zip(source, sub_spaces)):
        # Looked up for every source, so that an unknown source name is
        # reported even when a converter was supplied for it.
        idx = dataset_source.index(so)
        dspace = dataset_sub_spaces[idx]

        # If there is a converter already, it is supposed to take
        # care of the formatting, and it should be an error
        # if it does not. If there was none, then
        # the iterator will try to format using the generic
        # space-formatting functions.
        if self._convert[i] is None:
            # "dspace" and "sp" have to be passed as parameters
            # to lambda, in order to capture their current value,
            # otherwise they would change in the next iteration
            # of the loop.
            self._convert[i] = (lambda batch, dspace=dspace, sp=sp:
                                dspace.np_format_as(batch, sp))
def __init__(self, dataset, subset_iterator, data_specs=None,
             return_tuple=False, convert=None):
    """
    Prepare iteration over the sources of `dataset` named in `data_specs`.

    Parameters
    ----------
    dataset : object
        Has to provide `get_data_specs`. When it has no `get`
        attribute, `get_data` is used to fetch the raw data up front.
    subset_iterator : object
        Stored as `self._subset_iterator`.
    data_specs : tuple
        A flat (space, source) pair selecting the sources to iterate
        over and the spaces to format them in.
    return_tuple : bool, optional
        Stored as `self._return_tuple`.
    convert : sequence of callables or None, optional
        One entry per requested source. A callable is used as is and
        is expected to take care of the formatting; a None entry is
        replaced by a function that formats batches from the space of
        the dataset to the requested one. The sequence is copied, the
        argument of the caller is never modified.

    Raises
    ------
    ValueError
        If a requested source is not provided by the dataset.
    """
    self._data_specs = data_specs
    self._dataset = dataset
    self._subset_iterator = subset_iterator
    self._return_tuple = return_tuple

    # Keep only the needed sources in self._raw_data.
    # Remember what source they correspond to in self._source
    assert is_flat_specs(data_specs)

    dataset_space, dataset_source = self._dataset.get_data_specs()
    assert is_flat_specs((dataset_space, dataset_source))

    # the dataset's data spec is either a single (space, source) pair,
    # or a pair of (non-nested CompositeSpace, non-nested tuple).
    # We could build a mapping and call flatten(..., return_tuple=True)
    # but simply putting spaces, sources and data in tuples is simpler.
    if not isinstance(dataset_source, (tuple, list)):
        dataset_source = (dataset_source, )

    if not isinstance(dataset_space, CompositeSpace):
        dataset_sub_spaces = (dataset_space, )
    else:
        dataset_sub_spaces = dataset_space.components
    assert len(dataset_source) == len(dataset_sub_spaces)

    space, source = data_specs
    if not isinstance(source, tuple):
        source = (source, )
    if not isinstance(space, CompositeSpace):
        sub_spaces = (space, )
    else:
        sub_spaces = space.components
    assert len(source) == len(sub_spaces)

    # If `dataset` is incompatible with the new interface, fall back to the
    # old interface
    if not hasattr(self._dataset, 'get'):
        all_data = self._dataset.get_data()
        if not isinstance(all_data, tuple):
            all_data = (all_data, )

        raw_data = []
        for s in source:
            try:
                raw_data.append(all_data[dataset_source.index(s)])
            except ValueError as e:
                msg = (str(e) + '\nThe dataset does not provide '
                       'a source with name: ' + s + '.')
                reraise_as(ValueError(msg))
        self._raw_data = tuple(raw_data)

    self._source = source
    self._space = sub_spaces

    if convert is None:
        self._convert = [None for s in source]
    else:
        assert len(convert) == len(source)
        # Copy: entries are overwritten below, which must neither alter
        # the caller's sequence nor fail when it is a tuple.
        self._convert = list(convert)

    for i, (so, sp) in enumerate(safe_izip(source, sub_spaces)):
        try:
            idx = dataset_source.index(so)
        except ValueError as e:
            msg = (str(e) + '\nThe dataset does not provide '
                   'a source with name: ' + so + '.')
            reraise_as(ValueError(msg))
        dspace = dataset_sub_spaces[idx]

        fn = self._convert[i]
        # If there is a fn, it is supposed to take care of the formatting,
        # and it should be an error if it does not. If there was no fn,
        # then the iterator will try to format using the generic
        # space-formatting functions.
        if fn is None:
            # "dspace" and "sp" have to be passed as parameters
            # to lambda, in order to capture their current value,
            # otherwise they would change in the next iteration
            # of the loop.
            fn = (lambda batch, dspace=dspace, sp=sp:
                  dspace.np_format_as(batch, sp))

        self._convert[i] = fn
def __init__(self, dataset, subset_iterator, data_specs=None,
             return_tuple=False, convert=None):
    """
    Prepare iteration over the sources of `dataset` named in `data_specs`.

    Parameters
    ----------
    dataset : object
        Has to provide `get_data_specs`.
    subset_iterator : object
        Stored as `self._subset_iterator`.
    data_specs : tuple
        A flat (space, source) pair selecting the sources to iterate
        over and the spaces to format them in.
    return_tuple : bool, optional
        Stored as `self._return_tuple`.
    convert : sequence of callables or None, optional
        One entry per requested source. A callable is used as is and
        is expected to take care of the formatting; a None entry is
        replaced by a function that formats batches from the space of
        the dataset to the requested one. The sequence is copied, the
        argument of the caller is never modified.
    """
    self._data_specs = data_specs
    self._dataset = dataset
    self._subset_iterator = subset_iterator
    self._return_tuple = return_tuple

    assert is_flat_specs(data_specs)

    dataset_space, dataset_source = self._dataset.get_data_specs()
    assert is_flat_specs((dataset_space, dataset_source))

    # Put the sources and sub-spaces of the dataset in tuples, whether
    # or not the dataset has a composite space.
    if not isinstance(dataset_source, tuple):
        dataset_source = (dataset_source,)

    if not isinstance(dataset_space, CompositeSpace):
        dataset_sub_spaces = (dataset_space,)
    else:
        dataset_sub_spaces = dataset_space.components
    assert len(dataset_source) == len(dataset_sub_spaces)

    # Same for the requested sources and sub-spaces.
    space, source = data_specs
    if not isinstance(source, tuple):
        source = (source,)
    if not isinstance(space, CompositeSpace):
        sub_spaces = (space,)
    else:
        sub_spaces = space.components
    assert len(source) == len(sub_spaces)

    self._source = source

    if convert is None:
        self._convert = [None for s in source]
    else:
        assert len(convert) == len(source)
        # Copy: entries are overwritten below, which must neither alter
        # the caller's sequence nor fail when it is a tuple.
        self._convert = list(convert)

    for i, (so, sp) in enumerate(safe_izip(source, sub_spaces)):
        # Looked up for every source, so that an unknown source name is
        # reported even when a converter was supplied for it.
        idx = dataset_source.index(so)
        dspace = dataset_sub_spaces[idx]

        # If there is a converter already, it is supposed to take
        # care of the formatting, and it should be an error
        # if it does not. If there was none, then
        # the iterator will try to format using the generic
        # space-formatting functions.
        if self._convert[i] is None:
            # "dspace" and "sp" have to be passed as parameters
            # to lambda, in order to capture their current value,
            # otherwise they would change in the next iteration
            # of the loop.
            self._convert[i] = (lambda batch, dspace=dspace, sp=sp:
                                dspace.np_format_as(batch, sp))
def __init__(self, dataset, subset_iterator, topo=None, targets=None,
             data_specs=None, return_tuple=False):
    """
    Prepare iteration over `dataset`.

    Two mutually exclusive interfaces are supported: the deprecated
    `topo` / `targets` flags, and `data_specs`.

    Parameters
    ----------
    dataset : object
        With the deprecated interface it has to provide
        `get_design_matrix` (and `get_targets` when `targets` is
        true). Otherwise it has to provide `get_data_specs` and
        `get_data`.
    subset_iterator : object
        Stored as `self._subset_iterator`.
    topo : bool, optional
        Deprecated. Stored as `self._topo`; defaults to False when
        only `targets` is given.
    targets : bool, optional
        Deprecated. When true, the targets of the dataset are fetched
        as well. Defaults to False when only `topo` is given.
    data_specs : tuple, optional
        A flat (space, source) pair selecting the sources to iterate
        over and the spaces to format them in.
    return_tuple : bool, optional
        Stored as `self._return_tuple`.

    Raises
    ------
    ValueError
        If `data_specs` is given together with `topo` or `targets`, or
        if `targets` is true and the dataset has no targets.
    """
    if topo is not None or targets is not None:
        if data_specs is not None:
            raise ValueError("In FiniteDatasetIterator, both "
                             "the `data_specs` argument and deprecated arguments "
                             "`topo` or `targets` were provided.",
                             (data_specs, topo, targets))

        warnings.warn("Usage of `topo` and `target` arguments are being "
                      "deprecated, and will be removed around November 7th, "
                      "2013. `data_specs` should be used instead.",
                      stacklevel=2)
        # Only fill in the flag the caller left unspecified; an explicit
        # value (e.g. targets=True) must be kept.
        if topo is None:
            topo = False
        if targets is None:
            targets = False
        self._deprecated_interface = True
    else:
        self._deprecated_interface = False

    self._topo = topo
    self._targets = targets
    self._data_specs = data_specs
    self._dataset = dataset
    self._subset_iterator = subset_iterator
    self._return_tuple = return_tuple

    # TODO: More thought about how to handle things where this
    # fails (gigantic HDF5 files, etc.)
    if self._deprecated_interface:
        self._raw_data = self._dataset.get_design_matrix()
        if self._targets:
            self._raw_targets = self._dataset.get_targets()
            if self._raw_targets is None:
                raise ValueError("Can't iterate with targets=True on a "
                                 "dataset object with no targets")
            self._targets_need_cast = (
                not np.dtype(config.floatX) == self._raw_targets.dtype)
        self._needs_cast = (
            not np.dtype(config.floatX) == self._raw_data.dtype)
    else:
        # Keep only the needed sources in self._raw_data.
        # Remember what source they correspond to in self._source
        assert is_flat_specs(data_specs)

        dataset_space, dataset_source = self._dataset.get_data_specs()
        assert is_flat_specs((dataset_space, dataset_source))

        # the dataset's data spec is either a single (space, source) pair,
        # or a pair of (non-nested CompositeSpace, non-nested tuple).
        # We could build a mapping and call flatten(..., return_tuple=True)
        # but simply putting spaces, sources and data in tuples is simpler.
        if not isinstance(dataset_source, tuple):
            dataset_source = (dataset_source,)

        if not isinstance(dataset_space, CompositeSpace):
            dataset_sub_spaces = (dataset_space,)
        else:
            dataset_sub_spaces = dataset_space.components
        assert len(dataset_source) == len(dataset_sub_spaces)

        all_data = self._dataset.get_data()
        if not isinstance(all_data, tuple):
            all_data = (all_data,)

        space, source = data_specs
        if not isinstance(source, tuple):
            source = (source,)
        if not isinstance(space, CompositeSpace):
            sub_spaces = (space,)
        else:
            sub_spaces = space.components
        assert len(source) == len(sub_spaces)

        self._raw_data = tuple(all_data[dataset_source.index(s)]
                               for s in source)
        self._source = source

        # One conversion function (or None) per requested source.
        self._convert = []
        for i, (so, sp) in enumerate(safe_zip(source, sub_spaces)):
            idx = dataset_source.index(so)
            dspace = dataset_sub_spaces[idx]

            # Compose the functions
            fn = None
            needs_cast = (
                not np.dtype(config.floatX) == self._raw_data[i].dtype)
            if needs_cast:
                fn = lambda batch: numpy.cast[config.floatX](batch)

            needs_format = not sp == dspace
            if needs_format:
                # "dspace" and "sp" have to be passed as parameters
                # to lambda, in order to capture their current value,
                # otherwise they would change in the next iteration
                # of the loop.
                if fn is None:
                    fn = (lambda batch, dspace=dspace, sp=sp:
                          dspace.np_format_as(batch, sp))
                else:
                    # The casting function has to be captured as well
                    # (fn_=fn): referring to the name "fn" from inside
                    # the lambda would resolve to the lambda itself and
                    # recurse forever.
                    fn = (lambda batch, dspace=dspace, sp=sp, fn_=fn:
                          dspace.np_format_as(fn_(batch), sp))

            self._convert.append(fn)
def __init__(self, dataset, subset_iterator, data_specs=None,
             return_tuple=False, convert=None):
    """
    Prepare iteration over the sources of `dataset` named in `data_specs`.

    Parameters
    ----------
    dataset : object
        Has to provide `get_data_specs` and `get_data`.
    subset_iterator : object
        Stored as `self._subset_iterator`.
    data_specs : tuple
        A flat (space, source) pair selecting the sources to iterate
        over and the spaces to format them in.
    return_tuple : bool, optional
        Stored as `self._return_tuple`.
    convert : sequence of callables or None, optional
        One entry per requested source. A callable is used as is and
        is expected to take care of the formatting; a None entry is
        replaced by a function that formats batches from the space of
        the dataset to the requested one. The sequence is copied, the
        argument of the caller is never modified.
    """
    self._data_specs = data_specs
    self._dataset = dataset
    self._subset_iterator = subset_iterator
    self._return_tuple = return_tuple

    # Keep only the needed sources in self._raw_data.
    # Remember what source they correspond to in self._source
    assert is_flat_specs(data_specs)

    dataset_space, dataset_source = self._dataset.get_data_specs()
    assert is_flat_specs((dataset_space, dataset_source))

    # the dataset's data spec is either a single (space, source) pair,
    # or a pair of (non-nested CompositeSpace, non-nested tuple).
    # We could build a mapping and call flatten(..., return_tuple=True)
    # but simply putting spaces, sources and data in tuples is simpler.
    if not isinstance(dataset_source, tuple):
        dataset_source = (dataset_source,)

    if not isinstance(dataset_space, CompositeSpace):
        dataset_sub_spaces = (dataset_space,)
    else:
        dataset_sub_spaces = dataset_space.components
    assert len(dataset_source) == len(dataset_sub_spaces)

    all_data = self._dataset.get_data()
    if not isinstance(all_data, tuple):
        all_data = (all_data,)

    space, source = data_specs
    if not isinstance(source, tuple):
        source = (source,)
    if not isinstance(space, CompositeSpace):
        sub_spaces = (space,)
    else:
        sub_spaces = space.components
    assert len(source) == len(sub_spaces)

    self._raw_data = tuple(all_data[dataset_source.index(s)]
                           for s in source)
    self._source = source

    if convert is None:
        self._convert = [None for s in source]
    else:
        assert len(convert) == len(source)
        # Copy: entries are overwritten below, which must neither alter
        # the caller's sequence nor fail when it is a tuple.
        self._convert = list(convert)

    for i, (so, sp, _raw) in enumerate(safe_izip(source,
                                                 sub_spaces,
                                                 self._raw_data)):
        # Looked up for every source, so that an unknown source name is
        # reported even when a converter was supplied for it.
        idx = dataset_source.index(so)
        dspace = dataset_sub_spaces[idx]

        # If there is a converter already, it is supposed to take
        # care of the formatting, and it should be an error
        # if it does not. If there was none, then
        # the iterator will try to format using the generic
        # space-formatting functions.
        if self._convert[i] is not None:
            continue

        # "dspace" and "sp" have to be passed as default parameters,
        # in order to capture their current value, otherwise they
        # would change in the next iteration of the loop.
        def fn(batch, dspace=dspace, sp=sp):
            try:
                return dspace.np_format_as(batch, sp)
            except ValueError as e:
                msg = (str(e) + '\nMake sure that the model and '
                       'dataset have been initialized with '
                       'correct values.')
                reraise_as(ValueError(msg))

        self._convert[i] = fn
def __init__(self, dataset, subset_iterator, data_specs=None,
             return_tuple=False, convert_fns=None):
    """
    Prepare iteration over the sources of `dataset` named in `data_specs`.

    Parameters
    ----------
    dataset : object
        Has to provide `get_data_specs` and `get_data`.
    subset_iterator : object
        Stored as `self._subset_iterator`.
    data_specs : tuple
        A flat (space, source) pair selecting the sources to iterate
        over and the spaces to format them in. Required.
    return_tuple : bool, optional
        Stored as `self._return_tuple`.
    convert_fns : function or tuple of function, optional
        Organized as in data_specs, to be applied on the raw batches
        of data. "None" can be used as placeholder for the identity.

    Raises
    ------
    TypeError
        If `data_specs` is None.
    """
    self._deprecated_interface = False
    if data_specs is None:
        raise TypeError("data_specs not provided")

    self._data_specs = data_specs
    self._dataset = dataset
    self._subset_iterator = subset_iterator
    self._return_tuple = return_tuple

    # Only the requested sources end up in self._raw_data; the names
    # they correspond to are kept in self._source.
    assert is_flat_specs(data_specs)

    dataset_space, dataset_source = self._dataset.get_data_specs()
    assert is_flat_specs((dataset_space, dataset_source))

    # The specs of the dataset are either one (space, source) pair or a
    # (non-nested CompositeSpace, non-nested tuple) pair; both cases
    # are handled by putting sources, spaces and data in tuples.
    if not isinstance(dataset_source, tuple):
        dataset_source = (dataset_source,)

    if isinstance(dataset_space, CompositeSpace):
        dataset_sub_spaces = dataset_space.components
    else:
        dataset_sub_spaces = (dataset_space,)
    assert len(dataset_source) == len(dataset_sub_spaces)

    all_data = self._dataset.get_data()
    if not isinstance(all_data, tuple):
        all_data = (all_data,)

    space, source = data_specs
    if not isinstance(source, tuple):
        source = (source,)
    if isinstance(space, CompositeSpace):
        sub_spaces = space.components
    else:
        sub_spaces = (space,)
    assert len(source) == len(sub_spaces)

    self._raw_data = tuple(all_data[dataset_source.index(name)]
                           for name in source)
    self._source = source

    if convert_fns is None:
        self._convert = [None for s in source]
    else:
        if not isinstance(convert_fns, (list, tuple)):
            convert_fns = (convert_fns,)
        assert len(convert_fns) == len(source)
        self._convert = list(convert_fns)

    for i, (so, sp) in enumerate(safe_zip(source, sub_spaces)):
        dspace = dataset_sub_spaces[dataset_source.index(so)]

        # Stack the conversion steps on top of the user function.
        fn = self._convert[i]

        if not (self._raw_data[i][0].dtype == config.floatX):
            # The previous step is bound as a default argument so that
            # later rebinding of "fn" does not affect this function.
            def cast_fn(batch, prev_fn=fn):
                if prev_fn is not None:
                    batch = prev_fn(batch)
                return np.cast[config.floatX](batch)

            fn = cast_fn

        if not sp == dspace:
            # "dspace", "sp" and the previous step are bound as default
            # arguments in order to capture their current value,
            # otherwise they would change in the next iteration of the
            # loop.
            def format_fn(batch, dspace=dspace, sp=sp, fn_=fn):
                if fn_ is not None:
                    batch = fn_(batch)
                return dspace.np_format_as(batch, sp)

            fn = format_fn

        self._convert[i] = fn
def __init__(self, dataset, subset_iterator, data_specs=None,
             return_tuple=False, convert=None):
    """
    Prepare iteration over the sources of `dataset` named in `data_specs`.

    For every requested source, `self._convert` ends up holding either
    the user-supplied conversion function, or, when none was given, a
    function that re-formats a batch from the dataset's space to the
    requested space with `np_format_as`.

    Parameters
    ----------
    dataset : object
        Must provide `get_data_specs()` and `get_data()`.
    subset_iterator : object
        Stored in `self._subset_iterator`; not used otherwise here.
    data_specs : (space, source) pair
        Flat specification of the sources to iterate over and of the
        spaces the batches should be formatted in.
    return_tuple : bool
        Stored in `self._return_tuple`; not used otherwise here.
    convert : sequence of (function or None), optional
        One entry per requested source. A non-None entry is used as-is
        and is expected to take care of the formatting itself. The
        sequence is copied, never modified.

    Raises
    ------
    ValueError
        If a requested source is not one of the dataset's sources.
    """
    self._data_specs = data_specs
    self._dataset = dataset
    self._subset_iterator = subset_iterator
    self._return_tuple = return_tuple

    # Keep only the needed sources in self._raw_data.
    # Remember what source they correspond to in self._source
    assert is_flat_specs(data_specs)

    dataset_space, dataset_source = self._dataset.get_data_specs()
    assert is_flat_specs((dataset_space, dataset_source))

    # the dataset's data spec is either a single (space, source) pair,
    # or a pair of (non-nested CompositeSpace, non-nested tuple).
    # We could build a mapping and call flatten(..., return_tuple=True)
    # but simply putting spaces, sources and data in tuples is simpler.
    if not isinstance(dataset_source, tuple):
        dataset_source = (dataset_source,)

    if not isinstance(dataset_space, CompositeSpace):
        dataset_sub_spaces = (dataset_space,)
    else:
        dataset_sub_spaces = dataset_space.components
    assert len(dataset_source) == len(dataset_sub_spaces)

    all_data = self._dataset.get_data()
    if not isinstance(all_data, tuple):
        all_data = (all_data,)

    space, source = data_specs
    if not isinstance(source, tuple):
        source = (source,)
    if not isinstance(space, CompositeSpace):
        sub_spaces = (space,)
    else:
        sub_spaces = space.components
    assert len(source) == len(sub_spaces)

    self._raw_data = ()
    for s in source:
        try:
            self._raw_data += (all_data[dataset_source.index(s)],)
        except ValueError as e:
            msg = str(e) + '\nThe dataset does not provide '\
                           'a source with name: ' + s + '.'
            reraise_as(ValueError(msg))

    self._source = source
    self._space = sub_spaces

    if convert is None:
        self._convert = [None for s in source]
    else:
        assert len(convert) == len(source)
        # Copy the sequence: entries are assigned below, which would
        # otherwise modify the caller's list (or fail on a tuple).
        self._convert = list(convert)

    # self._raw_data takes part in the zip only so that its length is
    # checked along with the others; its items are not needed here.
    for i, (so, sp, _raw) in enumerate(safe_izip(source,
                                                 sub_spaces,
                                                 self._raw_data)):
        # If there is an init_fn, it is supposed to take
        # care of the formatting, and it should be an error
        # if it does not. If there was no init_fn, then
        # the iterator will try to format using the generic
        # space-formatting functions.
        if self._convert[i] is not None:
            continue

        idx = dataset_source.index(so)
        dspace = dataset_sub_spaces[idx]

        # "dspace" and "sp" have to be passed as parameters
        # to the function, in order to capture their current value,
        # otherwise they would change in the next iteration
        # of the loop.
        def fn(batch, dspace=dspace, sp=sp):
            try:
                return dspace.np_format_as(batch, sp)
            except ValueError as e:
                msg = str(e) + '\nMake sure that the model and '\
                               'dataset have been initialized with '\
                               'correct values.'
                reraise_as(ValueError(msg))

        self._convert[i] = fn
def __init__(self, dataset, subset_iterator, data_specs=None,
             return_tuple=False, convert_fns=None):
    """
    Prepare iteration over the sources of `dataset` named in `data_specs`.

    One conversion function per requested source is stored in
    `self._convert` (None meaning "leave the batch untouched"). Each
    one applies, in order: the function given in `convert_fns`, a cast
    to `config.floatX` when the raw data has a different dtype, and a
    call to `np_format_as` when the requested space differs from the
    dataset's space.

    Parameters
    ----------
    dataset : object
        Must provide `get_data_specs()` and `get_data()`.
    subset_iterator : object
        Stored in `self._subset_iterator`; not used otherwise here.
    data_specs : (space, source) pair
        Flat specification of the sources to iterate over and of the
        spaces the batches should be formatted in. Required, despite
        the None default.
    return_tuple : bool
        Stored in `self._return_tuple`; not used otherwise here.
    convert_fns : function or tuple of function, optional
        Organized as in `data_specs`, to be applied on the raw batches
        of data. None can be used as placeholder for the identity.

    Raises
    ------
    TypeError
        If `data_specs` is None.
    ValueError
        If a requested source is not one of the dataset's sources
        (raised by the tuple lookup).
    """
    self._deprecated_interface = False
    if data_specs is None:
        raise TypeError("data_specs not provided")
    self._data_specs = data_specs
    self._dataset = dataset
    self._subset_iterator = subset_iterator
    self._return_tuple = return_tuple

    # Keep only the needed sources in self._raw_data.
    # Remember what source they correspond to in self._source
    assert is_flat_specs(data_specs)

    dataset_space, dataset_source = self._dataset.get_data_specs()
    assert is_flat_specs((dataset_space, dataset_source))

    # the dataset's data spec is either a single (space, source) pair,
    # or a pair of (non-nested CompositeSpace, non-nested tuple).
    # We could build a mapping and call flatten(..., return_tuple=True)
    # but simply putting spaces, sources and data in tuples is simpler.
    if not isinstance(dataset_source, tuple):
        dataset_source = (dataset_source, )

    if not isinstance(dataset_space, CompositeSpace):
        dataset_sub_spaces = (dataset_space, )
    else:
        dataset_sub_spaces = dataset_space.components
    assert len(dataset_source) == len(dataset_sub_spaces)

    all_data = self._dataset.get_data()
    if not isinstance(all_data, tuple):
        all_data = (all_data, )

    space, source = data_specs
    if not isinstance(source, tuple):
        source = (source, )
    if not isinstance(space, CompositeSpace):
        sub_spaces = (space, )
    else:
        sub_spaces = space.components
    assert len(source) == len(sub_spaces)

    self._raw_data = tuple(all_data[dataset_source.index(s)]
                           for s in source)
    self._source = source

    if convert_fns is None:
        self._convert = [None for s in source]
    else:
        if not isinstance(convert_fns, (list, tuple)):
            convert_fns = (convert_fns, )
        assert len(convert_fns) == len(source)
        # A fresh list, since its entries are replaced below.
        self._convert = list(convert_fns)

    for i, (so, sp) in enumerate(safe_zip(source, sub_spaces)):
        idx = dataset_source.index(so)
        dspace = dataset_sub_spaces[idx]

        # Compose the functions
        fn = self._convert[i]

        # The dtype is read from the first example of the raw data.
        needs_cast = not (self._raw_data[i][0].dtype == config.floatX)
        if needs_cast:
            # np.cast no longer exists as of NumPy 2.0. The documented
            # replacement, np.asarray(..., dtype=...), also works on
            # older NumPy releases.
            if fn is None:
                fn = lambda batch: np.asarray(batch, dtype=config.floatX)
            else:
                # Bind the current "fn" through a default argument so
                # that rebinding "fn" later does not affect the lambda.
                fn = (lambda batch, prev_fn=fn:
                      np.asarray(prev_fn(batch), dtype=config.floatX))

        needs_format = not sp == dspace
        if needs_format:
            # "dspace" and "sp" have to be passed as parameters
            # to lambda, in order to capture their current value,
            # otherwise they would change in the next iteration
            # of the loop.
            if fn is None:
                fn = (lambda batch, dspace=dspace, sp=sp:
                      dspace.np_format_as(batch, sp))
            else:
                fn = (lambda batch, dspace=dspace, sp=sp, fn_=fn:
                      dspace.np_format_as(fn_(batch), sp))

        self._convert[i] = fn
def __init__(self, dataset, subset_iterator, data_specs=None,
             return_tuple=False, convert=None):
    """
    Prepare iteration over the sources of `dataset` named in `data_specs`.

    One conversion function per requested source is stored in
    `self._convert` (None meaning "leave the batch untouched"). Each
    one applies, in order: the function given in `convert`, a cast to
    `config.floatX` when the dataset reports another dtype for that
    source, and, only when no function was given, a call to
    `np_format_as` if the requested space differs from the dataset's.

    Parameters
    ----------
    dataset : object
        Must provide `get_data_specs()` and `dtype_of(source)`.
    subset_iterator : object
        Stored in `self._subset_iterator`; not used otherwise here.
    data_specs : (space, source) pair
        Flat specification of the sources to iterate over and of the
        spaces the batches should be formatted in.
    return_tuple : bool
        Stored in `self._return_tuple`; not used otherwise here.
    convert : sequence of (function or None), optional
        One entry per requested source. A non-None entry is expected to
        take care of the formatting itself. The sequence is copied,
        never modified.

    Raises
    ------
    ValueError
        If a requested source is not one of the dataset's sources
        (raised by the tuple lookup).
    """
    self._data_specs = data_specs
    self._dataset = dataset
    self._subset_iterator = subset_iterator
    self._return_tuple = return_tuple

    assert is_flat_specs(data_specs)

    dataset_space, dataset_source = self._dataset.get_data_specs()
    assert is_flat_specs((dataset_space, dataset_source))

    # the dataset's data spec is either a single (space, source) pair,
    # or a pair of (non-nested CompositeSpace, non-nested tuple).
    # We could build a mapping and call flatten(..., return_tuple=True)
    # but simply putting spaces, sources and data in tuples is simpler.
    if not isinstance(dataset_source, tuple):
        dataset_source = (dataset_source,)

    if not isinstance(dataset_space, CompositeSpace):
        dataset_sub_spaces = (dataset_space,)
    else:
        dataset_sub_spaces = dataset_space.components
    assert len(dataset_source) == len(dataset_sub_spaces)

    space, source = data_specs
    if not isinstance(source, tuple):
        source = (source,)
    if not isinstance(space, CompositeSpace):
        sub_spaces = (space,)
    else:
        sub_spaces = space.components
    assert len(source) == len(sub_spaces)

    self._source = source

    if convert is None:
        self._convert = [None for s in source]
    else:
        assert len(convert) == len(source)
        # Copy the sequence: entries are assigned below, which would
        # otherwise modify the caller's list (or fail on a tuple).
        self._convert = list(convert)

    dtypes = self._dataset.dtype_of(self._source)

    for i, (so, sp, dt) in enumerate(safe_zip(source,
                                              sub_spaces,
                                              dtypes)):
        idx = dataset_source.index(so)
        dspace = dataset_sub_spaces[idx]

        init_fn = self._convert[i]
        fn = init_fn

        # Compose the functions
        needs_cast = not (numpy.dtype(config.floatX) == dt)
        if needs_cast:
            # numpy.cast was removed in NumPy 2.0; numpy.asarray with
            # an explicit dtype is its documented replacement and is
            # available in old releases too.
            if fn is None:
                fn = lambda batch: numpy.asarray(batch,
                                                 dtype=config.floatX)
            else:
                # Bind the current "fn" through a default argument so
                # that rebinding "fn" later does not affect the lambda.
                fn = (lambda batch, fn_=fn:
                      numpy.asarray(fn_(batch), dtype=config.floatX))

        # If there is an init_fn, it is supposed to take
        # care of the formatting, and it should be an error
        # if it does not. If there was no init_fn, then
        # the iterator will try to format using the generic
        # space-formatting functions.
        needs_format = not init_fn and not sp == dspace
        if needs_format:
            # "dspace" and "sp" have to be passed as parameters
            # to lambda, in order to capture their current value,
            # otherwise they would change in the next iteration
            # of the loop.
            if fn is None:
                fn = (lambda batch, dspace=dspace, sp=sp:
                      dspace.np_format_as(batch, sp))
            else:
                fn = (lambda batch, dspace=dspace, sp=sp, fn_=fn:
                      dspace.np_format_as(fn_(batch), sp))

        self._convert[i] = fn