def __init__(self, which_set, path=None):
    self.mapper = {'train': 0, 'valid': 1, 'test': 2}
    assert which_set in self.mapper.keys()
    self.__dict__.update(locals())
    del self.self

    if path is not None:
        raise NotImplementedError("Data path is the current directory.")

    # load data
    file_n = "click_data.h5"
    self.h5file = tables.open_file(file_n, mode='r')
    if which_set == 'test':
        test_group = self.h5file.root.test.test_raw
        self.X = test_group.X_t
        self.y = None
    else:
        train_group = self.h5file.root.train.train_raw
        if which_set == 'train':
            self.X = train_group.X_train
            self.y = train_group.y_train
        else:
            self.X = train_group.X_valid
            self.y = train_group.y_valid
    self.samples = slice(0, self.X.shape[0])
    self.sample_index = self.samples.start
    self.examples = self.X.shape[0]

    max_labels = 2
    X_source = 'features'
    X_space = VectorSpace(dim=23)
    if self.y is None:
        space = X_space
        source = X_source
    else:
        y_space = IndexSpace(dim=1, max_labels=max_labels)
        y_source = 'targets'
        space = CompositeSpace((X_space, y_space))
        source = (X_source, y_source)
    self.data_specs = (space, source)
    self.X_space = X_space

    # Defaults for iterators
    self._iter_mode = resolve_iterator_class('sequential')
    self._iter_topo = False
    self._iter_targets = False
    self._iter_data_specs = (self.X_space, 'features')
    self._iter_subset_class = resolve_iterator_class('even_sequential')
def iterator(self, mode=None, batch_size=None, num_batches=None, rng=None,
             data_specs=None, return_tuple=False):
    if mode is None:
        if hasattr(self, '_iter_subset_class'):
            mode = self._iter_subset_class
        else:
            raise ValueError('iteration mode not provided and no default '
                             'mode set for %s' % str(self))
    else:
        mode = resolve_iterator_class(mode)
    if batch_size is None:
        batch_size = getattr(self, '_iter_batch_size', None)
    if num_batches is None:
        num_batches = getattr(self, '_iter_num_batches', None)
    if rng is None and mode.stochastic:
        rng = self.rng
    if data_specs is None:
        data_specs = getattr(self, '_iter_data_specs', None)
    return FiniteDatasetIterator(
        self,
        mode(self.n_samples, batch_size, num_batches, rng),
        data_specs=data_specs,
        return_tuple=return_tuple)
def _create_subset_iterator(self, mode, batch_size=None, num_batches=None,
                            rng=None):
    subset_iterator = resolve_iterator_class(mode)
    if rng is None and subset_iterator.stochastic:
        rng = make_np_rng()
    return subset_iterator(self.get_num_examples(), batch_size, num_batches,
                           rng)
def iterator(self, mode=None, batch_size=None, num_batches=None, topo=None,
             targets=None, rng=None):
    # TODO: Refactor, deduplicate with set_iteration_scheme
    if mode is None:
        if hasattr(self, '_iter_subset_class'):
            mode = self._iter_subset_class
        else:
            raise ValueError('iteration mode not provided and no default '
                             'mode set for %s' % str(self))
    else:
        mode = resolve_iterator_class(mode)
    if batch_size is None:
        batch_size = getattr(self, '_iter_batch_size', None)
    if num_batches is None:
        num_batches = getattr(self, '_iter_num_batches', None)
    if topo is None:
        topo = getattr(self, '_iter_topo', False)
    if targets is None:
        targets = getattr(self, '_iter_targets', False)
    if rng is None and mode.stochastic:
        rng = self.rng
    return FiniteDatasetIterator(
        self,
        mode(self.X.shape[0], batch_size, num_batches, rng),
        topo, targets)
def iterator(self, mode=None, batch_size=None, num_batches=None, topo=None,
             targets=None, rng=None, data_specs=None, return_tuple=False):
    if topo is not None or targets is not None:
        raise ValueError("You should use the new interface iterator")
    if mode is None:
        if hasattr(self, '_iter_subset_class'):
            mode = self._iter_subset_class
        else:
            raise ValueError('iteration mode not provided and no default '
                             'mode set for %s' % str(self))
    else:
        mode = resolve_iterator_class(mode)
    if batch_size is None:
        batch_size = getattr(self, '_iter_batch_size', None)
    if num_batches is None:
        num_batches = getattr(self, '_iter_num_batches', None)
    if rng is None and mode.stochastic:
        rng = self.rng
    if data_specs is None:
        data_specs = self.data_specs
    return FiniteDatasetIterator(
        self,
        mode(self.get_num_examples(), batch_size, num_batches, rng),
        data_specs=data_specs,
        return_tuple=return_tuple)
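# The iterator() overloads in this collection all follow one pylearn2
# pattern: resolve the subset-iteration mode, fill in defaults from the
# `_iter_*` attributes, then wrap the result in a FiniteDatasetIterator.
# A minimal usage sketch, assuming a stock pylearn2 install; the
# DenseDesignMatrix used here is pylearn2's own, not one of the variants
# defined in this file:
import numpy as np
from pylearn2.datasets.dense_design_matrix import DenseDesignMatrix

dataset = DenseDesignMatrix(X=np.random.uniform(size=(100, 23)))
it = dataset.iterator(mode='sequential', batch_size=10)
for X_batch in it:
    pass  # each X_batch is a (10, 23) slice of the design matrix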
def __init__(self, data=None, data_specs=None, rng=_default_seed,
             preprocessor=None, fit_preprocessor=False):
    # data_specs should be flat, and there should be no
    # duplicates in source, as we keep only one version
    assert is_flat_specs(data_specs)
    if isinstance(data_specs[1], tuple):
        assert sorted(set(data_specs[1])) == sorted(data_specs[1])
    space, source = data_specs
    space.np_validate(data)
    assert len(set(elem.shape[0] for elem in list(data))) <= 1
    self.data = data
    self.data_specs = data_specs
    self.num_examples = list(data)[0].shape[0]

    self.compress = False
    self.design_loc = None
    self.rng = make_np_rng(rng, which_method='random_integers')
    # Defaults for iterators
    self._iter_mode = resolve_iterator_class('sequential')

    if preprocessor:
        preprocessor.apply(self, can_fit=fit_preprocessor)
    self.preprocessor = preprocessor
def __init__(self, data=None, data_specs=None, rng=_default_seed,
             preprocessor=None, fit_preprocessor=False):
    # data_specs should be flat, and there should be no
    # duplicates in source, as we keep only one version
    assert is_flat_specs(data_specs)
    if isinstance(data_specs[1], tuple):
        assert sorted(set(data_specs[1])) == sorted(data_specs[1])
    space, source = data_specs
    space.np_validate(data)
    # TODO: assume that data[0] is num example => error if channel in c01b
    # assert len(set(elem.shape[0] for elem in list(data))) <= 1
    self.data = data
    self.data_specs = data_specs
    # TODO: assume that data[0] is num example => error if channel in c01b
    self.num_examples = list(data)[-1].shape[0]  # TODO: list(data)[0].shape[0]

    self.compress = False
    self.design_loc = None
    self.rng = make_np_rng(rng, which_method='random_integers')
    # Defaults for iterators
    self._iter_mode = resolve_iterator_class('sequential')

    if preprocessor:
        preprocessor.apply(self, can_fit=fit_preprocessor)
    self.preprocessor = preprocessor
def __init__(self, which_set='debug', start=None, end=None, shuffle=True,
             lazy_load=False, rng=_default_seed):
    assert which_set in ['debug', 'train', 'test']
    if which_set == 'debug':
        maxlen, n_samples, n_annotations, n_features = 10, 12, 13, 14
        X = N.zeros(shape=(n_samples, maxlen))
        X_mask = X  # same shape as X
        Z = N.zeros(shape=(n_annotations, n_samples, n_features))
    elif which_set == 'train':
        pass
    else:
        pass
    self.X, self.X_mask, self.Z = (X, X_mask, Z)

    self.sources = ('features', 'target')
    self.spaces = CompositeSpace([
        SequenceSpace(space=VectorSpace(dim=self.X.shape[1])),
        SequenceDataSpace(space=VectorSpace(dim=self.Z.shape[-1]))
    ])
    # Note: originally misspelled `data_spces` and `random_intergers`.
    self.data_specs = (self.spaces, self.sources)
    # self.X_space, self.X_mask_space, self.Z_space

    # Default iterator
    self._iter_mode = resolve_iterator_class('sequential')
    self._iter_topo = False
    self._iter_targets = False
    self._iter_data_specs = self.data_specs
    self.rng = make_np_rng(rng, which_method='random_integers')
def iterator(self, mode=None, batch_size=None, num_batches=None, rng=None,
             data_specs=None, return_tuple=False):
    allowed_modes = ('sequential', 'random_slice', 'even_sequential',
                     'batchwise_shuffled_sequential',
                     'even_batchwise_shuffled_sequential')
    if mode is not None and mode not in allowed_modes:
        raise ValueError("Due to HDF5 limitations on advanced indexing, "
                         "the '" + mode + "' iteration mode is not "
                         "supported")
    if data_specs is None:
        data_specs = self._iter_data_specs
    space, source = data_specs
    sub_spaces, sub_sources = ((space.components, source)
                               if isinstance(space, CompositeSpace)
                               else ((space,), (source,)))
    convert = [None for sp, src in safe_izip(sub_spaces, sub_sources)]
    mode = (self._iter_subset_class if mode is None
            else resolve_iterator_class(mode))
    if batch_size is None:
        batch_size = getattr(self, '_iter_batch_size', None)
    if num_batches is None:
        num_batches = getattr(self, '_iter_num_batches', None)
    if rng is None and mode.stochastic:
        rng = self.rng
    return VariableImageDatasetIterator(
        dataset=self,
        subset_iterator=mode(self.num_examples, batch_size, num_batches,
                             rng),
        data_specs=data_specs,
        return_tuple=return_tuple,
        convert=convert)
def iterator(self, mode=None, batch_size=None, num_batches=None, rng=None,
             data_specs=None, return_tuple=False):
    """
    Copied from dense_design_matrix, in order to fix the uneven-batch
    problem.
    """
    if data_specs is None:
        data_specs = self._iter_data_specs

    # If there is a view_converter, we have to use it to convert
    # the stored data for "features" into one that the iterator
    # can return.
    space, source = data_specs
    if isinstance(space, CompositeSpace):
        sub_spaces = space.components
        sub_sources = source
    else:
        sub_spaces = (space,)
        sub_sources = (source,)

    convert = []
    for sp, src in safe_zip(sub_spaces, sub_sources):
        if src == 'features' and \
           getattr(self, 'view_converter', None) is not None:
            conv_fn = (lambda batch, self=self, space=sp:
                       self.view_converter.get_formatted_batch(batch,
                                                               space))
        else:
            conv_fn = None
        convert.append(conv_fn)

    # TODO: Refactor
    if mode is None:
        if hasattr(self, '_iter_subset_class'):
            mode = self._iter_subset_class
        else:
            raise ValueError('iteration mode not provided and no default '
                             'mode set for %s' % str(self))
    else:
        mode = resolve_iterator_class(mode)
    if batch_size is None:
        batch_size = getattr(self, '_iter_batch_size', None)
    if num_batches is None:
        num_batches = getattr(self, '_iter_num_batches', None)
    if rng is None and mode.stochastic:
        rng = self.rng

    # hack to make the online augmentations run
    FiniteDatasetIterator.uneven = False
    iterator = FiniteDatasetIterator(
        self,
        mode(self.X.shape[0], batch_size, num_batches, rng),
        data_specs=data_specs,
        return_tuple=return_tuple,
        convert=convert)
    return iterator
def __init__(self, which_set, context_len, data_mode, shuffle=True):
    self.__dict__.update(locals())
    del self.self

    # Load data into self._data (defined in PennTreebank)
    self._load_data(which_set, context_len, data_mode)

    print self._raw_data[0:30]
    print self._data[:, :-1][:10]
    print "_____________"
    print self._data[:, -1:][:10]

    super(PennTreebank_NGrams, self).__init__(
        X=self._data[:, :-1],
        y=self._data[:, -1:],
        X_labels=10000, y_labels=10000
    )

    if shuffle:
        warnings.warn("Note that the PennTreebank samples are only "
                      "shuffled when the iterator method is used to "
                      "retrieve them.")
        self._iter_subset_class = resolve_iterator_class(
            'shuffled_sequential'
        )
def __init__(self, which_set, context_len, data_mode, shuffle=True):
    self.__dict__.update(locals())
    del self.self

    # Load data into self._data (defined in PennTreebank)
    self._load_data(which_set, context_len, data_mode)
    self._data = as_strided(self._raw_data,
                            shape=(len(self._raw_data) - context_len,
                                   context_len + 1),
                            strides=(self._raw_data.itemsize,
                                     self._raw_data.itemsize))

    super(PennTreebankNGrams, self).__init__(
        X=self._data[:, :-1],
        y=self._data[:, -1:],
        X_labels=self._max_labels, y_labels=self._max_labels
    )

    if shuffle:
        warnings.warn("Note that the PennTreebank samples are only "
                      "shuffled when the iterator method is used to "
                      "retrieve them.")
        self._iter_subset_class = resolve_iterator_class(
            'shuffled_sequential'
        )
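# The as_strided call above builds every overlapping (context_len + 1)-gram
# without copying: both strides equal the item size, so each row starts one
# token after the previous one. A self-contained sketch of the same trick
# (the token values below are made up for illustration):
import numpy as np
from numpy.lib.stride_tricks import as_strided

tokens = np.array([7, 2, 9, 4, 1, 5], dtype=np.int64)
context_len = 2
ngrams = as_strided(tokens,
                    shape=(len(tokens) - context_len, context_len + 1),
                    strides=(tokens.itemsize, tokens.itemsize))
# ngrams[0] == [7, 2, 9], ngrams[1] == [2, 9, 4], ...
X, y = ngrams[:, :-1], ngrams[:, -1:]  # contexts and next-token targets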
def iterator(self, mode=None, batch_size=None, num_batches=None, rng=None,
             data_specs=None, return_tuple=False):
    """
    Method inherited from `pylearn2.datasets.dataset.Dataset`.
    """
    self.mode = mode
    self.batch_size = batch_size
    self._return_tuple = return_tuple

    # TODO: If there is a view_converter, we have to use it to convert
    # the stored data for "features" into one that the iterator can
    # return.
    space, source = data_specs or (self.X_space, 'features')
    assert isinstance(space, CompositeSpace), \
        "Unexpected input space for the data."
    sub_spaces = space.components
    sub_sources = source

    conv_fn = lambda x: x.todense().astype(theano.config.floatX)
    convert = []
    for sp, src in safe_zip(sub_spaces, sub_sources):
        convert.append(conv_fn if src in ('features', 'targets') else None)

    assert mode is not None, \
        "Iteration mode not provided for %s" % str(self)
    mode = resolve_iterator_class(mode)
    subset_iterator = mode(self.X.shape[0], batch_size, num_batches, rng)
    return FiniteDatasetIterator(self,
                                 subset_iterator,
                                 data_specs=data_specs,
                                 return_tuple=return_tuple,
                                 convert=convert)
def __init__(self, min_x=-6.28, max_x=6.28, std=.05, rng=_default_seed):
    """
    Constructor.
    """
    super(CosDataset, self).__init__()

    #: lower limit for x as in cos(x)
    self.min_x = min_x
    #: higher limit for x as in cos(x)
    self.max_x = max_x
    #: standard deviation for the noise added to the values we generate
    self.std = std

    # argument to resolve_iterator_class() can be either
    # a string from [sequential, shuffled_sequential, random_slice,
    # random_uniform, batchwise_shuffled_sequential, even_sequential,
    # even_shuffled_sequential, even_batchwise_shuffled_sequential,
    # even_sequences] or a SubsetIterator subclass.

    #: default iterator implementation (a class to be instantiated)
    self._iter_subset_class = resolve_iterator_class('sequential')
    #: default data specifications for iterator
    self._iter_data_specs = (VectorSpace(2), 'features')
    #: default batch size for the iterator
    self._iter_batch_size = 100
    #: default number of batches for the iterator
    self._iter_num_batches = 10
    #: random number generator
    self.rng = make_np_rng(rng, which_method=['uniform', 'randn'])
def iterator(self, mode=None, batch_size=None, num_batches=None, rng=None,
             data_specs=None, return_tuple=False):
    if data_specs is None:
        data_specs = self._iter_data_specs

    # If there is a view_converter, we have to use it to convert
    # the stored data for "features" into one that the iterator
    # can return.
    space, source = data_specs
    if isinstance(space, CompositeSpace):
        sub_spaces = space.components
        sub_sources = source
    else:
        sub_spaces = (space,)
        sub_sources = (source,)

    convert = []
    for sp, src in safe_zip(sub_spaces, sub_sources):
        if (src == "features"
                and getattr(self, "view_converter", None) is not None):
            if self.distorter is None:
                conv_fn = (lambda batch, self=self, space=sp:
                           self.view_converter.get_formatted_batch(batch,
                                                                   space))
            else:
                conv_fn = (lambda batch, self=self, space=sp:
                           self.distorter._distort(
                               self.view_converter.get_formatted_batch(
                                   batch, space)))
        else:
            conv_fn = None
        convert.append(conv_fn)

    # TODO: Refactor
    if mode is None:
        if hasattr(self, "_iter_subset_class"):
            mode = self._iter_subset_class
        else:
            raise ValueError("iteration mode not provided and no default "
                             "mode set for %s" % str(self))
    else:
        mode = resolve_iterator_class(mode)
    if batch_size is None:
        batch_size = getattr(self, "_iter_batch_size", None)
    if num_batches is None:
        num_batches = getattr(self, "_iter_num_batches", None)
    if rng is None and mode.stochastic:
        rng = self.rng
    return FiniteDatasetIterator(self,
                                 mode(self.X.shape[0], batch_size,
                                      num_batches, rng),
                                 data_specs=data_specs,
                                 return_tuple=return_tuple,
                                 convert=convert)
def __init__(self, X=None, topo_view=None, y=None, view_converter=None,
             axes=('b', 0, 1, 'c'), rng=_default_seed, preprocessor=None,
             fit_preprocessor=False):
    """
    Parameters
    ----------
    X : ndarray, 2-dimensional, optional
        Should be supplied if `topo_view` is not. A design matrix of
        shape (number examples, number features) that defines the
        dataset.
    topo_view : ndarray, optional
        Should be supplied if X is not. An array whose first dimension
        is of length number examples. The remaining dimensions are
        examples with topological significance, e.g. for images the
        remaining axes are rows, columns, and channels.
    y : ndarray, 1-dimensional(?), optional
        Labels or targets for each example. The semantics here are not
        quite nailed down for this yet.
    view_converter : object, optional
        An object for converting between design matrices and
        topological views. Currently DefaultViewConverter is the only
        type available but later we may want to add one that uses the
        retina encoding that the U of T group uses.
    rng : object, optional
        A random number generator used for picking random indices into
        the design matrix when choosing minibatches.
    """
    self.X = X

    if view_converter is not None:
        assert topo_view is None
        self.view_converter = view_converter
    else:
        if topo_view is not None:
            self.set_topological_view(topo_view, axes)

    self.y = y
    self.compress = False
    self.design_loc = None
    if hasattr(rng, 'random_integers'):
        self.rng = rng
    else:
        self.rng = np.random.RandomState(rng)
    # Defaults for iterators
    self._iter_mode = resolve_iterator_class('sequential')
    self._iter_topo = False
    self._iter_targets = False

    if preprocessor:
        preprocessor.apply(self, can_fit=fit_preprocessor)
    self.preprocessor = preprocessor
def __init__(self, X=None, topo_view=None, y=None, tags=None,
             view_converter=None, axes=('b', 0, 1, 'c'),
             rng=_default_seed, preprocessor=None, fit_preprocessor=False):
    """
    Parameters
    ----------
    X : ndarray, 2-dimensional, optional
        Should be supplied if `topo_view` is not. A design matrix of
        shape (number examples, number features) that defines the
        dataset.
    topo_view : ndarray, optional
        Should be supplied if X is not. An array whose first dimension
        is of length number examples. The remaining dimensions are
        examples with topological significance, e.g. for images the
        remaining axes are rows, columns, and channels.
    y : ndarray, 1-dimensional(?), optional
        Labels or targets for each example. The semantics here are not
        quite nailed down for this yet.
    tags : ndarray, optional
        First dimension is the number of examples, other dimensions
        contain extra information about the examples. Used to keep
        track of position information for randomly cropped patches.
    view_converter : object, optional
        An object for converting between design matrices and
        topological views. Currently DefaultViewConverter is the only
        type available but later we may want to add one that uses the
        retina encoding that the U of T group uses.
    rng : object, optional
        A random number generator used for picking random indices into
        the design matrix when choosing minibatches.
    """
    self.X = X

    if view_converter is not None:
        assert topo_view is None
        self.view_converter = view_converter
    else:
        if topo_view is not None:
            self.set_topological_view(topo_view, axes)

    self.y = y
    self.tags = tags
    self.compress = False
    self.design_loc = None
    if hasattr(rng, 'random_integers'):
        self.rng = rng
    else:
        self.rng = np.random.RandomState(rng)
    # Defaults for iterators
    self._iter_mode = resolve_iterator_class('sequential')
    self._iter_topo = False
    self._iter_targets = False

    if preprocessor:
        preprocessor.apply(self, can_fit=fit_preprocessor)
    self.preprocessor = preprocessor
def __init__(self, which_set, which_day, path=None):
    self.daylist = range(21, 32)
    self.mapper = {'train': 0, 'valid': 1, 'test': 2}
    assert which_set in self.mapper.keys()
    assert which_day in self.daylist

    f = open('/home/whale/Documents/click/dayrows.pkl', 'r')
    self.dayrows = cPickle.load(f)
    f.close()

    self.__dict__.update(locals())
    del self.self

    if path is not None:
        raise NotImplementedError("Data path is the current directory.")

    # load data
    file_n = "click_data.h5"
    self.h5file = tables.open_file(file_n, mode='r')
    if which_set == 'test':
        test_group = self.h5file.root.test.test_raw
        self.X = test_group.X_t
        self.y = None
        self.samples = slice(0, self.X.shape[0])
        self.sample_index = self.samples.start
        self.examples = self.X.shape[0]
    else:
        train_group = self.h5file.root.train.train_raw
        self.X = train_group.X
        self.y = train_group.y
        self.samples = slice(sum(self.dayrows[:which_day - 21]),
                             sum(self.dayrows[:which_day - 20]))
        self.sample_index = self.samples.start
        self.examples = self.dayrows[which_day - 21]

    max_labels = 2
    X_source = 'features'
    X_space = VectorSpace(dim=23)
    if self.y is None:
        space = X_space
        source = X_source
    else:
        y_space = IndexSpace(dim=1, max_labels=max_labels)
        y_source = 'targets'
        space = CompositeSpace((X_space, y_space))
        source = (X_source, y_source)
    self.data_specs = (space, source)
    self.X_space = X_space

    # Defaults for iterators
    self._iter_mode = resolve_iterator_class('sequential')
    self._iter_topo = False
    self._iter_targets = False
    self._iter_data_specs = (self.X_space, 'features')
def iterator(self, mode=None, batch_size=None, num_batches=None, topo=None,
             targets=None, rng=None, data_specs=None, return_tuple=False):
    """
    Method inherited from Dataset.
    """
    self.mode = mode
    self.batch_size = batch_size
    self._targets = targets
    self._return_tuple = return_tuple
    if data_specs is None:
        data_specs = self._iter_data_specs

    # If there is a view_converter, we have to use it to convert
    # the stored data for "features" into one that the iterator
    # can return.
    self.conv_fn = lambda x: x.todense()
    space, source = data_specs
    if isinstance(space, CompositeSpace):
        sub_spaces = space.components
        sub_sources = source
    else:
        sub_spaces = (space,)
        sub_sources = (source,)

    convert = []
    for sp, src in safe_zip(sub_spaces, sub_sources):
        # Note: the original tested `src == 'features' or 'targets'`,
        # which is always true; the membership test below is what was
        # intended.
        if src in ('features', 'targets'):
            conv_fn = self.conv_fn
        else:
            conv_fn = None
        convert.append(conv_fn)

    if mode is None:
        if hasattr(self, '_iter_subset_class'):
            mode = self._iter_subset_class
        else:
            raise ValueError('iteration mode not provided and no default '
                             'mode set for %s' % str(self))
    else:
        mode = resolve_iterator_class(mode)
    return FiniteDatasetIterator(self,
                                 mode(self.X.shape[0], batch_size,
                                      num_batches, rng),
                                 data_specs=data_specs,
                                 return_tuple=return_tuple,
                                 convert=convert)
def iterator(self, mode=None, batch_size=None, num_batches=None, topo=None,
             targets=None, rng=None, data_specs=None, return_tuple=False):
    """
    Method inherited from Dataset.
    """
    self.mode = mode
    self.batch_size = batch_size
    self._targets = targets
    self._return_tuple = return_tuple
    if data_specs is None:
        data_specs = self._iter_data_specs

    # If there is a view_converter, we have to use it to convert
    # the stored data for "features" into one that the iterator
    # can return.
    space, source = data_specs
    if isinstance(space, CompositeSpace):
        sub_spaces = space.components
        sub_sources = source
    else:
        sub_spaces = (space,)
        sub_sources = (source,)

    convert = []
    for sp, src in safe_zip(sub_spaces, sub_sources):
        if src == 'features':
            conv_fn = lambda x: x.todense()
        elif src == 'targets':
            conv_fn = lambda x: x
        else:
            conv_fn = None
        convert.append(conv_fn)

    if mode is None:
        if hasattr(self, '_iter_subset_class'):
            mode = self._iter_subset_class
        else:
            raise ValueError('iteration mode not provided and no default '
                             'mode set for %s' % str(self))
    else:
        mode = resolve_iterator_class(mode)
    return FiniteDatasetIterator(self,
                                 mode(self.X.shape[0], batch_size,
                                      num_batches, rng),
                                 data_specs=data_specs,
                                 return_tuple=return_tuple,
                                 convert=convert)
def __init__(self, data=None, data_specs=None, rng=_default_seed,
             preprocessor=None, fit_preprocessor=False):
    """
    Parameters
    ----------
    data : ndarray, or tuple of ndarrays, containing the data.
        It is formatted as specified in `data_specs`. For instance, if
        `data_specs` is (VectorSpace(nfeat), 'features'), then `data`
        has to be a 2-d ndarray, of shape (nb examples, nfeat), that
        defines an unlabeled dataset. If `data_specs` is
        (CompositeSpace(Conv2DSpace(...), VectorSpace(1)),
        ('features', 'target')), then `data` has to be an (X, y) pair,
        with X being an ndarray containing images stored in the
        topological view specified by the `Conv2DSpace`, and y being a
        2-D ndarray of width 1, containing the labels or targets for
        each example.
    data_specs : (space, source) pair
        space is an instance of `Space` (possibly a `CompositeSpace`),
        and `source` is a string (or tuple of strings, if `space` is a
        `CompositeSpace`), defining the format and labels associated
        to `data`.
    rng : object, optional
        A random number generator used for picking random indices into
        the design matrix when choosing minibatches.
    preprocessor : WRITEME
    fit_preprocessor : WRITEME
    """
    # data_specs should be flat, and there should be no
    # duplicates in source, as we keep only one version
    assert is_flat_specs(data_specs)
    if isinstance(data_specs[1], tuple):
        assert sorted(set(data_specs[1])) == sorted(data_specs[1])

    self.data = data
    self.data_specs = data_specs

    self.compress = False
    self.design_loc = None
    if hasattr(rng, 'random_integers'):
        self.rng = rng
    else:
        self.rng = np.random.RandomState(rng)
    # Defaults for iterators
    self._iter_mode = resolve_iterator_class('sequential')

    if preprocessor:
        preprocessor.apply(self, can_fit=fit_preprocessor)
    self.preprocessor = preprocessor
def iterator(self, mode=None, batch_size=None, num_batches=None, rng=None,
             data_specs=None, return_tuple=False):
    """
    .. todo::

        WRITEME
    """
    if data_specs is None:
        data_specs = self._iter_data_specs

    # If there is a view_converter, we have to use it to convert
    # the stored data for "features" into one that the iterator
    # can return.
    space, source = data_specs
    if isinstance(space, CompositeSpace):
        sub_spaces = space.components
        sub_sources = source
    else:
        sub_spaces = (space,)
        sub_sources = (source,)

    convert = []
    for sp, src in safe_zip(sub_spaces, sub_sources):
        convert.append(None)

    # TODO: Refactor
    if mode is None:
        if hasattr(self, '_iter_subset_class'):
            mode = self._iter_subset_class
        else:
            raise ValueError('iteration mode not provided and no default '
                             'mode set for %s' % str(self))
    else:
        mode = resolve_iterator_class(mode)
    if batch_size is None:
        batch_size = getattr(self, '_iter_batch_size', None)
    if num_batches is None:
        num_batches = getattr(self, '_iter_num_batches', None)
    if rng is None and mode.stochastic:
        rng = self.rng

    if self.noise:
        # Draw one Gaussian noise vector per sequence for this epoch.
        lengths = [len(x) for x in self.samples_sequences]
        self.noise_this_epoch = [
            numpy.random.normal(0, self.noise, (length, 1))
            for length in lengths
        ]

    return FiniteDatasetIterator(self,
                                 mode(self.num_examples, batch_size,
                                      num_batches, rng),
                                 data_specs=data_specs,
                                 return_tuple=return_tuple,
                                 convert=convert)
def iterator(self, mode=None, batch_size=None, num_batches=None, rng=None,
             data_specs=None, return_tuple=False):
    """
    .. todo::

        WRITEME
    """
    if data_specs is None:
        data_specs = self._iter_data_specs

    # If there is a view_converter, we have to use it to convert
    # the stored data for "features" into one that the iterator
    # can return.
    space, source = data_specs
    if isinstance(space, CompositeSpace):
        sub_spaces = space.components
        sub_sources = source
    else:
        sub_spaces = (space,)
        sub_sources = (source,)

    convert = []
    for sp, src in safe_zip(sub_spaces, sub_sources):
        convert.append(None)

    # TODO: Refactor
    if mode is None:
        if hasattr(self, '_iter_subset_class'):
            mode = self._iter_subset_class
        else:
            raise ValueError('iteration mode not provided and no default '
                             'mode set for %s' % str(self))
    else:
        mode = resolve_iterator_class(mode)
    if batch_size is None:
        batch_size = getattr(self, '_iter_batch_size', None)
    if num_batches is None:
        num_batches = getattr(self, '_iter_num_batches', None)
    if rng is None and mode.stochastic:
        rng = self.rng
    return FiniteDatasetIterator(self,
                                 mode(self.num_examples, batch_size,
                                      num_batches, rng),
                                 data_specs=data_specs,
                                 return_tuple=return_tuple,
                                 convert=convert)
def iterator(self, mode="sequential", batch_size=None, num_batches=None,
             rng=None):
    """
    Method inherited from the Dataset.
    """
    if batch_size is None and mode == "sequential":
        batch_size = 100  # Has to be big enough or we'll never pick anything.
    self.batch_size = batch_size
    self.mode = resolve_iterator_class(mode)
    self.subset_iterator = self.mode(self.total_n_exs, batch_size,
                                     num_batches, rng=None)
    return EmotiwArrangerIter(self,
                              self.subset_iterator,
                              batch_size=batch_size)
def iterator(self, mode=None, batch_size=1, num_batches=None, rng=None,
             data_specs=None, return_tuple=False):
    if num_batches is None:
        num_batches = len(self.X1) / batch_size
    mode = resolve_iterator_class(mode)
    i = FiniteDatasetIterator(
        self,
        mode(len(self.X1), batch_size, num_batches, rng),
        data_specs=data_specs,
    )
    return i
def iterator(self, mode=None, batch_size=None, num_batches=None, rng=None,
             data_specs=None, return_tuple=False):
    if mode is None:
        if hasattr(self, '_iter_subset_class'):
            mode = self._iter_subset_class
        else:
            raise ValueError('iteration mode not provided and no default '
                             'mode set for %s' % str(self))
    else:
        mode = resolve_iterator_class(mode)
    if batch_size is None:
        batch_size = getattr(self, '_iter_batch_size', None)
    if num_batches is None:
        num_batches = getattr(self, '_iter_num_batches', None)
    if rng is None and mode.stochastic:
        rng = self.rng
    if data_specs is None:
        data_specs = getattr(self, '_iter_data_specs', None)

    # TODO: figure out where to do the scaling more cleanly.
    def list_to_scaled_array(batch):
        # batch is either a 4D ndarray, or a list of length 1
        # containing a 4D ndarray. Make it a 5D ndarray,
        # with shape 1 on the first dimension.
        # Also scale it from [0, 255] to [0, 1].
        if isinstance(batch, list):
            assert len(batch) == 1
            batch = batch[0]
        batch = batch.astype(config.floatX)
        batch /= 255.
        return batch[np.newaxis]

    convert_fns = []
    for space in data_specs[0].components:
        if (isinstance(space, FaceTubeSpace)
                and space.axes[0] == 'b'):
            convert_fns.append(list_to_scaled_array)
        else:
            convert_fns.append(None)

    return FiniteDatasetIteratorVariableSize(
        self,
        mode(self.n_samples, batch_size, num_batches, rng),
        data_specs=data_specs,
        return_tuple=return_tuple,
        convert_fns=convert_fns)
def __init__(self, X, y=None, rng=None):
    self.X = X.astype(theano.config.floatX)
    self.y = y
    self.compress = False
    self.design_loc = None
    if rng is None:
        rng = np.random.RandomState(SparseDesignMatrix._default_seed)
    self.rng = rng
    # Defaults for iterators
    self._iter_mode = resolve_iterator_class('sequential')
    self._iter_topo = False
    self._iter_targets = False
def set_iteration_scheme(self, mode=None, batch_size=None,
                         num_batches=None, topo=False, targets=False):
    if mode is not None:
        self._iter_subset_class = mode = resolve_iterator_class(mode)
    elif hasattr(self, "_iter_subset_class"):
        mode = self._iter_subset_class
    else:
        raise ValueError("iteration mode not provided and no default "
                         "mode set for %s" % str(self))
    # If this didn't raise an exception, we should be fine.
    self._iter_batch_size = batch_size
    self._iter_num_batches = num_batches
    self._iter_topo = topo
    self._iter_targets = targets
    # Try to create an iterator with these settings.
    rng = self.rng if mode.stochastic else None
    test = self.iterator(mode, batch_size, num_batches, topo, rng=rng)
def __init__(self, data_generator=None, n_classes=101, n_examples=10,
             n_frames=10, n_features=4096):
    """
    :type data_generator: function
    :param data_generator: function used to generate data in the form of
        an X, y tuple. X is a 3-dimensional array with dimensions
        (examples, frames/time, features). y is a 2-dimensional array
        with dimensions (examples, target values). Optional value
        defaults to generating random, therefore 'hard', data.

    :type n_classes: int
    :param n_classes: the number of possible target values or n_classes

    :type n_examples: int
    :param n_examples: the number of examples to be generated in the
        dataset

    :type n_frames: int
    :param n_frames: the number of frames or time steps in each example

    :type n_features: int
    :param n_features: the number of features in each time step
    """
    rng = np.random.RandomState(seed=42)
    self.n_features = n_features
    self.n_examples = n_examples

    if data_generator is None:
        data_generator = hard_data_generator
    self.data_generator = data_generator

    self.X, self.y = self.data_generator(n_classes, n_examples,
                                         n_frames, n_features)

    features_space = VectorSequenceSpace(dim=self.n_features)
    # features_space = SequenceDataSpace(VectorSpace(dim=self.n_features))
    targets_space = VectorSequenceSpace(dim=1)
    # targets_space = SequenceDataSpace(VectorSpace(dim=1))

    space_components = [features_space, targets_space]
    space = CompositeSpace(space_components)
    source = ('features', 'targets')
    self.data_specs = (space, source)

    self._iter_mode = resolve_iterator_class('shuffled_sequential')
    self._iter_data_specs = (CompositeSpace((features_space,
                                             targets_space)), source)
def iterator(self, mode=None, batch_size=None, num_batches=None, rng=None,
             data_specs=None, return_tuple=False):
    """
    .. todo::

        WRITEME
    """
    if data_specs is None:
        data_specs = self._iter_data_specs

    # If there is a view_converter, we have to use it to convert
    # the stored data for "features" into one that the iterator
    # can return.
    space, source = data_specs
    if isinstance(space, CompositeSpace):
        sub_spaces = space.components
        sub_sources = source
    else:
        sub_spaces = (space,)
        sub_sources = (source,)

    convert = []
    for sp, src in safe_zip(sub_spaces, sub_sources):
        convert.append(None)

    # TODO: Refactor
    if mode is None:
        if hasattr(self, "_iter_subset_class"):
            mode = self._iter_subset_class
        else:
            raise ValueError("iteration mode not provided and no default "
                             "mode set for %s" % str(self))
    else:
        mode = resolve_iterator_class(mode)
    if batch_size is None:
        batch_size = getattr(self, "_iter_batch_size", None)
    # if num_batches is None:
    #     num_batches = getattr(self, '_iter_num_batches', None)
    num_batches = self._iter_num_batches
    if rng is None and mode.stochastic:
        rng = self.rng
    return FiniteDatasetIterator(
        self,
        mode(self.num_examples, batch_size, num_batches, rng),
        data_specs=data_specs,
        return_tuple=return_tuple,
        convert=convert,
    )
def __init__(self, dataset, batch_size, num_batches, mode,
             iterator_post_processors=(
                 GaussianNoisePostProcessor(.01, 0, .5),
                 GaussianNoisePostProcessor(.1, 0, .001))):
    def _validate_batch_size(batch_size, dataset):
        if not batch_size:
            raise ValueError("batch size is none")
        num_examples = dataset.get_num_examples()
        if batch_size > num_examples:
            raise ValueError("batch size: %i is too large, dataset has "
                             "%i examples" % (batch_size, num_examples))
        if batch_size < 0:
            raise ValueError("batch size: %i cannot be negative"
                             % batch_size)
        if not isinstance(batch_size, int):
            raise ValueError("batch_size is not an int")

    def _validate_num_batches(num_batches):
        if not num_batches:
            raise ValueError("num_batches is none")
        if num_batches < 0:
            raise ValueError("num_batches: %i cannot be negative"
                             % num_batches)
        if not isinstance(num_batches, int):
            raise ValueError("num_batches is not an int")

    self.dataset = dataset
    dataset_size = dataset.get_num_examples()
    _validate_batch_size(batch_size, dataset)
    _validate_num_batches(num_batches)

    subset_iterator_class = resolve_iterator_class(mode)
    self._subset_iterator = subset_iterator_class(dataset_size, batch_size,
                                                  num_batches)
    self.iterator_post_processors = iterator_post_processors
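# Illustration only: what a Gaussian-noise post-processor like the ones
# above plausibly does to a batch. The real GaussianNoisePostProcessor API
# is not shown in this file, so the corruption_prob/mean/std names below
# are assumptions matching its three positional arguments.
import numpy as np

def gaussian_noise(batch, corruption_prob=.01, mean=0., std=.5,
                   rng=np.random):
    # Corrupt a random subset of entries with additive Gaussian noise.
    mask = rng.uniform(size=batch.shape) < corruption_prob
    return batch + mask * rng.normal(mean, std, size=batch.shape)

batch = np.zeros((4, 3))
noisy = gaussian_noise(batch)  # most entries unchanged, a few perturbed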
def __init__(self, which_set, context_len, shuffle=False):
    """
    Loads the data and turns it into n-grams
    """
    self.__dict__.update(locals())
    del self.self

    path = ("${PYLEARN2_DATA_PATH}/PennTreebankCorpus/" +
            "penntree_char_and_word.npz")
    npz_data = serial.load(path)
    if which_set == 'train':
        self._raw_data = npz_data['train_words']
    elif which_set == 'valid':
        self._raw_data = npz_data['valid_words']
    elif which_set == 'test':
        self._raw_data = npz_data['test_words']
    else:
        raise ValueError("Dataset must be one of 'train', 'valid' "
                         "or 'test'")
    del npz_data  # Free up some memory?

    self._data = as_strided(self._raw_data,
                            shape=(len(self._raw_data) - context_len,
                                   context_len + 1),
                            strides=(self._raw_data.itemsize,
                                     self._raw_data.itemsize))

    super(PennTreebank, self).__init__(
        X=self._data[:, :-1],
        y=self._data[:, -1:],
        X_labels=10000, y_labels=10000
    )

    print which_set
    print self._data[:, -1:].shape

    if shuffle:
        warnings.warn("Note that the PennTreebank samples are only "
                      "shuffled when the iterator method is used to "
                      "retrieve them.")
        self._iter_subset_class = resolve_iterator_class(
            'shuffled_sequential'
        )
def __init__(self, alpha_list=[1.4], beta_list=[0.3],
             init_state_list=[numpy.array([0, 0])], num_samples=1000,
             frame_length=1, rng=None):
    # Validate parameters and set member variables
    self.alpha_list = alpha_list
    self.beta_list = beta_list
    if num_samples <= 0:
        raise ValueError("num_samples must be positive.")
    self.num_samples = num_samples
    self.num_examples = len(alpha_list)
    self.frame_length = frame_length
    self.init_state_list = init_state_list

    # Initialize RNG
    if rng is None:
        self.rng = numpy.random.RandomState(self._default_seed)
    else:
        self.rng = numpy.random.RandomState(rng)

    X, y = self._generate_data()
    self.data = (X, y)

    # DataSpecs
    features_space = VectorSpace(dim=2 * self.frame_length)
    features_source = 'features'
    targets_space = VectorSpace(dim=2)
    targets_source = 'targets'
    space = CompositeSpace([features_space, targets_space])
    source = tuple([features_source, targets_source])
    self.data_specs = (space, source)

    # Defaults for iterators
    self._iter_mode = resolve_iterator_class('shuffled_sequential')
    self._iter_data_specs = (CompositeSpace((features_space,
                                             targets_space)),
                             (features_source, targets_source))
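# The defaults alpha=1.4, beta=0.3 and initial state (0, 0) are the classic
# Henon map parameters, which suggests _generate_data iterates that
# recurrence. A minimal sketch under that assumption (the real
# _generate_data is not shown here):
import numpy

def henon_trajectory(alpha=1.4, beta=0.3, init_state=(0.0, 0.0),
                     num_samples=1000):
    # Iterate the Henon map: x' = 1 - alpha * x**2 + y, y' = beta * x.
    states = numpy.empty((num_samples, 2))
    x, y = init_state
    for t in range(num_samples):
        states[t] = (x, y)
        x, y = 1.0 - alpha * x ** 2 + y, beta * x
    return states  # each row is one (x, y) state; rows are consecutive steps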
def __init__(self, data=None, data_specs=None, rng=_default_seed,
             preprocessor=None, fit_preprocessor=False):
    # data_specs should be flat, and there should be no
    # duplicates in source, as we keep only one version
    assert is_flat_specs(data_specs)
    if isinstance(data_specs[1], tuple):
        assert sorted(set(data_specs[1])) == sorted(data_specs[1])

    self.data = data
    self.data_specs = data_specs

    self.compress = False
    self.design_loc = None
    self.rng = make_np_rng(rng, which_method='random_integers')
    # Defaults for iterators
    self._iter_mode = resolve_iterator_class('sequential')

    if preprocessor:
        preprocessor.apply(self, can_fit=fit_preprocessor)
    self.preprocessor = preprocessor
def iterator(self, mode=None, batch_size=None, num_batches=None, topo=None,
             return_tuple=False, targets=None, rng=None, data_specs=None):
    mode = resolve_iterator_class(mode)
    self.data_specs = data_specs
    if self.do_dropout:  # originally misspelled `do_dropoout`
        return MultiViewDatasetIteratorDropout(
            self,
            mode(self.data[0].shape[0], batch_size, num_batches, rng),
            self.p_dropout, 0.5, 0.5,
            data_specs=data_specs
        )
    else:
        return FiniteDatasetIterator(
            self,
            mode(self.data[0].shape[0], batch_size, num_batches, rng),
            data_specs=data_specs
        )
def iterator(self, mode=None, batch_size=None, num_batches=None, topo=None,
             targets=None, rng=None):
    """
    Method inherited from the Dataset.
    """
    self.mode = mode
    self.batch_size = batch_size
    self._targets = targets
    mode = resolve_iterator_class(mode)
    self.subset_iterator = mode(self.total_n_exs, batch_size, num_batches,
                                rng=None)
    # Pass the instantiated subset iterator, not the mode class (the
    # original passed `mode` here, unlike the sibling implementation
    # above).
    return EmotiwArrangerIter(self,
                              self.subset_iterator,
                              batch_size=batch_size)
def __init__(self, which_set, context_len, shuffle=True):
    """
    Loads the data and turns it into n-grams
    """
    self.__dict__.update(locals())
    del self.self

    path = ("${PYLEARN2_DATA_PATH}/PennTreebankCorpus/" +
            "penntree_char_and_word.npz")
    npz_data = serial.load(path)
    if which_set == 'train':
        self._raw_data = npz_data['train_words']
    elif which_set == 'valid':
        self._raw_data = npz_data['valid_words']
    elif which_set == 'test':
        self._raw_data = npz_data['test_words']
    else:
        raise ValueError("Dataset must be one of 'train', 'valid' "
                         "or 'test'")
    del npz_data  # Free up some memory?

    self._data = as_strided(self._raw_data,
                            shape=(len(self._raw_data) - context_len,
                                   context_len + 1),
                            strides=(self._raw_data.itemsize,
                                     self._raw_data.itemsize))

    super(PennTreebank, self).__init__(X=self._data[:, :-1],
                                       y=self._data[:, -1:],
                                       X_labels=10000, y_labels=10000)

    if shuffle:
        warnings.warn("Note that the PennTreebank samples are only "
                      "shuffled when the iterator method is used to "
                      "retrieve them.")
        self._iter_subset_class = resolve_iterator_class(
            'shuffled_sequential')
def iterator(self, mode=None, batch_size=None, num_batches=None, topo=None,
             targets=None, rng=None, data_specs=None, return_tuple=False):
    space, source = self.data_specs
    subspaces = space.components
    subsources = source
    mode = resolve_iterator_class("shuffled_sequential")
    rng = self.rng
    # rng = None
    assert rng is not None
    subset_iterator = mode(self.y.shape[0], batch_size, num_batches,
                           rng=rng)
    return FiniteDatasetIterator(
        self,
        subset_iterator=subset_iterator,
        data_specs=data_specs,
        return_tuple=return_tuple,
        convert=self.convert)
def __init__(self, X=None, topo_view=None, y=None, view_converter=None,
             axes=('b', 0, 1, 'c'), rng=_default_seed, preprocessor=None,
             fit_preprocessor=False, max_labels=None):
    self.X = X
    self.y = y
    self.max_labels = max_labels

    if max_labels is not None:
        assert y is not None
        assert np.all(y < max_labels)

    if topo_view is not None:
        assert view_converter is None
        self.set_topological_view(topo_view, axes)
    else:
        assert X is not None, ("DenseDesignMatrix needs to be provided "
                               "with either topo_view, or X")
        if view_converter is not None:
            self.view_converter = view_converter

            # Get the topo_space (usually Conv2DSpace) from the
            # view_converter
            if not hasattr(view_converter, 'topo_space'):
                raise NotImplementedError("Not able to get a topo_space "
                                          "from this converter: %s"
                                          % view_converter)

            # self.X_topo_space stores a "default" topological space that
            # will be used only when self.iterator is called without a
            # data_specs, and with "topo=True", which is deprecated.
            self.X_topo_space = view_converter.topo_space
        else:
            self.X_topo_space = None

        # Update data specs, if not done in set_topological_view
        X_space = VectorSpace(dim=self.X.shape[1])
        X_source = 'features'
        if y is None:
            space = X_space
            source = X_source
        else:
            if self.y.ndim == 1:
                dim = 1
            else:
                dim = self.y.shape[-1]
            y_space = VectorSpace(dim=dim)
            y_source = 'targets'
            space = CompositeSpace((X_space, y_space))
            source = (X_source, y_source)
        self.data_specs = (space, source)
        self.X_space = X_space

    self.compress = False
    self.design_loc = None
    self.rng = make_np_rng(rng, which_method="random_integers")
    # Defaults for iterators
    self._iter_mode = resolve_iterator_class('sequential')
    self._iter_topo = False
    self._iter_targets = False
    self._iter_data_specs = (self.X_space, 'features')

    if preprocessor:
        preprocessor.apply(self, can_fit=fit_preprocessor)
    self.preprocessor = preprocessor
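# A sketch of how the constructor above is typically called. The class
# name DenseDesignMatrix comes from its own assert message; the X/y shapes
# are made up to mirror the click datasets earlier in this file, and the
# max_labels keyword assumes this variant of the class is in scope:
import numpy as np

X = np.random.uniform(size=(100, 23)).astype('float32')
y = np.random.randint(0, 2, size=(100, 1))
dataset = DenseDesignMatrix(X=X, y=y, max_labels=2)
# After construction, dataset.data_specs pairs VectorSpace(dim=23) with
# 'features' and VectorSpace(dim=1) with 'targets'.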
def iterator(self, mode=None, batch_size=None, num_batches=None, topo=None,
             targets=None, rng=None, data_specs=None, return_tuple=False):
    warnings.warn("Overloading this method is not necessary with the new "
                  "interface change and this will be removed around "
                  "November 7th 2013", stacklevel=2)

    if topo is not None or targets is not None:
        if data_specs is not None:
            raise ValueError("In DenseDesignMatrix.iterator, both "
                             "the `data_specs` argument and deprecated "
                             "arguments `topo` or `targets` were "
                             "provided.",
                             (data_specs, topo, targets))
        warnings.warn("Usage of `topo` and `target` arguments are being "
                      "deprecated, and will be removed around November "
                      "7th, 2013. `data_specs` should be used instead.",
                      stacklevel=2)

        # build data_specs from topo and targets if needed
        if topo is None:
            topo = getattr(self, '_iter_topo', False)
        if topo:
            # self.iterator is called without a data_specs, and with
            # "topo=True", so we use the default topological space
            # stored in self.X_topo_space
            assert self.X_topo_space is not None
            X_space = self.X_topo_space
        else:
            X_space = self.X_space

        if targets is None:
            targets = getattr(self, '_iter_targets', False)
        if targets:
            assert self.y is not None
            y_space = self.data_specs[0][1]
            space = (X_space, y_space)
            source = ('features', 'targets')
        else:
            space = X_space
            source = 'features'

        data_specs = (space, source)
        _deprecated_interface = True
    else:
        _deprecated_interface = False

    # TODO: Refactor
    if mode is None:
        if hasattr(self, '_iter_subset_class'):
            mode = self._iter_subset_class
        else:
            raise ValueError('iteration mode not provided and no default '
                             'mode set for %s' % str(self))
    else:
        mode = resolve_iterator_class(mode)
    if batch_size is None:
        batch_size = getattr(self, '_iter_batch_size', None)
    if num_batches is None:
        num_batches = getattr(self, '_iter_num_batches', None)
    if rng is None and mode.stochastic:
        rng = self.rng
    if data_specs is None:
        data_specs = self._iter_data_specs

    if _deprecated_interface:
        return FiniteDatasetIteratorPyTables(
            self,
            mode(self.X.shape[0], batch_size, num_batches, rng),
            data_specs=data_specs,
            return_tuple=return_tuple)
    else:
        return FiniteDatasetIterator(
            self,
            mode(self.X.shape[0], batch_size, num_batches, rng),
            data_specs=data_specs,
            return_tuple=return_tuple)
def __init__(self, which_set, frame_length, overlap=0,
             frames_per_example=1, start=0, stop=None, audio_only=False,
             rng=_default_seed):
    """
    Parameters
    ----------
    which_set : str
        Either "train", "valid" or "test"
    frame_length : int
        Number of acoustic samples contained in a frame
    overlap : int, optional
        Number of overlapping acoustic samples for two consecutive
        frames. Defaults to 0, meaning frames don't overlap.
    frames_per_example : int, optional
        Number of frames in a training example. Defaults to 1.
    start : int, optional
        Starting index of the sequences to use. Defaults to 0.
    stop : int, optional
        Ending index of the sequences to use. Defaults to `None`,
        meaning sequences are selected all the way to the end of the
        array.
    audio_only : bool, optional
        Whether to load only the raw audio and no auxiliary
        information. Defaults to `False`.
    rng : object, optional
        A random number generator used for picking random indices into
        the design matrix when choosing minibatches.
    """
    self.frame_length = frame_length
    self.overlap = overlap
    self.frames_per_example = frames_per_example
    self.offset = self.frame_length - self.overlap
    self.audio_only = audio_only

    # RNG initialization
    if hasattr(rng, 'random_integers'):
        self.rng = rng
    else:
        self.rng = numpy.random.RandomState(rng)

    # Load data from disk
    self._load_data(which_set)
    # Standardize data
    for i, sequence in enumerate(self.raw_wav):
        self.raw_wav[i] = (sequence - TIMIT._mean) / TIMIT._std

    if not self.audio_only:
        self.num_phones = numpy.max([numpy.max(sequence) for sequence
                                     in self.phones]) + 1
        self.num_phonemes = numpy.max([numpy.max(sequence) for sequence
                                       in self.phonemes]) + 1
        self.num_words = numpy.max([numpy.max(sequence) for sequence
                                    in self.words]) + 1
        # The following is hard-coded. However, the way it is done above
        # could be problematic if a max value (the max over the whole
        # dataset (train + valid + test)) is not present in at least
        # one of the three subsets. This is the case for speakers. It
        # is not the case for phones.
        self.num_speakers = 630

    # Slice data
    if stop is not None:
        self.raw_wav = self.raw_wav[start:stop]
        if not self.audio_only:
            self.phones = self.phones[start:stop]
            self.phonemes = self.phonemes[start:stop]
            self.words = self.words[start:stop]
            self.speaker_id = self.speaker_id[start:stop]
    else:
        self.raw_wav = self.raw_wav[start:]
        if not self.audio_only:
            self.phones = self.phones[start:]
            self.phonemes = self.phonemes[start:]
            self.words = self.words[start:]
            self.speaker_id = self.speaker_id[start:]

    examples_per_sequence = [0]

    for sequence_id, samples_sequence in enumerate(self.raw_wav):
        if not self.audio_only:
            # Phones segmentation
            phones_sequence = self.phones[sequence_id]
            phones_segmented_sequence = segment_axis(phones_sequence,
                                                     frame_length,
                                                     overlap)
            self.phones[sequence_id] = phones_segmented_sequence
            # (A disabled variant reduced each segmented label window to
            # its per-frame mode, e.g.
            #     scipy.stats.mode(phones_segmented_sequence,
            #                      axis=1)[0].flatten().astype('int')
            # and collected the results in per-type lists; the same
            # disabled reduction applied to the phonemes and words
            # segmentations below.)

            # Phonemes segmentation
            phonemes_sequence = self.phonemes[sequence_id]
            phonemes_segmented_sequence = segment_axis(phonemes_sequence,
                                                       frame_length,
                                                       overlap)
            self.phonemes[sequence_id] = phonemes_segmented_sequence

            # Words segmentation
            words_sequence = self.words[sequence_id]
            words_segmented_sequence = segment_axis(words_sequence,
                                                    frame_length,
                                                    overlap)
            self.words[sequence_id] = words_segmented_sequence

        # TODO: look at this, does it force copying the data?
        # Sequence segmentation
        samples_segmented_sequence = segment_axis(samples_sequence,
                                                  frame_length,
                                                  overlap)
        self.raw_wav[sequence_id] = samples_segmented_sequence

        # TODO: change me
        # Generate features/targets/phones/phonemes/words map
        num_frames = samples_segmented_sequence.shape[0]
        num_examples = num_frames - self.frames_per_example
        examples_per_sequence.append(num_examples)

    self.cumulative_example_indexes = numpy.cumsum(examples_per_sequence)
    self.samples_sequences = self.raw_wav
    if not self.audio_only:
        self.phones_sequences = self.phones
        self.phonemes_sequences = self.phonemes
        self.words_sequences = self.words
    self.num_examples = self.cumulative_example_indexes[-1]

    # DataSpecs
    features_space = VectorSpace(
        dim=self.frame_length * self.frames_per_example
    )
    features_source = 'features'

    def features_map_fn(indexes):
        rval = []
        for sequence_index, example_index in self._fetch_index(indexes):
            rval.append(self.samples_sequences[sequence_index][
                example_index:example_index + self.frames_per_example
            ].ravel())
        return rval

    targets_space = VectorSpace(dim=self.frame_length)
    targets_source = 'targets'

    def targets_map_fn(indexes):
        rval = []
        for sequence_index, example_index in self._fetch_index(indexes):
            rval.append(self.samples_sequences[sequence_index][
                example_index + self.frames_per_example
            ].ravel())
        return rval

    space_components = [features_space, targets_space]
    source_components = [features_source, targets_source]
    map_fn_components = [features_map_fn, targets_map_fn]
    batch_components = [None, None]

    if not self.audio_only:
        phones_space = IndexSpace(
            max_labels=self.num_phones, dim=1,
            dtype=str(self.phones_sequences[0].dtype))
        phones_source = 'phones'

        def phones_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                rval.append(self.phones_sequences[sequence_index][
                    example_index + self.frames_per_example
                ].ravel())
            return rval

        phonemes_space = IndexSpace(
            max_labels=self.num_phonemes, dim=1,
            dtype=str(self.phonemes_sequences[0].dtype))
        phonemes_source = 'phonemes'

        def phonemes_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                rval.append(self.phonemes_sequences[sequence_index][
                    example_index + self.frames_per_example
                ].ravel())
            return rval

        words_space = IndexSpace(
            max_labels=self.num_words, dim=1,
            dtype=str(self.words_sequences[0].dtype))
        words_source = 'words'

        def words_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                rval.append(self.words_sequences[sequence_index][
                    example_index + self.frames_per_example
                ].ravel())
            return rval

        speaker_id_space = IndexSpace(max_labels=self.num_speakers,
                                      dim=1,
                                      dtype=str(self.speaker_id.dtype))
        speaker_id_source = 'speaker_id'

        def speaker_id_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                rval.append(self.speaker_id[sequence_index].ravel())
            return rval

        dialect_space = IndexSpace(max_labels=8, dim=1, dtype='int32')
        dialect_source = 'dialect'

        def dialect_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                info = self.speaker_info_list[
                    self.speaker_id[sequence_index]]
                rval.append(index_from_one_hot(info[1:9]))
            return rval

        education_space = IndexSpace(max_labels=6, dim=1, dtype='int32')
        education_source = 'education'

        def education_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                info = self.speaker_info_list[
                    self.speaker_id[sequence_index]]
                rval.append(index_from_one_hot(info[9:15]))
            return rval

        race_space = IndexSpace(max_labels=8, dim=1, dtype='int32')
        race_source = 'race'

        def race_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                info = self.speaker_info_list[
                    self.speaker_id[sequence_index]]
                rval.append(index_from_one_hot(info[16:24]))
            return rval

        gender_space = IndexSpace(max_labels=2, dim=1, dtype='int32')
        gender_source = 'gender'

        def gender_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(
                    indexes):
                info = self.speaker_info_list[
                    self.speaker_id[sequence_index]]
                rval.append(index_from_one_hot(info[24:]))
            return rval

        space_components.extend([phones_space, phonemes_space,
                                 words_space, speaker_id_space,
                                 dialect_space, education_space,
                                 race_space, gender_space])
        source_components.extend([phones_source, phonemes_source,
                                  words_source, speaker_id_source,
                                  dialect_source, education_source,
                                  race_source, gender_source])
        map_fn_components.extend([phones_map_fn, phonemes_map_fn,
                                  words_map_fn, speaker_id_map_fn,
                                  dialect_map_fn, education_map_fn,
                                  race_map_fn, gender_map_fn])
        batch_components.extend([None, None, None, None,
                                 None, None, None, None])

    space = CompositeSpace(space_components)
    source = tuple(source_components)
    self.data_specs = (space, source)
    self.map_functions = tuple(map_fn_components)
    self.batch_buffers = batch_components

    # Defaults for iterators
    self._iter_mode = resolve_iterator_class('shuffled_sequential')
    self._iter_data_specs = (CompositeSpace((features_space,
                                             targets_space)),
                             (features_source, targets_source))
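# --- Worked sketch (illustrative, not part of the original source) ---
# The map functions above rely on self._fetch_index to turn a flat
# example index into a (sequence, offset) pair via the cumulative counts
# built in cumulative_example_indexes. The standalone function below
# shows one way such a mapping can work; the real _fetch_index is
# assumed to do something equivalent.
def _demo_fetch_index(cumulative_example_indexes, indexes):
    import numpy
    rval = []
    for index in indexes:
        # Find the last sequence whose cumulative start is <= index.
        sequence_index = numpy.searchsorted(cumulative_example_indexes,
                                            index, side='right') - 1
        example_index = index - cumulative_example_indexes[sequence_index]
        rval.append((sequence_index, example_index))
    return rval

# e.g. with cumulative counts [0, 10, 25]:
#     _demo_fetch_index(numpy.array([0, 10, 25]), [0, 9, 10, 24])
#     == [(0, 0), (0, 9), (1, 0), (1, 14)]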
def __init__(self, which_set, frame_length, start=0, stop=None,
             audio_only=False, rng=_default_seed):
    """
    Parameters
    ----------
    which_set : str
        Either "train", "valid" or "test"
    frame_length : int
        Number of acoustic samples contained in the sliding window
    start : int, optional
        Starting index of the sequences to use. Defaults to 0.
    stop : int, optional
        Ending index of the sequences to use. Defaults to `None`,
        meaning sequences are selected all the way to the end of the
        array.
    audio_only : bool, optional
        Whether to load only the raw audio and no auxiliary
        information. Defaults to `False`.
    rng : object, optional
        A random number generator used for picking random indices into
        the design matrix when choosing minibatches.
    """
    self.frame_length = frame_length
    self.audio_only = audio_only

    # RNG initialization
    if hasattr(rng, 'random_integers'):
        self.rng = rng
    else:
        self.rng = numpy.random.RandomState(rng)

    # Load data from disk
    self._load_data(which_set)
    # Standardize data
    for i, sequence in enumerate(self.raw_wav):
        self.raw_wav[i] = (sequence - TIMIT._mean) / TIMIT._std

    if not self.audio_only:
        self.num_phones = numpy.max([numpy.max(sequence) for sequence
                                     in self.phones]) + 1
        self.num_phonemes = numpy.max([numpy.max(sequence) for sequence
                                       in self.phonemes]) + 1
        self.num_words = numpy.max([numpy.max(sequence) for sequence
                                    in self.words]) + 1

    # Slice data
    if stop is not None:
        self.raw_wav = self.raw_wav[start:stop]
        if not self.audio_only:
            self.phones = self.phones[start:stop]
            self.phonemes = self.phonemes[start:stop]
            self.words = self.words[start:stop]
    else:
        self.raw_wav = self.raw_wav[start:]
        if not self.audio_only:
            self.phones = self.phones[start:]
            self.phonemes = self.phonemes[start:]
            self.words = self.words[start:]

    samples_sequences = []
    targets_sequences = []
    phones_sequences = []
    phonemes_sequences = []
    words_sequences = []
    for sequence_id, samples_sequence in enumerate(self.raw_wav):
        # Sequence segmentation
        samples_segmented_sequence = segment_axis(samples_sequence,
                                                  frame_length,
                                                  frame_length - 1)[:-1]
        samples_sequences.append(samples_segmented_sequence)
        targets_sequences.append(
            samples_sequence[frame_length:].reshape(
                (samples_sequence[frame_length:].shape[0], 1)))
        if not self.audio_only:
            target_phones = self.phones[sequence_id][frame_length:]
            phones_sequences.append(
                target_phones.reshape((target_phones.shape[0], 1)))
            target_phonemes = self.phonemes[sequence_id][frame_length:]
            phonemes_sequences.append(
                target_phonemes.reshape((target_phonemes.shape[0], 1)))
            target_words = self.words[sequence_id][frame_length:]
            words_sequences.append(
                target_words.reshape((target_words.shape[0], 1)))

    del self.raw_wav
    self.samples_sequences = samples_sequences
    self.targets_sequences = targets_sequences
    self.data = [samples_sequences, targets_sequences]
    if not self.audio_only:
        del self.phones
        del self.phonemes
        del self.words
        self.phones_sequences = phones_sequences
        self.phonemes_sequences = phonemes_sequences
        self.words_sequences = words_sequences
        self.data.extend([phones_sequences, phonemes_sequences,
                          words_sequences])
    self.num_examples = len(samples_sequences)

    # DataSpecs
    features_space = VectorSequenceSpace(dim=self.frame_length)
    features_source = 'features'

    targets_space = VectorSequenceSpace(dim=1)
    targets_source = 'targets'

    space_components = [features_space, targets_space]
    source_components = [features_source, targets_source]
    batch_components = [None, None]

    if not self.audio_only:
        phones_space = IndexSequenceSpace(
            max_labels=self.num_phones, dim=1,
            dtype=str(self.phones_sequences[0].dtype))
        phones_source = 'phones'

        phonemes_space = IndexSequenceSpace(
            max_labels=self.num_phonemes, dim=1,
            dtype=str(self.phonemes_sequences[0].dtype))
        phonemes_source = 'phonemes'

        words_space = IndexSequenceSpace(
            max_labels=self.num_words, dim=1,
            dtype=str(self.words_sequences[0].dtype))
        words_source = 'words'

        space_components.extend([phones_space, phonemes_space,
                                 words_space])
        source_components.extend([phones_source, phonemes_source,
                                  words_source])
        batch_components.extend([None, None, None])

    space = CompositeSpace(space_components)
    source = tuple(source_components)
    self.data_specs = (space, source)
    self.batch_buffers = batch_components

    # Defaults for iterators
    self._iter_mode = resolve_iterator_class('shuffled_sequential')
    self._iter_data_specs = (CompositeSpace((features_space,
                                             targets_space)),
                             (features_source, targets_source))
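# --- Worked sketch (illustrative, not part of the original source) ---
# Reproduces the sliding-window alignment built above on a toy signal,
# using a plain numpy stand-in for segment_axis: an overlap of
# frame_length - 1 yields every window of length frame_length, and
# dropping the last window aligns window t with the "next sample"
# target at t + frame_length.
def _demo_sliding_window_targets(frame_length=3):
    import numpy
    signal = numpy.arange(10.)
    windows = numpy.array([signal[t:t + frame_length]
                           for t in range(len(signal) - frame_length + 1)])
    windows = windows[:-1]  # same as the [:-1] in the loop above
    targets = signal[frame_length:].reshape((-1, 1))
    assert len(windows) == len(targets)
    # windows[0] == [0, 1, 2] predicts targets[0] == [3.]
    return windows, targets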
def __init__(self, chromosomes="ALL", dataset_name="snp", read_only=False,
             balance_classes=False, start=None, stop=None, shuffle=False,
             add_noise=False, rng=_default_seed, flip_labels=False):
    print "Loading %r chromosomes for %s" % (chromosomes, dataset_name)
    if start is not None and stop is not None:
        print "Start: %d, stop: %d" % (start, stop)

    assert isinstance(chromosomes, int) or chromosomes == "ALL",\
        "Can only set chromosomes to be an integer or ALL"

    p = serial.preprocess("${PYLEARN2_NI_PATH}/" + dataset_name)
    data_files = glob(path.join(p, "chr*.npy"))
    label_file = path.join(p, "labels.npy")

    get_int = lambda y: int(''.join(x for x in y if x.isdigit()))
    data_files.sort(key=get_int)

    if chromosomes == "ALL" or chromosomes > len(data_files):
        chromosomes = len(data_files)

    self.y = np.atleast_2d(np.load(label_file)).T[start:stop]
    if flip_labels:
        self.y = (self.y + 1) % 2

    self.Xs = ()
    space = ()
    source = ()

    balanced_idx = None
    if balance_classes:
        num_classes = np.amax(self.y) + 1
        class_counts = [len(np.where(self.y == i)[0].tolist())
                        for i in range(num_classes)]
        min_count = min(class_counts)
        balanced_idx = []
        for i in range(num_classes):
            idx = np.where(self.y == i)[0].tolist()[:min_count]
            balanced_idx += idx
        balanced_idx.sort()
        assert len(balanced_idx) / min_count == num_classes
        assert len(balanced_idx) % min_count == 0
        self.y = self.y[balanced_idx]
        for i in range(num_classes):
            assert len(np.where(self.y == i)[0].tolist()) == min_count

    if read_only:
        print "Format is read-only for %s" % dataset_name
        h5_path = path.join(p, "gen." + dataset_name + ".h5")
        if not path.isfile(h5_path):
            self.make_h5(data_files, h5_path, start=start, stop=stop)
        h5file = tables.open_file(h5_path)
        datas = [h5file.get_node("/", "Chr%d" % (c + 1))
                 for c in range(chromosomes)]
        self.Xs = tuple(data.X for data in datas)
        sizes = [h5file.get_node("/", "Sizes")[c]
                 for c in range(chromosomes)]
    else:
        print "Format is on-memory for %s" % dataset_name
        sizes = []
        for c in range(0, chromosomes):
            X = np.load(data_files[c])[start:stop, :]
            assert "%d" % (c + 1) in data_files[c]
            if balanced_idx is not None:
                X = X[balanced_idx]
            assert X.shape[0] == self.y.shape[0],\
                ("Data and labels have different numbers of samples "
                 "(%d vs %d)" % (X.shape[0], self.y.shape[0]))
            self.Xs = self.Xs + (X / 2.0,)
            sizes.append(X.shape[1])

    print "%s samples are %d" % (dataset_name, self.y.shape[0])

    space = tuple(VectorSpace(dim=size) for size in sizes)
    source = tuple("chromosomes_%d" % (c + 1)
                   for c in range(chromosomes))
    self.X_space = CompositeSpace(space)
    self.X_source = source

    space = space + (IndexSpace(dim=1, max_labels=2),)
    source = source + ("targets",)
    space = CompositeSpace(space)
    self.data_specs = (space, source)

    self.rng = make_np_rng(rng, which_method="random_integers")
    assert self.rng is not None

    # Defaults for iterators
    self._iter_mode = resolve_iterator_class("shuffled_sequential")
    self._iter_topo = False
    self._iter_targets = False
    self._iter_data_specs = self.data_specs

    if add_noise:
        if add_noise is True:
            add_noise = 0.05
        self.convert = list(randomize_snps.RandomizeSNPs(
            input_space=x_space,
            corruption_prob=add_noise)
            for x_space in self.X_space.components) + [None]
    else:
        self.convert = None
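# --- Usage sketch (illustrative, not part of the original source) ---
# `SNP` is a hypothetical name for the class this constructor belongs
# to, and the sketch assumes ${PYLEARN2_NI_PATH}/snp contains chr*.npy
# files plus a labels.npy file. Loading two chromosomes yields one
# 'chromosomes_%d' source per chromosome plus a 'targets' source.
def _demo_snp_specs():
    dataset = SNP(chromosomes=2, dataset_name="snp",
                  balance_classes=True)
    space, source = dataset.data_specs
    assert source == ('chromosomes_1', 'chromosomes_2', 'targets')
    assert len(space.components) == 3
    return dataset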
def iterator(self, mode=None, batch_size=None, num_batches=None,
             topo=None, targets=None, rng=None, data_specs=None,
             return_tuple=False):
    if topo is not None or targets is not None:
        if data_specs is not None:
            raise ValueError('In DenseDesignMatrix.iterator, both the '
                             '"data_specs" argument and deprecated '
                             'arguments "topo" or "targets" were '
                             'provided.',
                             (data_specs, topo, targets))
        warnings.warn("Usage of `topo` and `target` arguments are being "
                      "deprecated, and will be removed around November "
                      "7th, 2013. `data_specs` should be used instead.",
                      stacklevel=2)

        # build data_specs from topo and targets if needed
        if topo is None:
            topo = getattr(self, '_iter_topo', False)
        if topo:
            # self.iterator is called without a data_specs, and with
            # "topo=True", so we use the default topological space
            # stored in self.X_topo_space
            assert self.X_topo_space is not None
            X_space = self.X_topo_space
        else:
            X_space = self.X_space

        if targets is None:
            targets = getattr(self, '_iter_targets', False)
        if targets:
            assert self.y is not None
            y_space = self.data_specs[0].components[1]
            space = CompositeSpace((X_space, y_space))
            source = ('features', 'targets')
        else:
            space = X_space
            source = 'features'

        data_specs = (space, source)
        convert = None
    else:
        if data_specs is None:
            data_specs = self._iter_data_specs

        # If there is a view_converter, we have to use it to convert
        # the stored data for "features" into one that the iterator
        # can return.
        space, source = data_specs
        if isinstance(space, CompositeSpace):
            sub_spaces = space.components
            sub_sources = source
        else:
            sub_spaces = (space,)
            sub_sources = (source,)

        convert = []
        for sp, src in safe_zip(sub_spaces, sub_sources):
            if src == 'features' and \
               getattr(self, 'view_converter', None) is not None:
                conv_fn = (lambda batch, self=self, space=sp:
                           self.view_converter.get_formatted_batch(batch,
                                                                   space))
            else:
                conv_fn = None
            convert.append(conv_fn)

    # TODO: Refactor
    if mode is None:
        if hasattr(self, '_iter_subset_class'):
            mode = self._iter_subset_class
        else:
            raise ValueError('iteration mode not provided and no default '
                             'mode set for %s' % str(self))
    else:
        mode = resolve_iterator_class(mode)

    if batch_size is None:
        batch_size = getattr(self, '_iter_batch_size', None)
    if num_batches is None:
        num_batches = getattr(self, '_iter_num_batches', None)
    if rng is None and mode.stochastic:
        rng = self.rng
    return FiniteDatasetIterator(self,
                                 mode(self.X.shape[0],
                                      batch_size,
                                      num_batches,
                                      rng),
                                 data_specs=data_specs,
                                 return_tuple=return_tuple,
                                 convert=convert)
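# --- Usage sketch (illustrative, not part of the original source) ---
# When the dataset has a view_converter, requesting 'features' in a
# topological space makes the `convert` functions built above reformat
# each batch on the fly. `dataset` is assumed to be a DenseDesignMatrix
# with a view_converter set, and the 28x28x1 shape assumes MNIST-sized
# data.
def _demo_topological_batches(dataset):
    from pylearn2.space import Conv2DSpace
    topo_space = Conv2DSpace(shape=[28, 28], num_channels=1,
                             axes=('b', 0, 1, 'c'))
    it = dataset.iterator(mode='sequential', batch_size=100,
                          data_specs=(topo_space, 'features'))
    batch = it.next()  # ndarray with shape (100, 28, 28, 1)
    return batch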