Example #1
    def __init__(self, which_set, path=None):

        self.mapper = {'train': 0, 'valid': 1, 'test': 2}
        assert which_set in self.mapper.keys()

        self.__dict__.update(locals())
        del self.self

        if path is not None:
            raise NotImplementedError("Data path is the current directory.")

        # load data
        file_n = "click_data.h5"
        self.h5file = tables.open_file(file_n, mode='r')

        if which_set == 'test':
            test_group = self.h5file.root.test.test_raw
            self.X = test_group.X_t
            self.y = None

        else:
            train_group = self.h5file.root.train.train_raw
            if which_set == 'train':
                self.X = train_group.X_train
                self.y = train_group.y_train

            else:
                self.X = train_group.X_valid
                self.y = train_group.y_valid

        self.samples = slice(0, self.X.shape[0])
        self.sample_index = self.samples.start
        self.examples = self.X.shape[0]

        max_labels = 2

        X_source = 'features'
        X_space = VectorSpace(dim=23)
        if self.y is None:
            space = X_space
            source = X_source
        else:
            y_space = IndexSpace(dim=1, max_labels=max_labels)
            y_source = 'targets'
            space = CompositeSpace((X_space, y_space))
            source = (X_source, y_source)
        self.data_specs = (space, source)
        self.X_space = X_space

        self._iter_mode = resolve_iterator_class('sequential')
        self._iter_topo = False
        self._iter_targets = False
        self._iter_data_specs = (self.X_space, 'features')
        self._iter_subset_class = resolve_iterator_class('even_sequential')
Example #2
    def __init__(self, which_set, path=None):

        self.mapper = {"train": 0, "valid": 1, "test": 2}
        assert which_set in self.mapper.keys()

        self.__dict__.update(locals())
        del self.self

        if path is not None:
            raise NotImplementedError("Data path is the current directory.")

        # load data
        file_n = "click_data.h5"
        self.h5file = tables.open_file(file_n, mode="r")

        if which_set == "test":
            test_group = self.h5file.root.test.test_raw
            self.X = test_group.X_t
            self.y = None

        else:
            train_group = self.h5file.root.train.train_raw
            if which_set == "train":
                self.X = train_group.X_train
                self.y = train_group.y_train

            else:
                self.X = train_group.X_valid
                self.y = train_group.y_valid

        self.samples = slice(0, self.X.shape[0])
        self.sample_index = self.samples.start
        self.examples = self.X.shape[0]

        max_labels = 2

        X_source = "features"
        X_space = VectorSpace(dim=23)
        if self.y is None:
            space = X_space
            source = X_source
        else:
            y_space = IndexSpace(dim=1, max_labels=max_labels)
            y_source = "targets"
            space = CompositeSpace((X_space, y_space))
            source = (X_source, y_source)
        self.data_specs = (space, source)
        self.X_space = X_space

        self._iter_mode = resolve_iterator_class("sequential")
        self._iter_topo = False
        self._iter_targets = False
        self._iter_data_specs = (self.X_space, "features")
        self._iter_subset_class = resolve_iterator_class("even_sequential")
Example #3
    def iterator(self, mode=None, batch_size=None, num_batches=None,
                 rng=None, data_specs=None, return_tuple=False):
        if mode is None:
            if hasattr(self, '_iter_subset_class'):
                mode = self._iter_subset_class
            else:
                raise ValueError('iteration mode not provided and no default '
                                 'mode set for %s' % str(self))
        else:
            mode = resolve_iterator_class(mode)

        if batch_size is None:
            batch_size = getattr(self, '_iter_batch_size', None)
        if num_batches is None:
            num_batches = getattr(self, '_iter_num_batches', None)
        if rng is None and mode.stochastic:
            rng = self.rng
        if data_specs is None:
            data_specs = getattr(self, '_iter_data_specs', None)

        return FiniteDatasetIterator(
                self,
                mode(self.n_samples,
                     batch_size,
                     num_batches,
                     rng),
                data_specs=data_specs,
                return_tuple=return_tuple)
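
All of these iterator() methods implement the same pylearn2 Dataset interface. As a rough usage sketch (hedged: the dataset instance ds and its 23-dimensional 'features' source are illustrative assumptions, not taken from the example above):

# Hedged usage sketch: driving a pylearn2-style dataset iterator.
# `ds` is assumed to be a Dataset exposing a VectorSpace 'features' source.
from pylearn2.space import VectorSpace

it = ds.iterator(mode='sequential',
                 batch_size=128,
                 data_specs=(VectorSpace(dim=23), 'features'),
                 return_tuple=True)
for (X_batch,) in it:   # return_tuple=True forces a tuple even for one source
    pass                # consume the minibatch here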
Example #4
 def _create_subset_iterator(self, mode, batch_size=None, num_batches=None,
                             rng=None):
     subset_iterator = resolve_iterator_class(mode)
     if rng is None and subset_iterator.stochastic:
         rng = make_np_rng()
     return subset_iterator(self.get_num_examples(), batch_size,
                            num_batches, rng)
Example #5
 def iterator(self,
              mode=None,
              batch_size=None,
              num_batches=None,
              topo=None,
              targets=None,
              rng=None):
     # TODO: Refactor, deduplicate with set_iteration_scheme
     if mode is None:
         if hasattr(self, '_iter_subset_class'):
             mode = self._iter_subset_class
         else:
             raise ValueError('iteration mode not provided and no default '
                              'mode set for %s' % str(self))
     else:
         mode = resolve_iterator_class(mode)
     if batch_size is None:
         batch_size = getattr(self, '_iter_batch_size', None)
     if num_batches is None:
         num_batches = getattr(self, '_iter_num_batches', None)
     if topo is None:
         topo = getattr(self, '_iter_topo', False)
     if targets is None:
         targets = getattr(self, '_iter_targets', False)
     if rng is None and mode.stochastic:
         rng = self.rng
     return FiniteDatasetIterator(
         self, mode(self.X.shape[0], batch_size, num_batches, rng), topo,
         targets)
Example #6
    def iterator(self,
                 mode=None,
                 batch_size=None,
                 num_batches=None,
                 topo=None,
                 targets=None,
                 rng=None,
                 data_specs=None,
                 return_tuple=False):

        if topo is not None or targets is not None:
            raise ValueError("You should use the new interface iterator")

        if mode is None:
            if hasattr(self, '_iter_subset_class'):
                mode = self._iter_subset_class
            else:
                raise ValueError('iteration mode not provided and no default '
                                 'mode set for %s' % str(self))
        else:
            mode = resolve_iterator_class(mode)

        if batch_size is None:
            batch_size = getattr(self, '_iter_batch_size', None)
        if num_batches is None:
            num_batches = getattr(self, '_iter_num_batches', None)
        if rng is None and mode.stochastic:
            rng = self.rng
        if data_specs is None:
            data_specs = self.data_specs
        return FiniteDatasetIterator(self,
                                     mode(self.get_num_examples(), batch_size,
                                          num_batches, rng),
                                     data_specs=data_specs,
                                     return_tuple=return_tuple)
Example #7
    def __init__(self,
                 data=None,
                 data_specs=None,
                 rng=_default_seed,
                 preprocessor=None,
                 fit_preprocessor=False):
        # data_specs should be flat, and there should be no
        # duplicates in source, as we keep only one version
        assert is_flat_specs(data_specs)
        if isinstance(data_specs[1], tuple):
            assert sorted(set(data_specs[1])) == sorted(data_specs[1])
        space, source = data_specs
        space.np_validate(data)
        assert len(set(elem.shape[0] for elem in list(data))) <= 1
        self.data = data
        self.data_specs = data_specs
        self.num_examples = list(data)[0].shape[0]

        self.compress = False
        self.design_loc = None
        self.rng = make_np_rng(rng, which_method='random_integers')
        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('sequential')

        if preprocessor:
            preprocessor.apply(self, can_fit=fit_preprocessor)
        self.preprocessor = preprocessor
Example #8
    def __init__(self, data=None, data_specs=None, rng=_default_seed,
                 preprocessor=None, fit_preprocessor=False):
        # data_specs should be flat, and there should be no
        # duplicates in source, as we keep only one version
        assert is_flat_specs(data_specs)
        if isinstance(data_specs[1], tuple):
            assert sorted(set(data_specs[1])) == sorted(data_specs[1])
        space, source = data_specs
        space.np_validate(data)
        # TODO: assume that data[0] is num example => error if channel in c01b
        # assert len(set(elem.shape[0] for elem in list(data))) <= 1
        self.data = data
        self.data_specs = data_specs
        # TODO: assume that data[0] is num example => error if channel in c01b
        self.num_examples = list(data)[-1].shape[0] # TODO: list(data)[0].shape[0]

        self.compress = False
        self.design_loc = None
        self.rng = make_np_rng(rng, which_method='random_integers')
        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('sequential')

        if preprocessor:
            preprocessor.apply(self, can_fit=fit_preprocessor)
        self.preprocessor = preprocessor
Example #9
    def iterator(self,
                 mode=None,
                 batch_size=None,
                 num_batches=None,
                 rng=None,
                 data_specs=None,
                 return_tuple=False):
        if mode is None:
            if hasattr(self, '_iter_subset_class'):
                mode = self._iter_subset_class
            else:
                raise ValueError('iteration mode not provided and no default '
                                 'mode set for %s' % str(self))
        else:
            mode = resolve_iterator_class(mode)

        if batch_size is None:
            batch_size = getattr(self, '_iter_batch_size', None)
        if num_batches is None:
            num_batches = getattr(self, '_iter_num_batches', None)
        if rng is None and mode.stochastic:
            rng = self.rng
        if data_specs is None:
            data_specs = getattr(self, '_iter_data_specs', None)

        return FiniteDatasetIterator(self,
                                     mode(self.n_samples, batch_size,
                                          num_batches, rng),
                                     data_specs=data_specs,
                                     return_tuple=return_tuple)
Example #10
    def iterator(self, mode=None, batch_size=None, num_batches=None,
                 topo=None, targets=None, rng=None, data_specs=None,
                 return_tuple=False):

        if topo is not None or targets is not None:
            raise ValueError("You should use the new interface iterator")

        if mode is None:
            if hasattr(self, '_iter_subset_class'):
                mode = self._iter_subset_class
            else:
                raise ValueError('iteration mode not provided and no default '
                                 'mode set for %s' % str(self))
        else:
            mode = resolve_iterator_class(mode)

        if batch_size is None:
            batch_size = getattr(self, '_iter_batch_size', None)
        if num_batches is None:
            num_batches = getattr(self, '_iter_num_batches', None)
        if rng is None and mode.stochastic:
            rng = self.rng
        if data_specs is None:
            data_specs = self.data_specs
        return FiniteDatasetIterator(
            self,
            mode(self.get_num_examples(),
                 batch_size, num_batches, rng),
            data_specs=data_specs, return_tuple=return_tuple
        )
Example #11
    def __init__(self, which_set='debug', start=None, end=None, shuffle=True,
                 lazy_load=False, rng=_default_seed):

        assert which_set in ['debug', 'train', 'test']
        if which_set == 'debug':
            maxlen, n_samples, n_annotations, n_features = 10, 12, 13, 14
            X = N.zeros(shape=(n_samples, maxlen))
            X_mask = X  # same as X
            Z = N.zeros(shape=(n_annotations, n_samples, n_features))
        elif which_set == 'train':
            pass
        else:
            pass

        self.X, self.X_mask, self.Z = (X, X_mask, Z)
        self.sources = ('features', 'target')

        self.spaces = CompositeSpace([
            SequenceSpace(space=VectorSpace(dim=self.X.shape[1])),
            SequenceDataSpace(space=VectorSpace(dim=self.Z.shape[-1]))
        ])
        self.data_specs = (self.spaces, self.sources)
        # self.X_space, self.X_mask_space, self.Z_space
        # Default iterator
        self._iter_mode = resolve_iterator_class('sequential')
        self._iter_topo = False
        self._iter_targets = False
        self._iter_data_specs = self.data_specs
        self.rng = make_np_rng(rng, which_method='random_integers')
Example #12
    def iterator(self, mode=None, batch_size=None, num_batches=None,
                 rng=None, data_specs=None, return_tuple=False):
        allowed_modes = ('sequential', 'random_slice', 'even_sequential',
                         'batchwise_shuffled_sequential',
                         'even_batchwise_shuffled_sequential')
        if mode is not None and mode not in allowed_modes:
            raise ValueError("Due to HDF5 limitations on advanced indexing, " +
                             "the '" + mode + "' iteration mode is not " +
                             "supported")

        if data_specs is None:
            data_specs = self._iter_data_specs

        space, source = data_specs
        sub_spaces, sub_sources = (
            (space.components, source) if isinstance(space, CompositeSpace)
            else ((space,), (source,)))
        convert = [None for sp, src in safe_izip(sub_spaces, sub_sources)]

        mode = (self._iter_subset_class if mode is None
                else resolve_iterator_class(mode))

        if batch_size is None:
            batch_size = getattr(self, '_iter_batch_size', None)
        if num_batches is None:
            num_batches = getattr(self, '_iter_num_batches', None)
        if rng is None and mode.stochastic:
            rng = self.rng
        return VariableImageDatasetIterator(
            dataset=self,
            subset_iterator=mode(
                self.num_examples, batch_size, num_batches, rng),
            data_specs=data_specs,
            return_tuple=return_tuple,
            convert=convert)
Example #13
    def iterator(self, mode=None, batch_size=None, num_batches=None,
                 rng=None, data_specs=None,
                 return_tuple=False):
        """
        Copied from dense_design_matrix, in order to fix the uneven-batch
        problem.
        """

        if data_specs is None:
            data_specs = self._iter_data_specs

        # If there is a view_converter, we have to use it to convert
        # the stored data for "features" into one that the iterator
        # can return.
        space, source = data_specs
        if isinstance(space, CompositeSpace):
            sub_spaces = space.components
            sub_sources = source
        else:
            sub_spaces = (space,)
            sub_sources = (source,)

        convert = []
        for sp, src in safe_zip(sub_spaces, sub_sources):
            if src == 'features' and \
               getattr(self, 'view_converter', None) is not None:
                conv_fn = (lambda batch, self=self, space=sp:
                           self.view_converter.get_formatted_batch(batch,
                                                                   space))
            else:
                conv_fn = None

            convert.append(conv_fn)

        # TODO: Refactor
        if mode is None:
            if hasattr(self, '_iter_subset_class'):
                mode = self._iter_subset_class
            else:
                raise ValueError('iteration mode not provided and no default '
                                 'mode set for %s' % str(self))
        else:
            mode = resolve_iterator_class(mode)

        if batch_size is None:
            batch_size = getattr(self, '_iter_batch_size', None)
        if num_batches is None:
            num_batches = getattr(self, '_iter_num_batches', None)
        if rng is None and mode.stochastic:
            rng = self.rng
        # hack to make the online augmentations run
        FiniteDatasetIterator.uneven = False
        iterator = FiniteDatasetIterator(self,
                                 mode(self.X.shape[0],
                                      batch_size,
                                      num_batches,
                                      rng),
                                 data_specs=data_specs,
                                 return_tuple=return_tuple,
                                 convert=convert)
        return iterator
Example #14
 def _create_subset_iterator(self, mode, batch_size=None, num_batches=None,
                             rng=None):
     subset_iterator = resolve_iterator_class(mode)
     if rng is None and subset_iterator.stochastic:
         rng = make_np_rng()
     return subset_iterator(self.get_num_examples(), batch_size,
                            num_batches, rng)
Example #15
    def __init__(self, which_set, context_len, data_mode, shuffle=True):

        self.__dict__.update(locals())
        del self.self

        # Load data into self._data (defined in PennTreebank)
        self._load_data(which_set, context_len, data_mode)

        print self._raw_data[0:30]
        print self._data[:, :-1][:10]
        print "_____________"
        print self._data[:, -1:][:10]
        super(PennTreebank_NGrams, self).__init__(
            X=self._data[:, :-1],
            y=self._data[:, -1:],
            X_labels=10000, y_labels=10000
        )

        if shuffle:
            warnings.warn("Note that the PennTreebank samples are only "
                          "shuffled when the iterator method is used to "
                          "retrieve them.")
            self._iter_subset_class = resolve_iterator_class(
                'shuffled_sequential'
            )
Example #16
    def __init__(self, which_set, context_len, data_mode, shuffle=True):
        self.__dict__.update(locals())
        del self.self

        # Load data into self._data (defined in PennTreebank)
        self._load_data(which_set, context_len, data_mode)

        self._data = as_strided(self._raw_data,
                                shape=(len(self._raw_data) - context_len,
                                       context_len + 1),
                                strides=(self._raw_data.itemsize,
                                         self._raw_data.itemsize))

        super(PennTreebankNGrams, self).__init__(
            X=self._data[:, :-1],
            y=self._data[:, -1:],
            X_labels=self._max_labels, y_labels=self._max_labels
        )

        if shuffle:
            warnings.warn("Note that the PennTreebank samples are only "
                          "shuffled when the iterator method is used to "
                          "retrieve them.")
            self._iter_subset_class = resolve_iterator_class(
                'shuffled_sequential'
            )
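
The as_strided call above is the whole n-gram trick: a 1-D token array is viewed as overlapping windows of length context_len + 1 without copying. A minimal standalone numpy sketch of the same idea (illustrative values, not from the dataset):

# Standalone sketch of the as_strided windowing used above.
# Assumes a 1-D, contiguous integer array of token ids.
import numpy as np
from numpy.lib.stride_tricks import as_strided

raw = np.arange(8)            # stand-in for token ids
context_len = 3
windows = as_strided(raw,
                     shape=(len(raw) - context_len, context_len + 1),
                     strides=(raw.itemsize, raw.itemsize))
# windows[0] is [0, 1, 2, 3]; windows[:, :-1] are the contexts (X),
# windows[:, -1:] are the next-token targets (y).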
Example #17
 def iterator(self, mode=None, batch_size=None, num_batches=None,
              topo=None, targets=None, rng=None):
     # TODO: Refactor, deduplicate with set_iteration_scheme
     if mode is None:
         if hasattr(self, '_iter_subset_class'):
             mode = self._iter_subset_class
         else:
             raise ValueError('iteration mode not provided and no default '
                              'mode set for %s' % str(self))
     else:
         mode = resolve_iterator_class(mode)
     if batch_size is None:
         batch_size = getattr(self, '_iter_batch_size', None)
     if num_batches is None:
         num_batches = getattr(self, '_iter_num_batches', None)
     if topo is None:
         topo = getattr(self, '_iter_topo', False)
     if targets is None:
         targets = getattr(self, '_iter_targets', False)
     if rng is None and mode.stochastic:
         rng = self.rng
     return FiniteDatasetIterator(self,
                                  mode(self.X.shape[0], batch_size,
                                  num_batches, rng),
                                  topo, targets)
Example #18
    def iterator(self, mode=None, batch_size=None, num_batches=None,
                 rng=None, data_specs=None, return_tuple=False):
        """
        Method inherited from `pylearn2.datasets.dataset.Dataset`.
        """
        self.mode = mode
        self.batch_size = batch_size
        self._return_tuple = return_tuple

        # TODO: If there is a view_converter, we have to use it to convert
        # the stored data for "features" into one that the iterator can return.
        space, source = data_specs or (self.X_space, 'features')
        assert isinstance(space, CompositeSpace),\
            "Unexpected input space for the data."
        sub_spaces = space.components
        sub_sources = source

        conv_fn = lambda x: x.todense().astype(theano.config.floatX)
        convert = []
        for sp, src in safe_zip(sub_spaces, sub_sources):
            convert.append(conv_fn if src in ('features', 'targets') else None)

        assert mode is not None,\
                "Iteration mode not provided for %s" % str(self)
        mode = resolve_iterator_class(mode)
        subset_iterator = mode(self.X.shape[0], batch_size, num_batches, rng)

        return FiniteDatasetIterator(self,
                                     subset_iterator,
                                     data_specs=data_specs,
                                     return_tuple=return_tuple,
                                     convert=convert)
Example #19
    def __init__(self, min_x=-6.28, max_x=6.28, std=.05, rng=_default_seed):
        """
        Constructor.
        """
        super(CosDataset, self).__init__()
        
        #: lower limit for x as in cos(x)
        self.min_x = min_x
        
        #: higher limit for x as in cos(x)
        self.max_x = max_x
        
        #: standard deviation for the noise added to the values we generate
        self.std = std

        # argument to resolve_iterator_class() can be either
        # a string from [sequential, shuffled_sequential, random_slice,
        # random_uniform, batchwise_shuffled_sequential, even_sequential,
        # even_shuffled_sequential, even_batchwise_shuffled_sequential,
        # even_sequences] or a SubsetIterator subclass.

        #: default iterator implementation (a class to be instantiated)
        self._iter_subset_class = resolve_iterator_class('sequential')
        
        #: default data specifications for iterator
        self._iter_data_specs = (VectorSpace(2), 'features')
        
        #: default batch size for the iterator
        self._iter_batch_size = 100
        
        #: default number of batches for the iterator
        self._iter_num_batches = 10
        
        #: random number generator
        self.rng = make_np_rng(rng, which_method=['uniform', 'randn'])
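
A minimal sketch of what the comment above describes, assuming only that pylearn2 is importable: resolve_iterator_class maps a mode name to a SubsetIterator subclass, which is then instantiated to yield index batches into the data.

# Hedged sketch: resolving and instantiating an iteration mode by name.
from pylearn2.utils.iteration import resolve_iterator_class

mode = resolve_iterator_class('sequential')  # a SubsetIterator subclass
subset_it = mode(100, 10, None, None)  # dataset_size, batch_size, num_batches, rng
for idx in subset_it:
    print(idx)  # slices (or index arrays) selecting each minibatch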
Example #20
    def iterator(self, mode=None, batch_size=None, num_batches=None,
                 rng=None, data_specs=None,
                 return_tuple=False):

        if data_specs is None:
            data_specs = self._iter_data_specs

        # If there is a view_converter, we have to use it to convert
        # the stored data for "features" into one that the iterator
        # can return.
        space, source = data_specs
        if isinstance(space, CompositeSpace):
            sub_spaces = space.components
            sub_sources = source
        else:
            sub_spaces = (space,)
            sub_sources = (source,)

        convert = []
        for sp, src in safe_zip(sub_spaces, sub_sources):
            if (src == "features"
                and getattr(self, "view_converter", None) is not None):
                if self.distorter is None:
                    conv_fn = (lambda batch, self=self, space=sp:
                        self.view_converter.get_formatted_batch(batch, space))
                else:
                    conv_fn = (lambda batch, self=self, space=sp:
                                   self.distorter._distort(
                            self.view_converter.get_formatted_batch(batch,
                                                                    space)))
            else:
                conv_fn = None

            convert.append(conv_fn)

        # TODO: Refactor
        if mode is None:
            if hasattr(self, "_iter_subset_class"):
                mode = self._iter_subset_class
            else:
                raise ValueError("iteration mode not provided and no default "
                                 "mode set for %s" % str(self))
        else:
            mode = resolve_iterator_class(mode)

        if batch_size is None:
            batch_size = getattr(self, "_iter_batch_size", None)
        if num_batches is None:
            num_batches = getattr(self, "_iter_num_batches", None)
        if rng is None and mode.stochastic:
            rng = self.rng
        return FiniteDatasetIterator(self,
                                     mode(self.X.shape[0],
                                          batch_size,
                                          num_batches,
                                          rng),
                                     data_specs=data_specs,
                                     return_tuple=return_tuple,
                                     convert=convert)
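
Several of these iterator() implementations build the same per-source convert list: one optional callable per (space, source) pair, applied to each raw batch before it is returned. A compressed, illustrative sketch of that pattern (function name and arguments are assumptions, not pylearn2 API):

# Illustrative sketch of the convert-list pattern: None passes a batch
# through unchanged; a callable reformats that source's batches.
def build_converters(sub_spaces, sub_sources, view_converter=None):
    convert = []
    for sp, src in zip(sub_spaces, sub_sources):
        if src == 'features' and view_converter is not None:
            # late-bound default arguments pin the current space/converter
            convert.append(lambda batch, vc=view_converter, space=sp:
                           vc.get_formatted_batch(batch, space))
        else:
            convert.append(None)
    return convert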
Example #21
    def __init__(self,
                 X=None,
                 topo_view=None,
                 y=None,
                 view_converter=None,
                 axes=('b', 0, 1, 'c'),
                 rng=_default_seed,
                 preprocessor=None,
                 fit_preprocessor=False):
        """
        Parameters
        ----------

        X : ndarray, 2-dimensional, optional
            Should be supplied if `topo_view` is not. A design
            matrix of shape (number examples, number features)
            that defines the dataset.
        topo_view : ndarray, optional
            Should be supplied if X is not.  An array whose first
            dimension is of length number examples. The remaining
            dimensions are examples with topological significance,
            e.g. for images the remaining axes are rows, columns,
            and channels.
        y : ndarray, 1-dimensional(?), optional
            Labels or targets for each example. The semantics here
            are not quite nailed down for this yet.
        view_converter : object, optional
            An object for converting between design matrices and
            topological views. Currently DefaultViewConverter is
            the only type available but later we may want to add
            one that uses the retina encoding that the U of T group
            uses.
        rng : object, optional
            A random number generator used for picking random
            indices into the design matrix when choosing minibatches.
        """
        self.X = X
        if view_converter is not None:
            assert topo_view is None
            self.view_converter = view_converter
        else:
            if topo_view is not None:
                self.set_topological_view(topo_view, axes)
        self.y = y
        self.compress = False
        self.design_loc = None
        if hasattr(rng, 'random_integers'):
            self.rng = rng
        else:
            self.rng = np.random.RandomState(rng)
        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('sequential')
        self._iter_topo = False
        self._iter_targets = False

        if preprocessor:
            preprocessor.apply(self, can_fit=fit_preprocessor)
        self.preprocessor = preprocessor
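
This constructor matches pylearn2's DenseDesignMatrix. A small, hedged construction example with random data (purely illustrative shapes):

# Hedged sketch: building a design-matrix dataset like the one above.
import numpy as np
from pylearn2.datasets.dense_design_matrix import DenseDesignMatrix

X = np.random.randn(50, 10).astype('float32')  # 50 examples, 10 features
y = np.random.randint(0, 2, size=(50, 1))      # one target column per example
ds = DenseDesignMatrix(X=X, y=y)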
Example #22
    def __init__(self, X=None, topo_view=None, y=None, tags=None,
                 view_converter=None, axes = ('b', 0, 1, 'c'),
                 rng=_default_seed, preprocessor = None, fit_preprocessor=False):
        """
        Parameters
        ----------

        X : ndarray, 2-dimensional, optional
            Should be supplied if `topo_view` is not. A design
            matrix of shape (number examples, number features)
            that defines the dataset.
        topo_view : ndarray, optional
            Should be supplied if X is not.  An array whose first
            dimension is of length number examples. The remaining
            dimensions are examples with topological significance,
            e.g. for images the remaining axes are rows, columns,
            and channels.
        y : ndarray, 1-dimensional(?), optional
            Labels or targets for each example. The semantics here
            are not quite nailed down for this yet.
        tags: ndarray, optional
            First dimension is the number of examples, other dimensions 
            contain extra information about the examples.  Used to keep 
            track of position information for randomly cropped patches.
        view_converter : object, optional
            An object for converting between design matrices and
            topological views. Currently DefaultViewConverter is
            the only type available but later we may want to add
            one that uses the retina encoding that the U of T group
            uses.
        rng : object, optional
            A random number generator used for picking random
            indices into the design matrix when choosing minibatches.
        """
        self.X = X
        if view_converter is not None:
            assert topo_view is None
            self.view_converter = view_converter
        else:
            if topo_view is not None:
                self.set_topological_view(topo_view, axes)
        self.y = y
        self.tags = tags
        self.compress = False
        self.design_loc = None
        if hasattr(rng, 'random_integers'):
            self.rng = rng
        else:
            self.rng = np.random.RandomState(rng)
        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('sequential')
        self._iter_topo = False
        self._iter_targets = False

        if preprocessor:
            preprocessor.apply(self, can_fit=fit_preprocessor)
        self.preprocessor = preprocessor
Example #23
    def __init__(self, which_set, which_day, path=None):

        self.daylist = range(21, 32)
        self.mapper = {'train': 0, 'valid': 1, 'test': 2}
        assert which_set in self.mapper.keys()
        assert which_day in self.daylist

        f = open('/home/whale/Documents/click/dayrows.pkl', 'r')
        self.dayrows = cPickle.load(f)
        f.close()

        self.__dict__.update(locals())
        del self.self

        if path is not None:
            raise NotImplementedError("Data path is the current directory.")

        # load data
        file_n = "click_data.h5"
        self.h5file = tables.open_file(file_n, mode='r')

        if which_set == 'test':
            test_group = self.h5file.root.test.test_raw
            self.X = test_group.X_t
            self.y = None
            self.samples = slice(0, self.X.shape[0])
            self.sample_index = self.samples.start
            self.examples = self.X.shape[0]
        else:
            train_group = self.h5file.root.train.train_raw
            self.X = train_group.X
            self.y = train_group.y
            self.samples = slice(sum(self.dayrows[:which_day - 21]),
                                 sum(self.dayrows[:which_day - 20]))
            self.sample_index = self.samples.start
            self.examples = self.dayrows[which_day - 21]

        max_labels = 2

        X_source = 'features'
        X_space = VectorSpace(dim=23)
        if self.y is None:
            space = X_space
            source = X_source
        else:
            y_space = IndexSpace(dim=1, max_labels=max_labels)
            y_source = 'targets'
            space = CompositeSpace((X_space, y_space))
            source = (X_source, y_source)
        self.data_specs = (space, source)
        self.X_space = X_space

        self._iter_mode = resolve_iterator_class('sequential')
        self._iter_topo = False
        self._iter_targets = False
        self._iter_data_specs = (self.X_space, 'features')
Example #24
    def iterator(self,
                 mode=None,
                 batch_size=None,
                 num_batches=None,
                 topo=None,
                 targets=None,
                 rng=None,
                 data_specs=None,
                 return_tuple=False):
        """
        method inherited from Dataset
        """
        self.mode = mode
        self.batch_size = batch_size
        self._targets = targets
        self._return_tuple = return_tuple
        if data_specs is None:
            data_specs = self._iter_data_specs

        # If there is a view_converter, we have to use it to convert
        # the stored data for "features" into one that the iterator
        # can return.
        self.conv_fn = lambda x: x.todense()
        space, source = data_specs
        if isinstance(space, CompositeSpace):
            sub_spaces = space.components
            sub_sources = source
        else:
            sub_spaces = (space, )
            sub_sources = (source, )

        convert = []
        for sp, src in safe_zip(sub_spaces, sub_sources):
            if src in ('features', 'targets'):
                conv_fn = self.conv_fn
            else:
                conv_fn = None

            convert.append(conv_fn)

        if mode is None:
            if hasattr(self, '_iter_subset_class'):
                mode = self._iter_subset_class
            else:
                raise ValueError('iteration mode not provided and no default '
                                 'mode set for %s' % str(self))
        else:
            mode = resolve_iterator_class(mode)

        return FiniteDatasetIterator(self,
                                     mode(self.X.shape[0], batch_size,
                                          num_batches, rng),
                                     data_specs=data_specs,
                                     return_tuple=return_tuple,
                                     convert=convert)
Example #25
    def __init__(self, which_set, which_day, path=None):

        self.daylist = range(21, 32)
        self.mapper = {"train": 0, "valid": 1, "test": 2}
        assert which_set in self.mapper.keys()
        assert which_day in self.daylist

        f = open("/home/whale/Documents/click/dayrows.pkl", "r")
        self.dayrows = cPickle.load(f)
        f.close()

        self.__dict__.update(locals())
        del self.self

        if path is not None:
            raise NotImplementedError("Data path is the current directory.")

        # load data
        file_n = "click_data.h5"
        self.h5file = tables.open_file(file_n, mode="r")

        if which_set == "test":
            test_group = self.h5file.root.test.test_raw
            self.X = test_group.X_t
            self.y = None
            self.samples = slice(0, self.X.shape[0])
            self.sample_index = self.samples.start
            self.examples = self.X.shape[0]
        else:
            train_group = self.h5file.root.train.train_raw
            self.X = train_group.X
            self.y = train_group.y
            self.samples = slice(sum(self.dayrows[: which_day - 21]), sum(self.dayrows[: which_day - 20]))
            self.sample_index = self.samples.start
            self.examples = self.dayrows[which_day - 21]

        max_labels = 2

        X_source = "features"
        X_space = VectorSpace(dim=23)
        if self.y is None:
            space = X_space
            source = X_source
        else:
            y_space = IndexSpace(dim=1, max_labels=max_labels)
            y_source = "targets"
            space = CompositeSpace((X_space, y_space))
            source = (X_source, y_source)
        self.data_specs = (space, source)
        self.X_space = X_space

        self._iter_mode = resolve_iterator_class("sequential")
        self._iter_topo = False
        self._iter_targets = False
        self._iter_data_specs = (self.X_space, "features")
Example #26
    def iterator(self, mode=None, batch_size=None, num_batches=None,
      topo=None, targets=None, rng=None, data_specs=None,
      return_tuple=False):
        """
        method inherited from Dataset
        """
        self.mode = mode
        self.batch_size = batch_size
        self._targets = targets
        self._return_tuple = return_tuple
        if data_specs is None:
            data_specs = self._iter_data_specs

        # If there is a view_converter, we have to use it to convert
        # the stored data for "features" into one that the iterator
        # can return.
        space, source = data_specs
        if isinstance(space, CompositeSpace):
            sub_spaces = space.components
            sub_sources = source
        else:
            sub_spaces = (space,)
            sub_sources = (source,)

        convert = []
        for sp, src in safe_zip(sub_spaces, sub_sources):
            if src == 'features':
                conv_fn = lambda x: x.todense()
            elif src == 'targets':
                conv_fn = lambda x: x
            else:
                conv_fn = None

            convert.append(conv_fn)

        if mode is None:
            if hasattr(self, '_iter_subset_class'):
                mode = self._iter_subset_class
            else:
                raise ValueError('iteration mode not provided and no default '
                                 'mode set for %s' % str(self))
        else:
            mode = resolve_iterator_class(mode)


        return FiniteDatasetIterator(self,
                                     mode(self.X.shape[0],
                                          batch_size,
                                          num_batches,
                                          rng),
                                     data_specs=data_specs,
                                     return_tuple=return_tuple,
                                     convert=convert)
Example #27
    def __init__(self,
                 data=None,
                 data_specs=None,
                 rng=_default_seed,
                 preprocessor=None,
                 fit_preprocessor=False):
        """
        Parameters
        ----------
        data: ndarray, or tuple of ndarrays, containing the data.
            It is formatted as specified in `data_specs`.
            For instance, if `data_specs` is (VectorSpace(nfeat), 'features'),
            then `data` has to be a 2-d ndarray, of shape (nb examples,
            nfeat), that defines an unlabeled dataset. If `data_specs`
            is (CompositeSpace(Conv2DSpace(...), VectorSpace(1)),
            ('features', 'target')), then `data` has to be an (X, y) pair,
            with X being an ndarray containing images stored in the topological
            view specified by the `Conv2DSpace`, and y being a 2-D ndarray
            of width 1, containing the labels or targets for each example.

        data_specs: A (space, source) pair, where space is an instance of
            `Space` (possibly a `CompositeSpace`), and `source` is a
            string (or tuple of strings, if `space` is a `CompositeSpace`),
            defining the format and labels associated to `data`.

        rng : object, optional
            A random number generator used for picking random
            indices into the design matrix when choosing minibatches.

        preprocessor: WRITEME

        fit_preprocessor: WRITEME
        """
        # data_specs should be flat, and there should be no
        # duplicates in source, as we keep only one version
        assert is_flat_specs(data_specs)
        if isinstance(data_specs[1], tuple):
            assert sorted(set(data_specs[1])) == sorted(data_specs[1])
        self.data = data
        self.data_specs = data_specs

        self.compress = False
        self.design_loc = None
        if hasattr(rng, 'random_integers'):
            self.rng = rng
        else:
            self.rng = np.random.RandomState(rng)
        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('sequential')

        if preprocessor:
            preprocessor.apply(self, can_fit=fit_preprocessor)
        self.preprocessor = preprocessor
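
This constructor corresponds to pylearn2's VectorSpacesDataset, which stores data directly in the layout described by data_specs. A hedged example with a composite (features, targets) spec (module path and shapes are assumptions):

# Hedged sketch: a dataset whose layout is declared by data_specs.
import numpy as np
from pylearn2.space import CompositeSpace, VectorSpace
from pylearn2.datasets.vector_spaces_dataset import VectorSpacesDataset

X = np.random.randn(100, 23).astype('float32')
y = np.random.randn(100, 1).astype('float32')
specs = (CompositeSpace([VectorSpace(dim=23), VectorSpace(dim=1)]),
         ('features', 'targets'))
ds = VectorSpacesDataset(data=(X, y), data_specs=specs)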
Example #28
    def iterator(self, mode=None, batch_size=None, num_batches=None,
                 rng=None, data_specs=None, return_tuple=False):
        """
        .. todo::

            WRITEME
        """
        if data_specs is None:
            data_specs = self._iter_data_specs

        # If there is a view_converter, we have to use it to convert
        # the stored data for "features" into one that the iterator
        # can return.
        space, source = data_specs
        if isinstance(space, CompositeSpace):
            sub_spaces = space.components
            sub_sources = source
        else:
            sub_spaces = (space,)
            sub_sources = (source,)

        convert = []
        for sp, src in safe_zip(sub_spaces, sub_sources):
            convert.append(None)

        # TODO: Refactor
        if mode is None:
            if hasattr(self, '_iter_subset_class'):
                mode = self._iter_subset_class
            else:
                raise ValueError('iteration mode not provided and no default '
                                 'mode set for %s' % str(self))
        else:
            mode = resolve_iterator_class(mode)

        if batch_size is None:
            batch_size = getattr(self, '_iter_batch_size', None)
        if num_batches is None:
            num_batches = getattr(self, '_iter_num_batches', None)
        if rng is None and mode.stochastic:
            rng = self.rng
        
        if self.noise != False:
            lengths = [len(seq) for seq in self.samples_sequences]
            self.noise_this_epoch = [
                numpy.random.normal(0, self.noise, (length, 1))
                for length in lengths]
        
        return FiniteDatasetIterator(self,
                                     mode(self.num_examples, batch_size,
                                          num_batches, rng),
                                     data_specs=data_specs,
                                     return_tuple=return_tuple,
                                     convert=convert)
Example #29
    def iterator(self,
                 mode=None,
                 batch_size=None,
                 num_batches=None,
                 rng=None,
                 data_specs=None,
                 return_tuple=False):
        """
        .. todo::

            WRITEME
        """
        if data_specs is None:
            data_specs = self._iter_data_specs

        # If there is a view_converter, we have to use it to convert
        # the stored data for "features" into one that the iterator
        # can return.
        space, source = data_specs
        if isinstance(space, CompositeSpace):
            sub_spaces = space.components
            sub_sources = source
        else:
            sub_spaces = (space, )
            sub_sources = (source, )

        convert = []
        for sp, src in safe_zip(sub_spaces, sub_sources):
            convert.append(None)

        # TODO: Refactor
        if mode is None:
            if hasattr(self, '_iter_subset_class'):
                mode = self._iter_subset_class
            else:
                raise ValueError('iteration mode not provided and no default '
                                 'mode set for %s' % str(self))
        else:
            mode = resolve_iterator_class(mode)

        if batch_size is None:
            batch_size = getattr(self, '_iter_batch_size', None)
        if num_batches is None:
            num_batches = getattr(self, '_iter_num_batches', None)
        if rng is None and mode.stochastic:
            rng = self.rng
        return FiniteDatasetIterator(self,
                                     mode(self.num_examples, batch_size,
                                          num_batches, rng),
                                     data_specs=data_specs,
                                     return_tuple=return_tuple,
                                     convert=convert)
    def iterator(self, mode="sequential", batch_size=None, num_batches=None, rng=None):
        """
        Method inherited from the Dataset.
        """
        if batch_size is None and mode == "sequential":
            batch_size = 100  # Has to be big enough or we'll never pick anything.

        self.batch_size = batch_size
        self.mode = resolve_iterator_class(mode)

        self.subset_iterator = self.mode(self.total_n_exs, batch_size, num_batches, rng=None)

        return EmotiwArrangerIter(self, self.subset_iterator, batch_size=batch_size)
Example #31
    def iterator(self, mode=None, batch_size=1, num_batches=None,
                 rng=None, data_specs=None, return_tuple=False):

        if num_batches is None:
            num_batches = len(self.X1) / (batch_size)

        mode = resolve_iterator_class(mode)
        i = FiniteDatasetIterator(
            self,
            mode(len(self.X1), batch_size, num_batches, rng),
            data_specs=data_specs,
        )
        return i
Example #32
    def iterator(self,
                 mode=None,
                 batch_size=None,
                 num_batches=None,
                 rng=None,
                 data_specs=None,
                 return_tuple=False):
        if mode is None:
            if hasattr(self, '_iter_subset_class'):
                mode = self._iter_subset_class
            else:
                raise ValueError('iteration mode not provided and no default '
                                 'mode set for %s' % str(self))
        else:
            mode = resolve_iterator_class(mode)

        if batch_size is None:
            batch_size = getattr(self, '_iter_batch_size', None)
        if num_batches is None:
            num_batches = getattr(self, '_iter_num_batches', None)
        if rng is None and mode.stochastic:
            rng = self.rng
        if data_specs is None:
            data_specs = getattr(self, '_iter_data_specs', None)

        # TODO: figure out where to do the scaling more cleanly.
        def list_to_scaled_array(batch):
            # batch is either a 4D ndarray, or a list of length 1
            # containing a 4D ndarray. Make it a 5D ndarray,
            # with shape 1 on the first dimension.
            # Also scale it from [0, 255] to [0, 1]
            if isinstance(batch, list):
                assert len(batch) == 1
                batch = batch[0]
            batch = batch.astype(config.floatX)
            batch /= 255.
            return batch[np.newaxis]

        convert_fns = []
        for space in data_specs[0].components:
            if (isinstance(space, FaceTubeSpace) and space.axes[0] == 'b'):
                convert_fns.append(list_to_scaled_array)
            else:
                convert_fns.append(None)

        return FiniteDatasetIteratorVariableSize(self,
                                                 mode(self.n_samples,
                                                      batch_size, num_batches,
                                                      rng),
                                                 data_specs=data_specs,
                                                 return_tuple=return_tuple,
                                                 convert_fns=convert_fns)
Example #33
    def __init__(self, data=None, data_specs=None, rng=_default_seed,
                 preprocessor=None, fit_preprocessor=False):
        """
        Parameters
        ----------
        data: ndarray, or tuple of ndarrays, containing the data.
            It is formatted as specified in `data_specs`.
            For instance, if `data_specs` is (VectorSpace(nfeat), 'features'),
            then `data` has to be a 2-d ndarray, of shape (nb examples,
            nfeat), that defines an unlabeled dataset. If `data_specs`
            is (CompositeSpace(Conv2DSpace(...), VectorSpace(1)),
            ('features', 'target')), then `data` has to be an (X, y) pair,
            with X being an ndarray containing images stored in the topological
            view specified by the `Conv2DSpace`, and y being a 2-D ndarray
            of width 1, containing the labels or targets for each example.

        data_specs: A (space, source) pair, where space is an instance of
            `Space` (possibly a `CompositeSpace`), and `source` is a
            string (or tuple of strings, if `space` is a `CompositeSpace`),
            defining the format and labels associated to `data`.

        rng : object, optional
            A random number generator used for picking random
            indices into the design matrix when choosing minibatches.

        preprocessor: WRITEME

        fit_preprocessor: WRITEME
        """
        # data_specs should be flat, and there should be no
        # duplicates in source, as we keep only one version
        assert is_flat_specs(data_specs)
        if isinstance(data_specs[1], tuple):
            assert sorted(set(data_specs[1])) == sorted(data_specs[1])
        self.data = data
        self.data_specs = data_specs

        self.compress = False
        self.design_loc = None
        if hasattr(rng, 'random_integers'):
            self.rng = rng
        else:
            self.rng = np.random.RandomState(rng)
        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('sequential')

        if preprocessor:
            preprocessor.apply(self, can_fit=fit_preprocessor)
        self.preprocessor = preprocessor
Example #34
    def __init__(self, X, y=None, rng=None):

        self.X = X.astype(theano.config.floatX)
        self.y = y
        
        self.compress = False
        self.design_loc = None
        if rng is None:
            rng = np.random.RandomState(SparseDesignMatrix._default_seed)
        self.rng = rng
        
        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('sequential')
        self._iter_topo = False
        self._iter_targets = False
Example #35
 def set_iteration_scheme(self, mode=None, batch_size=None, num_batches=None, topo=False, targets=False):
     if mode is not None:
         self._iter_subset_class = mode = resolve_iterator_class(mode)
     elif hasattr(self, "_iter_subset_class"):
         mode = self._iter_subset_class
     else:
         raise ValueError("iteration mode not provided and no default " "mode set for %s" % str(self))
     # If this didn't raise an exception, we should be fine.
     self._iter_batch_size = batch_size
     self._iter_num_batches = num_batches
     self._iter_topo = topo
     self._iter_targets = targets
     # Try to create an iterator with these settings.
     rng = self.rng if mode.stochastic else None
     test = self.iterator(mode, batch_size, num_batches, topo, rng=rng)
Example #36
    def __init__(self,
                 data_generator=None,
                 n_classes=101,
                 n_examples=10,
                 n_frames=10,
                 n_features=4096):
        """
		:type data_generator: function
		:param data_generator: function used to generate data in the form of X, y tuple. X is a 3-dimensional array with dimensions (examples, frames/time, features). y is a 2-dimensional array with dimensions (examples, target values). Optional value defaults to generating random therefore 'hard' data.

		:type n_classes: int
		:param n_classes: the number of possible target values or n_classes

		:type n_examples: int
		:param n_examples: the number of examples to be generated in the dataset

		:type n_frames: int
		:param n_frames: the number of frames or time steps in each example

		:type n_features: int
		:param n_features: the number of features in each time step
		"""
        rng = np.random.RandomState(seed=42)
        self.n_features = n_features
        self.n_examples = n_examples
        if data_generator is None:
            data_generator = hard_data_generator
        self.data_generator = data_generator
        self.X, self.y = self.data_generator(n_classes, n_examples, n_frames,
                                             n_features)

        features_space = VectorSequenceSpace(dim=self.n_features)
        # features_space = SequenceDataSpace(VectorSpace(dim=self.n_features))

        targets_space = VectorSequenceSpace(dim=1)
        # targets_space = SequenceDataSpace(VectorSpace(dim=1))

        space_components = [features_space, targets_space]
        space = CompositeSpace(space_components)

        source = ('features', 'targets')

        self.data_specs = (space, source)

        self._iter_mode = resolve_iterator_class('shuffled_sequential')
        self._iter_data_specs = (CompositeSpace(
            (features_space, targets_space)), source)
Example #37
    def iterator(self, mode=None, batch_size=None, num_batches=None, rng=None, data_specs=None, return_tuple=False):
        """
        .. todo::

            WRITEME
        """
        if data_specs is None:
            data_specs = self._iter_data_specs

        # If there is a view_converter, we have to use it to convert
        # the stored data for "features" into one that the iterator
        # can return.
        space, source = data_specs
        if isinstance(space, CompositeSpace):
            sub_spaces = space.components
            sub_sources = source
        else:
            sub_spaces = (space,)
            sub_sources = (source,)

        convert = []
        for sp, src in safe_zip(sub_spaces, sub_sources):
            convert.append(None)

        # TODO: Refactor
        if mode is None:
            if hasattr(self, "_iter_subset_class"):
                mode = self._iter_subset_class
            else:
                raise ValueError("iteration mode not provided and no default " "mode set for %s" % str(self))
        else:
            mode = resolve_iterator_class(mode)

        if batch_size is None:
            batch_size = getattr(self, "_iter_batch_size", None)
        # if num_batches is None:
        #    num_batches = getattr(self, '_iter_num_batches', None)
        num_batches = self._iter_num_batches
        if rng is None and mode.stochastic:
            rng = self.rng
        return FiniteDatasetIterator(
            self,
            mode(self.num_examples, batch_size, num_batches, rng),
            data_specs=data_specs,
            return_tuple=return_tuple,
            convert=convert,
        )
Example #38
    def __init__(self,
                 dataset,
                 batch_size,
                 num_batches,
                 mode,
                 iterator_post_processors=(GaussianNoisePostProcessor(
                     .01, 0, .5), GaussianNoisePostProcessor(.1, 0, .001))):
        def _validate_batch_size(batch_size, dataset):
            if not batch_size:
                raise ValueError("batch size is none")

            num_examples = dataset.get_num_examples()
            if batch_size > num_examples:
                raise ValueError(
                    "batch size: %i is too large, dataset has %i examples"
                    % (batch_size, num_examples))

            if batch_size < 0:
                raise ValueError("batch size: %i cannot be negative",
                                 batch_size)

            if not isinstance(batch_size, int):
                raise ValueError("batch_size is not an int")

        def _validate_num_batches(num_batches):
            if not num_batches:
                raise ValueError("num_batches is none")

            if num_batches < 0:
                raise ValueError("num_batches: %i cannot be negative",
                                 num_batches)

            if not isinstance(num_batches, int):
                raise ValueError("num_batches is not an int")

        self.dataset = dataset
        dataset_size = dataset.get_num_examples()

        _validate_batch_size(batch_size, dataset)
        _validate_num_batches(num_batches)

        subset_iterator_class = resolve_iterator_class(mode)
        self._subset_iterator = subset_iterator_class(dataset_size, batch_size,
                                                      num_batches)

        self.iterator_post_processors = iterator_post_processors
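The `_subset_iterator` built in the last lines yields batches of indices rather than data; the enclosing dataset iterator then gathers the rows and the post-processors perturb them. A small sketch of that indexing behaviour, assuming `resolve_iterator_class` from `pylearn2.utils.iteration` (in the pylearn2 versions this code targets, 'sequential' resolves to a subset iterator that yields slice objects):

    from pylearn2.utils.iteration import resolve_iterator_class

    subset_iterator_class = resolve_iterator_class('sequential')
    # arguments: dataset_size, batch_size, num_batches
    it = subset_iterator_class(10, 4, None)
    for idx in it:
        print(idx)  # slice(0, 4, ...), slice(4, 8, ...), then the remainder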
Beispiel #39
0
    def iterator(self,
                 mode=None,
                 batch_size=1,
                 num_batches=None,
                 rng=None,
                 data_specs=None,
                 return_tuple=False):

        if num_batches is None:
            num_batches = len(self.X1) // batch_size

        mode = resolve_iterator_class(mode)
        i = FiniteDatasetIterator(
            self,
            mode(len(self.X1), batch_size, num_batches, rng),
            data_specs=data_specs,
        )
        return i
Beispiel #40
0
    def __init__(self, which_set, context_len, shuffle=False):
        """
        Loads the data and turns it into n-grams
        """

        self.__dict__.update(locals())
        del self.self

        path = ("${PYLEARN2_DATA_PATH}/PennTreebankCorpus/" +
                "penntree_char_and_word.npz")
        npz_data = serial.load(path)

        if which_set == 'train':
            self._raw_data = npz_data['train_words']
        elif which_set == 'valid':
            self._raw_data = npz_data['valid_words']
        elif which_set == 'test':
            self._raw_data = npz_data['test_words']
        else:
            raise ValueError("Dataset must be one of 'train', 'valid' "
                             "or 'test'")
        del npz_data  # Free up some memory?
        self._data = as_strided(self._raw_data,
                                shape=(len(self._raw_data) - context_len,
                                       context_len + 1),
                                strides=(self._raw_data.itemsize,
                                         self._raw_data.itemsize))
        super(PennTreebank, self).__init__(
            X=self._data[:, :-1],
            y=self._data[:, -1:],
            X_labels=10000, y_labels=10000
        )

        print which_set
        print self._data[:, -1:].shape

        if shuffle:
            warnings.warn("Note that the PennTreebank samples are only "
                          "shuffled when the iterator method is used to "
                          "retrieve them.")
            self._iter_subset_class = resolve_iterator_class(
                'shuffled_sequential'
            )
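The `as_strided` call above is the whole n-gram trick: by giving both dimensions a stride of one `itemsize`, row `i` becomes the window `raw[i:i + context_len + 1]`, so contexts and next-word targets come out of one shared buffer without copying. A self-contained illustration in plain numpy, independent of the dataset class:

    import numpy as np
    from numpy.lib.stride_tricks import as_strided

    raw = np.arange(10, dtype=np.int32)  # stand-in for the word-index array
    context_len = 3
    windows = as_strided(raw,
                         shape=(len(raw) - context_len, context_len + 1),
                         strides=(raw.itemsize, raw.itemsize))
    print(windows[0])  # [0 1 2 3]
    print(windows[1])  # [1 2 3 4] -- shifted by one word, no copy made
    # X = windows[:, :-1] are the contexts, y = windows[:, -1:] the targets.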
Beispiel #41
0
    def iterator(self, mode=None, batch_size=None, num_batches=None,
                 rng=None, data_specs=None, return_tuple=False):
        if mode is None:
            if hasattr(self, "_iter_subset_class"):
                mode = self._iter_subset_class
            else:
                raise ValueError("iteration mode not provided and no default "
                                 "mode set for %s" % str(self))
        else:
            mode = resolve_iterator_class(mode)

        if batch_size is None:
            batch_size = getattr(self, "_iter_batch_size", None)
        if num_batches is None:
            num_batches = getattr(self, "_iter_num_batches", None)
        if rng is None and mode.stochastic:
            rng = self.rng
        if data_specs is None:
            data_specs = getattr(self, "_iter_data_specs", None)

        # TODO: figure out where to do the scaling more cleanly.
        def list_to_scaled_array(batch):
            # batch is either a 4D ndarray, or a list of length 1
            # containing a 4D ndarray. Make it a 5D ndarray,
            # with shape 1 on the first dimension.
            # Also scale it from [0, 255] to [0, 1]
            if isinstance(batch, list):
                assert len(batch) == 1
                batch = batch[0]
            batch = batch.astype(config.floatX)
            batch /= 255.0
            return batch[np.newaxis]

        convert_fns = []
        for space in data_specs[0].components:
            if isinstance(space, FaceTubeSpace) and space.axes[0] == "b":
                convert_fns.append(list_to_scaled_array)
            else:
                convert_fns.append(None)

        return FiniteDatasetIteratorVariableSize(
            self,
            mode(self.n_samples, batch_size, num_batches, rng),
            data_specs=data_specs,
            return_tuple=return_tuple,
            convert_fns=convert_fns,
        )
Beispiel #42
0
    def __init__(self,
                 alpha_list=[1.4],
                 beta_list=[0.3],
                 init_state_list=[numpy.array([0, 0])],
                 num_samples=1000,
                 frame_length=1,
                 rng=None):
        # Validate parameters and set member variables
        self.alpha_list = alpha_list
        self.beta_list = beta_list

        if num_samples <= 0:
            raise ValueError("num_samples must be positive.")
        self.num_samples = num_samples
        self.num_examples = len(alpha_list)
        self.frame_length = frame_length

        self.init_state_list = init_state_list

        # Initialize RNG
        if rng is None:
            self.rng = numpy.random.RandomState(self._default_seed)
        else:
            self.rng = numpy.random.RandomState(rng)

        X, y = self._generate_data()
        self.data = (X, y)

        # DataSpecs
        features_space = VectorSpace(dim=2 * self.frame_length)
        features_source = 'features'

        targets_space = VectorSpace(dim=2)
        targets_source = 'targets'

        space = CompositeSpace([features_space, targets_space])
        source = tuple([features_source, targets_source])
        self.data_specs = (space, source)

        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('shuffled_sequential')
        self._iter_data_specs = (CompositeSpace(
            (features_space, targets_space)), (features_source,
                                               targets_source))
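`_generate_data` itself is not shown, but the defaults `alpha=1.4`, `beta=0.3` and the 2-dimensional state are the classic Hénon-map parameters, so a compatible generator plausibly iterates that recurrence. The following is only a guess at such a generator, not the original implementation:

    import numpy

    def henon_sequence(alpha=1.4, beta=0.3,
                       init_state=(0.0, 0.0), num_samples=1000):
        # Henon map: x' = 1 - alpha * x**2 + y, y' = beta * x.
        # Assumed stand-in for _generate_data; the original is not shown.
        states = numpy.empty((num_samples, 2))
        x, y = init_state
        for t in range(num_samples):
            states[t] = (x, y)
            x, y = 1.0 - alpha * x * x + y, beta * x
        return states

    seq = henon_sequence(num_samples=3)
    print(seq)  # [[ 0.   0. ], [ 1.   0. ], [-0.4  0.3]]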
Beispiel #43
0
    def __init__(self, data=None, data_specs=None, rng=_default_seed,
                 preprocessor=None, fit_preprocessor=False):
        # data_specs should be flat, and there should be no
        # duplicates in source, as we keep only one version
        assert is_flat_specs(data_specs)
        if isinstance(data_specs[1], tuple):
            assert sorted(set(data_specs[1])) == sorted(data_specs[1])
        self.data = data
        self.data_specs = data_specs

        self.compress = False
        self.design_loc = None
        self.rng = make_np_rng(rng, which_method='random_integers')
        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('sequential')

        if preprocessor:
            preprocessor.apply(self, can_fit=fit_preprocessor)
        self.preprocessor = preprocessor
Beispiel #44
0
    def iterator(self, mode=None, batch_size=None, num_batches=None,
                 topo=None, return_tuple=False, targets=None, rng=None,
                 data_specs=None):

        mode = resolve_iterator_class(mode)
        self.data_specs = data_specs
        if self.do_dropoout:
            return MultiViewDatasetIteratorDropout(
                self,
                mode(self.data[0].shape[0], batch_size, num_batches, rng),
                self.p_dropout, 0.5, 0.5,
                data_specs=data_specs)
        else:
            return FiniteDatasetIterator(
                self,
                mode(self.data[0].shape[0], batch_size, num_batches, rng),
                data_specs=data_specs)
Beispiel #45
0
    def __init__(self, dataset,
                 batch_size,
                 num_batches,
                 mode,
                 iterator_post_processors=(GaussianNoisePostProcessor(.01, 0, .5),
                                           GaussianNoisePostProcessor(.1, 0, .001))):

        def _validate_batch_size(batch_size, dataset):
            if not batch_size:
                raise ValueError("batch size is none")

            num_examples = dataset.get_num_examples()
            if batch_size > num_examples:
                raise ValueError("batch size: %i is too large, dataset has "
                                 "%i examples" % (batch_size, num_examples))

            if batch_size < 0:
                raise ValueError("batch size: %i cannot be negative"
                                 % batch_size)

            if not isinstance(batch_size, int):
                raise ValueError("batch_size is not an int")

        def _validate_num_batches(num_batches):
            if not num_batches:
                raise ValueError("num_batches is none")

            if num_batches < 0:
                raise ValueError("num_batches: %i cannot be negative"
                                 % num_batches)

            if not isinstance(num_batches, int):
                raise ValueError("num_batches is not an int")

        self.dataset = dataset
        dataset_size = dataset.get_num_examples()

        _validate_batch_size(batch_size, dataset)
        _validate_num_batches(num_batches)

        subset_iterator_class = resolve_iterator_class(mode)
        self._subset_iterator = subset_iterator_class(dataset_size, batch_size, num_batches)

        self.iterator_post_processors = iterator_post_processors
Beispiel #46
0
    def iterator(self,
                 mode=None,
                 batch_size=None,
                 num_batches=None,
                 topo=None,
                 targets=None,
                 rng=None):
        """
        Method inherited from the Dataset.
        """
        self.mode = mode
        self.batch_size = batch_size
        self._targets = targets
        mode = resolve_iterator_class(mode)

        self.subset_iterator = mode(self.total_n_exs,
                                    batch_size,
                                    num_batches,
                                    rng=rng)

        return EmotiwArrangerIter(self, mode, batch_size=batch_size)
Beispiel #47
0
    def set_iteration_scheme(self,
                             mode=None,
                             batch_size=None,
                             num_batches=None,
                             topo=False,
                             targets=False):
        if mode is not None:
            self._iter_subset_class = mode = resolve_iterator_class(mode)
        elif hasattr(self, '_iter_subset_class'):
            mode = self._iter_subset_class
        else:
            raise ValueError('iteration mode not provided and no default '
                             'mode set for %s' % str(self))
        # If this didn't raise an exception, we should be fine.
        self._iter_batch_size = batch_size
        self._iter_num_batches = num_batches
        self._iter_topo = topo
        self._iter_targets = targets
        # Try to create an iterator with these settings.
        rng = self.rng if mode.stochastic else None
        test = self.iterator(mode, batch_size, num_batches, topo, rng=rng)
Beispiel #48
0
    def __init__(self, data_generator=None, n_classes=101, n_examples=10,
                 n_frames=10, n_features=4096):
        """
        :type data_generator: function
        :param data_generator: function used to generate data in the form of
            an (X, y) tuple. X is a 3-dimensional array with dimensions
            (examples, frames/time, features). y is a 2-dimensional array
            with dimensions (examples, target values). Defaults to generating
            random, therefore 'hard', data.

        :type n_classes: int
        :param n_classes: the number of possible target values

        :type n_examples: int
        :param n_examples: the number of examples to be generated in the
            dataset

        :type n_frames: int
        :param n_frames: the number of frames or time steps in each example

        :type n_features: int
        :param n_features: the number of features in each time step
        """
        rng = np.random.RandomState(seed=42)
        self.n_features = n_features
        self.n_examples = n_examples
        if data_generator is None:
            data_generator = hard_data_generator
        self.data_generator = data_generator
        self.X, self.y = self.data_generator(n_classes, n_examples,
                                             n_frames, n_features)

        features_space = VectorSequenceSpace(dim=self.n_features)
        # features_space = SequenceDataSpace(VectorSpace(dim=self.n_features))

        targets_space = VectorSequenceSpace(dim=1)
        # targets_space = SequenceDataSpace(VectorSpace(dim=1))

        space_components = [features_space, targets_space]
        space = CompositeSpace(space_components)

        source = ('features', 'targets')

        self.data_specs = (space, source)

        self._iter_mode = resolve_iterator_class('shuffled_sequential')
        self._iter_data_specs = (CompositeSpace(
            (features_space, targets_space)), source)
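The docstring above fully specifies the `data_generator` contract. Since `hard_data_generator` is not shown, here is a hypothetical generator that satisfies the same shapes, useful for plugging into this constructor in tests:

    import numpy as np

    def random_data_generator(n_classes, n_examples, n_frames, n_features):
        # Illustrative stand-in: X is (examples, frames, features),
        # y is (examples, target values), as the docstring requires.
        rng = np.random.RandomState(42)
        X = rng.uniform(size=(n_examples, n_frames, n_features))
        y = rng.randint(n_classes, size=(n_examples, 1)).astype('float64')
        return X, y

    X, y = random_data_generator(101, 10, 10, 4096)
    print(X.shape)  # (10, 10, 4096)
    print(y.shape)  # (10, 1)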
Beispiel #49
0
    def iterator(self,
                 mode=None,
                 batch_size=None,
                 num_batches=None,
                 rng=None,
                 data_specs=None,
                 return_tuple=False):
        allowed_modes = ('sequential', 'random_slice', 'even_sequential',
                         'batchwise_shuffled_sequential',
                         'even_batchwise_shuffled_sequential')
        if mode is not None and mode not in allowed_modes:
            raise ValueError("Due to HDF5 limitations on advanced indexing, " +
                             "the '" + mode + "' iteration mode is not " +
                             "supported")

        if data_specs is None:
            data_specs = self._iter_data_specs

        space, source = data_specs
        sub_spaces, sub_sources = ((space.components, source) if isinstance(
            space, CompositeSpace) else ((space, ), (source, )))
        convert = [None for sp, src in safe_izip(sub_spaces, sub_sources)]

        mode = (self._iter_subset_class
                if mode is None else resolve_iterator_class(mode))

        if batch_size is None:
            batch_size = getattr(self, '_iter_batch_size', None)
        if num_batches is None:
            num_batches = getattr(self, '_iter_num_batches', None)
        if rng is None and mode.stochastic:
            rng = self.rng
        return VariableImageDatasetIterator(dataset=self,
                                            subset_iterator=mode(
                                                self.num_examples, batch_size,
                                                num_batches, rng),
                                            data_specs=data_specs,
                                            return_tuple=return_tuple,
                                            convert=convert)
Beispiel #50
0
    def __init__(self, which_set, context_len, shuffle=True):
        """
        Loads the data and turns it into n-grams
        """

        self.__dict__.update(locals())
        del self.self

        path = ("${PYLEARN2_DATA_PATH}/PennTreebankCorpus/" +
                "penntree_char_and_word.npz")
        npz_data = serial.load(path)
        if which_set == 'train':
            self._raw_data = npz_data['train_words']
        elif which_set == 'valid':
            self._raw_data = npz_data['valid_words']
        elif which_set == 'test':
            self._raw_data = npz_data['test_words']
        else:
            raise ValueError("Dataset must be one of 'train', 'valid' "
                             "or 'test'")
        del npz_data  # Free up some memory?

        self._data = as_strided(self._raw_data,
                                shape=(len(self._raw_data) - context_len,
                                       context_len + 1),
                                strides=(self._raw_data.itemsize,
                                         self._raw_data.itemsize))

        super(PennTreebank, self).__init__(X=self._data[:, :-1],
                                           y=self._data[:, -1:],
                                           X_labels=10000,
                                           y_labels=10000)

        if shuffle:
            warnings.warn("Note that the PennTreebank samples are only "
                          "shuffled when the iterator method is used to "
                          "retrieve them.")
            self._iter_subset_class = resolve_iterator_class(
                'shuffled_sequential')
Beispiel #51
0
    def iterator(self, mode=None, batch_size=None, num_batches=None,
                 topo=None, targets=None, rng=None, data_specs=None,
                 return_tuple=False):

        space, source = self.data_specs
        subspaces = space.components
        subsources = source
        mode = resolve_iterator_class("shuffled_sequential")

        rng = self.rng
        assert rng is not None
        subset_iterator = mode(self.y.shape[0],
                               batch_size,
                               num_batches,
                               rng=rng)

        return FiniteDatasetIterator(
            self,
            subset_iterator=subset_iterator,
            data_specs=data_specs,
            return_tuple=return_tuple,
            convert=self.convert)
Beispiel #52
0
    def __init__(self, which_set, context_len, data_mode, shuffle=True):
        self.__dict__.update(locals())
        del self.self

        # Load data into self._data (defined in PennTreebank)
        self._load_data(which_set, context_len, data_mode)

        self._data = as_strided(self._raw_data,
                                shape=(len(self._raw_data) - context_len,
                                       context_len + 1),
                                strides=(self._raw_data.itemsize,
                                         self._raw_data.itemsize))

        super(PennTreebankNGrams, self).__init__(X=self._data[:, :-1],
                                                 y=self._data[:, -1:],
                                                 X_labels=self._max_labels,
                                                 y_labels=self._max_labels)

        if shuffle:
            warnings.warn("Note that the PennTreebank samples are only "
                          "shuffled when the iterator method is used to "
                          "retrieve them.")
            self._iter_subset_class = resolve_iterator_class(
                'shuffled_sequential')
Beispiel #53
0
    def __init__(self, X=None, topo_view=None, y=None,
                 view_converter=None, axes=('b', 0, 1, 'c'),
                 rng=_default_seed, preprocessor=None, fit_preprocessor=False,
                 max_labels=None):
        self.X = X
        self.y = y
        self.max_labels = max_labels

        if max_labels is not None:
            assert y is not None
            assert np.all(y < max_labels)

        if topo_view is not None:
            assert view_converter is None
            self.set_topological_view(topo_view, axes)
        else:
            assert X is not None, ("DenseDesignMatrix needs to be provided "
                                   "with either topo_view, or X")
            if view_converter is not None:
                self.view_converter = view_converter

                # Get the topo_space (usually Conv2DSpace) from the
                # view_converter
                if not hasattr(view_converter, 'topo_space'):
                    raise NotImplementedError("Not able to get a topo_space "
                                              "from this converter: %s"
                                              % view_converter)

                # self.X_topo_space stores a "default" topological space that
                # will be used only when self.iterator is called without a
                # data_specs, and with "topo=True", which is deprecated.
                self.X_topo_space = view_converter.topo_space
            else:
                self.X_topo_space = None

            # Update data specs, if not done in set_topological_view
            X_space = VectorSpace(dim=self.X.shape[1])
            X_source = 'features'
            if y is None:
                space = X_space
                source = X_source
            else:
                if self.y.ndim == 1:
                    dim = 1
                else:
                    dim = self.y.shape[-1]
                y_space = VectorSpace(dim=dim)
                y_source = 'targets'

                space = CompositeSpace((X_space, y_space))
                source = (X_source, y_source)
            self.data_specs = (space, source)
            self.X_space = X_space

        self.compress = False
        self.design_loc = None
        self.rng = make_np_rng(rng, which_method="random_integers")
        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('sequential')
        self._iter_topo = False
        self._iter_targets = False
        self._iter_data_specs = (self.X_space, 'features')

        if preprocessor:
            preprocessor.apply(self, can_fit=fit_preprocessor)
        self.preprocessor = preprocessor
Beispiel #54
0
    def iterator(self,
                 mode=None,
                 batch_size=None,
                 num_batches=None,
                 topo=None,
                 targets=None,
                 rng=None,
                 data_specs=None,
                 return_tuple=False):

        warnings.warn(
            "Overloading this method is not necessary with the new "
            "interface change and this will be removed around November "
            "7th 2013",
            stacklevel=2)

        if topo is not None or targets is not None:
            if data_specs is not None:
                raise ValueError(
                    "In DenseDesignMatrix.iterator, both "
                    "the `data_specs` argument and deprecated arguments "
                    "`topo` or `targets` were provided.",
                    (data_specs, topo, targets))

            warnings.warn(
                "Usage of `topo` and `target` arguments are being "
                "deprecated, and will be removed around November 7th, "
                "2013. `data_specs` should be used instead.",
                stacklevel=2)
            # build data_specs from topo and targets if needed
            if topo is None:
                topo = getattr(self, '_iter_topo', False)
            if topo:
                # self.iterator is called without a data_specs, and with
                # "topo=True", so we use the default topological space
                # stored in self.X_topo_space
                assert self.X_topo_space is not None
                X_space = self.X_topo_space
            else:
                X_space = self.X_space

            if targets is None:
                targets = getattr(self, '_iter_targets', False)
            if targets:
                assert self.y is not None
                y_space = self.data_specs[0].components[1]
                space = CompositeSpace((X_space, y_space))
                source = ('features', 'targets')
            else:
                space = X_space
                source = 'features'

            data_specs = (space, source)
            _deprecated_interface = True
        else:
            _deprecated_interface = False

        # TODO: Refactor
        if mode is None:
            if hasattr(self, '_iter_subset_class'):
                mode = self._iter_subset_class
            else:
                raise ValueError('iteration mode not provided and no default '
                                 'mode set for %s' % str(self))
        else:
            mode = resolve_iterator_class(mode)

        if batch_size is None:
            batch_size = getattr(self, '_iter_batch_size', None)
        if num_batches is None:
            num_batches = getattr(self, '_iter_num_batches', None)
        if rng is None and mode.stochastic:
            rng = self.rng
        if data_specs is None:
            data_specs = self._iter_data_specs
        if _deprecated_interface:
            return FiniteDatasetIteratorPyTables(self,
                                                 mode(self.X.shape[0],
                                                      batch_size, num_batches,
                                                      rng),
                                                 data_specs=data_specs,
                                                 return_tuple=return_tuple)
        else:
            return FiniteDatasetIterator(self,
                                         mode(self.X.shape[0], batch_size,
                                              num_batches, rng),
                                         data_specs=data_specs,
                                         return_tuple=return_tuple)
Beispiel #55
0
    def __init__(self,
                 which_set,
                 frame_length,
                 overlap=0,
                 frames_per_example=1,
                 start=0,
                 stop=None,
                 audio_only=False,
                 rng=_default_seed):
        """
        Parameters
        ----------
        which_set : str
            Either "train", "valid" or "test"
        frame_length : int
            Number of acoustic samples contained in a frame
        overlap : int, optional
            Number of overlapping acoustic samples for two consecutive frames.
            Defaults to 0, meaning frames don't overlap.
        frames_per_example : int, optional
            Number of frames in a training example. Defaults to 1.
        start : int, optional
            Starting index of the sequences to use. Defaults to 0.
        stop : int, optional
            Ending index of the sequences to use. Defaults to `None`, meaning
            sequences are selected all the way to the end of the array.
        audio_only : bool, optional
            Whether to load only the raw audio and no auxiliary information.
            Defaults to `False`.
        rng : object, optional
            A random number generator used for picking random indices into the
            design matrix when choosing minibatches.
        """
        self.frame_length = frame_length
        self.overlap = overlap
        self.frames_per_example = frames_per_example
        self.offset = self.frame_length - self.overlap
        self.audio_only = audio_only

        # RNG initialization
        if hasattr(rng, 'random_integers'):
            self.rng = rng
        else:
            self.rng = numpy.random.RandomState(rng)

        # Load data from disk
        self._load_data(which_set)
        # Standardize data
        for i, sequence in enumerate(self.raw_wav):
            self.raw_wav[i] = (sequence - TIMIT._mean) / TIMIT._std

        if not self.audio_only:
            self.num_phones = numpy.max(
                [numpy.max(sequence) for sequence in self.phones]) + 1
            self.num_phonemes = numpy.max(
                [numpy.max(sequence) for sequence in self.phonemes]) + 1
            self.num_words = numpy.max(
                [numpy.max(sequence) for sequence in self.words]) + 1
            # The following is hard coded. However, the way it is done above
            # could be problematic if a max value (the max over the whole
            # dataset (train + valid + test)) is not present in at least
            # one of the three subsets. This is the case for speakers. This
            # is not the case for phones.
            self.num_speakers = 630

        # Slice data
        if stop is not None:
            self.raw_wav = self.raw_wav[start:stop]
            if not self.audio_only:
                self.phones = self.phones[start:stop]
                self.phonemes = self.phonemes[start:stop]
                self.words = self.words[start:stop]
                self.speaker_id = self.speaker_id[start:stop]
        else:
            self.raw_wav = self.raw_wav[start:]
            if not self.audio_only:
                self.phones = self.phones[start:]
                self.phonemes = self.phonemes[start:]
                self.words = self.words[start:]
                self.speaker_id = self.speaker_id[start:]

        examples_per_sequence = [0]

        for sequence_id, samples_sequence in enumerate(self.raw_wav):
            if not self.audio_only:
                # Phones segmentation
                phones_sequence = self.phones[sequence_id]
                phones_segmented_sequence = segment_axis(
                    phones_sequence, frame_length, overlap)
                self.phones[sequence_id] = phones_segmented_sequence
                # phones_segmented_sequence = scipy.stats.mode(
                #     phones_segmented_sequence,
                #     axis=1
                # )[0].flatten()
                # phones_segmented_sequence = numpy.asarray(
                #     phones_segmented_sequence,
                #     dtype='int'
                # )
                # phones_sequence_list.append(phones_segmented_sequence)
                # Phonemes segmentation
                phonemes_sequence = self.phonemes[sequence_id]
                phonemes_segmented_sequence = segment_axis(
                    phonemes_sequence, frame_length, overlap)
                self.phonemes[sequence_id] = phonemes_segmented_sequence
                # phonemes_segmented_sequence = scipy.stats.mode(
                #     phonemes_segmented_sequence,
                #     axis=1
                # )[0].flatten()
                # phonemes_segmented_sequence = numpy.asarray(
                #     phonemes_segmented_sequence,
                #     dtype='int'
                # )
                # phonemes_sequence_list.append(phonemes_segmented_sequence)
                # Words segmentation
                words_sequence = self.words[sequence_id]
                words_segmented_sequence = segment_axis(
                    words_sequence, frame_length, overlap)
                self.words[sequence_id] = words_segmented_sequence
                # words_segmented_sequence = scipy.stats.mode(
                #     words_segmented_sequence,
                #     axis=1
                # )[0].flatten()
                # words_segmented_sequence = numpy.asarray(words_segmented_sequence,
                #                                          dtype='int')
                # words_sequence_list.append(words_segmented_sequence)

            # TODO: look at this, does it force copying the data?
            # Sequence segmentation
            samples_segmented_sequence = segment_axis(samples_sequence,
                                                      frame_length, overlap)
            self.raw_wav[sequence_id] = samples_segmented_sequence

            # TODO: change me
            # Generate features/targets/phones/phonemes/words map
            num_frames = samples_segmented_sequence.shape[0]
            num_examples = num_frames - self.frames_per_example
            examples_per_sequence.append(num_examples)

        self.cumulative_example_indexes = numpy.cumsum(examples_per_sequence)
        self.samples_sequences = self.raw_wav
        if not self.audio_only:
            self.phones_sequences = self.phones
            self.phonemes_sequences = self.phonemes
            self.words_sequences = self.words
        self.num_examples = self.cumulative_example_indexes[-1]

        # DataSpecs
        features_space = VectorSpace(dim=self.frame_length *
                                     self.frames_per_example)
        features_source = 'features'

        def features_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(self.samples_sequences[sequence_index]
                            [example_index:example_index +
                             self.frames_per_example].ravel())
            return rval

        targets_space = VectorSpace(dim=self.frame_length)
        targets_source = 'targets'

        def targets_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(self.samples_sequences[sequence_index][
                    example_index + self.frames_per_example].ravel())
            return rval

        space_components = [features_space, targets_space]
        source_components = [features_source, targets_source]
        map_fn_components = [features_map_fn, targets_map_fn]
        batch_components = [None, None]

        if not self.audio_only:
            phones_space = IndexSpace(max_labels=self.num_phones,
                                      dim=1,
                                      dtype=str(
                                          self.phones_sequences[0].dtype))
            phones_source = 'phones'

            def phones_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(
                        indexes):
                    rval.append(self.phones_sequences[sequence_index][
                        example_index + self.frames_per_example].ravel())
                return rval

            phonemes_space = IndexSpace(max_labels=self.num_phonemes,
                                        dim=1,
                                        dtype=str(
                                            self.phonemes_sequences[0].dtype))
            phonemes_source = 'phonemes'

            def phonemes_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(
                        indexes):
                    rval.append(self.phonemes_sequences[sequence_index][
                        example_index + self.frames_per_example].ravel())
                return rval

            words_space = IndexSpace(max_labels=self.num_words,
                                     dim=1,
                                     dtype=str(self.words_sequences[0].dtype))
            words_source = 'words'

            def words_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(
                        indexes):
                    rval.append(self.words_sequences[sequence_index][
                        example_index + self.frames_per_example].ravel())
                return rval

            speaker_id_space = IndexSpace(max_labels=self.num_speakers,
                                          dim=1,
                                          dtype=str(self.speaker_id.dtype))
            speaker_id_source = 'speaker_id'

            def speaker_id_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(
                        indexes):
                    rval.append(self.speaker_id[sequence_index].ravel())
                return rval

            dialect_space = IndexSpace(max_labels=8, dim=1, dtype='int32')
            dialect_source = 'dialect'

            def dialect_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(
                        indexes):
                    info = self.speaker_info_list[
                        self.speaker_id[sequence_index]]
                    rval.append(index_from_one_hot(info[1:9]))
                return rval

            education_space = IndexSpace(max_labels=6, dim=1, dtype='int32')
            education_source = 'education'

            def education_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(
                        indexes):
                    info = self.speaker_info_list[
                        self.speaker_id[sequence_index]]
                    rval.append(index_from_one_hot(info[9:15]))
                return rval

            race_space = IndexSpace(max_labels=8, dim=1, dtype='int32')
            race_source = 'race'

            def race_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(
                        indexes):
                    info = self.speaker_info_list[
                        self.speaker_id[sequence_index]]
                    rval.append(index_from_one_hot(info[16:24]))
                return rval

            gender_space = IndexSpace(max_labels=2, dim=1, dtype='int32')
            gender_source = 'gender'

            def gender_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(
                        indexes):
                    info = self.speaker_info_list[
                        self.speaker_id[sequence_index]]
                    rval.append(index_from_one_hot(info[24:]))
                return rval

            space_components.extend([
                phones_space, phonemes_space, words_space, speaker_id_space,
                dialect_space, education_space, race_space, gender_space
            ])
            source_components.extend([
                phones_source, phonemes_source, words_source,
                speaker_id_source, dialect_source, education_source,
                race_source, gender_source
            ])
            map_fn_components.extend([
                phones_map_fn, phonemes_map_fn, words_map_fn,
                speaker_id_map_fn, dialect_map_fn, education_map_fn,
                race_map_fn, gender_map_fn
            ])
            batch_components.extend(
                [None, None, None, None, None, None, None, None])

        space = CompositeSpace(space_components)
        source = tuple(source_components)
        self.data_specs = (space, source)
        self.map_functions = tuple(map_fn_components)
        self.batch_buffers = batch_components

        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('shuffled_sequential')
        self._iter_data_specs = (CompositeSpace(
            (features_space, targets_space)), (features_source,
                                               targets_source))
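The frame bookkeeping above reduces to `offset = frame_length - overlap`: a sequence of `n` samples yields `(n - overlap) // offset` complete frames, and since each example spans `frames_per_example` frames plus one target frame, a sequence contributes `num_frames - frames_per_example` examples. A toy check of that arithmetic, assuming `segment_axis` is the usual overlapping-window helper that cuts the ragged tail:

    def count_frames(n_samples, frame_length, overlap):
        # complete frames produced by segment_axis-style windowing
        offset = frame_length - overlap
        return (n_samples - overlap) // offset

    n = 16000                        # e.g. one second of 16 kHz audio
    frame_length, overlap = 240, 80
    num_frames = count_frames(n, frame_length, overlap)
    frames_per_example = 10
    num_examples = num_frames - frames_per_example
    print(num_frames)    # 99
    print(num_examples)  # 89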
Beispiel #56
0
    def __init__(self,
                 which_set,
                 frame_length,
                 start=0,
                 stop=None,
                 audio_only=False,
                 rng=_default_seed):
        """
        Parameters
        ----------
        which_set : str
            Either "train", "valid" or "test"
        frame_length : int
            Number of acoustic samples contained in the sliding window
        start : int, optional
            Starting index of the sequences to use. Defaults to 0.
        stop : int, optional
            Ending index of the sequences to use. Defaults to `None`, meaning
            sequences are selected all the way to the end of the array.
        audio_only : bool, optional
            Whether to load only the raw audio and no auxiliary information.
            Defaults to `False`.
        rng : object, optional
            A random number generator used for picking random indices into the
            design matrix when choosing minibatches.
        """
        self.frame_length = frame_length
        self.audio_only = audio_only

        # RNG initialization
        if hasattr(rng, 'random_integers'):
            self.rng = rng
        else:
            self.rng = numpy.random.RandomState(rng)

        # Load data from disk
        self._load_data(which_set)
        # Standardize data
        for i, sequence in enumerate(self.raw_wav):
            self.raw_wav[i] = (sequence - TIMIT._mean) / TIMIT._std

        if not self.audio_only:
            self.num_phones = numpy.max(
                [numpy.max(sequence) for sequence in self.phones]) + 1
            self.num_phonemes = numpy.max(
                [numpy.max(sequence) for sequence in self.phonemes]) + 1
            self.num_words = numpy.max(
                [numpy.max(sequence) for sequence in self.words]) + 1

        # Slice data
        if stop is not None:
            self.raw_wav = self.raw_wav[start:stop]
            if not self.audio_only:
                self.phones = self.phones[start:stop]
                self.phonemes = self.phonemes[start:stop]
                self.words = self.words[start:stop]
        else:
            self.raw_wav = self.raw_wav[start:]
            if not self.audio_only:
                self.phones = self.phones[start:]
                self.phonemes = self.phonemes[start:]
                self.words = self.words[start:]

        samples_sequences = []
        targets_sequences = []
        phones_sequences = []
        phonemes_sequences = []
        words_sequences = []
        for sequence_id, samples_sequence in enumerate(self.raw_wav):
            # Sequence segmentation
            samples_segmented_sequence = segment_axis(samples_sequence,
                                                      frame_length,
                                                      frame_length - 1)[:-1]
            samples_sequences.append(samples_segmented_sequence)
            targets_sequences.append(samples_sequence[frame_length:].reshape(
                (samples_sequence[frame_length:].shape[0], 1)))
            if not self.audio_only:
                target_phones = self.phones[sequence_id][frame_length:]
                phones_sequences.append(
                    target_phones.reshape((target_phones.shape[0], 1)))
                target_phonemes = self.phonemes[sequence_id][frame_length:]
                phonemes_sequences.append(
                    target_phonemes.reshape((target_phonemes.shape[0], 1)))
                target_words = self.words[sequence_id][frame_length:]
                words_sequences.append(
                    target_words.reshape((target_words.shape[0], 1)))

        del self.raw_wav
        self.samples_sequences = samples_sequences
        self.targets_sequences = targets_sequences
        self.data = [samples_sequences, targets_sequences]
        if not self.audio_only:
            del self.phones
            del self.phonemes
            del self.words
            self.phones_sequences = phones_sequences
            self.phonemes_sequences = phonemes_sequences
            self.words_sequences = words_sequences
            self.data.extend(
                [phones_sequences, phonemes_sequences, words_sequences])
        self.num_examples = len(samples_sequences)

        # DataSpecs
        features_space = VectorSequenceSpace(dim=self.frame_length)
        features_source = 'features'

        targets_space = VectorSequenceSpace(dim=1)
        targets_source = 'targets'

        space_components = [features_space, targets_space]
        source_components = [features_source, targets_source]
        batch_components = [None, None]

        if not self.audio_only:
            phones_space = IndexSequenceSpace(
                max_labels=self.num_phones,
                dim=1,
                dtype=str(self.phones_sequences[0].dtype))
            phones_source = 'phones'

            phonemes_space = IndexSequenceSpace(
                max_labels=self.num_phonemes,
                dim=1,
                dtype=str(self.phonemes_sequences[0].dtype))
            phonemes_source = 'phonemes'

            words_space = IndexSequenceSpace(
                max_labels=self.num_words,
                dim=1,
                dtype=str(self.words_sequences[0].dtype))
            words_source = 'words'

            space_components.extend(
                [phones_space, phonemes_space, words_space])
            source_components.extend(
                [phones_source, phonemes_source, words_source])
            batch_components.extend([None, None, None])

        space = CompositeSpace(space_components)
        source = tuple(source_components)
        self.data_specs = (space, source)
        self.batch_buffers = batch_components

        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('shuffled_sequential')
        self._iter_data_specs = (CompositeSpace(
            (features_space, targets_space)), (features_source,
                                               targets_source))
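In this sequence variant the target stream is simply the input shifted by one sample: `segment_axis(x, frame_length, frame_length - 1)[:-1]` enumerates every length-`frame_length` window with hop 1, and `x[frame_length:]` supplies exactly one next-sample target per window. A plain-numpy sanity check of that alignment, using a strided sliding window as a stand-in for `segment_axis`:

    import numpy as np
    from numpy.lib.stride_tricks import as_strided

    x = np.arange(8.0)           # stand-in waveform
    L = 3                        # frame_length
    windows = as_strided(x, shape=(len(x) - L, L),
                         strides=(x.itemsize, x.itemsize))
    targets = x[L:].reshape(-1, 1)
    print(windows[0])  # [ 0.  1.  2.]
    print(targets[0])  # [ 3.]
    print(windows[4])  # [ 4.  5.  6.]
    print(targets[4])  # [ 7.]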
Beispiel #57
0
    def __init__(self,
                 chromosomes="ALL",
                 dataset_name="snp",
                 read_only=False, balance_classes=False,
                 start=None, stop=None, shuffle=False,
                 add_noise=False, rng=_default_seed, flip_labels=False):
        print "Loading %r chromosomes for %s" % (chromosomes, dataset_name)
        if start is not None and stop is not None:
            print "Start: %d, stop: %d" % (start, stop)
        assert isinstance(chromosomes, int) or chromosomes == "ALL",\
            "Can only set chromosomes to be an integer or ALL"

        p = serial.preprocess("${PYLEARN2_NI_PATH}/" + dataset_name)
        data_files = glob(path.join(p, "chr*.npy"))
        label_file = path.join(p, "labels.npy")

        get_int = lambda y: int(''.join(x for x in y if x.isdigit()))
        data_files.sort(key=get_int)

        if (chromosomes == "ALL" or chromosomes > len(data_files)):
            chromosomes = len(data_files)

        self.y = np.atleast_2d(np.load(label_file)).T[start:stop]
        if flip_labels:
            self.y = (self.y + 1) % 2
        self.Xs = ()
        space = ()
        source = ()

        balanced_idx = None
        if balance_classes:
            num_classes = np.amax(self.y) + 1
            class_counts = [len(np.where(self.y == i)[0].tolist())
                            for i in range(num_classes)]
            min_count = min(class_counts)
            balanced_idx = []
            for i in range(num_classes):
                idx = np.where(self.y == i)[0].tolist()[:min_count]
                balanced_idx += idx
            balanced_idx.sort()
            assert len(balanced_idx) / min_count == num_classes
            assert len(balanced_idx) % min_count == 0

            self.y = self.y[balanced_idx]
            for i in range(num_classes):
                assert len(np.where(self.y == i)[0].tolist()) == min_count

        if read_only:
            print "Format is read-only for %s" % which_set
            h5_path = path.join(p, "gen." + which_set + ".h5")

            if not path.isfile(h5_path):
                self.make_h5(data_files,
                             h5_path,
                             start=start,
                             stop=stop)

            h5file = tables.openFile(h5_path)
            datas = [h5file.getNode("/", "Chr%d" % (c + 1)) for c in range(chromosomes)]
            self.Xs = tuple(data.X for data in datas)
            sizes = [h5file.getNode("/", "Sizes")[c] for c in range(chromosomes)]

        else:
            print "Format is on-memory for %s" % dataset_name
            sizes = []
            for c in range(0, chromosomes):
                X = np.load(data_files[c])[start:stop, :]

                assert "%d" % (c+1) in data_files[c]

                if balanced_idx is not None:
                    X = X[balanced_idx]

                assert X.shape[0] == self.y.shape[0],\
                    "Data and labels have different number of samples (%d vs %d)" %\
                    (X.shape[0], self.y.shape[0])

                self.Xs = self.Xs + (X / 2.0,)
                sizes.append(X.shape[1])

        print "%s samples are %d" % (dataset_name, self.y.shape[0])

        space = tuple(VectorSpace(dim=size) for size in sizes)
        source = tuple("chromosomes_%d" % (c + 1) for c in range(chromosomes))

        self.X_space = CompositeSpace(space)
        self.X_source = source

        space = space + (IndexSpace(dim=1, max_labels=2),)
        source = source + ("targets",)
        space = CompositeSpace(space)

        self.data_specs = (space, source)
        self.rng = make_np_rng(rng, which_method="random_integers")
        assert self.rng is not None

        # Defaults for iterators
        self._iter_mode = resolve_iterator_class("shuffled_sequential")
        self._iter_topo = False
        self._iter_targets = False
        self._iter_data_specs = self.data_specs

        if add_noise:
            if add_noise is True:
                add_noise = 0.05
            self.convert = [randomize_snps.RandomizeSNPs(
                input_space=x_space, corruption_prob=add_noise)
                for x_space in self.X_space.components] + [None]
        else:
            self.convert = None
Beispiel #58
0
    def iterator(self, mode=None, batch_size=None, num_batches=None,
                 topo=None, targets=None, rng=None, data_specs=None,
                 return_tuple=False):

        if topo is not None or targets is not None:
            if data_specs is not None:
                raise ValueError('In DenseDesignMatrix.iterator, both the '
                                 '"data_specs" argument and deprecated '
                                 'arguments "topo" or "targets" were '
                                 'provided.',
                                 (data_specs, topo, targets))

            warnings.warn("Usage of `topo` and `target` arguments are being "
                          "deprecated, and will be removed around November "
                          "7th, 2013. `data_specs` should be used instead.",
                          stacklevel=2)

            # build data_specs from topo and targets if needed
            if topo is None:
                topo = getattr(self, '_iter_topo', False)
            if topo:
                # self.iterator is called without a data_specs, and with
                # "topo=True", so we use the default topological space
                # stored in self.X_topo_space
                assert self.X_topo_space is not None
                X_space = self.X_topo_space
            else:
                X_space = self.X_space

            if targets is None:
                targets = getattr(self, '_iter_targets', False)
            if targets:
                assert self.y is not None
                y_space = self.data_specs[0].components[1]
                space = CompositeSpace((X_space, y_space))
                source = ('features', 'targets')
            else:
                space = X_space
                source = 'features'

            data_specs = (space, source)
            convert = None

        else:
            if data_specs is None:
                data_specs = self._iter_data_specs

            # If there is a view_converter, we have to use it to convert
            # the stored data for "features" into one that the iterator
            # can return.
            space, source = data_specs
            if isinstance(space, CompositeSpace):
                sub_spaces = space.components
                sub_sources = source
            else:
                sub_spaces = (space,)
                sub_sources = (source,)

            convert = []
            for sp, src in safe_zip(sub_spaces, sub_sources):
                if src == 'features' and \
                   getattr(self, 'view_converter', None) is not None:
                    conv_fn = (lambda batch, self=self, space=sp:
                               self.view_converter.get_formatted_batch(batch,
                                                                       space))
                else:
                    conv_fn = None

                convert.append(conv_fn)

        # TODO: Refactor
        if mode is None:
            if hasattr(self, '_iter_subset_class'):
                mode = self._iter_subset_class
            else:
                raise ValueError('iteration mode not provided and no default '
                                 'mode set for %s' % str(self))
        else:
            mode = resolve_iterator_class(mode)

        if batch_size is None:
            batch_size = getattr(self, '_iter_batch_size', None)
        if num_batches is None:
            num_batches = getattr(self, '_iter_num_batches', None)
        if rng is None and mode.stochastic:
            rng = self.rng
        return FiniteDatasetIterator(self,
                                     mode(self.X.shape[0],
                                          batch_size,
                                          num_batches,
                                          rng),
                                     data_specs=data_specs,
                                     return_tuple=return_tuple,
                                     convert=convert)
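Both calling conventions handled above can be exercised as follows; this is a hypothetical sketch assuming `ds` is a `DenseDesignMatrix`-style dataset like the ones in these examples, with a topological view available:

    # Deprecated convention: boolean flags, routed through the warning branch.
    it_old = ds.iterator(mode='sequential', batch_size=100,
                         topo=True, targets=True)

    # Current convention: state the spaces and sources explicitly.
    from pylearn2.space import CompositeSpace
    space = CompositeSpace((ds.X_topo_space, ds.data_specs[0].components[1]))
    it_new = ds.iterator(mode='sequential', batch_size=100,
                         data_specs=(space, ('features', 'targets')),
                         return_tuple=True)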