Example #1
def setup_datastream(path, batch_size, sort_batch_count, valid=False):
    # A: raw input frames, B: phoneme rows, C: per-sequence (start, end) ranges into B.
    A = numpy.load(
        os.path.join(path,
                     ('valid_x_raw.npy' if valid else 'train_x_raw.npy')))
    B = numpy.load(
        os.path.join(path, ('valid_phn.npy' if valid else 'train_phn.npy')))
    C = numpy.load(
        os.path.join(
            path,
            ('valid_seq_to_phn.npy' if valid else 'train_seq_to_phn.npy')))

    # D: for each sequence, the phoneme ids (column 2 of B) within its range.
    D = [B[x[0]:x[1], 2] for x in C]

    ds = IndexableDataset({'input': A, 'output': D})
    stream = DataStream(ds, iteration_scheme=ShuffledExampleScheme(len(A)))

    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size *
                                                   sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('input'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size,
                                                   num_examples=len(A)))
    stream = Padding(stream, mask_sources=['input', 'output'])

    return ds, stream
Example #2
    def train(self, req_vars):
        valid = TaxiDataset(self.config.valid_set,
                            'valid.hdf5',
                            sources=('trip_id', ))
        valid_trips_ids = valid.get_data(None, slice(0, valid.num_examples))[0]

        stream = TaxiDataset('train')

        if hasattr(
                self.config,
                'use_cuts_for_training') and self.config.use_cuts_for_training:
            stream = DataStream(stream, iteration_scheme=TaxiTimeCutScheme())
        else:
            stream = DataStream(stream,
                                iteration_scheme=ShuffledExampleScheme(
                                    stream.num_examples))

        stream = transformers.TaxiExcludeTrips(stream, valid_trips_ids)
        stream = transformers.TaxiGenerateSplits(
            stream, max_splits=self.config.max_splits)

        stream = transformers.taxi_add_datetime(stream)
        # stream = transformers.taxi_add_first_last_len(stream, self.config.n_begin_end_pts)
        stream = transformers.Select(stream, tuple(req_vars))

        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(self.config.batch_size))

        stream = MultiProcessing(stream)

        return stream
Example #3
    def train(self, req_vars):
        prefix_stream = DataStream(self.train_dataset,
                                   iteration_scheme=ShuffledExampleScheme(
                                       self.train_dataset.num_examples))

        if not data.tvt:
            prefix_stream = transformers.TaxiExcludeTrips(
                prefix_stream, self.valid_trips_ids)
        prefix_stream = transformers.TaxiExcludeEmptyTrips(prefix_stream)
        prefix_stream = transformers.TaxiGenerateSplits(
            prefix_stream, max_splits=self.config.max_splits)
        prefix_stream = transformers.taxi_add_datetime(prefix_stream)
        prefix_stream = transformers.taxi_add_first_last_len(
            prefix_stream, self.config.n_begin_end_pts)
        prefix_stream = Batch(prefix_stream,
                              iteration_scheme=ConstantScheme(
                                  self.config.batch_size))

        candidate_stream = self.candidate_stream(
            self.config.train_candidate_size)

        sources = prefix_stream.sources + tuple(
            'candidate_%s' % k for k in candidate_stream.sources)
        stream = Merge((prefix_stream, candidate_stream), sources)
        stream = transformers.Select(stream, tuple(req_vars))
        stream = MultiProcessing(stream)
        return stream
Example #4
    def candidate_stream(self, n_candidates, sortmap=True):
        candidate_stream = DataStream(self.train_dataset,
                                      iteration_scheme=ShuffledExampleScheme(
                                          self.train_dataset.num_examples))
        if not data.tvt:
            candidate_stream = transformers.TaxiExcludeTrips(
                candidate_stream, self.valid_trips_ids)
        candidate_stream = transformers.TaxiExcludeEmptyTrips(candidate_stream)
        candidate_stream = transformers.taxi_add_datetime(candidate_stream)

        if not data.tvt:
            candidate_stream = transformers.add_destination(candidate_stream)

        if sortmap:
            candidate_stream = transformers.balanced_batch(
                candidate_stream,
                key='latitude',
                batch_size=n_candidates,
                batch_sort_size=self.config.batch_sort_size)
        else:
            candidate_stream = Batch(
                candidate_stream,
                iteration_scheme=ConstantScheme(n_candidates))

        candidate_stream = Padding(candidate_stream,
                                   mask_sources=['latitude', 'longitude'])

        return candidate_stream
Example #5
    def _construct_shuffled_stream(self, dataset, for_type='train'):
        '''Construct a shuffled stream from an IndexableDataset object.

        Subclasses should add transformations on the stream, e.g.,
                1. Sort samples by size
                2. Batch the dataset
                3. Add masks on samples
        :param dataset: fuel.datasets.IndexableDataset
                This is constructed by the self._construct_dataset method.
        :return: fuel.streams.DataStream
                A shuffled fuel stream (ShuffledExampleScheme) with basic
                transformations applied.
        '''
        it = ShuffledExampleScheme(dataset.num_examples)
        stream = DataStream(dataset, iteration_scheme=it)
        # Sort samples by size and compact samples with similar size into a batch.
        # stream = Batch(stream, iteration_scheme=ConstantScheme(self.batch_size * self.sort_batch_count))
        # comparison = _balanced_batch_helper(stream.sources.index(self.compare_source))
        # stream = Mapping(stream, SortMapping(comparison))
        # stream = Unpack(stream)
        # stream = Batch(stream, iteration_scheme=ConstantScheme(self.batch_size))
        # # Add mask on inputs
        # for source in self.need_mask_sources.iteritems():
        #     stream = Padding(stream, mask_sources=[source[0]], mask_dtype=source[1])
        return stream
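The commented-out block above is the transformation pipeline the docstring asks subclasses to add. A minimal sketch of that pipeline with the comments enabled, assuming the subclass defines batch_size, sort_batch_count, compare_source and need_mask_sources as the commented code suggests (the method name below is hypothetical):

    def _construct_sorted_batched_stream(self, dataset):
        # Hypothetical subclass method: shuffle, sort read-ahead batches by
        # size, re-batch, and add masks, as sketched in the comments above.
        stream = DataStream(
            dataset,
            iteration_scheme=ShuffledExampleScheme(dataset.num_examples))
        # Read several batches at once so that similarly sized samples are grouped.
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(self.batch_size *
                                                       self.sort_batch_count))
        comparison = _balanced_batch_helper(
            stream.sources.index(self.compare_source))
        stream = Mapping(stream, SortMapping(comparison))
        stream = Unpack(stream)
        stream = Batch(stream, iteration_scheme=ConstantScheme(self.batch_size))
        # Add a mask for every source that needs one.
        for source, dtype in self.need_mask_sources.items():
            stream = Padding(stream, mask_sources=[source], mask_dtype=dtype)
        return stream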
Example #6
def setup_squad_ranker_datastream(path,
                                  vocab_file,
                                  config,
                                  example_count=1836975):
    ds = SQuADRankerDataset(path, vocab_file)
    it = ShuffledExampleScheme(examples=example_count)
    stream = DataStream(ds, iteration_scheme=it)

    # Sort sets of multiple batches to make batches of similar sizes
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('question'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=[
                         'question', 'answer', 'better', 'worse', 'b_left',
                         'b_right', 'w_left', 'w_right'
                     ],
                     mask_dtype='int32')

    return ds, stream
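Examples #1, #6 and #14 construct their sort key with a _balanced_batch_helper that is not reproduced on this page. A plausible sketch, assuming it follows the usual SortMapping convention of returning the length of the source at the given index (the real helper lives in each example's repository):

# Hypothetical sketch of the _balanced_batch_helper key used with SortMapping.
class _balanced_batch_helper(object):
    def __init__(self, key):
        self.key = key

    def __call__(self, data):
        # Sort read-ahead examples by the length of the source at index `key`.
        return len(data[self.key])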
Example #7
    def train(self, req_vars):
        prefix_stream = DataStream(self.train_dataset,
                                   iteration_scheme=ShuffledExampleScheme(
                                       self.train_dataset.num_examples))

        if not data.tvt:
            prefix_stream = transformers.TaxiExcludeTrips(
                prefix_stream, self.valid_trips_ids)
        prefix_stream = transformers.TaxiExcludeEmptyTrips(prefix_stream)
        prefix_stream = transformers.TaxiGenerateSplits(
            prefix_stream, max_splits=self.config.max_splits)

        prefix_stream = transformers.taxi_add_datetime(prefix_stream)

        prefix_stream = transformers.balanced_batch(
            prefix_stream,
            key='latitude',
            batch_size=self.config.batch_size,
            batch_sort_size=self.config.batch_sort_size)

        prefix_stream = Padding(prefix_stream,
                                mask_sources=['latitude', 'longitude'])

        candidate_stream = self.candidate_stream(
            self.config.train_candidate_size)

        sources = prefix_stream.sources + tuple(
            'candidate_%s' % k for k in candidate_stream.sources)
        stream = Merge((prefix_stream, candidate_stream), sources)

        stream = transformers.Select(stream, tuple(req_vars))
        # stream = MultiProcessing(stream)
        return stream
Example #8
    def get_stream(self, part, batch_size, seed=None, raw_text=False):
        d = self.get_dataset(part)
        print("Dataset with {} examples".format(d.num_examples))
        it = ShuffledExampleScheme(d.num_examples,
                                   rng=numpy.random.RandomState(seed))
        stream = DataStream(d, iteration_scheme=it)
        stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

        if self._retrieval:
            stream = FixedMapping(
                stream,
                functools.partial(retrieve_and_pad_snli, self._retrieval),
                add_sources=("defs", "def_mask", "sentence1_def_map",
                             "sentence2_def_map")
            )  # Needed because of a Fuel bug: it cannot concatenate a tuple and a list

        if not raw_text:
            stream = SourcewiseMapping(stream,
                                       functools.partial(digitize, self.vocab),
                                       which_sources=('sentence1',
                                                      'sentence2'))

        stream = Padding(
            stream,
            mask_sources=('sentence1',
                          'sentence2'))  # Doubles the outputs: a mask source is added per padded source

        return stream
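The final Padding call is why the comment says the number of outputs doubles: every padded source gains a companion '<name>_mask' source. A self-contained toy sketch of that behaviour, independent of the SNLI datasets used above:

# Toy illustration only; the dataset below is made up.
from fuel.datasets import IndexableDataset
from fuel.schemes import ConstantScheme, ShuffledExampleScheme
from fuel.streams import DataStream
from fuel.transformers import Batch, Padding

toy = IndexableDataset({'sentence1': [[1, 2], [3]], 'sentence2': [[4], [5, 6, 7]]})
s = DataStream(toy, iteration_scheme=ShuffledExampleScheme(toy.num_examples))
s = Batch(s, iteration_scheme=ConstantScheme(2))
s = Padding(s, mask_sources=('sentence1', 'sentence2'))
print(s.sources)  # each padded source now has a matching '<name>_mask' entry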
Example #9
    def train(self, req_vars):
        valid = TaxiDataset(self.config.valid_set,
                            'valid.hdf5',
                            sources=('trip_id', ))
        valid_trips_ids = valid.get_data(None, slice(0, valid.num_examples))[0]

        stream = TaxiDataset('train')
        stream = DataStream(stream,
                            iteration_scheme=ShuffledExampleScheme(
                                stream.num_examples))
        stream = transformers.TaxiExcludeTrips(stream, valid_trips_ids)
        stream = transformers.TaxiExcludeEmptyTrips(stream)
        stream = transformers.taxi_add_datetime(stream)
        stream = transformers.add_destination(stream)
        stream = transformers.Select(
            stream, tuple(v for v in req_vars if not v.endswith('_mask')))

        stream = transformers.balanced_batch(
            stream,
            key='latitude',
            batch_size=self.config.batch_size,
            batch_sort_size=self.config.batch_sort_size)
        stream = Padding(stream, mask_sources=['latitude', 'longitude'])
        stream = transformers.Select(stream, req_vars)
        return stream
Example #10
def _get_shuffled_text_stream(src_data,
                              trg_data,
                              src_vocab_size=30000,
                              trg_vocab_size=30000,
                              src_sparse_feat_map='',
                              trg_sparse_feat_map='',
                              **kwargs):
    """Creates a parallel data stream using ``ParallelTextFile``. This
    data set implementation allows random access, so we return a 
    shuffled data stream using the ``ShuffledExampleScheme`` iteration 
    scheme.
    
    The arguments to this method are given by the configuration dict.
    """

    parallel_dataset = ParallelTextFile(
        src_data,
        trg_data,
        src_vocab_size,
        trg_vocab_size,
        src_sparse_feat_map=src_sparse_feat_map,
        trg_sparse_feat_map=trg_sparse_feat_map)
    #iter_scheme = SequentialExampleScheme(parallel_dataset.num_examples)
    iter_scheme = ShuffledExampleScheme(parallel_dataset.num_examples)
    return DataStream(parallel_dataset, iteration_scheme=iter_scheme)
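The docstring notes that the arguments come from the configuration dict. A hypothetical call site under that assumption (the keyword names mirror the signature; the file paths are placeholders):

# Hypothetical usage; paths are placeholders, not from the original config.
config = dict(src_data='train.src', trg_data='train.trg',
              src_vocab_size=30000, trg_vocab_size=30000)
train_stream = _get_shuffled_text_stream(**config)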
Example #11
    def get_stream(self, part, batches=True, shuffle=True, add_sources=(),
                   num_examples=None, rng=None, seed=None):

        dataset = self.get_dataset(part, add_sources=add_sources)
        if num_examples is None:
            num_examples = dataset.num_examples

        if shuffle:
            iteration_scheme = ShuffledExampleScheme(num_examples, rng=rng)
        else:
            iteration_scheme = SequentialExampleScheme(num_examples)

        stream = DataStream(
            dataset, iteration_scheme=iteration_scheme)

        stream = FilterSources(stream, (self.recordings_source,
                                        self.labels_source)+tuple(add_sources))
        if self.add_eos:
            stream = Mapping(stream, _AddLabel(self.eos_label))
        if self.add_bos:
            stream = Mapping(stream, _AddLabel(self.bos_label, append=False,
                                               times=self.add_bos))
        if self.preprocess_text:
            stream = Mapping(stream, lvsr.datasets.wsj.preprocess_text)
        stream = Filter(stream, self.length_filter)
        if self.sort_k_batches and batches:
            stream = Batch(stream,
                           iteration_scheme=ConstantScheme(
                               self.batch_size * self.sort_k_batches))
            stream = Mapping(stream, SortMapping(_length))
            stream = Unpack(stream)

        if self.preprocess_features == 'log_spectrogram':
            stream = Mapping(
                stream, functools.partial(apply_preprocessing,
                                          log_spectrogram))
        if self.normalization:
            stream = self.normalization.wrap_stream(stream)
        stream = ForceFloatX(stream)
        if not batches:
            return stream

        stream = Batch(
            stream,
            iteration_scheme=ConstantScheme(self.batch_size if part == 'train'
                                            else self.validation_batch_size))
        stream = Padding(stream)
        stream = Mapping(stream, switch_first_two_axes)
        stream = ForceCContiguous(stream)
        return stream
Example #12
def get_sgnmt_shuffled_tr_stream(src_data,
                                 trg_data,
                                 src_vocab_size=30000,
                                 trg_vocab_size=30000,
                                 unk_id=1,
                                 seq_len=50,
                                 batch_size=80,
                                 sort_k_batches=12,
                                 **kwargs):
    """Prepares the shuffled training data stream. This is similar to 
    ``get_sgnmt_tr_stream`` but uses ``ParallelTextFile`` in combination
    with ``ShuffledExampleScheme`` to support reshuffling."""

    # Build dummy vocabulary to make TextFile happy
    src_vocab = add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    parallel_dataset = ParallelTextFile(src_data, trg_data, src_vocab,
                                        trg_vocab, None)
    #iter_scheme = SequentialExampleScheme(parallel_dataset.num_examples)
    iter_scheme = ShuffledExampleScheme(parallel_dataset.num_examples)
    s = DataStream(parallel_dataset, iteration_scheme=iter_scheme)

    # Filter sequences that are too long
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    s = Mapping(
        s,
        stream._oov_to_unk(src_vocab_size=src_vocab_size,
                           trg_vocab_size=trg_vocab_size,
                           unk_id=utils.UNK_ID))

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
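The length filter above relies on stream._too_long, which is not shown here. A plausible sketch of such a predicate, assuming it rejects sentence pairs in which either side exceeds seq_len (the real one lives in the SGNMT stream module):

# Hypothetical sketch of the _too_long predicate used with Filter above.
def _too_long(seq_len=50):
    return lambda sentence_pair: all(len(sentence) <= seq_len
                                     for sentence in sentence_pair)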
Example #13
    def train(self, req_vars):
        stream = TaxiDataset('train', data.traintest_ds)

        if hasattr(
                self.config,
                'use_cuts_for_training') and self.config.use_cuts_for_training:
            stream = DataStream(stream, iteration_scheme=TaxiTimeCutScheme())
        else:
            stream = DataStream(stream,
                                iteration_scheme=ShuffledExampleScheme(
                                    stream.num_examples))

        if not data.tvt:
            valid = TaxiDataset(data.valid_set,
                                data.valid_ds,
                                sources=('trip_id', ))
            valid_trips_ids = valid.get_data(None,
                                             slice(0, valid.num_examples))[0]
            stream = transformers.TaxiExcludeTrips(stream, valid_trips_ids)

        if hasattr(self.config, 'max_splits'):
            stream = transformers.TaxiGenerateSplits(
                stream, max_splits=self.config.max_splits)
        elif not data.tvt:
            stream = transformers.add_destination(stream)

        if hasattr(self.config, 'train_max_len'):
            idx = stream.sources.index('latitude')

            def max_len_filter(x):
                return len(x[idx]) <= self.config.train_max_len

            stream = Filter(stream, max_len_filter)

        stream = transformers.TaxiExcludeEmptyTrips(stream)
        stream = transformers.taxi_add_datetime(stream)
        stream = transformers.Select(
            stream, tuple(v for v in req_vars if not v.endswith('_mask')))

        stream = transformers.balanced_batch(
            stream,
            key='latitude',
            batch_size=self.config.batch_size,
            batch_sort_size=self.config.batch_sort_size)
        stream = Padding(stream, mask_sources=['latitude', 'longitude'])
        stream = transformers.Select(stream, req_vars)
        stream = MultiProcessing(stream)

        return stream
Example #14
def setup_sorter_datastream(path, config):
    ds = SorterDataset(path)
    it = ShuffledExampleScheme(examples=config.example_count)
    stream = DataStream(ds, iteration_scheme=it)
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('unsorted'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)
    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=['answer', 'unsorted'],
                     mask_dtype='int32')
    return ds, stream
Example #15
    def candidate_stream(self, n_candidates):
        candidate_stream = DataStream(self.train_dataset,
                                      iteration_scheme=ShuffledExampleScheme(
                                          self.train_dataset.num_examples))
        if not data.tvt:
            candidate_stream = transformers.TaxiExcludeTrips(
                candidate_stream, self.valid_trips_ids)
        candidate_stream = transformers.TaxiExcludeEmptyTrips(candidate_stream)
        candidate_stream = transformers.taxi_add_datetime(candidate_stream)
        candidate_stream = transformers.taxi_add_first_last_len(
            candidate_stream, self.config.n_begin_end_pts)
        if not data.tvt:
            candidate_stream = transformers.add_destination(candidate_stream)

        return Batch(candidate_stream,
                     iteration_scheme=ConstantScheme(n_candidates))
Example #16
def stream_handwriting(
        which_sets,
        batch_size,
        seq_size,
        num_letters,
        sorting_mult=20):

    assert sorting_mult > 0

    dataset = Handwriting(which_sets)
    sorting_size = batch_size * sorting_mult
    num_examples = sorting_size * (dataset.num_examples / sorting_size)

    if which_sets == ('train',):
        print "Random order."
        scheme = ShuffledExampleScheme(num_examples)
    else:
        print "Sequential order."
        scheme = SequentialExampleScheme(num_examples)

    data_stream = DataStream.default_stream(dataset, iteration_scheme=scheme)

    # Sort by length of the data sequence.
    data_stream = Batch(
        data_stream, iteration_scheme=ConstantScheme(sorting_size))
    data_stream = Mapping(data_stream, SortMapping(_length))
    data_stream = Unpack(data_stream)
    data_stream = Batch(
        data_stream, iteration_scheme=ConstantScheme(batch_size))

    data_stream = Padding(data_stream)
    data_stream = SourceMapping(
        data_stream, _transpose, which_sources=('features', 'features_mask'))
    data_stream = SegmentSequence(
        data_stream,
        seq_size=seq_size + 1,
        share_value=True,
        return_last=True,
        which_sources=('features', 'features_mask'),
        add_flag=True)
    return data_stream
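Several examples (#16, #26, #28) sort the read-ahead super-batch with SortMapping(_length). A plausible sketch of that key, assuming the sequence to sort by is the first source of each example (each repository defines its own version):

# Hypothetical sketch of the _length key passed to SortMapping.
def _length(example):
    return len(example[0])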
Example #17
    def train(self, req_vars):
        valid = TaxiDataset(self.config.valid_set,
                            'valid.hdf5',
                            sources=('trip_id', ))
        valid_trips_ids = valid.get_data(None, slice(0, valid.num_examples))[0]

        dataset = TaxiDataset('train')

        prefix_stream = DataStream(dataset,
                                   iteration_scheme=TaxiTimeCutScheme(
                                       self.config.num_cuts))
        prefix_stream = transformers.TaxiExcludeTrips(prefix_stream,
                                                      valid_trips_ids)
        prefix_stream = transformers.TaxiGenerateSplits(
            prefix_stream, max_splits=self.config.max_splits)
        prefix_stream = transformers.taxi_add_datetime(prefix_stream)
        prefix_stream = transformers.taxi_add_first_last_len(
            prefix_stream, self.config.n_begin_end_pts)
        prefix_stream = Batch(prefix_stream,
                              iteration_scheme=ConstantScheme(
                                  self.config.batch_size))

        candidate_stream = DataStream(dataset,
                                      iteration_scheme=ShuffledExampleScheme(
                                          dataset.num_examples))
        candidate_stream = transformers.TaxiExcludeTrips(
            candidate_stream, valid_trips_ids)
        candidate_stream = transformers.TaxiExcludeEmptyTrips(candidate_stream)
        candidate_stream = transformers.taxi_add_datetime(candidate_stream)
        candidate_stream = transformers.taxi_add_first_last_len(
            candidate_stream, self.config.n_begin_end_pts)
        candidate_stream = Batch(candidate_stream,
                                 iteration_scheme=ConstantScheme(
                                     self.config.train_candidate_size))

        sources = prefix_stream.sources + tuple(
            'candidate_%s' % k for k in candidate_stream.sources)
        stream = Merge((prefix_stream, candidate_stream), sources)
        stream = transformers.Select(stream, tuple(req_vars))
        stream = MultiProcessing(stream)
        return stream
Example #18
    def get_stream(self, part, batches=True, shuffle=True,
                   add_sources=()):
        dataset = self.get_dataset(part, add_sources=add_sources)
        stream = (DataStream(dataset,
                             iteration_scheme=ShuffledExampleScheme(dataset.num_examples))
                  if shuffle
                  else dataset.get_example_stream())

        stream = FilterSources(stream, (self.recordings_source,
                                        self.labels_source)+tuple(add_sources))
        if self.add_eos:
            if self.prepend_eos:
                stream = Mapping(stream, _AddEosLabelBeginEnd(self.eos_label))
            else:
                stream = Mapping(stream, _AddEosLabelEnd(self.eos_label))
        if self.preprocess_text:
            stream = Mapping(stream, lvsr.datasets.wsj.preprocess_text)
        stream = Filter(stream, self.length_filter)
        if self.sort_k_batches and batches:
            stream = Batch(stream,
                           iteration_scheme=ConstantScheme(
                               self.batch_size * self.sort_k_batches))
            stream = Mapping(stream, SortMapping(_length))
            stream = Unpack(stream)

        if self.preprocess_features == 'log_spectrogram':
            stream = Mapping(
                stream, functools.partial(apply_preprocessing,
                                          log_spectrogram))
        if self.normalization:
            stream = self.normalization.wrap_stream(stream)
        stream = ForceFloatX(stream)
        if not batches:
            return stream

        stream = Batch(stream, iteration_scheme=ConstantScheme(self.batch_size))
        stream = Padding(stream)
        stream = Mapping(stream, switch_first_two_axes)
        stream = ForceCContiguous(stream)
        return stream
Example #19
    def valid(self, req_vars):
        valid_dataset = TaxiDataset(self.config.valid_set, 'valid.hdf5')
        train_dataset = TaxiDataset('train')
        valid_trips_ids = valid_dataset.get_data(
            None, slice(0, valid_dataset.num_examples))[
                valid_dataset.sources.index('trip_id')]

        prefix_stream = DataStream(valid_dataset,
                                   iteration_scheme=SequentialExampleScheme(
                                       valid_dataset.num_examples))
        prefix_stream = transformers.taxi_add_datetime(prefix_stream)
        prefix_stream = transformers.taxi_add_first_last_len(
            prefix_stream, self.config.n_begin_end_pts)
        prefix_stream = Batch(prefix_stream,
                              iteration_scheme=ConstantScheme(
                                  self.config.batch_size))

        candidate_stream = DataStream(train_dataset,
                                      iteration_scheme=ShuffledExampleScheme(
                                          train_dataset.num_examples))
        candidate_stream = transformers.TaxiExcludeTrips(
            candidate_stream, valid_trips_ids)
        candidate_stream = transformers.TaxiExcludeEmptyTrips(candidate_stream)
        candidate_stream = transformers.taxi_add_datetime(candidate_stream)
        candidate_stream = transformers.taxi_add_first_last_len(
            candidate_stream, self.config.n_begin_end_pts)
        candidate_stream = Batch(candidate_stream,
                                 iteration_scheme=ConstantScheme(
                                     self.config.valid_candidate_size))

        sources = prefix_stream.sources + tuple(
            'candidate_%s' % k for k in candidate_stream.sources)
        stream = Merge((prefix_stream, candidate_stream), sources)
        stream = transformers.Select(stream, tuple(req_vars))
        stream = MultiProcessing(stream)
        return stream
Example #20
def test_shuffled_example_scheme_requests_examples():
    assert ShuffledExampleScheme(3).requests_examples
Example #21
def test_shuffled_example_scheme_no_rng():
    scheme = ShuffledExampleScheme(7)
    assert scheme.rng is not None
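Along the same lines, one more small check (assuming the standard fuel.schemes API and that numpy is imported as in the other examples) that the scheme yields every index exactly once:

def test_shuffled_example_scheme_yields_permutation():
    scheme = ShuffledExampleScheme(5, rng=numpy.random.RandomState(0))
    # The shuffled request iterator is a permutation of range(num_examples).
    assert sorted(scheme.get_request_iterator()) == list(range(5))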
Example #22
    def get_stream(self,
                   part,
                   batches=True,
                   shuffle=True,
                   add_sources=(),
                   num_examples=None,
                   rng=None,
                   seed=None):
        dataset = self.get_dataset(part, add_sources=add_sources)
        iteration_scheme = None
        if self.use_iteration_scheme:
            if num_examples is None:
                num_examples = dataset.num_examples
            if shuffle:
                iteration_scheme = ShuffledExampleScheme(num_examples, rng=rng)
            else:
                iteration_scheme = SequentialExampleScheme(num_examples)
        stream = DataStream(dataset, iteration_scheme=iteration_scheme)

        # Transformations before rearrangement
        labels_source = self.sources_map['labels']
        if self.add_eos:
            stream = _AddLabel(stream,
                               self.eos_label,
                               which_sources=[labels_source])
        if self.add_bos:
            if self.bos_label is None:
                raise Exception('No bos label given')
            stream = _AddLabel(stream,
                               self.bos_label,
                               append=False,
                               times=self.add_bos,
                               which_sources=[labels_source])
        if self.clip_length:
            stream = _Clip(stream,
                           self.clip_length,
                           force_eos=self.eos_label
                           if self.force_eos_when_clipping else None,
                           which_sources=[labels_source])

        # More efficient packing of examples in batches
        if self.sort_k_batches and batches:
            stream = Batch(stream,
                           iteration_scheme=ConstantScheme(
                               self.batch_size * self.sort_k_batches))
            stream = Mapping(stream, SortMapping(_Length(index=0)))
            stream = Unpack(stream)

        stream = Rearrange(
            stream,
            dict_subset(self.sources_map,
                        self.default_sources + list(add_sources)))

        # Tranformations after rearrangement
        if self.corrupt_sources:
            # Can only corrupt sources with the same alphabet
            # as labels
            for source, prob in zip(self.corrupt_sources['names'],
                                    self.corrupt_sources['probs']):
                stream = _Corrupt(stream,
                                  prob,
                                  self.token_map(source),
                                  self.eos_label,
                                  which_sources=[source])
        if self.max_length and part == 'train':
            # Filtering by the maximum length is only done
            # for the training set.
            self.length_filter = _LengthFilter(indices=[
                i for i, source in enumerate(stream.sources)
                if source in self.filter_by
            ],
                                               max_length=self.max_length)
            stream = Filter(stream, self.length_filter)
        stream = ForceFloatX(stream)

        if not batches:
            return stream

        stream = Batch(
            stream,
            iteration_scheme=ConstantScheme(self.batch_size if part == 'train'
                                            else self.validation_batch_size))
        stream = Padding(stream)
        stream = Mapping(stream, switch_first_two_axes)
        stream = ForceCContiguous(stream)
        return stream
Example #23
 def get_stream(self,
                part,
                batch_size=None,
                shuffle=False,
                max_length=None,
                raw_text=False,
                q_ids=False,
                seed=None,
                dataset=None):
     if not seed:
         seed = fuel.config.default_seed
     rng = numpy.random.RandomState(seed)
     if not dataset:
         dataset = self.get_dataset(part)
     if shuffle:
         stream = DataStream(dataset,
                             iteration_scheme=ShuffledExampleScheme(
                                 dataset.num_examples, rng=rng))
     else:
         stream = dataset.get_example_stream()
     if not q_ids:
         stream = FilterSources(
             stream,
             [source for source in dataset.sources if source != 'q_ids'])
     else:
         stream = SourcewiseMapping(stream,
                                    _str2vec,
                                    which_sources=('q_ids',))
     stream = PutTextTransfomer(stream, dataset, raw_text=True)
     # <eos> is added for two purposes: to serve a sentinel for coattention,
     # and also to ensure the answer span ends at a token
     eos = self.vocab.EOS
     stream = SourcewiseMapping(stream,
                                functools.partial(add_eos, eos),
                                which_sources=('contexts', 'questions'))
     stream = Mapping(stream,
                      functools.partial(select_random_answer, rng),
                      mapping_accepts=dict)
     if not batch_size:
         if self._retrieval:
             raise NotImplementedError()
         return stream
     if raw_text:
         stream = Mapping(stream,
                          keep_text,
                          mapping_accepts=dict,
                          add_sources=('contexts_text', 'questions_text'))
     stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
     if self._retrieval:
         stream = Mapping(stream,
                          functools.partial(retrieve_and_pad_squad,
                                            self._retrieval),
                          mapping_accepts=dict,
                          add_sources=('defs', 'def_mask',
                                       'contexts_def_map',
                                       'questions_def_map'))
     stream = SourcewiseMapping(stream,
                                functools.partial(digitize, self.vocab),
                                which_sources=('contexts', 'questions'))
     stream = Padding(stream,
                      mask_sources=['contexts', 'questions'] +
                      (['contexts_text'] if raw_text else []))
     return stream
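The comment above explains why an <eos> token is appended to the contexts and questions. A plausible sketch of the add_eos helper bound with functools.partial and applied by SourcewiseMapping (the real helper is defined in the example's repository):

# Hypothetical sketch: called on one source's token sequence at a time,
# with the eos symbol bound via functools.partial.
def add_eos(eos, tokens):
    return list(tokens) + [eos]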
Example #24
    def get_stream(self, part, batches=True, shuffle=True, add_sources=(),
                   num_examples=None, rng=None, seed=None):
        dataset = self.get_dataset(part, add_sources=add_sources)
        if num_examples is None:
            num_examples = dataset.num_examples

        if shuffle:
            iteration_scheme = ShuffledExampleScheme(num_examples, rng=rng)
        else:
            iteration_scheme = SequentialExampleScheme(num_examples)

        stream = DataStream(
            dataset, iteration_scheme=iteration_scheme)

        if self.add_eos:
            stream = Mapping(stream, _AddLabel(
                self.eos_label,
                index=stream.sources.index(self.sources_map['labels'])))
        if self.add_bos:
            if self.bos_label is None:
                raise Exception('No bos label given')
            stream = Mapping(stream, _AddLabel(
                self.bos_label, append=False, times=self.add_bos,
                index=stream.sources.index(self.sources_map['labels'])))

        if self.max_length:
            stream = Filter(stream, self.length_filter)

        if self.sort_k_batches and batches:
            stream = Batch(stream,
                           iteration_scheme=ConstantScheme(
                               self.batch_size * self.sort_k_batches))
            #
            # Hardcode 0 for source on which to sort. This will be good, as
            # most source lengths are correlated and, furthermore, the
            # labels will typically be the last source, thus in a single-input
            # case this sorts on input lengths
            #
            stream = Mapping(stream, SortMapping(_Length(
                index=0)))
            stream = Unpack(stream)

        if self.normalization:
            stream = self.normalization.wrap_stream(stream)
        stream = ForceFloatX(stream)
        stream = Rename(stream,
                        names=dict_subset({v: k for (k, v)
                                           in self.sources_map.items()},
                                          stream.sources,
                                          must_have=False))
        if not batches:
            return stream

        stream = Batch(
            stream,
            iteration_scheme=ConstantScheme(self.batch_size if part == 'train'
                                            else self.validation_batch_size))
        stream = Padding(stream)
        stream = Mapping(stream, switch_first_two_axes)
        stream = ForceCContiguous(stream)
        stream._produces_examples = False
        return stream
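The comment block in the sorting step of Examples #22, #24 and #27 says the examples are sorted by the length of source 0. A plausible sketch of the _Length key it refers to (hypothetical; the real class lives in the examples' repository):

# Hypothetical sketch of the _Length key used with SortMapping above.
class _Length(object):
    def __init__(self, index=0):
        self.index = index

    def __call__(self, example):
        # Sort each read-ahead example by the length of the source at `index`.
        return len(example[self.index])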
Example #25
def main(dataset_path, use_c, log_min, log_max, num_steps):
    train_set = H5PYDataset(
        dataset_path, which_sets=('train',), sources=('features', 'targets'),
        subset=slice(0, 63257), load_in_memory=True)
    train_stream = DataStream.default_stream(
        train_set,
        iteration_scheme=ShuffledExampleScheme(train_set.num_examples))

    def get_class_balanced_batch(iterator):
        train_features = [[] for _ in range(10)]
        train_targets = [[] for _ in range(10)]
        batch_size = 0
        while batch_size < 1000:
            f, t = next(iterator)
            t = t[0]
            if len(train_features[t]) < 100:
                train_features[t].append(f)
                train_targets[t].append(t)
                batch_size += 1
        train_features = numpy.vstack(sum(train_features, []))
        train_targets = numpy.vstack(sum(train_targets, []))
        return train_features, train_targets

    train_features, train_targets = get_class_balanced_batch(
        train_stream.get_epoch_iterator())

    valid_set = H5PYDataset(
        dataset_path, which_sets=('train',), sources=('features', 'targets'),
        subset=slice(63257, 73257), load_in_memory=True)
    valid_features, valid_targets = valid_set.data_sources

    test_set = H5PYDataset(
        dataset_path, which_sets=('test',), sources=('features', 'targets'),
        load_in_memory=True)
    test_features, test_targets = test_set.data_sources

    if use_c is None:
        best_error_rate = 1.0
        best_C = None
        for log_C in numpy.linspace(log_min, log_max, num_steps):
            C = numpy.exp(log_C)
            svm = LinearSVC(C=C)
            svm.fit(train_features, train_targets.ravel())
            error_rate = 1 - numpy.mean(
                [svm.score(valid_features[1000 * i: 1000 * (i + 1)],
                           valid_targets[1000 * i: 1000 * (i + 1)].ravel())
                 for i in range(10)])
            if error_rate < best_error_rate:
                best_error_rate = error_rate
                best_C = C
            print('C = {}, validation error rate = {} '.format(C, error_rate) +
                  '(best is {}, {})'.format(best_C, best_error_rate))
    else:
        best_C = use_c

    error_rates = []
    for _ in range(10):
        train_features, train_targets = get_class_balanced_batch(
            train_stream.get_epoch_iterator())
        svm = LinearSVC(C=best_C)
        svm.fit(train_features, train_targets.ravel())
        error_rates.append(1 - numpy.mean(
            [svm.score(valid_features[1000 * i: 1000 * (i + 1)],
                       valid_targets[1000 * i: 1000 * (i + 1)].ravel())
             for i in range(10)]))

    print('Validation error rate = {} +- {} '.format(numpy.mean(error_rates),
                                                     numpy.std(error_rates)))

    error_rates = []
    for _ in range(100):
        train_features, train_targets = get_class_balanced_batch(
            train_stream.get_epoch_iterator())
        svm = LinearSVC(C=best_C)
        svm.fit(train_features, train_targets.ravel())
        s = 1000 * numpy.sum(
            [svm.score(test_features[1000 * i: 1000 * (i + 1)],
                       test_targets[1000 * i: 1000 * (i + 1)].ravel())
             for i in range(26)])
        s += 32 * svm.score(test_features[-32:], test_targets[-32:].ravel())
        s = s / 26032.0
        error_rates.append(1 - s)

    print('Test error rate = {} +- {} '.format(numpy.mean(error_rates),
                                               numpy.std(error_rates)))
Example #26
def parrot_stream(voice,
                  use_speaker=False,
                  which_sets=('train', ),
                  batch_size=32,
                  seq_size=50,
                  num_examples=None,
                  sorting_mult=4,
                  noise_level=None,
                  labels_type='full_labels',
                  check_ratio=False,
                  raw_data=True,
                  q_type='mu-law',
                  q_level=256):

    assert labels_type in [
        'full_labels', 'phonemes', 'unconditional', 'unaligned_phonemes',
        'text'
    ]

    dataset = VoiceData(voice=voice, which_sets=which_sets)

    sorting_size = batch_size * sorting_mult

    if not num_examples:
        num_examples = dataset.num_examples

    if 'train' in which_sets:
        scheme = ShuffledExampleScheme(num_examples)
    else:
        scheme = SequentialExampleScheme(num_examples)

    data_stream = DataStream.default_stream(dataset, iteration_scheme=scheme)

    if check_ratio and labels_type in ['unaligned_phonemes', 'text']:
        idx = data_stream.sources.index(labels_type)
        min_val = 8 if labels_type == 'text' else 12.
        max_val = 16 if labels_type == 'text' else 25.
        data_stream = Filter(
            data_stream, lambda x: _check_ratio(x, 0, idx, min_val, max_val))

    segment_sources = ('features', 'features_mask')
    all_sources = segment_sources

    if raw_data:
        raw_sources = ('raw_audio', )
        all_sources += raw_sources
    else:
        raw_sources = ()

    if labels_type != 'unconditional':
        all_sources += ('labels', )
        data_stream = Rename(data_stream, {labels_type: 'labels'})

    if labels_type in ['full_labels', 'phonemes']:
        segment_sources += ('labels', )

    elif labels_type in ['unaligned_phonemes', 'text']:
        all_sources += ('labels_mask', )

    data_stream = Batch(data_stream,
                        iteration_scheme=ConstantScheme(sorting_size))
    data_stream = Mapping(data_stream, SortMapping(_length))
    data_stream = Unpack(data_stream)
    data_stream = Batch(data_stream,
                        iteration_scheme=ConstantScheme(batch_size))

    data_stream = Filter(data_stream,
                         lambda x: _check_batch_size(x, batch_size))

    data_stream = Padding(data_stream)

    if use_speaker:
        data_stream = FilterSources(data_stream,
                                    all_sources + ('speaker_index', ))
    else:
        data_stream = FilterSources(data_stream, all_sources)

    data_stream = SourceMapping(data_stream,
                                _transpose,
                                which_sources=segment_sources)

    # The conditional is not necessary, but I'm still adding it for clarity.
    if raw_data:
        data_stream = SourceMapping(data_stream,
                                    _chunk,
                                    which_sources=raw_sources)

        raw_transformer = get_raw_transformer(q_type, q_level)
        data_stream = SourceMapping(data_stream,
                                    raw_transformer,
                                    which_sources=raw_sources)

    data_stream = SegmentSequence(data_stream,
                                  seq_size=seq_size + 1,
                                  share_value=1,
                                  return_last=False,
                                  add_flag=True,
                                  which_sources=segment_sources + raw_sources)

    if noise_level is not None:
        data_stream = AddConstantSource(data_stream, noise_level,
                                        'feedback_noise_level')

    return data_stream
Example #27
    def get_one_stream(self, part, lang=None, batches=True, shuffle=True, add_sources=(),
                   num_examples=None, rng=None, seed=None, num_result=None,
                   soften_distributions=None, only_stream=False):
        assert lang in self.langs
        dataset = self.get_dataset(part, lang, add_sources=add_sources)
        if num_examples is None:
            num_examples = dataset.num_examples

        if shuffle:
            iteration_scheme = ShuffledExampleScheme(num_examples, rng=rng)
        else:
            iteration_scheme = SequentialExampleScheme(num_examples)

        if num_result is None:
            num_result = num_examples

        if lang != self.langs[0] and not only_stream:
            iteration_scheme = RandomExampleScheme(num_examples, num_result=num_result, rng=rng)

        stream = DataStream(
            dataset, iteration_scheme=iteration_scheme)

        if soften_distributions:
            stream = Mapping(stream, SoftenResult(self.default_sources, soften_distributions))

        for bconv in self._binary_convertable_data:
            if bconv in self.default_sources:
                stream = Mapping(stream, ConvertToMask(self.default_sources,
                                                       bconv,
                                                       self.num_features(bconv)))

        if self.add_eos:
            stream = Mapping(stream, _AddLabel(
                self.eos_label,
                index=stream.sources.index(self.sources_map['labels'])))
        if self.add_bos:
            if self.bos_label is None:
                raise Exception('No bos label given')
            stream = Mapping(stream, _AddLabel(
                self.bos_label, append=False, times=self.add_bos,
                index=stream.sources.index(self.sources_map['labels'])))

        if self.max_length:
            stream = Filter(stream, self.length_filter)

        if self.sort_k_batches and batches:
            stream = Batch(stream,
                           iteration_scheme=ConstantScheme(
                               self.batch_size * self.sort_k_batches))
            #
            # Hardcode 0 for source on which to sort. This will be good, as
            # most source lengths are correlated and, furthermore, the
            # labels will typically be the last source, thus in a single-input
            # case this sorts on input lengths
            #
            stream = Mapping(stream, SortMapping(_Length(
                index=0)))
            stream = Unpack(stream)

        if self.normalization:
            stream = self.normalization.wrap_stream(stream)
        stream = ForceFloatX(stream)
        stream = Rename(stream,
                        names=dict_subset({v: k for (k, v)
                                           in self.sources_map.items()},
                                          stream.sources,
                                          must_have=False))
        if not batches:
            return stream, num_examples

        stream = Batch(
            stream,
            iteration_scheme=ConstantScheme(self.batch_size if part == 'train'
                                            else self.validation_batch_size))

        stream._produces_examples = False
        return stream, num_examples
Example #28
        def get_stream(file,
                       dictionary=None,
                       add_dict=False,
                       shuffle=False,
                       batch_size=None,
                       read_ahead=1):
            """
            Creates a stream with train/valid/test examples.
            :param file: path to a file with the dataset
            :param dictionary: string->int dict with vocabulary from other datasets. If None, the dictionary is built
                               from this dataset
            :param add_dict: if True, new words are added to the dictionary from this dataset
            :param shuffle: if True, the dataset examples are randomly shuffled
            :param batch_size: batch size; defaults to self.args.batch_size if not given
            :param read_ahead: Number of batches that shall be pre-fetched and ordered by context length to speed up computation

            """

            # Dataset type (CNN/CBT/bAbI)
            data_type = self.args.dataset_type

            if not batch_size:
                batch_size = self.args.batch_size

            # Pattern for text tokenization
            pattern = re.compile(" |\t|\|")

            if data_type == 'babi':
                prepro = lambda x: nltk.word_tokenize(x)
            else:
                prepro = lambda x: pattern.split(x)

            if add_dict:
                # add words to dictionary
                f = codecs.open(file, 'r', encoding="utf8")
                vocabulary = get_vocabulary(f, prepro)
                code2token = map(lambda x: x[0], vocabulary.most_common())

                new_word_count = 0
                for word in code2token:
                    if word not in dictionary:
                        dictionary[word] = len(dictionary)
                        new_word_count += 1

                print "Added {} new words from file {} to previous vocabulary.".format(
                    new_word_count, file)

            if not dictionary:
                print "Computing new vocabulary for file {}.".format(file)
                # compute vocabulary
                f = codecs.open(file, 'r', encoding="utf8")
                vocabulary = get_vocabulary(f, prepro)
                code2token = map(lambda x: x[0], vocabulary.most_common())
                # Add special symbols (beginning/end of sentence, unknown token, end of question)
                code2token.extend(['<S>', '</S>', '<UNK>', '<QUESTION_END>'])
                dictionary = compute_token2code(code2token)

            # Select the data loader appropriate for the dataset

            common_params = {
                'level': 'word',
                'bos_token': None,
                'eos_token': None,
                'append_question': self.args.query_inited_context_encoder
            }

            if data_type == 'cnn':
                dataset = CNNDataset([file], dictionary, **common_params)
            elif data_type == 'cbt':
                dataset = CBDataset([file], dictionary, **common_params)
            elif data_type == 'babi':
                dataset = bAbIDataset([file], dictionary, **common_params)

            stream = dataset.get_example_stream()

            # Load all data into memory, this way we avoid reloading the data from disk in every epoch
            memory_data = [[] for _ in dataset.sources]
            for ex in stream.get_epoch_iterator():
                for source_example, data_list in zip(ex, memory_data):
                    data_list.append(source_example)

            data_dict = OrderedDict(zip(dataset.sources, memory_data))
            mem_dataset = UnpickableIndexableDataset(data_dict)
            if shuffle:
                # shuffle the data after each epoch of training
                mem_dataset.example_iteration_scheme = ShuffledExampleScheme(
                    mem_dataset.num_examples)
            stream = mem_dataset.get_example_stream()

            # Build a batched version of stream to read k batches ahead
            stream = Batch(stream,
                           iteration_scheme=ConstantScheme(batch_size *
                                                           read_ahead))

            if read_ahead > 1:

                # Sort all samples in the read-ahead batch
                stream = Mapping(stream, SortMapping(_length))

                # Convert it into a stream again
                stream = Unpack(stream)

                # Construct batches from the stream with specified batch size
                stream = Batch(stream,
                               iteration_scheme=ConstantScheme(batch_size))

            # Pad sequences that are short
            stream = Padding(
                stream, mask_sources=['context', 'question', 'candidates'])

            return stream, dictionary