def test_mapping_sort():
    data = [[1, 2, 3], [2, 3, 1], [3, 2, 1]]
    data_sorted = [[1, 2, 3]] * 3
    data_sorted_rev = [[3, 2, 1]] * 3
    stream = DataStream(IterableDataset(data))
    wrapper1 = Mapping(stream, SortMapping(operator.itemgetter(0)))
    assert list(wrapper1.get_epoch_iterator()) == list(zip(data_sorted))
    wrapper2 = Mapping(stream, SortMapping(lambda x: -x[0]))
    assert list(wrapper2.get_epoch_iterator()) == list(zip(data_sorted_rev))
    wrapper3 = Mapping(stream, SortMapping(operator.itemgetter(0),
                                           reverse=True))
    assert list(wrapper3.get_epoch_iterator()) == list(zip(data_sorted_rev))
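# The sorting tests above (and the multi-source variants further down) assume
# Fuel's standard module layout; a minimal, assumed import block that makes
# them runnable looks like this:
import operator
from collections import OrderedDict

import numpy
from numpy.testing import assert_equal

from fuel.datasets import IterableDataset
from fuel.streams import DataStream
from fuel.transformers import Mapping, SortMapping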
def setup_squad_ranker_datastream(path,
                                  vocab_file,
                                  config,
                                  example_count=1836975):
    ds = SQuADRankerDataset(path, vocab_file)
    it = ShuffledExampleScheme(examples=example_count)
    stream = DataStream(ds, iteration_scheme=it)

    # Sort sets of multiple batches to make batches of similar sizes
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('question'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=[
                         'question', 'answer', 'better', 'worse', 'b_left',
                         'b_right', 'w_left', 'w_right'
                     ],
                     mask_dtype='int32')

    return ds, stream
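# `_balanced_batch_helper` is not defined in these snippets. In the projects
# they were taken from it is typically a tiny callable that keys each example
# on the length of one source, so SortMapping groups similarly sized examples.
# A sketch under that assumption (some repos use `data[self.key].shape[0]`):
class _balanced_batch_helper(object):
    def __init__(self, key):
        self.key = key

    def __call__(self, data):
        # Sort key: length of the source at position `key` in the example.
        return len(data[self.key])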
def setup_squad_datastream(path, vocab_file, config):
    ds = SQuADDataset(path, vocab_file)
    it = SQuADIterator(path)
    stream = DataStream(ds, iteration_scheme=it)

    if config.concat_ctx_and_question:
        stream = ConcatCtxAndQuestion(stream, config.concat_question_before,
                                      ds.reverse_vocab['<DUMMY>'])

    # Sort sets of multiple batches to make batches of similar sizes
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('context'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=[
                         'context', 'question', 'answer', 'ans_indices',
                         'ans_boundaries'
                     ],
                     mask_dtype='int32')

    return ds, stream
Example #4
def setup_datastream(path, batch_size, sort_batch_count, valid=False):
    A = numpy.load(
        os.path.join(path,
                     ('valid_x_raw.npy' if valid else 'train_x_raw.npy')))
    B = numpy.load(
        os.path.join(path, ('valid_phn.npy' if valid else 'train_phn.npy')))
    C = numpy.load(
        os.path.join(
            path,
            ('valid_seq_to_phn.npy' if valid else 'train_seq_to_phn.npy')))

    D = [B[x[0]:x[1], 2] for x in C]

    ds = IndexableDataset({'input': A, 'output': D})
    stream = DataStream(ds, iteration_scheme=ShuffledExampleScheme(len(A)))

    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size *
                                                   sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('input'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size,
                                                   num_examples=len(A)))
    stream = Padding(stream, mask_sources=['input', 'output'])

    return ds, stream
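# Usage sketch for the stream built above; the path is a placeholder. Fuel's
# Padding inserts a `<source>_mask` entry right after each masked source, so
# the epoch iterator yields input/output arrays together with their masks.
ds, stream = setup_datastream('data/timit_raw', batch_size=32,
                              sort_batch_count=20)
print(stream.sources)  # e.g. ('input', 'input_mask', 'output', 'output_mask')
for batch in stream.get_epoch_iterator(as_dict=True):
    # Arrays are padded to the longest example in the batch.
    print(batch['input'].shape, batch['output'].shape)
    break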
Example #5
def setup_datastream(path, vocab_file, config):
    ds = QADataset(path,
                   vocab_file,
                   config.n_entities,
                   need_sep_token=config.concat_ctx_and_question)
    it = QAIterator(path, shuffle=config.shuffle_questions)

    stream = DataStream(ds, iteration_scheme=it)

    if config.concat_ctx_and_question:
        stream = ConcatCtxAndQuestion(stream, config.concat_question_before,
                                      ds.reverse_vocab['<SEP>'])

    # Sort sets of multiple batches to make batches of similar sizes
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(
        stream.sources.index(
            'question' if config.concat_ctx_and_question else 'context'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    print('sources')
    print(stream.sources)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=['context', 'question', 'candidates'],
                     mask_dtype='int32')

    print('sources2')
    print(stream.sources)

    return ds, stream
Example #6
    def train(self, req_vars):
        stream = TaxiDataset('train', data.traintest_ds)

        if hasattr(self.config, 'use_cuts_for_training') and self.config.use_cuts_for_training:
            stream = DataStream(stream, iteration_scheme=TaxiTimeCutScheme())
        else:
            stream = DataStream(stream, iteration_scheme=ShuffledExampleScheme(stream.num_examples))

        if not data.tvt:
            valid = TaxiDataset(data.valid_set, data.valid_ds, sources=('trip_id',))
            valid_trips_ids = valid.get_data(None, slice(0, valid.num_examples))[0]
            stream = transformers.TaxiExcludeTrips(stream, valid_trips_ids)

        stream = transformers.TaxiGenerateSplits(stream, max_splits=self.config.max_splits)

        if hasattr(self.config, 'shuffle_batch_size'):
            stream = transformers.Batch(stream, iteration_scheme=ConstantScheme(self.config.shuffle_batch_size))
            stream = Mapping(stream, SortMapping(key=UniformGenerator()))
            stream = Unpack(stream)

        stream = transformers.taxi_add_datetime(stream)
        stream = transformers.taxi_add_first_last_len(stream, self.config.n_begin_end_pts)
        stream = transformers.Select(stream, tuple(req_vars))
        
        stream = Batch(stream, iteration_scheme=ConstantScheme(self.config.batch_size))

        stream = MultiProcessing(stream)

        return stream
Example #7
def get_train_stream(configuration, sfiles, tfiles, svocab_dict, tvocab_dict):

    s_dataset = TextFile(sfiles, svocab_dict, bos_token=None, eos_token=None,
                         unk_token='<unk>', level='word', preprocess=None,
                         encoding='utf8')
    t_dataset = TextFile(tfiles, tvocab_dict, bos_token=None, eos_token=None,
                         unk_token='<unk>', level='word', preprocess=None,
                         encoding='utf8')

    # Merge source and target into ('source', 'target') pairs
    stream = Merge([s_dataset.get_example_stream(),
                    t_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter out overly long sequence pairs -- TODO
    stream = Filter(stream,
                    predicate=_too_long(seq_len=configuration['seq_len']))

    # Map - not needed here

    # Batch and sort: read several batches ahead, sort by length, re-batch
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       configuration['batch_size'] *
                       configuration['sort_k_batches']))
    stream = Mapping(stream, SortMapping(_length))
    stream = Unpack(stream)
    stream = Batch(
        stream, iteration_scheme=ConstantScheme(configuration['batch_size']))

    # Pad. Note that </s> = 0 and Fuel's Padding pads with 0 by default.
    masked_stream = Padding(stream)

    return masked_stream
Example #8
def get_tr_stream(src_vocab, trg_vocab, src_data, trg_data,
                  src_vocab_size=30000, trg_vocab_size=30000,
                  unk_id=0, eos_id=1, bos_id=2, train_noise=0,
                  seq_len=50, batch_size=80, sort_k_batches=12, **kwargs):
    src_stream = get_stream(src_vocab, src_data, src_vocab_size, unk_id, eos_id, bos_id, train_noise)
    trg_stream = get_stream(trg_vocab, trg_data, trg_vocab_size, unk_id, eos_id, bos_id, 0)

    # Merge them to get a source, target pair
    stream = Merge([src_stream, trg_stream], ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_not_too_long(seq_len))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    return PaddingWithEOS(stream, [eos_id, eos_id])
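# The `_length` sort key and `_not_too_long` predicate used above are defined
# elsewhere in that project. In the Blocks machine-translation examples the
# key is simply the target-sentence length; the definitions below are a
# hedged sketch of that convention, not the project's exact code.
def _length(sentence_pair):
    # Sort by the length of the target sequence (last element of the example).
    return len(sentence_pair[-1])


class _not_too_long(object):
    def __init__(self, seq_len=50):
        self.seq_len = seq_len

    def __call__(self, sentence_pair):
        # Keep pairs whose sequences all fit within `seq_len`.
        return all(len(sentence) <= self.seq_len for sentence in sentence_pair)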
Example #9
def framewise_timit_datastream(path, which_set, batch_size, local_copy=False):
    # load frame-wise dataset
    timit_dataset = FramewiseTimit(which_set=which_set,
                                   path=path,
                                   local_copy=local_copy)

    # set shuffle range
    shuffle_rng = numpy.random.RandomState(123)

    # set iterator scheme
    iterator_scheme = SequentialShuffledScheme(
        num_examples=timit_dataset.num_examples,
        batch_size=batch_size,
        rng=shuffle_rng)

    # base data stream
    base_stream = DataStream(dataset=timit_dataset,
                             iteration_scheme=iterator_scheme)

    # reshape data stream data_source, shape_source
    reshape_stream = Reshape(data_source='features',
                             shape_source='features_shapes',
                             data_stream=base_stream,
                             iteration_scheme=iterator_scheme)

    # sort data stream
    sort_stream = Mapping(data_stream=reshape_stream,
                          mapping=SortMapping(key=lambda x: x[0].shape[0]))

    # padding data stream
    padded_stream = Padding(data_stream=sort_stream)

    return padded_stream
def balanced_batch(stream, key, batch_size, batch_sort_size):
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size *
                                                   batch_sort_size))
    comparison = _balanced_batch_helper(stream.sources.index(key))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)
    return Batch(stream, iteration_scheme=ConstantScheme(batch_size))
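# `balanced_batch` distills the sort-and-rebatch pattern repeated throughout
# these examples. A hypothetical call (stream, source name and sizes are
# placeholders): group 20 * 32 examples, sort them by 'context' length, then
# re-batch by 32 so each final batch holds sequences of similar length.
stream = balanced_batch(stream, key='context', batch_size=32,
                        batch_sort_size=20)
stream = Padding(stream, mask_sources=['context'], mask_dtype='int32')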
Example #11
    def test_mapping_sort_multisource(self):
        data = OrderedDict([('x', self.data_x), ('y', self.data_y)])
        data_sorted = [([1, 2, 3], [6, 5, 4]), ([1, 2, 3], [4, 6, 5]),
                       ([1, 2, 3], [4, 5, 6])]
        stream = DataStream(IterableDataset(data))
        transformer = Mapping(stream,
                              mapping=SortMapping(operator.itemgetter(0)))
        assert_equal(list(transformer.get_epoch_iterator()), data_sorted)
def test_mapping_sort_multisource():
    data = OrderedDict()
    data['x'] = [[1, 2, 3], [2, 3, 1], [3, 2, 1]]
    data['y'] = [[6, 5, 4], [6, 5, 4], [6, 5, 4]]
    data_sorted = [([1, 2, 3], [6, 5, 4]), ([1, 2, 3], [4, 6, 5]),
                   ([1, 2, 3], [4, 5, 6])]
    stream = DataStream(IterableDataset(data))
    wrapper = Mapping(stream, mapping=SortMapping(operator.itemgetter(0)))
    assert list(wrapper.get_epoch_iterator()) == data_sorted
Example #13
def get_tr_stream(src_vocab, trg_vocab, src_data, trg_data,
                  src_vocab_size=30000, trg_vocab_size=30000, unk_id=1,
                  seq_len=50, batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab, 'rb')),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict) else
        cPickle.load(open(trg_vocab, 'rb')),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream,
                    predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    stream = Mapping(stream,
                     _oov_to_unk(src_vocab_size=src_vocab_size,
                                 trg_vocab_size=trg_vocab_size,
                                 unk_id=unk_id))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       batch_size*sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(
        stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(
        stream, [src_vocab_size - 1, trg_vocab_size - 1])

    return masked_stream
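# Example #13 follows the Blocks machine-translation example closely. The
# `_too_long` and `_oov_to_unk` helpers it relies on look roughly like the
# sketch below (reproduced from memory of blocks-examples, so treat the exact
# signatures as an assumption). Note that, despite its name, `_too_long` is
# used as a Filter predicate that *keeps* pairs fitting within `seq_len`.
class _too_long(object):
    def __init__(self, seq_len=50):
        self.seq_len = seq_len

    def __call__(self, sentence_pair):
        return all([len(sentence) <= self.seq_len
                    for sentence in sentence_pair])


class _oov_to_unk(object):
    def __init__(self, src_vocab_size=30000, trg_vocab_size=30000, unk_id=1):
        self.src_vocab_size = src_vocab_size
        self.trg_vocab_size = trg_vocab_size
        self.unk_id = unk_id

    def __call__(self, sentence_pair):
        # Replace any token id outside the clipped vocabulary with <UNK>.
        return ([x if x < self.src_vocab_size else self.unk_id
                 for x in sentence_pair[0]],
                [x if x < self.trg_vocab_size else self.unk_id
                 for x in sentence_pair[1]])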
Example #14
def construct_stream(dataset, rng, pool_size, maximum_frames, window_features,
                     **kwargs):
    """Construct data stream.

    Parameters
    ----------
    dataset : Dataset
        Dataset to use.
    rng : numpy.random.RandomState
        Random number generator.
    pool_size : int
        Pool size for TIMIT dataset.
    maximum_frames : int
        Maximum number of frames per batch for the TIMIT dataset.
    window_features : int
        Width of the feature window (1 disables windowing).
    subsample : bool, optional
        Subsample features.
    pretrain_alignment : bool, optional
        Use phoneme alignment for pretraining.
    uniform_alignment : bool, optional
        Use uniform alignment for pretraining.

    """
    kwargs.setdefault('subsample', False)
    kwargs.setdefault('pretrain_alignment', False)
    kwargs.setdefault('uniform_alignment', False)
    stream = DataStream(dataset,
                        iteration_scheme=SequentialShuffledScheme(
                            dataset.num_examples, pool_size, rng))
    if kwargs['pretrain_alignment'] and kwargs['uniform_alignment']:
        stream = AddUniformAlignmentMask(stream)
    stream = Reshape('features', 'features_shapes', data_stream=stream)
    means, stds = dataset.get_normalization_factors()
    stream = Normalize(stream, means, stds)
    if window_features != 1:
        stream = WindowFeatures(stream, 'features', window_features)
    if kwargs['pretrain_alignment']:
        stream = Reshape('alignments', 'alignments_shapes', data_stream=stream)
    stream = Mapping(stream, SortMapping(key=key))
    stream = MaximumFrameCache(max_frames=maximum_frames,
                               data_stream=stream,
                               rng=rng)
    stream = Padding(data_stream=stream, mask_sources=['features', 'phonemes'])
    if kwargs['pretrain_alignment']:
        stream = AlignmentPadding(stream, 'alignments')
        stream = Transpose(stream, [(1, 0, 2), (1, 0), (1, 0), (1, 0),
                                    (2, 1, 0)])
    else:
        stream = Transpose(stream, [(1, 0, 2), (1, 0), (1, 0), (1, 0)])

    stream = ForceFloatX(stream)
    if kwargs['subsample']:
        stream = Subsample(stream, 'features', 5)
        stream = Subsample(stream, 'features_mask', 5)
    return stream
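# `SortMapping(key=key)` above refers to a module-level `key` function that is
# not shown here. For TIMIT-style streams it usually sorts utterances by their
# number of feature frames; the one-liner below is an assumed definition.
def key(example):
    # First source is 'features' with shape (num_frames, feature_dim).
    return example[0].shape[0]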
Example #15
def get_sgnmt_tr_stream(src_data,
                        trg_data,
                        src_vocab_size=30000,
                        trg_vocab_size=30000,
                        unk_id=1,
                        seq_len=50,
                        batch_size=80,
                        sort_k_batches=12,
                        **kwargs):
    """Prepares the unshuffled training data stream. This corresponds 
    to ``get_sgnmt_tr_stream`` in ``machine_translation/stream`` in the
    blocks examples."""

    # Build dummy vocabulary to make TextFile happy
    src_vocab = add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    s = Merge(
        [src_dataset.get_example_stream(),
         trg_dataset.get_example_stream()], ('source', 'target'))

    # Filter sequences that are too long
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    s = Mapping(
        s,
        stream._oov_to_unk(src_vocab_size=src_vocab_size,
                           trg_vocab_size=trg_vocab_size,
                           unk_id=utils.UNK_ID))

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
Example #16
    def get_stream(self, part, batches=True, shuffle=True, add_sources=(),
                   num_examples=None, rng=None, seed=None):

        dataset = self.get_dataset(part, add_sources=add_sources)
        if num_examples is None:
            num_examples = dataset.num_examples

        if shuffle:
            iteration_scheme = ShuffledExampleScheme(num_examples, rng=rng)
        else:
            iteration_scheme = SequentialExampleScheme(num_examples)

        stream = DataStream(
            dataset, iteration_scheme=iteration_scheme)

        stream = FilterSources(stream, (self.recordings_source,
                                        self.labels_source)+tuple(add_sources))
        if self.add_eos:
            stream = Mapping(stream, _AddLabel(self.eos_label))
        if self.add_bos:
            stream = Mapping(stream, _AddLabel(self.bos_label, append=False,
                                               times=self.add_bos))
        if self.preprocess_text:
            stream = Mapping(stream, lvsr.datasets.wsj.preprocess_text)
        stream = Filter(stream, self.length_filter)
        if self.sort_k_batches and batches:
            stream = Batch(stream,
                           iteration_scheme=ConstantScheme(
                               self.batch_size * self.sort_k_batches))
            stream = Mapping(stream, SortMapping(_length))
            stream = Unpack(stream)

        if self.preprocess_features == 'log_spectrogram':
            stream = Mapping(
                stream, functools.partial(apply_preprocessing,
                                          log_spectrogram))
        if self.normalization:
            stream = self.normalization.wrap_stream(stream)
        stream = ForceFloatX(stream)
        if not batches:
            return stream

        stream = Batch(
            stream,
            iteration_scheme=ConstantScheme(self.batch_size if part == 'train'
                                            else self.validation_batch_size))
        stream = Padding(stream)
        stream = Mapping(stream, switch_first_two_axes)
        stream = ForceCContiguous(stream)
        return stream
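# `switch_first_two_axes` converts the padded batch-major arrays produced by
# Padding into the time-major layout the recurrent components expect. A sketch
# consistent with how it is used here (the exact helper lives in the lvsr
# codebase, so treat this as an approximation):
def switch_first_two_axes(batch):
    result = []
    for array in batch:
        if array.ndim == 2:
            result.append(array.transpose(1, 0))
        else:
            result.append(array.transpose(1, 0, 2))
    return tuple(result)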
Example #17
def create_data_generator(path, vocab_file, config):
    ds = QADataset(path,
                   vocab_file,
                   config.n_entities,
                   need_sep_token=config.concat_ctx_and_question)
    it = QAIterator(path, shuffle=config.shuffle_questions)

    stream = DataStream(ds, iteration_scheme=it)

    if config.concat_ctx_and_question:
        stream = ConcatCtxAndQuestion(stream, config.concat_question_before,
                                      ds.reverse_vocab['<SEP>'])

    # Sort sets of multiple batches to make batches of similar sizes
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(
        stream.sources.index(
            'question' if config.concat_ctx_and_question else 'context'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=['context', 'question', 'candidates'],
                     mask_dtype='int32')

    def gen():

        if not config.concat_ctx_and_question:
            for (seq_cont, seq_cont_mask, seq_quest, seq_quest_mask, tg,
                 candidates, candidates_mask) in stream.get_epoch_iterator():
                seq_cont_mask = seq_cont_mask.astype('float32')
                seq_quest_mask = seq_quest_mask.astype('float32')
                candidates_mask = candidates_mask.astype('float32')

                yield (seq_cont, seq_cont_mask, seq_quest, seq_quest_mask, tg,
                       candidates, candidates_mask)
        else:

            for (seq, seq_mask, tg, candidates, candidates_mask) \
                    in stream.get_epoch_iterator():
                seq_mask = seq_mask.astype('float32')
                candidates_mask = candidates_mask.astype('float32')

                yield (seq, seq_mask, tg, candidates, candidates_mask)

    return gen
Example #18
def timit_datastream(path, which_set, local_copy, pool_size, maximum_frames):

    # load dataset
    timit_dataset = Timit(which_set=which_set,
                          path=path,
                          local_copy=local_copy)

    # get statistics
    data_means, data_stds = timit_dataset.get_normalization_factors()

    # set shuffle range
    shuffle_rng = numpy.random.RandomState(123)

    # set iterator scheme
    iterator_scheme = SequentialShuffledScheme(
        num_examples=timit_dataset.num_examples,
        batch_size=pool_size,
        rng=shuffle_rng)

    # base data stream
    base_stream = DataStream(dataset=timit_dataset,
                             iteration_scheme=iterator_scheme)

    # reshape stream
    reshape_stream = Reshape(data_source='features',
                             shape_source='features_shapes',
                             data_stream=base_stream)

    # normalize data stream
    normalize_stream = Normalize(data_stream=reshape_stream,
                                 means=data_means,
                                 stds=data_stds)

    # sort data stream
    sort_stream = Mapping(data_stream=normalize_stream,
                          mapping=SortMapping(key=lambda x: x[0].shape[0]))

    # max frame stream
    max_frame_stream = MaximumFrameCache(max_frames=maximum_frames,
                                         data_stream=sort_stream,
                                         rng=shuffle_rng)

    # padding data stream
    padded_stream = Padding(data_stream=max_frame_stream,
                            mask_sources=['features', 'phonemes'])

    # floatX stream
    data_stream = ForceFloatX(padded_stream)
    return timit_dataset, data_stream
Example #19
def get_sgnmt_shuffled_tr_stream(src_data,
                                 trg_data,
                                 src_vocab_size=30000,
                                 trg_vocab_size=30000,
                                 unk_id=1,
                                 seq_len=50,
                                 batch_size=80,
                                 sort_k_batches=12,
                                 **kwargs):
    """Prepares the shuffled training data stream. This is similar to 
    ``get_sgnmt_tr_stream`` but uses ``ParallelTextFile`` in combination
    with ``ShuffledExampleScheme`` to support reshuffling."""

    # Build dummy vocabulary to make TextFile happy
    src_vocab = add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    parallel_dataset = ParallelTextFile(src_data, trg_data, src_vocab,
                                        trg_vocab, None)
    #iter_scheme = SequentialExampleScheme(parallel_dataset.num_examples)
    iter_scheme = ShuffledExampleScheme(parallel_dataset.num_examples)
    s = DataStream(parallel_dataset, iteration_scheme=iter_scheme)

    # Filter sequences that are too long
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    s = Mapping(
        s,
        stream._oov_to_unk(src_vocab_size=src_vocab_size,
                           trg_vocab_size=trg_vocab_size,
                           unk_id=utils.UNK_ID))

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
Example #20
def setup_sorter_datastream(path, config):
    ds = SorterDataset(path)
    it = ShuffledExampleScheme(examples=config.example_count)
    stream = DataStream(ds, iteration_scheme=it)
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('unsorted'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)
    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=['answer', 'unsorted'],
                     mask_dtype='int32')
    return ds, stream
Example #21
def test_data_stream_mapping_sort_multisource_ndarrays():
    data = OrderedDict()
    data['x'] = [numpy.array([1, 2, 3]),
                 numpy.array([2, 3, 1]),
                 numpy.array([3, 2, 1])]
    data['y'] = [numpy.array([6, 5, 4]),
                 numpy.array([6, 5, 4]),
                 numpy.array([6, 5, 4])]
    data_sorted = [(numpy.array([1, 2, 3]), numpy.array([6, 5, 4])),
                   (numpy.array([1, 2, 3]), numpy.array([4, 6, 5])),
                   (numpy.array([1, 2, 3]), numpy.array([4, 5, 6]))]
    stream = DataStream(IterableDataset(data))
    wrapper = Mapping(stream, mapping=SortMapping(operator.itemgetter(0)))
    for output, ground_truth in zip(wrapper.get_epoch_iterator(), data_sorted):
        assert len(output) == len(ground_truth)
        assert (output[0] == ground_truth[0]).all()
        assert (output[1] == ground_truth[1]).all()
Example #22
def load_parallel_data(src_file,
                       tgt_file,
                       batch_size,
                       sort_k_batches,
                       dictionary,
                       training=False):
    def preproc(s):
        s = s.replace('``', '"')
        s = s.replace('\'\'', '"')
        return s

    enc_dset = TextFile(files=[src_file],
                        dictionary=dictionary,
                        bos_token=None,
                        eos_token=None,
                        unk_token=CHAR_UNK_TOK,
                        level='character',
                        preprocess=preproc)
    dec_dset = TextFile(files=[tgt_file],
                        dictionary=dictionary,
                        bos_token=CHAR_SOS_TOK,
                        eos_token=CHAR_EOS_TOK,
                        unk_token=CHAR_UNK_TOK,
                        level='character',
                        preprocess=preproc)
    # NOTE merge encoder and decoder setup together
    stream = Merge(
        [enc_dset.get_example_stream(),
         dec_dset.get_example_stream()], ('source', 'target'))
    if training:
        # filter sequences that are too long
        stream = Filter(stream, predicate=TooLong(seq_len=CHAR_MAX_SEQ_LEN))
        # batch and read k batches ahead
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(batch_size *
                                                       sort_k_batches))
        # sort all samples in read-ahead batch
        stream = Mapping(stream, SortMapping(lambda x: len(x[1])))
        # turn back into stream
        stream = Unpack(stream)
    # batch again
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
    masked_stream = Padding(stream)
    return masked_stream
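# Usage sketch for `load_parallel_data`; file names, vocabulary and sizes are
# placeholders. Since Padding appends a mask right after each source, the
# epoch iterator yields (source, source_mask, target, target_mask) batches.
masked_stream = load_parallel_data('train.src', 'train.tgt', batch_size=32,
                                   sort_k_batches=12, dictionary=char_vocab,
                                   training=True)
for src, src_mask, tgt, tgt_mask in masked_stream.get_epoch_iterator():
    # Arrays have shape (batch, max_len); masks mark real characters vs. padding.
    break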
def setup_toy_datastream(config):
    ds = ToyDataset()
    it = ToyIterator()

    stream = DataStream(ds, iteration_scheme=it)
    # Sort sets of multiple batches to make batches of similar sizes
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('context'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(
        stream,
        mask_sources=['context', 'question', 'answer', 'ans_indices'],
        mask_dtype='int32')

    return ds, stream
def setup_cnnsquad_datastream(sq_path, cnn_path, vocab_file, config):

    ds = CNNSQDataset(sq_path, cnn_path, vocab_file)
    it = CNNSQIterator(sq_path, cnn_path, cnn_ratio=config.add_cnn_data)

    stream = DataStream(ds, iteration_scheme=it)
    # Sort sets of multiple batches to make batches of similar sizes
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('context'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=['context', 'question', 'answer'],
                     mask_dtype='int32')

    return ds, stream
Example #25
def stream_handwriting(
        which_sets,
        batch_size,
        seq_size,
        num_letters,
        sorting_mult=20):

    assert sorting_mult > 0

    dataset = Handwriting(which_sets)
    sorting_size = batch_size * sorting_mult
    num_examples = sorting_size * (dataset.num_examples // sorting_size)

    if which_sets == ('train',):
        print("Random order.")
        scheme = ShuffledExampleScheme(num_examples)
    else:
        print("Sequential order.")
        scheme = SequentialExampleScheme(num_examples)

    data_stream = DataStream.default_stream(dataset, iteration_scheme=scheme)

    # Sort by length of the data sequence.
    data_stream = Batch(
        data_stream, iteration_scheme=ConstantScheme(sorting_size))
    data_stream = Mapping(data_stream, SortMapping(_length))
    data_stream = Unpack(data_stream)
    data_stream = Batch(
        data_stream, iteration_scheme=ConstantScheme(batch_size))

    data_stream = Padding(data_stream)
    data_stream = SourceMapping(
        data_stream, _transpose, which_sources=('features', 'features_mask'))
    data_stream = SegmentSequence(
        data_stream,
        seq_size=seq_size + 1,
        share_value=True,
        return_last=True,
        which_sources=('features', 'features_mask'),
        add_flag=True)
    return data_stream
Example #26
def _get_sgnmt_tr_stream(data_stream,
                         src_vocab_size=30000,
                         trg_vocab_size=30000,
                         seq_len=50,
                         batch_size=80,
                         sort_k_batches=12,
                         src_sparse_feat_map='',
                         trg_sparse_feat_map='',
                         **kwargs):
    """Prepares the raw text file stream ``data_stream`` for the Blocks
    main loop. This includes handling UNKs, splitting ino batches, sort
    locally by sequence length, and masking. This roughly corresponds 
    to ``get_sgnmt_tr_stream`` in ``machine_translation/stream`` in the
    blocks examples.
    
    The arguments to this method are given by the configuration dict.
    """

    # Filter sequences that are too long
    s = Filter(data_stream, predicate=stream._too_long(seq_len=seq_len))

    # Replacing out of vocabulary tokens with unk token already
    # handled in the `DataSet`s

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
Example #27
    def get_stream(self, part, batches=True, shuffle=True,
                   add_sources=()):
        dataset = self.get_dataset(part, add_sources=add_sources)
        stream = (DataStream(dataset,
                             iteration_scheme=ShuffledExampleScheme(dataset.num_examples))
                  if shuffle
                  else dataset.get_example_stream())

        stream = FilterSources(stream, (self.recordings_source,
                                        self.labels_source)+tuple(add_sources))
        if self.add_eos:
            if self.prepend_eos:
                stream = Mapping(stream, _AddEosLabelBeginEnd(self.eos_label))
            else:
                stream = Mapping(stream, _AddEosLabelEnd(self.eos_label))
        if self.preprocess_text:
            stream = Mapping(stream, lvsr.datasets.wsj.preprocess_text)
        stream = Filter(stream, self.length_filter)
        if self.sort_k_batches and batches:
            stream = Batch(stream,
                           iteration_scheme=ConstantScheme(
                               self.batch_size * self.sort_k_batches))
            stream = Mapping(stream, SortMapping(_length))
            stream = Unpack(stream)

        if self.preprocess_features == 'log_spectrogram':
            stream = Mapping(
                stream, functools.partial(apply_preprocessing,
                                          log_spectrogram))
        if self.normalization:
            stream = self.normalization.wrap_stream(stream)
        stream = ForceFloatX(stream)
        if not batches:
            return stream

        stream = Batch(stream, iteration_scheme=ConstantScheme(self.batch_size))
        stream = Padding(stream)
        stream = Mapping(stream, switch_first_two_axes)
        stream = ForceCContiguous(stream)
        return stream
Example #28
def _get_stream_from_lines(vocab,
                           lines,
                           preprocess=to_lower_case,
                           vocab_size=30000,
                           eos_id=0,
                           eos='</S>',
                           unk_id=1,
                           batch_size=80,
                           sort_k_batches=12):
    if preprocess is not None:
        lines = [preprocess(line) + ' ' + eos for line in lines]
    dataset = IterableDataset(iterables=lines)
    stream = DataStream(dataset)
    stream = Mapping(
        stream, lambda x:
        ([vocab[w] if w in vocab else unk_id for w in x[0].split()], ))

    if vocab_size < len(vocab):
        stream = Mapping(stream,
                         _oov_to_unk(vocab_size=vocab_size, unk_id=unk_id))
    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size *
                                                   sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length(target_source_index=0)))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    stream = _PaddingWithToken(stream, eos_id)

    return stream
def construct_stream(dataset, rng, batch_size, n_batches=None, **kwargs):
    """Construct data stream.

    Parameters
    ----------
    dataset : Dataset
        Dataset to use.
    rng : numpy.random.RandomState
        Random number generator.
    batch_size : int
        Size of the batch.
    n_batches : int
        Number of batches used to update population statistics.
    """
    if n_batches is not None:
        scheme = ShuffledScheme(n_batches * batch_size, batch_size=batch_size)
    else:
        scheme = ShuffledScheme(dataset.num_examples, batch_size=batch_size)
    stream = DataStream(dataset, iteration_scheme=scheme)
    stream = Mapping(stream, SortMapping(key=key))
    stream = Padding(data_stream=stream, mask_sources=['features', 'phonemes'])
    stream = Transpose(stream, [(1, 0, 2), (1, 0), (1, 0), (1, 0)])
    return stream
Example #30
def get_tr_stream(path, src_eos_idx, phones_sil, tgt_eos_idx, seq_len=50, batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""

    sources = ('words', 'audio', 'words_ends', 'punctuation_marks', 'phones', 'phones_words_ends', 'phones_words_acoustic_ends')
    #sources = ('words', 'audio', 'words_ends', 'punctuation_marks', 'phones', 'phones_words_ends')
    dataset = H5PYDataset(path, which_sets=('train',), sources=sources, load_in_memory=False)
    print "creating example stream"
    stream = dataset.get_example_stream()
    print "example stream created"

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size*sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(stream, {
        'words': src_eos_idx,
        'phones': phones_sil,
        'punctuation_marks': tgt_eos_idx,
        'audio': 0,
        'words_ends': -1,
        'phones_words_ends': -1,
        'phones_words_acoustic_ends': -1,
    })

    return masked_stream