def test_mapping_sort():
    """SortMapping should sort each example, ascending or descending."""
    data = [[1, 2, 3], [2, 3, 1], [3, 2, 1]]
    data_sorted = [[1, 2, 3]] * 3
    data_sorted_rev = [[3, 2, 1]] * 3
    stream = DataStream(IterableDataset(data))

    # Ascending sort keyed on the first element.
    ascending = Mapping(stream, SortMapping(operator.itemgetter(0)))
    assert list(ascending.get_epoch_iterator()) == list(zip(data_sorted))

    # Descending order expressed through a negating key function.
    negated = Mapping(stream, SortMapping(lambda x: -x[0]))
    assert list(negated.get_epoch_iterator()) == list(zip(data_sorted_rev))

    # Descending order expressed through reverse=True.
    descending = Mapping(stream,
                         SortMapping(operator.itemgetter(0), reverse=True))
    assert list(descending.get_epoch_iterator()) == list(zip(data_sorted_rev))
def setup_squad_ranker_datastream(path, vocab_file, config,
                                  example_count=1836975):
    """Build the (dataset, stream) pair for SQuAD answer ranking."""
    dataset = SQuADRankerDataset(path, vocab_file)
    scheme = ShuffledExampleScheme(examples=example_count)
    stream = DataStream(dataset, iteration_scheme=scheme)

    # Pool several batches, sort by question length, then re-batch so
    # that each final batch holds examples of similar size.
    stream = Batch(stream, iteration_scheme=ConstantScheme(
        config.batch_size * config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('question'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=['question', 'answer', 'better', 'worse',
                                   'b_left', 'b_right', 'w_left', 'w_right'],
                     mask_dtype='int32')
    return dataset, stream
def setup_squad_datastream(path, vocab_file, config):
    """Build the (dataset, stream) pair for SQuAD question answering."""
    dataset = SQuADDataset(path, vocab_file)
    iterator = SQuADIterator(path)
    stream = DataStream(dataset, iteration_scheme=iterator)

    if config.concat_ctx_and_question:
        stream = ConcatCtxAndQuestion(stream, config.concat_question_before,
                                      dataset.reverse_vocab['<DUMMY>'])

    # Pool several batches, sort by context length, then re-batch so
    # that each final batch holds examples of similar size.
    stream = Batch(stream, iteration_scheme=ConstantScheme(
        config.batch_size * config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('context'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=['context', 'question', 'answer',
                                   'ans_indices', 'ans_boundaries'],
                     mask_dtype='int32')
    return dataset, stream
def setup_datastream(path, batch_size, sort_batch_count, valid=False):
    """Load pre-extracted arrays and build a sorted, padded batch stream."""
    raw_x = numpy.load(os.path.join(
        path, ('valid_x_raw.npy' if valid else 'train_x_raw.npy')))
    phn = numpy.load(os.path.join(
        path, ('valid_phn.npy' if valid else 'train_phn.npy')))
    seq_to_phn = numpy.load(os.path.join(
        path, ('valid_seq_to_phn.npy' if valid else 'train_seq_to_phn.npy')))
    # Column 2 of the phone table holds the label; slice out the phone
    # labels belonging to each utterance.
    outputs = [phn[bounds[0]:bounds[1], 2] for bounds in seq_to_phn]

    dataset = IndexableDataset({'input': raw_x, 'output': outputs})
    stream = DataStream(dataset,
                        iteration_scheme=ShuffledExampleScheme(len(raw_x)))

    # Pool several batches, sort by input length, then re-batch so that
    # each final batch holds examples of similar size.
    stream = Batch(stream, iteration_scheme=ConstantScheme(
        batch_size * sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('input'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream, iteration_scheme=ConstantScheme(
        batch_size, num_examples=len(raw_x)))
    stream = Padding(stream, mask_sources=['input', 'output'])
    return dataset, stream
def setup_datastream(path, vocab_file, config):
    """Build the (dataset, stream) pair for the QA reading task."""
    dataset = QADataset(path, vocab_file, config.n_entities,
                        need_sep_token=config.concat_ctx_and_question)
    iterator = QAIterator(path, shuffle=config.shuffle_questions)
    stream = DataStream(dataset, iteration_scheme=iterator)

    if config.concat_ctx_and_question:
        stream = ConcatCtxAndQuestion(stream, config.concat_question_before,
                                      dataset.reverse_vocab['<SEP>'])

    # Pool several batches, sort by sequence length, then re-batch so
    # that each final batch holds examples of similar size.  When context
    # and question are concatenated the merged sequence lives in the
    # 'question' source.
    stream = Batch(stream, iteration_scheme=ConstantScheme(
        config.batch_size * config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index(
        'question' if config.concat_ctx_and_question else 'context'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    # NOTE(review): leftover debug output -- kept to preserve behavior.
    print('sources')
    print(stream.sources)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=['context', 'question', 'candidates'],
                     mask_dtype='int32')

    print('sources2')
    print(stream.sources)

    return dataset, stream
def train(self, req_vars):
    """Build the training stream, restricted to the sources in `req_vars`."""
    dataset = TaxiDataset('train', data.traintest_ds)

    use_cuts = (hasattr(self.config, 'use_cuts_for_training')
                and self.config.use_cuts_for_training)
    if use_cuts:
        stream = DataStream(dataset, iteration_scheme=TaxiTimeCutScheme())
    else:
        stream = DataStream(
            dataset,
            iteration_scheme=ShuffledExampleScheme(dataset.num_examples))

    if not data.tvt:
        # Drop any trip that belongs to the validation set.
        valid = TaxiDataset(data.valid_set, data.valid_ds,
                            sources=('trip_id',))
        valid_trips_ids = valid.get_data(None,
                                         slice(0, valid.num_examples))[0]
        stream = transformers.TaxiExcludeTrips(stream, valid_trips_ids)

    stream = transformers.TaxiGenerateSplits(
        stream, max_splits=self.config.max_splits)

    if hasattr(self.config, 'shuffle_batch_size'):
        # Poor man's shuffle: pool a batch, sort it by a random key,
        # then unpack back into single examples.
        stream = transformers.Batch(stream, iteration_scheme=ConstantScheme(
            self.config.shuffle_batch_size))
        stream = Mapping(stream, SortMapping(key=UniformGenerator()))
        stream = Unpack(stream)

    stream = transformers.taxi_add_datetime(stream)
    stream = transformers.taxi_add_first_last_len(
        stream, self.config.n_begin_end_pts)
    stream = transformers.Select(stream, tuple(req_vars))

    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(self.config.batch_size))
    stream = MultiProcessing(stream)
    return stream
def get_train_stream(configuration, sfiles, tfiles, svocab_dict, tvocab_dict):
    """Merged, filtered, locally sorted and padded source/target stream."""
    s_dataset = TextFile(sfiles, svocab_dict, bos_token=None, eos_token=None,
                         unk_token='<unk>', level='word', preprocess=None,
                         encoding='utf8')
    t_dataset = TextFile(tfiles, tvocab_dict, bos_token=None, eos_token=None,
                         unk_token='<unk>', level='word', preprocess=None,
                         encoding='utf8')

    # Pair up source and target sentences.
    stream = Merge([s_dataset.get_example_stream(),
                    t_dataset.get_example_stream()],
                   ('source', 'target'))

    # Drop sentence pairs that exceed the configured length.
    stream = Filter(stream,
                    predicate=_too_long(seq_len=configuration['seq_len']))

    # Read several batches ahead, sort by length, re-batch.
    stream = Batch(stream, iteration_scheme=ConstantScheme(
        configuration['batch_size'] * configuration['sort_k_batches']))
    stream = Mapping(stream, SortMapping(_length))
    stream = Unpack(stream)
    stream = Batch(stream, iteration_scheme=ConstantScheme(
        configuration['batch_size']))

    # Note that </s>=0; Fuel pads with 0 by default.
    return Padding(stream)
def get_tr_stream(src_vocab, trg_vocab, src_data, trg_data,
                  src_vocab_size=30000, trg_vocab_size=30000, unk_id=0,
                  eos_id=1, bos_id=2, train_noise=0, seq_len=50,
                  batch_size=80, sort_k_batches=12, **kwargs):
    """Prepare the training stream: merge, filter, sort locally, pad."""
    src_stream = get_stream(src_vocab, src_data, src_vocab_size, unk_id,
                            eos_id, bos_id, train_noise)
    # Target side never gets noise.
    trg_stream = get_stream(trg_vocab, trg_data, trg_vocab_size, unk_id,
                            eos_id, bos_id, 0)

    # Pair up source and target sentences.
    merged = Merge([src_stream, trg_stream], ('source', 'target'))

    # Drop pairs with sequences that are too long.
    filtered = Filter(merged, predicate=_not_too_long(seq_len))

    # Read k batches ahead, sort by length, and re-batch.
    pooled = Batch(filtered, iteration_scheme=ConstantScheme(
        batch_size * sort_k_batches))
    pooled = Mapping(pooled, SortMapping(_length))
    examples = Unpack(pooled)
    batched = Batch(examples, iteration_scheme=ConstantScheme(batch_size))

    # Pad short sequences with the EOS id.
    return PaddingWithEOS(batched, [eos_id, eos_id])
def framewise_timit_datastream(path, which_set, batch_size, local_copy=False):
    """Frame-wise TIMIT stream: reshape, sort by length, then pad."""
    timit_dataset = FramewiseTimit(which_set=which_set, path=path,
                                   local_copy=local_copy)

    # Fixed seed so the pooled shuffling is reproducible.
    shuffle_rng = numpy.random.RandomState(123)
    iterator_scheme = SequentialShuffledScheme(
        num_examples=timit_dataset.num_examples,
        batch_size=batch_size,
        rng=shuffle_rng)

    base_stream = DataStream(dataset=timit_dataset,
                             iteration_scheme=iterator_scheme)

    # Un-flatten 'features' using the per-example 'features_shapes' source.
    reshape_stream = Reshape(data_source='features',
                             shape_source='features_shapes',
                             data_stream=base_stream,
                             iteration_scheme=iterator_scheme)

    # Sort each pool of examples by number of frames.
    sort_stream = Mapping(data_stream=reshape_stream,
                          mapping=SortMapping(key=lambda x: x[0].shape[0]))

    return Padding(data_stream=sort_stream)
def balanced_batch(stream, key, batch_size, batch_sort_size):
    """Batch `stream` so that examples of similar length share a batch.

    Pools `batch_sort_size` batches worth of examples, sorts the pool by
    the `key` source, then re-batches at `batch_size`.
    """
    pooled = Batch(stream, iteration_scheme=ConstantScheme(
        batch_size * batch_sort_size))
    comparison = _balanced_batch_helper(pooled.sources.index(key))
    pooled = Mapping(pooled, SortMapping(comparison))
    examples = Unpack(pooled)
    return Batch(examples, iteration_scheme=ConstantScheme(batch_size))
def test_mapping_sort_multisource(self):
    """Sorting a multi-source example keeps x/y pairs aligned."""
    data = OrderedDict([('x', self.data_x), ('y', self.data_y)])
    expected = [([1, 2, 3], [6, 5, 4]),
                ([1, 2, 3], [4, 6, 5]),
                ([1, 2, 3], [4, 5, 6])]
    stream = DataStream(IterableDataset(data))
    transformer = Mapping(stream,
                          mapping=SortMapping(operator.itemgetter(0)))
    assert_equal(list(transformer.get_epoch_iterator()), expected)
def test_mapping_sort_multisource():
    """Sorting a multi-source example keeps x/y pairs aligned."""
    data = OrderedDict([
        ('x', [[1, 2, 3], [2, 3, 1], [3, 2, 1]]),
        ('y', [[6, 5, 4], [6, 5, 4], [6, 5, 4]]),
    ])
    expected = [([1, 2, 3], [6, 5, 4]),
                ([1, 2, 3], [4, 6, 5]),
                ([1, 2, 3], [4, 5, 6])]
    stream = DataStream(IterableDataset(data))
    wrapper = Mapping(stream, mapping=SortMapping(operator.itemgetter(0)))
    assert list(wrapper.get_epoch_iterator()) == expected
def get_tr_stream(src_vocab, trg_vocab, src_data, trg_data,
                  src_vocab_size=30000, trg_vocab_size=30000, unk_id=1,
                  seq_len=50, batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""
    # Accept either ready-made dicts or paths to pickled vocabularies,
    # and make sure BOS/EOS/UNK ids are present.
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab, 'rb')),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict)
        else cPickle.load(open(trg_vocab, 'rb')),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # Tokenized text files for both sides.
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Pair up source and target sentences.
    s = Merge([src_dataset.get_example_stream(),
               trg_dataset.get_example_stream()],
              ('source', 'target'))

    # Drop pairs with sequences that are too long.
    s = Filter(s, predicate=_too_long(seq_len=seq_len))

    # Map out-of-vocabulary ids onto the UNK id.
    s = Mapping(s, _oov_to_unk(src_vocab_size=src_vocab_size,
                               trg_vocab_size=trg_vocab_size,
                               unk_id=unk_id))

    # Read k batches ahead, sort by length, and re-batch.
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))
    s = Mapping(s, SortMapping(_length))
    s = Unpack(s)
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad short sequences with each vocabulary's EOS id.
    return PaddingWithEOS(s, [src_vocab_size - 1, trg_vocab_size - 1])
def construct_stream(dataset, rng, pool_size, maximum_frames,
                     window_features, **kwargs):
    """Construct data stream.

    Parameters:
    -----------
    dataset : Dataset
        Dataset to use.
    rng : numpy.random.RandomState
        Random number generator.
    pool_size : int
        Pool size for TIMIT dataset.
    maximum_frames : int
        Maximum frames for TIMIT datset.
    subsample : bool, optional
        Subsample features.
    pretrain_alignment : bool, optional
        Use phoneme alignment for pretraining.
    uniform_alignment : bool, optional
        Use uniform alignment for pretraining.
    """
    subsample = kwargs.get('subsample', False)
    pretrain_alignment = kwargs.get('pretrain_alignment', False)
    uniform_alignment = kwargs.get('uniform_alignment', False)

    stream = DataStream(dataset, iteration_scheme=SequentialShuffledScheme(
        dataset.num_examples, pool_size, rng))
    if pretrain_alignment and uniform_alignment:
        stream = AddUniformAlignmentMask(stream)
    stream = Reshape('features', 'features_shapes', data_stream=stream)
    means, stds = dataset.get_normalization_factors()
    stream = Normalize(stream, means, stds)
    if window_features != 1:
        stream = WindowFeatures(stream, 'features', window_features)
    if pretrain_alignment:
        stream = Reshape('alignments', 'alignments_shapes',
                         data_stream=stream)

    # NOTE(review): `key` is not defined in this function; presumably a
    # module-level sort key -- confirm against the enclosing module.
    stream = Mapping(stream, SortMapping(key=key))
    stream = MaximumFrameCache(max_frames=maximum_frames, data_stream=stream,
                               rng=rng)
    stream = Padding(data_stream=stream,
                     mask_sources=['features', 'phonemes'])

    if pretrain_alignment:
        stream = AlignmentPadding(stream, 'alignments')
        stream = Transpose(stream, [(1, 0, 2), (1, 0), (1, 0), (1, 0),
                                    (2, 1, 0)])
    else:
        stream = Transpose(stream, [(1, 0, 2), (1, 0), (1, 0), (1, 0)])

    stream = ForceFloatX(stream)
    if subsample:
        stream = Subsample(stream, 'features', 5)
        stream = Subsample(stream, 'features_mask', 5)
    return stream
def get_sgnmt_tr_stream(src_data, trg_data, src_vocab_size=30000,
                        trg_vocab_size=30000, unk_id=1, seq_len=50,
                        batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the unshuffled training data stream. This corresponds
    to ``get_sgnmt_tr_stream`` in ``machine_translation/stream`` in the
    blocks examples."""
    # TextFile requires a vocabulary, so build dummy identity mappings
    # plus the special token ids.
    src_vocab = add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Pair up source and target sentences.
    tr = Merge([src_dataset.get_example_stream(),
                trg_dataset.get_example_stream()],
               ('source', 'target'))

    # Drop pairs with sequences that are too long.
    tr = Filter(tr, predicate=stream._too_long(seq_len=seq_len))

    # Map out-of-vocabulary ids onto the UNK id.
    tr = Mapping(tr, stream._oov_to_unk(src_vocab_size=src_vocab_size,
                                        trg_vocab_size=trg_vocab_size,
                                        unk_id=utils.UNK_ID))

    # Read k batches ahead, sort by length, and re-batch.
    tr = Batch(tr,
               iteration_scheme=ConstantScheme(batch_size * sort_k_batches))
    tr = Mapping(tr, SortMapping(stream._length))
    tr = Unpack(tr)
    tr = Batch(tr, iteration_scheme=ConstantScheme(batch_size))

    # Pad short sequences with the EOS id.
    return stream.PaddingWithEOS(tr, [utils.EOS_ID, utils.EOS_ID])
def get_stream(self, part, batches=True, shuffle=True, add_sources=(),
               num_examples=None, rng=None, seed=None):
    """Build the data stream for dataset `part`.

    When `batches` is True the result yields padded, time-major batches;
    otherwise single preprocessed examples.
    NOTE(review): `seed` is accepted but never read in this body --
    confirm whether callers rely on it.
    """
    dataset = self.get_dataset(part, add_sources=add_sources)
    # Default to the whole dataset.
    if num_examples is None:
        num_examples = dataset.num_examples

    if shuffle:
        iteration_scheme = ShuffledExampleScheme(num_examples, rng=rng)
    else:
        iteration_scheme = SequentialExampleScheme(num_examples)

    stream = DataStream(
        dataset, iteration_scheme=iteration_scheme)

    # Keep only recordings, labels and any extra requested sources.
    stream = FilterSources(stream, (self.recordings_source,
                                    self.labels_source)+tuple(add_sources))
    if self.add_eos:
        stream = Mapping(stream, _AddLabel(self.eos_label))
    if self.add_bos:
        # Prepend the BOS label `add_bos` times.
        stream = Mapping(stream, _AddLabel(self.bos_label, append=False,
                                           times=self.add_bos))
    if self.preprocess_text:
        stream = Mapping(stream, lvsr.datasets.wsj.preprocess_text)
    stream = Filter(stream, self.length_filter)

    # Local sort-by-length trick: pool k batches, sort, unpack.
    if self.sort_k_batches and batches:
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(
                           self.batch_size * self.sort_k_batches))
        stream = Mapping(stream, SortMapping(_length))
        stream = Unpack(stream)

    if self.preprocess_features == 'log_spectrogram':
        stream = Mapping(
            stream, functools.partial(apply_preprocessing,
                                      log_spectrogram))
    if self.normalization:
        stream = self.normalization.wrap_stream(stream)
    stream = ForceFloatX(stream)
    if not batches:
        return stream

    # Validation uses its own batch size.
    stream = Batch(
        stream,
        iteration_scheme=ConstantScheme(self.batch_size if part == 'train'
                                        else self.validation_batch_size))
    stream = Padding(stream)
    # Convert batch-major to time-major layout.
    stream = Mapping(stream, switch_first_two_axes)
    stream = ForceCContiguous(stream)
    return stream
def create_data_generator(path, vocab_file, config):
    """Return a generator factory over padded QA batches.

    Builds the same sorted/padded pipeline as the stream setup helpers,
    then wraps epoch iteration in a plain Python generator that casts
    the masks to float32.
    """
    dataset = QADataset(path, vocab_file, config.n_entities,
                        need_sep_token=config.concat_ctx_and_question)
    it = QAIterator(path, shuffle=config.shuffle_questions)

    stream = DataStream(dataset, iteration_scheme=it)

    if config.concat_ctx_and_question:
        stream = ConcatCtxAndQuestion(stream, config.concat_question_before,
                                      dataset.reverse_vocab['<SEP>'])

    # Sort sets of multiple batches to make batches of similar sizes.
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       config.batch_size * config.sort_batch_count))
    comparison = _balanced_batch_helper(
        stream.sources.index(
            'question' if config.concat_ctx_and_question else 'context'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=['context', 'question', 'candidates'],
                     mask_dtype='int32')

    def gen():
        # Two layouts: separate context/question sources, or a single
        # concatenated sequence, depending on the config flag.
        if not config.concat_ctx_and_question:
            for (seq_cont, seq_cont_mask, seq_quest, seq_quest_mask,
                 tg, candidates, candidates_mask) \
                    in stream.get_epoch_iterator():
                # Masks are padded as int32; downstream consumers
                # expect float32.
                seq_cont_mask = seq_cont_mask.astype('float32')
                seq_quest_mask = seq_quest_mask.astype('float32')
                candidates_mask = candidates_mask.astype('float32')

                yield (seq_cont, seq_cont_mask, seq_quest, seq_quest_mask,
                       tg, candidates, candidates_mask)
        else:
            for (seq, seq_mask, tg, candidates, candidates_mask) \
                    in stream.get_epoch_iterator():
                seq_mask = seq_mask.astype('float32')
                candidates_mask = candidates_mask.astype('float32')

                yield (seq, seq_mask, tg, candidates, candidates_mask)

    return gen
def timit_datastream(path, which_set, local_copy, pool_size, maximum_frames):
    """TIMIT stream: reshape, normalize, sort, frame-cache, pad, floatX."""
    timit_dataset = Timit(which_set=which_set, path=path,
                          local_copy=local_copy)
    data_means, data_stds = timit_dataset.get_normalization_factors()

    # Fixed seed so the pooled shuffling is reproducible.
    shuffle_rng = numpy.random.RandomState(123)
    iterator_scheme = SequentialShuffledScheme(
        num_examples=timit_dataset.num_examples,
        batch_size=pool_size,
        rng=shuffle_rng)

    base_stream = DataStream(dataset=timit_dataset,
                             iteration_scheme=iterator_scheme)

    # Un-flatten 'features' using the per-example 'features_shapes' source.
    reshape_stream = Reshape(data_source='features',
                             shape_source='features_shapes',
                             data_stream=base_stream)

    normalize_stream = Normalize(data_stream=reshape_stream,
                                 means=data_means, stds=data_stds)

    # Sort each pool of examples by number of frames.
    sort_stream = Mapping(data_stream=normalize_stream,
                          mapping=SortMapping(key=lambda x: x[0].shape[0]))

    max_frame_stream = MaximumFrameCache(max_frames=maximum_frames,
                                         data_stream=sort_stream,
                                         rng=shuffle_rng)

    padded_stream = Padding(data_stream=max_frame_stream,
                            mask_sources=['features', 'phonemes'])
    data_stream = ForceFloatX(padded_stream)

    return timit_dataset, data_stream
def get_sgnmt_shuffled_tr_stream(src_data, trg_data, src_vocab_size=30000,
                                 trg_vocab_size=30000, unk_id=1, seq_len=50,
                                 batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the shuffled training data stream. This is similar to
    ``get_sgnmt_tr_stream`` but uses ``ParallelTextFile`` in combination
    with ``ShuffledExampleScheme`` to support reshuffling."""
    # TextFile requires a vocabulary, so build dummy identity mappings
    # plus the special token ids.
    src_vocab = add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    parallel_dataset = ParallelTextFile(src_data, trg_data,
                                        src_vocab, trg_vocab, None)
    iter_scheme = ShuffledExampleScheme(parallel_dataset.num_examples)
    tr = DataStream(parallel_dataset, iteration_scheme=iter_scheme)

    # Drop pairs with sequences that are too long.
    tr = Filter(tr, predicate=stream._too_long(seq_len=seq_len))

    # Map out-of-vocabulary ids onto the UNK id.
    tr = Mapping(tr, stream._oov_to_unk(src_vocab_size=src_vocab_size,
                                        trg_vocab_size=trg_vocab_size,
                                        unk_id=utils.UNK_ID))

    # Read k batches ahead, sort by length, and re-batch.
    tr = Batch(tr,
               iteration_scheme=ConstantScheme(batch_size * sort_k_batches))
    tr = Mapping(tr, SortMapping(stream._length))
    tr = Unpack(tr)
    tr = Batch(tr, iteration_scheme=ConstantScheme(batch_size))

    # Pad short sequences with the EOS id.
    return stream.PaddingWithEOS(tr, [utils.EOS_ID, utils.EOS_ID])
def setup_sorter_datastream(path, config):
    """Build the (dataset, stream) pair for the toy sorting task."""
    dataset = SorterDataset(path)
    scheme = ShuffledExampleScheme(examples=config.example_count)
    stream = DataStream(dataset, iteration_scheme=scheme)

    # Pool several batches, sort by length of the unsorted sequence,
    # then re-batch so batches hold similarly sized examples.
    stream = Batch(stream, iteration_scheme=ConstantScheme(
        config.batch_size * config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('unsorted'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream, mask_sources=['answer', 'unsorted'],
                     mask_dtype='int32')
    return dataset, stream
def test_data_stream_mapping_sort_multisource_ndarrays():
    """Sorting ndarray examples keeps x/y pairs aligned."""
    data = OrderedDict([
        ('x', [numpy.array([1, 2, 3]),
               numpy.array([2, 3, 1]),
               numpy.array([3, 2, 1])]),
        ('y', [numpy.array([6, 5, 4]),
               numpy.array([6, 5, 4]),
               numpy.array([6, 5, 4])]),
    ])
    expected = [(numpy.array([1, 2, 3]), numpy.array([6, 5, 4])),
                (numpy.array([1, 2, 3]), numpy.array([4, 6, 5])),
                (numpy.array([1, 2, 3]), numpy.array([4, 5, 6]))]
    stream = DataStream(IterableDataset(data))
    wrapper = Mapping(stream, mapping=SortMapping(operator.itemgetter(0)))
    for got, want in zip(wrapper.get_epoch_iterator(), expected):
        assert len(got) == len(want)
        assert (got[0] == want[0]).all()
        assert (got[1] == want[1]).all()
def load_parallel_data(src_file, tgt_file, batch_size, sort_k_batches,
                       dictionary, training=False):
    """Character-level parallel stream; length-sorted batching in training."""
    def preproc(s):
        # Normalize LaTeX-style quotes to plain double quotes.
        s = s.replace('``', '"')
        s = s.replace('\'\'', '"')
        return s

    enc_dset = TextFile(files=[src_file], dictionary=dictionary,
                        bos_token=None, eos_token=None,
                        unk_token=CHAR_UNK_TOK, level='character',
                        preprocess=preproc)
    dec_dset = TextFile(files=[tgt_file], dictionary=dictionary,
                        bos_token=CHAR_SOS_TOK, eos_token=CHAR_EOS_TOK,
                        unk_token=CHAR_UNK_TOK, level='character',
                        preprocess=preproc)

    # NOTE merge encoder and decoder setup together
    stream = Merge([enc_dset.get_example_stream(),
                    dec_dset.get_example_stream()],
                   ('source', 'target'))

    if training:
        # Drop pairs with sequences that are too long, then apply the
        # pool/sort-by-target-length/unpack trick before batching.
        stream = Filter(stream, predicate=TooLong(seq_len=CHAR_MAX_SEQ_LEN))
        stream = Batch(stream, iteration_scheme=ConstantScheme(
            batch_size * sort_k_batches))
        stream = Mapping(stream, SortMapping(lambda x: len(x[1])))
        stream = Unpack(stream)

    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
    return Padding(stream)
def setup_toy_datastream(config):
    """Build the (dataset, stream) pair for the toy QA task."""
    dataset = ToyDataset()
    iterator = ToyIterator()
    stream = DataStream(dataset, iteration_scheme=iterator)

    # Pool several batches, sort by context length, then re-batch so
    # that each final batch holds examples of similar size.
    stream = Batch(stream, iteration_scheme=ConstantScheme(
        config.batch_size * config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('context'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=['context', 'question', 'answer',
                                   'ans_indices'],
                     mask_dtype='int32')
    return dataset, stream
def setup_cnnsquad_datastream(sq_path, cnn_path, vocab_file, config):
    """Build the (dataset, stream) pair for the mixed CNN+SQuAD corpus."""
    dataset = CNNSQDataset(sq_path, cnn_path, vocab_file)
    iterator = CNNSQIterator(sq_path, cnn_path,
                             cnn_ratio=config.add_cnn_data)
    stream = DataStream(dataset, iteration_scheme=iterator)

    # Pool several batches, sort by context length, then re-batch so
    # that each final batch holds examples of similar size.
    stream = Batch(stream, iteration_scheme=ConstantScheme(
        config.batch_size * config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('context'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=['context', 'question', 'answer'],
                     mask_dtype='int32')
    return dataset, stream
def stream_handwriting(
        which_sets, batch_size, seq_size, num_letters, sorting_mult=20):
    """Build the handwriting stream: length-sorted batches, padded,
    transposed to time-major and segmented into sub-sequences.

    `num_letters` is accepted for interface compatibility but is not
    read in this body -- presumably used by callers or earlier versions.
    """
    assert sorting_mult > 0

    dataset = Handwriting(which_sets)
    sorting_size = batch_size * sorting_mult
    # Truncate to a whole number of sorting pools.  Use // so the
    # integer division stays truncating on Python 3 as well (plain /
    # already floors on Python 2 ints, so behavior is unchanged there).
    num_examples = sorting_size * (dataset.num_examples // sorting_size)

    # print(...) with a single argument is valid and identical on both
    # Python 2 and Python 3, unlike the old print statement.
    if which_sets == ('train',):
        print("Random order.")
        scheme = ShuffledExampleScheme(num_examples)
    else:
        print("Sequential order.")
        scheme = SequentialExampleScheme(num_examples)

    data_stream = DataStream.default_stream(dataset, iteration_scheme=scheme)

    # Sort by length of the data sequence: pool, sort, unpack, re-batch.
    data_stream = Batch(data_stream,
                        iteration_scheme=ConstantScheme(sorting_size))
    data_stream = Mapping(data_stream, SortMapping(_length))
    data_stream = Unpack(data_stream)
    data_stream = Batch(data_stream,
                        iteration_scheme=ConstantScheme(batch_size))

    data_stream = Padding(data_stream)
    # Batch-major -> time-major for the recurrent model.
    data_stream = SourceMapping(
        data_stream, _transpose, which_sources=('features', 'features_mask'))
    # Cut long sequences into overlapping segments of seq_size steps
    # (+1 so consecutive segments share the boundary value).
    data_stream = SegmentSequence(
        data_stream,
        seq_size=seq_size + 1,
        share_value=True,
        return_last=True,
        which_sources=('features', 'features_mask'),
        add_flag=True)

    return data_stream
def _get_sgnmt_tr_stream(data_stream, src_vocab_size=30000,
                         trg_vocab_size=30000, seq_len=50, batch_size=80,
                         sort_k_batches=12, src_sparse_feat_map='',
                         trg_sparse_feat_map='', **kwargs):
    """Prepares the raw text file stream ``data_stream`` for the Blocks
    main loop. This includes handling UNKs, splitting ino batches, sort
    locally by sequence length, and masking. This roughly corresponds to
    ``get_sgnmt_tr_stream`` in ``machine_translation/stream`` in the
    blocks examples. The arguments to this method are given by the
    configuration dict.
    """
    # Drop pairs with sequences that are too long.  OOV replacement is
    # already handled inside the `DataSet`s.
    tr = Filter(data_stream, predicate=stream._too_long(seq_len=seq_len))

    # Read k batches ahead, sort by length, and re-batch.
    tr = Batch(tr,
               iteration_scheme=ConstantScheme(batch_size * sort_k_batches))
    tr = Mapping(tr, SortMapping(stream._length))
    tr = Unpack(tr)
    tr = Batch(tr, iteration_scheme=ConstantScheme(batch_size))

    # Pad short sequences with the EOS id.
    return stream.PaddingWithEOS(tr, [utils.EOS_ID, utils.EOS_ID])
def get_stream(self, part, batches=True, shuffle=True, add_sources=()):
    """Build the data stream for dataset `part`.

    When `batches` is True the result yields padded, time-major batches;
    otherwise single preprocessed examples.
    """
    dataset = self.get_dataset(part, add_sources=add_sources)
    stream = (DataStream(dataset,
                         iteration_scheme=ShuffledExampleScheme(dataset.num_examples))
              if shuffle
              else dataset.get_example_stream())

    # Keep only recordings, labels and any extra requested sources.
    stream = FilterSources(stream, (self.recordings_source,
                                    self.labels_source)+tuple(add_sources))
    if self.add_eos:
        # EOS at both ends or only at the end, per configuration.
        if self.prepend_eos:
            stream = Mapping(stream, _AddEosLabelBeginEnd(self.eos_label))
        else:
            stream = Mapping(stream, _AddEosLabelEnd(self.eos_label))
    if self.preprocess_text:
        stream = Mapping(stream, lvsr.datasets.wsj.preprocess_text)
    stream = Filter(stream, self.length_filter)

    # Local sort-by-length trick: pool k batches, sort, unpack.
    if self.sort_k_batches and batches:
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(
                           self.batch_size * self.sort_k_batches))
        stream = Mapping(stream, SortMapping(_length))
        stream = Unpack(stream)

    if self.preprocess_features == 'log_spectrogram':
        stream = Mapping(
            stream, functools.partial(apply_preprocessing,
                                      log_spectrogram))
    if self.normalization:
        stream = self.normalization.wrap_stream(stream)
    stream = ForceFloatX(stream)
    if not batches:
        return stream

    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(self.batch_size))
    stream = Padding(stream)
    # Convert batch-major to time-major layout.
    stream = Mapping(stream, switch_first_two_axes)
    stream = ForceCContiguous(stream)
    return stream
def _get_stream_from_lines(vocab, lines, preprocess=to_lower_case,
                           vocab_size=30000, eos_id=0, eos='</S>',
                           unk_id=1, batch_size=80, sort_k_batches=12):
    """Turn raw text lines into a length-sorted, EOS-padded batch stream."""
    if preprocess is not None:
        # Preprocess each line and append the EOS marker word.
        lines = [preprocess(line) + ' ' + eos for line in lines]

    dataset = IterableDataset(iterables=lines)
    s = DataStream(dataset)

    # Tokenize: map each word to its id, unknown words to unk_id.
    s = Mapping(
        s,
        lambda x: ([vocab[w] if w in vocab else unk_id
                    for w in x[0].split()], ))

    # Clamp ids beyond the effective vocabulary to unk_id.
    if vocab_size < len(vocab):
        s = Mapping(s, _oov_to_unk(vocab_size=vocab_size, unk_id=unk_id))

    # Read k batches ahead, sort by length, and re-batch.
    s = Batch(s,
              iteration_scheme=ConstantScheme(batch_size * sort_k_batches))
    s = Mapping(s, SortMapping(_length(target_source_index=0)))
    s = Unpack(s)
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad short sequences with the EOS id.
    return _PaddingWithToken(s, eos_id)
def construct_stream(dataset, rng, batch_size, n_batches=None, **kwargs):
    """Construct data stream.

    Parameters:
    -----------
    dataset : Dataset
        Dataset to use.
    rng : numpy.random.RandomState
        Random number generator.
    batch_size : int
        Size of the batch
    n_batches : int
        Number of batchs to update population statistics.
    """
    # Either a fixed number of batches or one pass over the dataset.
    if n_batches is not None:
        scheme = ShuffledScheme(n_batches * batch_size,
                                batch_size=batch_size)
    else:
        scheme = ShuffledScheme(dataset.num_examples, batch_size=batch_size)

    stream = DataStream(dataset, iteration_scheme=scheme)
    # NOTE(review): `key` is not defined in this function; presumably a
    # module-level sort key -- confirm against the enclosing module.
    stream = Mapping(stream, SortMapping(key=key))
    stream = Padding(data_stream=stream,
                     mask_sources=['features', 'phonemes'])
    stream = Transpose(stream, [(1, 0, 2), (1, 0), (1, 0), (1, 0)])
    return stream
def get_tr_stream(path, src_eos_idx, phones_sil, tgt_eos_idx, seq_len=50,
                  batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""
    sources = ('words', 'audio', 'words_ends', 'punctuation_marks',
               'phones', 'phones_words_ends', 'phones_words_acoustic_ends')
    dataset = H5PYDataset(path, which_sets=('train',), sources=sources,
                          load_in_memory=False)

    # print(...) with a single argument is valid and identical on both
    # Python 2 and Python 3, unlike the old print statement.
    print("creating example stream")
    stream = dataset.get_example_stream()
    print("example stream created")

    # Filter sequences that are too long.
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # Build a batched version of stream to read k batches ahead,
    # sort all samples in the read-ahead batch, then convert back
    # into a stream of single examples.
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size*sort_k_batches))
    stream = Mapping(stream, SortMapping(_length))
    stream = Unpack(stream)

    # Construct batches from the stream with the specified batch size.
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad short sequences, using a per-source padding value.
    masked_stream = PaddingWithEOS(stream, {
        'words': src_eos_idx,
        'phones': phones_sil,
        'punctuation_marks': tgt_eos_idx,
        'audio': 0,
        'words_ends': -1,
        'phones_words_ends': -1,
        'phones_words_acoustic_ends': -1,
    })

    return masked_stream