def test_ngram_stream_error_on_multiple_sources():
    # Check that NGram accepts only data streams with one source
    sentences = [list(numpy.random.randint(10, size=sentence_length))
                 for sentence_length in [3, 5, 7]]
    stream = DataStream(IterableDataset(sentences))
    stream.sources = ('1', '2')
    assert_raises(ValueError, NGrams, 4, stream)

def load_datastream(train_batch_size=100):
    from fuel.datasets.mnist import MNIST
    from fuel.transformers import ScaleAndShift, Cast, Flatten, Mapping
    from fuel.streams import DataStream
    from fuel.schemes import SequentialScheme, ShuffledScheme

    MNIST.default_transformers = (
        (ScaleAndShift, [2.0 / 255.0, -1], {'which_sources': 'features'}),
        (Cast, [np.float32], {'which_sources': 'features'}),
    )

    mnist_train = MNIST(('train',), subset=slice(None, 50000))
    mnist_train_stream = DataStream.default_stream(
        mnist_train,
        iteration_scheme=ShuffledScheme(mnist_train.num_examples,
                                        train_batch_size)
    )

    mnist_validation = MNIST(('train',), subset=slice(50000, None))
    mnist_validation_stream = DataStream.default_stream(
        mnist_validation,
        iteration_scheme=SequentialScheme(mnist_validation.num_examples, 250)
    )

    mnist_test = MNIST(('test',))
    mnist_test_stream = DataStream.default_stream(
        mnist_test,
        iteration_scheme=SequentialScheme(mnist_test.num_examples, 250)
    )

    return {
        'train': mnist_train_stream,
        'validation': mnist_validation_stream,
        'test': mnist_test_stream
    }

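# Illustrative usage of load_datastream above (a sketch; it assumes Fuel's
# MNIST HDF5 file is available locally). Each training batch yields
# (features, targets), with features cast to float32 and scaled to [-1, 1].
streams = load_datastream(train_batch_size=100)
for features, targets in streams['train'].get_epoch_iterator():
    pass  # features: (batch, 1, 28, 28) float32 in [-1, 1]; targets: digit labels
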
def get_streams(num_train_examples, batch_size, use_test=True):
    dataset = MNIST(("train",))
    all_ind = numpy.arange(dataset.num_examples)
    rng = numpy.random.RandomState(seed=1)
    rng.shuffle(all_ind)

    indices_train = all_ind[:num_train_examples]
    indices_valid = all_ind[num_train_examples:]

    train_stream = Flatten(
        DataStream.default_stream(
            dataset,
            iteration_scheme=ShuffledScheme(indices_train, batch_size)))

    valid_stream = None
    if len(indices_valid) != 0:
        valid_stream = Flatten(
            DataStream.default_stream(
                dataset,
                iteration_scheme=ShuffledScheme(indices_valid, batch_size)))

    test_stream = None
    if use_test:
        dataset = MNIST(("test",))
        ind = numpy.arange(dataset.num_examples)
        rng = numpy.random.RandomState(seed=1)
        rng.shuffle(ind)
        test_stream = Flatten(
            DataStream.default_stream(
                dataset,
                iteration_scheme=ShuffledScheme(ind, batch_size)))

    return train_stream, valid_stream, test_stream

def test_in_memory():
    skip_if_not_available(datasets=['mnist.hdf5'])
    # Load MNIST and get two batches
    mnist = MNIST(('train',), load_in_memory=True)
    data_stream = DataStream(mnist, iteration_scheme=SequentialScheme(
        examples=mnist.num_examples, batch_size=256))
    epoch = data_stream.get_epoch_iterator()
    for i, (features, targets) in enumerate(epoch):
        if i == 1:
            break
    handle = mnist.open()
    known_features, _ = mnist.get_data(handle, slice(256, 512))
    mnist.close(handle)
    assert numpy.all(features == known_features)

    # Pickle the epoch and make sure that the data wasn't dumped
    with tempfile.NamedTemporaryFile(delete=False) as f:
        filename = f.name
        cPickle.dump(epoch, f)
    assert os.path.getsize(filename) < 1024 * 1024  # Less than 1MB

    # Reload the epoch and make sure that the state was maintained
    del epoch
    with open(filename, 'rb') as f:
        epoch = cPickle.load(f)
    features, targets = next(epoch)
    handle = mnist.open()
    known_features, _ = mnist.get_data(handle, slice(512, 768))
    mnist.close(handle)
    assert numpy.all(features == known_features)

def maxout_vae_mnist_test(path_vae_mnist):
    # load vae model on mnist
    vae_mnist = load(path_vae_mnist)
    maxout = Maxout()
    x = T.matrix('features')
    y = T.imatrix('targets')
    batch_size = 128
    z, _ = vae_mnist.sampler.sample(vae_mnist.encoder_mlp.apply(x))
    predict = maxout.apply(z)

    cost = Softmax().categorical_cross_entropy(y.flatten(), predict)
    y_hat = Softmax().apply(predict)
    cost.name = 'cost'
    cg = ComputationGraph(cost)

    temp = cg.parameters
    for t, i in zip(temp, range(len(temp))):
        t.name = t.name + str(i) + "maxout"

    error_brick = MisclassificationRate()
    error_rate = error_brick.apply(y, y_hat)

    # training
    step_rule = RMSProp(0.01, 0.9)
    # step_rule = Momentum(0.2, 0.9)
    train_set = MNIST('train')
    test_set = MNIST("test")

    data_stream_train = Flatten(DataStream.default_stream(
        train_set,
        iteration_scheme=SequentialScheme(train_set.num_examples, batch_size)))
    data_stream_test = Flatten(DataStream.default_stream(
        test_set,
        iteration_scheme=SequentialScheme(test_set.num_examples, batch_size)))

    algorithm = GradientDescent(cost=cost, params=cg.parameters,
                                step_rule=step_rule)

    monitor_train = TrainingDataMonitoring(
        variables=[cost], data_stream=data_stream_train, prefix="train")
    monitor_valid = DataStreamMonitoring(
        variables=[cost, error_rate], data_stream=data_stream_test,
        prefix="test")

    extensions = [monitor_train,
                  monitor_valid,
                  FinishAfter(after_n_epochs=50),
                  Printing(every_n_epochs=1)]

    main_loop = MainLoop(data_stream=data_stream_train,
                         algorithm=algorithm,
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()

    # save here
    from blocks.serialization import dump
    with closing(open('../data_mnist/maxout', 'w')) as f:
        dump(maxout, f)

def test_unpack_transformer():
    data = range(10)
    stream = DataStream(IterableDataset(data))
    stream = Batch(stream, iteration_scheme=ConstantScheme(2))
    stream = Unpack(stream)
    epoch = stream.get_epoch_iterator()
    for i, v in enumerate(epoch):
        assert numpy.shape(v)[0] == 1
        assert v[0] == i

def prepare_cifar10():
    class Dataset:
        pass

    result = Dataset()

    CIFAR10.default_transformers = (
        (ScaleAndShift, [2.0 / 255.0, -1], {'which_sources': 'features'}),
        (Cast, [np.float32], {'which_sources': 'features'}))

    mean = cifar10_mean()

    def patch_get_epoch_iterator(stream):
        def get_epoch_iterator(self):
            for X, Y in self._get_epoch_iterator():
                # 0 degrees
                X -= mean[numpy.newaxis, :, :, :]
                yield augument(X, 25), Y
        stream._get_epoch_iterator = stream.get_epoch_iterator
        stream.get_epoch_iterator = types.MethodType(get_epoch_iterator, stream)

    def patch_get_epoch_iterator_test(stream):
        def get_epoch_iterator(self):
            for X, Y in self._get_epoch_iterator():
                # 0 degrees
                X -= mean[numpy.newaxis, :, :, :]
                yield X, Y
        stream._get_epoch_iterator = stream.get_epoch_iterator
        stream.get_epoch_iterator = types.MethodType(get_epoch_iterator, stream)

    result.train = train = CIFAR10(("train",), subset=slice(None, 40000))
    result.train_stream = DataStream.default_stream(
        result.train,
        iteration_scheme=ShuffledScheme(result.train.num_examples, 25))
    patch_get_epoch_iterator(result.train_stream)

    result.validation = CIFAR10(("train",), subset=slice(40000, None))
    result.validation_stream = DataStream.default_stream(
        result.validation,
        iteration_scheme=SequentialScheme(result.validation.num_examples, 100))
    patch_get_epoch_iterator(result.validation_stream)

    result.test = CIFAR10(("test",))
    result.test_stream = DataStream.default_stream(
        result.test,
        iteration_scheme=SequentialScheme(result.test.num_examples, 100))
    patch_get_epoch_iterator_test(result.test_stream)

    return result

def test_get_data_inmemory(self):
    ds = ImagesFromFile('{}/*.jpeg'.format(self.ds_path), load_in_memory=True)
    stream = DataStream(ds, iteration_scheme=ShuffledScheme(
        ds.num_examples, batch_size=10))
    for imgs, file_paths in stream.get_epoch_iterator():
        assert len(imgs) == 5
        assert all([img.shape == (512, 512, 3) for img in imgs])
        assert all([self.ds_path in fp for fp in file_paths])
        assert len(set(file_paths)) == 5

def test_batch_iteration_scheme_with_lists(self):
    """Batch schemes should work with more than ndarrays."""
    data = IndexableDataset(OrderedDict([('foo', list(range(50))),
                                         ('bar', list(range(1, 51)))]))
    stream = DataStream(data,
                        iteration_scheme=ShuffledScheme(data.num_examples, 5))
    returned = [sum(batches, []) for batches in
                zip(*list(stream.get_epoch_iterator()))]
    assert set(returned[0]) == set(range(50))
    assert set(returned[1]) == set(range(1, 51))

def __init__(self, dataset, batch_size=128, shuffle=False):
    self.dataset = dataset
    if shuffle:
        self.datastream = DataStream(
            self.dataset,
            iteration_scheme=ShuffledScheme(examples=dataset.num_examples,
                                            batch_size=batch_size))
    else:
        self.datastream = DataStream(
            self.dataset,
            iteration_scheme=SequentialScheme(examples=dataset.num_examples,
                                              batch_size=batch_size))

def main(save_to, num_epochs, bokeh=False):
    mlp = MLP([Tanh(), Softmax()], [784, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    probs = mlp.apply(tensor.flatten(x, outdim=2))
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)

    cg = ComputationGraph([cost])
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + .00005 * (W1 ** 2).sum() + .00005 * (W2 ** 2).sum()
    cost.name = 'final_cost'

    mnist_train = MNIST("train")
    mnist_test = MNIST("test")

    algorithm = GradientDescent(
        cost=cost, params=cg.parameters,
        step_rule=Scale(learning_rate=0.1))
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs),
                  DataStreamMonitoring(
                      [cost, error_rate],
                      Flatten(
                          DataStream.default_stream(
                              mnist_test,
                              iteration_scheme=SequentialScheme(
                                  mnist_test.num_examples, 500)),
                          which_sources=('features',)),
                      prefix="test"),
                  TrainingDataMonitoring(
                      [cost, error_rate,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      after_epoch=True),
                  Checkpoint(save_to),
                  Printing()]

    main_loop = MainLoop(
        algorithm,
        Flatten(
            DataStream.default_stream(
                mnist_train,
                iteration_scheme=SequentialScheme(
                    mnist_train.num_examples, 50)),
            which_sources=('features',)),
        model=Model(cost),
        extensions=extensions)

    main_loop.run()

def test_dataset():
    data = [1, 2, 3]
    # The default stream requests an example at a time
    stream = DataStream(IterableDataset(data))
    epoch = stream.get_epoch_iterator()
    assert list(epoch) == list(zip(data))

    # Check if iterating over multiple epochs works
    for i, epoch in zip(range(2), stream.iterate_epochs()):
        assert list(epoch) == list(zip(data))

    # Check whether the returning as a dictionary of sources works
    assert next(stream.get_epoch_iterator(as_dict=True)) == {"data": 1}

def batch_iterator(self, dataset, batchsize, shuffle=False):
    if isinstance(dataset, Dataset):
        if shuffle:
            train_scheme = ShuffledScheme(examples=dataset.num_examples,
                                          batch_size=batchsize)
        else:
            train_scheme = SequentialScheme(examples=dataset.num_examples,
                                            batch_size=batchsize)
        stream = DataStream(dataset=dataset, iteration_scheme=train_scheme)
        if self.fuel_stream_xform_fn is not None:
            stream = self.fuel_stream_xform_fn(stream)
        return stream.get_epoch_iterator()
    elif _is_sequence_of_arrays(dataset):
        return iterate_minibatches(dataset, batchsize, shuffle=shuffle)
    else:
        raise TypeError('dataset should be a fuel Dataset instance '
                        'or a list of arrays')

def prep_dataset(dataset):
    dataset = dataset[:(len(dataset) - (len(dataset) % (seq_len * batch_size)))]
    dataset = dataset.reshape(batch_size, -1, seq_len).transpose((1, 0, 2))

    stream = DataStream(
        IndexableDataset(indexables=OrderedDict([('data', dataset)])),
        iteration_scheme=SequentialExampleScheme(dataset.shape[0]))
    stream = Transpose(stream, [(1, 0)])
    stream = SampleDropsNPWord(
        stream, z_prob_states, z_prob_cells, drop_prob_igates,
        layer_size, num_layers, False, stoch_depth, share_mask,
        gaussian_drop, alphabetsize)
    stream.sources = (('data',) * 3 + stream.sources +
                      ('zoneouts_states', 'zoneouts_cells', 'zoneouts_igates'))
    return (stream,)

def get_stream(batch_size, input_size, test=False):
    from fuel.datasets.dogs_vs_cats import DogsVsCats
    from fuel.streams import DataStream
    from fuel.schemes import ShuffledScheme, SequentialScheme, SequentialExampleScheme
    from fuel.transformers.image import RandomFixedSizeCrop
    from fuel.transformers import Flatten  # , ForceFloatX
    from ScikitResize import ScikitResize
    from fuel.transformers import Cast

    # Load the training set
    if test:
        train = DogsVsCats(('train',), subset=slice(0, 30))
        valid = DogsVsCats(('train',), subset=slice(19980, 20000))
        test = DogsVsCats(('test',), subset=slice(0, 4))
    else:
        train = DogsVsCats(('train',), subset=slice(0, 22000))
        valid = DogsVsCats(('train',), subset=slice(22000, 25000))
        test = DogsVsCats(('test',))

    # Generating streams
    train_stream = DataStream.default_stream(
        train,
        iteration_scheme=ShuffledScheme(train.num_examples, batch_size)
    )
    valid_stream = DataStream.default_stream(
        valid,
        iteration_scheme=ShuffledScheme(valid.num_examples, batch_size)
    )
    test_stream = DataStream.default_stream(
        test,
        iteration_scheme=SequentialScheme(test.num_examples, 1)
        # iteration_scheme=SequentialExampleScheme(test.num_examples)
    )

    # Reshaping procedure: apply crop and resize to the desired square shape
    train_stream = ScikitResize(train_stream, input_size,
                                which_sources=('image_features',))
    valid_stream = ScikitResize(valid_stream, input_size,
                                which_sources=('image_features',))
    test_stream = ScikitResize(test_stream, input_size,
                               which_sources=('image_features',))

    # ForceFloatX, to spare you from possible bugs
    # train_stream = ForceFloatX(train_stream)
    # valid_stream = ForceFloatX(valid_stream)
    # test_stream = ForceFloatX(test_stream)

    # Cast instead of ForceFloatX
    train_stream = Cast(train_stream, dtype='float32',
                        which_sources=('image_features',))
    valid_stream = Cast(valid_stream, dtype='float32',
                        which_sources=('image_features',))
    test_stream = Cast(test_stream, dtype='float32',
                       which_sources=('image_features',))

    return train_stream, valid_stream, test_stream

def test_h5py_dataset_datastream_pickles():
    try:
        h5file = h5py.File(name='tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (10, 5), dtype='float32')
        features[...] = numpy.arange(50, dtype='float32').reshape((10, 5))
        split_dict = {'train': {'features': (0, 10)}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        dataset = H5PYDataset(path='tmp.hdf5', which_set='train')
        stream = DataStream(dataset)
        pickle.loads(pickle.dumps(stream))
    finally:
        stream.close()
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')

def monk_music_stream(which_sets=('train',), batch_size=64, seq_size=128,
                      frame_size=160, num_examples=None,
                      which_sources=('features',)):
    """Generate the stream for the monk_music dataset.

    It doesn't compute incremental windows and instead simply separates the
    dataset into sequences.
    """
    dataset = MonkMusic(which_sets=which_sets, filename="dataset.hdf5",
                        load_in_memory=True)

    large_batch_size = batch_size * frame_size * seq_size
    if not num_examples:
        num_examples = large_batch_size * (dataset.num_examples /
                                           large_batch_size)

    # If there are memory problems revert to SequentialScheme
    data_stream = DataStream.default_stream(
        dataset,
        iteration_scheme=SequentialScheme(num_examples, large_batch_size))

    data_stream = ScaleAndShift(
        data_stream,
        scale=1. / data_stats["std"],
        shift=-data_stats["mean"] / data_stats["std"])

    data_stream = Mapping(
        data_stream,
        lambda data: _get_subsequences(data, batch_size, seq_size, frame_size))

    data_stream = ForceFloatX(data_stream)

    return data_stream

def get_mnist_video_streams(batch_size):
    train_dataset = ClutteredMNISTVideo(which_sets=["train"])
    valid_dataset = ClutteredMNISTVideo(which_sets=["valid"])
    train_ind = numpy.arange(train_dataset.num_examples)
    valid_ind = numpy.arange(valid_dataset.num_examples)
    rng = numpy.random.RandomState(seed=1)
    rng.shuffle(train_ind)
    rng.shuffle(valid_ind)

    train_datastream = DataStream.default_stream(
        train_dataset,
        iteration_scheme=ShuffledScheme(train_ind, batch_size))
    train_datastream = PreprocessTransformer(train_datastream)

    valid_datastream = DataStream.default_stream(
        valid_dataset,
        iteration_scheme=ShuffledScheme(valid_ind, batch_size))
    valid_datastream = PreprocessTransformer(valid_datastream)

    return train_datastream, valid_datastream

def create_dataset(dataset):
    if trainning:
        scheme = ShuffledScheme(dataset.num_examples, 32)
    else:
        scheme = SequentialScheme(dataset.num_examples, 32)
    stream = DataStream.default_stream(dataset, iteration_scheme=scheme)
    return ResizeTransformer(stream, image_size)

def test_cifar100():
    train = CIFAR100('train', load_in_memory=False)
    assert train.num_examples == 50000
    handle = train.open()
    coarse_labels, features, fine_labels = train.get_data(
        handle, slice(49990, 50000))
    assert features.shape == (10, 3, 32, 32)
    assert coarse_labels.shape == (10, 1)
    assert fine_labels.shape == (10, 1)
    train.close(handle)

    test = CIFAR100('test', load_in_memory=False)
    handle = test.open()
    coarse_labels, features, fine_labels = test.get_data(handle, slice(0, 10))
    assert features.shape == (10, 3, 32, 32)
    assert coarse_labels.shape == (10, 1)
    assert fine_labels.shape == (10, 1)
    assert features.dtype == numpy.uint8
    assert coarse_labels.dtype == numpy.uint8
    assert fine_labels.dtype == numpy.uint8
    test.close(handle)

    stream = DataStream.default_stream(
        test, iteration_scheme=SequentialScheme(10, 10))
    data = next(stream.get_epoch_iterator())[1]
    assert data.min() >= 0.0 and data.max() <= 1.0
    assert data.dtype == config.floatX

    assert_raises(ValueError, CIFAR100, 'valid')

def DStream(datatype, config):
    if datatype == 'train':
        filename = config['train_file']
    elif datatype == 'valid':
        filename = config['valid_file']
    elif datatype == 'test':
        filename = config['test_file']
    else:
        logger.error('wrong datatype, train, valid, or test')

    data = TextFile(files=[filename],
                    dictionary=pickle.load(open(config['train_dic'], 'rb')),
                    unk_token=config['unk_token'],
                    level='word',
                    bos_token=config['bos_token'],
                    eos_token=config['eos_token'])

    data_stream = DataStream.default_stream(data)
    data_stream.sources = ('sentence',)

    # organize data in batches and pad shorter sequences with zeros
    batch_size = config['batch_size']
    data_stream = Batch(data_stream,
                        iteration_scheme=ConstantScheme(batch_size))
    data_stream = Padding(data_stream)
    return data_stream

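# A minimal sketch of consuming DStream above. The config keys mirror the ones
# the function reads; the file paths and token strings are placeholders, not
# values from the original project. After Padding, each batch carries a mask
# alongside the padded word indices.
config = {'train_file': 'train.txt', 'valid_file': 'valid.txt',
          'test_file': 'test.txt', 'train_dic': 'vocab.pkl',
          'unk_token': '<UNK>', 'bos_token': '<S>', 'eos_token': '</S>',
          'batch_size': 32}
for sentence, sentence_mask in DStream('train', config).get_epoch_iterator():
    pass  # sentence: padded batch of word indices; sentence_mask marks real tokens
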
def test_cifar10():
    train = CIFAR10(('train',), load_in_memory=False)
    assert train.num_examples == 50000
    handle = train.open()
    features, targets = train.get_data(handle, slice(49990, 50000))
    assert features.shape == (10, 3, 32, 32)
    assert targets.shape == (10, 1)
    train.close(handle)

    test = CIFAR10(('test',), load_in_memory=False)
    handle = test.open()
    features, targets = test.get_data(handle, slice(0, 10))
    assert features.shape == (10, 3, 32, 32)
    assert targets.shape == (10, 1)
    assert features.dtype == numpy.uint8
    assert targets.dtype == numpy.uint8
    test.close(handle)

    stream = DataStream.default_stream(
        test, iteration_scheme=SequentialScheme(10, 10))
    data = next(stream.get_epoch_iterator())[0]
    assert data.min() >= 0.0 and data.max() <= 1.0
    assert data.dtype == config.floatX

    assert_raises(ValueError, CIFAR10, ('valid',))

    dummy = CIFAR10(('train',), subset=slice(50000, 60000))
    handle = dummy.open()
    assert_raises(ValueError, dummy.get_data, handle, slice(0, 10000))
    dummy.close(handle)

def open_stream(which_sets=('train',), port=5557, num_examples=None):
    dataset = Blizzard(which_sets=which_sets)

    if num_examples is None:
        num_examples = dataset.num_examples

    data_stream = DataStream.default_stream(
        dataset,
        iteration_scheme=SequentialScheme(num_examples, batch_size))
    data_stream = ScaleAndShift(data_stream,
                                scale=1 / data_std,
                                shift=-data_mean / data_std)
    data_stream = Mapping(data_stream, _downsample_and_upsample,
                          add_sources=('upsampled',))
    data_stream = Mapping(data_stream, _equalize_size)
    data_stream = Mapping(data_stream, _get_residual,
                          add_sources=('residual',))
    data_stream = FilterSources(data_stream,
                                sources=('upsampled', 'residual',))
    data_stream = Mapping(data_stream, _segment_axis)
    data_stream = Mapping(data_stream, _transpose)
    data_stream = ForceFloatX(data_stream)

    start_server(data_stream, port=port)

def get_cmv_v1_streams(batch_size):
    train_dataset = CMVv1(which_sets=["train"])
    valid_dataset = CMVv1(which_sets=["valid"])
    train_ind = numpy.arange(train_dataset.num_examples)
    valid_ind = numpy.arange(valid_dataset.num_examples)
    rng = numpy.random.RandomState(seed=1)
    rng.shuffle(train_ind)
    rng.shuffle(valid_ind)

    train_datastream = DataStream.default_stream(
        train_dataset,
        iteration_scheme=ShuffledScheme(train_ind, batch_size))
    train_datastream = Preprocessor_CMV_v1(train_datastream)

    valid_datastream = DataStream.default_stream(
        valid_dataset,
        iteration_scheme=ShuffledScheme(valid_ind, batch_size))
    valid_datastream = Preprocessor_CMV_v1(valid_datastream)

    return train_datastream, valid_datastream

def fuel_data_to_list(fuel_data, shuffle):
    if shuffle:
        scheme = ShuffledScheme(fuel_data.num_examples, fuel_data.num_examples)
    else:
        scheme = SequentialScheme(fuel_data.num_examples,
                                  fuel_data.num_examples)
    fuel_data_stream = DataStream.default_stream(fuel_data,
                                                 iteration_scheme=scheme)
    return next(fuel_data_stream.get_epoch_iterator())

def create_svhn_data_streams(batch_size, monitoring_batch_size, rng=None):
    train_set = SVHN(2, ('extra',), sources=('features',))
    valid_set = SVHN(2, ('train',), sources=('features',))
    main_loop_stream = DataStream.default_stream(
        train_set,
        iteration_scheme=ShuffledScheme(
            train_set.num_examples, batch_size, rng=rng))
    train_monitor_stream = DataStream.default_stream(
        train_set,
        iteration_scheme=ShuffledScheme(5000, monitoring_batch_size, rng=rng))
    valid_monitor_stream = DataStream.default_stream(
        valid_set,
        iteration_scheme=ShuffledScheme(5000, monitoring_batch_size, rng=rng))
    return main_loop_stream, train_monitor_stream, valid_monitor_stream

def create_tiny_imagenet_data_streams(batch_size, monitoring_batch_size,
                                      rng=None):
    train_set = TinyILSVRC2012(('train',), sources=('features',))
    valid_set = TinyILSVRC2012(('valid',), sources=('features',))
    main_loop_stream = DataStream.default_stream(
        train_set,
        iteration_scheme=ShuffledScheme(
            train_set.num_examples, batch_size, rng=rng))
    train_monitor_stream = DataStream.default_stream(
        train_set,
        iteration_scheme=ShuffledScheme(4096, monitoring_batch_size, rng=rng))
    valid_monitor_stream = DataStream.default_stream(
        valid_set,
        iteration_scheme=ShuffledScheme(4096, monitoring_batch_size, rng=rng))
    return main_loop_stream, train_monitor_stream, valid_monitor_stream

def set_datastream(data_path, batch_size):
    dataset = H5PYDataset(file_or_path=data_path,
                          which_sets=('train',),
                          sources=('input_feature', 'target_feature'))
    data_stream = DataStream.default_stream(
        dataset=dataset,
        iteration_scheme=ShuffledScheme(batch_size=batch_size,
                                        examples=dataset.num_examples))
    return data_stream

def create_celeba_data_streams(batch_size, monitoring_batch_size,
                               sources=('features',), rng=None):
    train_set = CelebA('64', ('train',), sources=sources)
    valid_set = CelebA('64', ('valid',), sources=sources)
    main_loop_stream = DataStream.default_stream(
        train_set,
        iteration_scheme=ShuffledScheme(
            train_set.num_examples, batch_size, rng=rng))
    train_monitor_stream = DataStream.default_stream(
        train_set,
        iteration_scheme=ShuffledScheme(5000, monitoring_batch_size, rng=rng))
    valid_monitor_stream = DataStream.default_stream(
        valid_set,
        iteration_scheme=ShuffledScheme(5000, monitoring_batch_size, rng=rng))
    return main_loop_stream, train_monitor_stream, valid_monitor_stream

def create_gaussian_mixture_data_streams(batch_size, monitoring_batch_size,
                                         means=None, variances=None,
                                         priors=None, rng=None,
                                         num_examples=100000,
                                         sources=('features',)):
    train_set = GaussianMixture(num_examples=num_examples, means=means,
                                variances=variances, priors=priors,
                                rng=rng, sources=sources)
    valid_set = GaussianMixture(num_examples=num_examples, means=means,
                                variances=variances, priors=priors,
                                rng=rng, sources=sources)
    main_loop_stream = DataStream(
        train_set,
        iteration_scheme=ShuffledScheme(
            train_set.num_examples, batch_size=batch_size, rng=rng))
    train_monitor_stream = DataStream(
        train_set,
        iteration_scheme=ShuffledScheme(5000, batch_size, rng=rng))
    valid_monitor_stream = DataStream(
        valid_set,
        iteration_scheme=ShuffledScheme(5000, batch_size, rng=rng))
    return main_loop_stream, train_monitor_stream, valid_monitor_stream

def get_sgnmt_shuffled_tr_stream(src_data, trg_data,
                                 src_vocab_size=30000,
                                 trg_vocab_size=30000,
                                 unk_id=1,
                                 seq_len=50,
                                 batch_size=80,
                                 sort_k_batches=12,
                                 **kwargs):
    """Prepares the shuffled training data stream.

    This is similar to ``get_sgnmt_tr_stream`` but uses ``ParallelTextFile``
    in combination with ``ShuffledExampleScheme`` to support reshuffling.
    """
    # Build dummy vocabulary to make TextFile happy
    src_vocab = add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    parallel_dataset = ParallelTextFile(src_data, trg_data,
                                        src_vocab, trg_vocab, None)
    # iter_scheme = SequentialExampleScheme(parallel_dataset.num_examples)
    iter_scheme = ShuffledExampleScheme(parallel_dataset.num_examples)
    s = DataStream(parallel_dataset, iteration_scheme=iter_scheme)

    # Filter sequences that are too long
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))

    # Replace out-of-vocabulary tokens with the unk token
    s = Mapping(s, stream._oov_to_unk(src_vocab_size=src_vocab_size,
                                      trg_vocab_size=trg_vocab_size,
                                      unk_id=utils.UNK_ID))

    # Build a batched version of the stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with the specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream

def timit_datastream(path, which_set, local_copy, pool_size, maximum_frames):
    # load dataset
    timit_dataset = Timit(which_set=which_set,
                          path=path,
                          local_copy=local_copy)

    # get statistics
    data_means, data_stds = timit_dataset.get_normalization_factors()

    # set shuffle range
    shuffle_rng = numpy.random.RandomState(123)

    # set iterator scheme
    iterator_scheme = SequentialShuffledScheme(
        num_examples=timit_dataset.num_examples,
        batch_size=pool_size,
        rng=shuffle_rng)

    # base data stream
    base_stream = DataStream(dataset=timit_dataset,
                             iteration_scheme=iterator_scheme)

    # reshape stream
    reshape_stream = Reshape(data_source='features',
                             shape_source='features_shapes',
                             data_stream=base_stream)

    # normalize data stream
    normalize_stream = Normalize(data_stream=reshape_stream,
                                 means=data_means,
                                 stds=data_stds)

    # sort data stream
    sort_stream = Mapping(
        data_stream=normalize_stream,
        mapping=SortMapping(key=lambda x: x[0].shape[0]))

    # max frame stream
    max_frame_stream = MaximumFrameCache(max_frames=maximum_frames,
                                         data_stream=sort_stream,
                                         rng=shuffle_rng)

    # padding data stream
    padded_stream = Padding(data_stream=max_frame_stream,
                            mask_sources=['features', 'phonemes'])

    # floatX stream
    data_stream = ForceFloatX(padded_stream)

    return timit_dataset, data_stream

def create_data_generator(path, vocab_file, config):
    ds = QADataset(path, vocab_file, config.n_entities,
                   need_sep_token=config.concat_ctx_and_question)
    it = QAIterator(path, shuffle=config.shuffle_questions)

    stream = DataStream(ds, iteration_scheme=it)

    if config.concat_ctx_and_question:
        stream = ConcatCtxAndQuestion(stream, config.concat_question_before,
                                      ds.reverse_vocab['<SEP>'])

    # Sort sets of multiple batches to make batches of similar sizes
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(
        stream.sources.index(
            'question' if config.concat_ctx_and_question else 'context'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=['context', 'question', 'candidates'],
                     mask_dtype='int32')

    def gen():
        if not config.concat_ctx_and_question:
            for (seq_cont, seq_cont_mask, seq_quest, seq_quest_mask,
                 tg, candidates, candidates_mask) in stream.get_epoch_iterator():
                seq_cont_mask = seq_cont_mask.astype('float32')
                seq_quest_mask = seq_quest_mask.astype('float32')
                candidates_mask = candidates_mask.astype('float32')
                yield (seq_cont, seq_cont_mask, seq_quest, seq_quest_mask,
                       tg, candidates, candidates_mask)
        else:
            for (seq, seq_mask, tg, candidates,
                 candidates_mask) in stream.get_epoch_iterator():
                seq_mask = seq_mask.astype('float32')
                candidates_mask = candidates_mask.astype('float32')
                yield (seq, seq_mask, tg, candidates, candidates_mask)

    return gen

def valid(self, req_vars):
    valid_dataset = TaxiDataset(self.config.valid_set, 'valid.hdf5')
    train_dataset = TaxiDataset('train')
    valid_trips_ids = valid_dataset.get_data(
        None, slice(0, valid_dataset.num_examples))[
            valid_dataset.sources.index('trip_id')]

    prefix_stream = DataStream(
        valid_dataset,
        iteration_scheme=SequentialExampleScheme(valid_dataset.num_examples))
    prefix_stream = transformers.taxi_add_datetime(prefix_stream)
    prefix_stream = transformers.taxi_add_first_last_len(
        prefix_stream, self.config.n_begin_end_pts)
    prefix_stream = Batch(
        prefix_stream,
        iteration_scheme=ConstantScheme(self.config.batch_size))

    candidate_stream = DataStream(
        train_dataset,
        iteration_scheme=ShuffledExampleScheme(train_dataset.num_examples))
    candidate_stream = transformers.TaxiExcludeTrips(candidate_stream,
                                                     valid_trips_ids)
    candidate_stream = transformers.TaxiExcludeEmptyTrips(candidate_stream)
    candidate_stream = transformers.taxi_add_datetime(candidate_stream)
    candidate_stream = transformers.taxi_add_first_last_len(
        candidate_stream, self.config.n_begin_end_pts)
    candidate_stream = Batch(
        candidate_stream,
        iteration_scheme=ConstantScheme(self.config.valid_candidate_size))

    sources = prefix_stream.sources + tuple(
        'candidate_%s' % k for k in candidate_stream.sources)

    stream = Merge((prefix_stream, candidate_stream), sources)
    stream = transformers.Select(stream, tuple(req_vars))
    stream = MultiProcessing(stream)

    return stream

def get_datastream(dataset, batch_size=160):
    dataset = DataStream(
        dataset,
        iteration_scheme=ShuffledScheme(dataset.num_examples, batch_size),
    )
    dataset = Padding(dataset)

    # if flatten:
    #     dataset = Flatten(dataset, which_sources=('features,'))

    def _transpose(data):
        return tuple(np.rollaxis(array, 1, 0) for array in data)

    dataset = Mapping(dataset, _transpose)
    return dataset

def test_padding_data_stream():
    # 1-D sequences
    stream = Batch(
        DataStream(IterableDataset([[1], [2, 3], [], [4, 5, 6], [7]])),
        ConstantScheme(2))
    mask_stream = Padding(stream)
    assert mask_stream.sources == ("data", "data_mask")
    it = mask_stream.get_epoch_iterator()
    data, mask = next(it)
    assert (data == numpy.array([[1, 0], [2, 3]])).all()
    assert (mask == numpy.array([[1, 0], [1, 1]])).all()
    data, mask = next(it)
    assert (data == numpy.array([[0, 0, 0], [4, 5, 6]])).all()
    assert (mask == numpy.array([[0, 0, 0], [1, 1, 1]])).all()
    data, mask = next(it)
    assert (data == numpy.array([[7]])).all()
    assert (mask == numpy.array([[1]])).all()

    # 2D sequences
    stream2 = Batch(
        DataStream(IterableDataset([numpy.ones((3, 4)),
                                    2 * numpy.ones((2, 4))])),
        ConstantScheme(2))
    it = Padding(stream2).get_epoch_iterator()
    data, mask = next(it)
    assert data.shape == (2, 3, 4)
    assert (data[0, :, :] == 1).all()
    assert (data[1, :2, :] == 2).all()
    assert (mask == numpy.array([[1, 1, 1], [1, 1, 0]])).all()

    # 2 sources
    stream3 = Padding(Batch(
        DataStream(IterableDataset(dict(features=[[1], [2, 3]],
                                        targets=[[4, 5, 6], [7]]))),
        ConstantScheme(2)))
    assert len(next(stream3.get_epoch_iterator())) == 4

def get_datastream(path, norm_path, which_set='train_si84', batch_size=1):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set,))
    data_mean_std = numpy.load(norm_path)

    iterator_scheme = ShuffledScheme(batch_size=batch_size,
                                     examples=wsj_dataset.num_examples)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    base_stream = Normalize(data_stream=base_stream,
                            means=data_mean_std['mean'],
                            stds=data_mean_std['std'])
    fs = FilterSources(data_stream=base_stream,
                       sources=['features', 'targets'])
    padded_stream = Padding(data_stream=fs)
    return padded_stream

def setup_sorter_datastream(path, config):
    ds = SorterDataset(path)
    it = ShuffledExampleScheme(examples=config.example_count)
    stream = DataStream(ds, iteration_scheme=it)
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('unsorted'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)
    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream, mask_sources=['answer', 'unsorted'],
                     mask_dtype='int32')
    return ds, stream

def setUp(self):
    rng = numpy.random.RandomState(123)
    self.stream = DataStream(
        IndexableDataset(
            OrderedDict([('features', rng.rand(4, 2, 2)),
                         ('targets', numpy.array([0, 1, 0, 1]))]),
            axis_labels={'features': ('batch', 'width', 'height'),
                         'targets': ('batch',)}),
        iteration_scheme=SequentialScheme(4, 2))
    self.duplicate = Duplicate(self.stream, 'features')

def setUp(self):
    self.data = OrderedDict([
        ('features', numpy.ones((4, 2, 2))),
        ('targets', numpy.array([[0, 1, 2], [1, 0, 1],
                                 [1, 1, 1], [2, 0, 0]]))])
    self.neg_data = OrderedDict([
        ('features', numpy.ones((4, 2, 2))),
        ('targets', numpy.array([[0, -1, 2], [1, 0, -3],
                                 [1, 1, 1], [2, 0, 0]]))])
    self.num_classes = (3, 2, 3)
    self.stream_example = StructuredOneHotEncoding(
        DataStream(IndexableDataset(self.data),
                   iteration_scheme=SequentialExampleScheme(4)),
        num_classes=self.num_classes,
        which_sources=('targets',))

def _get_sgnmt_dev_stream(val_set=None, src_vocab=None,
                          src_vocab_size=30000, **kwargs):
    """Setup development set stream if necessary.

    The arguments to this method are given by the configuration dict.
    """
    dev_stream = None
    if val_set is not None:
        src_vocab = _add_special_ids({str(i): i
                                      for i in xrange(src_vocab_size)})
        dev_dataset = TextFile([val_set], src_vocab, None)
        dev_stream = DataStream(dev_dataset)
    return dev_stream

def get_test_stream(src_vocab, trg_vocab, src_data, trg_data=None,
                    src_vocab_size=30000, unk_id=1, seq_len=50,
                    batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the test data stream (no batching or gold labels)."""
    print('streaming...')
    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=2, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict)
        else cPickle.load(open(trg_vocab)),
        bos_idx=0, eos_idx=2, unk_idx=unk_id)

    # Build text file datasets for both source and target
    src_dataset = TextFile([src_data], src_vocab, preprocess=get_unicode)
    print(src_data)
    trg_dataset = TextFile([trg_data], trg_vocab, preprocess=get_unicode)

    # Merge the source and target streams
    stream = Merge([DataStream(src_dataset),
                    DataStream(trg_dataset)],
                   ('source', 'target'))
    return stream

def get_dataset_iterator(dataset, split, include_features=True,
                         include_targets=False, unit_scale=True,
                         label_transforms=False, return_length=False):
    """Get iterator for dataset, split, targets (labels) and scaling
    (from 255 to 1.0)."""
    sources = []
    sources = sources + ['features'] if include_features else sources
    sources = sources + ['targets'] if include_targets else sources
    if split == "all":
        splits = ('train', 'valid', 'test')
    elif split == "nontrain":
        splits = ('valid', 'test')
    else:
        splits = (split,)

    dataset_fname = find_in_data_path("{}.hdf5".format(dataset))
    h5_dataset = H5PYDataset(dataset_fname, which_sets=splits,
                             sources=sources)
    if unit_scale:
        h5_dataset.default_transformers = uint8_pixels_to_floatX(
            ('features',))

    datastream = DataStream.default_stream(
        dataset=h5_dataset,
        iteration_scheme=SequentialExampleScheme(h5_dataset.num_examples))

    if label_transforms:
        # TODO: maybe refactor this common bit with get_custom_streams below
        datastream = AddLabelUncertainty(datastream,
                                         chance=0,
                                         which_sources=('targets',))
        datastream = RandomLabelStrip(datastream,
                                      chance=0,
                                      which_sources=('targets',))
        # HACK: allow variable stretch
        datastream = StretchLabels(datastream,
                                   length=128,
                                   which_sources=('targets',))

    it = datastream.get_epoch_iterator()
    if return_length:
        return it, h5_dataset.num_examples
    else:
        return it

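# Illustrative call of get_dataset_iterator above (a sketch; having an
# 'mnist.hdf5' file on Fuel's data path is an assumption).
it, n_examples = get_dataset_iterator('mnist', 'train',
                                      include_targets=True,
                                      return_length=True)
features, targets = next(it)  # one example at a time via SequentialExampleScheme
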
def get_dev_stream(val_set=None, src_vocab=None, src_vocab_size=30000,
                   unk_id=1, **kwargs):
    """Setup development set stream if necessary."""
    dev_stream = None
    if val_set is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        dev_dataset = TextFile([val_set], src_vocab, None)
        dev_stream = DataStream(dev_dataset)
    return dev_stream

def create_ivector_test_datastream(path, which_set, batch_size=1, delay=0):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set,))
    iterator_scheme = SequentialScheme(batch_size=batch_size,
                                       examples=wsj_dataset.num_examples)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    fs = FilterSources(data_stream=base_stream,
                       sources=['features', 'ivectors', 'targets'])
    if delay:
        fs = DelayTransformer(fs, delay)
    fs = FilterSources(data_stream=fs, sources=['features', 'ivectors'])
    return Padding(fs)

def candidate_stream(self, n_candidates):
    candidate_stream = DataStream(
        self.train_dataset,
        iteration_scheme=ShuffledExampleScheme(
            self.train_dataset.num_examples))
    if not data.tvt:
        candidate_stream = transformers.TaxiExcludeTrips(
            candidate_stream, self.valid_trips_ids)
    candidate_stream = transformers.TaxiExcludeEmptyTrips(candidate_stream)
    candidate_stream = transformers.taxi_add_datetime(candidate_stream)
    candidate_stream = transformers.taxi_add_first_last_len(
        candidate_stream, self.config.n_begin_end_pts)
    if not data.tvt:
        candidate_stream = transformers.add_destination(candidate_stream)
    return Batch(candidate_stream,
                 iteration_scheme=ConstantScheme(n_candidates))

def get_stream(input_file, vocab_file, **kwargs):
    unk_token = kwargs.pop('unk_token')
    eos_token = kwargs.pop('eos_token')

    dataset = TextFile(files=[input_file],
                       encoding='UTF-8',
                       preprocess=to_lower_case,
                       dictionary=pkl.load(open(vocab_file, 'rb')),
                       level='word',
                       unk_token=unk_token,
                       bos_token=None,
                       eos_token=eos_token)
    stream = DataStream(dataset)
    return stream

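# Hypothetical invocation of get_stream above; the file names and token
# strings are placeholders, not values from the original project.
stream = get_stream('corpus.txt', 'vocab.pkl',
                    unk_token='<UNK>', eos_token='</S>')
for (sentence,) in stream.get_epoch_iterator():
    pass  # each sentence is a list of word indices ending with the EOS index
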
def test_sources_selection():
    features = [5, 6, 7, 1]
    targets = [1, 0, 1, 1]
    stream = DataStream(IterableDataset(OrderedDict(
        [('features', features), ('targets', targets)])))
    assert list(stream.get_epoch_iterator()) == list(zip(features, targets))

    stream = DataStream(IterableDataset(
        {'features': features, 'targets': targets},
        sources=('targets',)))
    assert list(stream.get_epoch_iterator()) == list(zip(targets))

def define_stream(which_sets=('train',),
                  initial_scale=1, scale=0.5,
                  batch_size=64, seq_length=64, frame_size=128,
                  tbptt_flag=True, num_examples=None):
    def _segment_axis(data):
        # Defined inside so that frame_size is available
        x = tuple([numpy.array([segment_axis(x, frame_size, 0) for x in var])
                   for var in data])
        return x

    scale = float(scale)

    dataset = Blizzard(which_sets=which_sets)

    if num_examples is None:
        num_examples = batch_size * (dataset.num_examples / batch_size)

    data_stream = DataStream.default_stream(
        dataset,
        iteration_scheme=SequentialScheme(num_examples, batch_size))

    data_stream = ScaleAndShift(data_stream,
                                scale=1 / data_std,
                                shift=-data_mean / float(data_std))

    # Original sampling rate
    data_stream = Resample(data_stream, scale=initial_scale)
    data_stream = Mapping(data_stream, _copy, add_sources=('upsampled',))
    data_stream = Resample(data_stream, scale=scale,
                           which_sources=('upsampled',))
    data_stream = Resample(data_stream, scale=1 / scale,
                           which_sources=('upsampled',))

    # data_stream = Mapping(data_stream, _downsample_and_upsample,
    #                       add_sources=('upsampled',))
    data_stream = Mapping(data_stream, _equalize_size)
    data_stream = Mapping(data_stream, _get_residual,
                          add_sources=('residual',))
    data_stream = FilterSources(data_stream,
                                sources=('upsampled', 'residual',))
    data_stream = Mapping(data_stream, _segment_axis)
    data_stream = Mapping(data_stream, _transpose)
    return data_stream

def test_data_stream_mapping_sort_multisource_ndarrays():
    data = OrderedDict()
    data['x'] = [numpy.array([1, 2, 3]),
                 numpy.array([2, 3, 1]),
                 numpy.array([3, 2, 1])]
    data['y'] = [numpy.array([6, 5, 4]),
                 numpy.array([6, 5, 4]),
                 numpy.array([6, 5, 4])]
    data_sorted = [(numpy.array([1, 2, 3]), numpy.array([6, 5, 4])),
                   (numpy.array([1, 2, 3]), numpy.array([4, 6, 5])),
                   (numpy.array([1, 2, 3]), numpy.array([4, 5, 6]))]
    stream = DataStream(IterableDataset(data))
    wrapper = Mapping(stream,
                      mapping=SortMapping(operator.itemgetter(0)))
    for output, ground_truth in zip(wrapper.get_epoch_iterator(), data_sorted):
        assert len(output) == len(ground_truth)
        assert (output[0] == ground_truth[0]).all()
        assert (output[1] == ground_truth[1]).all()

def create_data(data):
    stream = DataStream(data,
                        iteration_scheme=ShuffledScheme(data.num_examples,
                                                        batch_size))

    # Data Augmentation
    stream = MinimumImageDimensions(stream, image_size,
                                    which_sources=('image_features',))
    stream = MaximumImageDimensions(stream, image_size,
                                    which_sources=('image_features',))
    stream = RandomHorizontalSwap(stream, which_sources=('image_features',))
    stream = Random2DRotation(stream, which_sources=('image_features',))
    # stream = ScikitResize(stream, image_size,
    #                       which_sources=('image_features',))

    # Data Preprocessing

    # Data Transformation
    stream = ScaleAndShift(stream, 1. / 255, 0,
                           which_sources=('image_features',))
    stream = Cast(stream, dtype='float32', which_sources=('image_features',))
    return stream

def get_stream(self, part, batch_size=None, max_length=None, seed=None):
    dataset = self.get_dataset(part, max_length)
    if self._layout == 'lambada' and part == 'train':
        stream = DataStream(
            dataset,
            iteration_scheme=RandomSpanScheme(
                dataset.num_examples, max_length, seed))
        stream = Mapping(stream, listify)
    else:
        stream = dataset.get_example_stream()

    stream = SourcewiseMapping(stream, functools.partial(add_bos,
                                                         Vocabulary.BOS))
    stream = SourcewiseMapping(stream, vectorize)
    if not batch_size:
        return stream
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
    stream = Padding(stream)
    return stream

def get_tst_stream(val_set=None, src_vocab=None, src_vocab_size=30000,
                   unk_id=1, **kwargs):
    tst_stream = None
    if val_set is not None and src_vocab is not None:
        # Load dictionaries and ensure special tokens exist
        src_vocab = ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        tst_dataset = TextFile([val_set], src_vocab, None)
        tst_stream = DataStream(tst_dataset)
    return tst_stream

def get_stream(self, which_set, shuffle=True, monitor=False,
               num_examples=None, center=True):
    scheme_klass = ShuffledScheme if shuffle else SequentialScheme
    if num_examples is None:
        num_examples = self.get_stream_num_examples(which_set,
                                                    monitor=monitor)
    scheme = scheme_klass(num_examples, self.batch_size)
    stream = DataStream.default_stream(dataset=self.datasets[which_set],
                                       iteration_scheme=scheme)
    stream = self.apply_default_transformers(stream)
    stream = Canonicalize(stream, mapping=self.preprocess)
    if center:
        stream = transformers.Mapping(stream, mapping=self.center)
    return stream

def stream_handwriting(which_sets, batch_size, seq_size, tbptt=True):
    dataset = Handwriting(which_sets)
    data_stream = DataStream.default_stream(
        dataset,
        iteration_scheme=ShuffledScheme(
            batch_size * (dataset.num_examples / batch_size), batch_size))
    data_stream = FilterSources(data_stream, sources=('features',))
    data_stream = Padding(data_stream)
    data_stream = Mapping(data_stream, _transpose)
    if tbptt:
        data_stream = SegmentSequence(data_stream,
                                      add_flag=True,
                                      seq_size=seq_size)
    data_stream = ForceFloatX(data_stream)
    return data_stream

def start_server(port, which_set):
    fuel.server.logger.setLevel('WARN')
    dataset = IMDBText(which_set, sorted=True)

    n_train = dataset.num_examples
    # scheme = ShuffledScheme(examples=n_train, batch_size=batch_size)
    scheme = BatchwiseShuffledScheme(examples=n_train, batch_size=batch_size)

    stream = DataStream(dataset=dataset, iteration_scheme=scheme)
    print "loading glove"
    glove = GloveTransformer(glove_version, data_stream=stream)
    padded = Padding(data_stream=glove, mask_sources=('features',))

    fuel.server.start_server(padded, port=port, hwm=20)

def get_comb_stream(fea2obj, which_set, batch_size=None, shuffle=True):
    streams = []
    for fea in fea2obj:
        obj = fea2obj[fea]
        dataset = H5PYDataset(obj.fuelfile, which_sets=(which_set,),
                              load_in_memory=True)
        if batch_size is None:
            batch_size = dataset.num_examples
        if shuffle:
            iterschema = ShuffledScheme(examples=dataset.num_examples,
                                        batch_size=batch_size)
        else:
            iterschema = SequentialScheme(examples=dataset.num_examples,
                                          batch_size=batch_size)
        stream = DataStream(dataset=dataset, iteration_scheme=iterschema)
        if fea in seq_features:
            stream = CutInput(stream, obj.max_len)
            if obj.rec == True:
                logger.info('transforming data for recursive input')
                # Required because recurrent bricks receive input as
                # [sequence, batch, features]
                stream = LettersTransposer(stream, which_sources=fea)
        streams.append(stream)
    stream = Merge(streams, tuple(fea2obj.keys()))
    return stream, dataset.num_examples

def get_dev_streams(config):
    """Setup development set stream if necessary."""
    dev_streams = {}
    for cg in config['cgs']:
        if 'val_sets' in config and cg in config['val_sets']:
            logger.info('Building development stream for cg:[{}]'.format(cg))
            eid = p_(cg)[0]
            dev_file = config['val_sets'][cg]

            # Get dictionary and fix EOS
            dictionary = cPickle.load(open(config['src_vocabs'][eid]))
            dictionary['<S>'] = 0
            dictionary['<UNK>'] = config['unk_id']
            dictionary['</S>'] = config['src_eos_idxs'][eid]

            # Get as a text file and convert it into a stream
            dev_dataset = TextFile([dev_file], dictionary, None)
            dev_streams[cg] = DataStream(dev_dataset)
    return dev_streams

def create_data(data):
    stream = DataStream.default_stream(
        data,
        iteration_scheme=ShuffledScheme(data.num_examples, batch_size))
    stream_downscale = MinimumImageDimensions(
        stream, image_size, which_sources=('image_features',))
    # stream_rotate = Random2DRotation(stream_downscale,
    #                                  which_sources=('image_features',))
    stream_max = ScikitResize(stream_downscale, image_size,
                              which_sources=('image_features',))
    stream_scale = ScaleAndShift(stream_max, 1. / 255, 0,
                                 which_sources=('image_features',))
    stream_cast = Cast(stream_scale, dtype='float32',
                       which_sources=('image_features',))
    # stream_flat = Flatten(stream_scale, which_sources=('image_features',))
    return stream_cast
