Example #1
def test_ngram_stream_error_on_multiple_sources():
    # Check that NGram accepts only data streams with one source
    sentences = [list(numpy.random.randint(10, size=sentence_length))
                 for sentence_length in [3, 5, 7]]
    stream = DataStream(IterableDataset(sentences))
    stream.sources = ('1', '2')
    assert_raises(ValueError, NGrams, 4, stream)
Example #2
def load_datastream(train_batch_size=100):
    from fuel.datasets.mnist import MNIST
    from fuel.transformers import ScaleAndShift, Cast, Flatten, Mapping
    from fuel.streams import DataStream
    from fuel.schemes import SequentialScheme, ShuffledScheme

    MNIST.default_transformers = (
        (ScaleAndShift, [2.0 / 255.0, -1], {'which_sources': 'features'}),
        (Cast, [np.float32], {'which_sources': 'features'}),
    )

    mnist_train = MNIST(('train',), subset=slice(None, 50000))
    mnist_train_stream = DataStream.default_stream(
        mnist_train,
        iteration_scheme=ShuffledScheme(mnist_train.num_examples, train_batch_size)
    )

    mnist_validation = MNIST(('train',), subset=slice(50000, None))
    mnist_validation_stream = DataStream.default_stream(
        mnist_validation,
        iteration_scheme=SequentialScheme(mnist_validation.num_examples, 250)
    )

    mnist_test = MNIST(('test',))
    mnist_test_stream = DataStream.default_stream(
        mnist_test,
        iteration_scheme=SequentialScheme(mnist_test.num_examples, 250)
    )

    return {
        'train': mnist_train_stream,
        'validation': mnist_validation_stream,
        'test': mnist_test_stream
    }
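A minimal usage sketch for load_datastream above (assuming the Fuel MNIST file has been downloaded): each stream yields (features, targets) batches, with features already scaled to [-1, 1] by the default transformers.

# Hedged sketch: consume one batch of the training stream returned above
streams = load_datastream(train_batch_size=100)
for features, targets in streams['train'].get_epoch_iterator():
    # features: float32, shape (100, 1, 28, 28), scaled to [-1, 1]
    # targets: shape (100, 1)
    print(features.shape, targets.shape)
    break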
Example #3
def get_streams(num_train_examples, batch_size, use_test=True):
    dataset = MNIST(("train",))
    all_ind = numpy.arange(dataset.num_examples)
    rng = numpy.random.RandomState(seed=1)
    rng.shuffle(all_ind)

    indices_train = all_ind[:num_train_examples]
    indices_valid = all_ind[num_train_examples:]

    train_stream = Flatten(
        DataStream.default_stream(dataset, iteration_scheme=ShuffledScheme(indices_train, batch_size))
    )

    valid_stream = None
    if len(indices_valid) != 0:
        valid_stream = Flatten(
            DataStream.default_stream(dataset, iteration_scheme=ShuffledScheme(indices_valid, batch_size))
        )

    test_stream = None
    if use_test:
        dataset = MNIST(("test",))
        ind = numpy.arange(dataset.num_examples)
        rng = numpy.random.RandomState(seed=1)
        rng.shuffle(ind)

        test_stream = Flatten(DataStream.default_stream(dataset, iteration_scheme=ShuffledScheme(ind, batch_size)))

    return train_stream, valid_stream, test_stream
Example #4
def test_in_memory():
    skip_if_not_available(datasets=['mnist.hdf5'])
    # Load MNIST and get two batches
    mnist = MNIST(('train',), load_in_memory=True)
    data_stream = DataStream(mnist, iteration_scheme=SequentialScheme(
        examples=mnist.num_examples, batch_size=256))
    epoch = data_stream.get_epoch_iterator()
    for i, (features, targets) in enumerate(epoch):
        if i == 1:
            break
    handle = mnist.open()
    known_features, _ = mnist.get_data(handle, slice(256, 512))
    mnist.close(handle)
    assert numpy.all(features == known_features)

    # Pickle the epoch and make sure that the data wasn't dumped
    with tempfile.NamedTemporaryFile(delete=False) as f:
        filename = f.name
        cPickle.dump(epoch, f)
    assert os.path.getsize(filename) < 1024 * 1024  # Less than 1MB

    # Reload the epoch and make sure that the state was maintained
    del epoch
    with open(filename, 'rb') as f:
        epoch = cPickle.load(f)
    features, targets = next(epoch)
    handle = mnist.open()
    known_features, _ = mnist.get_data(handle, slice(512, 768))
    mnist.close(handle)
    assert numpy.all(features == known_features)
Example #5

def maxout_vae_mnist_test(path_vae_mnist):

    # load vae model on mnist
    vae_mnist = load(path_vae_mnist)
    maxout = Maxout()
    x = T.matrix('features')
    y = T.imatrix('targets')
    batch_size = 128
    z, _ = vae_mnist.sampler.sample(vae_mnist.encoder_mlp.apply(x))
    predict = maxout.apply(z)

    cost = Softmax().categorical_cross_entropy(y.flatten(), predict)
    y_hat = Softmax().apply(predict)
    cost.name = 'cost'
    cg = ComputationGraph(cost)

    temp = cg.parameters
    for t, i in zip(temp, range(len(temp))):
        t.name = t.name+str(i)+"maxout"

    error_brick = MisclassificationRate()
    error_rate = error_brick.apply(y.flatten(), y_hat)

    # training
    step_rule = RMSProp(0.01, 0.9)
    #step_rule = Momentum(0.2, 0.9)
    train_set = MNIST('train')
    test_set = MNIST("test")

    data_stream_train = Flatten(DataStream.default_stream(
            train_set, iteration_scheme=SequentialScheme(train_set.num_examples, batch_size)))

    data_stream_test = Flatten(DataStream.default_stream(
            test_set, iteration_scheme=SequentialScheme(test_set.num_examples, batch_size)))

    algorithm = GradientDescent(cost=cost, params=cg.parameters,
                                step_rule=step_rule)

    monitor_train = TrainingDataMonitoring(
        variables=[cost], data_stream=data_stream_train, prefix="train")
    monitor_valid = DataStreamMonitoring(
        variables=[cost, error_rate], data_stream=data_stream_test, prefix="test")


    extensions = [  monitor_train,
                    monitor_valid,
                    FinishAfter(after_n_epochs=50),
                    Printing(every_n_epochs=1)
                  ]

    main_loop = MainLoop(data_stream=data_stream_train,
                        algorithm=algorithm, model = Model(cost),
                        extensions=extensions)
    main_loop.run()

    # save here
    from blocks.serialization import dump
    with closing(open('../data_mnist/maxout', 'wb')) as f:
        dump(maxout, f)
Example #6
def test_unpack_transformer():
    data = range(10)
    stream = DataStream(IterableDataset(data))
    stream = Batch(stream, iteration_scheme=ConstantScheme(2))
    stream = Unpack(stream)
    epoch = stream.get_epoch_iterator()
    for i, v in enumerate(epoch):
        assert numpy.shape(v)[0] == 1
        assert v[0] == i
Example #7
def prepare_cifar10():
	class Dataset:
		pass

	result = Dataset()

	CIFAR10.default_transformers = (
		(ScaleAndShift, [2.0 / 255.0, -1], {'which_sources': 'features'}),
		(Cast, [np.float32], {'which_sources': 'features'}))

	mean = cifar10_mean()

	def patch_get_epoch_iterator(stream):
		def get_epoch_iterator(self):
			for X, Y in self._get_epoch_iterator():
				# 0 degrees
				X -= mean[numpy.newaxis,:,:,:]

				yield augument(X, 25), Y

		stream._get_epoch_iterator = stream.get_epoch_iterator
		stream.get_epoch_iterator = types.MethodType(get_epoch_iterator, stream)

	def patch_get_epoch_iterator_test(stream):
		def get_epoch_iterator(self):
			for X, Y in self._get_epoch_iterator():
				# 0 degrees
				X -= mean[numpy.newaxis,:,:,:]
				yield X, Y

		stream._get_epoch_iterator = stream.get_epoch_iterator
		stream.get_epoch_iterator = types.MethodType(get_epoch_iterator, stream)


	result.train = train = CIFAR10(("train",), subset = slice(None, 40000))
	result.train_stream = DataStream.default_stream(
		result.train,
		iteration_scheme = ShuffledScheme(result.train.num_examples, 25))

	patch_get_epoch_iterator(result.train_stream)

	result.validation = CIFAR10(("train",), subset=slice(40000, None))
	result.validation_stream = DataStream.default_stream(
		result.validation, 
		iteration_scheme = SequentialScheme(result.validation.num_examples, 100))

	patch_get_epoch_iterator(result.validation_stream)

	result.test = CIFAR10(("test",))
	result.test_stream = DataStream.default_stream(
		result.test, 
		iteration_scheme = SequentialScheme(result.test.num_examples, 100))

	patch_get_epoch_iterator_test(result.test_stream)

	return result
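The get_epoch_iterator monkey-patching above can also be expressed with the Mapping transformer used elsewhere in this collection; a sketch of the equivalent mean-subtraction and augmentation step, reusing the mean array and the augument helper assumed by the snippet above:

from fuel.transformers import Mapping

def _subtract_mean_and_augment(data):
    # data is the (features, targets) batch tuple of the CIFAR10 stream
    X, Y = data
    X = X - mean[numpy.newaxis, :, :, :]
    return augument(X, 25), Y

# drop-in alternative to patch_get_epoch_iterator(result.train_stream)
result.train_stream = Mapping(result.train_stream, _subtract_mean_and_augment)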
Example #8
 def test_get_data_inmemory(self):
     ds = ImagesFromFile('{}/*.jpeg'.format(self.ds_path),
                         load_in_memory=True)
     stream = DataStream(ds, iteration_scheme=ShuffledScheme(
         ds.num_examples, batch_size=10))
     for imgs, file_paths in stream.get_epoch_iterator():
         assert len(imgs) == 5
         assert all([img.shape == (512, 512, 3) for img in imgs])
         assert all([self.ds_path in fp for fp in file_paths])
         assert len(set(file_paths)) == 5
Example #9
 def test_batch_iteration_scheme_with_lists(self):
     """Batch schemes should work with more than ndarrays."""
     data = IndexableDataset(OrderedDict([('foo', list(range(50))),
                                          ('bar', list(range(1, 51)))]))
     stream = DataStream(data,
                         iteration_scheme=ShuffledScheme(data.num_examples,
                                                         5))
     returned = [sum(batches, []) for batches in
                 zip(*list(stream.get_epoch_iterator()))]
     assert set(returned[0]) == set(range(50))
     assert set(returned[1]) == set(range(1, 51))
Example #10
 def __init__(self, dataset, batch_size=128, shuffle=False):
     self.dataset = dataset
     if shuffle:
         self.datastream = DataStream(self.dataset,
                                  iteration_scheme=ShuffledScheme(
                                  examples=dataset.num_examples,
                                  batch_size=batch_size))
     else:
         self.datastream = DataStream(self.dataset,
                                  iteration_scheme=SequentialScheme(
                                  examples=dataset.num_examples,
                                  batch_size=batch_size))
Example #11
def main(save_to, num_epochs, bokeh=False):
    mlp = MLP([Tanh(), Softmax()], [784, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    probs = mlp.apply(tensor.flatten(x, outdim=2))
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)

    cg = ComputationGraph([cost])
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + .00005 * (W1 ** 2).sum() + .00005 * (W2 ** 2).sum()
    cost.name = 'final_cost'

    mnist_train = MNIST("train")
    mnist_test = MNIST("test")

    algorithm = GradientDescent(
        cost=cost, params=cg.parameters,
        step_rule=Scale(learning_rate=0.1))
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs),
                  DataStreamMonitoring(
                      [cost, error_rate],
                      Flatten(
                          DataStream.default_stream(
                              mnist_test,
                              iteration_scheme=SequentialScheme(
                                  mnist_test.num_examples, 500)),
                          which_sources=('features',)),
                      prefix="test"),
                  TrainingDataMonitoring(
                      [cost, error_rate,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      after_epoch=True),
                  Checkpoint(save_to),
                  Printing()]

    main_loop = MainLoop(
        algorithm,
        Flatten(
            DataStream.default_stream(
                mnist_train,
                iteration_scheme=SequentialScheme(
                    mnist_train.num_examples, 50)),
            which_sources=('features',)),
        model=Model(cost),
        extensions=extensions)

    main_loop.run()
Example #12
def test_dataset():
    data = [1, 2, 3]
    # The default stream requests an example at a time
    stream = DataStream(IterableDataset(data))
    epoch = stream.get_epoch_iterator()
    assert list(epoch) == list(zip(data))

    # Check if iterating over multiple epochs works
    for i, epoch in zip(range(2), stream.iterate_epochs()):
        assert list(epoch) == list(zip(data))

    # Check whether the returning as a dictionary of sources works
    assert next(stream.get_epoch_iterator(as_dict=True)) == {"data": 1}
Example #13

 def batch_iterator(self, dataset, batchsize, shuffle=False):
     if isinstance(dataset, Dataset):
         if shuffle:
             train_scheme = ShuffledScheme(examples=dataset.num_examples, batch_size=batchsize)
         else:
             train_scheme = SequentialScheme(examples=dataset.num_examples, batch_size=batchsize)
         stream = DataStream(dataset=dataset, iteration_scheme=train_scheme)
         if self.fuel_stream_xform_fn is not None:
             stream = self.fuel_stream_xform_fn(stream)
         return stream.get_epoch_iterator()
     elif _is_sequence_of_arrays(dataset):
         return iterate_minibatches(dataset, batchsize, shuffle=shuffle)
     else:
         raise TypeError('dataset should be a fuel Dataset instance or a list of arrays')
Example #14
    def prep_dataset(dataset):
        dataset = dataset[:(len(dataset) - (len(dataset) % (seq_len * batch_size)))]
        dataset = dataset.reshape(batch_size, -1, seq_len).transpose((1, 0, 2))


        stream = DataStream(IndexableDataset(indexables=OrderedDict([
            ('data', dataset)])),
            iteration_scheme=SequentialExampleScheme(dataset.shape[0]))
        stream = Transpose(stream, [(1, 0)])
        stream = SampleDropsNPWord(
          stream, z_prob_states, z_prob_cells, drop_prob_igates,
          layer_size, num_layers, False, stoch_depth, share_mask,
          gaussian_drop, alphabetsize)
        stream.sources = ('data',) * 3 + stream.sources + ('zoneouts_states', 'zoneouts_cells', 'zoneouts_igates')
        return (stream,)
Example #15
def get_stream(batch_size, input_size, test=False):
    from fuel.datasets.dogs_vs_cats import DogsVsCats
    from fuel.streams import DataStream
    from fuel.schemes import ShuffledScheme, SequentialScheme, SequentialExampleScheme
    from fuel.transformers.image import RandomFixedSizeCrop
    from fuel.transformers import Flatten #, ForceFloatX
    from ScikitResize import ScikitResize
    from fuel.transformers import Cast
    # Load the training set
    if test:
        train = DogsVsCats(('train',), subset=slice(0, 30))
        valid = DogsVsCats(('train',), subset=slice(19980, 20000))
        test = DogsVsCats(('test',), subset=slice(0, 4))
    else:
        train = DogsVsCats(('train',), subset=slice(0, 22000))
        valid = DogsVsCats(('train',), subset=slice(22000, 25000))
        test = DogsVsCats(('test',))
    #Generating stream
    train_stream = DataStream.default_stream(
        train,
        iteration_scheme=ShuffledScheme(train.num_examples, batch_size)
    )

    valid_stream = DataStream.default_stream(
        valid,
        iteration_scheme=ShuffledScheme(valid.num_examples, batch_size)
    )
    test_stream = DataStream.default_stream(
        test,
        iteration_scheme=SequentialScheme(test.num_examples, 1)
#        iteration_scheme=SequentialExampleScheme(test.num_examples)
    )
    #Reshaping procedure
    #Apply crop and resize to desired square shape
    train_stream = ScikitResize(train_stream, input_size, which_sources=('image_features',))
    valid_stream = ScikitResize(valid_stream, input_size, which_sources=('image_features',))
    test_stream = ScikitResize(test_stream, input_size, which_sources=('image_features',))

    #ForceFloatX, to spare you from possible bugs
    #train_stream = ForceFloatX(train_stream)
    #valid_stream = ForceFloatX(valid_stream)
    #test_stream = ForceFloatX(test_stream)

    #Cast instead of forcefloatX
    train_stream = Cast(train_stream, dtype='float32',which_sources=('image_features',))
    valid_stream = Cast(valid_stream, dtype='float32',which_sources=('image_features',))
    test_stream = Cast(test_stream, dtype='float32',which_sources=('image_features',))
    return train_stream, valid_stream, test_stream
Example #16
def test_h5py_dataset_datastream_pickles():
    try:
        h5file = h5py.File(name='tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (10, 5), dtype='float32')
        features[...] = numpy.arange(50, dtype='float32').reshape((10, 5))
        split_dict = {'train': {'features': (0, 10)}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        dataset = H5PYDataset(path='tmp.hdf5', which_set='train')
        stream = DataStream(dataset)
        pickle.loads(pickle.dumps(stream))
    finally:
        stream.close()
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')
Example #17
def monk_music_stream(which_sets=('train',), batch_size=64,
        seq_size=128, frame_size=160, num_examples=None,
        which_sources=('features',)):

    """
    This function generates the stream for the monk_music dataset.
    It doesn't compute incremental windows and instead simply separates the
    dataset into sequences
    """

    dataset = MonkMusic(which_sets = which_sets, filename = "dataset.hdf5",
        load_in_memory=True)

    large_batch_size = batch_size * frame_size * seq_size
    if not num_examples:
        num_examples = large_batch_size*(dataset.num_examples/large_batch_size)

    # If there are memory problems revert to SequentialScheme
    data_stream = DataStream.default_stream(
            dataset, iteration_scheme=SequentialScheme(
            num_examples,
            large_batch_size))

    data_stream = ScaleAndShift(data_stream,
            scale = 1./data_stats["std"],
            shift = -data_stats["mean"]/data_stats["std"])

    data_stream = Mapping(data_stream,
            lambda data: _get_subsequences(data,batch_size,seq_size,frame_size))

    data_stream = ForceFloatX(data_stream)

    return data_stream
Example #18
def get_mnist_video_streams(batch_size):
    train_dataset = ClutteredMNISTVideo(which_sets=["train"])
    valid_dataset = ClutteredMNISTVideo(which_sets=["valid"])
    train_ind = numpy.arange(train_dataset.num_examples)
    valid_ind = numpy.arange(valid_dataset.num_examples)
    rng = numpy.random.RandomState(seed=1)
    rng.shuffle(train_ind)
    rng.shuffle(valid_ind)

    train_datastream = DataStream.default_stream(train_dataset, iteration_scheme=ShuffledScheme(train_ind, batch_size))
    train_datastream = PreprocessTransformer(train_datastream)

    valid_datastream = DataStream.default_stream(valid_dataset, iteration_scheme=ShuffledScheme(valid_ind, batch_size))
    valid_datastream = PreprocessTransformer(valid_datastream)

    return train_datastream, valid_datastream
Example #19
 def create_dataset(dataset):
     if trainning:
         scheme = ShuffledScheme(dataset.num_examples, 32)
     else:
         scheme = SequentialScheme(dataset.num_examples, 32)
     stream = DataStream.default_stream(dataset, iteration_scheme=scheme)
     return ResizeTransformer(stream, image_size)
Example #20
def test_cifar100():
    train = CIFAR100('train', load_in_memory=False)
    assert train.num_examples == 50000
    handle = train.open()
    coarse_labels, features, fine_labels = train.get_data(handle,
                                                          slice(49990, 50000))

    assert features.shape == (10, 3, 32, 32)
    assert coarse_labels.shape == (10, 1)
    assert fine_labels.shape == (10, 1)
    train.close(handle)

    test = CIFAR100('test', load_in_memory=False)
    handle = test.open()
    coarse_labels, features, fine_labels = test.get_data(handle,
                                                         slice(0, 10))

    assert features.shape == (10, 3, 32, 32)
    assert coarse_labels.shape == (10, 1)
    assert fine_labels.shape == (10, 1)

    assert features.dtype == numpy.uint8
    assert coarse_labels.dtype == numpy.uint8
    assert fine_labels.dtype == numpy.uint8

    test.close(handle)

    stream = DataStream.default_stream(
        test, iteration_scheme=SequentialScheme(10, 10))
    data = next(stream.get_epoch_iterator())[1]

    assert data.min() >= 0.0 and data.max() <= 1.0
    assert data.dtype == config.floatX

    assert_raises(ValueError, CIFAR100, 'valid')
Example #21
def DStream(datatype, config):

    if datatype == 'train':
        filename = config['train_file']
    elif datatype == 'valid':
        filename = config['valid_file']
    elif datatype == 'test':
        filename = config['test_file']
    else:
        logger.error('wrong datatype, expected train, valid, or test')
        raise ValueError('datatype must be train, valid, or test')


    data = TextFile(files=[filename],
                    dictionary=pickle.load(open(config['train_dic'],'rb')),
                    unk_token=config['unk_token'],
                    level='word',
                    bos_token=config['bos_token'],
                    eos_token=config['eos_token'])

    data_stream = DataStream.default_stream(data)
    data_stream.sources = ('sentence',)


    # organize data in batches and pad shorter sequences with zeros
    batch_size = config['batch_size']
    data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(batch_size))
    data_stream = Padding(data_stream)
    return data_stream
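The Batch/Padding pair at the end of DStream behaves as in the self-contained sketch below (the same behaviour is exercised by test_padding_data_stream later in this collection): Padding appends a '<source>_mask' source and zero-pads each batch to its longest sequence.

from fuel.datasets import IterableDataset
from fuel.streams import DataStream
from fuel.transformers import Batch, Padding
from fuel.schemes import ConstantScheme

toy = DataStream(IterableDataset([[1, 2, 3], [4, 5], [6]]))
toy = Batch(toy, iteration_scheme=ConstantScheme(2))
toy = Padding(toy)
print(toy.sources)  # ('data', 'data_mask')
for sentences, mask in toy.get_epoch_iterator():
    # first batch: shapes (2, 3) and (2, 3); second batch: (1, 1) and (1, 1)
    print(sentences, mask)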
Example #22
def test_cifar10():
    train = CIFAR10(('train',), load_in_memory=False)
    assert train.num_examples == 50000
    handle = train.open()
    features, targets = train.get_data(handle, slice(49990, 50000))
    assert features.shape == (10, 3, 32, 32)
    assert targets.shape == (10, 1)
    train.close(handle)

    test = CIFAR10(('test',), load_in_memory=False)
    handle = test.open()
    features, targets = test.get_data(handle, slice(0, 10))
    assert features.shape == (10, 3, 32, 32)
    assert targets.shape == (10, 1)
    assert features.dtype == numpy.uint8
    assert targets.dtype == numpy.uint8
    test.close(handle)

    stream = DataStream.default_stream(
        test, iteration_scheme=SequentialScheme(10, 10))
    data = next(stream.get_epoch_iterator())[0]
    assert data.min() >= 0.0 and data.max() <= 1.0
    assert data.dtype == config.floatX

    assert_raises(ValueError, CIFAR10, ('valid',))

    dummy = CIFAR10(('train',), subset=slice(50000, 60000))
    handle = dummy.open()
    assert_raises(ValueError, dummy.get_data, handle, slice(0, 10000))
    dummy.close(handle)
Example #23
def open_stream(which_sets=('train',), port=5557, num_examples=None):

    dataset = Blizzard(which_sets=which_sets)

    if num_examples is None:
        num_examples = dataset.num_examples

    data_stream = DataStream.default_stream(
            dataset, iteration_scheme=SequentialScheme(
            num_examples, batch_size))

    data_stream = ScaleAndShift(data_stream, scale = 1/data_std, 
                                            shift = -data_mean/data_std)
    data_stream = Mapping(data_stream, _downsample_and_upsample, 
                          add_sources=('upsampled',))
    data_stream = Mapping(data_stream, _equalize_size)
    data_stream = Mapping(data_stream, _get_residual,
                          add_sources = ('residual',))
    data_stream = FilterSources(data_stream, 
                          sources = ('upsampled', 'residual',))
    data_stream = Mapping(data_stream, _segment_axis)
    data_stream = Mapping(data_stream, _transpose)
    data_stream = ForceFloatX(data_stream)

    start_server(data_stream, port=port)
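open_stream only builds the server side; a client could be sketched with ServerDataStream from fuel.streams as below. This is an assumption-laden sketch: the produces_examples argument exists only in newer Fuel versions, and the source names must match the FilterSources call above.

from fuel.streams import ServerDataStream

# sources must match the stream served above
client_stream = ServerDataStream(('upsampled', 'residual'),
                                 produces_examples=False, port=5557)
for upsampled, residual in client_stream.get_epoch_iterator():
    # one batch produced by the server process
    break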
Example #24
def get_cmv_v1_streams(batch_size):
    train_dataset = CMVv1(which_sets=["train"])
    valid_dataset = CMVv1(which_sets=["valid"])
    train_ind = numpy.arange(train_dataset.num_examples)
    valid_ind = numpy.arange(valid_dataset.num_examples)
    rng = numpy.random.RandomState(seed=1)
    rng.shuffle(train_ind)
    rng.shuffle(valid_ind)

    train_datastream = DataStream.default_stream(train_dataset, iteration_scheme=ShuffledScheme(train_ind, batch_size))
    train_datastream = Preprocessor_CMV_v1(train_datastream)

    valid_datastream = DataStream.default_stream(valid_dataset, iteration_scheme=ShuffledScheme(valid_ind, batch_size))
    valid_datastream = Preprocessor_CMV_v1(valid_datastream)

    return train_datastream, valid_datastream
Example #25
def fuel_data_to_list(fuel_data, shuffle):
    if shuffle:
        scheme = ShuffledScheme(fuel_data.num_examples, fuel_data.num_examples)
    else:
        scheme = SequentialScheme(fuel_data.num_examples, fuel_data.num_examples)
    fuel_data_stream = DataStream.default_stream(fuel_data, iteration_scheme=scheme)
    return fuel_data_stream.get_epoch_iterator().next()
Example #26
def create_svhn_data_streams(batch_size, monitoring_batch_size, rng=None):
    train_set = SVHN(2, ('extra',), sources=('features',))
    valid_set = SVHN(2, ('train',), sources=('features',))
    main_loop_stream = DataStream.default_stream(
        train_set,
        iteration_scheme=ShuffledScheme(
            train_set.num_examples, batch_size, rng=rng))
    train_monitor_stream = DataStream.default_stream(
        train_set,
        iteration_scheme=ShuffledScheme(
            5000, monitoring_batch_size, rng=rng))
    valid_monitor_stream = DataStream.default_stream(
        valid_set,
        iteration_scheme=ShuffledScheme(
            5000, monitoring_batch_size, rng=rng))
    return main_loop_stream, train_monitor_stream, valid_monitor_stream
Example #27
def create_tiny_imagenet_data_streams(batch_size, monitoring_batch_size,
                                      rng=None):
    train_set = TinyILSVRC2012(('train',), sources=('features',))
    valid_set = TinyILSVRC2012(('valid',), sources=('features',))
    main_loop_stream = DataStream.default_stream(
        train_set,
        iteration_scheme=ShuffledScheme(
            train_set.num_examples, batch_size, rng=rng))
    train_monitor_stream = DataStream.default_stream(
        train_set,
        iteration_scheme=ShuffledScheme(
            4096, monitoring_batch_size, rng=rng))
    valid_monitor_stream = DataStream.default_stream(
        valid_set,
        iteration_scheme=ShuffledScheme(
            4096, monitoring_batch_size, rng=rng))
    return main_loop_stream, train_monitor_stream, valid_monitor_stream
Example #28
def set_datastream(data_path, batch_size):
    dataset = H5PYDataset(file_or_path=data_path,
                          which_sets=('train',),
                          sources=('input_feature', 'target_feature'))
    data_stream = DataStream.default_stream(dataset=dataset,
                                            iteration_scheme=ShuffledScheme(batch_size=batch_size,
                                                                            examples=dataset.num_examples))
    return data_stream
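set_datastream expects an HDF5 file laid out for H5PYDataset with the two sources named above. Such a file can be written along the lines of test_h5py_dataset_datastream_pickles earlier in this collection; the file name and shapes below are made up for illustration.

import h5py
import numpy
from fuel.datasets.hdf5 import H5PYDataset

n_examples = 100  # hypothetical
h5file = h5py.File('toy_data.hdf5', mode='w')
inputs = h5file.create_dataset('input_feature', (n_examples, 5), dtype='float32')
targets = h5file.create_dataset('target_feature', (n_examples, 1), dtype='int64')
inputs[...] = numpy.random.rand(n_examples, 5).astype('float32')
targets[...] = numpy.random.randint(0, 2, size=(n_examples, 1))
h5file.attrs['split'] = H5PYDataset.create_split_array(
    {'train': {'input_feature': (0, n_examples),
               'target_feature': (0, n_examples)}})
h5file.flush()
h5file.close()

data_stream = set_datastream('toy_data.hdf5', batch_size=10)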
Example #29
def create_celeba_data_streams(batch_size, monitoring_batch_size,
                               sources=('features', ), rng=None):
    train_set = CelebA('64', ('train',), sources=sources)
    valid_set = CelebA('64', ('valid',), sources=sources)
    main_loop_stream = DataStream.default_stream(
        train_set,
        iteration_scheme=ShuffledScheme(
            train_set.num_examples, batch_size, rng=rng))
    train_monitor_stream = DataStream.default_stream(
        train_set,
        iteration_scheme=ShuffledScheme(
            5000, monitoring_batch_size, rng=rng))
    valid_monitor_stream = DataStream.default_stream(
        valid_set,
        iteration_scheme=ShuffledScheme(
            5000, monitoring_batch_size, rng=rng))
    return main_loop_stream, train_monitor_stream, valid_monitor_stream
Example #30

def create_gaussian_mixture_data_streams(batch_size,
                                         monitoring_batch_size,
                                         means=None,
                                         variances=None,
                                         priors=None,
                                         rng=None,
                                         num_examples=100000,
                                         sources=('features', )):
    train_set = GaussianMixture(num_examples=num_examples,
                                means=means,
                                variances=variances,
                                priors=priors,
                                rng=rng,
                                sources=sources)

    valid_set = GaussianMixture(num_examples=num_examples,
                                means=means,
                                variances=variances,
                                priors=priors,
                                rng=rng,
                                sources=sources)

    main_loop_stream = DataStream(train_set,
                                  iteration_scheme=ShuffledScheme(
                                      train_set.num_examples,
                                      batch_size=batch_size,
                                      rng=rng))

    train_monitor_stream = DataStream(train_set,
                                      iteration_scheme=ShuffledScheme(
                                          5000, batch_size, rng=rng))

    valid_monitor_stream = DataStream(valid_set,
                                      iteration_scheme=ShuffledScheme(
                                          5000, batch_size, rng=rng))

    return main_loop_stream, train_monitor_stream, valid_monitor_stream
Example #31
def get_sgnmt_shuffled_tr_stream(src_data,
                                 trg_data,
                                 src_vocab_size=30000,
                                 trg_vocab_size=30000,
                                 unk_id=1,
                                 seq_len=50,
                                 batch_size=80,
                                 sort_k_batches=12,
                                 **kwargs):
    """Prepares the shuffled training data stream. This is similar to 
    ``get_sgnmt_tr_stream`` but uses ``ParallelTextFile`` in combination
    with ``ShuffledExampleScheme`` to support reshuffling."""

    # Build dummy vocabulary to make TextFile happy
    src_vocab = add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    parallel_dataset = ParallelTextFile(src_data, trg_data, src_vocab,
                                        trg_vocab, None)
    #iter_scheme = SequentialExampleScheme(parallel_dataset.num_examples)
    iter_scheme = ShuffledExampleScheme(parallel_dataset.num_examples)
    s = DataStream(parallel_dataset, iteration_scheme=iter_scheme)

    # Filter sequences that are too long
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    s = Mapping(
        s,
        stream._oov_to_unk(src_vocab_size=src_vocab_size,
                           trg_vocab_size=trg_vocab_size,
                           unk_id=utils.UNK_ID))

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
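The read-ahead / sort-by-length / re-batch idiom above (used again by create_data_generator and setup_sorter_datastream further down) can be reproduced on toy sequences with only the transformers these examples already import; a sketch:

from fuel.datasets import IterableDataset
from fuel.streams import DataStream
from fuel.schemes import ConstantScheme
from fuel.transformers import Batch, Mapping, Padding, SortMapping, Unpack

sentences = [[1, 2], [3, 4, 5, 6], [7], [8, 9, 10], [11, 12, 13, 14, 15], [16]]
s = DataStream(IterableDataset(sentences))
s = Batch(s, iteration_scheme=ConstantScheme(6))      # read several batches ahead
s = Mapping(s, SortMapping(key=lambda x: len(x[0])))  # sort the block by length
s = Unpack(s)                                         # back to single examples
s = Batch(s, iteration_scheme=ConstantScheme(2))      # final batch size
s = Padding(s)                                        # pad within each batch
for data, mask in s.get_epoch_iterator():
    print(data.shape)  # batches now group sequences of similar length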
Example #32
def timit_datastream(path, which_set, local_copy, pool_size, maximum_frames):

    # load dataset
    timit_dataset = Timit(which_set=which_set,
                          path=path,
                          local_copy=local_copy)

    # get statistics
    data_means, data_stds = timit_dataset.get_normalization_factors()

    # set shuffle range
    shuffle_rng = numpy.random.RandomState(123)

    # set iterator scheme
    iterator_scheme = SequentialShuffledScheme(
        num_examples=timit_dataset.num_examples,
        batch_size=pool_size,
        rng=shuffle_rng)

    # base data stream
    base_stream = DataStream(dataset=timit_dataset,
                             iteration_scheme=iterator_scheme)

    # reshape stream
    reshape_stream = Reshape(data_source='features',
                             shape_source='features_shapes',
                             data_stream=base_stream)

    # normalize data stream
    normalize_stream = Normalize(data_stream=reshape_stream,
                                 means=data_means,
                                 stds=data_stds)

    # sort data stream
    sort_stream = Mapping(data_stream=normalize_stream,
                          mapping=SortMapping(key=lambda x: x[0].shape[0]))

    # max frame stream
    max_frame_stream = MaximumFrameCache(max_frames=maximum_frames,
                                         data_stream=sort_stream,
                                         rng=shuffle_rng)

    # padding data stream
    padded_stream = Padding(data_stream=max_frame_stream,
                            mask_sources=['features', 'phonemes'])

    # floatX stream
    data_stream = ForceFloatX(padded_stream)
    return timit_dataset, data_stream
Example #33
def create_data_generator(path, vocab_file, config):
    ds = QADataset(path,
                   vocab_file,
                   config.n_entities,
                   need_sep_token=config.concat_ctx_and_question)
    it = QAIterator(path, shuffle=config.shuffle_questions)

    stream = DataStream(ds, iteration_scheme=it)

    if config.concat_ctx_and_question:
        stream = ConcatCtxAndQuestion(stream, config.concat_question_before,
                                      ds.reverse_vocab['<SEP>'])

    # Sort sets of multiple batches to make batches of similar sizes
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(
        stream.sources.index(
            'question' if config.concat_ctx_and_question else 'context'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=['context', 'question', 'candidates'],
                     mask_dtype='int32')

    def gen():

        if not config.concat_ctx_and_question:
            for (seq_cont, seq_cont_mask, seq_quest, seq_quest_mask, tg,
                 candidates, candidates_mask) in stream.get_epoch_iterator():
                seq_cont_mask = seq_cont_mask.astype('float32')
                seq_quest_mask = seq_quest_mask.astype('float32')
                candidates_mask = candidates_mask.astype('float32')

                yield (seq_cont, seq_cont_mask, seq_quest, seq_quest_mask, tg,
                       candidates, candidates_mask)
        else:

            for (seq, seq_mask, tg, candidates, candidates_mask) \
                    in stream.get_epoch_iterator():
                seq_mask = seq_mask.astype('float32')
                candidates_mask = candidates_mask.astype('float32')

                yield (seq, seq_mask, tg, candidates, candidates_mask)

    return gen
Example #34
    def valid(self, req_vars):
        valid_dataset = TaxiDataset(self.config.valid_set, 'valid.hdf5')
        train_dataset = TaxiDataset('train')
        valid_trips_ids = valid_dataset.get_data(
            None, slice(0, valid_dataset.num_examples))[
                valid_dataset.sources.index('trip_id')]

        prefix_stream = DataStream(valid_dataset,
                                   iteration_scheme=SequentialExampleScheme(
                                       valid_dataset.num_examples))
        prefix_stream = transformers.taxi_add_datetime(prefix_stream)
        prefix_stream = transformers.taxi_add_first_last_len(
            prefix_stream, self.config.n_begin_end_pts)
        prefix_stream = Batch(prefix_stream,
                              iteration_scheme=ConstantScheme(
                                  self.config.batch_size))

        candidate_stream = DataStream(train_dataset,
                                      iteration_scheme=ShuffledExampleScheme(
                                          train_dataset.num_examples))
        candidate_stream = transformers.TaxiExcludeTrips(
            candidate_stream, valid_trips_ids)
        candidate_stream = transformers.TaxiExcludeEmptyTrips(candidate_stream)
        candidate_stream = transformers.taxi_add_datetime(candidate_stream)
        candidate_stream = transformers.taxi_add_first_last_len(
            candidate_stream, self.config.n_begin_end_pts)
        candidate_stream = Batch(candidate_stream,
                                 iteration_scheme=ConstantScheme(
                                     self.config.valid_candidate_size))

        sources = prefix_stream.sources + tuple(
            'candidate_%s' % k for k in candidate_stream.sources)
        stream = Merge((prefix_stream, candidate_stream), sources)
        stream = transformers.Select(stream, tuple(req_vars))
        stream = MultiProcessing(stream)
        return stream
Example #35
    def get_datastream(dataset, batch_size=160):
        dataset = DataStream(
            dataset,
            iteration_scheme=ShuffledScheme(dataset.num_examples, batch_size),
        )
        dataset = Padding(dataset)

        # if flatten:
        # dataset = Flatten(dataset, which_sources=('features,'))

        def _transpose(data):
            return tuple(np.rollaxis(array, 1, 0) for array in data)

        dataset = Mapping(dataset, _transpose)
        return dataset
Example #36
def test_padding_data_stream():
    # 1-D sequences
    stream = Batch(
        DataStream(IterableDataset([[1], [2, 3], [], [4, 5, 6], [7]])),
        ConstantScheme(2))
    mask_stream = Padding(stream)
    assert mask_stream.sources == ("data", "data_mask")
    it = mask_stream.get_epoch_iterator()
    data, mask = next(it)
    assert (data == numpy.array([[1, 0], [2, 3]])).all()
    assert (mask == numpy.array([[1, 0], [1, 1]])).all()
    data, mask = next(it)
    assert (data == numpy.array([[0, 0, 0], [4, 5, 6]])).all()
    assert (mask == numpy.array([[0, 0, 0], [1, 1, 1]])).all()
    data, mask = next(it)
    assert (data == numpy.array([[7]])).all()
    assert (mask == numpy.array([[1]])).all()

    # 2D sequences
    stream2 = Batch(
        DataStream(IterableDataset([numpy.ones((3, 4)),
                                    2 * numpy.ones((2, 4))])),
        ConstantScheme(2))
    it = Padding(stream2).get_epoch_iterator()
    data, mask = next(it)
    assert data.shape == (2, 3, 4)
    assert (data[0, :, :] == 1).all()
    assert (data[1, :2, :] == 2).all()
    assert (mask == numpy.array([[1, 1, 1], [1, 1, 0]])).all()

    # 2 sources
    stream3 = Padding(Batch(
        DataStream(IterableDataset(dict(features=[[1], [2, 3]],
                                        targets=[[4, 5, 6], [7]]))),
        ConstantScheme(2)))
    assert len(next(stream3.get_epoch_iterator())) == 4
Example #37
def get_datastream(path, norm_path, which_set='train_si84', batch_size=1):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    data_mean_std = numpy.load(norm_path)

    iterator_scheme = ShuffledScheme(batch_size=batch_size,
                                     examples=wsj_dataset.num_examples)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    base_stream = Normalize(data_stream=base_stream,
                            means=data_mean_std['mean'],
                            stds=data_mean_std['std'])
    fs = FilterSources(data_stream=base_stream,
                       sources=['features', 'targets'])
    padded_stream = Padding(data_stream=fs)
    return padded_stream
Example #38
def setup_sorter_datastream(path, config):
    ds = SorterDataset(path)
    it = ShuffledExampleScheme(examples=config.example_count)
    stream = DataStream(ds, iteration_scheme=it)
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('unsorted'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)
    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=['answer', 'unsorted'],
                     mask_dtype='int32')
    return ds, stream
Example #39
    def setUp(self):
        rng = numpy.random.RandomState(123)
        self.stream = DataStream(IndexableDataset(OrderedDict([
            ('features', rng.rand(4, 2, 2)),
            ('targets', numpy.array([0, 1, 0, 1]))
        ]),
                                                  axis_labels={
                                                      'features':
                                                      ('batch', 'width',
                                                       'height'),
                                                      'targets': ('batch', )
                                                  }),
                                 iteration_scheme=SequentialScheme(4, 2))

        self.duplicate = Duplicate(self.stream, 'features')
Example #40
 def setUp(self):
     self.data = OrderedDict([('features', numpy.ones((4, 2, 2))),
                              ('targets',
                               numpy.array([[0, 1, 2], [1, 0, 1], [1, 1, 1],
                                            [2, 0, 0]]))])
     self.neg_data = OrderedDict([('features', numpy.ones((4, 2, 2))),
                                  ('targets',
                                   numpy.array([[0, -1, 2], [1, 0, -3],
                                                [1, 1, 1], [2, 0, 0]]))])
     self.num_classes = (3, 2, 3)
     self.stream_example = StructuredOneHotEncoding(
         DataStream(IndexableDataset(self.data),
                    iteration_scheme=SequentialExampleScheme(4)),
         num_classes=self.num_classes,
         which_sources=('targets', ))
Example #41
def _get_sgnmt_dev_stream(val_set=None,
                          src_vocab=None,
                          src_vocab_size=30000,
                          **kwargs):
    """Setup development set stream if necessary.
    
    The arguments to this method are given by the configuration dict.
    """
    dev_stream = None
    if val_set is not None:
        src_vocab = _add_special_ids({str(i) : i 
                                        for i in xrange(src_vocab_size)})
        dev_dataset = TextFile([val_set], src_vocab, None)
        dev_stream = DataStream(dev_dataset)
    return dev_stream
Example #42
def get_test_stream(src_vocab,
                    trg_vocab,
                    src_data,
                    trg_data=None,
                    src_vocab_size=30000,
                    unk_id=1,
                    seq_len=50,
                    batch_size=80,
                    sort_k_batches=12,
                    **kwargs):
    """Prepares the test data stream (=no batches or gold labels)."""

    print('streaming...')
    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(src_vocab if isinstance(
        src_vocab, dict) else cPickle.load(open(src_vocab)),
                                       bos_idx=0,
                                       eos_idx=2,
                                       unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(trg_vocab if isinstance(
        trg_vocab, dict) else cPickle.load(open(trg_vocab)),
                                       bos_idx=0,
                                       eos_idx=2,
                                       unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, preprocess=get_unicode)
    print(src_data)
    #exit()
    trg_dataset = TextFile([trg_data], trg_vocab, preprocess=get_unicode)

    #stream=DataStream(src_dataset)
    stream = Merge([DataStream(src_dataset),
                    DataStream(trg_dataset)], ('source', 'target'))

    return stream
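Merge simply zips several streams under new source names; a toy sketch of the call used above:

from fuel.datasets import IterableDataset
from fuel.streams import DataStream
from fuel.transformers import Merge

src = DataStream(IterableDataset([[1, 2], [3]]))
trg = DataStream(IterableDataset([[4], [5, 6]]))
merged = Merge([src, trg], ('source', 'target'))
print(merged.sources)                     # ('source', 'target')
print(list(merged.get_epoch_iterator()))  # [([1, 2], [4]), ([3], [5, 6])]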
Example #43
def get_dataset_iterator(dataset,
                         split,
                         include_features=True,
                         include_targets=False,
                         unit_scale=True,
                         label_transforms=False,
                         return_length=False):
    """Get iterator for dataset, split, targets (labels) and scaling (from 255 to 1.0)"""
    sources = []
    sources = sources + ['features'] if include_features else sources
    sources = sources + ['targets'] if include_targets else sources
    if split == "all":
        splits = ('train', 'valid', 'test')
    elif split == "nontrain":
        splits = ('valid', 'test')
    else:
        splits = (split, )

    dataset_fname = find_in_data_path("{}.hdf5".format(dataset))
    h5_dataset = H5PYDataset(dataset_fname, which_sets=splits, sources=sources)
    if unit_scale:
        h5_dataset.default_transformers = uint8_pixels_to_floatX(
            ('features', ))

    datastream = DataStream.default_stream(
        dataset=h5_dataset,
        iteration_scheme=SequentialExampleScheme(h5_dataset.num_examples))

    if label_transforms:
        # TODO: maybe refactor this common bit with get_custom_streams below
        datastream = AddLabelUncertainty(datastream,
                                         chance=0,
                                         which_sources=('targets', ))

        datastream = RandomLabelStrip(datastream,
                                      chance=0,
                                      which_sources=('targets', ))

        # HACK: allow variable stretch
        datastream = StretchLabels(datastream,
                                   length=128,
                                   which_sources=('targets', ))

    it = datastream.get_epoch_iterator()
    if return_length:
        return it, h5_dataset.num_examples
    else:
        return it
Example #44

def get_dev_stream(val_set=None,
                   src_vocab=None,
                   src_vocab_size=30000,
                   unk_id=1,
                   **kwargs):
    """Setup development set stream if necessary."""
    dev_stream = None
    if val_set is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(src_vocab if isinstance(
            src_vocab, dict) else cPickle.load(open(src_vocab)),
                                           bos_idx=0,
                                           eos_idx=src_vocab_size - 1,
                                           unk_idx=unk_id)
        dev_dataset = TextFile([val_set], src_vocab, None)
        dev_stream = DataStream(dev_dataset)
    return dev_stream
Example #45
def create_ivector_test_datastream(path, which_set, batch_size=1, delay=0):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    iterator_scheme = SequentialScheme(batch_size=batch_size,
                                       examples=wsj_dataset.num_examples)

    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)

    fs = FilterSources(data_stream=base_stream,
                       sources=['features', 'ivectors', 'targets'])

    if delay:
        fs = DelayTransformer(fs, delay)

    fs = FilterSources(data_stream=fs, sources=['features', 'ivectors'])
    return Padding(fs)
Example #46
    def candidate_stream(self, n_candidates):
        candidate_stream = DataStream(self.train_dataset,
                                      iteration_scheme=ShuffledExampleScheme(
                                          self.train_dataset.num_examples))
        if not data.tvt:
            candidate_stream = transformers.TaxiExcludeTrips(
                candidate_stream, self.valid_trips_ids)
        candidate_stream = transformers.TaxiExcludeEmptyTrips(candidate_stream)
        candidate_stream = transformers.taxi_add_datetime(candidate_stream)
        candidate_stream = transformers.taxi_add_first_last_len(
            candidate_stream, self.config.n_begin_end_pts)
        if not data.tvt:
            candidate_stream = transformers.add_destination(candidate_stream)

        return Batch(candidate_stream,
                     iteration_scheme=ConstantScheme(n_candidates))
Example #47
def get_stream(input_file, vocab_file, **kwards):
    unk_token = kwards.pop('unk_token')
    eos_token = kwards.pop('eos_token')

    dataset = TextFile(files=[input_file],
                       encoding='UTF-8',
                       preprocess=to_lower_case,
                       dictionary=pkl.load(open(vocab_file, 'rb')),
                       level='word',
                       unk_token=unk_token,
                       bos_token=None,
                       eos_token=eos_token)

    stream = DataStream(dataset)

    return stream
Example #48
def test_sources_selection():
    features = [5, 6, 7, 1]
    targets = [1, 0, 1, 1]
    stream = DataStream(IterableDataset(OrderedDict(
        [('features', features), ('targets', targets)])))
    assert list(stream.get_epoch_iterator()) == list(zip(features, targets))

    stream = DataStream(IterableDataset(
        {'features': features, 'targets': targets},
        sources=('targets',)))
    assert list(stream.get_epoch_iterator()) == list(zip(targets))
Example #49
def define_stream(which_sets=('train',),
                initial_scale=1,
                scale=0.5,
                batch_size=64,
                seq_length=64,
                frame_size=128,
                tbptt_flag = True,
                num_examples=None):

    def _segment_axis(data):
        # Defined inside so that frame_size is available
        x = tuple([numpy.array([segment_axis(x, frame_size, 0) for x in var])
                   for var in data])
        return x

    scale = float(scale)

    dataset = Blizzard(which_sets=which_sets)

    if num_examples is None:
        num_examples = batch_size*(dataset.num_examples/batch_size)

    data_stream = DataStream.default_stream(
            dataset,
            iteration_scheme=SequentialScheme(num_examples, batch_size))

    data_stream = ScaleAndShift(data_stream,
                                scale=1/data_std,
                                shift=-data_mean/float(data_std))

    # Original sampling rate
    data_stream = Resample(data_stream, scale=initial_scale)
    data_stream = Mapping(data_stream, _copy, add_sources=('upsampled',))
    data_stream = Resample(data_stream, scale=scale, which_sources=('upsampled',))
    data_stream = Resample(data_stream, scale=1/scale, which_sources=('upsampled',))

    # data_stream = Mapping(data_stream, _downsample_and_upsample,
    #                       add_sources=('upsampled',))
    data_stream = Mapping(data_stream, _equalize_size)
    data_stream = Mapping(data_stream, _get_residual,
                          add_sources=('residual',))
    data_stream = FilterSources(data_stream,
                                sources=('upsampled', 'residual',))
    data_stream = Mapping(data_stream, _segment_axis)
    data_stream = Mapping(data_stream, _transpose)
    return data_stream
Example #50
def test_data_stream_mapping_sort_multisource_ndarrays():
    data = OrderedDict()
    data['x'] = [numpy.array([1, 2, 3]),
                 numpy.array([2, 3, 1]),
                 numpy.array([3, 2, 1])]
    data['y'] = [numpy.array([6, 5, 4]),
                 numpy.array([6, 5, 4]),
                 numpy.array([6, 5, 4])]
    data_sorted = [(numpy.array([1, 2, 3]), numpy.array([6, 5, 4])),
                   (numpy.array([1, 2, 3]), numpy.array([4, 6, 5])),
                   (numpy.array([1, 2, 3]), numpy.array([4, 5, 6]))]
    stream = DataStream(IterableDataset(data))
    wrapper = Mapping(stream, mapping=SortMapping(operator.itemgetter(0)))
    for output, ground_truth in zip(wrapper.get_epoch_iterator(), data_sorted):
        assert len(output) == len(ground_truth)
        assert (output[0] == ground_truth[0]).all()
        assert (output[1] == ground_truth[1]).all()
Example #51

    def create_data(data):

        stream = DataStream(data, iteration_scheme=ShuffledScheme(data.num_examples, batch_size))

        # Data Augmentation
        stream = MinimumImageDimensions(stream, image_size, which_sources=('image_features',))
        stream = MaximumImageDimensions(stream, image_size, which_sources=('image_features',))
        stream = RandomHorizontalSwap(stream, which_sources=('image_features',))
        stream = Random2DRotation(stream, which_sources=('image_features',))
        #stream = ScikitResize(stream, image_size, which_sources=('image_features',))

        # Data Preprocessing

        # Data Transformation
        stream = ScaleAndShift(stream, 1./255, 0, which_sources=('image_features',))
        stream = Cast(stream, dtype='float32', which_sources=('image_features',))
        return stream
Example #52
    def get_stream(self, part, batch_size=None, max_length=None, seed=None):
        dataset = self.get_dataset(part, max_length)
        if self._layout == 'lambada' and part == 'train':
            stream = DataStream(dataset,
                                iteration_scheme=RandomSpanScheme(
                                    dataset.num_examples, max_length, seed))
            stream = Mapping(stream, listify)
        else:
            stream = dataset.get_example_stream()

        stream = SourcewiseMapping(stream,
                                   functools.partial(add_bos, Vocabulary.BOS))
        stream = SourcewiseMapping(stream, vectorize)
        if not batch_size:
            return stream
        stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
        stream = Padding(stream)
        return stream
Example #53
def get_tst_stream(val_set=None,
                   src_vocab=None,
                   src_vocab_size=30000,
                   unk_id=1,
                   **kwargs):

    tst_stream = None
    if val_set is not None and src_vocab is not None:
        # Load dictionaries and ensure special tokens exist
        src_vocab = ensure_special_tokens(src_vocab if isinstance(
            src_vocab, dict) else cPickle.load(open(src_vocab)),
                                          bos_idx=0,
                                          eos_idx=src_vocab_size - 1,
                                          unk_idx=unk_id)

        tst_dataset = TextFile([val_set], src_vocab, None)
        tst_stream = DataStream(tst_dataset)
    return tst_stream
Example #54
 def get_stream(self,
                which_set,
                shuffle=True,
                monitor=False,
                num_examples=None,
                center=True):
     scheme_klass = ShuffledScheme if shuffle else SequentialScheme
     if num_examples is None:
         num_examples = self.get_stream_num_examples(which_set,
                                                     monitor=monitor)
     scheme = scheme_klass(num_examples, self.batch_size)
     stream = DataStream.default_stream(dataset=self.datasets[which_set],
                                        iteration_scheme=scheme)
     stream = self.apply_default_transformers(stream)
     stream = Canonicalize(stream, mapping=self.preprocess)
     if center:
         stream = transformers.Mapping(stream, mapping=self.center)
     return stream
Example #55
def stream_handwriting(which_sets, batch_size, seq_size, tbptt=True):
    dataset = Handwriting(which_sets)
    data_stream = DataStream.default_stream(
        dataset,
        iteration_scheme=ShuffledScheme(
            batch_size * (dataset.num_examples / batch_size), batch_size))
    data_stream = FilterSources(data_stream, sources=('features', ))
    data_stream = Padding(data_stream)
    data_stream = Mapping(data_stream, _transpose)

    if tbptt:
        data_stream = SegmentSequence(data_stream,
                                      add_flag=True,
                                      seq_size=seq_size)

    data_stream = ForceFloatX(data_stream)

    return data_stream
Example #56
    def start_server(port, which_set):
        fuel.server.logger.setLevel('WARN')
        dataset = IMDBText(which_set, sorted=True)

        n_train = dataset.num_examples
        #scheme = ShuffledScheme(examples=n_train, batch_size=batch_size)
        scheme = BatchwiseShuffledScheme(examples=n_train, batch_size=batch_size)

        stream = DataStream(
                dataset=dataset,
                iteration_scheme=scheme)
        print "loading glove"
        glove = GloveTransformer(glove_version, data_stream=stream)
        padded = Padding(
                data_stream=glove,
                mask_sources=('features',)
                )

        fuel.server.start_server(padded, port=port, hwm=20)
Example #57
def get_comb_stream(fea2obj, which_set, batch_size=None, shuffle=True):
    streams = []
    for fea in fea2obj:
        obj = fea2obj[fea]
        dataset = H5PYDataset(obj.fuelfile, which_sets=(which_set,),load_in_memory=True)
        if batch_size is None: batch_size = dataset.num_examples
        if shuffle: 
            iterschema = ShuffledScheme(examples=dataset.num_examples, batch_size=batch_size)
        else: 
            iterschema = SequentialScheme(examples=dataset.num_examples, batch_size=batch_size)
        stream = DataStream(dataset=dataset, iteration_scheme=iterschema)
        if fea in seq_features:
            stream = CutInput(stream, obj.max_len)
            if obj.rec:
                logger.info('transforming data for recursive input')
                # Required because recurrent bricks receive input as [sequence, batch, features]
                stream = LettersTransposer(stream, which_sources=fea)
        streams.append(stream)
    stream = Merge(streams, tuple(fea2obj.keys()))
    return stream, dataset.num_examples
Example #58
def get_dev_streams(config):
    """Setup development set stream if necessary."""
    dev_streams = {}
    for cg in config['cgs']:
        if 'val_sets' in config and cg in config['val_sets']:
            logger.info('Building development stream for cg:[{}]'.format(cg))
            eid = p_(cg)[0]
            dev_file = config['val_sets'][cg]

            # Get dictionary and fix EOS
            dictionary = cPickle.load(open(config['src_vocabs'][eid]))
            dictionary['<S>'] = 0
            dictionary['<UNK>'] = config['unk_id']
            dictionary['</S>'] = config['src_eos_idxs'][eid]

            # Get as a text file and convert it into a stream
            dev_dataset = TextFile([dev_file], dictionary, None)
            dev_streams[cg] = DataStream(dev_dataset)
    return dev_streams
Example #59
 def create_data(data):
     stream = DataStream.default_stream(data,
                                        iteration_scheme=ShuffledScheme(
                                            data.num_examples, batch_size))
     stream_downscale = MinimumImageDimensions(
         stream, image_size, which_sources=('image_features', ))
     #stream_rotate = Random2DRotation(stream_downscale, which_sources=('image_features',))
     stream_max = ScikitResize(stream_downscale,
                               image_size,
                               which_sources=('image_features', ))
     stream_scale = ScaleAndShift(stream_max,
                                  1. / 255,
                                  0,
                                  which_sources=('image_features', ))
     stream_cast = Cast(stream_scale,
                        dtype='float32',
                        which_sources=('image_features', ))
     #stream_flat = Flatten(stream_scale, which_sources=('image_features',))
     return stream_cast