def test_dataset_evaluators():
    # TestBrick is defined elsewhere in the test suite; it attaches the
    # auxiliary monitoring variables checked below to its application.
    X = theano.tensor.matrix('X')
    brick = TestBrick(name='test_brick')
    Y = brick.apply(X)
    graph = ComputationGraph([Y])
    monitor_variables = [v for v in graph.auxiliary_variables]
    validator = DatasetEvaluator(monitor_variables)

    data = [numpy.arange(1, 5, dtype=floatX).reshape(2, 2),
            numpy.arange(10, 16, dtype=floatX).reshape(3, 2)]
    data_stream = ContainerDataset(dict(X=data)).get_default_stream()

    values = validator.evaluate(data_stream)
    assert values['test_brick_apply_V_squared'] == 4
    numpy.testing.assert_allclose(
        values['test_brick_apply_mean_row_mean'], numpy.vstack(data).mean())
    per_batch_mean = numpy.mean([batch.mean() for batch in data])
    numpy.testing.assert_allclose(
        values['test_brick_apply_mean_batch_element'], per_batch_mean)

    with assert_raises(Exception) as ar:
        data_stream = ContainerDataset(dict(X2=data)).get_default_stream()
        validator.evaluate(data_stream)
    assert "Not all data sources" in ar.exception.args[0]
def test_sources_selection():
    features = [5, 6, 7, 1]
    targets = [1, 0, 1, 1]
    stream = ContainerDataset(OrderedDict(
        [('features', features), ('targets', targets)])).get_default_stream()
    assert list(stream.get_epoch_iterator()) == list(zip(features, targets))

    stream = ContainerDataset({'features': features, 'targets': targets},
                              sources=('targets',)).get_default_stream()
    assert list(stream.get_epoch_iterator()) == list(zip(targets))
def test_floatx():
    x = [numpy.array(d, dtype="float64") for d in [[1, 2], [3, 4]]]
    y = [numpy.array(d, dtype="int64") for d in [1, 2, 3]]
    dataset = ContainerDataset(OrderedDict([("x", x), ("y", y)]))
    data = next(ForceFloatX(dataset.get_default_stream())
                .get_epoch_iterator())
    assert str(data[0].dtype) == floatX
    assert str(data[1].dtype) == "int64"
def do_test(with_serialization):
    data_stream = ContainerDataset(range(10)).get_default_stream()
    main_loop = MainLoop(None, data_stream, MockAlgorithm(),
                         extensions=[FinishAfter(after_n_batches=14)])
    main_loop.run()
    assert main_loop.log.status.iterations_done == 14

    if with_serialization:
        string_io = BytesIO()
        dill.dump(main_loop, string_io, fmode=dill.CONTENTS_FMODE)
        string_io.seek(0)
        main_loop = dill.load(string_io)

    finish_after = unpack(
        [ext for ext in main_loop.extensions
         if isinstance(ext, FinishAfter)], singleton=True)
    finish_after.add_condition(
        "after_batch",
        predicate=lambda log: log.status.iterations_done == 27)
    main_loop.run()
    assert main_loop.log.status.iterations_done == 27
    assert main_loop.log.status.epochs_done == 2
    for i in range(27):
        assert main_loop.log[i].batch == {"data": i % 10}
def get_data_stream(iterable):
    dataset = ContainerDataset({'numbers': iterable})
    data_stream = DataStreamMapping(dataset.get_default_stream(),
                                    _data_sqrt, add_sources=('roots',))
    data_stream = DataStreamMapping(data_stream, _array_tuple)
    return BatchDataStream(data_stream, ConstantScheme(20))
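# The helpers `_data_sqrt` and `_array_tuple` referenced above are not shown
# in this excerpt. A minimal sketch of what they are assumed to do, mirroring
# the inline lambdas used in the other `get_data_stream` variant further
# below (module-level functions would keep the stream picklable):
def _data_sqrt(data):
    # Map the single 'numbers' source to its square root.
    return (math.sqrt(data[0]),)


def _array_tuple(data):
    # Cast every source to a floatX numpy array.
    return tuple(numpy.asarray(d, dtype=floatX) for d in data)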
def test_shared_variable_modifier_two_params():
    weights = numpy.array([-1, 1], dtype=floatX)
    features = [numpy.array(f, dtype=floatX)
                for f in [[1, 2], [3, 4], [5, 6]]]
    targets = [(weights * f).sum() for f in features]
    n_batches = 3
    dataset = ContainerDataset(dict(features=features, targets=targets))

    x = tensor.vector('features')
    y = tensor.scalar('targets')
    W = shared_floatx([0, 0], name='W')
    cost = ((x * W).sum() - y) ** 2
    cost.name = 'cost'

    step_rule = Scale(0.001)
    sgd = GradientDescent(cost=cost, params=[W], step_rule=step_rule)
    modifier = SharedVariableModifier(
        step_rule.learning_rate,
        lambda _, val: numpy.cast[floatX](val * 0.2))
    main_loop = MainLoop(
        model=None, data_stream=dataset.get_default_stream(),
        algorithm=sgd,
        extensions=[FinishAfter(after_n_epochs=1), modifier])

    main_loop.run()

    new_value = step_rule.learning_rate.get_value()
    assert_allclose(new_value, 0.001 * 0.2 ** n_batches, atol=1e-5)
def test_cache():
    dataset = ContainerDataset(range(100))
    stream = DataStream(dataset)
    batched_stream = BatchDataStream(stream, ConstantScheme(11))
    cached_stream = CachedDataStream(batched_stream, ConstantScheme(7))
    epoch = cached_stream.get_epoch_iterator()

    # Make sure that the cache is filled as expected
    for (features,), cache_size in zip(
            epoch, [4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 0, 4]):
        assert len(cached_stream.cache[0]) == cache_size

    # Make sure that the epoch finishes correctly
    for (features,) in cached_stream.get_epoch_iterator():
        pass
    assert len(features) == 100 % 7
    assert not cached_stream.cache[0]

    # Ensure that the epoch transition is correct
    cached_stream = CachedDataStream(batched_stream,
                                     ConstantScheme(7, times=3))
    for _, epoch in zip(range(2), cached_stream.iterate_epochs()):
        cache_sizes = [4, 8, 1]
        for i, (features,) in enumerate(epoch):
            assert len(cached_stream.cache[0]) == cache_sizes[i]
            assert len(features) == 7
            assert numpy.all(range(100)[i * 7:(i + 1) * 7] == features)
        assert i == 2
def do_test(with_serialization):
    data_stream = ContainerDataset(range(10)).get_default_stream()
    main_loop = MainLoop(
        MockAlgorithm(), data_stream,
        extensions=[WriteBatchExtension(),
                    FinishAfter(after_n_batches=14)])
    main_loop.run()
    assert main_loop.log.status.iterations_done == 14

    if with_serialization:
        main_loop = cPickle.loads(cPickle.dumps(main_loop))

    finish_after = unpack(
        [ext for ext in main_loop.extensions
         if isinstance(ext, FinishAfter)], singleton=True)
    finish_after.add_condition(
        "after_batch",
        predicate=lambda log: log.status.iterations_done == 27)
    main_loop.run()
    assert main_loop.log.status.iterations_done == 27
    assert main_loop.log.status.epochs_done == 2
    for i in range(27):
        assert main_loop.log[i + 1].batch == {"data": i % 10}
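# `MockAlgorithm` and `WriteBatchExtension` are defined elsewhere in the test
# suite; they are not shown in this excerpt. A rough, illustrative sketch of
# the behaviour the assertions above rely on, assuming the usual
# `TrainingAlgorithm` interface (`initialize`/`process_batch`) and that an
# extension may write to the current log row (as TrueCostExtension does
# below); these are not the actual definitions:
class MockAlgorithm(TrainingAlgorithm):
    def initialize(self):
        pass

    def process_batch(self, batch):
        # A no-op training step is enough for these main-loop tests.
        pass


class WriteBatchExtension(TrainingExtension):
    def after_batch(self, batch):
        # Record the batch so the test can check it at log row i + 1.
        self.main_loop.log.current_row.batch = batch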
def get_data_stream(iterable):
    dataset = ContainerDataset({'numbers': iterable})
    data_stream = DataStreamMapping(dataset.get_default_stream(),
                                    lambda data: (math.sqrt(data[0]),),
                                    add_sources=('roots',))
    data_stream = DataStreamMapping(
        data_stream,
        lambda data: tuple(numpy.asarray(d, dtype=floatX) for d in data))
    return BatchDataStream(data_stream, ConstantScheme(20))
def test_training_data_monitoring():
    weights = numpy.array([-1, 1], dtype=floatX)
    features = [numpy.array(f, dtype=floatX)
                for f in [[1, 2], [3, 4], [5, 6]]]
    targets = [(weights * f).sum() for f in features]
    n_batches = 3
    dataset = ContainerDataset(dict(features=features, targets=targets))

    x = tensor.vector('features')
    y = tensor.scalar('targets')
    W = shared_floatx([0, 0], name='W')
    V = shared_floatx(7, name='V')
    W_sum = named_copy(W.sum(), 'W_sum')
    cost = ((x * W).sum() - y) ** 2
    cost.name = 'cost'

    class TrueCostExtension(TrainingExtension):

        def before_batch(self, data):
            self.main_loop.log.current_row.true_cost = (
                ((W.get_value() * data["features"]).sum() -
                 data["targets"]) ** 2)

    main_loop = MainLoop(
        model=None, data_stream=dataset.get_default_stream(),
        algorithm=GradientDescent(cost=cost, params=[W],
                                  step_rule=Scale(0.001)),
        extensions=[
            FinishAfter(after_n_epochs=1),
            TrainingDataMonitoring([W_sum, cost, V], prefix="train1",
                                   after_every_batch=True),
            TrainingDataMonitoring([aggregation.mean(W_sum), cost],
                                   prefix="train2", after_every_epoch=True),
            TrueCostExtension()])

    main_loop.run()

    # Check monitoring of a shared variable
    assert_allclose(main_loop.log.current_row.train1_V, 7.0)

    for i in range(n_batches):
        # The ground truth is written to the log before the batch is
        # processed, whereas the extension writes after the batch is
        # processed. This is why the iteration numbers differ here.
        assert_allclose(main_loop.log[i].true_cost,
                        main_loop.log[i + 1].train1_cost)
    assert_allclose(
        main_loop.log[n_batches].train2_cost,
        sum([main_loop.log[i].true_cost
             for i in range(n_batches)]) / n_batches)
    assert_allclose(
        main_loop.log[n_batches].train2_W_sum,
        sum([main_loop.log[i].train1_W_sum
             for i in range(1, n_batches + 1)]) / n_batches)
def test_batch_data_stream():
    stream = ContainerDataset([1, 2, 3, 4, 5]).get_default_stream()
    batches = list(BatchDataStream(stream, ConstantScheme(2))
                   .get_epoch_iterator())
    expected = [(numpy.array([1, 2]),),
                (numpy.array([3, 4]),),
                (numpy.array([5]),)]
    assert len(batches) == len(expected)
    for b, e in zip(batches, expected):
        assert (b[0] == e[0]).all()

    # Check the `strict` flag
    def try_strict():
        list(BatchDataStream(stream, ConstantScheme(2), strict=True)
             .get_epoch_iterator())
    assert_raises(ValueError, try_strict)
    stream2 = ContainerDataset([1, 2, 3, 4, 5, 6]).get_default_stream()
    assert len(list(BatchDataStream(stream2, ConstantScheme(2), strict=True)
                    .get_epoch_iterator())) == 3
def test_data_stream_mapping():
    data = [1, 2, 3]
    data_doubled = [2, 4, 6]
    stream = ContainerDataset(data).get_default_stream()
    wrapper1 = DataStreamMapping(stream, lambda d: (2 * d[0],))
    assert list(wrapper1.get_epoch_iterator()) == list(zip(data_doubled))
    wrapper2 = DataStreamMapping(stream, lambda d: (2 * d[0],),
                                 add_sources=("doubled",))
    assert wrapper2.sources == ("data", "doubled")
    assert list(wrapper2.get_epoch_iterator()) == list(zip(data, data_doubled))
def test_data_stream_mapping_sort_multisource():
    data = OrderedDict()
    data['x'] = [[1, 2, 3], [2, 3, 1], [3, 2, 1]]
    data['y'] = [[6, 5, 4], [6, 5, 4], [6, 5, 4]]
    data_sorted = [([1, 2, 3], [6, 5, 4]),
                   ([1, 2, 3], [4, 6, 5]),
                   ([1, 2, 3], [4, 5, 6])]
    stream = ContainerDataset(data).get_default_stream()
    wrapper = DataStreamMapping(
        stream, mapping=SortMapping(operator.itemgetter(0)))
    assert list(wrapper.get_epoch_iterator()) == data_sorted
def test_dataset():
    data = [1, 2, 3]

    # The default stream requests an example at a time
    stream = ContainerDataset(data).get_default_stream()
    epoch = stream.get_epoch_iterator()
    assert list(epoch) == list(zip(data))

    # Check if iterating over multiple epochs works
    for i, epoch in zip(range(2), stream.iterate_epochs()):
        assert list(epoch) == list(zip(data))

    # Check whether returning the data as a dictionary of sources works
    assert next(stream.get_epoch_iterator(as_dict=True)) == {"data": 1}
def test_padding_data_stream():
    # 1-D sequences
    stream = BatchDataStream(
        ContainerDataset([[1], [2, 3], [], [4, 5, 6], [7]])
        .get_default_stream(),
        ConstantScheme(2))
    mask_stream = PaddingDataStream(stream)
    assert mask_stream.sources == ("data", "data_mask")
    it = mask_stream.get_epoch_iterator()
    data, mask = next(it)
    assert (data == numpy.array([[1, 0], [2, 3]])).all()
    assert (mask == numpy.array([[1, 0], [1, 1]])).all()
    data, mask = next(it)
    assert (data == numpy.array([[0, 0, 0], [4, 5, 6]])).all()
    assert (mask == numpy.array([[0, 0, 0], [1, 1, 1]])).all()
    data, mask = next(it)
    assert (data == numpy.array([[7]])).all()
    assert (mask == numpy.array([[1]])).all()

    # 2-D sequences
    stream2 = BatchDataStream(
        ContainerDataset([numpy.ones((3, 4)), 2 * numpy.ones((2, 4))])
        .get_default_stream(),
        ConstantScheme(2))
    it = PaddingDataStream(stream2).get_epoch_iterator()
    data, mask = next(it)
    assert data.shape == (2, 3, 4)
    assert (data[0, :, :] == 1).all()
    assert (data[1, :2, :] == 2).all()
    assert (mask == numpy.array([[1, 1, 1], [1, 1, 0]])).all()

    # Two sources
    stream3 = PaddingDataStream(BatchDataStream(
        ContainerDataset(dict(features=[[1], [2, 3], []],
                              targets=[[4, 5, 6], [7]]))
        .get_default_stream(),
        ConstantScheme(2)))
    assert len(next(stream3.get_epoch_iterator())) == 4
def test_data_stream_mapping_sort():
    data = [[1, 2, 3], [2, 3, 1], [3, 2, 1]]
    data_sorted = [[1, 2, 3]] * 3
    data_sorted_rev = [[3, 2, 1]] * 3
    stream = ContainerDataset(data).get_default_stream()
    wrapper1 = DataStreamMapping(
        stream, mapping=SortMapping(operator.itemgetter(0)))
    assert list(wrapper1.get_epoch_iterator()) == list(zip(data_sorted))
    wrapper2 = DataStreamMapping(stream, SortMapping(lambda x: -x[0]))
    assert list(wrapper2.get_epoch_iterator()) == list(zip(data_sorted_rev))
    wrapper3 = DataStreamMapping(
        stream, SortMapping(operator.itemgetter(0), reverse=True))
    assert list(wrapper3.get_epoch_iterator()) == list(zip(data_sorted_rev))
def test_data_stream_mapping_sort_multisource_ndarrays():
    data = OrderedDict()
    data['x'] = [numpy.array([1, 2, 3]),
                 numpy.array([2, 3, 1]),
                 numpy.array([3, 2, 1])]
    data['y'] = [numpy.array([6, 5, 4]),
                 numpy.array([6, 5, 4]),
                 numpy.array([6, 5, 4])]
    data_sorted = [(numpy.array([1, 2, 3]), numpy.array([6, 5, 4])),
                   (numpy.array([1, 2, 3]), numpy.array([4, 6, 5])),
                   (numpy.array([1, 2, 3]), numpy.array([4, 5, 6]))]
    stream = ContainerDataset(data).get_default_stream()
    wrapper = DataStreamMapping(
        stream, mapping=SortMapping(operator.itemgetter(0)))
    for output, ground_truth in zip(wrapper.get_epoch_iterator(), data_sorted):
        assert len(output) == len(ground_truth)
        assert (output[0] == ground_truth[0]).all()
        assert (output[1] == ground_truth[1]).all()
def test_dataset():
    data = [1, 2, 3]
    data_doubled = [2, 4, 6]

    # The default stream requests an example at a time
    stream = ContainerDataset([1, 2, 3]).get_default_stream()
    epoch = stream.get_epoch_iterator()
    assert list(epoch) == list(zip(data))

    # Check if iterating over multiple epochs works
    for i, epoch in zip(range(2), stream.iterate_epochs()):
        assert list(epoch) == list(zip(data))
    for i, epoch in enumerate(stream.iterate_epochs()):
        assert list(epoch) == list(zip(data))
        if i == 1:
            break

    # Check whether returning the data as a dictionary of sources works
    assert next(stream.get_epoch_iterator(as_dict=True)) == {"data": 1}

    # Check whether basic stream wrappers work
    wrapper = DataStreamMapping(stream, lambda d: (2 * d[0],))
    assert list(wrapper.get_epoch_iterator()) == list(zip(data_doubled))
def setup_mainloop(extension):
    """Create a MainLoop, register the given extension, and supply it with a
    DataStream and a minimal model/cost to optimize.

    """
    features = [numpy.array(f, dtype=floatX)
                for f in [[1, 2], [3, 4], [5, 6]]]
    dataset = ContainerDataset(dict(features=features))

    W = shared_floatx([0, 0], name='W')
    x = tensor.vector('features')
    cost = tensor.sum((x - W) ** 2)
    cost.name = "cost"

    algorithm = GradientDescent(cost=cost, params=[W],
                                step_rule=Scale(1e-3))

    main_loop = MainLoop(
        model=None, data_stream=dataset.get_default_stream(),
        algorithm=algorithm,
        extensions=[
            FinishAfter(after_n_epochs=1),
            extension])
    return main_loop
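# A sketch of how the helper above would typically be used in a test; the
# name `test_extension_example` is only a placeholder, and FinishAfter stands
# in for whatever extension is actually under test:
def test_extension_example():
    extension = FinishAfter(after_n_batches=2)
    main_loop = setup_mainloop(extension)
    main_loop.run()
    # The extra FinishAfter condition stops training after two batches.
    assert main_loop.log.status.iterations_done == 2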
def test_data_stream_filter():
    data = [1, 2, 3]
    data_filtered = [1, 3]
    stream = ContainerDataset(data).get_default_stream()
    wrapper = DataStreamFilter(stream, lambda d: d[0] % 2 == 1)
    assert list(wrapper.get_epoch_iterator()) == list(zip(data_filtered))