Example #1
def test_dataset_evaluators():
    X = theano.tensor.matrix('X')
    brick = TestBrick(name='test_brick')
    Y = brick.apply(X)
    graph = ComputationGraph([Y])
    monitor_variables = [v for v in graph.auxiliary_variables]
    validator = DatasetEvaluator(monitor_variables)

    data = [
        numpy.arange(1, 5, dtype=floatX).reshape(2, 2),
        numpy.arange(10, 16, dtype=floatX).reshape(3, 2)
    ]
    data_stream = ContainerDataset(dict(X=data)).get_default_stream()

    values = validator.evaluate(data_stream)
    assert values['test_brick_apply_V_squared'] == 4
    numpy.testing.assert_allclose(values['test_brick_apply_mean_row_mean'],
                                  numpy.vstack(data).mean())
    per_batch_mean = numpy.mean([batch.mean() for batch in data])
    numpy.testing.assert_allclose(
        values['test_brick_apply_mean_batch_element'], per_batch_mean)

    with assert_raises(Exception) as ar:
        data_stream = ContainerDataset(dict(X2=data)).get_default_stream()
        validator.evaluate(data_stream)
    assert "Not all data sources" in ar.exception.args[0]
Example #2
def test_sources_selection():
    features = [5, 6, 7, 1]
    targets = [1, 0, 1, 1]
    stream = ContainerDataset(OrderedDict(
        [('features', features), ('targets', targets)])).get_default_stream()
    assert list(stream.get_epoch_iterator()) == list(zip(features, targets))

    stream = ContainerDataset({'features': features, 'targets': targets},
                              sources=('targets',)).get_default_stream()
    assert list(stream.get_epoch_iterator()) == list(zip(targets))
Example #3
def test_floatx():
    x = [numpy.array(d, dtype="float64") for d in [[1, 2], [3, 4]]]
    y = [numpy.array(d, dtype="int64") for d in [1, 2, 3]]
    dataset = ContainerDataset(OrderedDict([("x", x), ("y", y)]))
    data = next(ForceFloatX(dataset.get_default_stream()).get_epoch_iterator())
    assert str(data[0].dtype) == floatX
    assert str(data[1].dtype) == "int64"
Example #4
    def do_test(with_serialization):
        data_stream = ContainerDataset(range(10)).get_default_stream()
        main_loop = MainLoop(None,
                             data_stream,
                             MockAlgorithm(),
                             extensions=[FinishAfter(after_n_batches=14)])
        main_loop.run()
        assert main_loop.log.status.iterations_done == 14

        if with_serialization:
            string_io = BytesIO()
            dill.dump(main_loop, string_io, fmode=dill.CONTENTS_FMODE)
            string_io.seek(0)
            main_loop = dill.load(string_io)

        finish_after = unpack([
            ext
            for ext in main_loop.extensions if isinstance(ext, FinishAfter)
        ],
                              singleton=True)
        finish_after.add_condition(
            "after_batch",
            predicate=lambda log: log.status.iterations_done == 27)
        main_loop.run()
        assert main_loop.log.status.iterations_done == 27
        assert main_loop.log.status.epochs_done == 2
        for i in range(27):
            assert main_loop.log[i].batch == {"data": i % 10}
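MockAlgorithm is not defined in this snippet; it is a helper from the library's test suite. A minimal sketch of such a stub, assuming blocks' TrainingAlgorithm interface (initialize/process_batch), might look like this:

from blocks.algorithms import TrainingAlgorithm


class MockAlgorithm(TrainingAlgorithm):
    """A do-nothing algorithm that merely remembers the last batch it saw."""
    def initialize(self):
        pass

    def process_batch(self, batch):
        self.batch = batch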
Example #5
def get_data_stream(iterable):
    dataset = ContainerDataset({'numbers': iterable})
    data_stream = DataStreamMapping(dataset.get_default_stream(),
                                    _data_sqrt,
                                    add_sources=('roots', ))
    data_stream = DataStreamMapping(data_stream, _array_tuple)
    return BatchDataStream(data_stream, ConstantScheme(20))
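The helpers _data_sqrt and _array_tuple are not shown above. Judging from the inline lambdas in the equivalent Example #9 below, they are presumably simple per-example transforms along these lines (names and bodies here are assumptions):

import math

import numpy

# floatX is assumed to be in scope, as in the snippets (theano's default float dtype).


def _data_sqrt(data):
    # Take the square root of the single 'numbers' value in each example.
    return (math.sqrt(data[0]),)


def _array_tuple(data):
    # Cast every source in the example to a floatX ndarray.
    return tuple(numpy.asarray(d, dtype=floatX) for d in data)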
Example #6
def test_shared_variable_modifier_two_params():
    weights = numpy.array([-1, 1], dtype=floatX)
    features = [numpy.array(f, dtype=floatX) for f in [[1, 2], [3, 4], [5, 6]]]
    targets = [(weights * f).sum() for f in features]
    n_batches = 3
    dataset = ContainerDataset(dict(features=features, targets=targets))

    x = tensor.vector('features')
    y = tensor.scalar('targets')
    W = shared_floatx([0, 0], name='W')
    cost = ((x * W).sum() - y)**2
    cost.name = 'cost'

    step_rule = Scale(0.001)
    sgd = GradientDescent(cost=cost, params=[W], step_rule=step_rule)
    modifier = SharedVariableModifier(
        step_rule.learning_rate, lambda _, val: numpy.cast[floatX](val * 0.2))
    main_loop = MainLoop(model=None,
                         data_stream=dataset.get_default_stream(),
                         algorithm=sgd,
                         extensions=[FinishAfter(after_n_epochs=1), modifier])

    main_loop.run()

    new_value = step_rule.learning_rate.get_value()
    assert_allclose(new_value, 0.001 * 0.2**n_batches, atol=1e-5)
Example #7
def test_cache():
    dataset = ContainerDataset(range(100))
    stream = DataStream(dataset)
    batched_stream = BatchDataStream(stream, ConstantScheme(11))
    cached_stream = CachedDataStream(batched_stream, ConstantScheme(7))
    epoch = cached_stream.get_epoch_iterator()

    # Make sure that the cache is filled as expected
    for (features,), cache_size in zip(epoch, [4, 8, 1, 5, 9, 2,
                                               6, 10, 3, 7, 0, 4]):
        assert len(cached_stream.cache[0]) == cache_size

    # Make sure that the epoch finishes correctly
    for (features,) in cached_stream.get_epoch_iterator():
        pass
    assert len(features) == 100 % 7
    assert not cached_stream.cache[0]

    # Ensure that the epoch transition is correct
    cached_stream = CachedDataStream(batched_stream,
                                     ConstantScheme(7, times=3))
    for _, epoch in zip(range(2), cached_stream.iterate_epochs()):
        cache_sizes = [4, 8, 1]
        for i, (features,) in enumerate(epoch):
            assert len(cached_stream.cache[0]) == cache_sizes[i]
            assert len(features) == 7
            assert numpy.all(range(100)[i * 7:(i + 1) * 7] == features)
        assert i == 2
Example #8
    def do_test(with_serialization):
        data_stream = ContainerDataset(range(10)).get_default_stream()
        main_loop = MainLoop(MockAlgorithm(),
                             data_stream,
                             extensions=[
                                 WriteBatchExtension(),
                                 FinishAfter(after_n_batches=14)
                             ])
        main_loop.run()
        assert main_loop.log.status.iterations_done == 14

        if with_serialization:
            main_loop = cPickle.loads(cPickle.dumps(main_loop))

        finish_after = unpack([
            ext
            for ext in main_loop.extensions if isinstance(ext, FinishAfter)
        ],
                              singleton=True)
        finish_after.add_condition(
            "after_batch",
            predicate=lambda log: log.status.iterations_done == 27)
        main_loop.run()
        assert main_loop.log.status.iterations_done == 27
        assert main_loop.log.status.epochs_done == 2
        for i in range(27):
            assert main_loop.log[i + 1].batch == {"data": i % 10}
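WriteBatchExtension is another test helper that is not shown. A plausible sketch, assuming blocks' TrainingExtension callbacks and the attribute-style log rows used in Example #10 (the real helper may differ):

from blocks.extensions import TrainingExtension


class WriteBatchExtension(TrainingExtension):
    """Records the batch that was just processed in the current log row."""
    def after_batch(self, batch):
        self.main_loop.log.current_row.batch = batch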
Example #9
def get_data_stream(iterable):
    dataset = ContainerDataset({'numbers': iterable})
    data_stream = DataStreamMapping(dataset.get_default_stream(),
                                    lambda data: (math.sqrt(data[0]), ),
                                    add_sources=('roots', ))
    data_stream = DataStreamMapping(
        data_stream, lambda data: tuple(
            (numpy.asarray(d, dtype=floatX) for d in data)))
    return BatchDataStream(data_stream, ConstantScheme(20))
Example #10
def test_training_data_monitoring():
    weights = numpy.array([-1, 1], dtype=floatX)
    features = [numpy.array(f, dtype=floatX) for f in [[1, 2], [3, 4], [5, 6]]]
    targets = [(weights * f).sum() for f in features]
    n_batches = 3
    dataset = ContainerDataset(dict(features=features, targets=targets))

    x = tensor.vector('features')
    y = tensor.scalar('targets')
    W = shared_floatx([0, 0], name='W')
    V = shared_floatx(7, name='V')
    W_sum = named_copy(W.sum(), 'W_sum')
    cost = ((x * W).sum() - y)**2
    cost.name = 'cost'

    class TrueCostExtension(TrainingExtension):
        def before_batch(self, data):
            self.main_loop.log.current_row.true_cost = ((
                (W.get_value() * data["features"]).sum() - data["targets"])**2)

    main_loop = MainLoop(model=None,
                         data_stream=dataset.get_default_stream(),
                         algorithm=GradientDescent(cost=cost,
                                                   params=[W],
                                                   step_rule=Scale(0.001)),
                         extensions=[
                             FinishAfter(after_n_epochs=1),
                             TrainingDataMonitoring([W_sum, cost, V],
                                                    prefix="train1",
                                                    after_every_batch=True),
                             TrainingDataMonitoring(
                                 [aggregation.mean(W_sum), cost],
                                 prefix="train2",
                                 after_every_epoch=True),
                             TrueCostExtension()
                         ])

    main_loop.run()

    # Check monitoring of a shared variable
    assert_allclose(main_loop.log.current_row.train1_V, 7.0)

    for i in range(n_batches):
        # The ground truth is written to the log before the batch is
        # processed, whereas the extension writes after the batch is
        # processed. This is why the iteration numbers differ here.
        assert_allclose(main_loop.log[i].true_cost,
                        main_loop.log[i + 1].train1_cost)
    assert_allclose(
        main_loop.log[n_batches].train2_cost,
        sum([main_loop.log[i].true_cost
             for i in range(n_batches)]) / n_batches)
    assert_allclose(
        main_loop.log[n_batches].train2_W_sum,
        sum([main_loop.log[i].train1_W_sum
             for i in range(1, n_batches + 1)]) / n_batches)
Example #11
def test_batch_data_stream():
    stream = ContainerDataset([1, 2, 3, 4, 5]).get_default_stream()
    batches = list(BatchDataStream(stream, ConstantScheme(2))
                   .get_epoch_iterator())
    expected = [(numpy.array([1, 2]),),
                (numpy.array([3, 4]),),
                (numpy.array([5]),)]
    assert len(batches) == len(expected)
    for b, e in zip(batches, expected):
        assert (b[0] == e[0]).all()

    # Check the `strict` flag
    def try_strict():
        list(BatchDataStream(stream, ConstantScheme(2), strict=True)
             .get_epoch_iterator())
    assert_raises(ValueError, try_strict)
    stream2 = ContainerDataset([1, 2, 3, 4, 5, 6]).get_default_stream()
    assert len(list(BatchDataStream(stream2, ConstantScheme(2), strict=True)
                    .get_epoch_iterator())) == 3
Example #12
def test_data_stream_mapping():
    data = [1, 2, 3]
    data_doubled = [2, 4, 6]
    stream = ContainerDataset(data).get_default_stream()
    wrapper1 = DataStreamMapping(
        stream, lambda d: (2 * d[0],))
    assert list(wrapper1.get_epoch_iterator()) == list(zip(data_doubled))
    wrapper2 = DataStreamMapping(
        stream, lambda d: (2 * d[0],), add_sources=("doubled",))
    assert wrapper2.sources == ("data", "doubled")
    assert list(wrapper2.get_epoch_iterator()) == list(zip(data, data_doubled))
Example #13
def test_data_stream_mapping_sort_multisource():
    data = OrderedDict()
    data['x'] = [[1, 2, 3], [2, 3, 1], [3, 2, 1]]
    data['y'] = [[6, 5, 4], [6, 5, 4], [6, 5, 4]]
    data_sorted = [([1, 2, 3], [6, 5, 4]),
                   ([1, 2, 3], [4, 6, 5]),
                   ([1, 2, 3], [4, 5, 6])]
    stream = ContainerDataset(data).get_default_stream()
    wrapper = DataStreamMapping(stream,
                                mapping=SortMapping(operator.itemgetter(0)))
    assert list(wrapper.get_epoch_iterator()) == data_sorted
Example #14
def test_dataset():
    data = [1, 2, 3]
    # The default stream requests one example at a time
    stream = ContainerDataset(data).get_default_stream()
    epoch = stream.get_epoch_iterator()
    assert list(epoch) == list(zip(data))

    # Check if iterating over multiple epochs works
    for i, epoch in zip(range(2), stream.iterate_epochs()):
        assert list(epoch) == list(zip(data))

    # Check whether returning the data as a dictionary of sources works
    assert next(stream.get_epoch_iterator(as_dict=True)) == {"data": 1}
Example #15
def test_padding_data_stream():
    # 1-D sequences
    stream = BatchDataStream(
        ContainerDataset([[1], [2, 3], [], [4, 5, 6], [7]])
        .get_default_stream(),
        ConstantScheme(2))
    mask_stream = PaddingDataStream(stream)
    assert mask_stream.sources == ("data", "data_mask")
    it = mask_stream.get_epoch_iterator()
    data, mask = next(it)
    assert (data == numpy.array([[1, 0], [2, 3]])).all()
    assert (mask == numpy.array([[1, 0], [1, 1]])).all()
    data, mask = next(it)
    assert (data == numpy.array([[0, 0, 0], [4, 5, 6]])).all()
    assert (mask == numpy.array([[0, 0, 0], [1, 1, 1]])).all()
    data, mask = next(it)
    assert (data == numpy.array([[7]])).all()
    assert (mask == numpy.array([[1]])).all()

    # 2-D sequences
    stream2 = BatchDataStream(
        ContainerDataset([numpy.ones((3, 4)), 2 * numpy.ones((2, 4))])
        .get_default_stream(),
        ConstantScheme(2))
    it = PaddingDataStream(stream2).get_epoch_iterator()
    data, mask = next(it)
    assert data.shape == (2, 3, 4)
    assert (data[0, :, :] == 1).all()
    assert (data[1, :2, :] == 2).all()
    assert (mask == numpy.array([[1, 1, 1], [1, 1, 0]])).all()

    # 2 sources
    stream3 = PaddingDataStream(BatchDataStream(
        ContainerDataset(dict(features=[[1], [2, 3], []],
                              targets=[[4, 5, 6], [7]]))
        .get_default_stream(),
        ConstantScheme(2)))
    assert len(next(stream3.get_epoch_iterator())) == 4
Example #16
def test_data_stream_mapping_sort():
    data = [[1, 2, 3],
            [2, 3, 1],
            [3, 2, 1]]
    data_sorted = [[1, 2, 3]] * 3
    data_sorted_rev = [[3, 2, 1]] * 3
    stream = ContainerDataset(data).get_default_stream()
    wrapper1 = DataStreamMapping(stream,
                                 mapping=SortMapping(operator.itemgetter(0)))
    assert list(wrapper1.get_epoch_iterator()) == list(zip(data_sorted))
    wrapper2 = DataStreamMapping(stream, SortMapping(lambda x: -x[0]))
    assert list(wrapper2.get_epoch_iterator()) == list(zip(data_sorted_rev))
    wrapper3 = DataStreamMapping(stream, SortMapping(operator.itemgetter(0),
                                                     reverse=True))
    assert list(wrapper3.get_epoch_iterator()) == list(zip(data_sorted_rev))
Example #17
def test_data_stream_mapping_sort_multisource_ndarrays():
    data = OrderedDict()
    data['x'] = [numpy.array([1, 2, 3]),
                 numpy.array([2, 3, 1]),
                 numpy.array([3, 2, 1])]
    data['y'] = [numpy.array([6, 5, 4]),
                 numpy.array([6, 5, 4]),
                 numpy.array([6, 5, 4])]
    data_sorted = [(numpy.array([1, 2, 3]), numpy.array([6, 5, 4])),
                   (numpy.array([1, 2, 3]), numpy.array([4, 6, 5])),
                   (numpy.array([1, 2, 3]), numpy.array([4, 5, 6]))]
    stream = ContainerDataset(data).get_default_stream()
    wrapper = DataStreamMapping(stream,
                                mapping=SortMapping(operator.itemgetter(0)))
    for output, ground_truth in zip(wrapper.get_epoch_iterator(), data_sorted):
        assert len(output) == len(ground_truth)
        assert (output[0] == ground_truth[0]).all()
        assert (output[1] == ground_truth[1]).all()
Example #18
def test_dataset():
    data = [1, 2, 3]
    data_doubled = [2, 4, 6]

    # The default stream requests one example at a time
    stream = ContainerDataset([1, 2, 3]).get_default_stream()
    epoch = stream.get_epoch_iterator()
    assert list(epoch) == list(zip(data))

    # Check if iterating over multiple epochs works
    for i, epoch in zip(range(2), stream.iterate_epochs()):
        assert list(epoch) == list(zip(data))
    for i, epoch in enumerate(stream.iterate_epochs()):
        assert list(epoch) == list(zip(data))
        if i == 1:
            break

    # Check whether returning the data as a dictionary of sources works
    assert next(stream.get_epoch_iterator(as_dict=True)) == {"data": 1}

    # Check whether basic stream wrappers work
    wrapper = DataStreamMapping(stream, lambda d: (2 * d[0], ))
    assert list(wrapper.get_epoch_iterator()) == list(zip(data_doubled))
Example #19
def setup_mainloop(extension):
    """Create a MainLoop, register the given extension, supply it with a
        DataStream and a minimal model/cost to optimize.
    """
    features = [numpy.array(f, dtype=floatX)
                for f in [[1, 2], [3, 4], [5, 6]]]
    dataset = ContainerDataset(dict(features=features))

    W = shared_floatx([0, 0], name='W')
    x = tensor.vector('features')
    cost = tensor.sum((x-W)**2)
    cost.name = "cost"

    algorithm = GradientDescent(cost=cost, params=[W],
                                step_rule=Scale(1e-3))

    main_loop = MainLoop(
        model=None, data_stream=dataset.get_default_stream(),
        algorithm=algorithm,
        extensions=[
            FinishAfter(after_n_epochs=1),
            extension])

    return main_loop
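A hedged usage sketch: the helper above would typically be exercised by passing in the extension under test and running the loop for its single epoch, for instance with blocks' built-in Printing extension:

from blocks.extensions import Printing

main_loop = setup_mainloop(Printing())
main_loop.run()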
Example #20
def test_data_stream_filter():
    data = [1, 2, 3]
    data_filtered = [1, 3]
    stream = ContainerDataset(data).get_default_stream()
    wrapper = DataStreamFilter(stream, lambda d: d[0] % 2 == 1)
    assert list(wrapper.get_epoch_iterator()) == list(zip(data_filtered))