def test_large_minibatch(tmpdir):
    """Ask for a minibatch much larger than the MBDATA_DENSE_2 fixture.

    A single ``next_minibatch(1000)`` call is made on a source built with
    ``randomization_window_in_chunks=0``; the 'features' and 'labels'
    streams it returns are checked for their end-of-sweep flag, sample
    count and sequence count.
    """
    requested = 1000

    data_path = _write_data(tmpdir, MBDATA_DENSE_2)
    stream_defs = StreamDefs(
        features=StreamDef(field='S0', shape=1),
        labels=StreamDef(field='S1', shape=1))
    source = MinibatchSource(
        CTFDeserializer(data_path, stream_defs),
        randomization_window_in_chunks=0)

    feature_info = source.stream_info('features')
    label_info = source.stream_info('labels')

    batch = source.next_minibatch(requested)
    feature_batch = batch[feature_info]
    label_batch = batch[label_info]

    # Actually, the minibatch spans over multiple sweeps,
    # not sure if this is an artificial situation, but
    # maybe instead of a boolean flag we should indicate
    # the largest sweep index the data was taken from.
    assert feature_batch.end_of_sweep
    assert label_batch.end_of_sweep

    # NOTE(review): the constants 7 and 5 presumably mirror the number of
    # feature / label samples per sequence in MBDATA_DENSE_2, which is
    # defined outside this block -- confirm against the fixture.
    assert feature_batch.num_samples == requested - requested % 7
    assert label_batch.num_samples == 5 * (requested // 7)

    assert batch[feature_info].num_sequences == (requested // 7)
    assert batch[label_info].num_sequences == (requested // 7)
def test_large_minibatch(tmpdir):
    """Check counts when the requested minibatch exceeds the fixture size.

    Builds a MinibatchSource over MBDATA_DENSE_2 (streams 'S0' and 'S1',
    both of shape 1, ``randomization_window_in_chunks=0``), requests 1000
    samples and verifies the end-of-sweep flags plus the number of samples
    and sequences reported for each stream.
    """
    path = _write_data(tmpdir, MBDATA_DENSE_2)

    deserializer = CTFDeserializer(path, StreamDefs(
        features=StreamDef(field='S0', shape=1),
        labels=StreamDef(field='S1', shape=1)))
    mb_source = MinibatchSource(deserializer,
                                randomization_window_in_chunks=0)

    f_info = mb_source.stream_info('features')
    l_info = mb_source.stream_info('labels')

    minibatch = mb_source.next_minibatch(1000)
    f_data = minibatch[f_info]
    l_data = minibatch[l_info]

    # Actually, the minibatch spans over multiple sweeps,
    # not sure if this is an artificial situation, but
    # maybe instead of a boolean flag we should indicate
    # the largest sweep index the data was taken from.
    assert f_data.end_of_sweep
    assert l_data.end_of_sweep

    # Expected values are spelled out exactly as in the assertions they feed.
    expected_feature_samples = 1000 - 1000 % 7
    expected_label_samples = 5 * (1000 // 7)
    expected_sequences = (1000 // 7)

    assert f_data.num_samples == expected_feature_samples
    assert l_data.num_samples == expected_label_samples

    assert minibatch[f_info].num_sequences == expected_sequences
    assert minibatch[l_info].num_sequences == expected_sequences
def test_minibatch(tmpdir):
    """Read a small dense CTF file and check sequence values and masks.

    The fixture holds two sequences (ids 0 and 1). Stream 'S0' is exposed
    as 'features' and 'S1' as 'labels', both with shape 1. One
    ``next_minibatch(1000)`` call is expected to return both sequences.
    """
    from cntk.io import CTFDeserializer, MinibatchSource, StreamDef, StreamDefs

    # CTF text is line-oriented: every sample sits on its own line, prefixed
    # with its sequence id. The literal must therefore keep its line breaks,
    # otherwise it cannot describe the two sequences asserted below.
    mbdata = r'''0 |S0 0 |S1 0
0 |S0 1 |S1 1
0 |S0 2
0 |S0 3 |S1 3
1 |S0 4
1 |S0 5 |S1 1
1 |S0 6 |S1 2
'''

    tmpfile = str(tmpdir/'mbtest.txt')
    with open(tmpfile, 'w') as f:
        f.write(mbdata)

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features=StreamDef(field='S0', shape=1),
        labels=StreamDef(field='S1', shape=1))))

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(1000)
    assert mb[features_si].num_sequences == 2
    assert mb[labels_si].num_sequences == 2

    # Features: sequence 0 has four samples, sequence 1 has three.
    features = mb[features_si]
    assert len(features.value) == 2
    expected_features = [
        [[0], [1], [2], [3]],
        [[4], [5], [6]],
    ]
    for res, exp in zip(features.value, expected_features):
        assert np.allclose(res, exp)

    # In the expected mask the first step of each sequence is 2, later
    # steps are 1 and the padded slot of the shorter sequence is 0.
    assert np.allclose(features.mask,
                       [[2, 1, 1, 1],
                        [2, 1, 1, 0]])

    # Labels: only the samples that carry an |S1 field contribute.
    labels = mb[labels_si]
    assert len(labels.value) == 2
    expected_labels = [
        [[0], [1], [3]],
        [[1], [2]],
    ]
    for res, exp in zip(labels.value, expected_labels):
        assert np.allclose(res, exp)

    assert np.allclose(labels.mask,
                       [[2, 1, 1],
                        [2, 1, 0]])
def test_text_format(tmpdir):
    """Read the MBDATA_SPARSE fixture with one sparse and one dense stream.

    'features' comes from field 'x' (sparse, dimension 1000) and 'labels'
    from field 'y' (dense, dimension 5). The source is created with
    ``randomize=False``. A 7-sample minibatch is checked first, then a
    1-sample minibatch.
    """
    input_dim = 1000
    num_output_classes = 5

    data_path = _write_data(tmpdir, MBDATA_SPARSE)
    stream_defs = StreamDefs(
        features=StreamDef(field='x', shape=input_dim, is_sparse=True),
        labels=StreamDef(field='y', shape=num_output_classes,
                         is_sparse=False))
    mb_source = MinibatchSource(CTFDeserializer(data_path, stream_defs),
                                randomize=False)
    assert isinstance(mb_source, MinibatchSource)

    feature_info = mb_source.stream_info('features')
    label_info = mb_source.stream_info('labels')

    # --- first minibatch: 7 samples requested ---
    batch = mb_source.next_minibatch(7)

    feature_batch = batch[feature_info]
    # 2 sequences, max seq len 4, 1000 dim
    assert feature_batch.shape == (2, 4, input_dim)
    assert feature_batch.end_of_sweep
    assert feature_batch.num_sequences == 2
    assert feature_batch.num_samples == 7
    assert feature_batch.is_sparse

    label_batch = batch[label_info]
    # 2 sequences, max seq len 1, 5 dim
    assert label_batch.shape == (2, 1, num_output_classes)
    assert label_batch.end_of_sweep
    assert label_batch.num_sequences == 2
    assert label_batch.num_samples == 2
    assert not label_batch.is_sparse

    expected_labels = np.asarray([[[1., 0., 0., 0., 0.]],
                                  [[0., 1., 0., 0., 0.]]])
    label_data = label_batch.asarray()
    assert np.allclose(label_data, expected_labels)

    # --- second minibatch: a single sample requested ---
    batch = mb_source.next_minibatch(1)
    feature_batch = batch[feature_info]
    label_batch = batch[label_info]

    assert not feature_batch.end_of_sweep
    assert not label_batch.end_of_sweep
    assert feature_batch.num_samples < 7
    assert label_batch.num_samples == 1
def test_text_format(tmpdir):
    """Read an inline CTF fixture with a sparse and a dense stream.

    Field 'x' is read as the sparse 'features' stream (dimension 1000) and
    field 'y' as the dense 'labels' stream (dimension 5). A 7-sample
    minibatch is requested and the shape and sparsity of both streams,
    plus the dense label values, are checked.
    """
    from cntk.io import CTFDeserializer, MinibatchSource, StreamDef, StreamDefs

    # CTF text is line-oriented: one sample per line, prefixed with its
    # sequence id. The line breaks are required for the file to describe
    # the two sequences (3 and 4 samples) that the shape checks expect.
    mbdata = r'''0 |x 560:1 |y 1 0 0 0 0
0 |x 0:1
0 |x 0:1
1 |x 560:1 |y 0 1 0 0 0
1 |x 0:1
1 |x 0:1
1 |x 424:1
'''
    tmpfile = str(tmpdir/'mbdata.txt')
    with open(tmpfile, 'w') as f:
        f.write(mbdata)

    input_dim = 1000
    num_output_classes = 5

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features=StreamDef(field='x', shape=input_dim, is_sparse=True),
        labels=StreamDef(field='y', shape=num_output_classes,
                         is_sparse=False)
    )))

    assert isinstance(mb_source, MinibatchSource)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(7)

    features = mb[features_si]
    # 2 sequences, max seq len 4, 1000 dim
    assert features.shape == (2, 4, input_dim)
    assert features.is_sparse
    # TODO features is sparse and cannot be accessed right now:
    # *** RuntimeError: DataBuffer/WritableDataBuffer methods can only be called for NDArrayiew objects with dense storage format
    # 2 sequences, max seq len 4, 1000 dim
    #assert features.data().shape().dimensions() == (2, 4, input_dim)
    #assert features.data().is_sparse()

    labels = mb[labels_si]
    # 2 sequences, max seq len 1, 5 dim
    assert labels.shape == (2, 1, num_output_classes)
    assert not labels.is_sparse

    label_data = np.asarray(labels)
    assert np.allclose(label_data,
                       np.asarray([
                           [[1., 0., 0., 0., 0.]],
                           [[0., 1., 0., 0., 0.]]
                       ]))
def test_text_format(tmpdir):
    """Exercise sparse/dense stream metadata across two minibatches.

    The MBDATA_SPARSE fixture is read without randomization. The first
    minibatch (7 samples requested) is expected to reach the end of the
    sweep; the second (1 sample requested) is expected not to.
    """
    tmpfile = _write_data(tmpdir, MBDATA_SPARSE)

    input_dim = 1000
    num_output_classes = 5

    features_def = StreamDef(field='x', shape=input_dim, is_sparse=True)
    labels_def = StreamDef(field='y', shape=num_output_classes,
                           is_sparse=False)
    deserializer = CTFDeserializer(
        tmpfile, StreamDefs(features=features_def, labels=labels_def))
    mb_source = MinibatchSource(deserializer, randomize=False)

    assert isinstance(mb_source, MinibatchSource)

    f_info = mb_source.stream_info('features')
    l_info = mb_source.stream_info('labels')

    first = mb_source.next_minibatch(7)

    sparse_part = first[f_info]
    # 2 sequences, max seq len 4, 1000 dim
    assert sparse_part.shape == (2, 4, input_dim)
    assert sparse_part.end_of_sweep
    assert sparse_part.num_sequences == 2
    assert sparse_part.num_samples == 7
    assert sparse_part.is_sparse

    dense_part = first[l_info]
    # 2 sequences, max seq len 1, 5 dim
    assert dense_part.shape == (2, 1, num_output_classes)
    assert dense_part.end_of_sweep
    assert dense_part.num_sequences == 2
    assert dense_part.num_samples == 2
    assert not dense_part.is_sparse

    label_data = dense_part.asarray()
    reference = np.asarray([
        [[1., 0., 0., 0., 0.]],
        [[0., 1., 0., 0., 0.]],
    ])
    assert np.allclose(label_data, reference)

    second = mb_source.next_minibatch(1)
    sparse_part = second[f_info]
    dense_part = second[l_info]

    assert not sparse_part.end_of_sweep
    assert not dense_part.end_of_sweep
    assert sparse_part.num_samples < 7
    assert dense_part.num_samples == 1
def test_MinibatchData_and_Value_as_input(tmpdir):
    """Feed one minibatch entry to ``eval`` in several argument forms.

    A one-sample CTF file (value 100 in field 'S0') is read into stream
    'f1'. The function ``f1 * 2`` is then evaluated with that entry passed
    as a dict value, directly, via ``.data`` and via ``.asarray()``; every
    call is expected to compare equal to ``[[200]]``.
    """
    data_path = str(tmpdir / 'mbtest.txt')
    with open(data_path, 'w') as handle:
        handle.write(r'''0 |S0 100''')

    stream_defs = StreamDefs(f1=StreamDef(field='S0', shape=1))
    source = MinibatchSource(CTFDeserializer(data_path, stream_defs),
                             randomize=False)
    f1_info = source.stream_info('f1')

    batch = source.next_minibatch(1)

    f1 = input(shape=(1, ), needs_gradient=True, name='f')
    doubled = f1 * 2

    expected = [[200]]

    # Explicit variable -> data mapping
    assert doubled.eval({f1: batch[f1_info]}) == expected
    # Test MinibatchData
    assert doubled.eval(batch[f1_info]) == expected
    # Test Value
    assert doubled.eval(batch[f1_info].data) == expected
    # Test NumPy (converted back from MinibatchData)
    assert doubled.eval(batch[f1_info].asarray()) == expected
    # Test Value
    # NOTE(review): repeated after the NumPy conversion, presumably to show
    # the Value is still usable afterwards -- confirm the intent.
    assert doubled.eval(batch[f1_info].data) == expected
def test_MinibatchData_and_Value_as_input(tmpdir):
    """Evaluate ``f1 * 2`` with one minibatch entry passed in several forms.

    The fixture is a single CTF sample with value 100 in field 'S0', read
    as stream 'f1' without randomization. The entry is handed to ``eval``
    inside a dict, directly, as ``.data`` and as ``.value``; each result is
    compared with ``[[200]]``.
    """
    ctf_text = r'''0 |S0 100'''

    target = str(tmpdir/'mbtest.txt')
    with open(target, 'w') as sink:
        sink.write(ctf_text)

    deserializer = CTFDeserializer(
        target, StreamDefs(f1=StreamDef(field='S0', shape=1)))
    mb_source = MinibatchSource(deserializer, randomize=False)
    stream = mb_source.stream_info('f1')

    minibatch = mb_source.next_minibatch(1)

    f1 = input_variable(shape=(1,), needs_gradient=True, name='f')
    times_two = f1 * 2

    # Mapping from the input variable to the MinibatchData entry
    assert times_two.eval({f1: minibatch[stream]}) == [[200]]

    # Test MinibatchData
    assert times_two.eval(minibatch[stream]) == [[200]]

    # Test Value
    assert times_two.eval(minibatch[stream].data) == [[200]]

    # Test NumPy (converted back from MinibatchData)
    assert times_two.eval(minibatch[stream].value) == [[200]]

    # Test Value (second time, after the NumPy conversion above)
    assert times_two.eval(minibatch[stream].data) == [[200]]
def test_full_sweep_minibatch(tmpdir):
    """Read the whole MBDATA_DENSE_1 fixture in one minibatch.

    The source is limited to a single sweep (``max_sweeps=1``) with
    ``randomization_window_in_chunks=0``. After requesting 1000 samples,
    both streams are expected to hold two sequences, to report
    end-of-sweep, and to match the expected per-sequence values and masks.
    """
    def check_stream(stream, expected_sequences, expected_mask):
        # Per-stream checks, in the order the assertions are meant to run.
        assert stream.end_of_sweep
        assert len(stream.as_sequences()) == 2
        for actual, wanted in zip(stream.as_sequences(),
                                  expected_sequences):
            assert np.allclose(actual, wanted)
        assert np.allclose(stream.data.mask, expected_mask)

    data_path = _write_data(tmpdir, MBDATA_DENSE_1)

    stream_defs = StreamDefs(
        features=StreamDef(field='S0', shape=1),
        labels=StreamDef(field='S1', shape=1))
    source = MinibatchSource(CTFDeserializer(data_path, stream_defs),
                             randomization_window_in_chunks=0,
                             max_sweeps=1)

    feature_info = source.stream_info('features')
    label_info = source.stream_info('labels')

    batch = source.next_minibatch(1000)

    assert batch[feature_info].num_sequences == 2
    assert batch[label_info].num_sequences == 2

    # Features: sequences of length 4 and 3; the shorter one is padded,
    # which shows up as the trailing 0 in its mask row.
    check_stream(
        batch[feature_info],
        [
            [[0], [1], [2], [3]],
            [[4], [5], [6]],
        ],
        [[2, 1, 1, 1],
         [2, 1, 1, 0]])

    # Labels: sequences of length 3 and 2.
    check_stream(
        batch[label_info],
        [
            [[0], [1], [3]],
            [[1], [2]],
        ],
        [[2, 1, 1],
         [2, 1, 0]])