import os

import numpy as np

# Assumed module-level setup (not shown in the original snippet): the
# tf_data.txt file referenced by the test_text_format variants below is
# expected to live next to this test module.
abs_path = os.path.dirname(os.path.abspath(__file__))


def test_minibatch(tmpdir):
    mbdata = r'''0 |S0 0 |S1 0
0 |S0 1 |S1 1
0 |S0 2
0 |S0 3 |S1 3
1 |S0 4
1 |S0 5 |S1 1
1 |S0 6 |S1 2
'''
    tmpfile = str(tmpdir / 'mbtest.txt')
    with open(tmpfile, 'w') as f:
        f.write(mbdata)

    feature_stream_name = 'features'
    labels_stream_name = 'labels'

    from cntk.io import text_format_minibatch_source, StreamConfiguration
    mb_source = text_format_minibatch_source(tmpfile, [
        StreamConfiguration(feature_stream_name, 1, False, 'S0'),
        StreamConfiguration(labels_stream_name, 1, False, 'S1')],
        randomize=False)

    features_si = mb_source.stream_info(feature_stream_name)
    labels_si = mb_source.stream_info(labels_stream_name)

    mb = mb_source.next_minibatch(1000)

    assert mb[features_si].num_sequences == 2
    assert mb[labels_si].num_sequences == 2

    features = mb[features_si]
    assert len(features.value) == 2
    expected_features = \
        [
            [[0], [1], [2], [3]],
            [[4], [5], [6]]
        ]

    for res, exp in zip(features.value, expected_features):
        assert np.allclose(res, exp)

    # Mask values: 2 = sequence begin, 1 = valid sample, 0 = padding.
    assert np.allclose(features.mask,
                       [[2, 1, 1, 1],
                        [2, 1, 1, 0]])

    labels = mb[labels_si]
    assert len(labels.value) == 2
    expected_labels = \
        [
            [[0], [1], [3]],
            [[1], [2]]
        ]
    for res, exp in zip(labels.value, expected_labels):
        assert np.allclose(res, exp)

    assert np.allclose(labels.mask,
                       [[2, 1, 1],
                        [2, 1, 0]])

def test_text_format(tmpdir):
    from cntk.io import text_format_minibatch_source, StreamConfiguration, MinibatchSource

    mbdata = r'''0 |x 560:1 |y 1 0 0 0 0
0 |x 0:1
0 |x 0:1
1 |x 560:1 |y 0 1 0 0 0
1 |x 0:1
1 |x 0:1
1 |x 424:1
'''
    tmpfile = str(tmpdir / 'mbdata.txt')
    with open(tmpfile, 'w') as f:
        f.write(mbdata)

    input_dim = 1000
    num_output_classes = 5

    mb_source = text_format_minibatch_source(tmpfile, [
        StreamConfiguration('features', input_dim, True, 'x'),
        StreamConfiguration('labels', num_output_classes, False, 'y')])
    assert isinstance(mb_source, MinibatchSource)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(7)

    features = mb[features_si]
    # 2 samples, max seq len 4, 1000 dim
    assert features.shape == (2, 4, input_dim)
    assert features.is_sparse
    # TODO features is sparse and cannot be accessed right now:
    # *** RuntimeError: DataBuffer/WritableDataBuffer methods can only be called for NDArrayiew objects with dense storage format
    # 2 samples, max seq len 4, 1000 dim
    #assert features.data().shape().dimensions() == (2, 4, input_dim)
    #assert features.data().is_sparse()

    labels = mb[labels_si]
    # 2 samples, max seq len 1, 5 dim
    assert labels.shape == (2, 1, num_output_classes)
    assert not labels.is_sparse

    label_data = np.asarray(labels)
    assert np.allclose(label_data,
                       np.asarray([
                           [[1., 0., 0., 0., 0.]],
                           [[0., 1., 0., 0., 0.]]
                       ]))

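
# --- Hedged sketch, not part of the original tests -------------------------
# The same CNTK text-format file can also be read through the newer
# CTFDeserializer / StreamDef API. This is only an illustrative variant of
# test_text_format above; it assumes a CNTK 2.x build that ships
# cntk.io.CTFDeserializer, StreamDef and StreamDefs, and the function name
# and file name below are made up for the example.
def example_text_format_ctf_deserializer(tmpdir):
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs

    mbdata = r'''0 |x 560:1 |y 1 0 0 0 0
0 |x 0:1
1 |x 560:1 |y 0 1 0 0 0
1 |x 424:1
'''
    tmpfile = str(tmpdir / 'mbdata_ctf.txt')
    with open(tmpfile, 'w') as f:
        f.write(mbdata)

    input_dim = 1000
    num_output_classes = 5

    # StreamDef(field=...) plays the role of StreamConfiguration's alias
    # argument; shape and is_sparse mirror the configuration used above.
    mb_source = MinibatchSource(
        CTFDeserializer(tmpfile, StreamDefs(
            features=StreamDef(field='x', shape=input_dim, is_sparse=True),
            labels=StreamDef(field='y', shape=num_output_classes, is_sparse=False))),
        randomize=False)

    mb = mb_source.next_minibatch(4)
    features = mb[mb_source.streams.features]
    labels = mb[mb_source.streams.labels]

    # Two sequences of two samples each in the features stream; the labels
    # stream carries one sample per sequence.
    assert features.num_sequences == 2
    assert labels.num_sequences == 2
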
def test_text_format():
    from cntk.io import text_format_minibatch_source, StreamConfiguration, MinibatchSource

    # 0 |x 560 |y 1 0 0 0 0
    # 0 |x 0
    # 0 |x 0
    # 1 |x 560 |y 0 1 0 0 0
    # 1 |x 0
    # 1 |x 0
    # 1 |x 424
    path = os.path.join(abs_path, "tf_data.txt")

    input_dim = 1000
    num_output_classes = 5

    mb_source = text_format_minibatch_source(
        path,
        [
            StreamConfiguration("features", input_dim, True, "x"),
            StreamConfiguration("labels", num_output_classes, False, "y"),
        ],
        0,
    )
    assert isinstance(mb_source, MinibatchSource)

    features_si = mb_source.stream_info("features")
    labels_si = mb_source.stream_info("labels")

    mb = mb_source.get_next_minibatch(7)

    features = mb[features_si].m_data
    # 2 samples, max seq len 4, 1000 dim
    assert features.data().shape().dimensions() == (2, 4, input_dim)
    assert features.data().is_sparse()
    # TODO features is sparse and cannot be accessed right now:
    # *** RuntimeError: DataBuffer/WritableDataBuffer methods can only be called for NDArrayiew objects with dense storage format

    labels = mb[labels_si].m_data
    # 2 samples, max seq len 1, 5 dim
    assert labels.data().shape().dimensions() == (2, 1, num_output_classes)
    assert not labels.data().is_sparse()

    assert np.allclose(
        np.asarray(labels),
        np.asarray([[[1.0, 0.0, 0.0, 0.0, 0.0]], [[0.0, 1.0, 0.0, 0.0, 0.0]]]),
    )

def test_text_format():
    from cntk.io import text_format_minibatch_source, StreamConfiguration, MinibatchSource

    # 0 |x 560 |y 1 0 0 0 0
    # 0 |x 0
    # 0 |x 0
    # 1 |x 560 |y 0 1 0 0 0
    # 1 |x 0
    # 1 |x 0
    # 1 |x 424
    path = os.path.join(abs_path, 'tf_data.txt')

    input_dim = 1000
    num_output_classes = 5

    mb_source = text_format_minibatch_source(path, [
        StreamConfiguration('features', input_dim, True, 'x'),
        StreamConfiguration('labels', num_output_classes, False, 'y')])
    assert isinstance(mb_source, MinibatchSource)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.get_next_minibatch(7)

    features = mb[features_si].m_data
    # TODO features is sparse and cannot be accessed right now:
    # *** RuntimeError: DataBuffer/WritableDataBuffer methods can only be called for NDArrayiew objects with dense storage format
    # 2 samples, max seq len 4, 1000 dim
    #assert features.data().shape().dimensions() == (2, 4, input_dim)
    #assert features.data().is_sparse()

    labels = mb[labels_si].m_data
    # 2 samples, max seq len 1, 5 dim
    assert labels.data().shape().dimensions() == (2, 1, num_output_classes)
    assert not labels.data().is_sparse()

    assert np.allclose(np.asarray(labels),
                       np.asarray([
                           [[1., 0., 0., 0., 0.]],
                           [[0., 1., 0., 0., 0.]]
                       ]))