Example #1
def test_minibatch(tmpdir):

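    # CNTK text-format data: two sequences (ids 0 and 1); every sample carries
    # a value for stream S0, while stream S1 is missing for some samples.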
    mbdata = r'''0	|S0 0   |S1 0
0	|S0 1 	|S1 1 
0	|S0 2 	
0	|S0 3 	|S1 3 
1	|S0 4 	
1	|S0 5 	|S1 1
1	|S0 6	|S1 2 
'''

    tmpfile = str(tmpdir/'mbtest.txt')
    with open(tmpfile, 'w') as f:
        f.write(mbdata)

    feature_stream_name = 'features'
    labels_stream_name = 'labels'

    import numpy as np
    from cntk.io import text_format_minibatch_source, StreamConfiguration
    mb_source = text_format_minibatch_source(tmpfile, [
        StreamConfiguration(feature_stream_name, 1, False, 'S0'),
        StreamConfiguration(labels_stream_name, 1, False, 'S1')],
        randomize=False)

    features_si = mb_source.stream_info(feature_stream_name)
    labels_si = mb_source.stream_info(labels_stream_name)
    
    mb = mb_source.next_minibatch(1000)
    assert mb[features_si].num_sequences == 2
    assert mb[labels_si].num_sequences == 2

    features = mb[features_si]
    assert len(features.value) == 2
    expected_features = \
            [
                [[0],[1],[2],[3]],
                [[4],[5],[6]]
            ]

    for res, exp in zip(features.value, expected_features):
        assert np.allclose(res, exp)

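    # In the mask, 2 marks the beginning of a sequence, 1 a valid sample and
    # 0 a padded position (the second sequence is one sample shorter).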
    assert np.allclose(features.mask, 
            [[2, 1, 1, 1],
             [2, 1, 1, 0]])

    labels = mb[labels_si]
    assert len(labels.value) == 2
    expected_labels = \
            [
                [[0],[1],[3]], 
                [[1],[2]]
            ]
    for res, exp in zip(labels.value, expected_labels):
        assert np.allclose(res, exp)

    assert np.allclose(labels.mask, 
            [[2, 1, 1],
             [2, 1, 0]])
Example #2
def test_text_format(tmpdir):
    import numpy as np
    from cntk.io import text_format_minibatch_source, StreamConfiguration, MinibatchSource

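    # Two sequences in CNTK text format: stream 'x' is sparse (index:value
    # pairs), stream 'y' is a dense one-hot label on the first sample only.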
    mbdata = r'''0	|x 560:1	|y 1 0 0 0 0
0	|x 0:1
0	|x 0:1
1	|x 560:1	|y 0 1 0 0 0
1	|x 0:1
1	|x 0:1
1	|x 424:1
'''
    tmpfile = str(tmpdir/'mbdata.txt')
    with open(tmpfile, 'w') as f:
        f.write(mbdata)

    input_dim = 1000
    num_output_classes = 5

    mb_source = text_format_minibatch_source(tmpfile, [
                    StreamConfiguration('features', input_dim, True, 'x'),
                    StreamConfiguration('labels', num_output_classes, False, 'y')])
    assert isinstance(mb_source, MinibatchSource)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(7)

    features = mb[features_si]
    # 2 sequences, max seq len 4, 1000 dim
    assert features.shape == (2, 4, input_dim)
    assert features.is_sparse
    # TODO features is sparse and cannot be accessed right now:
    # *** RuntimeError: DataBuffer/WritableDataBuffer methods can only be called for NDArrayView objects with dense storage format
    # 2 sequences, max seq len 4, 1000 dim
    #assert features.data().shape().dimensions() == (2, 4, input_dim)
    #assert features.data().is_sparse()

    labels = mb[labels_si]
    # 2 sequences, max seq len 1, 5 dim
    assert labels.shape == (2, 1, num_output_classes)
    assert not labels.is_sparse

    label_data = np.asarray(labels)
    assert np.allclose(label_data,
            np.asarray([
                [[ 1.,  0.,  0.,  0.,  0.]],
                [[ 0.,  1.,  0.,  0.,  0.]]
                ]))
Example #3
def test_text_format():
    import os
    import numpy as np
    from cntk.io import text_format_minibatch_source, StreamConfiguration, MinibatchSource

    # 0	|x 560	|y 1 0 0 0 0
    # 0	|x 0
    # 0	|x 0
    # 1	|x 560	|y 0 1 0 0 0
    # 1	|x 0
    # 1	|x 0
    # 1	|x 424
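    # abs_path is assumed to be defined at module level as the directory
    # holding the test data file tf_data.txt.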
    path = os.path.join(abs_path, "tf_data.txt")

    input_dim = 1000
    num_output_classes = 5

    mb_source = text_format_minibatch_source(
        path,
        [
            StreamConfiguration("features", input_dim, True, "x"),
            StreamConfiguration("labels", num_output_classes, False, "y"),
        ],
        0,
    )
    assert isinstance(mb_source, MinibatchSource)

    features_si = mb_source.stream_info("features")
    labels_si = mb_source.stream_info("labels")

    mb = mb_source.get_next_minibatch(7)

    features = mb[features_si].m_data
    # 2 sequences, max seq len 4, 1000 dim
    assert features.data().shape().dimensions() == (2, 4, input_dim)
    assert features.data().is_sparse()
    # TODO features is sparse and cannot be accessed right now:
    # *** RuntimeError: DataBuffer/WritableDataBuffer methods can only be called for NDArrayView objects with dense storage format

    labels = mb[labels_si].m_data
    # 2 sequences, max seq len 1, 5 dim
    assert labels.data().shape().dimensions() == (2, 1, num_output_classes)
    assert not labels.data().is_sparse()

    assert np.allclose(np.asarray(labels), np.asarray([[[1.0, 0.0, 0.0, 0.0, 0.0]], [[0.0, 1.0, 0.0, 0.0, 0.0]]]))
Example #4
def test_text_format():
    import os
    import numpy as np
    from cntk.io import text_format_minibatch_source, StreamConfiguration, MinibatchSource

    # 0	|x 560	|y 1 0 0 0 0
    # 0	|x 0
    # 0	|x 0
    # 1	|x 560	|y 0 1 0 0 0
    # 1	|x 0
    # 1	|x 0
    # 1	|x 424
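    # abs_path is assumed to be defined at module level as the directory
    # holding the test data file tf_data.txt.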
    path = os.path.join(abs_path, 'tf_data.txt')

    input_dim = 1000
    num_output_classes = 5

    mb_source = text_format_minibatch_source(path, [
                    StreamConfiguration('features', input_dim, True, 'x'),
                    StreamConfiguration('labels', num_output_classes, False, 'y')])
    assert isinstance(mb_source, MinibatchSource)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.get_next_minibatch(7)

    features = mb[features_si].m_data
    # TODO features is sparse and cannot be accessed right now:
    # *** RuntimeError: DataBuffer/WritableDataBuffer methods can only be called for NDArrayView objects with dense storage format
    # 2 sequences, max seq len 4, 1000 dim
    #assert features.data().shape().dimensions() == (2, 4, input_dim)
    #assert features.data().is_sparse()

    labels = mb[labels_si].m_data
    # 2 sequences, max seq len 1, 5 dim
    assert labels.data().shape().dimensions() == (2, 1, num_output_classes)
    assert not labels.data().is_sparse()

    assert np.allclose(np.asarray(labels),
            np.asarray([
                [[ 1.,  0.,  0.,  0.,  0.]],
                [[ 0.,  1.,  0.,  0.,  0.]]
                ]))