Example #1
0
def test_shuffled_ignore_missing(temp_file_1_name, temp_file_2_name):
    """Check that ``ignore_missing=True`` skips keys absent from a table.

    The token archive holds keys '1', '3' and '4' and the boolean archive
    holds '0', '1', '2' and '4', while ``key_list`` requests '0' to '5'.
    The test expects only keys present in every archive being read to be
    yielded, in reverse key order under ``NonRandomState``.

    Parameters
    ----------
    temp_file_1_name
        Path (appended to ``'ark:'``) that the token archive is written to.
    temp_file_2_name
        Path (appended to ``'ark:'``) that the boolean archive is written to.
    """
    with io_open('ark:' + temp_file_1_name, 't', mode='w') as token_f:
        token_f.write('1', 'cool')
        token_f.write('3', 'bean')
        token_f.write('4', 'casserole')
    keys = [str(i) for i in range(6)]
    data = corpus.ShuffledData(('ark:' + temp_file_1_name, 't'),
                               key_list=keys,
                               ignore_missing=True,
                               rng=NonRandomState())
    assert len(data) == 3
    act_samples = list(data)
    ex_samples = ['casserole', 'bean', 'cool']
    # zip() stops at the shorter input, so pin the number of yielded samples
    # first; otherwise a short iteration would pass the comparison unnoticed.
    assert len(act_samples) == len(ex_samples)
    assert all(ex == act for ex, act in zip(ex_samples, act_samples))
    with io_open('ark:' + temp_file_2_name, 'B', mode='w') as bool_f:
        bool_f.write('0', True)
        bool_f.write('1', False)
        bool_f.write('2', True)
        bool_f.write('4', False)
    data = corpus.ShuffledData(('ark:' + temp_file_1_name, 't'),
                               ('ark:' + temp_file_2_name, 'B'),
                               key_list=keys,
                               ignore_missing=True,
                               rng=NonRandomState())
    assert len(data) == 2
    # Transpose the (token, bool) pairs into one tuple per source.
    act_tok_samples, act_bool_samples = list(zip(*iter(data)))
    ex_tok_samples = ['casserole', 'cool']
    # Both transposed tuples share a length, so one check covers the pair.
    assert len(act_tok_samples) == len(ex_tok_samples)
    assert all(ex == act
               for ex, act in zip(ex_tok_samples, act_tok_samples))
    # Keys '1' and '4' were both written as False in the boolean archive.
    assert all(not act for act in act_bool_samples)
Example #2
0
def test_seeded_shuffled_is_predictable(temp_file_1_name, seed):
    """Two ShuffledData objects built with the same ``rng`` must agree.

    A thousand float32 vectors of length 100 are written to an archive and
    read back through two separately constructed iterators that were both
    given ``seed``. Corresponding batches must be close on two passes.
    """
    rspecifier = 'ark:' + temp_file_1_name
    vectors = np.arange(100000).reshape((1000, 100)).astype(np.float32)
    with io_open(rspecifier, 'fv', mode='w') as writer:
        for row, vector in enumerate(vectors):
            writer.write(str(row), vector)

    def build():
        # Each call constructs a fresh iterator over the same archive.
        return corpus.ShuffledData((rspecifier, 'fv'),
                                   batch_size=13,
                                   rng=seed)

    first, second = build(), build()
    for _epoch in range(2):
        paired_batches = zip(first, second)
        for left, right in paired_batches:
            assert np.allclose(left, right)
Example #3
0
def test_shuffled_data_basic(temp_file_1_name):
    """Check length, key list and batch contents of a single-source iterator.

    Ten matrices of shape (200, 50) are written under keys '0' to '9' and
    read back in batches of three. The test expects samples in reverse key
    order under ``NonRandomState``, identically on two passes.

    Parameters
    ----------
    temp_file_1_name
        Path (appended to ``'ark:'``) that the matrix archive is written to.
    """
    # Match the precision of the library's base matrix type so the values
    # written are the values read back.
    samples = np.arange(100000).reshape((10, 200, 50)).astype(
        np.float64 if KaldiDataType.BaseMatrix.is_double else np.float32)
    keys = tuple(str(i) for i in range(10))
    with io_open('ark:' + temp_file_1_name, 'bm', mode='w') as f:
        for key, sample in zip(keys, samples):
            f.write(key, sample)
    data = corpus.ShuffledData('ark:' + temp_file_1_name,
                               batch_size=3,
                               rng=NonRandomState())
    assert isinstance(data.rng, NonRandomState)
    assert len(data) == int(np.ceil(len(keys) / 3))
    assert keys == tuple(data.key_list)
    for _ in range(2):
        # Count down from the last sample: the expected order is reversed.
        ex_samp_idx = len(samples)
        for batch in data:
            for act_sample in batch:
                ex_samp_idx -= 1
                assert np.allclose(samples[ex_samp_idx], act_sample)
        # Every sample must have been yielded; without this an iterator that
        # produced too few (or no) batches would pass the loop vacuously.
        assert ex_samp_idx == 0
Example #4
0
def test_shuffled_data_tups(temp_file_1_name, temp_file_2_name):
    """Check padded, multi-source batches together with axis lengths.

    Integer features and double-precision labels with differing second-axis
    sizes are written under keys '0' to '3'. They are read back twice: once
    with a single ``axis_lengths`` entry and once with one per source. Each
    padded batch entry must hold the original values followed by zeros, and
    the reported lengths must equal each sample's size along axis 1.

    Parameters
    ----------
    temp_file_1_name
        Path (appended to ``'ark:'``) that the feature archive is written to.
    temp_file_2_name
        Path (appended to ``'ark:'``) that the label archive is written to.
    """
    feats = [
        [[1, 2, 3, 4], [5, 6, 7, 8]],
        [[9, 10], [11, 12]],
        [[13, 14, 15], [16, 17, 18]],
        [[19], [20]],
    ]
    labels = [
        np.array([[1, 2], [3, 4]], dtype=np.float64),
        np.array([[5, 6, 7, 8], [9, 10, 11, 12]], dtype=np.float64),
        np.array([[13], [14]], dtype=np.float64),
        np.array([[15, 16, 17], [18, 19, 20]], dtype=np.float64)
    ]
    keys = tuple(str(i) for i in range(4))
    with io_open('ark:' + temp_file_1_name, 'ivv', mode='w') as feat_f, \
            io_open('ark:' + temp_file_2_name, 'dm', mode='w') as lab_f:
        for key, feat, label in zip(keys, feats, labels):
            feat_f.write(key, feat)
            lab_f.write(key, label)
    data = corpus.ShuffledData(('ark:' + temp_file_1_name, 'ivv'),
                               ('ark:' + temp_file_2_name, 'dm'),
                               batch_size=2,
                               batch_pad_mode='constant',
                               key_list=keys,
                               axis_lengths=1,
                               rng=NonRandomState(),
                               batch_cast_to_array=(np.int32, None, None))
    for _ in range(2):
        # Count down from the last sample: the expected order is reversed.
        ex_samp_idx = len(feats)
        for feat_batch, _label_batch, len_batch in data:
            for act_feat, act_len in zip(feat_batch, len_batch):
                ex_samp_idx -= 1
                # np.asarray rather than np.array(..., copy=False): NumPy 2
                # raises ValueError when a copy is unavoidable, and turning a
                # nested list into an array always requires one.
                ex_feat = np.asarray(feats[ex_samp_idx])
                ex_len = ex_feat.shape[1]
                assert ex_len == act_len
                assert np.allclose(ex_feat, act_feat[:, :ex_len])
                # Anything past the true length must be constant zero padding.
                assert np.allclose(act_feat[:, ex_len:], 0)
        # Guard against an iterator that yields too few (or no) samples.
        assert ex_samp_idx == 0
    data = corpus.ShuffledData(
        ('ark:' + temp_file_1_name, 'ivv'), ('ark:' + temp_file_2_name, 'dm'),
        batch_size=3,
        batch_pad_mode='constant',
        key_list=keys,
        axis_lengths=((1, 1), (0, 1)),
        rng=NonRandomState(),
        batch_cast_to_array=(np.int32, None, None, None))
    for _ in range(2):
        ex_samp_idx = len(feats)
        # Lengths arrive in the order given to axis_lengths: labels first,
        # then features.
        for feat_batch, label_batch, lablen_batch, featlen_batch in data:
            for act_feat, act_label, act_lablen, act_featlen in zip(
                    feat_batch, label_batch, lablen_batch, featlen_batch):
                ex_samp_idx -= 1
                ex_feat = np.asarray(feats[ex_samp_idx])
                ex_label = labels[ex_samp_idx]
                ex_featlen = ex_feat.shape[1]
                ex_lablen = ex_label.shape[1]
                assert ex_featlen == act_featlen
                assert ex_lablen == act_lablen
                assert np.allclose(ex_feat, act_feat[:, :ex_featlen])
                assert np.allclose(act_feat[:, ex_featlen:], 0)
                assert np.allclose(ex_label, act_label[:, :ex_lablen])
                assert np.allclose(act_label[:, ex_lablen:], 0)
        assert ex_samp_idx == 0