Example #1
def test_read_sequential(temp_file_1_name):
    values = (
        [[1, 2] * 10] * 10,
        np.eye(1000, dtype=np.float32),
        [[]],
        np.outer(
            np.arange(1000, dtype=np.float32),
            np.arange(1000, dtype=np.float32)),
    )
    writer = io_open('ark:{}'.format(temp_file_1_name), 'fm', mode='w')
    for key, value in enumerate(values):
        writer.write(str(key), value)
    writer.close()
    count = 0
    reader = io_open('ark:{}'.format(temp_file_1_name), 'fm')
    for act_value, reader_value in zip(values, iter(reader)):
        assert np.allclose(act_value, reader_value)
        count += 1
    assert count == len(values)
    reader.close()
    # check that the keys come back as written
    reader = io_open('ark:{}'.format(temp_file_1_name), 'fm')
    for idx, tup in enumerate(reader.items()):
        key, value = tup
        assert str(idx) == key
Example #2
def __init__(self, table, *additional_tables, **kwargs):
    key_list = kwargs.pop('key_list', None)
    rng = kwargs.pop('rng', None)
    super(ShuffledData, self).__init__(table, *additional_tables, **kwargs)
    try:
        key_list = tuple(key_list)
    except TypeError:
        pass
    if key_list is None:
        _, rx_fn, rx_type, _ = parse_kaldi_input_path(
            self.table_specifiers[0][0])
        if rx_type == RxfilenameType.InvalidInput:
            raise IOError('Invalid rspecifier {}'.format(rx_fn))
        elif rx_type == RxfilenameType.StandardInput:
            raise IOError(
                'Cannot infer key list from stdin (cannot reopen)')
        with io_open(*self.table_specifiers[0][:2]) as reader:
            self.key_list = tuple(reader.keys())
    else:
        self.key_list = tuple(key_list)
    if self.ignore_missing:
        self._num_samples = None
    else:
        self._num_samples = len(self.key_list)
    if isinstance(rng, np.random.RandomState):
        self.rng = rng
    else:
        self.rng = np.random.RandomState(rng)
    self.table_handles = tuple(
        io_open(rspecifier, kdtype, mode='r+', **o_kwargs)
        for rspecifier, kdtype, o_kwargs in self.table_specifiers)
Example #3
def test_shuffled_ignore_missing(temp_file_1_name, temp_file_2_name):
    with io_open('ark:' + temp_file_1_name, 't', mode='w') as token_f:
        token_f.write('1', 'cool')
        token_f.write('3', 'bean')
        token_f.write('4', 'casserole')
    keys = [str(i) for i in range(6)]
    data = corpus.ShuffledData(('ark:' + temp_file_1_name, 't'),
                               key_list=keys,
                               ignore_missing=True,
                               rng=NonRandomState())
    assert len(data) == 3
    act_samples = list(data)
    assert all(ex == act
               for ex, act in zip(['casserole', 'bean', 'cool'], act_samples))
    with io_open('ark:' + temp_file_2_name, 'B', mode='w') as bool_f:
        bool_f.write('0', True)
        bool_f.write('1', False)
        bool_f.write('2', True)
        bool_f.write('4', False)
    data = corpus.ShuffledData(('ark:' + temp_file_1_name, 't'),
                               ('ark:' + temp_file_2_name, 'B'),
                               key_list=keys,
                               ignore_missing=True,
                               rng=NonRandomState())
    assert len(data) == 2
    act_tok_samples, act_bool_samples = list(zip(*iter(data)))
    assert all(ex == act
               for ex, act in zip(['casserole', 'cool'], act_tok_samples))
    assert all(not act for act in act_bool_samples)
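The NonRandomState fixture these tests pass as rng is not shown on this page. A minimal sketch consistent with the assertions above (and with the isinstance check in Example #2), in which "shuffling" deterministically reverses so that samples come back in reverse key order, might be:

import numpy as np

class NonRandomState(np.random.RandomState):
    '''Deterministic RandomState stand-in: "shuffling" just reverses'''
    def shuffle(self, array):
        array[:] = array[::-1]

This is a hypothetical reconstruction; the real test fixture may differ.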
Example #4
def test_sequential_data_tups(temp_file_1_name, temp_file_2_name):
    feats = np.random.random((4, 10, 100)).astype(np.float64)
    labels = [
        ('foo', ),
        (
            'bar',
            'baz',
        ),
        ('bingo', ),
        ('bango', 'bongo', 'eugene'),
    ]
    with io_open('ark:' + temp_file_1_name, 'dm', mode='w') as feats_f, \
            io_open('ark:' + temp_file_2_name, 'tv', mode='w') as labels_f:
        for idx, (feat, label) in enumerate(zip(feats, labels)):
            feats_f.write(str(idx), feat)
            labels_f.write(str(idx), label)
    data = corpus.SequentialData(('ark,s:' + temp_file_1_name, 'dm'),
                                 ('ark,s:' + temp_file_2_name, 'tv'),
                                 axis_lengths=0)
    batch_start = 0
    for ex_feat, ex_label, (act_feat, act_label, act_len) in zip(
            feats, labels, data):
        assert np.allclose(ex_feat, act_feat)
        assert ex_label == act_label
        assert act_len == 10
        batch_start += 1
    assert batch_start == len(feats)
    assert len(data) == batch_start
Example #5
def test_read_write_valid(temp_file_1_name, ktype, value, binary):
    with io_open(temp_file_1_name, mode='w', header=False) as outp:
        outp.write(value, ktype, write_binary=binary)
    with io_open(temp_file_1_name, header=False) as inp:
        read_value = inp.read(ktype, read_binary=binary)
    if ktype in ('bv', 'bm', 'fv', 'fm', 'dv', 'dm', 'b', 'd', 'bpv'):
        assert np.allclose(read_value, value)
    else:
        assert read_value == value
Example #6
def test_read_write_pipe_posix(temp_file_1_name):
    value = np.ones((1000, 10000), dtype=np.float32)
    writer = io_open(
        'ark:| gzip -c > {}'.format(temp_file_1_name), 'fm', mode='w')
    writer.write('bar', value)
    writer.close()
    reader = io_open(
        'ark:gunzip -c {}|'.format(temp_file_1_name), 'fm', mode='r+')
    assert np.allclose(reader['bar'], value)
Example #7
def test_write_read_numpy_versions(temp_file_1_name, ktype, dtype, value):
    npy_value = np.array(value).astype(dtype)
    with io_open('ark:' + temp_file_1_name, ktype, mode='w') as writer:
        writer.write('key', npy_value)
    with io_open('ark:' + temp_file_1_name, ktype) as reader:
        act_value = next(reader)
    if ktype in ('b', 'bpv'):
        assert np.allclose(value, act_value)
    else:
        assert value == act_value
Example #8
def test_write_read_numpy_versions(temp_file_1_name, ktype, dtype, value):
    npy_value = np.array(value).astype(dtype)
    with io_open(temp_file_1_name, mode='w', header=False) as outp:
        outp.write(npy_value, ktype)
    with io_open(temp_file_1_name, header=False) as inp:
        act_value = inp.read(ktype)
    if ktype in ('b', 'bpv'):
        assert np.allclose(value, act_value)
    else:
        assert value == act_value
Example #9
def test_shuffled_data_tups(temp_file_1_name, temp_file_2_name):
    feats = [
        [[1, 2, 3, 4], [5, 6, 7, 8]],
        [[9, 10], [11, 12]],
        [[13, 14, 15], [16, 17, 18]],
        [[19], [20]],
    ]
    labels = [
        np.array([[1, 2], [3, 4]], dtype=np.float64),
        np.array([[5, 6, 7, 8], [9, 10, 11, 12]], dtype=np.float64),
        np.array([[13], [14]], dtype=np.float64),
        np.array([[15, 16, 17], [18, 19, 20]], dtype=np.float64)
    ]
    keys = tuple(str(i) for i in range(4))
    with io_open('ark:' + temp_file_1_name, 'ivv', mode='w') as feat_f, \
            io_open('ark:' + temp_file_2_name, 'dm', mode='w') as lab_f:
        for key, feat, label in zip(keys, feats, labels):
            feat_f.write(key, feat)
            lab_f.write(key, label)
    data = corpus.ShuffledData(
        ('ark:' + temp_file_1_name, 'ivv'), ('ark:' + temp_file_2_name, 'dm'),
        batch_size=2, batch_pad_mode='constant',
        key_list=keys, axis_lengths=1, rng=NonRandomState(),
        batch_cast_to_array=(np.int32, None, None))
    for _ in range(2):
        ex_samp_idx = len(feats)
        for feat_batch, _, len_batch in data:
            for act_feat, act_len in zip(feat_batch, len_batch):
                ex_samp_idx -= 1
                ex_feat = np.array(feats[ex_samp_idx], copy=False)
                ex_len = ex_feat.shape[1]
                assert ex_len == act_len
                assert np.allclose(ex_feat, act_feat[:, :ex_len])
                assert np.allclose(act_feat[:, ex_len:], 0)
    data = corpus.ShuffledData(
        ('ark:' + temp_file_1_name, 'ivv'), ('ark:' + temp_file_2_name, 'dm'),
        batch_size=3, batch_pad_mode='constant',
        key_list=keys, axis_lengths=((1, 1), (0, 1)), rng=NonRandomState(),
        batch_cast_to_array=(np.int32, None, None, None))
    for _ in range(2):
        ex_samp_idx = len(feats)
        for feat_batch, label_batch, lablen_batch, featlen_batch in data:
            for act_feat, act_label, act_lablen, act_featlen in zip(
                    feat_batch, label_batch, lablen_batch, featlen_batch):
                ex_samp_idx -= 1
                ex_feat = np.array(feats[ex_samp_idx], copy=False)
                ex_label = labels[ex_samp_idx]
                ex_featlen = ex_feat.shape[1]
                ex_lablen = ex_label.shape[1]
                assert ex_featlen == act_featlen
                assert ex_lablen == act_lablen
                assert np.allclose(ex_feat, act_feat[:, :ex_featlen])
                assert np.allclose(act_feat[:, ex_featlen:], 0)
                assert np.allclose(ex_label, act_label[:, :ex_lablen])
                assert np.allclose(act_label[:, ex_lablen:], 0)
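The padding behaviour asserted above is plain constant padding of each ragged sample up to the widest sample in its batch; batch_pad_mode is presumably handed to numpy-style padding, roughly:

import numpy as np

sample = np.array([[9, 10], [11, 12]])                       # width 2
padded = np.pad(sample, ((0, 0), (0, 2)), mode='constant')   # batch-max width 4
assert np.allclose(padded[:, 2:], 0)                         # trailing zeros, as checked above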
Example #10
def test_filehandle_open(temp_file_1_name):
    specifier = 'ark:{}'.format(temp_file_1_name)
    kaldi_io = io_open(specifier, 'bm', mode='w')
    assert isinstance(kaldi_io, table_streams.KaldiTable)
    assert isinstance(kaldi_io, table_streams.KaldiWriter)
    kaldi_io = io_open(specifier, 'bm')
    assert isinstance(kaldi_io, table_streams.KaldiSequentialReader)
    kaldi_io = io_open(specifier, 'bm', mode='r')
    assert isinstance(kaldi_io, table_streams.KaldiSequentialReader)
    kaldi_io = io_open(specifier, 'bm', mode='r+')
    assert isinstance(kaldi_io, table_streams.KaldiRandomAccessReader)
Example #11
def test_read_random(temp_file_1_name):
    writer = io_open('ark:{}'.format(temp_file_1_name), 'dv', mode='w')
    writer.write('able', [])
    writer.write('was', [2])
    writer.write('I', [3, 3])
    writer.write('ere', [4, 4])
    writer.close()
    reader = io_open('ark,o:{}'.format(temp_file_1_name), 'dv', mode='r+')
    assert np.allclose(reader['I'], [3, 3])
    assert np.allclose(reader['able'], [])
    assert np.allclose(reader['was'], [2])
Example #12
def test_invalid_scp(temp_file_1_name):
    # make sure invalid scp files don't segfault
    with open(temp_file_1_name, mode='wb') as writer:
        writer.write(np.random.bytes(1000))
    try:
        with io_open('scp:' + temp_file_1_name) as reader:
            next(reader)
    except Exception:
        pass
    with open(temp_file_1_name, mode='wb') as writer:
        writer.write(b'foo ' + np.random.bytes(1000))
    try:
        with io_open('scp:' + temp_file_1_name) as reader:
            next(reader)
    except Exception:
        pass
Example #13
def test_invalid_tv_does_not_segfault(temp_file_1_name):
    # weird bug I found
    tv = 'foo bar'
    writer = io_open('ark:' + temp_file_1_name, 'tv', mode='w')
    with pytest.raises(Exception):
        writer.write('foo', tv)
    with pytest.raises(Exception):
        writer.write('foo', np.array(tv))
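A token vector ('tv') must be a sequence of whitespace-free tokens, which is why the single string with an embedded space is rejected above. A sketch of a write that should succeed with the same fixture:

with io_open('ark:' + temp_file_1_name, 'tv', mode='w') as writer:
    writer.write('foo', ('foo', 'bar'))  # two tokens, not one string with a space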
Example #14
def test_sequential_ignore_missing(temp_file_1_name, temp_file_2_name):
    with io_open('ark:' + temp_file_1_name, 'ipv', mode='w') as pair_f:
        pair_f.write('10', [(10, 9), (8, 7), (6, 5)])
        pair_f.write('11', [(11, 10), (9, 8)])
        pair_f.write('12', [(12, 11)])
        pair_f.write('14', [])
    with io_open('ark:' + temp_file_2_name, 'd', mode='w') as double_f:
        double_f.write('09', 3.14)
        double_f.write('10', .159)
        double_f.write('12', .265)
        double_f.write('13', .357)
    data = corpus.SequentialData(('ark,s:' + temp_file_1_name, 'ipv'),
                                 ('ark,s:' + temp_file_2_name, 'd'),
                                 ignore_missing=True)
    act_pair_samples, act_double_samples = list(zip(*iter(data)))
    assert act_pair_samples == (((10, 9), (8, 7), (6, 5)), ((12, 11), ))
    assert np.allclose(act_double_samples, (.159, .265))
Example #15
def test_write_invalid(temp_file_1_name, dtype, value, is_text):
    if is_text:
        specifier = 'ark,t:{}'.format(temp_file_1_name)
    else:
        specifier = 'ark:{}'.format(temp_file_1_name)
    writer = io_open(specifier, dtype, mode='w')
    with pytest.raises(Exception):
        writer.write('a', value)
Example #16
def test_cache(temp_file_1_name):
    with io_open('ark:' + temp_file_1_name, 'B', mode='w') as writer:
        writer.write('a', True)
        writer.write('b', False)
    with io_open('ark:' + temp_file_1_name, 'B', mode='r+', cache=True) as r:
        assert r.cache_dict == dict()
        assert 'a' not in r.cache_dict
        assert 'a' in r
        assert r['a']
        assert r.cache_dict == {'a': True}
        assert 'a' in r.cache_dict
        assert 'b' not in r.cache_dict
        assert 'b' in r
        assert not r['b']
        assert r.cache_dict == {'a': True, 'b': False}
        r.cache_dict['b'] = True
        assert r['b']
Example #17
def decode_cnn_ctc(args=None):
    '''Decode CNN w/ CTC using kaldi data tables'''
    logger = logging.getLogger(sys.argv[0])
    if not logger.handlers:
        logger.addHandler(logging.StreamHandler())
    register_logger_for_kaldi(sys.argv[0])
    options = _decode_cnn_ctc_parse_args(args, logger)
    logger.log(9, 'Parsed options')
    id2label_map = dict()
    with open(options.label_to_id_map_path) as file_obj:
        for line in file_obj:
            label, idee = line.strip().split()
            idee = int(idee)
            if idee < 0:
                logger.error('All label ids must be nonnegative')
                return 1
            id2label_map[idee] = label
    if (len(id2label_map) + 1) != options.num_labels:
        logger.error('Expected {} labels in id to label map, got {}'.format(
            options.num_labels - 1, len(id2label_map)))
        return 1
    for idee in range(options.num_labels - 1):
        if idee not in id2label_map.keys():
            logger.error('label to id map missing id: {}'.format(idee))
            return 1
    logger.log(9, 'Loaded label to id map')
    model_config = ModelConfig(**vars(options))
    decode_config = DecodeConfig(**vars(options))
    decode_data = DecodeData(
        options.data_rspecifier,
        delta_order=model_config.delta_order,
        cmvn_rxfilename=model_config.cmvn_rxfilename,
        batch_size=decode_config.batch_size,
    )
    total_batches = len(decode_data)
    labels_out = io_open(options.output_wspecifier, 'tv', mode='w')
    logger.log(9, 'Set up eval data and opened label output file')
    with redirect_stdout_to_stderr():
        logger.log(9, 'Creating model')
        from pydrobert.mol.model import ConvCTC
        model = ConvCTC(model_config)
        logger.log(9, 'Beginning decoding')
        batches_decoded = 0
        logger.log(9, '000/{:03d} batches decoded'.format(total_batches))
        for label_batch in model.decode_generator(decode_config, decode_data):
            if decode_data.batch_size:
                for key, label_ids in label_batch:
                    labels_out.write(
                        key, tuple(id2label_map[idee] for idee in label_ids))
            else:
                labels_out.write(label_batch[0], tuple(label_batch[1]))
            batches_decoded += 1
            if batches_decoded % max(1, total_batches // 10) == 0:
                logger.log(
                    9, '{:03d}/{:03d} batches decoded'.format(
                        batches_decoded, total_batches))
    logger.info('Done decoding')
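The label-to-id map this script reads is a plain text file with one 'label id' pair per line, covering every id in 0..num_labels-2 exactly once (the leftover id is presumably reserved for the CTC blank). A hypothetical map for num_labels=4:

a 0
b 1
<space> 2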
Example #18
def test_incorrect_open_read(temp_file_1_name, temp_file_2_name, ktype, value,
                             is_text, bg):
    if ktype == 'wm' and is_text:
        pytest.skip("WaveMatrix can only be written as binary")
    opts = ['', 't'] if is_text else ['']
    specifier_1 = 'ark' + ','.join(opts) + ':' + temp_file_1_name
    specifier_2 = 'ark' + ','.join(opts) + ':' + temp_file_2_name
    with io_open(specifier_1, ktype, mode='w') as writer_1, io_open(
            specifier_2, ktype, mode='w') as writer_2:
        writer_1.write('0', value)
        writer_2.write('0', value)
    if bg:
        opts += ['bg']
        specifier_1 = 'ark' + ','.join(opts) + ':' + temp_file_1_name
        specifier_2 = 'ark' + ','.join(opts) + ':' + temp_file_2_name
    for bad_ktype in KaldiDataType:
        try:
            with io_open(specifier_1, bad_ktype) as reader:
                next(reader)
        except Exception:
            # sometimes it'll work, and the expected output will be
            # correct (in the case of basic types). We don't care. All
            # we care about here is that we don't segfault
            pass
    # now we add some garbage data to the end of the file and try to
    # iterate through. Chances are this will end in failure (hopefully
    # not a segfault)
    with open(temp_file_1_name, mode='ab') as writer:
        writer.write(np.random.bytes(1000))
    try:
        with io_open(specifier_1, ktype) as reader:
            list(reader)
    except Exception:
        pass
    # do the same, but only corrupt *after* the key
    with open(temp_file_2_name, mode='ab') as writer:
        writer.write(b'1 ' + np.random.bytes(1000))
    try:
        with io_open(specifier_2, ktype) as reader:
            list(reader)
    except Exception:
        pass
Example #19
def test_read_write(temp_file_1_name, dtype, value, is_text, bg):
    opts = ['', 't'] if is_text else ['']
    specifier = 'ark' + ','.join(opts) + ':' + temp_file_1_name
    writer = io_open(specifier, dtype, mode='w')
    writer.write('a', value)
    writer.close()
    if bg:
        opts += ['bg']
        specifier = 'ark' + ','.join(opts) + ':' + temp_file_1_name
    reader = io_open(specifier, dtype)
    once = True
    for read_value in iter(reader):
        assert once, "Multiple values"
        try:
            assert np.allclose(read_value, value)
        except TypeError:
            assert read_value == value
        once = False
    reader.close()
Example #20
def alt_add_deltas(args=None):
    '''Python-based code for adding deltas

    Used for debugging
    '''
    logger = logging.getLogger(sys.argv[0])
    if not logger.handlers:
        logger.addHandler(logging.StreamHandler())
    register_logger_for_kaldi(sys.argv[0])
    options = _alt_add_deltas_parse_args(args, logger)
    feats_in = io_open(options.feats_in, 'bm')
    feats_out = io_open(options.feats_out, 'bm', mode='w')
    num_utts = 0
    for utt_id, feats in feats_in.items():
        feats = calculate_deltas(feats, options.delta_order)
        feats_out.write(utt_id, feats)
        num_utts += 1
    logger.info('Added {} deltas to {} utterances'.format(
        options.delta_order, num_utts))
Example #21
def alt_apply_cmvn(args=None):
    '''Python-based code for CMVN application

    Used for debugging
    '''
    logger = logging.getLogger(sys.argv[0])
    if not logger.handlers:
        logger.addHandler(logging.StreamHandler())
    register_logger_for_kaldi(sys.argv[0])
    options = _alt_apply_cmvn_parse_args(args, logger)
    cmvn = CMVNCalculator(options.cmvn_stats_in)
    feats_in = io_open(options.feats_in, 'bm')
    feats_out = io_open(options.feats_out, 'bm', mode='w')
    num_utts = 0
    for utt_id, feats in feats_in.items():
        feats = cmvn.apply(feats, in_place=True)
        feats_out.write(utt_id, feats)
        num_utts += 1
    logger.info('Applied CMVN to {} utterances'.format(num_utts))
Example #22
def test_write_int32_correct_size(temp_file_1_name):
    with io_open('ark:' + temp_file_1_name, 'i', mode='w') as writer:
        writer.write('9', 182)
    # size should be 9
    # 2 bytes for '9 '
    # 2 bytes for the binary marker \0B
    # 1 byte for size of type in bytes (4)
    # 4 bytes for actual int
    with open(temp_file_1_name, 'rb') as file_obj:
        buf = file_obj.read()
    assert len(buf) == 9
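The byte accounting in the comment can be verified directly. A sketch continuing the fixture above, assuming the little-endian int32 layout Kaldi writes on common platforms:

import struct

with open(temp_file_1_name, 'rb') as file_obj:
    buf = file_obj.read()
assert buf[:2] == b'9 '                          # key plus separating space
assert buf[2:4] == b'\x00B'                      # binary-mode marker \0B
assert buf[4:5] == b'\x04'                       # int32 width in bytes
assert struct.unpack('<i', buf[5:9])[0] == 182   # the value itself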
Example #23
def test_write_script_and_archive(temp_file_1_name, temp_file_2_name):
    values = {
        'foo': np.ones((21, 32), dtype=np.float64),
        'bar': np.zeros((10, 1000), dtype=np.float64),
        'baz': -1e10 * np.eye(20, dtype=np.float64),
    }
    keys = list(values)
    writer = io_open(
        'ark,scp:{},{}'.format(temp_file_1_name, temp_file_2_name),
        'dm', mode='w')
    # to make a missing entry, append it to the file's end with a subproc
    for key in keys:
        writer.write(key, values[key])
    writer.close()
    keys.reverse()
    reader = io_open('scp:{}'.format(temp_file_2_name), 'dm', mode='r+')
    for key in keys:
        assert np.allclose(reader[key], values[key]), key
    assert np.allclose(reader['bar'], values['bar']), "Failed doublecheck"
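The 'ark,scp:' wspecifier writes the archive and a script file in one pass; the script file is plain text mapping each key to a byte offset into the archive, so temp_file_2_name ends up containing lines like the following (path and offsets illustrative):

foo /tmp/temp_file_1:4
bar /tmp/temp_file_1:5428
baz /tmp/temp_file_1:85852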
Example #24
def _ignore_epoch(self):
    '''Epoch of samples w/ ignore_missing'''
    iters = tuple(
        io_open(spec[0], spec[1], **spec[2]).items()
        for spec in self.table_specifiers
    )
    num_samples = 0
    num_tabs = len(iters)
    try:
        while True:
            samp_tup = [None] * num_tabs
            high_key = None
            tab_idx = 0
            while tab_idx < num_tabs:
                if samp_tup[tab_idx] is None:
                    key, value = next(iters[tab_idx])
                    if high_key is None:
                        high_key = key
                    elif high_key < key:
                        # key is further along than keys in
                        # samp_tup. Discard those and keep this
                        samp_tup = [None] * num_tabs
                        samp_tup[tab_idx] = value
                        high_key = key
                        tab_idx = 0
                        continue
                    elif high_key > key:
                        # key is behind high_key. keep pushing this
                        # iterator forward
                        continue
                    samp_tup[tab_idx] = value
                tab_idx += 1
            num_samples += 1
            for sub_batch_idx, axis_idx in self.axis_lengths:
                samp_tup.append(
                    np.array(
                        samp_tup[sub_batch_idx],
                        copy=False).shape[axis_idx])
            if self.add_key:
                samp_tup.insert(0, key)
            if self.num_sub != 1:
                yield tuple(samp_tup)
            else:
                yield samp_tup[0]
    except StopIteration:
        pass
    # don't care if one iterator ends first - rest will be missing
    # that iterator's value
    if self._num_samples is None:
        self._num_samples = num_samples
    elif self._num_samples != num_samples:
        raise IOError(
            'Different number of samples from last time! (is a '
            'table from stdin?)')
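The inner while-loop above is a sorted-merge intersection: each table's iterator is advanced until all tables agree on the current key, and values whose keys are missing from any table are discarded. A standalone sketch of the same idea on plain (key, value) iterables, using the data from the ignore_missing tests above:

def intersect_sorted(*kv_iterables):
    '''Yield one value tuple per key present in every sorted iterable'''
    iters = [iter(it) for it in kv_iterables]
    try:
        entries = [next(it) for it in iters]
        while True:
            high = max(key for key, _ in entries)
            for idx, it in enumerate(iters):
                while entries[idx][0] < high:
                    entries[idx] = next(it)   # catch up to the highest key
            if all(key == high for key, _ in entries):
                yield tuple(value for _, value in entries)
                entries = [next(it) for it in iters]
    except StopIteration:
        pass  # one table ran out; remaining keys can't be in all tables

list(intersect_sorted(
    [('1', 'cool'), ('3', 'bean'), ('4', 'casserole')],
    [('0', True), ('1', False), ('2', True), ('4', False)]))
# -> [('cool', False), ('casserole', False)]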
Example #25
def _no_ignore_epoch(self):
    '''Epoch of samples w/o ignore_missing'''
    iters = tuple(
        io_open(spec[0], spec[1], **spec[2]).items()
        for spec in self.table_specifiers
    )
    num_samples = 0
    for kv_pairs in zip(*iters):
        samp_tup = []
        past_key = None
        for tab_idx, (key, sample) in enumerate(kv_pairs):
            if past_key is None:
                past_key = key
            elif past_key != key:
                # assume sorted, base on which is first
                if past_key < key:
                    miss_rspec = self.table_specifiers[tab_idx][0]
                    miss_key = past_key
                else:
                    miss_rspec = self.table_specifiers[tab_idx - 1][0]
                    miss_key = key
                raise IOError(
                    'Table {} missing key {} (or tables are sorted '
                    'differently)'.format(miss_rspec, miss_key))
            samp_tup.append(sample)
        num_samples += 1
        for sub_batch_idx, axis_idx in self.axis_lengths:
            samp_tup.append(
                np.array(
                    samp_tup[sub_batch_idx], copy=False).shape[axis_idx])
        if self.add_key:
            samp_tup.insert(0, key)
        if self.num_sub != 1:
            yield tuple(samp_tup)
        else:
            yield samp_tup[0]
    # make sure all iterators ended at the same time
    for tab_idx, it in enumerate(iters):
        try:
            miss_key, _ = next(it)
            if tab_idx:
                miss_rspec = self.table_specifiers[0][0]
            else:
                miss_rspec = self.table_specifiers[1][0]
            raise IOError(
                'Table {} missing key {}'.format(miss_rspec, miss_key))
        except StopIteration:
            pass
    if self._num_samples is None:
        self._num_samples = num_samples
    elif self._num_samples != num_samples:
        raise IOError(
            'Different number of samples from last time! (is a '
            'table from stdin?)')
Example #26
def test_seeded_shuffled_is_predictable(temp_file_1_name, seed):
    samples = np.arange(100000).reshape((1000, 100)).astype(np.float32)
    with io_open('ark:' + temp_file_1_name, 'fv', mode='w') as f:
        for idx, sample in enumerate(samples):
            f.write(str(idx), sample)
    data_1 = corpus.ShuffledData(
        ('ark:' + temp_file_1_name, 'fv'), batch_size=13, rng=seed)
    data_2 = corpus.ShuffledData(
        ('ark:' + temp_file_1_name, 'fv'), batch_size=13, rng=seed)
    for _ in range(2):
        for batch_1, batch_2 in zip(data_1, data_2):
            assert np.allclose(batch_1, batch_2)
Example #27
def test_wave_read_write_valid(temp_file_1_name):
    specifier = 'ark:{}'.format(temp_file_1_name)
    writer = io_open(specifier, 'wm', mode='w')
    n_waves = 10
    keys = [str(i) for i in range(n_waves)]
    n_samples = [np.random.randint(1, 100000) for _ in keys]
    n_channels = [np.random.randint(1, 3) for _ in keys]
    # always written as pcm 16
    bufs = [(np.random.random((y, x)) * 30000 - 15000).astype(np.int16)
            for x, y in zip(n_samples, n_channels)]
    for key, buf in zip(keys, bufs):
        writer.write(key, buf)
    writer.close()
    reader = io_open(specifier, 'wm', value_style='sbd')
    for vals, expected_buf in zip(reader, bufs):
        sample_rate, actual_buf, dur = vals
        assert int(sample_rate) == 16000
        assert isinstance(dur, float)
        assert np.allclose(actual_buf, expected_buf)
        n_waves -= 1
    assert not n_waves, "Incorrect number of reads!"
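value_style='sbd' makes the reader yield (sample rate, buffer, duration) triples. With the default value style the reader presumably yields just the sample buffer, e.g.:

reader = io_open(specifier, 'wm')  # default value style: buffer only
buf = next(iter(reader))

This default is an assumption here; check the io_open documentation for the authoritative behaviour.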
Example #28
def test_chained(temp_file_1_name):
    # wholly too limited a test
    obj_list = [
        ('iv', tuple(x for x in range(1000))),
        ('fm', [[1, 2.5], [1e-10, 4]]),
        ('dv', np.random.random(1)),
        ('dm', np.random.random((100, 20))),
        ('t', 'fiddlesticks'),
        ('t', 'munsters'),
    ]
    shuffle(obj_list)
    with io_open(temp_file_1_name, mode='w') as outp:
        for dtype, obj in obj_list:
            outp.write(obj, dtype)
    with io_open(temp_file_1_name) as inp:
        for dtype, obj in obj_list:
            read = inp.read(dtype)
            if dtype in ('fv', 'fm', 'dv', 'dm'):
                assert np.allclose(read, obj)
            else:
                assert read == obj
Example #29
def test_sequential_basic(temp_file_1_name):
    samples = np.arange(1000).reshape((10, 100)).astype(np.int32)
    with io_open('ark:' + temp_file_1_name, 'iv', mode='w') as f:
        for idx, sample in enumerate(samples):
            f.write(str(idx), sample)
    data = corpus.SequentialData(('ark,s:' + temp_file_1_name, 'iv'),
                                 batch_size=3)
    assert len(data) == 4
    batch_start = 0
    for act_batch in data:
        ex_batch = samples[batch_start:batch_start + 3]
        assert np.allclose(ex_batch, act_batch)
        batch_start += len(ex_batch)
    assert batch_start == len(samples)
Example #30
def test_shuffled_data_basic(temp_file_1_name):
    samples = np.arange(100000).reshape((10, 200, 50)).astype(
        np.float64 if KaldiDataType.BaseMatrix.is_double else np.float32)
    keys = tuple(str(i) for i in range(10))
    with io_open('ark:' + temp_file_1_name, 'bm', mode='w') as f:
        for key, sample in zip(keys, samples):
            f.write(key, sample)
    data = corpus.ShuffledData(
        'ark:' + temp_file_1_name, batch_size=3, rng=NonRandomState())
    assert isinstance(data.rng, NonRandomState)
    assert len(data) == int(np.ceil(len(keys) / 3))
    assert keys == tuple(data.key_list)
    for _ in range(2):
        ex_samp_idx = len(samples)
        for batch in data:
            for act_sample in batch:
                ex_samp_idx -= 1
                assert np.allclose(samples[ex_samp_idx], act_sample)
Example #31
def alt_compute_cmvn_stats(args=None):
    '''Python-based code for CMVN statistics computation

    Used for debugging
    '''
    logger = logging.getLogger(sys.argv[0])
    if not logger.handlers:
        logger.addHandler(logging.StreamHandler())
    register_logger_for_kaldi(sys.argv[0])
    options = _alt_compute_cmvn_stats_parse_args(args, logger)
    cmvn = CMVNCalculator()
    feat_table = io_open(options.feats_in, 'bm')
    num_utts = 0
    for feats in feat_table:
        cmvn.accumulate(feats)
        num_utts += 1
    logger.info('Accumulated stats for {} utterances'.format(num_utts))
    cmvn.save(options.cmvn_stats_out)
    logger.info('Wrote stats to {}'.format(options.cmvn_stats_out))
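CMVN itself is per-dimension standardization from accumulated sufficient statistics. Assuming CMVNCalculator tracks a frame count, a per-dimension sum, and a per-dimension sum of squares, applying it amounts to:

import numpy as np

feats = np.random.random((10, 4))    # 10 frames, 4 feature dims (hypothetical)
count = feats.shape[0]
total = feats.sum(0)                 # accumulated per-dimension sum
total_sq = (feats ** 2).sum(0)       # accumulated per-dimension sum of squares
mean = total / count
std = np.sqrt(total_sq / count - mean ** 2)
normalized = (feats - mean) / std    # zero mean, unit variance per dimension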
Example #32
def test_invalid_data_type(temp_file_1_name):
    specifier = 'ark:{}'.format(temp_file_1_name)
    with pytest.raises(ValueError):
        io_open(specifier, 'foo', mode='w')
Example #33
def test_open_string_or_data_type(temp_file_1_name):
    specifier = 'ark:{}'.format(temp_file_1_name)
    io_open(specifier, 'bm', mode='w')
    io_open(specifier, table_streams.KaldiDataType.BaseMatrix, mode='w')
    io_open(specifier, 'bm', mode='r')
    io_open(specifier, table_streams.KaldiDataType.BaseMatrix, mode='r')
    io_open(specifier, 'bm', mode='r+')
    io_open(specifier, table_streams.KaldiDataType.BaseMatrix, mode='r+')
Example #34
def test_no_exception_on_double_close(temp_file_1_name):
    specifier = 'ark:{}'.format(temp_file_1_name)
    kaldi_io = io_open(specifier, 'bm', mode='w')
    kaldi_io.close()
    kaldi_io.close()