def test_read_sequential(temp_file_1_name):
    values = (
        [[1, 2] * 10] * 10,
        np.eye(1000, dtype=np.float32),
        [[]],
        np.outer(
            np.arange(1000, dtype=np.float32),
            np.arange(1000, dtype=np.float32)),
    )
    writer = io_open('ark:{}'.format(temp_file_1_name), 'fm', mode='w')
    for key, value in enumerate(values):
        writer.write(str(key), value)
    writer.close()
    count = 0
    reader = io_open('ark:{}'.format(temp_file_1_name), 'fm')
    for act_value, reader_value in zip(values, iter(reader)):
        assert np.allclose(act_value, reader_value)
        count += 1
    assert count == len(values)
    reader.close()
    # check that the keys are all savvy
    reader = io_open('ark:{}'.format(temp_file_1_name), 'fm')
    for idx, tup in enumerate(reader.items()):
        key, value = tup
        assert str(idx) == key

def __init__(self, table, *additional_tables, **kwargs):
    key_list = kwargs.pop('key_list', None)
    rng = kwargs.pop('rng', None)
    super(ShuffledData, self).__init__(table, *additional_tables, **kwargs)
    try:
        key_list = tuple(key_list)
    except TypeError:
        pass
    if key_list is None:
        _, rx_fn, rx_type, _ = parse_kaldi_input_path(
            self.table_specifiers[0][0])
        if rx_type == RxfilenameType.InvalidInput:
            raise IOError('Invalid rspecifier {}'.format(rx_fn))
        elif rx_type == RxfilenameType.StandardInput:
            raise IOError(
                'Cannot infer key list from stdin (cannot reopen)')
        with io_open(*self.table_specifiers[0][:2]) as reader:
            self.key_list = tuple(reader.keys())
    else:
        self.key_list = tuple(key_list)
    if self.ignore_missing:
        self._num_samples = None
    else:
        self._num_samples = len(self.key_list)
    if isinstance(rng, np.random.RandomState):
        self.rng = rng
    else:
        self.rng = np.random.RandomState(rng)
    self.table_handles = tuple(
        io_open(rspecifier, kdtype, mode='r+', **o_kwargs)
        for rspecifier, kdtype, o_kwargs in self.table_specifiers)

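# A hypothetical usage sketch (not part of the original code) tying the
# constructor above to its reader API: the archive contents, batch size,
# and seed are made up for illustration. With no key_list given, the
# constructor reopens the first table to collect its keys, shuffles them
# each epoch, and pulls values back through the random-access handles.
def example_shuffled_usage(temp_path):
    with io_open('ark:' + temp_path, 'fv', mode='w') as writer:
        for idx in range(10):
            writer.write(str(idx), np.arange(4, dtype=np.float32) + idx)
    data = corpus.ShuffledData(
        ('ark:' + temp_path, 'fv'), batch_size=5, rng=1234)
    for epoch in range(2):
        for batch in data:  # two batches of five vectors per epoch
            assert len(batch) == 5
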
def test_shuffled_ignore_missing(temp_file_1_name, temp_file_2_name):
    with io_open('ark:' + temp_file_1_name, 't', mode='w') as token_f:
        token_f.write('1', 'cool')
        token_f.write('3', 'bean')
        token_f.write('4', 'casserole')
    keys = [str(i) for i in range(6)]
    data = corpus.ShuffledData(
        ('ark:' + temp_file_1_name, 't'), key_list=keys,
        ignore_missing=True, rng=NonRandomState())
    assert len(data) == 3
    act_samples = list(data)
    assert all(ex == act for ex, act in zip(
        ['casserole', 'bean', 'cool'], act_samples))
    with io_open('ark:' + temp_file_2_name, 'B', mode='w') as bool_f:
        bool_f.write('0', True)
        bool_f.write('1', False)
        bool_f.write('2', True)
        bool_f.write('4', False)
    data = corpus.ShuffledData(
        ('ark:' + temp_file_1_name, 't'),
        ('ark:' + temp_file_2_name, 'B'),
        key_list=keys, ignore_missing=True, rng=NonRandomState())
    assert len(data) == 2
    act_tok_samples, act_bool_samples = list(zip(*iter(data)))
    assert all(ex == act for ex, act in zip(
        ['casserole', 'cool'], act_tok_samples))
    assert all(not act for act in act_bool_samples)

def test_sequential_data_tups(temp_file_1_name, temp_file_2_name):
    feats = np.random.random((4, 10, 100)).astype(np.float64)
    labels = [
        ('foo',),
        ('bar', 'baz',),
        ('bingo',),
        ('bango', 'bongo', 'eugene'),
    ]
    with io_open('ark:' + temp_file_1_name, 'dm', mode='w') as feats_f, \
            io_open('ark:' + temp_file_2_name, 'tv', mode='w') as labels_f:
        for idx, (feat, label) in enumerate(zip(feats, labels)):
            feats_f.write(str(idx), feat)
            labels_f.write(str(idx), label)
    data = corpus.SequentialData(
        ('ark,s:' + temp_file_1_name, 'dm'),
        ('ark,s:' + temp_file_2_name, 'tv'),
        axis_lengths=0)
    batch_start = 0
    for ex_feat, ex_label, (act_feat, act_label, act_len) in zip(
            feats, labels, data):
        assert np.allclose(ex_feat, act_feat)
        assert ex_label == act_label
        assert act_len == 10
        batch_start += 1
    assert batch_start == len(feats)
    assert len(data) == batch_start

def test_read_write_valid(temp_file_1_name, ktype, value, binary):
    with io_open(temp_file_1_name, mode='w', header=False) as outp:
        outp.write(value, ktype, write_binary=binary)
    with io_open(temp_file_1_name, header=False) as inp:
        read_value = inp.read(ktype, read_binary=binary)
    if ktype in ('bv', 'bm', 'fv', 'fm', 'dv', 'dm', 'b', 'd', 'bpv'):
        assert np.allclose(read_value, value)
    else:
        assert read_value == value

def test_read_write_pipe_posix(temp_file_1_name):
    value = np.ones((1000, 10000), dtype=np.float32)
    writer = io_open(
        'ark:| gzip -c > {}'.format(temp_file_1_name), 'fm', mode='w')
    writer.write('bar', value)
    writer.close()
    reader = io_open(
        'ark:gunzip -c {}|'.format(temp_file_1_name), 'fm', mode='r+')
    assert np.allclose(reader['bar'], value)

def test_write_read_numpy_versions(temp_file_1_name, ktype, dtype, value):
    npy_value = np.array(value).astype(dtype)
    with io_open('ark:' + temp_file_1_name, ktype, mode='w') as writer:
        writer.write('key', npy_value)
    with io_open('ark:' + temp_file_1_name, ktype) as reader:
        act_value = next(reader)
    if ktype in ('b', 'bpv'):
        assert np.allclose(value, act_value)
    else:
        assert value == act_value

def test_write_read_numpy_versions(temp_file_1_name, ktype, dtype, value):
    npy_value = np.array(value).astype(dtype)
    with io_open(temp_file_1_name, mode='w', header=False) as outp:
        outp.write(npy_value, ktype)
    with io_open(temp_file_1_name, header=False) as inp:
        act_value = inp.read(ktype)
    if ktype in ('b', 'bpv'):
        assert np.allclose(value, act_value)
    else:
        assert value == act_value

def test_shuffled_data_tups(temp_file_1_name, temp_file_2_name):
    feats = [
        [[1, 2, 3, 4], [5, 6, 7, 8]],
        [[9, 10], [11, 12]],
        [[13, 14, 15], [16, 17, 18]],
        [[19], [20]],
    ]
    labels = [
        np.array([[1, 2], [3, 4]], dtype=np.float64),
        np.array([[5, 6, 7, 8], [9, 10, 11, 12]], dtype=np.float64),
        np.array([[13], [14]], dtype=np.float64),
        np.array([[15, 16, 17], [18, 19, 20]], dtype=np.float64),
    ]
    keys = tuple(str(i) for i in range(4))
    with io_open('ark:' + temp_file_1_name, 'ivv', mode='w') as feat_f, \
            io_open('ark:' + temp_file_2_name, 'dm', mode='w') as lab_f:
        for key, feat, label in zip(keys, feats, labels):
            feat_f.write(key, feat)
            lab_f.write(key, label)
    data = corpus.ShuffledData(
        ('ark:' + temp_file_1_name, 'ivv'),
        ('ark:' + temp_file_2_name, 'dm'),
        batch_size=2, batch_pad_mode='constant', key_list=keys,
        axis_lengths=1, rng=NonRandomState(),
        batch_cast_to_array=(np.int32, None, None))
    for _ in range(2):
        ex_samp_idx = len(feats)
        for feat_batch, _, len_batch in data:
            for act_feat, act_len in zip(feat_batch, len_batch):
                ex_samp_idx -= 1
                ex_feat = np.array(feats[ex_samp_idx], copy=False)
                ex_len = ex_feat.shape[1]
                assert ex_len == act_len
                assert np.allclose(ex_feat, act_feat[:, :ex_len])
                assert np.allclose(act_feat[:, ex_len:], 0)
    data = corpus.ShuffledData(
        ('ark:' + temp_file_1_name, 'ivv'),
        ('ark:' + temp_file_2_name, 'dm'),
        batch_size=3, batch_pad_mode='constant', key_list=keys,
        axis_lengths=((1, 1), (0, 1)), rng=NonRandomState(),
        batch_cast_to_array=(np.int32, None, None, None))
    for _ in range(2):
        ex_samp_idx = len(feats)
        for feat_batch, label_batch, lablen_batch, featlen_batch in data:
            for act_feat, act_label, act_lablen, act_featlen in zip(
                    feat_batch, label_batch, lablen_batch, featlen_batch):
                ex_samp_idx -= 1
                ex_feat = np.array(feats[ex_samp_idx], copy=False)
                ex_label = labels[ex_samp_idx]
                ex_featlen = ex_feat.shape[1]
                ex_lablen = ex_label.shape[1]
                assert ex_featlen == act_featlen
                assert ex_lablen == act_lablen
                assert np.allclose(ex_feat, act_feat[:, :ex_featlen])
                assert np.allclose(act_feat[:, ex_featlen:], 0)
                assert np.allclose(ex_label, act_label[:, :ex_lablen])
                assert np.allclose(act_label[:, ex_lablen:], 0)

def test_filehandle_open(temp_file_1_name):
    specifier = 'ark:{}'.format(temp_file_1_name)
    kaldi_io = io_open(specifier, 'bm', mode='w')
    assert isinstance(kaldi_io, table_streams.KaldiTable)
    assert isinstance(kaldi_io, table_streams.KaldiWriter)
    kaldi_io = io_open(specifier, 'bm')
    assert isinstance(kaldi_io, table_streams.KaldiSequentialReader)
    kaldi_io = io_open(specifier, 'bm', mode='r')
    assert isinstance(kaldi_io, table_streams.KaldiSequentialReader)
    kaldi_io = io_open(specifier, 'bm', mode='r+')
    assert isinstance(kaldi_io, table_streams.KaldiRandomAccessReader)

def test_read_random(temp_file_1_name):
    writer = io_open('ark:{}'.format(temp_file_1_name), 'dv', mode='w')
    writer.write('able', [])
    writer.write('was', [2])
    writer.write('I', [3, 3])
    writer.write('ere', [4, 4])
    writer.close()
    reader = io_open('ark,o:{}'.format(temp_file_1_name), 'dv', mode='r+')
    assert np.allclose(reader['I'], [3, 3])
    assert np.allclose(reader['able'], [])
    assert np.allclose(reader['was'], [2])

def test_invalid_scp(temp_file_1_name):
    # make sure invalid scp files don't segfault
    with open(temp_file_1_name, mode='wb') as writer:
        writer.write(np.random.bytes(1000))
    try:
        with io_open('scp:' + temp_file_1_name) as reader:
            next(reader)
    except Exception:
        pass
    with open(temp_file_1_name, mode='wb') as writer:
        writer.write(b'foo ' + np.random.bytes(1000))
    try:
        with io_open('scp:' + temp_file_1_name) as reader:
            next(reader)
    except Exception:
        pass

def test_invalid_tv_does_not_segfault(temp_file_1_name):
    # weird bug I found
    tv = 'foo bar'
    writer = io_open('ark:' + temp_file_1_name, 'tv', mode='w')
    with pytest.raises(Exception):
        writer.write('foo', tv)
    with pytest.raises(Exception):
        writer.write('foo', np.array(tv))

def test_sequential_ignore_missing(temp_file_1_name, temp_file_2_name):
    with io_open('ark:' + temp_file_1_name, 'ipv', mode='w') as pair_f:
        pair_f.write('10', [(10, 9), (8, 7), (6, 5)])
        pair_f.write('11', [(11, 10), (9, 8)])
        pair_f.write('12', [(12, 11)])
        pair_f.write('14', [])
    with io_open('ark:' + temp_file_2_name, 'd', mode='w') as double_f:
        double_f.write('09', 3.14)
        double_f.write('10', .159)
        double_f.write('12', .265)
        double_f.write('13', .357)
    data = corpus.SequentialData(
        ('ark,s:' + temp_file_1_name, 'ipv'),
        ('ark,s:' + temp_file_2_name, 'd'),
        ignore_missing=True)
    act_pair_samples, act_double_samples = list(zip(*iter(data)))
    assert act_pair_samples == (((10, 9), (8, 7), (6, 5)), ((12, 11),))
    assert np.allclose(act_double_samples, (.159, .265))

def test_write_invalid(temp_file_1_name, dtype, value, is_text):
    if is_text:
        specifier = 'ark,t:{}'.format(temp_file_1_name)
    else:
        specifier = 'ark:{}'.format(temp_file_1_name)
    writer = io_open(specifier, dtype, mode='w')
    with pytest.raises(Exception):
        writer.write('a', value)

def test_cache(temp_file_1_name):
    with io_open('ark:' + temp_file_1_name, 'B', mode='w') as writer:
        writer.write('a', True)
        writer.write('b', False)
    with io_open('ark:' + temp_file_1_name, 'B', mode='r+', cache=True) as r:
        assert r.cache_dict == dict()
        assert 'a' not in r.cache_dict
        assert 'a' in r
        assert r['a']
        assert r.cache_dict == {'a': True}
        assert 'a' in r.cache_dict
        assert 'b' not in r.cache_dict
        assert 'b' in r
        assert not r['b']
        assert r.cache_dict == {'a': True, 'b': False}
        r.cache_dict['b'] = True
        assert r['b']

def decode_cnn_ctc(args=None):
    '''Decode CNN w/ CTC using kaldi data tables'''
    logger = logging.getLogger(sys.argv[0])
    if not logger.handlers:
        logger.addHandler(logging.StreamHandler())
    register_logger_for_kaldi(sys.argv[0])
    options = _decode_cnn_ctc_parse_args(args, logger)
    logger.log(9, 'Parsed options')
    id2label_map = dict()
    with open(options.label_to_id_map_path) as file_obj:
        for line in file_obj:
            label, idee = line.strip().split()
            idee = int(idee)
            if idee < 0:
                logger.error('All label ids must be nonnegative')
                return 1
            id2label_map[idee] = label
    if (len(id2label_map) + 1) != options.num_labels:
        logger.error('Expected {} labels in id to label map, got {}'.format(
            options.num_labels - 1, len(id2label_map)))
        return 1
    for idee in range(options.num_labels - 1):
        if idee not in id2label_map:
            logger.error('label to id map missing id: {}'.format(idee))
            return 1
    logger.log(9, 'Loaded label to id map')
    model_config = ModelConfig(**vars(options))
    decode_config = DecodeConfig(**vars(options))
    decode_data = DecodeData(
        options.data_rspecifier,
        delta_order=model_config.delta_order,
        cmvn_rxfilename=model_config.cmvn_rxfilename,
        batch_size=decode_config.batch_size,
    )
    total_batches = len(decode_data)
    labels_out = io_open(options.output_wspecifier, 'tv', mode='w')
    logger.log(9, 'Set up eval data and opened label output file')
    with redirect_stdout_to_stderr():
        logger.log(9, 'Creating model')
        from pydrobert.mol.model import ConvCTC
        model = ConvCTC(model_config)
        logger.log(9, 'Beginning decoding')
        batches_decoded = 0
        logger.log(9, '000/{:03d} batches decoded'.format(total_batches))
        for label_batch in model.decode_generator(decode_config, decode_data):
            if decode_data.batch_size:
                for key, label_ids in label_batch:
                    labels_out.write(
                        key, tuple(id2label_map[idee] for idee in label_ids))
            else:
                labels_out.write(label_batch[0], tuple(label_batch[1]))
            batches_decoded += 1
            if batches_decoded % max(1, total_batches // 10) == 0:
                logger.log(9, '{:03d}/{:03d} batches decoded'.format(
                    batches_decoded, total_batches))
    logger.info('Done decoding')

def test_incorrect_open_read(
        temp_file_1_name, temp_file_2_name, ktype, value, is_text, bg):
    if ktype == 'wm' and is_text:
        pytest.skip("WaveMatrix can only be written as binary")
    opts = ['', 't'] if is_text else ['']
    specifier_1 = 'ark' + ','.join(opts) + ':' + temp_file_1_name
    specifier_2 = 'ark' + ','.join(opts) + ':' + temp_file_2_name
    with io_open(specifier_1, ktype, mode='w') as writer_1, \
            io_open(specifier_2, ktype, mode='w') as writer_2:
        writer_1.write('0', value)
        writer_2.write('0', value)
    if bg:
        opts += ['bg']
        specifier_1 = 'ark' + ','.join(opts) + ':' + temp_file_1_name
        specifier_2 = 'ark' + ','.join(opts) + ':' + temp_file_2_name
    for bad_ktype in KaldiDataType:
        try:
            with io_open(specifier_1, bad_ktype) as reader:
                next(reader)
        except Exception:
            # sometimes it'll work, and the expected output will be
            # correct (in the case of basic types). We don't care. All
            # we care about here is that we don't segfault
            pass
    # now we add some garbage data to the end of the file and try to
    # iterate through. Chances are this will end in failure (hopefully
    # not a segfault)
    with open(temp_file_1_name, mode='ab') as writer:
        writer.write(np.random.bytes(1000))
    try:
        with io_open(specifier_1, ktype) as reader:
            list(reader)
    except Exception:
        pass
    # do the same, but only corrupt *after* the key
    with open(temp_file_2_name, mode='ab') as writer:
        writer.write(b'1 ' + np.random.bytes(1000))
    try:
        with io_open(specifier_2, ktype) as reader:
            list(reader)
    except Exception:
        pass

def test_read_write(temp_file_1_name, dtype, value, is_text, bg):
    opts = ['', 't'] if is_text else ['']
    specifier = 'ark' + ','.join(opts) + ':' + temp_file_1_name
    writer = io_open(specifier, dtype, mode='w')
    writer.write('a', value)
    writer.close()
    if bg:
        opts += ['bg']
        specifier = 'ark' + ','.join(opts) + ':' + temp_file_1_name
    reader = io_open(specifier, dtype)
    once = True
    for read_value in iter(reader):
        assert once, "Multiple values"
        try:
            assert np.allclose(read_value, value)
        except TypeError:
            assert read_value == value
        once = False
    reader.close()

def alt_add_deltas(args=None):
    '''Python-based code for adding deltas

    Used for debugging
    '''
    logger = logging.getLogger(sys.argv[0])
    if not logger.handlers:
        logger.addHandler(logging.StreamHandler())
    register_logger_for_kaldi(sys.argv[0])
    options = _alt_add_deltas_parse_args(args, logger)
    feats_in = io_open(options.feats_in, 'bm')
    feats_out = io_open(options.feats_out, 'bm', mode='w')
    num_utts = 0
    for utt_id, feats in feats_in.items():
        feats = calculate_deltas(feats, options.delta_order)
        feats_out.write(utt_id, feats)
        num_utts += 1
    logger.info('Added {} deltas to {} utterances'.format(
        options.delta_order, num_utts))

def alt_apply_cmvn(args=None):
    '''Python-based code for CMVN application

    Used for debugging
    '''
    logger = logging.getLogger(sys.argv[0])
    if not logger.handlers:
        logger.addHandler(logging.StreamHandler())
    register_logger_for_kaldi(sys.argv[0])
    options = _alt_apply_cmvn_parse_args(args, logger)
    cmvn = CMVNCalculator(options.cmvn_stats_in)
    feats_in = io_open(options.feats_in, 'bm')
    feats_out = io_open(options.feats_out, 'bm', mode='w')
    num_utts = 0
    for utt_id, feats in feats_in.items():
        feats = cmvn.apply(feats, in_place=True)
        feats_out.write(utt_id, feats)
        num_utts += 1
    logger.info('Applied CMVN to {} utterances'.format(num_utts))

def test_write_int32_correct_size(temp_file_1_name):
    with io_open('ark:' + temp_file_1_name, 'i', mode='w') as writer:
        writer.write('9', 182)
    # size should be 9:
    # 2 bytes for '9 '
    # 2 bytes for binary marker \0B
    # 1 byte for size of type in bytes (4)
    # 4 bytes for actual int
    with open(temp_file_1_name, 'rb') as file_obj:
        buf = file_obj.read()
    assert len(buf) == 9

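# A minimal sketch (an assumption for illustration, not library code) of
# unpacking the 9-byte record described above by hand. Kaldi writes the
# integer in native byte order, so '<i' assumes a little-endian machine.
def example_unpack_int32_record(path):
    import struct
    with open(path, 'rb') as file_obj:
        buf = file_obj.read()
    key, rest = buf.split(b' ', 1)  # b'9', then the binary payload
    assert rest[:2] == b'\x00B'     # binary marker
    assert rest[2:3] == b'\x04'     # size of the type in bytes
    return key.decode(), struct.unpack('<i', rest[3:7])[0]
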
def test_write_script_and_archive(temp_file_1_name, temp_file_2_name):
    values = {
        'foo': np.ones((21, 32), dtype=np.float64),
        'bar': np.zeros((10, 1000), dtype=np.float64),
        'baz': -1e10 * np.eye(20, dtype=np.float64),
    }
    keys = list(values)
    writer = io_open(
        'ark,scp:{},{}'.format(temp_file_1_name, temp_file_2_name),
        'dm', mode='w')
    # to make a missing entry, append it to the file's end with a subproc
    for key in keys:
        writer.write(key, values[key])
    writer.close()
    keys.reverse()
    reader = io_open('scp:{}'.format(temp_file_2_name), 'dm', mode='r+')
    for key in keys:
        assert np.allclose(reader[key], values[key]), key
    assert np.allclose(reader['bar'], values['bar']), "Failed doublecheck"

def _ignore_epoch(self):
    '''Epoch of samples w/ ignore_missing'''
    iters = tuple(
        io_open(spec[0], spec[1], **spec[2]).items()
        for spec in self.table_specifiers)
    num_samples = 0
    num_tabs = len(iters)
    try:
        while True:
            samp_tup = [None] * num_tabs
            high_key = None
            tab_idx = 0
            while tab_idx < num_tabs:
                if samp_tup[tab_idx] is None:
                    key, value = next(iters[tab_idx])
                    if high_key is None:
                        high_key = key
                    elif high_key < key:
                        # key is further along than keys in samp_tup.
                        # Discard those and keep this
                        samp_tup = [None] * num_tabs
                        samp_tup[tab_idx] = value
                        high_key = key
                        tab_idx = 0
                        continue
                    elif high_key > key:
                        # key is behind high_key. keep pushing this
                        # iterator forward
                        continue
                    samp_tup[tab_idx] = value
                tab_idx += 1
            num_samples += 1
            for sub_batch_idx, axis_idx in self.axis_lengths:
                samp_tup.append(np.array(
                    samp_tup[sub_batch_idx], copy=False).shape[axis_idx])
            if self.add_key:
                samp_tup.insert(0, key)
            if self.num_sub != 1:
                yield tuple(samp_tup)
            else:
                yield samp_tup[0]
    except StopIteration:
        # don't care if one iterator ends first - rest will be missing
        # that iterator's value
        pass
    if self._num_samples is None:
        self._num_samples = num_samples
    elif self._num_samples != num_samples:
        raise IOError(
            'Different number of samples from last time! (is a '
            'table from stdin?)')

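# A standalone illustration (not library code) of the alignment strategy
# _ignore_epoch uses above: walk key-sorted (key, value) iterators in
# lockstep, discarding keys that are not present in every table.
def example_sorted_key_intersection():
    a = iter([('1', 'x'), ('3', 'y'), ('4', 'z')])
    b = iter([('1', 10), ('2', 20), ('4', 40)])
    matched = []
    try:
        key_a, val_a = next(a)
        key_b, val_b = next(b)
        while True:
            if key_a == key_b:
                matched.append((key_a, val_a, val_b))
                key_a, val_a = next(a)
                key_b, val_b = next(b)
            elif key_a < key_b:
                key_a, val_a = next(a)  # 'a' is behind; push it forward
            else:
                key_b, val_b = next(b)  # 'b' is behind; push it forward
    except StopIteration:
        pass  # one table ran out; remaining keys cannot be in both
    assert matched == [('1', 'x', 10), ('4', 'z', 40)]
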
def _no_ignore_epoch(self):
    '''Epoch of samples w/o ignore_missing'''
    iters = tuple(
        io_open(spec[0], spec[1], **spec[2]).items()
        for spec in self.table_specifiers)
    num_samples = 0
    for kv_pairs in zip(*iters):
        samp_tup = []
        past_key = None
        for tab_idx, (key, sample) in enumerate(kv_pairs):
            if past_key is None:
                past_key = key
            elif past_key != key:
                # assume sorted, base on which is first
                if past_key < key:
                    miss_rspec = self.table_specifiers[tab_idx][0]
                    miss_key = past_key
                else:
                    miss_rspec = self.table_specifiers[tab_idx - 1][0]
                    miss_key = key
                raise IOError(
                    'Table {} missing key {} (or tables are sorted '
                    'differently)'.format(miss_rspec, miss_key))
            samp_tup.append(sample)
        num_samples += 1
        for sub_batch_idx, axis_idx in self.axis_lengths:
            samp_tup.append(np.array(
                samp_tup[sub_batch_idx], copy=False).shape[axis_idx])
        if self.add_key:
            samp_tup.insert(0, key)
        if self.num_sub != 1:
            yield tuple(samp_tup)
        else:
            yield samp_tup[0]
    # make sure all iterators ended at the same time
    for tab_idx, it in enumerate(iters):
        try:
            miss_key, _ = next(it)
            if tab_idx:
                miss_rspec = self.table_specifiers[0][0]
            else:
                miss_rspec = self.table_specifiers[1][0]
            raise IOError(
                'Table {} missing key {}'.format(miss_rspec, miss_key))
        except StopIteration:
            pass
    if self._num_samples is None:
        self._num_samples = num_samples
    elif self._num_samples != num_samples:
        raise IOError(
            'Different number of samples from last time! (is a '
            'table from stdin?)')

def test_seeded_shuffled_is_predictable(temp_file_1_name, seed):
    samples = np.arange(100000).reshape((1000, 100)).astype(np.float32)
    with io_open('ark:' + temp_file_1_name, 'fv', mode='w') as f:
        for idx, sample in enumerate(samples):
            f.write(str(idx), sample)
    data_1 = corpus.ShuffledData(
        ('ark:' + temp_file_1_name, 'fv'), batch_size=13, rng=seed)
    data_2 = corpus.ShuffledData(
        ('ark:' + temp_file_1_name, 'fv'), batch_size=13, rng=seed)
    for _ in range(2):
        for batch_1, batch_2 in zip(data_1, data_2):
            assert np.allclose(batch_1, batch_2)

def test_wave_read_write_valid(temp_file_1_name):
    specifier = 'ark:{}'.format(temp_file_1_name)
    writer = io_open(specifier, 'wm', mode='w')
    n_waves = 10
    keys = [str(i) for i in range(n_waves)]
    n_samples = [np.random.randint(1, 100000) for _ in keys]
    n_channels = [np.random.randint(1, 3) for _ in keys]
    # always written as pcm 16
    bufs = [
        (np.random.random((y, x)) * 30000 - 15000).astype(np.int16)
        for x, y in zip(n_samples, n_channels)
    ]
    for key, buf in zip(keys, bufs):
        writer.write(key, buf)
    writer.close()
    reader = io_open(specifier, 'wm', value_style='sbd')
    for vals, expected_buf in zip(reader, bufs):
        sample_rate, actual_buf, dur = vals
        assert int(sample_rate) == 16000
        assert isinstance(dur, float)
        assert np.allclose(actual_buf, expected_buf)
        n_waves -= 1
    assert not n_waves, "Incorrect number of reads!"

def test_chained(temp_file_1_name):
    # wholly too limited a test
    obj_list = [
        ('iv', tuple(x for x in range(1000))),
        ('fm', [[1, 2.5], [1e-10, 4]]),
        ('dv', np.random.random(1)),
        ('dm', np.random.random((100, 20))),
        ('t', 'fiddlesticks'),
        ('t', 'munsters'),
    ]
    shuffle(obj_list)
    with io_open(temp_file_1_name, mode='w') as outp:
        for dtype, obj in obj_list:
            outp.write(obj, dtype)
    with io_open(temp_file_1_name) as inp:
        for dtype, obj in obj_list:
            read = inp.read(dtype)
            if dtype in ('fv', 'fm', 'dv', 'dm'):
                assert np.allclose(read, obj)
            else:
                assert read == obj

def test_sequential_basic(temp_file_1_name):
    samples = np.arange(1000).reshape((10, 100)).astype(np.int32)
    with io_open('ark:' + temp_file_1_name, 'iv', mode='w') as f:
        for idx, sample in enumerate(samples):
            f.write(str(idx), sample)
    data = corpus.SequentialData(
        ('ark,s:' + temp_file_1_name, 'iv'), batch_size=3)
    assert len(data) == 4
    batch_start = 0
    for act_batch in data:
        ex_batch = samples[batch_start:batch_start + 3]
        assert np.allclose(ex_batch, act_batch)
        batch_start += len(ex_batch)
    assert batch_start == len(samples)

def test_shuffled_data_basic(temp_file_1_name):
    samples = np.arange(100000).reshape((10, 200, 50)).astype(
        np.float64 if KaldiDataType.BaseMatrix.is_double else np.float32)
    keys = tuple(str(i) for i in range(10))
    with io_open('ark:' + temp_file_1_name, 'bm', mode='w') as f:
        for key, sample in zip(keys, samples):
            f.write(key, sample)
    data = corpus.ShuffledData(
        'ark:' + temp_file_1_name, batch_size=3, rng=NonRandomState())
    assert isinstance(data.rng, NonRandomState)
    assert len(data) == int(np.ceil(len(keys) / 3))
    assert keys == tuple(data.key_list)
    for _ in range(2):
        ex_samp_idx = len(samples)
        for batch in data:
            for act_sample in batch:
                ex_samp_idx -= 1
                assert np.allclose(samples[ex_samp_idx], act_sample)

def alt_compute_cmvn_stats(args=None):
    '''Python-based code for CMVN statistics computation

    Used for debugging
    '''
    logger = logging.getLogger(sys.argv[0])
    if not logger.handlers:
        logger.addHandler(logging.StreamHandler())
    register_logger_for_kaldi(sys.argv[0])
    options = _alt_compute_cmvn_stats_parse_args(args, logger)
    cmvn = CMVNCalculator()
    feat_table = io_open(options.feats_in, 'bm')
    num_utts = 0
    for feats in feat_table:
        cmvn.accumulate(feats)
        num_utts += 1
    logger.info('Accumulated stats for {} utterances'.format(num_utts))
    cmvn.save(options.cmvn_stats_out)
    logger.info('Wrote stats to {}'.format(options.cmvn_stats_out))

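# A numpy sketch of the arithmetic behind the accumulation loop above.
# CMVNCalculator's internals are an assumption here: this only assumes
# CMVN reduces to per-dimension sums, sums of squares, and a frame count,
# from which each feature dimension is standardized.
def example_cmvn_by_hand():
    utterances = [np.random.random((50, 13)), np.random.random((30, 13))]
    count = sum(feats.shape[0] for feats in utterances)
    sums = sum(feats.sum(axis=0) for feats in utterances)
    sum_sqs = sum((feats ** 2).sum(axis=0) for feats in utterances)
    means = sums / count
    stds = np.sqrt(sum_sqs / count - means ** 2)
    normalized = (np.concatenate(utterances) - means) / stds
    assert np.allclose(normalized.mean(axis=0), 0)
    assert np.allclose(normalized.std(axis=0), 1)
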
def test_invalid_data_type(temp_file_1_name):
    specifier = 'ark:{}'.format(temp_file_1_name)
    with pytest.raises(ValueError):
        io_open(specifier, 'foo', mode='w')

def test_open_string_or_data_type(temp_file_1_name):
    specifier = 'ark:{}'.format(temp_file_1_name)
    io_open(specifier, 'bm', mode='w')
    io_open(specifier, table_streams.KaldiDataType.BaseMatrix, mode='w')
    io_open(specifier, 'bm', mode='r')
    io_open(specifier, table_streams.KaldiDataType.BaseMatrix, mode='r')
    io_open(specifier, 'bm', mode='r+')
    io_open(specifier, table_streams.KaldiDataType.BaseMatrix, mode='r+')

def test_no_exception_on_double_close(temp_file_1_name):
    specifier = 'ark:{}'.format(temp_file_1_name)
    kaldi_io = io_open(specifier, 'bm', mode='w')
    kaldi_io.close()
    kaldi_io.close()