def test_compute_error_rate(
        temp_file_1_name, temp_file_2_name, temp_file_3_name):
    '''Check the report written by compute_error_rate

    Reference transcripts go to the first temporary file, hypothesis
    transcripts to the second, and the command's output to the third.
    '''
    ref_rspecifier = 'ark:' + temp_file_1_name
    hyp_rspecifier = 'ark:' + temp_file_2_name
    ref_entries = (
        ('A', ('lorem', 'ipsum', 'dolor', 'sit', 'amet')),
        ('B', ('consectetur', 'adipiscing', 'elit')),
    )
    hyp_entries = (
        ('A', ('laura', 'ipsum', 'dollars', 'sit', 'down', 'amet')),
        ('B', ('consecutive', 'elite')),
    )
    with kaldi_io.open(ref_rspecifier, 'tv', 'w') as ref_writer:
        for utt_id, tokens in ref_entries:
            ref_writer.write(utt_id, tokens)
    with kaldi_io.open(hyp_rspecifier, 'tv', 'w') as hyp_writer:
        for utt_id, tokens in hyp_entries:
            hyp_writer.write(utt_id, tokens)

    def run_and_read(extra_args):
        '''Run the command with a fresh argument list; return its output'''
        argv = [ref_rspecifier, hyp_rspecifier, temp_file_3_name]
        argv.extend(extra_args)
        ret_code = command_line.compute_error_rate(argv)
        assert ret_code == 0
        with open(temp_file_3_name) as out_file_reader:
            return out_file_reader.read()

    # A : lorem -> laura, dolor -> dollars, -> down
    # B : consectetur -> consecutive, adipiscing -> , elit -> elite
    # with insertions = 6 / 8
    # without insertions = 5 / 8
    out_text = run_and_read([])
    assert 'Error rate: 75.00%' in out_text
    out_text = run_and_read([
        '--include-inserts-in-cost=false',
        '--report-accuracy=true',
    ])
    assert 'Accuracy: {:.2f}%'.format((1 - 5 / 8) * 100) in out_text
def test_compute_error_rate(temp_file_1_name, temp_file_2_name,
                            temp_file_3_name):
    '''compute_error_rate should report the expected rate and accuracy

    The three arguments are paths of temporary files: reference table,
    hypothesis table and report output, in that order.
    '''
    ref_table = 'ark:' + temp_file_1_name
    hyp_table = 'ark:' + temp_file_2_name
    report_path = temp_file_3_name

    ref_writer = kaldi_io.open(ref_table, 'tv', 'w')
    with ref_writer:
        ref_writer.write('A', ('lorem', 'ipsum', 'dolor', 'sit', 'amet'))
        ref_writer.write('B', ('consectetur', 'adipiscing', 'elit'))
    hyp_writer = kaldi_io.open(hyp_table, 'tv', 'w')
    with hyp_writer:
        hyp_writer.write(
            'A', ('laura', 'ipsum', 'dollars', 'sit', 'down', 'amet'))
        hyp_writer.write('B', ('consecutive', 'elite'))

    # Expected edits
    # A : lorem -> laura, dolor -> dollars, -> down
    # B : consectetur -> consecutive, adipiscing -> , elit -> elite
    # with insertions = 6 / 8
    # without insertions = 5 / 8
    expectations = (
        ([], 'Error rate: 75.00%'),
        (
            ['--include-inserts-in-cost=false', '--report-accuracy=true'],
            'Accuracy: {:.2f}%'.format((1 - 5 / 8) * 100),
        ),
    )
    for flags, expected_text in expectations:
        # a new argument list is built for every invocation
        ret_code = command_line.compute_error_rate(
            [ref_table, hyp_table, report_path] + flags)
        assert ret_code == 0
        with open(report_path) as out_file_reader:
            out_text = out_file_reader.read()
        assert expected_text in out_text
def elicit_warning(filename, threaded=False):
    '''Helper to elicit a natural warning from kaldi

    A text archive holding one entry, key 'zz' with a single infinite
    value, is written to `filename`; the first entry is then read back.

    Parameters
    ----------
    filename : str
        Path the archive is written to and read from
    threaded : bool, optional
        When true, ',bg' is added to the options of the read specifier
    '''
    # NOTE(review): `io` is presumably the kaldi io module rather than the
    # standard library one, given the specifier/type/mode arguments
    writer = io.open('ark,t:{}'.format(filename), 'bv', 'w')
    try:
        # np.inf, not np.infty: the latter alias was removed in NumPy 2.0
        writer.write('zz', [np.inf])
    finally:
        writer.close()
    reader = io.open(
        'ark,t{}:{}'.format(',bg' if threaded else '', filename), 'bv')
    try:
        next(reader)
    finally:
        # release the table even when reading raises
        reader.close()
def elicit_warning(filename, threaded=False):
    '''Helper to elicit a natural warning from kaldi

    Writes a one-entry text archive (key 'zz', a single infinite value)
    to `filename`, then reads its first entry back.

    Parameters
    ----------
    filename : str
        Path of the archive that is written and then read
    threaded : bool, optional
        If true, the read specifier additionally carries the ',bg' option
    '''
    write_specifier = 'ark,t:{}'.format(filename)
    read_specifier = 'ark,t{}:{}'.format(',bg' if threaded else '', filename)
    # NOTE(review): `io` looks like the kaldi io module, not stdlib io
    writer = io.open(write_specifier, 'bv', 'w')
    try:
        # np.infty no longer exists as of NumPy 2.0; np.inf is the same value
        writer.write('zz', [np.inf])
    finally:
        writer.close()
    reader = io.open(read_specifier, 'bv')
    try:
        next(reader)
    finally:
        # close the table whether or not the read succeeded
        reader.close()
def _write_pickle_to_table_empty(wspecifier, logger):
    '''Special case when pickle file(s) was/were empty

    The table at `wspecifier` is opened for writing and closed again
    without any entry being written.

    Parameters
    ----------
    wspecifier : str
        Specifier of the table to open
    logger
        Logger receiving the progress, warning and error messages

    Returns
    -------
    int
        0 if the table could be opened, 1 if an IOError was raised
    '''
    logger.info('Opening {}'.format(wspecifier))
    # doesn't matter what type we choose; we're not writing anything
    try:
        # close right away so the handle is not left to garbage collection
        kaldi_io.open(wspecifier, 'bm', 'w').close()
    except IOError as error:
        # exceptions have no `.message` attribute on Python 3
        logger.error(str(error), exc_info=True)
        return 1
    logger.warning('No entries were written (pickle file(s) was/were empty)')
    return 0
def test_write_table_to_pickle(values, temp_file_1_name, temp_file_2_name):
    '''Check that write_table_to_pickle reproduces a table's entries

    `values` are written to a kaldi archive under the keys '0', '1', ...;
    the command converts the archive into a pickle file, whose
    (key, value) pairs are then compared against `values`.
    '''
    if len(values):
        kaldi_dtype = kaldi_io.util.infer_kaldi_data_type(values[0]).value
    else:
        # nothing to infer from; any type will do for an empty table
        kaldi_dtype = 'bm'
    with kaldi_io.open('ark:' + temp_file_1_name, kaldi_dtype, 'w') as writer:
        for num, value in enumerate(values):
            writer.write(str(num), value)
    ret_code = command_line.write_table_to_pickle(
        ['ark:' + temp_file_1_name, temp_file_2_name, '-i', kaldi_dtype])
    assert ret_code == 0
    num_entries = 0
    # the context manager closes the file even when an assertion fails
    with open(temp_file_2_name, 'rb') as pickle_file:
        try:
            while True:
                key, value = pickle.load(pickle_file)
                num_entries = int(key) + 1
                expected = values[num_entries - 1]
                try:
                    # array-like values are compared with a tolerance
                    expected.dtype
                    assert np.allclose(value, expected)
                except AttributeError:
                    assert value == expected
        except EOFError:
            # end of the pickle stream
            pass
    assert num_entries == len(values)
def test_write_pickle_to_table(values, temp_file_1_name, temp_file_2_name):
    '''Check that write_pickle_to_table reproduces a pickle file's entries

    `values` are pickled as (key, value) pairs with keys '0', '1', ...;
    the command converts the pickle file into a kaldi archive, whose
    entries are then compared against `values`.
    '''
    if len(values):
        kaldi_dtype = kaldi_io.util.infer_kaldi_data_type(values[0]).value
    else:
        # nothing to infer from; any type will do for an empty table
        kaldi_dtype = 'bm'
    with open(temp_file_1_name, 'wb') as pickle_file:
        for num, value in enumerate(values):
            pickle.dump((str(num), value), pickle_file)
    ret_code = command_line.write_pickle_to_table(
        [temp_file_1_name, 'ark:' + temp_file_2_name, '-o', kaldi_dtype])
    assert ret_code == 0
    num_entries = 0
    # the context manager closes the table even when an assertion fails
    with kaldi_io.open(
            'ark:' + temp_file_2_name, kaldi_dtype, 'r') as kaldi_reader:
        for key, value in kaldi_reader.items():
            num_entries = int(key) + 1
            expected = values[num_entries - 1]
            try:
                # array-like values are compared with a tolerance
                expected.dtype
                assert np.allclose(value, expected)
            except AttributeError:
                assert value == expected
    assert num_entries == len(values)
def _write_pickle_to_table_key_value(options, logger):
    '''write_pickle_to_table when both key_in and value_in are specified

    Keys are unpickled one at a time from ``options.key_in`` and values
    from ``options.value_in`` (a path ending in '.gz' is read through
    gzip). Each key/value pair is written to the table
    ``options.wspecifier`` using the data type ``options.out_type``.

    Parameters
    ----------
    options
        Parsed options; the attributes ``key_in``, ``value_in``,
        ``wspecifier`` and ``out_type`` are read
    logger
        Logger receiving the progress and error messages

    Returns
    -------
    int
        0 on success. 1 if a file or the table could not be opened, a
        pickle could not be read, an entry could not be written, or the
        numbers of keys and values differ. When both files are empty, the
        result of ``_write_pickle_to_table_empty`` is returned.
    '''
    # imported locally, as in the other pickle helpers of this module
    from six.moves import cPickle as pickle

    def open_pickle(path):
        '''Open a (possibly gzipped) pickle file for binary reading'''
        # pickle streams are bytes; a text-mode handle cannot be unpickled
        # from on Python 3
        if path.endswith('.gz'):
            import gzip
            return gzip.open(path, 'rb')
        return open(path, 'rb')

    exhausted = object()  # marks a stream with no more pickles

    def load_next(stream):
        '''Return the next unpickled object, or `exhausted` at end of file'''
        # NOTE: unpickling can execute arbitrary code; only feed this
        # command files from a trusted source
        try:
            return pickle.load(stream)
        except EOFError:
            return exhausted

    def mismatch(num_keys, num_values):
        '''Log a key/value count mismatch and return the error code'''
        logger.error('Number of keys ({}) and values ({}) do not match'.format(
            num_keys, num_values))
        return 1

    value_in = key_in = writer = None
    try:
        try:
            logger.info('Opening {}'.format(options.value_in))
            value_in = open_pickle(options.value_in)
            logger.info('Opening {}'.format(options.key_in))
            key_in = open_pickle(options.key_in)
        except IOError as error:
            # exceptions have no `.message` attribute on Python 3
            logger.error(str(error), exc_info=True)
            return 1
        try:
            value = load_next(value_in)
            key = load_next(key_in)
        except pickle.UnpicklingError as error:
            logger.error(str(error), exc_info=True)
            return 1
        if value is exhausted and key is exhausted:
            return _write_pickle_to_table_empty(options.wspecifier, logger)
        if value is exhausted:
            return mismatch(1, 0)
        if key is exhausted:
            return mismatch(0, 1)
        out_type = options.out_type
        try:
            logger.info('Opening {}'.format(options.wspecifier))
            writer = kaldi_io.open(options.wspecifier, out_type, 'w')
        except IOError as error:
            logger.error(str(error), exc_info=True)
            return 1
        # floating-point tables get their arrays cast up front
        if out_type.is_floating_point:
            cast_to = np.float64 if out_type.is_double else np.float32
        else:
            cast_to = None
        num_entries = 0
        try:
            while True:
                if cast_to is not None:
                    try:
                        value = value.astype(cast_to, copy=False)
                    except AttributeError:
                        pass  # will happen implicitly
                writer.write(key, value)
                num_entries += 1
                if num_entries % 10 == 0:
                    logger.info('Processed {} entries'.format(num_entries))
                logger.log(9, 'Processed key {}'.format(key))
                # read both streams every round so that a surplus entry in
                # either one is still visible once the loop ends
                key = load_next(key_in)
                value = load_next(value_in)
                if key is exhausted or value is exhausted:
                    break
        except (IOError, ValueError, TypeError,
                pickle.UnpicklingError) as error:
            logger.error(str(error), exc_info=True)
            return 1
        if value is not exhausted:
            return mismatch(num_entries, num_entries + 1)
        if key is not exhausted:
            return mismatch(num_entries + 1, num_entries)
        logger.info("Wrote {} entries".format(num_entries))
        return 0
    finally:
        # one place releases every handle, whichever path returned
        for handle in (value_in, key_in, writer):
            if handle is not None:
                handle.close()
def _write_pickle_to_table_value_only(options, logger):
    '''write_pickle_to_table when only value_in has been specified

    ``options.value_in`` (read through gzip when its name ends in '.gz')
    holds pickled (key, value) pairs. Each pair is written to the table
    ``options.wspecifier`` using the data type ``options.out_type``.

    Parameters
    ----------
    options
        Parsed options; the attributes ``value_in``, ``wspecifier`` and
        ``out_type`` are read
    logger
        Logger receiving the progress and error messages

    Returns
    -------
    int
        0 on success. 1 if the file or the table could not be opened, a
        pickle could not be read, or an entry could not be written. When
        the file is empty, the result of ``_write_pickle_to_table_empty``
        is returned.
    '''
    from six.moves import cPickle as pickle
    try:
        logger.info('Opening {}'.format(options.value_in))
        if options.value_in.endswith('.gz'):
            import gzip
            value_in = gzip.open(options.value_in, 'rb')
        else:
            value_in = open(options.value_in, 'rb')
    except IOError as error:
        # exceptions have no `.message` attribute on Python 3
        logger.error(str(error), exc_info=True)
        return 1
    writer = None
    try:
        # NOTE: unpickling can execute arbitrary code; only feed this
        # command files from a trusted source
        try:
            key, value = pickle.load(value_in)
        except pickle.UnpicklingError as error:
            logger.error(str(error), exc_info=True)
            return 1
        except EOFError:
            return _write_pickle_to_table_empty(options.wspecifier, logger)
        out_type = options.out_type
        try:
            logger.info('Opening {}'.format(options.wspecifier))
            writer = kaldi_io.open(options.wspecifier, out_type, 'w')
        except IOError as error:
            logger.error(str(error), exc_info=True)
            return 1
        # floating-point tables get their arrays cast up front
        if out_type.is_floating_point:
            cast_to = np.float64 if out_type.is_double else np.float32
        else:
            cast_to = None
        num_entries = 0
        try:
            while True:
                if cast_to is not None:
                    try:
                        value = value.astype(cast_to, copy=False)
                    except AttributeError:
                        pass  # no astype to call; value is written as is
                writer.write(key, value)
                num_entries += 1
                if num_entries % 10 == 0:
                    logger.info('Processed {} entries'.format(num_entries))
                logger.log(9, 'Processed key {}'.format(key))
                key, value = pickle.load(value_in)
        except EOFError:
            # end of the pickle stream
            pass
        except (IOError, ValueError, TypeError,
                pickle.UnpicklingError) as error:
            logger.error(str(error), exc_info=True)
            return 1
        logger.info("Wrote {} entries".format(num_entries))
        return 0
    finally:
        # one place releases every handle, whichever path returned
        value_in.close()
        if writer is not None:
            writer.close()
def write_table_to_pickle(args=None):
    '''Write a kaldi table to pickle file(s)

    The inverse is write_pickle_to_table

    Every entry of the table ``rspecifier`` is pickled. With a key file,
    values go to the value file and keys to the key file; without one,
    (key, value) tuples go to the value file. An output path ending in
    '.gz' is written through gzip.

    Parameters
    ----------
    args : list, optional
        Command-line arguments handed to the argument parser
        (presumably the process arguments are used when None; confirm
        against the parser)

    Returns
    -------
    int
        0 on success, 1 if opening, reading or writing failed, or the
        exit code carried by a SystemExit raised while parsing arguments
    '''
    from six.moves import cPickle as pickle

    logger = logging.getLogger(sys.argv[0])
    if not logger.handlers:
        logger.addHandler(logging.StreamHandler())
    register_logger_for_kaldi(logger)
    try:
        options = _write_table_to_pickle_parse_args(args, logger)
    except SystemExit as ex:
        return ex.code
    out_type = options.out_type
    if out_type is None:
        if options.in_type.is_floating_point:
            if options.in_type.is_double:
                out_type = np.float64
            else:
                out_type = np.float32
        else:
            # builtin str is what the removed alias `np.str` pointed to
            out_type = str

    def open_pickle(path):
        '''Open a (possibly gzipped) pickle file for binary writing'''
        # pickle.dump emits bytes; a text-mode handle rejects them on
        # Python 3
        if path.endswith('.gz'):
            import gzip
            return gzip.open(path, 'wb')
        return open(path, 'wb')

    reader = value_out = key_out = None
    num_entries = 0
    try:
        try:
            logger.info('Opening {}'.format(options.rspecifier))
            reader = kaldi_io.open(options.rspecifier, options.in_type, 'r')
            logger.info('Opening {}'.format(options.value_out))
            value_out = open_pickle(options.value_out)
            if options.key_out:
                logger.info('Opening {}'.format(options.key_out))
                key_out = open_pickle(options.key_out)
        except IOError as error:
            # exceptions have no `.message` attribute on Python 3
            logger.error(str(error), exc_info=True)
            return 1
        try:
            # string-typed output is pickled as read; anything else is cast
            needs_cast = not np.issubdtype(out_type, str)
            for key, value in reader.items():
                num_entries += 1
                if needs_cast:
                    value = value.astype(out_type)
                if key_out is not None:
                    pickle.dump(value, value_out)
                    pickle.dump(key, key_out)
                else:
                    pickle.dump((key, value), value_out)
                if num_entries % 10 == 0:
                    logger.info('Processed {} entries'.format(num_entries))
                logger.log(9, 'Processed key {}'.format(key))
        except (IOError, ValueError) as error:
            logger.error(str(error), exc_info=True)
            return 1
    finally:
        # one place releases every handle, whichever path returned
        for handle in (value_out, key_out, reader):
            if handle is not None:
                handle.close()
    if num_entries == 0:
        logger.warning("No entries were written (table was empty)")
    else:
        logger.info("Wrote {} entries".format(num_entries))
    return 0