def compute_raw_scores(audio_paths: List[str]) -> Iterable[List[AsrOutput]]:
    # Late imports to avoid a cyclic dependency with the training package.
    from deepspeech_training.train import create_model  # pylint: disable=cyclic-import,import-outside-toplevel
    from deepspeech_training.util.checkpoints import load_graph_for_evaluation
    initialize_globals()
    with tf.Session(config=Config.session_config) as session:
        tf.train.get_or_create_global_step()
        for idx, audio_path in enumerate(audio_paths):
            with AudioFile(audio_path, as_path=True) as wav_path:
                # Split the file on voice activity and batch the resulting segments.
                data_set = split_audio_file(wav_path,
                                            batch_size=FLAGS.batch_size,
                                            aggressiveness=1,
                                            outlier_duration_ms=FLAGS.outlier_duration_ms,
                                            outlier_batch_size=FLAGS.outlier_batch_size)
                iterator = tf.data.Iterator.from_structure(data_set.output_types,
                                                           data_set.output_shapes,
                                                           output_classes=data_set.output_classes)
                batch_time_start, batch_time_end, batch_x, batch_x_len = iterator.get_next()
                no_dropout = [None] * 6
                logits, _ = create_model(batch_x=batch_x, seq_length=batch_x_len, dropout=no_dropout)
                # Softmax over character probabilities, reordered to batch-major.
                transposed = tf.nn.softmax(tf.transpose(logits, [1, 0, 2]))
                session.run(iterator.make_initializer(data_set))
                # The checkpoint only needs to be restored once; later
                # iterations reuse the variables loaded here.
                if idx == 0:
                    load_graph_for_evaluation(session)
                output_list = []
                while True:
                    try:
                        starts, ends, batch_logits, batch_lengths = \
                            session.run([batch_time_start, batch_time_end, transposed, batch_x_len])
                    except tf.errors.OutOfRangeError:
                        break
                    # Trim padding from each sequence before storing its raw scores.
                    for start, end, seq_logits, length in zip(starts, ends, batch_logits, batch_lengths):
                        output_list.append(AsrOutput(start, end, seq_logits[:length]))
                # Let the next create_model call share this graph's variables
                # instead of creating a second copy.
                tf.get_variable_scope().reuse_variables()
                yield output_list
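

# A minimal sketch of consuming compute_raw_scores: decode each segment's raw
# softmax matrix into text with the single-sequence beam-search decoder from
# ds_ctcdecoder. The attribute names on AsrOutput (.start, .end, .logits) and
# the helper name decode_raw_scores are assumptions, not part of this module.
def decode_raw_scores(audio_paths):
    from ds_ctcdecoder import ctc_beam_search_decoder  # pylint: disable=import-outside-toplevel
    scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.scorer_path, Config.alphabet)
    for outputs in compute_raw_scores(audio_paths):
        segments = []
        for output in outputs:
            # Each beam entry is a (score, text) pair; keep the best text.
            best = ctc_beam_search_decoder(output.logits, Config.alphabet,
                                           FLAGS.beam_width, scorer=scorer)[0][1]
            segments.append((output.start, output.end, best))
        yield segments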


def transcribe_file(audio_path, tlog_path):
    # Late imports to avoid a cyclic dependency with the training package.
    from deepspeech_training.train import create_model  # pylint: disable=cyclic-import,import-outside-toplevel
    from deepspeech_training.util.checkpoints import load_graph_for_evaluation
    initialize_globals()
    scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.scorer_path, Config.alphabet)
    try:
        num_processes = cpu_count()
    except NotImplementedError:
        num_processes = 1
    with AudioFile(audio_path, as_path=True) as wav_path:
        # Split the file on voice activity and batch the resulting segments.
        data_set = split_audio_file(wav_path,
                                    batch_size=FLAGS.batch_size,
                                    aggressiveness=FLAGS.vad_aggressiveness,
                                    outlier_duration_ms=FLAGS.outlier_duration_ms,
                                    outlier_batch_size=FLAGS.outlier_batch_size)
        iterator = tf.data.Iterator.from_structure(data_set.output_types,
                                                   data_set.output_shapes,
                                                   output_classes=data_set.output_classes)
        batch_time_start, batch_time_end, batch_x, batch_x_len = iterator.get_next()
        no_dropout = [None] * 6
        logits, _ = create_model(batch_x=batch_x, seq_length=batch_x_len, dropout=no_dropout)
        # Softmax over character probabilities, reordered to batch-major.
        transposed = tf.nn.softmax(tf.transpose(logits, [1, 0, 2]))
        tf.train.get_or_create_global_step()
        with tf.Session(config=Config.session_config) as session:
            load_graph_for_evaluation(session)
            session.run(iterator.make_initializer(data_set))
            transcripts = []
            while True:
                try:
                    starts, ends, batch_logits, batch_lengths = \
                        session.run([batch_time_start, batch_time_end, transposed, batch_x_len])
                except tf.errors.OutOfRangeError:
                    break
                decoded = ctc_beam_search_decoder_batch(batch_logits, batch_lengths, Config.alphabet,
                                                        FLAGS.beam_width, num_processes=num_processes,
                                                        scorer=scorer)
                # Keep only the top transcript of each beam.
                decoded = list(d[0][1] for d in decoded)
                transcripts.extend(zip(starts, ends, decoded))
            # Segments come back in batch order; sort them by start time.
            transcripts.sort(key=lambda t: t[0])
            transcripts = [{'start': int(start),
                            'end': int(end),
                            'transcript': transcript}
                           for start, end, transcript in transcripts]
            with open(tlog_path, 'w') as tlog_file:
                json.dump(transcripts, tlog_file, default=float)
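

# A minimal sketch of driving transcribe_file over several files. Each call
# builds a fresh TF graph, so running one file per process keeps graph state
# from accumulating across calls; transcribe_many and the .tlog naming scheme
# here are assumptions, not part of this module.
def transcribe_many(audio_paths):
    import os  # pylint: disable=import-outside-toplevel
    from multiprocessing import Process  # pylint: disable=import-outside-toplevel
    for audio_path in audio_paths:
        # Write each transcript log next to its source audio file.
        tlog_path = os.path.splitext(audio_path)[0] + '.tlog'
        process = Process(target=transcribe_file, args=(audio_path, tlog_path))
        process.start()
        process.join()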