Code example #1
0
File: deepspeech.py  Project: entn-at/howl
def compute_raw_scores(audio_paths: List[str]) -> Iterable[List[AsrOutput]]:
    """Yield per-file DeepSpeech acoustic-model scores.

    For each path in ``audio_paths`` the audio is VAD-split into segment
    batches, run through the DeepSpeech acoustic model, and the softmaxed
    per-frame character probabilities for every segment are collected into
    ``AsrOutput(start, end, probs)`` records.

    Args:
        audio_paths: Paths to audio files readable by ``AudioFile``.

    Yields:
        One ``List[AsrOutput]`` per input path, in input order.
    """
    # Imported lazily to avoid a circular import with deepspeech_training.
    from deepspeech_training.train import create_model  # pylint: disable=cyclic-import,import-outside-toplevel
    from deepspeech_training.util.checkpoints import load_graph_for_evaluation
    initialize_globals()
    with tf.Session(config=Config.session_config) as session:
        tf.train.get_or_create_global_step()
        for idx, audio_path in enumerate(audio_paths):
            # AudioFile(as_path=True) yields a filesystem path to a WAV
            # version of the input (converting if necessary).
            with AudioFile(audio_path, as_path=True) as wav_path:
                # Split on voice activity into fixed-size batches; unusually
                # long segments are routed into smaller "outlier" batches.
                data_set = split_audio_file(wav_path,
                                            batch_size=FLAGS.batch_size,
                                            aggressiveness=1,
                                            outlier_duration_ms=FLAGS.outlier_duration_ms,
                                            outlier_batch_size=FLAGS.outlier_batch_size)
                iterator = tf.data.Iterator.from_structure(data_set.output_types, data_set.output_shapes,
                                                           output_classes=data_set.output_classes)
                batch_time_start, batch_time_end, batch_x, batch_x_len = iterator.get_next()
                # One None per dropout layer: dropout disabled for inference.
                no_dropout = [None] * 6
                # NOTE(review): create_model is invoked on every loop pass, so
                # graph ops accumulate across files within this session; the
                # reuse_variables() call below makes later passes share the
                # already-created weights rather than fail on re-creation.
                logits, _ = create_model(batch_x=batch_x, seq_length=batch_x_len, dropout=no_dropout)
                # Model output is time-major (time, batch, classes); transpose
                # to batch-major before softmax so each row is one segment.
                transposed = tf.nn.softmax(tf.transpose(logits, [1, 0, 2]))
                session.run(iterator.make_initializer(data_set))
                # Checkpoint weights only need restoring once per session.
                if idx == 0:
                    load_graph_for_evaluation(session)
                output_list = []
                # Drain the dataset; TF1 signals exhaustion via OutOfRangeError.
                while True:
                    try:
                        starts, ends, batch_logits, batch_lengths = session.run([batch_time_start, batch_time_end, transposed, batch_x_len])
                    except tf.errors.OutOfRangeError:
                        break
                    for start, end, logits, length in zip(starts, ends, batch_logits, batch_lengths):
                        # Trim padding frames beyond the segment's true length.
                        output_list.append(AsrOutput(start, end, logits[:length]))
                # Allow the next iteration's create_model to reuse weights.
                tf.get_variable_scope().reuse_variables()
            yield output_list
Code example #2
0
def transcribe_file(audio_path, tlog_path):
    """Transcribe one audio file and write a JSON transcript log.

    The audio is VAD-split into segments, decoded with the DeepSpeech
    acoustic model plus an external scorer, and the resulting
    (start, end, transcript) records — sorted by segment start time —
    are dumped to ``tlog_path``.
    """
    # Lazy imports avoid a circular dependency with deepspeech_training.
    from deepspeech_training.train import create_model  # pylint: disable=cyclic-import,import-outside-toplevel
    from deepspeech_training.util.checkpoints import load_graph_for_evaluation
    initialize_globals()
    scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.scorer_path,
                    Config.alphabet)
    # Beam-search decoding fans out over processes; fall back to a single
    # worker on platforms that cannot report their CPU count.
    try:
        worker_count = cpu_count()
    except NotImplementedError:
        worker_count = 1
    with AudioFile(audio_path, as_path=True) as wav_path:
        segments = split_audio_file(
            wav_path,
            batch_size=FLAGS.batch_size,
            aggressiveness=FLAGS.vad_aggressiveness,
            outlier_duration_ms=FLAGS.outlier_duration_ms,
            outlier_batch_size=FLAGS.outlier_batch_size)
        segment_iter = tf.data.Iterator.from_structure(
            segments.output_types,
            segments.output_shapes,
            output_classes=segments.output_classes)
        seg_start, seg_end, seg_audio, seg_len = segment_iter.get_next()
        # One None per dropout layer disables dropout at inference time.
        logits, _ = create_model(batch_x=seg_audio,
                                 seq_length=seg_len,
                                 dropout=[None] * 6)
        # Model output is time-major; transpose to batch-major and softmax
        # so each row holds one segment's per-frame class probabilities.
        probs = tf.nn.softmax(tf.transpose(logits, [1, 0, 2]))
        tf.train.get_or_create_global_step()
        with tf.Session(config=Config.session_config) as session:
            load_graph_for_evaluation(session)
            session.run(segment_iter.make_initializer(segments))
            results = []
            # Drain the dataset; TF1 signals exhaustion via OutOfRangeError.
            while True:
                try:
                    starts, ends, prob_batch, len_batch = session.run(
                        [seg_start, seg_end, probs, seg_len])
                except tf.errors.OutOfRangeError:
                    break
                decoded = ctc_beam_search_decoder_batch(
                    prob_batch,
                    len_batch,
                    Config.alphabet,
                    FLAGS.beam_width,
                    num_processes=worker_count,
                    scorer=scorer)
                # Keep only the top hypothesis's text for each segment.
                best = [candidates[0][1] for candidates in decoded]
                results.extend(zip(starts, ends, best))
            results.sort(key=lambda item: item[0])
            records = [{
                'start': int(seg[0]),
                'end': int(seg[1]),
                'transcript': seg[2]
            } for seg in results]
            with open(tlog_path, 'w') as tlog_file:
                # default=float covers numpy scalars in the timestamps.
                json.dump(records, tlog_file, default=float)