Beispiel #1
0
    def transcribe_wav(self, wav_bytes: bytes) -> typing.Optional[Transcription]:
        """Speech to text from WAV data.

        Args:
            wav_bytes: Complete WAV file contents (header plus audio frames).

        Returns:
            A Transcription when ``self._transcribe_wav`` produces text,
            otherwise None.
        """
        # Only the header is needed here: the duration comes from the frame
        # count and rate, and the decoder receives the original WAV bytes.
        with io.BytesIO(wav_bytes) as wav_buffer:
            with wave.open(wav_buffer, "rb") as wav_file:
                wav_duration = wav_file.getnframes() / float(
                    wav_file.getframerate())

        # Process data as an entire utterance
        start_time = time.perf_counter()
        text, confidence = self._transcribe_wav(wav_bytes)
        transcribe_seconds = time.perf_counter() - start_time
        _LOGGER.debug("Decoded audio in %s second(s)", transcribe_seconds)

        if text is None:
            return None

        return Transcription(
            text=text,
            likelihood=confidence,
            transcribe_seconds=transcribe_seconds,
            wav_seconds=wav_duration,
        )
    def transcribe_wav(self,
                       wav_data: bytes) -> typing.Optional[Transcription]:
        """Speech to text from WAV data.

        The whole file is handed to ``self.decoder`` as a single, finalized
        utterance. Samples are read as little-endian signed 16-bit integers,
        one per frame, so the input is expected to be 16-bit mono audio.

        Args:
            wav_data: Complete WAV file contents.

        Returns:
            A Transcription if the decoder reports success, otherwise None.
        """
        self.load_decoder()
        assert self.decoder

        _LOGGER.debug("Decoding %s byte(s)", len(wav_data))
        start_time = time.perf_counter()
        with io.BytesIO(wav_data) as wav_buffer:
            with wave.open(wav_buffer, "rb") as wav_file:
                sample_rate = wav_file.getframerate()
                num_frames = wav_file.getnframes()
                frames = wav_file.readframes(num_frames)

        wav_duration = num_frames / float(sample_rate)

        # Reinterpret the PCM bytes directly instead of unpacking them into
        # a tuple of Python ints first; the decoder is given float32 samples.
        samples = np.frombuffer(
            frames, dtype="<i2", count=num_frames).astype(np.float32)

        # Decode (True = finalize the utterance)
        success = self.decoder.decode(sample_rate, samples, True)
        if not success:
            # Failure
            return None

        text, likelihood = self.decoder.get_decoded_string()
        transcribe_seconds = time.perf_counter() - start_time

        return Transcription(
            text=text.strip(),
            likelihood=likelihood,
            transcribe_seconds=transcribe_seconds,
            wav_seconds=wav_duration,
        )
Beispiel #3
0
    def transcribe_stream(
            self,
            audio_stream: typing.Iterable[bytes],
            sample_rate: int,
            sample_width: int,
            channels: int,
    ) -> typing.Optional[Transcription]:
        """Speech to text from an audio stream.

        All chunks are buffered and then passed to ``self._transcribe_wav``
        in a single call.

        Args:
            audio_stream: Iterable of audio chunks.
            sample_rate: Frames per second of the audio.
            sample_width: Bytes per sample.
            channels: Number of interleaved channels.

        Returns:
            A Transcription when text was produced, otherwise None.
        """
        start_time = time.perf_counter()

        wav_bytes = b"".join(audio_stream)

        # Duration must be based on audio frames, not on how many chunks the
        # stream happened to be split into. One frame holds one sample per
        # channel.
        bytes_per_frame = sample_width * channels
        total_frames = (len(wav_bytes) // bytes_per_frame
                        if bytes_per_frame > 0 else 0)

        text, confidence = self._transcribe_wav(wav_bytes)

        transcribe_seconds = time.perf_counter() - start_time
        _LOGGER.debug("Decoded audio in %s second(s)", transcribe_seconds)

        if text is None:
            return None

        return Transcription(
            text=text,
            likelihood=confidence,
            transcribe_seconds=transcribe_seconds,
            wav_seconds=total_frames / float(sample_rate),
        )
Beispiel #4
0
    def transcribe_wav(self,
                       wav_bytes: bytes) -> typing.Optional[Transcription]:
        """Transcribe WAV data by writing it to a temporary file.

        The temporary file's path is handed to the nnet3 or GMM routine
        depending on ``self.model_type``.

        Returns:
            A Transcription when the routine returns non-empty text,
            otherwise None.

        Raises:
            ValueError: If ``self.model_type`` is neither NNET3 nor GMM.
        """
        started = time.perf_counter()

        with tempfile.NamedTemporaryFile(suffix=".wav", mode="wb") as temp_wav:
            temp_wav.write(wav_bytes)
            temp_wav.seek(0)

            # Select the routine matching the model type
            if self.model_type == KaldiModelType.NNET3:
                decode = self._transcribe_wav_nnet3
            elif self.model_type == KaldiModelType.GMM:
                decode = self._transcribe_wav_gmm
            else:
                raise ValueError(self.model_type)

            # The routine is given the file's path, so it must run while the
            # temporary file still exists.
            text = decode(temp_wav.name)

        if not text:
            # Failure
            return None

        # Success
        finished = time.perf_counter()
        return Transcription(
            text=text.strip(),
            likelihood=1,
            transcribe_seconds=(finished - started),
            wav_seconds=get_wav_duration(wav_bytes),
        )
Beispiel #5
0
    async def async_test_silence(self):
        """Check that a stop-on-silence session yields the transcription."""
        expected = Transcription(
            text="turn on the living room lamp",
            likelihood=1,
            transcribe_seconds=0,
            wav_seconds=0,
        )

        def fake_transcribe(stream, *args):
            """Consume the stream up to its first empty chunk, then return
            the canned transcription."""
            # any() stops at the first falsy chunk, like an explicit break
            any(not chunk for chunk in stream)
            return expected

        self.transcriber.transcribe_stream = fake_transcribe

        # Start session
        start_message = AsrStartListening(
            site_id=self.site_id,
            session_id=self.session_id,
            stop_on_silence=True,
            send_audio_captured=False,
        )
        start_responses = [
            response
            async for response in self.hermes.on_message_blocking(start_message)
        ]

        # No response expected
        self.assertIsNone(start_responses[-1] if start_responses else None)

        # Feed the recorded WAV through as audio frames
        audio_path = Path("etc/turn_on_the_living_room_lamp.wav")

        captured = []
        with open(audio_path, "rb") as audio_file:
            for chunk_bytes in AudioFrame.iter_wav_chunked(audio_file, 4096):
                audio_frame = AudioFrame(wav_bytes=chunk_bytes)
                async for message in self.hermes.on_message_blocking(
                    audio_frame, site_id=self.site_id
                ):
                    captured.append(message)

        # Expect the recording to finish, followed by the captured text
        expected_messages = [
            AsrRecordingFinished(site_id=self.site_id, session_id=self.session_id),
            AsrTextCaptured(
                text=expected.text,
                likelihood=expected.likelihood,
                seconds=expected.transcribe_seconds,
                site_id=self.site_id,
                session_id=self.session_id,
            ),
        ]
        self.assertEqual(captured, expected_messages)
Beispiel #6
0
    def transcribe_wav(self,
                       wav_bytes: bytes) -> typing.Optional[Transcription]:
        """Speech to text from WAV data.

        The audio frames are passed to ``self.model.sttWithMetadata`` as an
        int16 buffer. The per-character metadata items are joined into the
        full text and grouped into space-separated word tokens.

        Args:
            wav_bytes: Complete WAV file contents.

        Returns:
            A Transcription with word tokens, or None if the model returned
            no metadata.
        """
        self.maybe_load_model()
        assert self.model, "Model was not loaded"

        start_time = time.perf_counter()

        # Convert to raw numpy buffer
        with io.BytesIO(wav_bytes) as wav_io:
            with wave.open(wav_io) as wav_file:
                audio_bytes = wav_file.readframes(wav_file.getnframes())
                audio_buffer = np.frombuffer(audio_bytes, np.int16)

        metadata = self.model.sttWithMetadata(audio_buffer)
        end_time = time.perf_counter()

        if not metadata:
            # Failure
            return None

        items = list(metadata.items)

        # Actual transcription
        text = "".join(item.character for item in items)

        # Individual tokens (one per space-separated word)
        tokens: typing.List[TranscriptionToken] = []
        word = ""
        word_start_time = 0
        last_index = len(items) - 1
        for index, item in enumerate(items):
            is_space = item.character == " "

            if not is_space:
                if not word:
                    # The first character of a word marks where it starts
                    word_start_time = item.start_time

                word += item.character

            # A word ends at a space or at the final item. Empty words
            # (leading or repeated spaces) produce no token.
            if (is_space or index == last_index) and word:
                tokens.append(
                    TranscriptionToken(
                        token=word,
                        likelihood=1,
                        start_time=word_start_time,
                        end_time=item.start_time,
                    ))
                word = ""

        return Transcription(
            text=text,
            likelihood=metadata.confidence,
            transcribe_seconds=(end_time - start_time),
            wav_seconds=get_wav_duration(wav_bytes),
            tokens=tokens,
        )
    def transcribe_stream(
        self,
        audio_stream: typing.Iterable[bytes],
        sample_rate: int,
        sample_width: int,
        channels: int,
    ) -> typing.Optional[Transcription]:
        """Transcribe a stream of mono audio chunks.

        Chunks are decoded incrementally; only the last one is submitted
        with the finalize flag set.

        Returns:
            A Transcription if the final decode call reports success,
            otherwise None.
        """
        assert channels == 1, "Only mono audio supported"
        self.load_decoder()
        assert self.decoder

        def decode_chunk(chunk: bytes, finalize: bool):
            """Decode one chunk; return (frame count, decoder result)."""
            count = len(chunk) // sample_width
            pcm = struct.unpack_from("<%dh" % count, chunk)
            outcome = self.decoder.decode(
                sample_rate, np.array(pcm, dtype=np.float32), finalize)
            return count, outcome

        start_time = time.perf_counter()
        frame_total: int = 0
        pending: typing.Optional[bytes] = None

        # Decoding runs one chunk behind the stream, so the chunk that turns
        # out to be last can be sent with finalize=True.
        for incoming in audio_stream:
            if pending:
                # Don't finalize
                decoded_frames, _ = decode_chunk(pending, False)
                frame_total += decoded_frames

            pending = incoming

        if not pending:
            # Add one empty frame for finalization
            pending = b"\x00" * sample_width

        # Finalize
        decoded_frames, success = decode_chunk(pending, True)
        frame_total += decoded_frames

        if not success:
            # Failure
            return None

        text, likelihood = self.decoder.get_decoded_string()
        transcribe_seconds = time.perf_counter() - start_time

        return Transcription(
            text=text.strip(),
            likelihood=likelihood,
            transcribe_seconds=transcribe_seconds,
            wav_seconds=frame_total / float(sample_rate),
        )
Beispiel #8
0
    def transcribe_wav(self,
                       wav_bytes: bytes) -> typing.Optional[Transcription]:
        """Transcribe WAV data.

        The WAV data is written to a temporary file whose path is sent to the
        Julius process. Output lines are then read until a ``sentence1:``
        line, a line containing "error", or end of output is seen.

        Args:
            wav_bytes: Complete WAV file contents.

        Returns:
            A Transcription. Its text is empty when Julius reported an error
            or its output ended before a result line.
        """
        if not self.julius_proc:
            self.start_julius()

        assert self.julius_in and self.julius_out, "Julius not started"

        # Compute WAV duration
        wav_duration = get_wav_duration(wav_bytes)

        # Write path to WAV file
        _LOGGER.debug("Sending %s byte(s) to Julius", len(wav_bytes))
        start_time = time.perf_counter()

        with tempfile.NamedTemporaryFile(suffix=".wav",
                                         mode="wb+") as temp_file:
            temp_file.write(wav_bytes)
            temp_file.seek(0)

            print(temp_file.name, file=self.julius_out)
            self.julius_out.flush()

            sentence_line = ""
            while True:
                raw_line = self.julius_in.readline()
                if not raw_line:
                    # readline() returns an empty string only at end of
                    # output; without this check the loop would spin forever
                    # once Julius has exited.
                    _LOGGER.warning(
                        "Julius output ended before a result was received")
                    break

                line = raw_line.strip()
                _LOGGER.debug("Julius> %s", line)

                if line.startswith("sentence1:"):
                    sentence_line = line.split(":", maxsplit=1)[1]
                    break

                if "error" in line.lower():
                    # Give up with an empty transcription
                    _LOGGER.warning(line)
                    break

            # Exclude <s> and </s>
            _LOGGER.debug(sentence_line)
            result_text = (sentence_line.replace("<s>", "").replace(
                "</s>", "").strip())
            end_time = time.perf_counter()

        return Transcription(
            text=result_text,
            transcribe_seconds=end_time - start_time,
            wav_seconds=wav_duration,
            likelihood=1,
        )
Beispiel #9
0
    def metadata_to_transcription(
        metadata: typing.Optional[deepspeech.Metadata],
        wav_seconds: float,
        transcribe_seconds: float,
    ) -> typing.Optional[Transcription]:
        """Build a Rhasspy Transcription from DeepSpeech metadata.

        Only the first transcript is used. Its tokens are concatenated into
        the text and grouped into whitespace-separated words, each of which
        becomes one TranscriptionToken.

        Returns:
            The Transcription, or None when ``metadata`` is falsy.
        """
        if not metadata:
            # Failure
            return None

        text = ""
        confidence = 1.0
        tokens: typing.List[TranscriptionToken] = []

        if metadata.transcripts:
            transcript = next(iter(metadata.transcripts))
            confidence = math.exp(transcript.confidence)

            # Collect runs of non-whitespace tokens; every whitespace token
            # closes the current run and opens a new one.
            runs: typing.List[typing.List[typing.Any]] = [[]]
            for piece in transcript.tokens:
                text += piece.text

                if piece.text.strip():
                    runs[-1].append(piece)
                else:
                    runs.append([])

            # Each non-empty run is one word, timed by its first and last
            # token.
            tokens.extend(
                TranscriptionToken(
                    token="".join(piece.text for piece in run),
                    likelihood=1,
                    start_time=run[0].start_time,
                    end_time=run[-1].start_time,
                ) for run in runs if run)

        return Transcription(
            text=text,
            likelihood=confidence,
            transcribe_seconds=transcribe_seconds,
            wav_seconds=wav_seconds,
            tokens=tokens,
        )
    def transcribe_wav(self,
                       wav_data: bytes) -> typing.Optional[Transcription]:
        """Transcribe WAV data with the external ``kaldi-decode`` command.

        The WAV data is written to a temporary file, the file's path is sent
        to the command on stdin, and the command's stdout is parsed as JSON.

        Returns:
            A Transcription built from the JSON fields, or None when the
            "text" field is missing or empty.
        """
        command = ["kaldi-decode"]
        command += ["--model-type", str(self.model_type)]
        command += ["--model-dir", str(self.model_dir)]
        command += ["--graph-dir", str(self.graph_dir)]

        _LOGGER.debug(command)

        with tempfile.NamedTemporaryFile(suffix=".wav",
                                         mode="wb") as wav_temp:
            wav_temp.write(wav_data)

            # Rewind
            wav_temp.seek(0)

            decoder_proc = subprocess.Popen(
                command,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                universal_newlines=True,
            )

            # Send the WAV path, then collect everything written to stdout
            print(wav_temp.name, file=decoder_proc.stdin)
            output_json, _ = decoder_proc.communicate()
            _LOGGER.debug(output_json)
            decoded = json.loads(output_json)

        # Empty string indicates failure
        text = str(decoded.get("text", ""))
        if not text:
            return None

        # Success
        return Transcription(
            text=text.strip(),
            likelihood=float(decoded.get("likelihood", 0)),
            transcribe_seconds=float(decoded.get("transcribe_seconds", 0)),
            wav_seconds=float(decoded.get("wav_seconds", 0)),
        )
Beispiel #11
0
    def transcribe_proc():
        """Loop forever: transcribe the live stream and emit each result.

        Each result (or an empty Transcription when none was produced) is
        converted to a dict, tagged with the current ``is_timeout`` value and
        passed to ``print_json``.
        """
        while True:
            outcome = transcriber.transcribe_stream(
                audio_stream(), sample_rate, sample_width, channels)

            _LOGGER.debug("Transcription result: %s", outcome)

            if not outcome:
                # Nothing recognized; report an empty transcription instead
                outcome = Transcription.empty()

            payload = dataclasses.asdict(outcome)
            payload["timeout"] = is_timeout

            print_json(payload)
    def transcribe_stream(
        self,
        audio_stream: typing.Iterable[bytes],
        sample_rate: int,
        sample_width: int,
        channels: int,
    ) -> typing.Optional[Transcription]:
        """Speech to text from an audio stream.

        Every chunk is fed to the decoder as part of one utterance.

        Args:
            audio_stream: Iterable of raw audio chunks.
            sample_rate: Frames per second of the audio.
            sample_width: Bytes per sample.
            channels: Number of channels; must be 1.

        Returns:
            A Transcription with per-segment tokens if the decoder has a
            hypothesis, otherwise None.
        """
        assert channels == 1, "Only mono audio supported"
        if self.decoder is None:
            # Load decoder
            self.decoder = self.get_decoder()

        total_frames = 0

        # Process data as an entire utterance
        start_time = time.perf_counter()
        self.decoder.start_utt()

        for chunk in audio_stream:
            self.decoder.process_raw(chunk, False, False)

            # Count audio frames rather than chunks so the reported duration
            # is correct; with mono audio a frame is sample_width bytes.
            total_frames += len(chunk) // sample_width

        self.decoder.end_utt()

        transcribe_seconds = time.perf_counter() - start_time
        _LOGGER.debug("Decoded audio in %s second(s)", transcribe_seconds)

        hyp = self.decoder.hyp()
        if not hyp:
            return None

        logmath = self.decoder.get_logmath()
        return Transcription(
            text=hyp.hypstr.strip(),
            likelihood=logmath.exp(hyp.prob),
            transcribe_seconds=transcribe_seconds,
            wav_seconds=total_frames / float(sample_rate),
            tokens=[
                TranscriptionToken(
                    token=seg.word,
                    start_time=seg.start_frame / 100,
                    end_time=seg.end_frame / 100,
                    likelihood=logmath.exp(seg.prob),
                ) for seg in self.decoder.seg()
            ],
        )
    def transcribe_wav(self,
                       wav_bytes: bytes) -> typing.Optional[Transcription]:
        """Transcribe a complete WAV file as a single utterance.

        Returns:
            A Transcription with per-segment tokens if the decoder has a
            hypothesis, otherwise None.
        """
        if self.decoder is None:
            # Load decoder
            self.decoder = self.get_decoder()

        # Read the duration and the raw audio frames from the WAV container
        with io.BytesIO(wav_bytes) as wav_io:
            with wave.open(wav_io) as wav_reader:
                frame_count = wav_reader.getnframes()
                frame_rate = wav_reader.getframerate()
                wav_duration = frame_count / float(frame_rate)
                raw_audio: bytes = wav_reader.readframes(
                    wav_reader.getnframes())

        # Decode all of the audio in one utterance
        started = time.perf_counter()
        self.decoder.start_utt()
        self.decoder.process_raw(raw_audio, False, True)
        self.decoder.end_utt()

        transcribe_seconds = time.perf_counter() - started
        _LOGGER.debug("Decoded audio in %s second(s)", transcribe_seconds)

        hypothesis = self.decoder.hyp()
        if not hypothesis:
            return None

        text = hypothesis.hypstr.strip()
        likelihood = self.decoder.get_logmath().exp(hypothesis.prob)

        # One token per decoder segment
        tokens = []
        for segment in self.decoder.seg():
            tokens.append(
                TranscriptionToken(
                    token=segment.word,
                    start_time=segment.start_frame / 100,
                    end_time=segment.end_frame / 100,
                    likelihood=self.decoder.get_logmath().exp(segment.prob),
                ))

        return Transcription(
            text=text,
            likelihood=likelihood,
            transcribe_seconds=transcribe_seconds,
            wav_seconds=wav_duration,
            tokens=tokens,
        )
Beispiel #14
0
def transcribe(args: argparse.Namespace):
    """Do speech to text on one or more WAV files.

    When ``args.wav_file`` is set, each listed file is transcribed in turn.
    Otherwise a WAV stream is read from stdin and transcribed in chunks of
    ``args.frames_in_chunk`` frames. Every result is passed to
    ``print_json``; an empty Transcription is used when nothing was
    recognized.

    Args:
        args: Parsed command-line arguments. ``model_dir`` and ``graph_dir``
            are converted to Path objects in place; ``graph_dir`` defaults to
            ``model_dir / "graph"``.
    """
    # Load transcriber
    args.model_dir = Path(args.model_dir)

    if args.graph_dir:
        args.graph_dir = Path(args.graph_dir)
    else:
        args.graph_dir = args.model_dir / "graph"

    transcriber = KaldiCommandLineTranscriber(
        args.model_type, args.model_dir, args.graph_dir
    )

    # Do transcription
    try:
        if args.wav_file:
            # Transcribe WAV files
            for wav_path in args.wav_file:
                _LOGGER.debug("Processing %s", wav_path)

                # read_bytes() closes the file, unlike a bare open().read()
                wav_bytes = Path(wav_path).read_bytes()
                result = transcriber.transcribe_wav(wav_bytes)

                print_json(result or Transcription.empty())
        else:
            # Read WAV data from stdin
            if os.isatty(sys.stdin.fileno()):
                print("Reading WAV data from stdin...", file=sys.stderr)

            # Stream in chunks
            with wave.open(sys.stdin.buffer, "rb") as wav_file:

                def audio_stream(wav_file, frames_in_chunk):
                    """Yield the WAV's frames in chunks of frames_in_chunk."""
                    num_frames = wav_file.getnframes()
                    try:
                        while num_frames > frames_in_chunk:
                            yield wav_file.readframes(frames_in_chunk)
                            num_frames -= frames_in_chunk

                        if num_frames > 0:
                            # Last chunk
                            yield wav_file.readframes(num_frames)
                    except KeyboardInterrupt:
                        # Stop streaming quietly on Ctrl+C
                        pass

                result = transcriber.transcribe_stream(
                    audio_stream(wav_file, args.frames_in_chunk),
                    wav_file.getframerate(),
                    wav_file.getsampwidth(),
                    wav_file.getnchannels(),
                )

                # Same fallback as the file branch; an assert would vanish
                # under -O and crash on an ordinary "nothing recognized".
                print_json(result or Transcription.empty())
    except KeyboardInterrupt:
        pass
    finally:
        transcriber.stop()
Beispiel #15
0
    def transcribe_stream(
        self,
        audio_stream: typing.Iterable[bytes],
        sample_rate: int,
        sample_width: int,
        channels: int,
    ) -> typing.Optional[Transcription]:
        """Speech to text from an audio stream.

        For NNET3 models the chunks are sent over a TCP connection to
        ``localhost:self.port_num`` and the longest line of the reply is
        taken as the text. For GMM models the stream is re-packaged as a WAV
        file and passed to ``transcribe_wav``.

        Args:
            audio_stream: Iterable of raw audio chunks.
            sample_rate: Frames per second of the audio.
            sample_width: Bytes per sample.
            channels: Number of channels.

        Returns:
            A Transcription on success, otherwise None.

        Raises:
            ValueError: If ``self.model_type`` is neither NNET3 nor GMM.
        """
        if self.model_type == KaldiModelType.NNET3:
            # Use online2-tcp-nnet3-decode-faster
            if not self.decode_proc:
                self.start_decode()

            # Connect to decoder. The socket and its file wrapper are closed
            # when the blocks exit, including when sending or reading fails.
            with socket.socket(socket.AF_INET,
                               socket.SOCK_STREAM) as client_socket:
                client_socket.settimeout(self.timeout_seconds)
                client_socket.connect(("localhost", self.port_num))

                with client_socket.makefile(mode="rb") as client_file:
                    start_time = time.perf_counter()
                    num_frames = 0
                    for chunk in audio_stream:
                        if chunk:
                            client_socket.sendall(chunk)
                            num_frames += len(chunk) // sample_width

                    # Partial shutdown of socket (write only).
                    # This should force the Kaldi server to finalize the
                    # output.
                    client_socket.shutdown(socket.SHUT_WR)

                    _LOGGER.debug("Finished stream. Getting transcription.")

                    lines = client_file.read().decode().splitlines()

            _LOGGER.debug(lines)

            # Find longest line; among equally long lines the last one wins.
            # No lines at all gives an empty text.
            text = max(
                (line.strip() for line in reversed(lines)),
                key=len,
                default="",
            )

            if not text:
                # Failure
                return None

            # Success
            end_time = time.perf_counter()

            return Transcription(
                text=text,
                likelihood=1,
                transcribe_seconds=(end_time - start_time),
                wav_seconds=(num_frames / sample_rate),
            )

        if self.model_type == KaldiModelType.GMM:
            # No online streaming support.
            # Re-package as a WAV.
            with io.BytesIO() as wav_buffer:
                wav_file: wave.Wave_write = wave.open(wav_buffer, "wb")
                with wav_file:
                    wav_file.setframerate(sample_rate)
                    wav_file.setsampwidth(sample_width)
                    wav_file.setnchannels(channels)

                    for frame in audio_stream:
                        wav_file.writeframes(frame)

                return self.transcribe_wav(wav_buffer.getvalue())

        raise ValueError(f"Unsupported model type: {self.model_type}")
Beispiel #16
0
                def transcribe_proc(info, transcriber_factory, sample_rate,
                                    sample_width, channels):
                    """Session worker: transcribe queued audio for one ``info``.

                    Creates a transcriber with ``transcriber_factory``, then
                    repeatedly waits on ``info.ready_event``, transcribes the
                    frames pulled from ``info.frame_queue`` and publishes the
                    outcome via ``info.result`` / ``info.result_event``. Any
                    exception ends the loop and publishes an empty
                    Transcription instead.
                    """
                    def audio_stream(frame_queue) -> typing.Iterable[bytes]:
                        """Yield items from the queue until a falsy one arrives."""
                        # Pull frames from the queue
                        frames = frame_queue.get()
                        while frames:
                            yield frames
                            frames = frame_queue.get()

                    try:
                        info.transcriber = transcriber_factory(
                            port_num=self.kaldi_port)

                        # A failed assertion lands in the except block below,
                        # so it is reported through the failure path.
                        assert (info.transcriber
                                is not None), "Failed to create transcriber"

                        while True:
                            # Wait for session to start
                            info.ready_event.wait()
                            # Reset so the next iteration waits again
                            info.ready_event.clear()

                            # Get result of transcription
                            result = info.transcriber.transcribe_stream(
                                audio_stream(info.frame_queue),
                                sample_rate,
                                sample_width,
                                channels,
                            )

                            _LOGGER.debug("Transcription result: %s", result)

                            # A missing or empty result is treated as an
                            # error and handled by the except block below.
                            assert (result is not None
                                    and result.text), "Null transcription"

                            # Signal completion
                            info.result = result
                            info.result_event.set()

                            if not self.reuse_transcribers:
                                # Single use: stop the transcriber and leave
                                # the loop after the first result
                                try:
                                    info.transcriber.stop()
                                except Exception:
                                    _LOGGER.exception("Transcriber stop")

                                break
                    except Exception:
                        _LOGGER.exception("session proc")

                        # Mark as not reusable
                        info.reuse = False

                        # Stop transcriber
                        if info.transcriber is not None:
                            try:
                                info.transcriber.stop()
                            except Exception:
                                _LOGGER.exception("Transcriber stop")

                        # Signal failure
                        info.transcriber = None
                        # The result is set before the event so it is in
                        # place when result_event fires
                        info.result = Transcription(text="",
                                                    likelihood=0,
                                                    transcribe_seconds=0,
                                                    wav_seconds=0)
                        info.result_event.set()
Beispiel #17
0
async def transcribe_wav(args: argparse.Namespace,
                         core: Voice2JsonCore) -> None:
    """Speech to text from WAV file(s).

    Input is selected from ``args``:

    * ``args.wav_file`` and/or ``args.stdin_files``: WAV paths are taken from
      the command line and, optionally, one per line from stdin. Each result
      gets a ``wav_name`` key (the file name, or the path relative to
      ``args.relative_directory`` when that is set).
    * ``args.input_size``: stdin carries repeated records of a byte-count
      line followed by exactly that many bytes of WAV data. Reading stops
      at an empty count line, a count that is not positive, or EOF.
    * otherwise: all of stdin is treated as a single WAV.

    Every result is emitted with ``print_json``. When the transcriber
    returns nothing, ``Transcription.empty()`` is reported in its place.
    The transcriber is always stopped before returning.
    """
    from rhasspyasr import Transcription

    # Make sure profile has been trained
    assert core.check_trained(), "Not trained"

    # Get speech to text transcriber for profile
    transcriber = core.get_transcriber(open_transcription=args.open,
                                       debug=args.debug)

    # Directory to report WAV file names relative to
    relative_dir = (None if args.relative_directory is None else Path(
        args.relative_directory))

    async def transcribe_to_dict(raw_wav: bytes) -> dict:
        """Convert WAV data if needed, transcribe it, return result as dict."""
        wav_data = await core.maybe_convert_wav(raw_wav)
        transcription = (transcriber.transcribe_wav(wav_data)
                         or Transcription.empty())
        return dataclasses.asdict(transcription)

    try:
        if args.wav_file or args.stdin_files:
            # Read WAV file paths.
            # Guard against a missing list so chaining with stdin still works.
            wav_files = args.wav_file or []
            if args.stdin_files:
                _LOGGER.debug("Reading file paths from stdin")
                wav_files = itertools.chain(wav_files, sys.stdin)

            for wav_path_str in wav_files:
                wav_path_str = wav_path_str.strip()
                if not wav_path_str:
                    # Blank line (e.g. trailing newline on stdin), not a path
                    continue

                # Load and convert
                wav_path = Path(wav_path_str)
                _LOGGER.debug("Transcribing %s", wav_path)

                # Transcribe
                result = await transcribe_to_dict(wav_path.read_bytes())

                if relative_dir is None:
                    # Add name of WAV file to result
                    result["wav_name"] = wav_path.name
                else:
                    # Make relative to some directory
                    result["wav_name"] = str(wav_path.absolute().relative_to(
                        relative_dir.absolute()))

                print_json(result)
        else:
            # Read WAV data from stdin
            _LOGGER.debug("Reading WAV data from stdin")

            if args.input_size:
                # Number of bytes is on separate line
                line = sys.stdin.buffer.readline().strip()
                if not line:
                    return

                num_bytes = int(line)
                while num_bytes > 0:
                    # Read in WAV.
                    # Accumulate partial reads until the full record has
                    # arrived; an empty read means EOF, so stop waiting.
                    chunks = []
                    remaining = num_bytes
                    while remaining > 0:
                        chunk = sys.stdin.buffer.read(remaining)
                        if not chunk:
                            break

                        chunks.append(chunk)
                        remaining -= len(chunk)

                    if remaining > 0:
                        # Input ended in the middle of a record
                        _LOGGER.warning(
                            "Unexpected end of input: missing %s of %s byte(s)",
                            remaining,
                            num_bytes,
                        )
                        break

                    # Transcribe
                    print_json(await transcribe_to_dict(b"".join(chunks)))

                    # Next WAV
                    line = sys.stdin.buffer.readline().strip()
                    if not line:
                        break

                    num_bytes = int(line)
            else:
                # Load, convert, and transcribe entire input
                print_json(await transcribe_to_dict(sys.stdin.buffer.read()))
    finally:
        transcriber.stop()
Beispiel #18
0
    async def async_test_session(self):
        """Verify the messages produced by a well-formed start/stop session."""
        canned = Transcription(
            text="this is a test", likelihood=1, transcribe_seconds=0, wav_seconds=0
        )

        def stub_transcribe(stream, *args):
            """Consume the stream up to its first empty chunk; return the canned transcription."""
            any(not chunk for chunk in stream)
            return canned

        self.transcriber.transcribe_stream = stub_transcribe

        async def last_response(message, **kwargs):
            """Send a message and return the final response (None if there were none)."""
            latest = None
            async for reply in self.hermes.on_message_blocking(message, **kwargs):
                latest = reply

            return latest

        # Opening the session must not produce any response
        start_listening = AsrStartListening(
            site_id=self.site_id,
            session_id=self.session_id,
            stop_on_silence=False,
            send_audio_captured=True,
        )
        self.assertIsNone(await last_response(start_listening))

        # Feeding in "audio" must not produce any response either
        fake_wav_bytes = self.hermes.to_wav_bytes(secrets.token_bytes(100))
        fake_frame = AudioFrame(wav_bytes=fake_wav_bytes)
        self.assertIsNone(await last_response(fake_frame, site_id=self.site_id))

        # Closing the session emits the results
        stop_listening = AsrStopListening(
            site_id=self.site_id, session_id=self.session_id
        )
        results = [
            reply async for reply in self.hermes.on_message_blocking(stop_listening)
        ]

        # Expected messages, in order of emission
        recording_finished = AsrRecordingFinished(
            site_id=self.site_id, session_id=self.session_id
        )
        text_captured = AsrTextCaptured(
            text=canned.text,
            likelihood=canned.likelihood,
            seconds=canned.transcribe_seconds,
            site_id=self.site_id,
            session_id=self.session_id,
        )
        audio_captured = (
            AsrAudioCaptured(wav_bytes=fake_wav_bytes),
            {"site_id": self.site_id, "session_id": self.session_id},
        )

        self.assertEqual(results, [recording_finished, text_captured, audio_captured])
Beispiel #19
0
    def transcribe_stream(
        self,
        audio_stream: typing.Iterable[bytes],
        sample_rate: int,
        sample_width: int,
        channels: int,
    ) -> typing.Optional[Transcription]:
        """Speech to text from an audio stream.

        For NNET3 models, audio chunks are handed to the running decode
        process as they arrive: the sample count is written to the process's
        stdin and the audio bytes to the chunk FIFO. A count of ``0`` marks
        the end of the utterance, after which the first output line that is
        not "ready" is parsed as the result.

        For GMM models, the whole stream is re-packaged as an in-memory WAV
        and passed to ``transcribe_wav``.

        Args:
            audio_stream: iterable of raw audio chunks; empty chunks are
                skipped in the NNET3 path.
            sample_rate: samples per second, used for the WAV header and to
                compute ``wav_seconds``.
            sample_width: bytes per sample.
            channels: number of channels, used for the WAV header.

        Returns:
            The transcription, or ``None`` if the decoder produced no result.

        Raises:
            ValueError: if the model type is neither NNET3 nor GMM.
        """
        if self.model_type == KaldiModelType.NNET3:
            # Use online2-tcp-nnet3-decode-faster
            if not self.decode_proc:
                self.start_decode()

            assert self.decode_proc, "No decode process"

            start_time = time.perf_counter()
            num_frames = 0
            for chunk in audio_stream:
                if chunk:
                    num_samples = len(chunk) // sample_width

                    # Write sample count to process stdin
                    print(num_samples, file=self.decode_proc.stdin)
                    self.decode_proc.stdin.flush()

                    # Write chunk to FIFO.
                    # Make sure that we write exactly the right number of bytes.
                    self.chunk_fifo_file.write(chunk[:num_samples *
                                                     sample_width])
                    self.chunk_fifo_file.flush()
                    num_frames += num_samples

            # Finish utterance
            print("0", file=self.decode_proc.stdin)
            self.decode_proc.stdin.flush()

            _LOGGER.debug("Finished stream. Getting transcription.")

            # Default to "no result" so that a decoder which yields no output
            # line at all falls through to the failure path below instead of
            # leaving the name unbound.
            confidence_and_text = ""
            for line in self.decode_proc.stdout:
                line = line.strip()
                if line.lower() == "ready":
                    continue

                confidence_and_text = line
                break

            _LOGGER.debug(confidence_and_text)

            if confidence_and_text:
                # Success
                end_time = time.perf_counter()

                # <mbr_wer> <word> <word_confidence> <word_start_time> <word_end_time> ...
                wer_str, *words = confidence_and_text.split()
                confidence = 0.0

                try:
                    # Try to parse minimum bayes risk (MBR) word error rate (WER)
                    confidence = max(0, 1 - float(wer_str))
                except ValueError:
                    _LOGGER.exception(wer_str)

                tokens = []
                for word, word_confidence, word_start_time, word_end_time in grouper(
                        words, n=4):
                    tokens.append(
                        TranscriptionToken(
                            token=word,
                            start_time=float(word_start_time),
                            end_time=float(word_end_time),
                            likelihood=float(word_confidence),
                        ))

                text = " ".join(t.token for t in tokens)
                return Transcription(
                    text=text,
                    likelihood=confidence,
                    transcribe_seconds=(end_time - start_time),
                    wav_seconds=(num_frames / sample_rate),
                    tokens=tokens,
                )

            # Failure
            return None

        if self.model_type == KaldiModelType.GMM:
            # No online streaming support.
            # Re-package as a WAV.
            with io.BytesIO() as wav_buffer:
                wav_file: wave.Wave_write = wave.open(wav_buffer, "wb")
                with wav_file:
                    wav_file.setframerate(sample_rate)
                    wav_file.setsampwidth(sample_width)
                    wav_file.setnchannels(channels)

                    for frame in audio_stream:
                        wav_file.writeframes(frame)

                return self.transcribe_wav(wav_buffer.getvalue())

        raise ValueError(f"Unsupported model type: {self.model_type}")