コード例 #1
0
    def __init__(self,
                 api_key: str,
                 secret_key: str,
                 host: str = client_config["host_stt"],
                 port: int = client_config["port"],
                 ssl_channel: bool = True,
                 ca_file: str = None,
                 uploader_config: dict = None):
        """
        Create async client for speech recognition.
            :param api_key: client public api key
            :param secret_key: client secret api key
            :param host: Tinkoff Voicekit speech recognition host url
            :param port: Tinkoff Voicekit speech recognition port, default value: 443
            :param ssl_channel: use a secure channel, default True
            :param ca_file: optional certificate file
            :param uploader_config: config for Uploader
        """
        super().__init__(host, port, ssl_channel, ca_file)
        self._api_key = api_key
        self._secret_key = secret_key
        self._metadata = Metadata(api_key, secret_key, aud=aud["stt"])
        self._stub = SpeechToTextStub(self._channel)

        # avoid a shared mutable default by normalizing None here
        if uploader_config is None:
            uploader_config = {}
        self._uploader = Uploader(api_key, secret_key, **uploader_config)
コード例 #2
0
 def __init__(self,
              api_key: str,
              secret_key: str,
              host: str = client_config["host_tts"],
              port: int = client_config["port"],
              ca_file: str = None):
     """
     Create client for speech synthesis.
         :param api_key: client public api key
         :param secret_key: client secret api key
         :param host: Tinkoff Voicekit speech synthesize host url
         :param port: Tinkoff Voicekit speech synthesize port, default value: 443
         :param ca_file: optional certificate file
     """
     super().__init__()
     configuration()
     # gRPC channel first, then the stub bound to it
     self._channel = self._make_channel(host, port, ca_file)
     self._stub = TextToSpeechStub(self._channel)
     self._metadata = Metadata(api_key, secret_key, aud="TTS")
コード例 #3
0
 def __init__(self,
              api_key: str,
              secret_key: str,
              host: str = client_config["host_stt"],
              port: int = client_config["port"],
              ca_file: str = None):
     """
     Create client for speech recognition.
         :param api_key: client public api key
         :param secret_key: client secret api key
         :param host: Tinkoff Voicekit speech recognition host url
         :param port: Tinkoff Voicekit speech recognition port, default value: 443
         :param ca_file: optional certificate file
     """
     super().__init__()
     # keys are kept so other methods can re-authenticate later
     self._api_key = api_key
     self._secret_key = secret_key
     self._channel = self._make_channel(host, port, ca_file)
     self._metadata = Metadata(api_key, secret_key, aud="STT")
     self._stub = SpeechToTextStub(self._channel)
コード例 #4
0
 def __init__(self,
              api_key: str,
              secret_key: str,
              host: str = client_config["host_operations"],
              port: int = client_config["port"],
              ssl_channel: bool = True,
              ca_file: str = None):
     """
     Create client for long running operations.
         :param api_key: client public api key
         :param secret_key: client secret api key
         :param host: Tinkoff Voicekit speech operations host url
         :param port: Tinkoff Voicekit speech operations port, default value: 443
         :param ssl_channel: use a secure channel, default True
         :param ca_file: optional certificate file
     """
     super().__init__(host, port, ssl_channel, ca_file)
     # pass "aud" by keyword for consistency with the other clients
     self._metadata = Metadata(api_key, secret_key, aud=aud["operations"])
     self._api_key = api_key
     self._secret_key = secret_key
     self._stub = OperationsStub(self._channel)
コード例 #5
0
class ClientTTS(BaseClient):
    """
    Client for Tinkoff Voicekit speech synthesis (text-to-speech).

    Holds the gRPC channel/stub and validates synthesis configs against
    ``streaming_synthesize_config_schema`` before sending requests.
    """

    # JSON-schema fragments shared by the config schema below.
    definitions = {
        "AudioEncoding": {
            "type": "string",
            "enum": ["LINEAR16", "RAW_OPUS"]
        },
        "SynthesisInput": {
            "type": "object",
            "properties": {
                "text": {
                    "type": "string"
                }
            }
        }
    }

    streaming_synthesize_config_schema = {
        "type": "object",
        "definitions": definitions,
        "properties": {
            "audio_encoding": {
                "$ref": "#/definitions/AudioEncoding"
            },
            "speaking_rate": {
                "type": "number"
            },
            "sample_rate_hertz": {
                "type": "number"
            }
        },
        "required": [
            "sample_rate_hertz",
            "audio_encoding",
        ],
        "additionalProperties": False
    }

    def __init__(self,
                 api_key: str,
                 secret_key: str,
                 host: str = client_config["host_tts"],
                 port: int = client_config["port"],
                 ca_file: str = None):
        """
        Create client for speech synthesis.
            :param api_key: client public api key
            :param secret_key: client secret api key
            :param host: Tinkoff Voicekit speech synthesize host url
            :param port: Tinkoff Voicekit speech synthesize port, default value: 443
            :param ca_file: optional certificate file
        """
        super().__init__()
        configuration()
        self._metadata = Metadata(api_key, secret_key, aud="TTS")
        self._channel = self._make_channel(host, port, ca_file)
        self._stub = TextToSpeechStub(self._channel)

    def streaming_synthesize(self,
                             text_source: str,
                             config: dict,
                             text_encoding: str = "utf-8",
                             ssml: bool = False):
        """
        Description:
        return generator by StreamingSynthesizeSpeechResponses from each text line in file or text string.
            :param text_source: path to file with text or string with text
            :param config: dict conforming to streaming_synthesize_config_schema
            :param text_encoding: text encoding
            :param ssml: enable ssml text source
        """
        validate(config, ClientTTS.streaming_synthesize_config_schema)

        generate_utterances = get_utterance_generator(text_source)

        # one request object is reused; only its input text changes per utterance
        request = SynthesizeSpeechRequest()
        request.audio_config.CopyFrom(get_config(config))

        for text in generate_utterances(text_source, text_encoding):
            # refresh the auth token lazily, just before each call
            if not self._metadata.is_fresh_jwt():
                self._metadata.refresh_jwt()

            if ssml:
                request.input.ssml = text
            else:
                request.input.text = text
            yield self._stub.StreamingSynthesize(
                request, metadata=self._metadata.metadata)

    def synthesize_to_audio_wav(self,
                                text_source: str,
                                config: dict,
                                output_dir: str = os.curdir,
                                text_encoding: str = "utf-8",
                                ssml: bool = False):
        """
        Description:
        Generate audio for each text line from your text source and save it in wav format.
            :param text_source: path to file with text or string with text
            :param config: dict conforming to streaming_synthesize_config_schema
            :param output_dir: path to output directory where to store synthesized audio
            :param text_encoding: text encoding
            :param ssml: enable ssml text source
        """
        rows_responses = self.streaming_synthesize(text_source, config,
                                                   text_encoding, ssml)
        os.makedirs(output_dir, exist_ok=True)

        # the encoder depends only on the config, so build it once, not per utterance
        get_chunk = get_encoder(config["audio_encoding"],
                                config["sample_rate_hertz"])

        for index, row_response in enumerate(rows_responses):
            audio_chunks = []
            for response in row_response:
                audio_chunks += get_chunk(response.audio_chunk)

            # each utterance is written as <index>.wav in output_dir
            save_synthesize_wav(bytes(audio_chunks),
                                os.path.join(output_dir, f"{index}.wav"),
                                config["sample_rate_hertz"])
コード例 #6
0
class ClientSTT(BaseClient):
    """
    Client for Tinkoff Voicekit speech recognition (speech-to-text).

    Holds the gRPC channel/stub and validates recognition configs against the
    JSON schemas below before sending requests.
    """

    # JSON-schema fragments shared by the config schemas below.
    definitions = {
        "StringArray": {
            "type": "array",
            "items": {
                "type": "string",
            }
        },
        "AudioEncoding": {
            "type":
            "string",
            "enum": [
                "LINEAR16", "ALAW", "MULAW", "LINEAR32F", "RAW_OPUS",
                "MPEG_AUDIO"
            ]
        },
        "VoiceActivityDetectionConfig": {
            "type": "object",
            "properties": {
                "min_speech_duration": {
                    "type": "number"
                },
                "max_speech_duration": {
                    "type": "number"
                },
                "silence_duration_threshold": {
                    "type": "number"
                },
                "silence_prob_threshold": {
                    "type": "number"
                },
                "aggressiveness": {
                    "type": "number"
                },
            }
        },
        "SpeechContext": {
            "type": "object",
            "properties": {
                "phrases": {
                    # was "#definitions/StringArray": missing "/" makes it an
                    # unresolvable JSON Pointer fragment
                    "$ref": "#/definitions/StringArray"
                },
                "words": {
                    "$ref": "#/definitions/StringArray"
                }
            }
        },
        "InterimResultsConfig": {
            "type": "object",
            "properties": {
                "enable_interim_results": {
                    "type": "boolean"
                },
                "interval": {
                    "type": "number"
                }
            }
        }
    }

    recognition_config_schema = {
        "type": "object",
        "definitions": definitions,
        "properties": {
            "encoding": {
                "$ref": "#/definitions/AudioEncoding"
            },
            "sample_rate_hertz": {
                "type": "number"
            },
            "language_code": {
                "type": "string"
            },
            "max_alternatives": {
                "type": "number"
            },
            "speech_contexts": {
                "type": "array",
                "items": {
                    "$ref": "#/definitions/SpeechContext"
                }
            },
            "enable_automatic_punctuation": {
                "type": "boolean"
            },
            "model": {
                "type": "string"
            },
            "num_channels": {
                "type": "number"
            },
            "do_not_perform_vad": {
                "type": "boolean"
            },
            "vad_config": {
                "$ref": "#/definitions/VoiceActivityDetectionConfig"
            }
        },
        "required": [
            "sample_rate_hertz",
            "num_channels",
            "encoding",
        ],
        "additionalProperties": False
    }

    streaming_recognition_config_schema = {
        "type": "object",
        "definitions": definitions,
        "properties": {
            "config": recognition_config_schema,
            "single_utterance": {
                "type": "boolean"
            },
            "interim_results_config": {
                "$ref": "#/definitions/InterimResultsConfig"
            }
        },
        "additionalProperties": False
    }

    def __init__(self,
                 api_key: str,
                 secret_key: str,
                 host: str = client_config["host_stt"],
                 port: int = client_config["port"],
                 ca_file: str = None):
        """
        Create client for speech recognition.
            :param api_key: client public api key
            :param secret_key: client secret api key
            :param host: Tinkoff Voicekit speech recognition host url
            :param port: Tinkoff Voicekit speech recognition port, default value: 443
            :param ca_file: optional certificate file
        """
        super().__init__()
        self._metadata = Metadata(api_key, secret_key, aud="STT")
        self._api_key = api_key
        self._secret_key = secret_key
        self._channel = self._make_channel(host, port, ca_file)
        self._stub = SpeechToTextStub(self._channel)

    def recognize(self, source, config):
        """
        Recognize whole audio and then return all responses.
            :param source: path to audio file or buffer with audio
            :param config: dict conforming to recognition_config_schema
        """
        validate(config, ClientSTT.recognition_config_schema)
        buffer = get_buffer(source)

        # refresh the auth token lazily, just before the call
        if not self._metadata.is_fresh_jwt():
            self._metadata.refresh_jwt()

        response = self._stub.Recognize(get_proto_request(buffer, config),
                                        metadata=self._metadata.metadata)

        return MessageToDict(response,
                             including_default_value_fields=True,
                             preserving_proto_field_name=True)["results"]

    def streaming_recognize(self, source, config):
        """
        Recognize audio in streaming mode.
        Stream audio chunks to server and get streaming responses.
            :param source: path to audio file or audio stream
            :param config: dict conforming to streaming_recognition_config_schema
        """
        validate(config, ClientSTT.streaming_recognition_config_schema)
        buffer = get_buffer(source)

        if not self._metadata.is_fresh_jwt():
            self._metadata.refresh_jwt()

        responses = self._stub.StreamingRecognize(
            create_stream_requests(buffer, config),
            metadata=self._metadata.metadata)

        return dict_generator(responses)