def __init__(
        self,
        api_key: str,
        secret_key: str,
        host: str = client_config["host_stt"],
        port: int = client_config["port"],
        ssl_channel: bool = True,
        ca_file: str = None,
        uploader_config: dict = None
):
    """
    Build an async client for speech recognition.

    :param api_key: client public api key
    :param secret_key: client secret api key
    :param host: Tinkoff Voicekit speech recognition host url
    :param port: Tinkoff Voicekit speech recognition port, default value: 443
    :param ssl_channel: enable ssl for the underlying channel
    :param ca_file: optional certificate file
    :param uploader_config: config for Uploader
    """
    super().__init__(host, port, ssl_channel, ca_file)
    self._api_key = api_key
    self._secret_key = secret_key
    self._metadata = Metadata(api_key, secret_key, aud=aud["stt"])
    self._stub = SpeechToTextStub(self._channel)
    # Avoid a mutable default argument: fall back to an empty config here.
    if uploader_config is None:
        uploader_config = {}
    self._uploader = Uploader(self._api_key, self._secret_key, **uploader_config)
def __init__(self,
             api_key: str,
             secret_key: str,
             host: str = client_config["host_tts"],
             port: int = client_config["port"],
             ca_file: str = None):
    """
    Build a client for speech synthesis.

    :param api_key: client public api key
    :param secret_key: client secret api key
    :param host: Tinkoff Voicekit speech synthesize host url
    :param port: Tinkoff Voicekit speech synthesize port, default value: 443
    :param ca_file: optional certificate file
    """
    super().__init__()
    configuration()
    self._channel = self._make_channel(host, port, ca_file)
    self._metadata = Metadata(api_key, secret_key, aud="TTS")
    self._stub = TextToSpeechStub(self._channel)
def __init__(self,
             api_key: str,
             secret_key: str,
             host: str = client_config["host_stt"],
             port: int = client_config["port"],
             ca_file: str = None):
    """
    Build a client for speech recognition.

    :param api_key: client public api key
    :param secret_key: client secret api key
    :param host: Tinkoff Voicekit speech recognition host url
    :param port: Tinkoff Voicekit speech recognition port, default value: 443
    :param ca_file: optional certificate file
    """
    super().__init__()
    self._api_key = api_key
    self._secret_key = secret_key
    self._channel = self._make_channel(host, port, ca_file)
    self._metadata = Metadata(api_key, secret_key, aud="STT")
    self._stub = SpeechToTextStub(self._channel)
def __init__(self,
             api_key: str,
             secret_key: str,
             host: str = client_config["host_operations"],
             port: int = client_config["port"],
             ssl_channel: bool = True,
             ca_file: str = None):
    """
    Create client for long running operations.

    :param api_key: client public api key
    :param secret_key: client secret api key
    :param host: Tinkoff Voicekit speech operations host url
    :param port: Tinkoff Voicekit speech operations port, default value: 443
    :param ssl_channel: enable ssl for the underlying channel
    :param ca_file: optional certificate file
    """
    super().__init__(host, port, ssl_channel, ca_file)
    # Pass the audience by keyword for consistency with the sibling
    # client constructors, which use aud=aud[...] / aud="...".
    self._metadata = Metadata(api_key, secret_key, aud=aud["operations"])
    self._api_key = api_key
    self._secret_key = secret_key
    self._stub = OperationsStub(self._channel)
class ClientTTS(BaseClient):
    """Client for Tinkoff Voicekit speech synthesis (text-to-speech)."""

    # JSON Schema building blocks referenced by the config schema below.
    definitions = {
        "AudioEncoding": {
            "type": "string",
            "enum": ["LINEAR16", "RAW_OPUS"]
        },
        "SynthesisInput": {
            "type": "object",
            "properties": {
                "text": {
                    "type": "string"
                }
            }
        }
    }

    # Schema for the `config` dict accepted by streaming_synthesize.
    streaming_synthesize_config_schema = {
        "type": "object",
        "definitions": definitions,
        "properties": {
            "audio_encoding": {
                "$ref": "#/definitions/AudioEncoding"
            },
            "speaking_rate": {
                "type": "number"
            },
            "sample_rate_hertz": {
                "type": "number"
            }
        },
        "required": [
            "sample_rate_hertz",
            "audio_encoding",
        ],
        "additionalProperties": False
    }

    def __init__(self,
                 api_key: str,
                 secret_key: str,
                 host: str = client_config["host_tts"],
                 port: int = client_config["port"],
                 ca_file: str = None):
        """
        Create client for speech synthesis.

        :param api_key: client public api key
        :param secret_key: client secret api key
        :param host: Tinkoff Voicekit speech synthesize host url
        :param port: Tinkoff Voicekit speech synthesize port, default value: 443
        :param ca_file: optional certificate file
        """
        super().__init__()
        configuration()
        self._metadata = Metadata(api_key, secret_key, aud="TTS")
        self._channel = self._make_channel(host, port, ca_file)
        self._stub = TextToSpeechStub(self._channel)

    def streaming_synthesize(self,
                             text_source: str,
                             config: dict,
                             text_encoding: str = "utf-8",
                             ssml: bool = False):
        """
        Yield a StreamingSynthesize response stream for each text line in
        the file (or for the given text string).

        :param text_source: path to file with text or string with text
        :param config: dict conforming to streaming_synthesize_config_schema
        :param text_encoding: text encoding
        :param ssml: enable ssml text source
        :raises: jsonschema.ValidationError if config does not match the schema
        """
        validate(config, ClientTTS.streaming_synthesize_config_schema)
        generate_utterances = get_utterance_generator(text_source)

        request = SynthesizeSpeechRequest()
        request.audio_config.CopyFrom(get_config(config))
        for text in generate_utterances(text_source, text_encoding):
            # The JWT may expire while iterating over a long text source.
            if not self._metadata.is_fresh_jwt():
                self._metadata.refresh_jwt()
            if ssml:
                request.input.ssml = text
            else:
                request.input.text = text
            yield self._stub.StreamingSynthesize(request, metadata=self._metadata.metadata)

    def synthesize_to_audio_wav(self,
                                text_source: str,
                                config: dict,
                                output_dir: str = os.curdir,
                                text_encoding: str = "utf-8",
                                ssml: bool = False):
        """
        Generate audio for each text line from your text source and save
        it in wav format (one file per utterance: 0.wav, 1.wav, ...).

        :param text_source: path to file with text or string with text
        :param config: dict conforming to streaming_synthesize_config_schema
        :param output_dir: path to output directory where to store synthesized audio
        :param text_encoding: text encoding
        :param ssml: enable ssml text source
        """
        rows_responses = self.streaming_synthesize(text_source, config, text_encoding, ssml)
        os.makedirs(output_dir, exist_ok=True)
        # The decoder depends only on the config, so build it once instead
        # of rebuilding it on every utterance (was inside the loop).
        get_chunk = get_encoder(config["audio_encoding"], config["sample_rate_hertz"])
        for index, row_response in enumerate(rows_responses):
            audio_chunks = []
            for response in row_response:
                audio_chunks += get_chunk(response.audio_chunk)
            save_synthesize_wav(
                bytes(audio_chunks),
                os.path.join(output_dir, f"{index}.wav"),
                config["sample_rate_hertz"]
            )
class ClientSTT(BaseClient):
    """Client for Tinkoff Voicekit speech recognition (speech-to-text)."""

    # JSON Schema building blocks referenced by the config schemas below.
    definitions = {
        "StringArray": {
            "type": "array",
            "items": {
                "type": "string",
            }
        },
        "AudioEncoding": {
            "type": "string",
            "enum": [
                "LINEAR16",
                "ALAW",
                "MULAW",
                "LINEAR32F",
                "RAW_OPUS",
                "MPEG_AUDIO"
            ]
        },
        "VoiceActivityDetectionConfig": {
            "type": "object",
            "properties": {
                "min_speech_duration": {
                    "type": "number"
                },
                "max_speech_duration": {
                    "type": "number"
                },
                "silence_duration_threshold": {
                    "type": "number"
                },
                "silence_prob_threshold": {
                    "type": "number"
                },
                "aggressiveness": {
                    "type": "number"
                },
            }
        },
        "SpeechContext": {
            "type": "object",
            "properties": {
                # Fixed: refs were "#definitions/StringArray" (missing "/"
                # after "#"), an invalid JSON Pointer that cannot resolve to
                # the StringArray definition during validation.
                "phrases": {
                    "$ref": "#/definitions/StringArray"
                },
                "words": {
                    "$ref": "#/definitions/StringArray"
                }
            }
        },
        "InterimResultsConfig": {
            "type": "object",
            "properties": {
                "enable_interim_results": {
                    "type": "boolean"
                },
                "interval": {
                    "type": "number"
                }
            }
        }
    }

    # Schema for the `config` dict accepted by recognize.
    recognition_config_schema = {
        "type": "object",
        "definitions": definitions,
        "properties": {
            "encoding": {
                "$ref": "#/definitions/AudioEncoding"
            },
            "sample_rate_hertz": {
                "type": "number"
            },
            "language_code": {
                "type": "string"
            },
            "max_alternatives": {
                "type": "number"
            },
            "speech_contexts": {
                "type": "array",
                "items": {
                    "$ref": "#/definitions/SpeechContext"
                }
            },
            "enable_automatic_punctuation": {
                "type": "boolean"
            },
            "model": {
                "type": "string"
            },
            "num_channels": {
                "type": "number"
            },
            "do_not_perform_vad": {
                "type": "boolean"
            },
            "vad_config": {
                "$ref": "#/definitions/VoiceActivityDetectionConfig"
            }
        },
        "required": [
            "sample_rate_hertz",
            "num_channels",
            "encoding",
        ],
        "additionalProperties": False
    }

    # Schema for the `config` dict accepted by streaming_recognize.
    streaming_recognition_config_schema = {
        "type": "object",
        "definitions": definitions,
        "properties": {
            "config": recognition_config_schema,
            "single_utterance": {
                "type": "boolean"
            },
            "interim_results_config": {
                "$ref": "#/definitions/InterimResultsConfig"
            }
        },
        "additionalProperties": False
    }

    def __init__(self,
                 api_key: str,
                 secret_key: str,
                 host: str = client_config["host_stt"],
                 port: int = client_config["port"],
                 ca_file: str = None):
        """
        Create client for speech recognition.

        :param api_key: client public api key
        :param secret_key: client secret api key
        :param host: Tinkoff Voicekit speech recognition host url
        :param port: Tinkoff Voicekit speech recognition port, default value: 443
        :param ca_file: optional certificate file
        """
        super().__init__()
        self._metadata = Metadata(api_key, secret_key, aud="STT")
        self._api_key = api_key
        self._secret_key = secret_key
        self._channel = self._make_channel(host, port, ca_file)
        self._stub = SpeechToTextStub(self._channel)

    def recognize(self, source, config):
        """
        Recognize whole audio and then return all responses.

        :param source: path to audio file or buffer with audio
        :param config: dict conforming to recognition_config_schema
        :raises: jsonschema.ValidationError if config does not match the schema
        """
        validate(config, ClientSTT.recognition_config_schema)
        buffer = get_buffer(source)

        if not self._metadata.is_fresh_jwt():
            self._metadata.refresh_jwt()
        response = self._stub.Recognize(get_proto_request(buffer, config),
                                        metadata=self._metadata.metadata)
        return MessageToDict(response,
                             including_default_value_fields=True,
                             preserving_proto_field_name=True)["results"]

    def streaming_recognize(self, source, config):
        """
        Recognize audio in streaming mode.
        Stream audio chunks to server and get streaming responses.

        :param source: path to audio file or audio stream
        :param config: dict conforming to streaming_recognition_config_schema
        :raises: jsonschema.ValidationError if config does not match the schema
        """
        validate(config, ClientSTT.streaming_recognition_config_schema)
        buffer = get_buffer(source)

        if not self._metadata.is_fresh_jwt():
            self._metadata.refresh_jwt()
        responses = self._stub.StreamingRecognize(
            create_stream_requests(buffer, config),
            metadata=self._metadata.metadata
        )
        return dict_generator(responses)