def text2audio(self, text):
    """Synthesize *text* through the streaming TTS API and return Ogg/Opus bytes.

    Opens a fresh secure channel per call, streams the synthesized audio
    chunks into an in-memory Ogg/Opus container, and returns its contents.
    """
    channel = grpc.secure_channel(self._endpoint, grpc.ssl_channel_credentials())
    stub = tts_pb2_grpc.TextToSpeechStub(channel)
    request = self._build_request(text)
    metadata = authorization_metadata(self._api_key, self._secret_key,
                                      "tinkoff.cloud.tts")
    responses = stub.StreamingSynthesize(request, metadata=metadata)

    # The server advertises the expected sample count in its initial metadata.
    for key, value in responses.initial_metadata():
        if key == "x-audio-num-samples":
            print("Estimated audio duration is " +
                  str(int(value) / self._sample_rate) + " seconds")
            break

    buffer = io.BytesIO()
    writer = pyogg.OggOpusWriter(buffer)
    writer.set_application("audio")
    writer.set_sampling_frequency(self._sample_rate)
    writer.set_channels(1)
    writer.set_frame_size(20)  # frame size in milliseconds

    for chunk in responses:
        writer.encode(chunk.audio_chunk)
    # Closing flushes the encoder so the buffer holds the complete stream.
    writer.close()

    buffer.seek(0)
    return buffer.getvalue()
def main():
    """Recognize an audio file via the REST endpoint and print the result."""
    args = BaseRecognitionParser().parse_args()
    if args.encoding == stt_pb2.RAW_OPUS:
        raise ValueError("RAW_OPUS encoding is not supported by this script")
    with audio_open_read(args.audio_file, args.encoding, args.rate,
                         args.num_channels, args.chunk_size,
                         args.pyaudio_max_seconds) as reader:
        headers = authorization_metadata(args.api_key, args.secret_key,
                                         "tinkoff.cloud.stt", type=dict)
        payload = build_recognition_request(args, reader, type="json")
        # Endpoints on port 443 get https, everything else plain http.
        scheme_suffix = "s" if args.endpoint.endswith("443") else ""
        url = "http{}://{}/v1/stt:recognize".format(scheme_suffix, args.endpoint)
        response = requests.post(url, json=payload, headers=headers)
        if response.status_code != 200:
            print(
                "REST failed with HTTP code {}\nHeaders: {}\nBody: {}".format(
                    response.status_code, response.headers, response.text))
            return
        print_recognition_response(response.json())
def text2speach(text, tmp_dir='./tmp'):
    """Synthesize *text* to Ogg audio bytes via gRPC TTS and an ffmpeg transcode.

    NOTE(review): relies on module-level `sample_rate`, `endpoint`, `api_key`,
    `secret_key` and `build_request` defined elsewhere in this file.
    Returns the raw bytes of the transcoded .oga file.
    """
    # Microsecond timestamp keeps concurrent temp file names unique.
    stamp = int(time.time() * 10**6)
    wav_path = os.path.join(tmp_dir, '{}.wav'.format(stamp))
    oga_path = os.path.join(tmp_dir, '{}.oga'.format(stamp))

    with wave.open(wav_path, "wb") as wav_file:
        wav_file.setframerate(sample_rate)
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)  # 16-bit PCM
        channel = grpc.secure_channel(endpoint, grpc.ssl_channel_credentials())
        stub = tts_pb2_grpc.TextToSpeechStub(channel)
        metadata = authorization_metadata(api_key, secret_key,
                                          "tinkoff.cloud.tts")
        for response in stub.StreamingSynthesize(build_request(text),
                                                 metadata=metadata):
            wav_file.writeframes(response.audio_chunk)

    # Transcode the PCM WAV into an .oga container via ffmpeg.
    ffmpeg.run(ffmpeg.output(ffmpeg.input(wav_path), oga_path),
               overwrite_output=True)

    with open(oga_path, 'rb') as oga_file:
        payload = oga_file.read()
    os.remove(oga_path)
    os.remove(wav_path)
    return payload
def main():
    """Stream an audio file to the STT service and print streaming responses."""
    args = StreamingRecognitionParser().parse_args()
    with audio_open_read(args.audio_file, args.encoding, args.rate,
                         args.num_channels, args.chunk_size,
                         args.pyaudio_max_seconds) as reader:
        stub = stt_pb2_grpc.SpeechToTextStub(make_channel(args))
        auth = authorization_metadata(args.api_key, args.secret_key,
                                      "tinkoff.cloud.stt")
        request_iter = stt_generate_requests(args, reader)
        print_streaming_recognition_responses(
            stub.StreamingRecognize(request_iter, metadata=auth))
def main():
    """Recognize an audio file with the non-streaming gRPC Recognize call.

    Fix: sibling scripts in this file compare ``args.encoding`` against the
    ``stt_pb2.RAW_OPUS`` enum value; the previous comparison against the
    string ``"RAW_OPUS"`` could never match an enum-valued encoding, making
    the guard dead.  TODO confirm the parser yields the enum, not a string.
    """
    args = RecognitionParser().parse_args()
    if args.encoding == stt_pb2.RAW_OPUS:  # consistent with the other scripts
        raise ValueError("RAW_OPUS encoding is not supported by this script")
    stub = stt_pb2_grpc.SpeechToTextStub(make_channel(args.host, args.port))
    metadata = authorization_metadata(args.api_key, args.secret_key,
                                      "tinkoff.cloud.stt")
    response = stub.Recognize(build_recognition_request(args),
                              metadata=metadata)
    print_recognition_response(response)
def main():
    """Recognize an audio file with the non-streaming gRPC Recognize call."""
    args = BaseRecognitionParser().parse_args()
    if args.encoding == stt_pb2.RAW_OPUS:
        raise ValueError("RAW_OPUS encoding is not supported by this script")
    with audio_open_read(args.audio_file, args.encoding, args.rate,
                         args.num_channels, args.chunk_size,
                         args.pyaudio_max_seconds) as reader:
        stub = stt_pb2_grpc.SpeechToTextStub(make_channel(args))
        auth = authorization_metadata(args.api_key, args.secret_key,
                                      "tinkoff.cloud.stt")
        print_recognition_response(
            stub.Recognize(build_recognition_request(args, reader),
                           metadata=auth))
def __init__(self, phrase):
    """Kick off streaming synthesis of *phrase*.

    Every space is replaced with '. ' before synthesis — presumably to force
    pauses between words; confirm against the intended voice behavior.
    """
    phrase = phrase.replace(' ', '. ')
    self._ssml = '<speak><p>' + phrase + '</p></speak>'
    # Plain-text variant with any angle-bracket tags stripped.
    self._text = re.sub(r'\<[^>]*\>', '', phrase)

    channel = grpc.secure_channel(ENDPOINT, grpc.ssl_channel_credentials())
    stub = tts_pb2_grpc.TextToSpeechStub(channel)
    metadata = authorization_metadata(API_KEY, SECRET_KEY, "tinkoff.cloud.tts")

    synthesis_input = tts_pb2.SynthesisInput(text=self._text, ssml=self._ssml)
    audio_config = tts_pb2.AudioConfig(audio_encoding=tts_pb2.LINEAR16,
                                       speaking_rate=1,
                                       sample_rate_hertz=SAMPLE_RATE)
    request = tts_pb2.SynthesizeSpeechRequest(input=synthesis_input,
                                              audio_config=audio_config)
    self._responses = stub.StreamingSynthesize(request, metadata=metadata)
def main():
    """Synthesize the CLI-provided text and write audio chunks to a file."""
    args = BaseSynthesisParser().parse_args()
    if args.encoding == tts_pb2.LINEAR16 and args.rate != 48000:
        raise ValueError("LINEAR16 supports only 48kHz for now, use RAW_OPUS")
    with audio_open_write(args.output_file, args.encoding,
                          args.rate) as audio_writer:
        stub = tts_pb2_grpc.TextToSpeechStub(make_channel(args))
        request = build_synthesis_request(args, args.input_text)
        auth = authorization_metadata(args.api_key, args.secret_key,
                                      "tinkoff.cloud.tts")
        for chunk in stub.StreamingSynthesize(request, metadata=auth):
            audio_writer.write(chunk.audio_chunk)
def main():
    """Stream recognition requests and pretty-print every streaming result."""
    args = StreamingRecognitionParser().parse_args()
    stub = stt_pb2_grpc.SpeechToTextStub(make_channel(args.host, args.port))
    auth = authorization_metadata(args.api_key, args.secret_key,
                                  "tinkoff.cloud.stt")
    for response in stub.StreamingRecognize(generate_requests(args),
                                            metadata=auth):
        for result in response.results:
            recog = result.recognition_result
            print("Channel", recog.channel)
            print("Phrase start", recog.start_time.ToTimedelta())
            print("Phrase end", recog.end_time.ToTimedelta())
            print("Is final", result.is_final)
            for alternative in recog.alternatives:
                print("Transcription", alternative.transcript)
                print("Confidence", alternative.confidence)
            print("------------------")
def __init__(self, recognize):
    """Open a streaming-recognition call configured for 16 kHz mono LINEAR16."""
    self._recognize = recognize
    request = stt_pb2.StreamingRecognizeRequest()
    # Alias the nested config message to keep the assignments readable.
    config = request.streaming_config.config
    config.encoding = stt_pb2.AudioEncoding.LINEAR16
    config.sample_rate_hertz = 16000
    config.num_channels = 1
    config.enable_denormalization = True
    config.enable_automatic_punctuation = True
    config.vad_config.silence_duration_threshold = 1.20
    request.streaming_config.single_utterance = True
    # Bias recognition toward the caller-supplied context words.
    config.speech_contexts.append(stt_pb2.SpeechContext(phrases=[
        stt_pb2.SpeechContextPhrase(text=text, score=10.0)
        for text in self._recognize.context_words
    ]))
    metadata = authorization_metadata(API_KEY, SECRET_KEY, "tinkoff.cloud.stt")
    stub = stt_pb2_grpc.SpeechToTextStub(
        grpc.secure_channel(ENDPOINT, grpc.ssl_channel_credentials()))
    self._responses = stub.StreamingRecognize(self.requests(request),
                                              metadata=metadata)
def main():
    """EAGI entry point: log the call to MySQL, publish it, then stream STT.

    Fixes: ``!= None`` / ``== None`` replaced with identity comparisons
    (``is not None`` / ``is None``) per PEP 8; behavior is unchanged.
    """
    agi = AGI()
    # `agi` is always constructed above; the else-branch is only reachable
    # when the AGI() line is swapped for `agi = None` during local debugging.
    if agi is not None:
        agi.verbose("EAGI script started...")
        ani = agi.env['agi_callerid']
        uid = agi.env['agi_uniqueid']
        agi.verbose("Call answered from: %s with id %s" % (ani, uid))
    else:
        ani = ""
        uid = str(uuid.uuid4())
    try:
        with dbcon.cursor() as cursor:
            sql = "INSERT INTO calls SET uniqueid=%s,callerid=%s,calldate=NOW()"
            cursor.execute(sql, (uid, ani))
            call_id = cursor.lastrowid
    finally:
        # Commit even on failure so earlier statements are not left pending.
        dbcon.commit()
    data = {
        "type": "call",
        # NOTE(review): "unqueid" looks like a typo for "uniqueid", but the
        # key is part of the published payload — confirm no consumer depends
        # on the misspelling before renaming it.
        "unqueid": uid,
        "callerid": ani[-4:],  # publish only the last four digits
        "calldate": time.strftime('%Y-%m-%d %H:%M:%S'),
        "call_id": call_id
    }
    client.publish(cent_channel, data)
    if agi is None:
        ic(data)
    args = StreamingRecognitionParser().parse_args()
    stub = stt_pb2_grpc.SpeechToTextStub(make_channel(args))
    metadata = authorization_metadata(cfg.api_key, cfg.secret_key,
                                      "tinkoff.cloud.stt")
    responses = stub.StreamingRecognize(generate_requests(args, agi),
                                        metadata=metadata)
    save_streaming_recognition_responses(responses, agi, ani, uid, call_id)
def main():
    """Recognize a file and print the concatenated transcript of all results."""
    args = BaseRecognitionParser().parse_args()
    if args.encoding == stt_pb2.RAW_OPUS:
        raise ValueError("RAW_OPUS encoding is not supported by this script")
    with audio_open_read(args.audio_file, args.encoding, args.rate,
                         args.num_channels, args.chunk_size,
                         args.pyaudio_max_seconds) as reader:
        stub = stt_pb2_grpc.SpeechToTextStub(make_channel(args))
        auth = authorization_metadata(args.api_key, args.secret_key,
                                      "tinkoff.cloud.stt")
        response = stub.Recognize(build_recognition_request(args, reader),
                                  metadata=auth)
        if not isinstance(response, dict):
            # Convert protobuf -> plain dict, keeping proto field names:
            # https://developers.google.com/protocol-buffers/docs/proto3#json
            response = MessageToDict(response,
                                     including_default_value_fields=True,
                                     preserving_proto_field_name=True)
        pieces = [alternative["transcript"]
                  for result in response["results"]
                  for alternative in result["alternatives"]]
        print("".join(pieces))
def speach2text(url, tmp_dir='./tmp'):
    """Download Ogg audio from *url*, transcode to WAV, and return the first
    recognized transcript.

    Fixes: the HTTP response is now closed via a context manager, and the
    temp files are removed even when the transcode or recognition raises
    (previously they leaked on any error).
    """
    # Microsecond timestamp keeps concurrent temp file names unique.
    ts = int(time.time() * 10**6)
    wav_tmp = os.path.join(tmp_dir, '{}.wav'.format(ts))
    oga_tmp = os.path.join(tmp_dir, '{}.oga'.format(ts))
    try:
        with urlopen(url) as r, open(oga_tmp, 'wb') as f:
            f.write(r.read())
        # Transcode the downloaded .oga into WAV for recognition.
        stream = ffmpeg.output(ffmpeg.input(oga_tmp), wav_tmp)
        ffmpeg.run(stream, overwrite_output=True)
        stub = stt_pb2_grpc.SpeechToTextStub(
            grpc.secure_channel(endpoint, grpc.ssl_channel_credentials()))
        metadata = authorization_metadata(api_key, secret_key,
                                          "tinkoff.cloud.stt")
        responses = stub.StreamingRecognize(generate_requests(wav_tmp),
                                            metadata=metadata)
        # Only the first response's top alternative is returned.
        return next(
            responses).results[0].recognition_result.alternatives[0].transcript
    finally:
        # Best-effort cleanup; files may not exist if an early step failed.
        for path in (wav_tmp, oga_tmp):
            if os.path.exists(path):
                os.remove(path)
# NOTE(review): this chunk begins mid-function — the enclosing
# `def generate_requests():` and its `try:` lie before the visible source;
# the dedented `except` below pairs with that unseen `try`.  Indentation is
# reconstructed and should be confirmed against the full file.
        with wave.open("../../audio/sample_5.wav") as f:
            # First request carries the stream config taken from the WAV header.
            yield build_first_request(f.getframerate(), f.getnchannels())
            frame_samples = f.getframerate() // 10  # Send 100ms at a time
            # readframes() returns b'' at EOF, which terminates iter().
            for data in iter(lambda: f.readframes(frame_samples), b''):
                request = stt_pb2.StreamingRecognizeRequest()
                request.audio_content = data
                yield request
    except Exception as e:
        print("Got exception in generate_requests", e)
        raise


def print_streaming_recognition_responses(responses):
    """Pretty-print each streaming result: channel, timing, alternatives."""
    for response in responses:
        for result in response.results:
            print("Channel", result.recognition_result.channel)
            print("Phrase start:", result.recognition_result.start_time.ToTimedelta())
            print("Phrase end: ", result.recognition_result.end_time.ToTimedelta())
            for alternative in result.recognition_result.alternatives:
                print('"' + alternative.transcript + '"')
            print("------------------")


# Module-level script body: open a secure channel and stream the sample file.
stub = stt_pb2_grpc.SpeechToTextStub(
    grpc.secure_channel(endpoint, grpc.ssl_channel_credentials()))
metadata = authorization_metadata(api_key, secret_key, "tinkoff.cloud.stt")
responses = stub.StreamingRecognize(generate_requests(), metadata=metadata)
print_streaming_recognition_responses(responses)
#!/usr/bin/env python3
"""Query the VoiceKit REST API for the list of available TTS voices."""
import sys

sys.path.append("..")  # must run before importing the local `auth` helper
from auth import authorization_metadata
import os
import requests

endpoint = os.environ.get("VOICEKIT_ENDPOINT") or "api.tinkoff.ai:443"
api_key = os.environ["VOICEKIT_API_KEY"]
secret_key = os.environ["VOICEKIT_SECRET_KEY"]

metadata = authorization_metadata(api_key, secret_key, "tinkoff.cloud.tts",
                                  type=dict)
# Endpoints on port 443 get https, everything else plain http.
scheme = 's' if endpoint.endswith('443') else ''
response = requests.get(f"http{scheme}://{endpoint}/v1/tts:list_voices",
                        headers=metadata)
if response.status_code != 200:
    print(f"REST failed with HTTP code {response.status_code}\n"
          f"Headers: {response.headers}\nBody: {response.text}")
else:
    voices = response.json()["voices"]
    print("Allowed voices:")
    for voice in sorted(voices, key=lambda v: v["name"]):
        print(f"- {voice['name']}")
response = stt_pb2.RecognizeResponse() operation.response.Unpack(response) return " ".join( [result.alternatives[0].transcript for result in response.results]) if operation.state == FAILED: return operation.error group_name = datetime.now().strftime("test-group-%Y-%m-%d, %H:%M:%S") audio_folder = "../../audio/sample_group" # Send audio files for recognition stt_stub = stt_pb2_grpc.SpeechToTextStub( grpc.secure_channel(endpoint, grpc.ssl_channel_credentials())) stt_metadata = authorization_metadata(api_key, secret_key, "tinkoff.cloud.stt") created_operations = 0 for test_file in os.listdir(audio_folder): file_path = join(audio_folder, test_file) metadata = [] for entry in stt_metadata: metadata.append(entry) # Passing filename without extension into "x-client-request-id": metadata.append(("x-client-request-id", os.path.basename(test_file))) stt_stub.LongRunningRecognize(build_recognize_request( file_path, group_name), metadata=metadata) created_operations += 1 # Wait for results by calling WatchOperations operations_stub = longrunning_pb2_grpc.OperationsStub(