Ejemplo n.º 1
0
    def text2audio(self, text):
        """Synthesize ``text`` and return the speech as Ogg/Opus-encoded bytes.

        The request is produced by ``self._build_request(text)`` and sent with
        ``StreamingSynthesize`` over a TLS channel to ``self._endpoint``. If the
        call's initial metadata carries ``x-audio-num-samples``, an estimated
        duration (samples / ``self._sample_rate``) is printed.

        Args:
            text: Value handed to ``self._build_request``.

        Returns:
            bytes: Everything ``pyogg.OggOpusWriter`` wrote to the in-memory
            buffer after encoding all received audio chunks.
        """
        request = self._build_request(text)
        metadata = authorization_metadata(self._api_key, self._secret_key,
                                          "tinkoff.cloud.tts")

        buffer = io.BytesIO()
        # Using the channel as a context manager closes it once the stream has
        # been consumed (or on error) instead of leaking it on every call.
        with grpc.secure_channel(self._endpoint,
                                 grpc.ssl_channel_credentials()) as channel:
            stub = tts_pb2_grpc.TextToSpeechStub(channel)
            responses = stub.StreamingSynthesize(request, metadata=metadata)

            for key, value in responses.initial_metadata():
                if key == "x-audio-num-samples":
                    print("Estimated audio duration is " +
                          str(int(value) / self._sample_rate) + " seconds")
                    break

            ogg_opus_writer = pyogg.OggOpusWriter(buffer)
            ogg_opus_writer.set_application("audio")
            ogg_opus_writer.set_sampling_frequency(self._sample_rate)
            ogg_opus_writer.set_channels(1)
            ogg_opus_writer.set_frame_size(20)  # milliseconds
            for stream_response in responses:
                ogg_opus_writer.encode(stream_response.audio_chunk)

            # Close the writer before reading the buffer back.
            ogg_opus_writer.close()

        # getvalue() returns the whole buffer regardless of the stream
        # position, so no seek is needed.
        return buffer.getvalue()
Ejemplo n.º 2
0
def main():
    """Recognize an audio source through the REST ``stt:recognize`` endpoint.

    Command-line arguments are parsed with ``BaseRecognitionParser``. The
    audio is opened with ``audio_open_read``, turned into a JSON request by
    ``build_recognition_request`` and POSTed to ``args.endpoint``. A 200
    response is decoded and passed to ``print_recognition_response``; any
    other status is reported on stdout.

    Raises:
        ValueError: If the requested encoding is ``stt_pb2.RAW_OPUS``.
    """
    args = BaseRecognitionParser().parse_args()
    if args.encoding == stt_pb2.RAW_OPUS:
        raise ValueError("RAW_OPUS encoding is not supported by this script")

    audio_source = audio_open_read(args.audio_file, args.encoding, args.rate,
                                   args.num_channels, args.chunk_size,
                                   args.pyaudio_max_seconds)
    with audio_source as reader:
        headers = authorization_metadata(args.api_key,
                                         args.secret_key,
                                         "tinkoff.cloud.stt",
                                         type=dict)
        payload = build_recognition_request(args, reader, type="json")

        # Endpoints ending in "443" are addressed over https, others over http.
        tls_suffix = "s" if args.endpoint.endswith("443") else ""
        url = "http{}://{}/v1/stt:recognize".format(tls_suffix, args.endpoint)
        http_response = requests.post(url, json=payload, headers=headers)

        if http_response.status_code == 200:
            print_recognition_response(http_response.json())
            return

        print("REST failed with HTTP code {}\nHeaders: {}\nBody: {}".format(
            http_response.status_code, http_response.headers,
            http_response.text))
Ejemplo n.º 3
0
def text2speach(text, tmp_dir='./tmp'):
    """Synthesize ``text`` and return it as the bytes of an ``.oga`` file.

    The audio chunks streamed back by ``StreamingSynthesize`` are written to
    a temporary WAV file (mono, 2 bytes per sample, module-level
    ``sample_rate``), which ffmpeg then converts to ``.oga``. Both temporary
    files are removed before returning, including when synthesis or the
    conversion fails.

    Args:
        text: Value handed to ``build_request``.
        tmp_dir: Directory for the temporary files. It is not created here,
            so it has to exist already.

    Returns:
        bytes: Contents of the converted ``.oga`` file.
    """
    # Microsecond timestamp used as the shared stem of both temp file names.
    ts = int(time.time() * 10**6)
    wav_tmp = os.path.join(tmp_dir, '{}.wav'.format(ts))
    oga_tmp = os.path.join(tmp_dir, '{}.oga'.format(ts))

    try:
        with wave.open(wav_tmp, "wb") as wav_file:
            wav_file.setframerate(sample_rate)
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)

            # Closing the channel when the stream is done avoids leaking one
            # connection per call.
            with grpc.secure_channel(
                    endpoint, grpc.ssl_channel_credentials()) as channel:
                stub = tts_pb2_grpc.TextToSpeechStub(channel)
                request = build_request(text)
                metadata = authorization_metadata(api_key, secret_key,
                                                  "tinkoff.cloud.tts")
                responses = stub.StreamingSynthesize(request,
                                                     metadata=metadata)
                for stream_response in responses:
                    wav_file.writeframes(stream_response.audio_chunk)

        stream = ffmpeg.input(wav_tmp)
        stream = ffmpeg.output(stream, oga_tmp)
        ffmpeg.run(stream, overwrite_output=True)
        with open(oga_tmp, 'rb') as oga_file:
            return oga_file.read()
    finally:
        # Always clean up; a file may be missing if an earlier step failed
        # before creating it.
        for leftover in (oga_tmp, wav_tmp):
            try:
                os.remove(leftover)
            except FileNotFoundError:
                pass
Ejemplo n.º 4
0
def main():
    """Stream an audio source to the recognizer and print what comes back.

    Arguments are parsed with ``StreamingRecognitionParser``. The audio is
    opened with ``audio_open_read``, requests are produced by
    ``stt_generate_requests`` and the ``StreamingRecognize`` responses are
    handed to ``print_streaming_recognition_responses``.
    """
    args = StreamingRecognitionParser().parse_args()

    audio_source = audio_open_read(args.audio_file, args.encoding, args.rate,
                                   args.num_channels, args.chunk_size,
                                   args.pyaudio_max_seconds)
    with audio_source as reader:
        channel = make_channel(args)
        stub = stt_pb2_grpc.SpeechToTextStub(channel)
        metadata = authorization_metadata(args.api_key, args.secret_key,
                                          "tinkoff.cloud.stt")
        request_stream = stt_generate_requests(args, reader)
        responses = stub.StreamingRecognize(request_stream, metadata=metadata)
        print_streaming_recognition_responses(responses)
Ejemplo n.º 5
0
def main():
    """Run a single ``Recognize`` call and print its response.

    Arguments are parsed with ``RecognitionParser``; the channel is built
    from ``args.host`` and ``args.port``.

    Raises:
        ValueError: If ``args.encoding`` equals ``"RAW_OPUS"``.
    """
    args = RecognitionParser().parse_args()
    if args.encoding == "RAW_OPUS":
        raise ValueError("RAW_OPUS encoding is not supported by this script")

    channel = make_channel(args.host, args.port)
    stub = stt_pb2_grpc.SpeechToTextStub(channel)
    metadata = authorization_metadata(args.api_key, args.secret_key,
                                      "tinkoff.cloud.stt")
    request = build_recognition_request(args)
    print_recognition_response(stub.Recognize(request, metadata=metadata))
Ejemplo n.º 6
0
def main():
    """Recognize an audio source with one ``Recognize`` call and print it.

    Arguments are parsed with ``BaseRecognitionParser``; the audio is opened
    with ``audio_open_read`` and wrapped into a request by
    ``build_recognition_request``.

    Raises:
        ValueError: If the requested encoding is ``stt_pb2.RAW_OPUS``.
    """
    args = BaseRecognitionParser().parse_args()
    if args.encoding == stt_pb2.RAW_OPUS:
        raise ValueError("RAW_OPUS encoding is not supported by this script")

    audio_source = audio_open_read(args.audio_file, args.encoding, args.rate,
                                   args.num_channels, args.chunk_size,
                                   args.pyaudio_max_seconds)
    with audio_source as reader:
        channel = make_channel(args)
        stub = stt_pb2_grpc.SpeechToTextStub(channel)
        metadata = authorization_metadata(args.api_key, args.secret_key,
                                          "tinkoff.cloud.stt")
        request = build_recognition_request(args, reader)
        recognized = stub.Recognize(request, metadata=metadata)
        print_recognition_response(recognized)
Ejemplo n.º 7
0
 def __init__(self, phrase):
     """Start streaming synthesis of ``phrase``.

     Every space in ``phrase`` is replaced with ``'. '`` first. The result
     is stored wrapped in ``<speak><p>`` tags as ``self._ssml`` and, with
     anything that looks like a tag stripped, as ``self._text``. The
     ``StreamingSynthesize`` call object is kept in ``self._responses``;
     nothing is read from it here.
     """
     dotted = phrase.replace(' ', '. ')
     self._ssml = '<speak><p>' + dotted + '</p></speak>'
     self._text = re.sub(r'\<[^>]*\>', '', dotted)

     channel = grpc.secure_channel(ENDPOINT, grpc.ssl_channel_credentials())
     stub = tts_pb2_grpc.TextToSpeechStub(channel)
     metadata = authorization_metadata(API_KEY, SECRET_KEY,
                                       "tinkoff.cloud.tts")

     synthesis_input = tts_pb2.SynthesisInput(text=self._text,
                                              ssml=self._ssml)
     audio_config = tts_pb2.AudioConfig(audio_encoding=tts_pb2.LINEAR16,
                                        speaking_rate=1,
                                        sample_rate_hertz=SAMPLE_RATE)
     request = tts_pb2.SynthesizeSpeechRequest(input=synthesis_input,
                                               audio_config=audio_config)
     self._responses = stub.StreamingSynthesize(request, metadata=metadata)
Ejemplo n.º 8
0
def main():
    """Synthesize ``args.input_text`` and write the audio to the output file.

    Arguments are parsed with ``BaseSynthesisParser``. Each chunk returned
    by ``StreamingSynthesize`` is written through the writer opened by
    ``audio_open_write``.

    Raises:
        ValueError: If LINEAR16 is requested at a rate other than 48000.
    """
    args = BaseSynthesisParser().parse_args()
    if args.encoding == tts_pb2.LINEAR16:
        if args.rate != 48000:
            raise ValueError(
                "LINEAR16 supports only 48kHz for now, use RAW_OPUS")

    output = audio_open_write(args.output_file, args.encoding, args.rate)
    with output as audio_writer:
        channel = make_channel(args)
        stub = tts_pb2_grpc.TextToSpeechStub(channel)
        request = build_synthesis_request(args, args.input_text)
        metadata = authorization_metadata(args.api_key, args.secret_key,
                                          "tinkoff.cloud.tts")
        for chunk_message in stub.StreamingSynthesize(request,
                                                      metadata=metadata):
            audio_writer.write(chunk_message.audio_chunk)
Ejemplo n.º 9
0
def main():
    """Run streaming recognition and print every result as it arrives.

    Arguments are parsed with ``StreamingRecognitionParser`` and the
    requests come from ``generate_requests(args)``. For each result the
    channel, phrase boundaries, finality flag and every alternative's
    transcript and confidence are printed, followed by a separator line.
    """
    args = StreamingRecognitionParser().parse_args()

    channel = make_channel(args.host, args.port)
    stub = stt_pb2_grpc.SpeechToTextStub(channel)
    metadata = authorization_metadata(args.api_key, args.secret_key,
                                      "tinkoff.cloud.stt")
    request_stream = generate_requests(args)

    for response in stub.StreamingRecognize(request_stream,
                                            metadata=metadata):
        for result in response.results:
            recognition = result.recognition_result
            print("Channel", recognition.channel)
            print("Phrase start", recognition.start_time.ToTimedelta())
            print("Phrase end", recognition.end_time.ToTimedelta())
            print("Is final", result.is_final)
            for alternative in recognition.alternatives:
                print("Transcription", alternative.transcript)
                print("Confidence", alternative.confidence)
            print("------------------")
Ejemplo n.º 10
0
 def __init__(self, recognize):
     """Configure and start a single-utterance streaming recognition call.

     Builds the initial ``StreamingRecognizeRequest`` (LINEAR16, 16000 Hz,
     one channel, denormalization and automatic punctuation enabled, VAD
     silence threshold 1.20) with one speech context holding a phrase of
     score 10.0 per entry of ``recognize.context_words``. That request is
     passed to ``self.requests`` and the resulting ``StreamingRecognize``
     call object is stored in ``self._responses``.
     """
     self._recognize = recognize

     first_request = stt_pb2.StreamingRecognizeRequest()
     streaming = first_request.streaming_config
     recognition = streaming.config

     recognition.encoding = stt_pb2.AudioEncoding.LINEAR16
     recognition.sample_rate_hertz = 16000
     recognition.num_channels = 1
     recognition.enable_denormalization = True
     recognition.enable_automatic_punctuation = True
     recognition.vad_config.silence_duration_threshold = 1.20
     streaming.single_utterance = True

     context_phrases = [
         stt_pb2.SpeechContextPhrase(text=word, score=10.0)
         for word in self._recognize.context_words
     ]
     recognition.speech_contexts.append(
         stt_pb2.SpeechContext(phrases=context_phrases))

     metadata = authorization_metadata(API_KEY, SECRET_KEY,
                                       "tinkoff.cloud.stt")
     channel = grpc.secure_channel(ENDPOINT, grpc.ssl_channel_credentials())
     stub = stt_pb2_grpc.SpeechToTextStub(channel)
     self._responses = stub.StreamingRecognize(self.requests(first_request),
                                               metadata=metadata)
Ejemplo n.º 11
0
def main():
    """Register an incoming call, announce it, and run streaming recognition.

    Steps, in order:
      1. Read caller id and unique id from the AGI environment.
      2. Insert a row into ``calls`` and commit it.
      3. Publish a ``"call"`` event on ``cent_channel``.
      4. Stream requests from ``generate_requests(args, agi)`` to
         ``StreamingRecognize`` and hand the responses to
         ``save_streaming_recognition_responses``.
    """
    agi = AGI()
    # The ``None`` branches below only take effect when ``agi`` is replaced
    # with ``None`` here, e.g. to run the script without an AGI session.
    if agi is not None:
        agi.verbose("EAGI script started...")
        ani = agi.env['agi_callerid']
        uid = agi.env['agi_uniqueid']
        agi.verbose("Call answered from: %s with id %s" % (ani, uid))
    else:
        ani = ""
        uid = str(uuid.uuid4())

    with dbcon.cursor() as cursor:
        sql = "INSERT INTO calls SET uniqueid=%s,callerid=%s,calldate=NOW()"
        cursor.execute(sql, (uid, ani))
        call_id = cursor.lastrowid
    # Commit only after the insert succeeded; committing from a ``finally``
    # would also run after a failed insert and could mask the real error.
    dbcon.commit()

    data = {
        "type": "call",
        # NOTE(review): "unqueid" looks like a typo for "uniqueid", but it is
        # part of the published payload, so it is left as-is; confirm with
        # the consumers before renaming.
        "unqueid": uid,
        "callerid": ani[-4:],  # only the last four characters are published
        "calldate": time.strftime('%Y-%m-%d %H:%M:%S'),
        "call_id": call_id
    }
    client.publish(cent_channel, data)
    if agi is None:
        ic(data)

    args = StreamingRecognitionParser().parse_args()

    stub = stt_pb2_grpc.SpeechToTextStub(make_channel(args))
    metadata = authorization_metadata(cfg.api_key, cfg.secret_key,
                                      "tinkoff.cloud.stt")
    responses = stub.StreamingRecognize(generate_requests(args, agi),
                                        metadata=metadata)
    save_streaming_recognition_responses(responses, agi, ani, uid, call_id)
Ejemplo n.º 12
0
def main():
    """Recognize an audio source and print all transcripts as one string.

    The ``Recognize`` response is converted to a dict (unless it already is
    one) and the ``transcript`` of every alternative of every result is
    concatenated, without separators, in response order.

    Raises:
        ValueError: If the requested encoding is ``stt_pb2.RAW_OPUS``.
    """
    args = BaseRecognitionParser().parse_args()
    if args.encoding == stt_pb2.RAW_OPUS:
        raise ValueError("RAW_OPUS encoding is not supported by this script")

    audio_source = audio_open_read(args.audio_file, args.encoding, args.rate,
                                   args.num_channels, args.chunk_size,
                                   args.pyaudio_max_seconds)
    with audio_source as reader:
        stub = stt_pb2_grpc.SpeechToTextStub(make_channel(args))
        metadata = authorization_metadata(args.api_key, args.secret_key,
                                          "tinkoff.cloud.stt")
        request = build_recognition_request(args, reader)
        response = stub.Recognize(request, metadata=metadata)

        if not isinstance(response, dict):
            # https://developers.google.com/protocol-buffers/docs/proto3#json
            response = MessageToDict(response,
                                     including_default_value_fields=True,
                                     preserving_proto_field_name=True)

        total = ''.join(alternative["transcript"]
                        for result in response["results"]
                        for alternative in result["alternatives"])
    print(total)
Ejemplo n.º 13
0
def speach2text(url, tmp_dir='./tmp'):
    """Download audio from ``url`` and return its recognized transcript.

    The download is saved as a temporary ``.oga`` file, converted to WAV
    with ffmpeg and streamed to ``StreamingRecognize`` through
    ``generate_requests``. Only the first response is read. Both temporary
    files are removed before returning, including when a step fails.

    Args:
        url: Location passed to ``urlopen``.
        tmp_dir: Directory for the temporary files. It is not created here,
            so it has to exist already.

    Returns:
        The transcript of the first alternative of the first result in the
        first streamed response.

    Raises:
        StopIteration: If the service returns no responses.
        IndexError: If the first response has no results or alternatives.
    """
    # Microsecond timestamp used as the shared stem of both temp file names.
    ts = int(time.time() * 10**6)
    wav_tmp = os.path.join(tmp_dir, '{}.wav'.format(ts))
    oga_tmp = os.path.join(tmp_dir, '{}.oga'.format(ts))

    try:
        # Close the HTTP response as soon as the payload has been saved.
        with urlopen(url) as remote, open(oga_tmp, 'wb') as oga_file:
            oga_file.write(remote.read())

        stream = ffmpeg.input(oga_tmp)
        stream = ffmpeg.output(stream, wav_tmp)
        ffmpeg.run(stream, overwrite_output=True)

        # Closing the channel once the first response is in hand releases the
        # connection instead of leaking one per call.
        with grpc.secure_channel(
                endpoint, grpc.ssl_channel_credentials()) as channel:
            stub = stt_pb2_grpc.SpeechToTextStub(channel)
            metadata = authorization_metadata(api_key, secret_key,
                                              "tinkoff.cloud.stt")
            responses = stub.StreamingRecognize(generate_requests(wav_tmp),
                                                metadata=metadata)
            first_response = next(responses)
            first_result = first_response.results[0].recognition_result
            return first_result.alternatives[0].transcript
    finally:
        # Always clean up; a file may be missing if an earlier step failed
        # before creating it.
        for leftover in (wav_tmp, oga_tmp):
            try:
                os.remove(leftover)
            except FileNotFoundError:
                pass
Ejemplo n.º 14
0
        with wave.open("../../audio/sample_5.wav") as f:
            yield build_first_request(f.getframerate(), f.getnchannels())
            frame_samples = f.getframerate() // 10  # Send 100ms at a time
            for data in iter(lambda: f.readframes(frame_samples), b''):
                request = stt_pb2.StreamingRecognizeRequest()
                request.audio_content = data
                yield request
    except Exception as e:
        print("Got exception in generate_requests", e)
        raise


def print_streaming_recognition_responses(responses):
    """Print every recognition result found in ``responses``.

    For each result of each response this prints the channel, the phrase
    start and end (converted with ``ToTimedelta()``), every alternative's
    transcript wrapped in double quotes, and a separator line.

    Args:
        responses: Iterable of objects exposing ``results``; each result
            exposes ``recognition_result``.
    """
    for response in responses:
        for result in response.results:
            recognition = result.recognition_result
            phrase_start = recognition.start_time.ToTimedelta()
            phrase_end = recognition.end_time.ToTimedelta()

            print("Channel", recognition.channel)
            print("Phrase start:", phrase_start)
            print("Phrase end:  ", phrase_end)
            for alternative in recognition.alternatives:
                print('"' + alternative.transcript + '"')
            print("------------------")


# Open a TLS channel to the endpoint, stream the requests produced by
# generate_requests() through StreamingRecognize and print the responses.
credentials = grpc.ssl_channel_credentials()
channel = grpc.secure_channel(endpoint, credentials)
stub = stt_pb2_grpc.SpeechToTextStub(channel)
metadata = authorization_metadata(api_key, secret_key, "tinkoff.cloud.stt")
responses = stub.StreamingRecognize(generate_requests(), metadata=metadata)
print_streaming_recognition_responses(responses)
Ejemplo n.º 15
0
#!/usr/bin/env python3

import sys
sys.path.append("..")

from auth import authorization_metadata
import os
import requests

# Service location and credentials come from the environment; only the
# endpoint has a built-in fallback, the two keys are mandatory.
endpoint = os.environ.get("VOICEKIT_ENDPOINT") or "api.tinkoff.ai:443"
api_key = os.environ["VOICEKIT_API_KEY"]
secret_key = os.environ["VOICEKIT_SECRET_KEY"]

metadata = authorization_metadata(api_key, secret_key, "tinkoff.cloud.tts",
                                  type=dict)

# An endpoint ending in "443" is addressed over https, anything else over http.
tls_suffix = 's' if endpoint.endswith('443') else ''
url = f"http{tls_suffix}://{endpoint}/v1/tts:list_voices"
response = requests.get(url, headers=metadata)

if response.status_code != 200:
    print(
        f"REST failed with HTTP code {response.status_code}\nHeaders: {response.headers}\nBody: {response.text}"
    )
else:
    # From here on ``response`` is the decoded JSON body.
    response = response.json()
    print("Allowed voices:")
    voices_by_name = sorted(response["voices"],
                            key=lambda entry: entry["name"])
    for voice in voices_by_name:
        print(f"- {voice['name']}")
        response = stt_pb2.RecognizeResponse()
        operation.response.Unpack(response)

        return " ".join(
            [result.alternatives[0].transcript for result in response.results])
    if operation.state == FAILED:
        return operation.error


# Timestamped group name passed to every recognition request of this run.
group_name = datetime.now().strftime("test-group-%Y-%m-%d, %H:%M:%S")
audio_folder = "../../audio/sample_group"

# Send audio files for recognition
credentials = grpc.ssl_channel_credentials()
stt_stub = stt_pb2_grpc.SpeechToTextStub(
    grpc.secure_channel(endpoint, credentials))
stt_metadata = authorization_metadata(api_key, secret_key, "tinkoff.cloud.stt")
created_operations = 0
for test_file in os.listdir(audio_folder):
    file_path = join(audio_folder, test_file)
    # Per-request copy of the shared auth metadata, extended with the file
    # name as "x-client-request-id".
    # NOTE(review): basename() leaves the extension in place, so the id is
    # the full file name; confirm whether the extension should be stripped.
    metadata = list(stt_metadata)
    metadata.append(("x-client-request-id", os.path.basename(test_file)))
    request = build_recognize_request(file_path, group_name)
    stt_stub.LongRunningRecognize(request, metadata=metadata)
    created_operations += 1

# Wait for results by calling WatchOperations
operations_stub = longrunning_pb2_grpc.OperationsStub(