def voice(message: str):
    """Generate a neural-voice audio file for the given message and play it."""
    # Basic Azure Speech setup.
    speech_key, service_region = Config.SPEECH_TOKEN, Config.SPEECH_REGION
    speech_config = speechsdk.SpeechConfig(subscription=speech_key,
                                           region=service_region)
    speech_config.speech_synthesis_voice_name = Config.SPEECH_VOICE_NAME
    speech_synthesizer = speechsdk.SpeechSynthesizer(
        speech_config=speech_config, audio_config=None)

    # Refresh the timestamp strings used in the output paths.
    # (The day-string format is assumed here; it must match the directories
    # that build_XAL writes to.)
    _now_timestamp = int(time.time())
    _now_array = time.localtime(_now_timestamp)
    _now_time_string = time.strftime("%H%M%S", _now_array)
    _now_day_string = time.strftime("%Y%m%d", _now_array)

    # Build the SSML file, synthesize it to a WAV file, then play it back.
    _file_name = f"{_now_time_string}-{randomString()}"
    build_XAL(message, _file_name)  # helper defined elsewhere in the project
    with open(f"./talks/xmls/{_now_day_string}/{_file_name}.xml",
              "r",
              encoding="utf-8") as ssml_file:
        ssml_string = ssml_file.read()
    result = speech_synthesizer.speak_ssml_async(ssml_string).get()
    stream = speechsdk.AudioDataStream(result)
    stream.save_to_wav_file(
        f"./talks/voices/{_now_day_string}/{_file_name}.wav")
    playsound(f"./talks/voices/{_now_day_string}/{_file_name}.wav")
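
# voice() above relies on helpers defined elsewhere in the project (Config,
# build_XAL, randomString). As a hedged illustration only, randomString might
# look like this (an assumption, not the original helper):
import random
import string

def randomString(length=8):
    """Random lowercase suffix used to de-duplicate output filenames."""
    return "".join(random.choices(string.ascii_lowercase, k=length))
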
def speech_recognize_keyword_locally_from_microphone():
    """runs keyword spotting locally, with direct access to the result audio"""

    # Creates an instance of a keyword recognition model. Update this to
    # point to the location of your keyword recognition model.
    model = speechsdk.KeywordRecognitionModel(
        "YourKeywordRecognitionModelFile.table")

    # The phrase your keyword recognition model triggers on.
    keyword = "YourKeyword"

    # Create a local keyword recognizer with the default microphone device for input.
    keyword_recognizer = speechsdk.KeywordRecognizer()

    done = False

    def recognized_cb(evt):
        # Only a keyword phrase is recognized. The result cannot be 'NoMatch'
        # and there is no timeout. The recognizer runs until a keyword phrase
        # is detected or recognition is canceled (by stop_recognition_async()
        # or due to the end of an input file or stream).
        result = evt.result
        if result.reason == speechsdk.ResultReason.RecognizedKeyword:
            print("RECOGNIZED KEYWORD: {}".format(result.text))
        nonlocal done
        done = True

    def canceled_cb(evt):
        result = evt.result
        if result.reason == speechsdk.ResultReason.Canceled:
            print('CANCELED: {}'.format(result.cancellation_details.reason))
        nonlocal done
        done = True

    # Connect callbacks to the events fired by the keyword recognizer.
    keyword_recognizer.recognized.connect(recognized_cb)
    keyword_recognizer.canceled.connect(canceled_cb)

    # Start keyword recognition.
    result_future = keyword_recognizer.recognize_once_async(model)
    print('Say something starting with "{}" followed by whatever you want...'.
          format(keyword))
    result = result_future.get()

    # Read result audio (incl. the keyword).
    if result.reason == speechsdk.ResultReason.RecognizedKeyword:
        time.sleep(2)  # give some time so the stream is filled
        result_stream = speechsdk.AudioDataStream(result)
        result_stream.detach_input(
        )  # stop any more data from input getting to the stream

        save_future = result_stream.save_to_wav_file_async(
            "AudioFromRecognizedKeyword.wav")
        print('Saving file...')
        saved = save_future.get()
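
# A hedged sketch (not from the original snippet) showing one way to bound the
# wait above: stop_recognition_async() cancels a pending recognize_once_async(),
# which then completes with ResultReason.Canceled. Assumes speechsdk is the
# azure.cognitiveservices.speech module imported by this file.
import threading

def spot_keyword_with_timeout(model_path, timeout_s=30.0):
    model = speechsdk.KeywordRecognitionModel(model_path)
    recognizer = speechsdk.KeywordRecognizer()
    result_future = recognizer.recognize_once_async(model)
    # After timeout_s seconds, ask the recognizer to stop listening.
    timer = threading.Timer(timeout_s,
                            lambda: recognizer.stop_recognition_async().get())
    timer.start()
    result = result_future.get()  # blocks until keyword, cancel, or stop
    timer.cancel()
    return result
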
def speech_synthesis_to_audio_data_stream():
    """performs speech synthesis and gets the audio data from single request based stream."""
    # Creates an instance of a speech config with specified subscription key and service region.
    speech_config = speechsdk.SpeechConfig(subscription=speech_key,
                                           region=service_region)
    # Creates a speech synthesizer with a null output stream.
    # This means the audio output data will not be written to any output channel.
    # You can just get the audio from the result.
    speech_synthesizer = speechsdk.SpeechSynthesizer(
        speech_config=speech_config, audio_config=None)

    # Receives a text from console input and synthesizes it to result.
    while True:
        print("Enter some text that you want to synthesize, Ctrl-Z to exit")
        try:
            text = input()
        except EOFError:
            break
        result = speech_synthesizer.speak_text_async(text).get()
        # Check result
        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            print("Speech synthesized for text [{}]".format(text))
            audio_data_stream = speechsdk.AudioDataStream(result)

            # You can save all the data in the audio data stream to a file
            file_name = "outputaudio.wav"
            audio_data_stream.save_to_wav_file(file_name)
            print("Audio data for text [{}] was saved to [{}]".format(
                text, file_name))

            # You can also read data from audio data stream and process it in memory
            # Reset the stream position to the beginning since saving to file puts the position to the end.
            audio_data_stream.position = 0

            # Reads data from the stream
            audio_buffer = bytes(16000)
            total_size = 0
            filled_size = audio_data_stream.read_data(audio_buffer)
            while filled_size > 0:
                print("{} bytes received.".format(filled_size))
                total_size += filled_size
                filled_size = audio_data_stream.read_data(audio_buffer)
            print("Totally {} bytes received for text [{}].".format(
                total_size, text))
        elif result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = result.cancellation_details
            print("Speech synthesis canceled: {}".format(
                cancellation_details.reason))
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                print("Error details: {}".format(
                    cancellation_details.error_details))
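
# Note (added): by default the synthesizer produces Riff16Khz16BitMonoPcm
# audio, i.e. 16000 samples/s * 2 bytes = 32000 bytes per second, so the byte
# count accumulated above converts directly to a duration. A small helper,
# assuming that default format:
def pcm_16k_mono_duration_seconds(total_bytes):
    """Duration of 16 kHz, 16-bit, mono PCM audio (WAV header not counted)."""
    return total_bytes / 32000.0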
Example #4
def call_tts(text, tgt_lang):
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config,
                                              audio_config=None)
    xml_body = ElementTree.Element('speak', version='1.0')
    xml_body.set("xmlns", 'https://www.w3.org/2001/10/synthesis')
    xml_body.set("xml:lang", tts_lang[tgt_lang])
    voice = ElementTree.SubElement(xml_body, 'voice')
    voice.set("name", tts_voice[tgt_lang])
    voice.text = str(text)
    body = ElementTree.tostring(xml_body)
    result = synthesizer.speak_ssml(body.decode("utf-8"))
    stream = speechsdk.AudioDataStream(result)
    audio_filename = str(time.time()) + ".wav"
    stream.save_to_wav_file("static/" + audio_filename)
    print("Audio file saved to: " + audio_filename, "lang: " + tgt_lang)
    return audio_filename
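
# call_tts() above depends on module-level speech_config, tts_lang and
# tts_voice objects that are not shown. A hedged sketch of plausible
# definitions (the voice names are real Azure neural voices, but the exact
# values are illustrative, not recovered from the original source):
tts_lang = {"en": "en-US", "ko": "ko-KR"}
tts_voice = {"en": "en-US-AriaNeural", "ko": "ko-KR-SunHiNeural"}
# speech_config = speechsdk.SpeechConfig(subscription="<key>", region="<region>")
# call_tts("Hello world", "en")  # -> writes static/<timestamp>.wav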
Example #5
def xmlaudio():
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)
    xml_body = ElementTree.Element('speak', version='1.0')
    # xml_body.set('{http://www.w3.org/2001/10/synthesis}lang', 'en-US')
    xml_body.set("xmlns", 'http://www.w3.org/2001/10/synthesis')
    xml_body.set("xml:lang", 'hi-IN')

    voice = ElementTree.SubElement(xml_body, 'voice')
    voice.set("name", 'hi-IN-Kalpana-Apollo')
    voice.text = str("हिंदी")
    body = ElementTree.tostring(xml_body)
    # f = open("sample.xml", "w")
    # f.write(str(body))
    # ssml_string = open("sample.xml", "r", encoding="UTF-8").read()
    # print(ssml_string)
    result = synthesizer.speak_ssml_async(body.decode("utf-8")).get()

    stream = speechsdk.AudioDataStream(result)
    stream.save_to_wav_file("voice.wav")
Example #6
def saveAudioFile(self, result, filename):
    audioStream = speechsdk.AudioDataStream(result)
    audioStream.save_to_wav_file(filename)
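
# saveAudioFile above is a method lifted out of a class; a minimal sketch of a
# plausible host class (hypothetical, for context only):
class TTSClient:
    def __init__(self, speech_config):
        self.synthesizer = speechsdk.SpeechSynthesizer(
            speech_config=speech_config, audio_config=None)

    def saveAudioFile(self, result, filename):
        audioStream = speechsdk.AudioDataStream(result)
        audioStream.save_to_wav_file(filename)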
Example #7
def image_caption(request):
    voice_num = int(request.data.get('num'))
    voice_index = [
        {
            'name': 'en-US-AriaRUS',
            'pitch': '-10%',
            'rate': '-10%'
            }, 
        {
            'name': 'en-US-ZiraRUS',
            'pitch': '20%',
            'rate': '-10%'
            },
        {
            'name': 'en-US-GuyRUS',
            'pitch': '10%',
            'rate': '-20%'
            },
        {
            'name': 'en-US-BenjaminRUS',
            'pitch': '20%',
            'rate': '-20%'
            },
    ]
    
    mediaURL = getattr(settings, 'MEDIA_URL', 'MEDIA_URL')
    mediaROOTURL = getattr(settings, 'MEDIA_ROOT', 'MEDIA_ROOT')
    speech_key, service_region = getattr(settings, 'MS_API_KEY', 'MS_API_KEY'), "koreacentral"
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

    speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat["Riff16Khz16BitMonoPcm"])
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)
    
    if voice_num != 4:
        speak = ET.Element('speak')
        speak.set('version', '1.0')
        speak.set('xmlns', 'http://www.w3.org/2001/10/synthesis')
        speak.set('xml:lang', 'en-US')
        voice = ET.SubElement(speak, 'voice')
        voice.set('name', voice_index[voice_num]['name'])
        prosody = ET.SubElement(voice, 'prosody')
        prosody.set('rate', voice_index[voice_num]['rate'])
        prosody.set('pitch', voice_index[voice_num]['pitch'])
    
    try:
        img = request.data.get('img')
    except Exception:
        return Response({'error': 'invalid image input'}, status=status.HTTP_400_BAD_REQUEST)
        
    MSVS_API_KEY = getattr(settings, 'MSVS_API_KEY', 'MSVS_API_KEY')
    endpoint = "https://jes5918.cognitiveservices.azure.com/"
    computervision_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(MSVS_API_KEY))
    try:
        tags_result_remote = computervision_client.tag_image_in_stream(img)
    except Exception:
        return Response({'error': 'please check the image file format'}, status=status.HTTP_400_BAD_REQUEST)

    if (len(tags_result_remote.tags) == 0):
        return Response({'error': 'no tags were generated'}, status=status.HTTP_404_NOT_FOUND)
    else:
        captiontags = []
        body = []
        for idx, tag in enumerate(tags_result_remote.tags):
            if idx == 8:
                break
            if voice_num == 4:
                # voice_num 4 falls back to a self-hosted TTS container.
                dockerUrl = "http://j4b105.p.ssafy.io:5002/api/tts?text=" + tag.name
                responseData = requests.request("GET", dockerUrl)
                data, samplerate = sf.read(io.BytesIO(responseData.content))
                stream_path = mediaROOTURL+ '/tts_basic/' + str(voice_num) + tag.name + '.wav'
                sf.write(stream_path, data, samplerate)
                for i in range(5):
                    if i == voice_num:
                        continue
                    stream_path2 = mediaROOTURL+ '/tts_basic/' + str(i) + tag.name + '.wav'
                    if not os.path.isfile(stream_path2):
                        sf.write(stream_path2, data, samplerate)
            else:
                prosody.text = tag.name
                mydata = ET.tostring(speak).decode("utf-8")
                result = synthesizer.speak_ssml_async(mydata).get()

                stream = speechsdk.AudioDataStream(result)
                stream_path = mediaROOTURL+ '/tts_basic/' + str(voice_num) + tag.name + '.wav'
                
                # Checks result..
                if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
                    stream.save_to_wav_file(stream_path)
                    for i in range(5):
                        if i == voice_num:
                            continue
                        stream_path2 = mediaROOTURL+ '/tts_basic/' + str(i) + tag.name + '.wav'
                        if not os.path.isfile(stream_path2):
                            stream.save_to_wav_file(stream_path2)
                elif result.reason == speechsdk.ResultReason.Canceled:
                    cancellation_details = result.cancellation_details
                    print("Speech synthesis canceled: {}".format(cancellation_details.reason))
                    if cancellation_details.reason == speechsdk.CancellationReason.Error:
                        if cancellation_details.error_details:
                            print("Error details: {}".format(cancellation_details.error_details))
                    print("Did you update the subscription info?")
                    return Response({'error' : 'voice tts error please retry'}, status=status.HTTP_503_SERVICE_UNAVAILABLE)

            captiontags.append({
                'content': tag.name, 
                'filepath': mediaURL+'tts_basic/' + str(voice_num) + tag.name + '.wav', 
                'checked': False
            })
            body.append({'text': tag.name})

        endpoint = "https://api.cognitive.microsofttranslator.com/dictionary/lookup"
        params = {
            'api-version': '3.0',
            'from': 'en',
            'to': 'ko'
        }
        headers = {
            'Ocp-Apim-Subscription-Key': getattr(settings, 'MSTR_API_KEY', 'MSTR_API_KEY'),
            'Ocp-Apim-Subscription-Region': "koreacentral",
            'Content-type': 'application/json',
            'X-ClientTraceId': str(uuid.uuid4())
        }
        try:
            translate_request = requests.post(endpoint, params=params, headers=headers, json=body)
        except Exception:
            return Response({'error': 'translation request failed'}, status=status.HTTP_400_BAD_REQUEST)
        response = translate_request.json()
        # Maps Translator posTag values to Korean part-of-speech labels.
        posIndex = {
            'ADJ': '형용사',
            'ADV': '부사',
            'CONJ': '접속사',
            'DET': '한정사',
            'MODAL': '동사',
            'NOUN': '명사',
            'PREP': '전치사',
            'PRON': '대명사', 
            'VERB': '동사',
            'OTHER': '기타',
        }
        remove_idx = []
        for i in range(len(body)):
            try:
                captiontags[i]["mean"] = response[i]["translations"][0]["displayTarget"]
                captiontags[i]["part"] = posIndex[response[i]["translations"][0]["posTag"]]
                if captiontags[i]["content"] == captiontags[i]["mean"]:
                    remove_idx.append(i)
            except Exception:
                remove_idx.append(i)
                captiontags[i]["mean"] = "nottrans"
                captiontags[i]["part"] = "nottrans"
        captiontags_res = []
        for idx, c in enumerate(captiontags):
            if idx in remove_idx:
                continue
            captiontags_res.append(c)
        
        return Response({'data' : captiontags_res}, status=status.HTTP_200_OK)
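
# For reference (added): the Translator dictionary/lookup call above posts a
# JSON array like [{"text": "dog"}] and gets back, per input item, roughly
# this shape (v3.0 API; fields unused above are omitted):
#
#   [{"translations": [{"displayTarget": "개", "posTag": "NOUN", ...}, ...]}]
#
# which is why the loop reads response[i]["translations"][0]["displayTarget"]
# and ["posTag"]: index 0 is the service's top-ranked translation.
def first_translation(entry):
    """Return (displayTarget, posTag) of the top-ranked dictionary entry."""
    top = entry["translations"][0]
    return top["displayTarget"], top["posTag"]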
Example #8
def text_to_speech(request):
    text = request.data.get('text')
    for content in text:
        # Rejects Hangul; note this codepoint range also spans other CJK scripts.
        if ord('ㄱ') <= ord(content) <= ord('힣'):
            return Response({'error': 'Korean text is not allowed'},
                            status=status.HTTP_400_BAD_REQUEST)
    voice_num = int(request.data.get('num'))
    voice_index = [{
        'name': 'en-US-AriaRUS',
        'pitch': '-10%',
        'rate': '-10%'
    }, {
        'name': 'en-US-ZiraRUS',
        'pitch': '20%',
        'rate': '-10%'
    }, {
        'name': 'en-US-GuyRUS',
        'pitch': '10%',
        'rate': '-20%'
    }, {
        'name': 'en-US-BenjaminRUS',
        'pitch': '20%',
        'rate': '-20%'
    }, {
        'name': 'en-US-AriaRUS',
        'pitch': '30%',
        'rate': '-30%'
    }]

    mediaURL = getattr(settings, 'MEDIA_URL', 'MEDIA_URL')
    mediaROOTURL = getattr(settings, 'MEDIA_ROOT', 'MEDIA_ROOT')
    speech_key, service_region = getattr(settings, 'MS_API_KEY',
                                         'MS_API_KEY'), "koreacentral"
    speech_config = speechsdk.SpeechConfig(subscription=speech_key,
                                           region=service_region)

    speech_config.set_speech_synthesis_output_format(
        speechsdk.SpeechSynthesisOutputFormat["Riff16Khz16BitMonoPcm"])
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config,
                                              audio_config=None)

    speak = ET.Element('speak')
    speak.set('version', '1.0')
    speak.set('xmlns', 'http://www.w3.org/2001/10/synthesis')
    speak.set('xml:lang', 'en-US')
    voice = ET.SubElement(speak, 'voice')
    voice.set('name', voice_index[voice_num]['name'])
    prosody = ET.SubElement(voice, 'prosody')
    prosody.set('rate', voice_index[voice_num]['rate'])
    prosody.set('pitch', voice_index[voice_num]['pitch'])
    prosody.text = text

    mydata = ET.tostring(speak).decode("utf-8")
    result = synthesizer.speak_ssml_async(mydata).get()

    stream = speechsdk.AudioDataStream(result)
    # Build the output filename from the text, dropping spaces and periods.
    file_key = ''.join(text.split(' ')).lower()
    file_key = ''.join(file_key.split('.'))
    stream_path = mediaROOTURL + '/tts_basic/' + file_key + '.wav'

    # Checks result.
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        stream.save_to_wav_file(stream_path)
        print("complete")
        return Response({'filepath': mediaURL + 'tts_basic/' + file_key + '.wav'},
                        status=status.HTTP_200_OK)
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech synthesis canceled: {}".format(
            cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            if cancellation_details.error_details:
                print("Error details: {}".format(
                    cancellation_details.error_details))
        print("Did you update the subscription info?")
        return Response({'error': 'internal server error'},
                        status=status.HTTP_503_SERVICE_UNAVAILABLE)
    return Response({'error': 'internal server error'},
                    status=status.HTTP_503_SERVICE_UNAVAILABLE)
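
# Usage sketch (illustrative; the URL route is hypothetical, since the
# project's urls.py is not shown): exercising the DRF view above.
import requests

resp = requests.post("http://localhost:8000/api/tts/",  # hypothetical route
                     data={"text": "good morning", "num": 0})
print(resp.status_code, resp.json())
# 200 -> {'filepath': '<MEDIA_URL>tts_basic/goodmorning.wav'}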