def callback(data):
    """ROS subscriber callback for "TTS|<text>" messages.

    Synthesizes <text> with the configured TTS provider, schedules/plays the
    resulting audio file and publishes its filename on
    GLOBAL_FileNamePublisher as "TTS|<filename>".
    Messages not prefixed with "TTS" are ignored.
    """
    global GLOBAL_FileNamePublisher

    dataParts = data.data.split("|")
    if dataParts[0] != "TTS":
        return

    ttsProvider = TTSMemory().GetString("TTSProvider")
    usePygame = TTSMemory().GetBoolean("UsePygame")
    FileLogger().Info("TTS, callback(), Provider: {0}".format(ttsProvider))

    try:
        # BUGFIX: the original compared the ROS message object itself
        # (`data`) against the trigger string, which can never be true.
        # The trigger arrives as "TTS|TRIGGER_STOP_AUDIO", i.e. in the
        # payload part after the prefix.
        if usePygame and dataParts[1] == "TRIGGER_STOP_AUDIO":
            SoundMixer().Stop()
            return

        # Providers are mutually exclusive; dispatch once on the
        # normalized name. `data` is rebound to the generated audio
        # file path from here on (original behavior kept).
        provider = ttsProvider.lower()
        if provider == "google":
            data = Google().Speak(dataParts[1])
        elif provider == "microsoft":
            data = Microsoft().Speak(dataParts[1])
        elif provider == "ivona":
            data = Ivona().Speak(dataParts[1])
        elif provider == "watson":
            data = Watson().Speak(dataParts[1])

        try:
            # Record until when audio will be playing (now + duration +
            # configured delay) so other components can avoid overlap.
            audio = MP3(data)
            delay = Config().GetInt("TextToSpeech", "IntermediateAudioDelay")
            TTSMemory().Set("TTS.Until",
                            (rospy.Time.now().to_sec() +
                             int(round(audio.info.length)) + delay))
        except Exception as e:
            FileLogger().Warn(
                "TTS, callback() - Error on getting audio duration: {0}".
                format(e))

        if usePygame:
            SoundMixer().Play(data)
        else:
            # Fall back to an external player command from the config.
            audioPlayer = Config().Get("TextToSpeech", "AudioPlayer") + " '{0}'"
            os.system(audioPlayer.format(data))

        FileLogger().Info("TTS, callback(), Play Audio: {0}".format(data))
        GLOBAL_FileNamePublisher.publish("TTS|{0}".format(data))

        # Track that the (known) user has been spoken to.
        user = User().LoadObject()
        if (user.GetName() is not None):
            user.UpdateSpokenTo()
            user.Update()
    except Exception as e:
        FileLogger().Error(
            "TTS, callback(), Error on processing TTS data: {0}".format(e))
def GetName(self):
    """Return the preferred display name for this user.

    Order of precedence:
      1. formal address ("Mr./Mrs. <LastName>" wrapper from config) when
         Formal is set and a last name exists,
      2. the full Name,
      3. the FirstName,
      4. None when nothing usable is stored.
    """
    if self.Formal and self.LastName:
        # Pick the gender-specific wrapper, e.g. "Mr. {0}" / "Mrs. {0}".
        configKey = ("FormalFormOfAddressFemale"
                     if self.Gender.lower() == "female"
                     else "FormalFormOfAddressMale")
        return Config().Get("DEFAULT", configKey).format(self.LastName)
    if self.Name:
        return self.Name
    if self.FirstName:
        return self.FirstName
    return None
class Watson():
    """IBM Watson based TTS (Speak) and STT (Listen) provider.

    Acts as a singleton via the (Python 2 style) __metaclass__ hook.
    Audio capture runs through a pyaudio input stream whose frames are
    queued and streamed to Watson over a websocket.
    """
    __metaclass__ = Singleton

    def __init__(self):
        # Microphone capture parameters: 16-bit mono at 44.1 kHz,
        # read in 1024-frame chunks.
        self.CHUNK = 1024
        self.BUF_MAX_SIZE = self.CHUNK * 10
        # Queue holds at most BUF_MAX_SIZE/CHUNK (= 10) chunks; overflow
        # is dropped in pyaudio_callback.
        self.q = Queue(maxsize=int(round(self.BUF_MAX_SIZE / self.CHUNK)))
        self.audio_source = AudioSource(self.q, True, True)
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.RATE = 44100
        # Separate credentials/endpoints for STT and TTS.
        self.__apikey_stt = Config().Get("SpeechToText", "WatsonSTTAPIKey")
        self.__url_stt = Config().Get("SpeechToText", "WatsonSTTUrl")
        self.__apikey_tts = Config().Get("TextToSpeech", "WatsonTTSAPIKey")
        self.__url_tts = Config().Get("TextToSpeech", "WatsonTTSUrl")
        self.__voiceName = Config().Get("TextToSpeech", "WatsonVoiceName")
        self.__language_2letter_cc = Config().Get("SpeechToText",
                                                  "CountryCode2Letter")
        self.__language_4letter_cc = Config().Get("SpeechToText",
                                                  "CountryCode4Letter")
        # External player command template; the audio file path is
        # substituted for {0}.
        self.__audioPlayer = Config().Get("TextToSpeech",
                                          "AudioPlayer") + " '{0}'"
        self.text_to_speech = TextToSpeechV1(url=self.__url_tts,
                                             iam_apikey=self.__apikey_tts)
        # Opt out of IBM request logging for both services.
        self.text_to_speech.set_default_headers(
            {'x-watson-learning-opt-out': "true"})
        self.speech_to_text = SpeechToTextV1(url=self.__url_stt,
                                             iam_apikey=self.__apikey_stt)
        self.speech_to_text.set_default_headers(
            {'x-watson-learning-opt-out': "true"})
        self.audio = pyaudio.PyAudio()
        # open stream using callback; started on demand in Listen().
        self.stream = self.audio.open(format=self.FORMAT,
                                      channels=self.CHANNELS,
                                      rate=self.RATE,
                                      input=True,
                                      frames_per_buffer=self.CHUNK,
                                      stream_callback=self.pyaudio_callback,
                                      start=False)
        try:
            rospy.init_node('STT_watson_node', anonymous=True)
        except:
            # init_node raises when a node already exists in this process.
            FileLogger().Info('already initialized')

    def Speak(self, audioString, playAudio=False):
        """Synthesize audioString to a cached MP3 file and return its path.

        Files are cached under <EmeraldPath>/Data/TTS keyed by language
        and a sanitized form of the text; synthesis only runs on a cache
        miss. Optionally plays the file via the configured player.
        Returns None for empty input.
        """
        if (len(audioString) == 0):
            return
        tmpAudioFile = os.path.join(Global.EmeraldPath, "Data", "TTS",
                                    ("Watson_" + \
                                    self.__language_2letter_cc + "_" + \
                                    self.CleanString(audioString) + ".mp3"))
        if not os.path.isfile(tmpAudioFile):
            # NOTE(review): tmpAudioFile is already an absolute path, so
            # join(dirname(__file__), tmpAudioFile) collapses to
            # tmpAudioFile — presumably a leftover; confirm before changing.
            with open(join(dirname(__file__), tmpAudioFile),
                      'wb') as audio_file:
                response = self.text_to_speech.synthesize(
                    audioString, accept='audio/mp3',
                    voice=self.__voiceName).get_result()
                audio_file.write(response.content)
        if (playAudio):
            os.system(self.__audioPlayer.format(tmpAudioFile))
        return tmpAudioFile

    def Listen(self):
        """Continuously stream microphone audio to Watson STT.

        Runs websocket recognition in a worker thread in an endless loop
        until KeyboardInterrupt, then shuts down the recording pipeline.
        """
        self.stream.start_stream()
        try:
            while True:
                recognize_thread = Thread(
                    target=self.recognize_using_weboscket, args=())
                recognize_thread.start()
                recognize_thread.join()
        except KeyboardInterrupt:
            # stop recording
            self.audio_source.completed_recording()
            self.stream.stop_stream()
            self.stream.close()
            self.audio.terminate()

    def CleanString(self, string):
        """Strip non-word characters and cap at 75 chars (marking trims)
        so the text can be used as a filename component."""
        data = re.sub(r'\W+', '', string)
        return (data[:75] + '_TRIMMED') if len(data) > 75 else data

    def recognize_using_weboscket(self, *args):
        """Run one websocket recognition session against the queued audio.

        (Name keeps the original's 'weboscket' spelling — referenced by
        Listen(), so not renamed here.)
        """
        mycallback = MyRecognizeCallback()
        self.speech_to_text.recognize_using_websocket(
            audio=self.audio_source,
            content_type='audio/l16; rate=44100',
            recognize_callback=mycallback,
            interim_results=True,
            model='{0}_BroadbandModel'.format(self.__language_4letter_cc),
            smart_formatting=True)

    def pyaudio_callback(self, in_data, frame_count, time_info, status):
        """pyaudio stream callback: enqueue captured frames, silently
        dropping them when the buffer queue is full."""
        try:
            self.q.put(in_data)
        except Full:
            pass
        return (None, pyaudio.paContinue)
class Ivona(object):
    """Amazon Ivona cloud TTS provider (singleton).

    Synthesizes speech via the IvonaCloud CreateSpeech endpoint using
    AWS Signature Version 4 request signing, caching results as MP3
    files under <EmeraldPath>/Data/TTS.
    """
    __metaclass__ = Singleton

    # Class-level defaults; real values are loaded from config in __init__.
    __language_2letter_cc = 'de'
    __language_4letter_cc = 'de-DE'
    __audioPlayer = "afplay '{0}'"
    __voiceGender = 'Male'
    __voiceName = 'Hans'
    __accessKey = None
    __secretKey = None
    __speechRate = None
    __sentenceBreak = None
    __paragraphBreak = None
    __session = None  # lazily-created requests.Session for keep-alive
    __host = None
    __region = None
    __region_options = {
        'us-east': 'us-east-1',
        'us-west': 'us-west-2',
        'eu-west': 'eu-west-1',
    }

    def __setRegion(self, region_name):
        # Unknown region names fall back to eu-west-1.
        self.__region = self.__region_options.get(region_name, 'eu-west-1')
        self.__host = 'tts.{}.ivonacloud.com'.format(self.__region)

    __codec = None
    __codec_options = {
        'mp3': 'mp3',
        'ogg': 'ogg',
        'mp4': 'mp4',
    }

    def __setCodec(self, codec_name):
        # Unknown codec names fall back to mp3.
        self.__codec = self.__codec_options.get(codec_name, 'mp3')

    def __init__(self):
        self.__language_2letter_cc = Config().Get("TextToSpeech",
                                                  "CountryCode2Letter")
        self.__language_4letter_cc = Config().Get("TextToSpeech",
                                                  "CountryCode4Letter")
        # External player command template; audio path substituted for {0}.
        self.__audioPlayer = Config().Get("TextToSpeech",
                                          "AudioPlayer") + " '{0}'"
        self.__voiceGender = Config().Get("TextToSpeech", "IvonaVoiceGender")
        self.__voiceName = Config().Get("TextToSpeech", "IvonaVoiceName")
        self.__accessKey = Config().Get("TextToSpeech", "IvonaAccessKey")
        self.__secretKey = Config().Get("TextToSpeech", "IvonaSecretKey")
        self.__speechRate = 'medium'  # x-slow - slow - medium - fast - x-fast
        self.__sentenceBreak = 400    # pause length in ms
        self.__paragraphBreak = 650   # pause length in ms
        self.__setRegion('eu-west')
        self.__setCodec('mp3')

    def Speak(self, audioString, playAudio=False):
        """Synthesize audioString to a cached MP3 file and return its path.

        Only calls the Ivona service on a cache miss; optionally plays the
        file via the configured player. Returns None for empty input.
        Raises Exception when the service responds with an error document.
        """
        if (len(audioString) == 0):
            return
        tmpAudioFile = os.path.join(Global.EmeraldPath, "Data", "TTS",
                                    ("Ivona_" + \
                                    self.__language_2letter_cc + "_" + \
                                    self.CleanString(audioString) + ".mp3"))
        if not os.path.isfile(tmpAudioFile):
            r = self._send_amazon_auth_packet_v4(
                'POST', 'tts', 'application/json', '/CreateSpeech', '',
                self._generate_payload(audioString), self.__region,
                self.__host)
            # A JSON body (starts with '{') is an error document, not audio.
            if r.content.startswith(b'{'):
                raise Exception('Error fetching voice: {}'.format(r.content))
            # BUGFIX: only create the cache file once valid audio arrived.
            # The original opened the file ("wb" truncates/creates) before
            # the request, so a failed request left an empty file behind
            # that later calls mistook for a cached synthesis.
            with open(tmpAudioFile, "wb") as f:
                f.write(r.content)
        if (playAudio):
            os.system(self.__audioPlayer.format(tmpAudioFile))
        return tmpAudioFile

    def GetVoices(self):
        """Returns all the possible voices """
        r = self._send_amazon_auth_packet_v4('POST', 'tts',
                                             'application/json',
                                             '/ListVoices', '', '',
                                             self.__region, self.__host)
        return r.json()

    def CleanString(self, string):
        """Strip non-word characters and cap at 75 chars (marking trims)
        so the text can be used as a filename component."""
        data = re.sub(r'\W+', '', string)
        return (data[:75] + '_TRIMMED') if len(data) > 75 else data

    def _generate_payload(self, text_to_speak):
        """Build the JSON body for CreateSpeech from the configured voice,
        codec and break/rate parameters."""
        return json.dumps({
            'Input': {
                "Type": "application/ssml+xml",
                'Data': text_to_speak
            },
            'OutputFormat': {
                'Codec': self.__codec.upper()
            },
            'Parameters': {
                'Rate': self.__speechRate,
                'SentenceBreak': self.__sentenceBreak,
                'ParagraphBreak': self.__paragraphBreak
            },
            'Voice': {
                'Name': self.__voiceName,
                'Language': self.__language_4letter_cc,
                'Gender': self.__voiceGender
            }
        })

    def _send_amazon_auth_packet_v4(self, method, service, content_type,
                                    canonical_uri, canonical_querystring,
                                    request_parameters, region, host):
        """Send a packet to a given amazon server using Amazon's signature
        Version 4, Returns the resulting response object
        """
        algorithm = 'AWS4-HMAC-SHA256'
        signed_headers = 'content-type;host;x-amz-content-sha256;x-amz-date'

        # Create date for headers and the credential string
        t = datetime.datetime.utcnow()
        amazon_date = t.strftime('%Y%m%dT%H%M%SZ')
        date_stamp = t.strftime('%Y%m%d')

        # Step 1: Create canonical request
        payload_hash = self._sha_hash(request_parameters)
        canonical_headers = 'content-type:{}\n'.format(content_type)
        canonical_headers += 'host:{}\n'.format(host)
        canonical_headers += 'x-amz-content-sha256:{}\n'.format(payload_hash)
        canonical_headers += 'x-amz-date:{}\n'.format(amazon_date)
        canonical_request = '\n'.join([
            method, canonical_uri, canonical_querystring, canonical_headers,
            signed_headers, payload_hash
        ])

        # Step 2: Create the string to sign
        credential_scope = '{}/{}/{}/aws4_request'.format(
            date_stamp, region, service)
        string_to_sign = '\n'.join([
            algorithm, amazon_date, credential_scope,
            self._sha_hash(canonical_request)
        ])

        # Step 3: Calculate the signature
        signing_key = self._get_signature_key(self.__secretKey, date_stamp,
                                              region, service)
        signature = hmac.new(signing_key, string_to_sign.encode('utf-8'),
                             hashlib.sha256).hexdigest()

        # Step 4: Create the signed packet
        endpoint = 'https://{}{}'.format(host, canonical_uri)
        authorization_header = '{} Credential={}/{}, ' +\
            'SignedHeaders={}, Signature={}'
        authorization_header = authorization_header.format(
            algorithm, self.__accessKey, credential_scope, signed_headers,
            signature)
        headers = {
            'Host': host,
            'Content-type': content_type,
            'X-Amz-Date': amazon_date,
            'Authorization': authorization_header,
            'x-amz-content-sha256': payload_hash,
            'Content-Length': str(len(request_parameters))
        }

        # Send the packet and return the response
        # Use requests.Session() for HTTP keep-alive
        if self.__session is None:
            self.__session = requests.Session()
        return self.__session.post(endpoint, data=request_parameters,
                                   headers=headers)

    def _sha_hash(self, to_hash):
        """Hex SHA-256 digest of a text payload (SigV4 payload hash)."""
        return hashlib.sha256(to_hash.encode('utf-8')).hexdigest()

    def _sign(self, key, msg):
        """One HMAC-SHA256 step of the SigV4 key-derivation chain."""
        return hmac.new(key, msg.encode('utf-8'), hashlib.sha256).digest()

    def _get_signature_key(self, key, date_stamp, region_name, service_name):
        """Derive the SigV4 signing key: date -> region -> service ->
        'aws4_request', each step keyed by the previous digest."""
        k_date = self._sign(('AWS4{}'.format(key)).encode('utf-8'),
                            date_stamp)
        k_region = self._sign(k_date, region_name)
        k_service = self._sign(k_region, service_name)
        k_signing = self._sign(k_service, 'aws4_request')
        return k_signing
class Google(object):
    """Google based TTS (gTTS) and STT (SpeechRecognition) provider.

    Acts as a singleton via the (Python 2 style) __metaclass__ hook.
    Listening requires the microphone named in the config to be present;
    otherwise __init__ skips recognizer setup and Listen/ListenAsync raise.
    """
    __metaclass__ = Singleton

    # Class-level defaults; real values are loaded from config in __init__.
    __language_2letter_cc = 'de'
    __language_4letter_cc = 'de-DE'
    __audioPlayer = "afplay '{0}'"
    __apiKey = None
    __asyncInit = False
    # Shared result buffer filled by AsyncCallback, drained by ListenAsync.
    # (Class-level mutable — acceptable only because this is a singleton.)
    __asyncResultList = []

    def __init__(self):
        self.__language_2letter_cc = Config().Get("TextToSpeech",
                                                  "CountryCode2Letter")
        self.__language_4letter_cc = Config().Get("TextToSpeech",
                                                  "CountryCode4Letter")
        # External player command template; audio path substituted for {0}.
        self.__audioPlayer = Config().Get("TextToSpeech",
                                          "AudioPlayer") + " '{0}'"
        self.__asyncInit = False
        self.__apiKey = Config().Get("TextToSpeech", "GoogleAPIKey")
        # Empty key means "use the library's default/free API key".
        if (len(self.__apiKey) == 0):
            self.__apiKey = None
        # Resolve the configured microphone name to a device index.
        self.__microphoneID = None
        microphoneName = Config().Get("SpeechToText", "Microphone")
        for i, microphone_name in enumerate(
                sr.Microphone().list_microphone_names()):
            if microphone_name == microphoneName:
                self.__microphoneID = i
        if self.__microphoneID is None:
            FileLogger().Error(
                "Google Line 38: No microphone found - skip listen initialisation"
            )
            return
        self.__recognizer = sr.Recognizer()
        #Represents the minimum length of silence (in seconds) that will register as the
        #end of a phrase. Can be changed.
        #Smaller values result in the recognition completing more quickly, but might result
        #in slower speakers being cut off.
        self.__recognizer.pause_threshold = 0.5
        self.__recognizer.operation_timeout = 3
        self.__microphone = sr.Microphone(device_index=self.__microphoneID)
        # Calibrate the energy threshold against current ambient noise.
        with self.__microphone as source:
            self.__recognizer.dynamic_energy_threshold = True
            self.__recognizer.adjust_for_ambient_noise(source)

    def Speak(self, audioString, playAudio=False):
        """Synthesize audioString to a cached MP3 via gTTS and return its
        path.

        Only synthesizes on a cache miss; optionally plays the file via
        the configured player. Returns None for empty input.
        """
        if (len(audioString) == 0):
            return
        tmpAudioFile = os.path.join(Global.EmeraldPath, "Data", "TTS",
                                    ("Google_" + \
                                    self.__language_2letter_cc + "_" + \
                                    self.CleanString(audioString) + ".mp3"))
        if not os.path.isfile(tmpAudioFile):
            tts = gTTS(text=audioString, lang=self.__language_2letter_cc)
            tts.save(tmpAudioFile)
        if (playAudio):
            os.system(self.__audioPlayer.format(tmpAudioFile))
        return tmpAudioFile

    def AsyncCallback(self, recognizer, audio):
        """Background-listening callback: recognize the captured phrase and
        append any non-empty transcript to the shared result list.

        Recognition errors are logged and swallowed so the background
        listener keeps running.
        """
        if self.__microphoneID is None:
            raise Exception("Google: No microphone found - Exit")
        data = ""
        try:
            data = self.__recognizer.recognize_google(
                audio,
                key=self.__apiKey,
                language=self.__language_4letter_cc,
                show_all=False)
        except sr.UnknownValueError as e:
            FileLogger().Warn(
                "Google Line 83: Google Speech Recognition could not understand audio: {0}"
                .format(e))
        except sr.RequestError as e:
            FileLogger().Warn(
                "Google Line 85: Could not request results from Google Speech Recognition service: {0}"
                .format(e))
        except Exception as e:
            FileLogger().Warn(
                "Google Line 87: Error on executing Google Speech Recognition service: {0}"
                .format(e))
        if (len(data) > 0):
            self.__asyncResultList.append(data)

    def ListenAsync(self):
        """Non-blocking listen: start the background listener on first call,
        then return the oldest queued transcript, or "" if none is ready.

        Raises Exception when no microphone was found during init.
        """
        if self.__microphoneID is None:
            raise Exception("Google: No microphone found - Exit")
        if not self.__asyncInit:
            self.__recognizer.listen_in_background(self.__microphone,
                                                   self.AsyncCallback)
            self.__asyncInit = True
        if (len(self.__asyncResultList) > 0):
            return self.__asyncResultList.pop(0)
        return ""

    def Listen(self):
        """Blocking listen: capture one phrase from the microphone and
        return its transcript, or "" on recognition failure.

        Raises Exception when no microphone was found during init.
        """
        if self.__microphoneID is None:
            raise Exception("Google: No microphone found - Exit")
        with self.__microphone as source:
            self.__audio = self.__recognizer.listen(source)
        data = ""
        try:
            data = self.__recognizer.recognize_google(
                self.__audio,
                key=self.__apiKey,
                language=self.__language_4letter_cc,
                show_all=False)
        except sr.UnknownValueError as e:
            FileLogger().Warn(
                "Google Line 75: Google Speech Recognition could not understand audio: {0}"
                .format(e))
        except sr.RequestError as e:
            FileLogger().Warn(
                "Google Line 77: Could not request results from Google Speech Recognition service: {0}"
                .format(e))
        except Exception as e:
            FileLogger().Warn(
                "Google Line 81: Error on executing Google Speech Recognition service: {0}"
                .format(e))
        return data

    def CleanString(self, string):
        """Strip non-word characters and cap at 75 chars (marking trims)
        so the text can be used as a filename component."""
        data = re.sub(r'\W+', '', string)
        return (data[:75] + '_TRIMMED') if len(data) > 75 else data

    def GetAvailiabeMicrophones(self):
        # (Name keeps the original's 'Availiabe' spelling — part of the
        # public interface, so not renamed here.)
        return sr.Microphone().list_microphone_names()
class Microsoft(object):
    """Microsoft (Bing Speech) based TTS (Speak) and STT (Listen) provider.

    Acts as a singleton via the (Python 2 style) __metaclass__ hook.
    __init__ fetches a bearer token from the Cognitive Services token
    endpoint; Speak posts SSML to the Bing synthesize endpoint.
    """
    __metaclass__ = Singleton

    # Class-level defaults; real values are loaded from config in __init__.
    __language_2letter_cc = 'de'
    __language_4letter_cc = 'de-DE'
    __audioPlayer = "afplay '{0}'"
    __voiceGender = 'Male'
    __voiceName = 'Microsoft Server Speech Text to Speech Voice (de-DE, Stefan, Apollo)'
    __apiKey = None
    __accesstoken = None
    # SSML request template: {0}=language, {1}=gender, {2}=voice, {3}=text.
    __ssmlTemplate = """<speak version='1.0' xml:lang='{0}'>
        <voice xml:lang='{0}' xml:gender='{1}' name='{2}'>
            {3}
        </voice>
    </speak>"""

    def __init__(self):
        self.__language_2letter_cc = Config().Get("TextToSpeech",
                                                  "CountryCode2Letter")
        self.__language_4letter_cc = Config().Get("TextToSpeech",
                                                  "CountryCode4Letter")
        # External player command template; audio path substituted for {0}.
        self.__audioPlayer = Config().Get("TextToSpeech",
                                          "AudioPlayer") + " '{0}'"
        self.__voiceGender = Config().Get("TextToSpeech",
                                          "MicrosoftVoiceGender")
        self.__voiceName = Config().Get("TextToSpeech", "MicrosoftVoiceName")
        self.__apiKey = Config().Get("TextToSpeech", "MicrosoftAPIKey")
        # Exchange the subscription key for a bearer access token.
        # NOTE(review): the token is fetched once and never refreshed —
        # these tokens expire; confirm lifetime against current API docs.
        params = ""
        headers = {"Ocp-Apim-Subscription-Key": self.__apiKey}
        __AccessTokenHost = "api.cognitive.microsoft.com"
        path = "/sts/v1.0/issueToken"
        conn = httplib.HTTPSConnection(__AccessTokenHost)
        conn.request("POST", path, params, headers)
        response = conn.getresponse()
        data = response.read()
        conn.close()
        self.__accesstoken = data.decode("UTF-8")
        # Resolve the configured microphone name to a device index.
        self.__microphoneID = None
        microphoneName = Config().Get("SpeechToText", "Microphone")
        for i, microphone_name in enumerate(
                sr.Microphone().list_microphone_names()):
            if microphone_name == microphoneName:
                self.__microphoneID = i
        if self.__microphoneID is None:
            FileLogger().Error(
                "Microsoft Line 44: No microphone found - skip listen initialisation"
            )
            return
        self.__recognizer = sr.Recognizer()
        self.__microphone = sr.Microphone(device_index=self.__microphoneID)
        # Calibrate the energy threshold against current ambient noise.
        with self.__microphone as source:
            self.__recognizer.dynamic_energy_threshold = True
            self.__recognizer.adjust_for_ambient_noise(source)

    def Speak(self, audioString, playAudio=False):
        """Synthesize audioString to a cached MP3 file and return its path.

        Builds SSML from the template and posts it to the Bing synthesize
        endpoint on a cache miss; optionally plays the file via the
        configured player. Returns None for empty input.
        """
        if (len(audioString) == 0):
            return
        tmpAudioFile = os.path.join(Global.EmeraldPath, "Data", "TTS",
                                    ("Microsoft_" + \
                                    self.__language_2letter_cc + "_" + \
                                    self.CleanString(audioString) + ".mp3"))
        if not os.path.isfile(tmpAudioFile):
            ssml = self.__ssmlTemplate.format(self.__language_4letter_cc,
                                              self.__voiceGender,
                                              self.__voiceName, audioString)
            body = ssml  # .encode('utf8')
            headers = {
                "Content-type": "application/ssml+xml",
                "X-Microsoft-OutputFormat": "audio-16khz-32kbitrate-mono-mp3",
                "Authorization": "Bearer " + self.__accesstoken,
                "X-Search-AppId": "07D3234E49CE426DAA29772419F436CA",
                "X-Search-ClientID": "1ECFAE91408841A480F00935DC390960",
                "User-Agent": "TTSForPython"
            }
            # Connect to server to synthesize the wave
            conn = httplib.HTTPSConnection("speech.platform.bing.com")
            conn.request("POST", "/synthesize", body, headers)
            response = conn.getresponse()
            data = response.read()
            conn.close()
            with open(tmpAudioFile, "wb") as f:
                f.write(data)
        if (playAudio):
            os.system(self.__audioPlayer.format(tmpAudioFile))
        return tmpAudioFile

    def Listen(self):
        """Blocking listen: capture one phrase from the microphone and
        return its Bing-recognized transcript, or "" on failure.

        Raises Exception when no microphone was found during init.
        """
        if self.__microphoneID is None:
            raise Exception("Microsoft: No microphone found - Exit")
        with self.__microphone as source:
            self.__audio = self.__recognizer.listen(source)
        data = ""
        try:
            data = self.__recognizer.recognize_bing(
                self.__audio,
                key=self.__apiKey,
                language=self.__language_4letter_cc,
                show_all=False)
        except sr.UnknownValueError as e:
            FileLogger().Warn(
                "Microsoft Line 119: Microsoft Bing Voice Recognition could not understand audio: {0}"
                .format(e))
        except sr.RequestError as e:
            FileLogger().Warn(
                "Microsoft Line 121: Could not request results from Microsoft Bing Voice Recognition service: {0}"
                .format(e))
        return data

    def CleanString(self, string):
        """Strip non-word characters and cap at 75 chars (marking trims)
        so the text can be used as a filename component."""
        data = re.sub(r'\W+', '', string)
        return (data[:75] + '_TRIMMED') if len(data) > 75 else data

    def GetAvailiabeMicrophones(self):
        # (Name keeps the original's 'Availiabe' spelling — part of the
        # public interface, so not renamed here.)
        return sr.Microphone().list_microphone_names()