Example no. 1
def callback(data):
    global GLOBAL_FileNamePublisher

    dataParts = data.data.split("|")

    if dataParts[0] != "TTS":
        return

    ttsProvider = TTSMemory().GetString("TTSProvider")
    usePygame = TTSMemory().GetBoolean("UsePygame")

    FileLogger().Info("TTS, callback(), Provider: {0}".format(ttsProvider))

    try:
        # The stop trigger arrives as the payload after the "TTS|" prefix,
        # so check the split message rather than the raw ROS message object.
        if usePygame and dataParts[1] == "TRIGGER_STOP_AUDIO":
            SoundMixer().Stop()
            return

        # data is reused below to hold the path of the generated audio file.
        if ttsProvider.lower() == "google":
            data = Google().Speak(dataParts[1])
        elif ttsProvider.lower() == "microsoft":
            data = Microsoft().Speak(dataParts[1])
        elif ttsProvider.lower() == "ivona":
            data = Ivona().Speak(dataParts[1])
        elif ttsProvider.lower() == "watson":
            data = Watson().Speak(dataParts[1])

        try:
            audio = MP3(data)
            delay = Config().GetInt("TextToSpeech", "IntermediateAudioDelay")
            TTSMemory().Set("TTS.Until",
                            (rospy.Time.now().to_sec() +
                             int(round(audio.info.length)) + delay))
        except Exception as e:
            FileLogger().Warn(
                "TTS, callback() - Error on getting audio duration: {0}".
                format(e))

        if usePygame:
            SoundMixer().Play(data)
        else:
            audioPlayer = Config().Get("TextToSpeech",
                                       "AudioPlayer") + " '{0}'"
            os.system(audioPlayer.format(data))

        FileLogger().Info("TTS, callback(), Play Audio: {0}".format(data))
        GLOBAL_FileNamePublisher.publish("TTS|{0}".format(data))

        user = User().LoadObject()
        if (user.GetName() is not None):
            user.UpdateSpokenTo()
            user.Update()

    except Exception as e:
        FileLogger().Error(
            "TTS, callback(), Error on processing TTS data: {0}".format(e))
Example no. 2
    def GetName(self):
        if self.Formal and self.LastName:
            if self.Gender.lower() == "female":
                nameWrapper = Config().Get("DEFAULT",
                                           "FormalFormOfAddressFemale")
            else:
                nameWrapper = Config().Get("DEFAULT",
                                           "FormalFormOfAddressMale")
            return nameWrapper.format(self.LastName)

        elif not self.Name and self.FirstName:
            return self.FirstName
        elif self.Name:
            return self.Name
        return None
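
For illustration, assuming the (hypothetical) config value FormalFormOfAddressMale = "Herr {0}" and a loaded User instance:

# Hypothetical config value: FormalFormOfAddressMale = "Herr {0}"
user.Formal = True
user.Gender = "male"
user.LastName = "Schmidt"
print(user.GetName())  # -> "Herr Schmidt"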
Example no. 3
class Watson():
    __metaclass__ = Singleton

    def __init__(self):
        self.CHUNK = 1024
        self.BUF_MAX_SIZE = self.CHUNK * 10
        self.q = Queue(maxsize=int(round(self.BUF_MAX_SIZE / self.CHUNK)))
        self.audio_source = AudioSource(self.q, True, True)
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.RATE = 44100

        self.__apikey_stt = Config().Get("SpeechToText", "WatsonSTTAPIKey")
        self.__url_stt = Config().Get("SpeechToText", "WatsonSTTUrl")

        self.__apikey_tts = Config().Get("TextToSpeech", "WatsonTTSAPIKey")
        self.__url_tts = Config().Get("TextToSpeech", "WatsonTTSUrl")

        self.__voiceName = Config().Get("TextToSpeech", "WatsonVoiceName")

        self.__language_2letter_cc = Config().Get("SpeechToText",
                                                  "CountryCode2Letter")
        self.__language_4letter_cc = Config().Get("SpeechToText",
                                                  "CountryCode4Letter")
        self.__audioPlayer = Config().Get("TextToSpeech",
                                          "AudioPlayer") + " '{0}'"

        self.text_to_speech = TextToSpeechV1(url=self.__url_tts,
                                             iam_apikey=self.__apikey_tts)
        self.text_to_speech.set_default_headers(
            {'x-watson-learning-opt-out': "true"})

        self.speech_to_text = SpeechToTextV1(url=self.__url_stt,
                                             iam_apikey=self.__apikey_stt)
        self.speech_to_text.set_default_headers(
            {'x-watson-learning-opt-out': "true"})

        self.audio = pyaudio.PyAudio()

        # open stream using callback
        self.stream = self.audio.open(format=self.FORMAT,
                                      channels=self.CHANNELS,
                                      rate=self.RATE,
                                      input=True,
                                      frames_per_buffer=self.CHUNK,
                                      stream_callback=self.pyaudio_callback,
                                      start=False)
        try:
            rospy.init_node('STT_watson_node', anonymous=True)
        except Exception:
            FileLogger().Info('already initialized')

    def Speak(self, audioString, playAudio=False):
        if (len(audioString) == 0):
            return
        tmpAudioFile = os.path.join(Global.EmeraldPath, "Data", "TTS", ("Watson_" + \
            self.__language_2letter_cc + "_" + \
            self.CleanString(audioString) + ".mp3"))

        if not os.path.isfile(tmpAudioFile):
            # tmpAudioFile is already an absolute path, so open it directly
            # (joining with dirname(__file__) had no effect for absolute paths).
            with open(tmpAudioFile, 'wb') as audio_file:
                response = self.text_to_speech.synthesize(
                    audioString, accept='audio/mp3',
                    voice=self.__voiceName).get_result()
                audio_file.write(response.content)
        if (playAudio):
            os.system(self.__audioPlayer.format(tmpAudioFile))
        return tmpAudioFile

    def Listen(self):
        self.stream.start_stream()

        try:
            while True:
                recognize_thread = Thread(
                    target=self.recognize_using_websocket, args=())
                recognize_thread.start()

                recognize_thread.join()

        except KeyboardInterrupt:
            # stop recording
            self.audio_source.completed_recording()
            self.stream.stop_stream()
            self.stream.close()
            self.audio.terminate()

    def CleanString(self, string):
        data = re.sub(r'\W+', '', string)
        return (data[:75] + '_TRIMMED') if len(data) > 75 else data

    def recognize_using_websocket(self, *args):
        mycallback = MyRecognizeCallback()
        self.speech_to_text.recognize_using_websocket(
            audio=self.audio_source,
            content_type='audio/l16; rate=44100',
            recognize_callback=mycallback,
            interim_results=True,
            model='{0}_BroadbandModel'.format(self.__language_4letter_cc),
            smart_formatting=True)

    def pyaudio_callback(self, in_data, frame_count, time_info, status):
        try:
            self.q.put(in_data)
        except Full:
            pass
        return (None, pyaudio.paContinue)
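
MyRecognizeCallback is referenced in recognize_using_websocket but not shown here. A minimal sketch of such a callback, assuming the RecognizeCallback interface from the same ibm_watson websocket module that AudioSource comes from; the logging calls are illustrative:

from ibm_watson.websocket import RecognizeCallback

class MyRecognizeCallback(RecognizeCallback):
    def __init__(self):
        RecognizeCallback.__init__(self)

    def on_transcription(self, transcript):
        # Final transcription results arrive here.
        FileLogger().Info("Watson STT transcript: {0}".format(transcript))

    def on_hypothesis(self, hypothesis):
        # Interim results (interim_results=True above) arrive here.
        pass

    def on_error(self, error):
        FileLogger().Warn("Watson STT error: {0}".format(error))

    def on_inactivity_timeout(self, error):
        FileLogger().Warn("Watson STT inactivity timeout: {0}".format(error))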
Example no. 4
class Ivona(object):
    __metaclass__ = Singleton

    __language_2letter_cc = 'de'
    __language_4letter_cc = 'de-DE'
    __audioPlayer = "afplay '{0}'"

    __voiceGender = 'Male'
    __voiceName = 'Hans'
    __accessKey = None
    __secretKey = None
    __speechRate = None
    __sentenceBreak = None
    __paragraphBreak = None

    __session = None

    __host = None
    __region = None
    __region_options = {
        'us-east': 'us-east-1',
        'us-west': 'us-west-2',
        'eu-west': 'eu-west-1',
    }

    def __setRegion(self, region_name):
        self.__region = self.__region_options.get(region_name, 'eu-west-1')
        self.__host = 'tts.{}.ivonacloud.com'.format(self.__region)

    __codec = None
    __codec_options = {
        'mp3': 'mp3',
        'ogg': 'ogg',
        'mp4': 'mp4',
    }

    def __setCodec(self, codec_name):
        self.__codec = self.__codec_options.get(codec_name, 'mp3')

    def __init__(self):
        self.__language_2letter_cc = Config().Get("TextToSpeech",
                                                  "CountryCode2Letter")
        self.__language_4letter_cc = Config().Get("TextToSpeech",
                                                  "CountryCode4Letter")
        self.__audioPlayer = Config().Get("TextToSpeech",
                                          "AudioPlayer") + " '{0}'"

        self.__voiceGender = Config().Get("TextToSpeech", "IvonaVoiceGender")
        self.__voiceName = Config().Get("TextToSpeech", "IvonaVoiceName")
        self.__accessKey = Config().Get("TextToSpeech", "IvonaAccessKey")
        self.__secretKey = Config().Get("TextToSpeech", "IvonaSecretKey")

        self.__speechRate = 'medium'  # x-slow - slow - medium - fast - x-fast
        self.__sentenceBreak = 400
        self.__paragraphBreak = 650
        self.__setRegion('eu-west')
        self.__setCodec('mp3')

    def Speak(self, audioString, playAudio=False):
        if (len(audioString) == 0):
            return
        tmpAudioFile = os.path.join(Global.EmeraldPath, "Data", "TTS", ("Ivona_" + \
            self.__language_2letter_cc + "_" + \
            self.CleanString(audioString) + ".mp3"))

        if not os.path.isfile(tmpAudioFile):
            r = self._send_amazon_auth_packet_v4(
                'POST', 'tts', 'application/json', '/CreateSpeech', '',
                self._generate_payload(audioString), self.__region,
                self.__host)
            if r.content.startswith(b'{'):
                # A JSON body indicates an error response, not audio data.
                raise Exception('Error fetching voice: {}'.format(r.content))
            # Write the cache file only after a successful response so a failed
            # request does not leave an empty file behind.
            with open(tmpAudioFile, "wb") as f:
                f.write(r.content)

        if (playAudio):
            os.system(self.__audioPlayer.format(tmpAudioFile))
        return tmpAudioFile

    def GetVoices(self):
        """Returns all the possible voices
        """
        r = self._send_amazon_auth_packet_v4('POST', 'tts', 'application/json',
                                             '/ListVoices', '', '',
                                             self.__region, self.__host)
        return r.json()

    def CleanString(self, string):
        data = re.sub(r'\W+', '', string)
        return (data[:75] + '_TRIMMED') if len(data) > 75 else data

    def _generate_payload(self, text_to_speak):
        return json.dumps({
            'Input': {
                "Type": "application/ssml+xml",
                'Data': text_to_speak
            },
            'OutputFormat': {
                'Codec': self.__codec.upper()
            },
            'Parameters': {
                'Rate': self.__speechRate,
                'SentenceBreak': self.__sentenceBreak,
                'ParagraphBreak': self.__paragraphBreak
            },
            'Voice': {
                'Name': self.__voiceName,
                'Language': self.__language_4letter_cc,
                'Gender': self.__voiceGender
            }
        })

    def _send_amazon_auth_packet_v4(self, method, service, content_type,
                                    canonical_uri, canonical_querystring,
                                    request_parameters, region, host):
        """Send a packet to a given amazon server using Amazon's signature Version 4,
        Returns the resulting response object
        """

        algorithm = 'AWS4-HMAC-SHA256'
        signed_headers = 'content-type;host;x-amz-content-sha256;x-amz-date'

        # Create date for headers and the credential string
        t = datetime.datetime.utcnow()
        amazon_date = t.strftime('%Y%m%dT%H%M%SZ')
        date_stamp = t.strftime('%Y%m%d')

        # Step 1: Create canonical request
        payload_hash = self._sha_hash(request_parameters)

        canonical_headers = 'content-type:{}\n'.format(content_type)
        canonical_headers += 'host:{}\n'.format(host)
        canonical_headers += 'x-amz-content-sha256:{}\n'.format(payload_hash)
        canonical_headers += 'x-amz-date:{}\n'.format(amazon_date)

        canonical_request = '\n'.join([
            method, canonical_uri, canonical_querystring, canonical_headers,
            signed_headers, payload_hash
        ])

        # Step 2: Create the string to sign
        credential_scope = '{}/{}/{}/aws4_request'.format(
            date_stamp, region, service)
        string_to_sign = '\n'.join([
            algorithm, amazon_date, credential_scope,
            self._sha_hash(canonical_request)
        ])

        # Step 3: Calculate the signature
        signing_key = self._get_signature_key(self.__secretKey, date_stamp,
                                              region, service)
        signature = hmac.new(signing_key, string_to_sign.encode('utf-8'),
                             hashlib.sha256).hexdigest()

        # Step 4: Create the signed packet
        endpoint = 'https://{}{}'.format(host, canonical_uri)
        authorization_header = '{} Credential={}/{}, ' +\
            'SignedHeaders={}, Signature={}'
        authorization_header = authorization_header.format(
            algorithm, self.__accessKey, credential_scope, signed_headers,
            signature)
        headers = {
            'Host': host,
            'Content-type': content_type,
            'X-Amz-Date': amazon_date,
            'Authorization': authorization_header,
            'x-amz-content-sha256': payload_hash,
            'Content-Length': str(len(request_parameters))
        }

        # Send the packet and return the response
        # Use requests.Session() for HTTP keep-alive
        if self.__session is None:
            self.__session = requests.Session()

        return self.__session.post(endpoint,
                                   data=request_parameters,
                                   headers=headers)

    def _sha_hash(self, to_hash):
        return hashlib.sha256(to_hash.encode('utf-8')).hexdigest()

    def _sign(self, key, msg):
        return hmac.new(key, msg.encode('utf-8'), hashlib.sha256).digest()

    def _get_signature_key(self, key, date_stamp, region_name, service_name):
        k_date = self._sign(('AWS4{}'.format(key)).encode('utf-8'), date_stamp)
        k_region = self._sign(k_date, region_name)
        k_service = self._sign(k_region, service_name)
        k_signing = self._sign(k_service, 'aws4_request')
        return k_signing
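
For reference, the body that _generate_payload builds for a short sentence looks roughly like the sketch below; the concrete values shown are the class-level defaults and would normally come from Config:

# Illustrative result of Ivona()._generate_payload("Guten Tag") with default settings:
# {
#   "Input": {"Type": "application/ssml+xml", "Data": "Guten Tag"},
#   "OutputFormat": {"Codec": "MP3"},
#   "Parameters": {"Rate": "medium", "SentenceBreak": 400, "ParagraphBreak": 650},
#   "Voice": {"Name": "Hans", "Language": "de-DE", "Gender": "Male"}
# }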
Example no. 5
class Google(object):
    __metaclass__ = Singleton

    __language_2letter_cc = 'de'
    __language_4letter_cc = 'de-DE'
    __audioPlayer = "afplay '{0}'"
    __apiKey = None

    __asyncInit = False
    __asyncResultList = []

    def __init__(self):
        self.__language_2letter_cc = Config().Get("TextToSpeech",
                                                  "CountryCode2Letter")
        self.__language_4letter_cc = Config().Get("TextToSpeech",
                                                  "CountryCode4Letter")
        self.__audioPlayer = Config().Get("TextToSpeech",
                                          "AudioPlayer") + " '{0}'"

        self.__asyncInit = False

        self.__apiKey = Config().Get("TextToSpeech", "GoogleAPIKey")
        if (len(self.__apiKey) == 0):
            self.__apiKey = None

        self.__microphoneID = None
        microphoneName = Config().Get("SpeechToText", "Microphone")
        for i, microphone_name in enumerate(
                sr.Microphone().list_microphone_names()):
            if microphone_name == microphoneName:
                self.__microphoneID = i

        if self.__microphoneID is None:
            FileLogger().Error(
                "Google Line 38: No microphone found - skip listen initialisation"
            )
            return

        self.__recognizer = sr.Recognizer()
        #Represents the minimum length of silence (in seconds) that will register as the
        #end of a phrase. Can be changed.
        #Smaller values result in the recognition completing more quickly, but might result
        #in slower speakers being cut off.
        self.__recognizer.pause_threshold = 0.5
        self.__recognizer.operation_timeout = 3

        self.__microphone = sr.Microphone(device_index=self.__microphoneID)

        with self.__microphone as source:
            self.__recognizer.dynamic_energy_threshold = True
            self.__recognizer.adjust_for_ambient_noise(source)

    def Speak(self, audioString, playAudio=False):
        if (len(audioString) == 0):
            return
        tmpAudioFile = os.path.join(Global.EmeraldPath, "Data", "TTS", ("Google_" + \
            self.__language_2letter_cc + "_" + \
            self.CleanString(audioString) + ".mp3"))

        if not os.path.isfile(tmpAudioFile):
            tts = gTTS(text=audioString, lang=self.__language_2letter_cc)
            tts.save(tmpAudioFile)

        if (playAudio):
            os.system(self.__audioPlayer.format(tmpAudioFile))
        return tmpAudioFile

    def AsyncCallback(self, recognizer, audio):
        if self.__microphoneID is None:
            raise Exception("Google: No microphone found - Exit")

        data = ""
        try:
            data = self.__recognizer.recognize_google(
                audio,
                key=self.__apiKey,
                language=self.__language_4letter_cc,
                show_all=False)
        except sr.UnknownValueError as e:
            FileLogger().Warn(
                "Google Line 83: Google Speech Recognition could not understand audio: {0}"
                .format(e))
        except sr.RequestError as e:
            FileLogger().Warn(
                "Google Line 85: Could not request results from Google Speech Recognition service: {0}"
                .format(e))
        except Exception as e:
            FileLogger().Warn(
                "Google Line 87: Error on executing Google Speech Recognition service: {0}"
                .format(e))

        if (len(data) > 0):
            self.__asyncResultList.append(data)

    def ListenAsync(self):
        if self.__microphoneID is None:
            raise Exception("Google: No microphone found - Exit")

        if not self.__asyncInit:
            self.__recognizer.listen_in_background(self.__microphone,
                                                   self.AsyncCallback)
            self.__asyncInit = True

        if (len(self.__asyncResultList) > 0):
            return self.__asyncResultList.pop(0)
        return ""

    def Listen(self):
        if self.__microphoneID is None:
            raise Exception("Google: No microphone found - Exit")

        with self.__microphone as source:
            self.__audio = self.__recognizer.listen(source)

            data = ""
            try:
                data = self.__recognizer.recognize_google(
                    self.__audio,
                    key=self.__apiKey,
                    language=self.__language_4letter_cc,
                    show_all=False)
            except sr.UnknownValueError as e:
                FileLogger().Warn(
                    "Google Line 75: Google Speech Recognition could not understand audio: {0}"
                    .format(e))
            except sr.RequestError as e:
                FileLogger().Warn(
                    "Google Line 77: Could not request results from Google Speech Recognition service: {0}"
                    .format(e))
            except Exception as e:
                FileLogger().Warn(
                    "Google Line 81: Error on executing Google Speech Recognition service: {0}"
                    .format(e))

            return data

    def CleanString(self, string):
        data = re.sub(r'\W+', '', string)
        return (data[:75] + '_TRIMMED') if len(data) > 75 else data

    def GetAvailableMicrophones(self):
        return sr.Microphone().list_microphone_names()
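
A short consumption sketch for the background listener: the first ListenAsync call starts listen_in_background, and later calls only drain whatever AsyncCallback has queued. The polling loop below is illustrative:

import time

google = Google()
while True:
    text = google.ListenAsync()
    if text:
        print("Recognized: {0}".format(text))
    time.sleep(0.5)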
Example no. 6
class Microsoft(object):
    __metaclass__ = Singleton

    __language_2letter_cc = 'de'
    __language_4letter_cc = 'de-DE'
    __audioPlayer = "afplay '{0}'"

    __voiceGender = 'Male'
    __voiceName = 'Microsoft Server Speech Text to Speech Voice (de-DE, Stefan, Apollo)'
    __apiKey = None
    __accesstoken = None

    __ssmlTemplate = """<speak version='1.0' xml:lang='{0}'>
        <voice xml:lang='{0}' xml:gender='{1}' name='{2}'>
          {3}
        </voice>
      </speak>"""

    def __init__(self):
        self.__language_2letter_cc = Config().Get("TextToSpeech",
                                                  "CountryCode2Letter")
        self.__language_4letter_cc = Config().Get("TextToSpeech",
                                                  "CountryCode4Letter")
        self.__audioPlayer = Config().Get("TextToSpeech",
                                          "AudioPlayer") + " '{0}'"

        self.__voiceGender = Config().Get("TextToSpeech",
                                          "MicrosoftVoiceGender")
        self.__voiceName = Config().Get("TextToSpeech", "MicrosoftVoiceName")
        self.__apiKey = Config().Get("TextToSpeech", "MicrosoftAPIKey")

        params = ""
        headers = {"Ocp-Apim-Subscription-Key": self.__apiKey}

        __AccessTokenHost = "api.cognitive.microsoft.com"
        path = "/sts/v1.0/issueToken"

        conn = httplib.HTTPSConnection(__AccessTokenHost)
        conn.request("POST", path, params, headers)
        response = conn.getresponse()

        data = response.read()
        conn.close()

        self.__accesstoken = data.decode("UTF-8")
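        # Note: tokens from the issueToken endpoint are short-lived (typically
        # around 10 minutes), so a long-running instance would eventually need
        # to request a fresh one.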

        self.__microphoneID = None
        microphoneName = Config().Get("SpeechToText", "Microphone")
        for i, microphone_name in enumerate(
                sr.Microphone().list_microphone_names()):
            if microphone_name == microphoneName:
                self.__microphoneID = i

        if self.__microphoneID is None:
            FileLogger().Error(
                "Microsoft Line 44: No microphone found - skip listen initialisation"
            )
            return

        self.__recognizer = sr.Recognizer()
        self.__microphone = sr.Microphone(device_index=self.__microphoneID)

        with self.__microphone as source:
            self.__recognizer.dynamic_energy_threshold = True
            self.__recognizer.adjust_for_ambient_noise(source)

    def Speak(self, audioString, playAudio=False):
        if (len(audioString) == 0):
            return
        tmpAudioFile = os.path.join(Global.EmeraldPath, "Data", "TTS", ("Microsoft_" + \
            self.__language_2letter_cc + "_" + \
            self.CleanString(audioString) + ".mp3"))

        if not os.path.isfile(tmpAudioFile):
            ssml = self.__ssmlTemplate.format(self.__language_4letter_cc,
                                              self.__voiceGender,
                                              self.__voiceName, audioString)
            body = ssml  # .encode('utf8')

            headers = {
                "Content-type": "application/ssml+xml",
                "X-Microsoft-OutputFormat": "audio-16khz-32kbitrate-mono-mp3",
                "Authorization": "Bearer " + self.__accesstoken,
                "X-Search-AppId": "07D3234E49CE426DAA29772419F436CA",
                "X-Search-ClientID": "1ECFAE91408841A480F00935DC390960",
                "User-Agent": "TTSForPython"
            }

            # Connect to server to synthesize the wave
            conn = httplib.HTTPSConnection("speech.platform.bing.com")
            conn.request("POST", "/synthesize", body, headers)
            response = conn.getresponse()

            data = response.read()
            conn.close()

            with open(tmpAudioFile, "wb") as f:
                f.write(data)

        if (playAudio):
            os.system(self.__audioPlayer.format(tmpAudioFile))
        return tmpAudioFile

    def Listen(self):
        if self.__microphoneID is None:
            raise Exception("Microsoft: No microphone found - Exit")

        with self.__microphone as source:
            self.__audio = self.__recognizer.listen(source)

            data = ""
            try:
                data = self.__recognizer.recognize_bing(
                    self.__audio,
                    key=self.__apiKey,
                    language=self.__language_4letter_cc,
                    show_all=False)
            except sr.UnknownValueError as e:
                FileLogger().Warn(
                    "Microsoft Line 119: Microsoft Bing Voice Recognition could not understand audio: {0}"
                    .format(e))
            except sr.RequestError as e:
                FileLogger().Warn(
                    "Microsoft Line 121: Could not request results from Microsoft Bing Voice Recognition service: {0}"
                    .format(e))

            return data

    def CleanString(self, string):
        data = re.sub(r'\W+', '', string)
        return (data[:75] + '_TRIMMED') if len(data) > 75 else data

    def GetAvailableMicrophones(self):
        return sr.Microphone().list_microphone_names()