Example #1
0
class TestVoice(Voice):
    """Spanish voice front-end.

    Records audio through PyAudio, recognizes speech with a PocketSphinx
    ``Decoder`` (JSGF grammars), and speaks either through a local pyttsx3
    engine or the Google Cloud Text-to-Speech service.
    """

    # playback / recording
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 44100  # audio sample rate in Hz
    CHUNK = 1024
    FILE_NAME = 'aux.wav'

    # recognition
    MODELDIR = "es-ES"
    GRAMMARDIR = "gram"

    # text to speech
    # BUG FIX: this constant was previously also named RATE, silently
    # overwriting the 44100 Hz sample rate above with 150 (a speech rate
    # in words per minute) and corrupting recording/playback timing.
    TTS_RATE = 150
    VOLUME = 0.9

    def __init__(self, file_name='aux.wav', raspi=False, local=True):
        """Set up audio I/O, the recognizer and the TTS backend.

        Args:
            file_name: WAV file used as scratch space for recordings.
            raspi: True when running on a Raspberry Pi (different mic index).
            local: use the local pyttsx3 engine instead of Google Cloud TTS.
        """
        self.FILE_NAME = file_name
        self.audio = pyaudio.PyAudio()
        self.raspi = raspi
        self.local = local

        # PocketSphinx decoder configuration (Spanish acoustic model).
        self.config = Decoder.default_config()
        self.config.set_string('-hmm',
                               os.path.join(self.MODELDIR, 'acoustic-model'))
        self.config.set_string(
            '-dict',
            os.path.join(self.MODELDIR, 'pronounciation-dictionary.dict'))
        self.config.set_string('-logfn', os.devnull)
        self.decoder = Decoder(self.config)

        # speech_recognition recognizer, calibrated against ambient noise.
        self.r = sr.Recognizer()
        print("adjunting...")
        with sr.Microphone() as source:
            self.r.adjust_for_ambient_noise(source)

        # tts backend selection
        if self.local:
            self.tts = pyttsx3.init()
            self.tts.setProperty('rate', self.TTS_RATE)
            self.tts.setProperty('volume', self.VOLUME)
            self.tts.setProperty('voice', 'spanish-latin-am')
        else:
            # Google Cloud TTS: Spanish female voice, MP3 output.
            self.tts_client = texttospeech.TextToSpeechClient()
            self.tts_voice = texttospeech.types.VoiceSelectionParams(
                language_code='es-ES',
                ssml_gender=texttospeech.enums.SsmlVoiceGender.FEMALE)
            self.tts_audio_config = texttospeech.types.AudioConfig(
                audio_encoding=texttospeech.enums.AudioEncoding.MP3)

    def speak(self, phrase):
        """Speak *phrase* aloud via the configured TTS backend."""
        print('decir: ' + phrase)
        if self.local:
            self.tts.say(phrase)
            self.tts.runAndWait()
        else:
            # Synthesize with Google Cloud TTS, write the MP3, play via mpg321.
            synthesis_input = texttospeech.types.SynthesisInput(text=phrase)
            response = self.tts_client.synthesize_speech(
                synthesis_input, self.tts_voice, self.tts_audio_config)
            audio_file = 'tts.mp3'
            # The response's audio_content is binary MP3 data.
            with open(audio_file, 'wb') as out:
                out.write(response.audio_content)
            print('reproducir voz sintetica')
            command = '/usr/bin/mpg321 ' + audio_file
            print(command)
            os.system(command)

    def play(self, filename):
        """Play a .wav file (via PyAudio) or an .mp3 file (via mpg321)."""
        print('reproduciendo archivo: ' + filename)
        extension = filename.split('.')[-1]
        if extension == 'wav':
            wf = wave.open(filename, 'rb')
            try:
                stream = self.audio.open(
                    format=self.audio.get_format_from_width(wf.getsampwidth()),
                    channels=wf.getnchannels(),
                    rate=wf.getframerate(),
                    output=True)
                data = wf.readframes(self.CHUNK)
                while len(data) > 0:
                    stream.write(data)
                    data = wf.readframes(self.CHUNK)
                stream.stop_stream()
                stream.close()
            finally:
                # BUG FIX: the wave file handle was previously never closed.
                wf.close()
        elif extension == 'mp3':
            # NOTE(review): filename is interpolated into a shell command;
            # never pass untrusted names here (shell-injection risk).
            command = '/usr/bin/mpg321 ' + filename
            print(command)
            os.system(command)

    def listen(self, duration=3):
        """Record *duration* seconds of audio to FILE_NAME and return it as
        raw 16 kHz / 16-bit data suitable for PocketSphinx."""
        # NOTE(review): input_device_index values (2 on the Pi, 7 otherwise)
        # are hard-coded for a specific machine - confirm per deployment.
        if self.raspi:
            stream = self.audio.open(format=self.FORMAT,
                                     channels=1,
                                     rate=self.RATE,
                                     input_device_index=2,
                                     input=True,
                                     frames_per_buffer=self.CHUNK)
        else:
            stream = self.audio.open(format=self.FORMAT,
                                     channels=self.CHANNELS,
                                     rate=self.RATE,
                                     input_device_index=7,
                                     input=True,
                                     frames_per_buffer=self.CHUNK)

        frames = []
        for _ in range(int(self.RATE / self.CHUNK * duration)):
            frames.append(stream.read(self.CHUNK, exception_on_overflow=False))

        stream.stop_stream()
        stream.close()

        # Persist the capture so speech_recognition can reload it from disk.
        wave_file = wave.open(self.FILE_NAME, 'wb')
        try:
            wave_file.setnchannels(1 if self.raspi else self.CHANNELS)
            wave_file.setsampwidth(self.audio.get_sample_size(self.FORMAT))
            wave_file.setframerate(self.RATE)
            wave_file.writeframes(b''.join(frames))
        finally:
            wave_file.close()

        with sr.AudioFile(self.FILE_NAME) as source:
            audio = self.r.record(source)

        # PocketSphinx expects 16 kHz, 16-bit samples.
        return audio.get_raw_data(convert_rate=16000, convert_width=2)

    def echo(self):
        """Play back the last recorded scratch file."""
        self.play(self.FILE_NAME)

    def recognize(self):
        """Listen on the microphone and return the decoded phrase,
        or None when nothing could be recognized."""
        with sr.Microphone() as source:
            audio = self.r.listen(source)

        try:
            self.decoder.start_utt()
            self.decoder.process_raw(audio.frame_data, False, True)
            self.decoder.end_utt()
            hyp = self.decoder.hyp()
            # BUG FIX: Decoder.hyp() returns None when there is no
            # hypothesis; guard explicitly instead of relying on the
            # broad except below to swallow the AttributeError.
            return hyp.hypstr if hyp is not None else None
        except Exception:
            # Best effort: any decoder failure counts as "not recognized".
            return None

    def loadGrammar(self, grammar):
        """Reconfigure the decoder with the JSGF grammar *grammar*.gram."""
        grammar_file = grammar + '.gram'
        c_string = os.path.join(self.GRAMMARDIR, grammar_file)
        print(c_string)
        self.config.set_string('-jsgf', c_string)
        self.decoder.reinit(self.config)

    def close(self):
        """Release the PyAudio backend."""
        self.audio.terminate()
Example #2
0
class TestVoice(Voice):
    """Spanish voice front-end (local-only variant).

    Records audio through PyAudio, recognizes speech with a PocketSphinx
    ``Decoder`` (JSGF grammars), and speaks through a local pyttsx3 engine.
    """

    # playback / recording
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 44100  # audio sample rate in Hz
    CHUNK = 1024
    FILE_NAME = 'aux.wav'

    # recognition
    MODELDIR = "es-ES"
    GRAMMARDIR = "gram"

    # text to speech
    # BUG FIX: this constant was previously also named RATE, silently
    # overwriting the 44100 Hz sample rate above with 150 (a speech rate
    # in words per minute) and corrupting recording/playback timing.
    TTS_RATE = 150
    VOLUME = 0.9

    def __init__(self, file_name='aux.wav', raspi=False):
        """Set up audio I/O, the recognizer and the pyttsx3 TTS engine.

        Args:
            file_name: WAV file used as scratch space for recordings.
            raspi: True when running on a Raspberry Pi (different mic index).
        """
        self.FILE_NAME = file_name
        self.audio = pyaudio.PyAudio()
        self.raspi = raspi

        # PocketSphinx decoder configuration (Spanish acoustic model).
        self.config = Decoder.default_config()
        self.config.set_string('-hmm',
                               os.path.join(self.MODELDIR, 'acoustic-model'))
        self.config.set_string(
            '-dict',
            os.path.join(self.MODELDIR, 'pronounciation-dictionary.dict'))
        self.config.set_string('-logfn', os.devnull)
        self.decoder = Decoder(self.config)

        # speech_recognition recognizer, calibrated against ambient noise.
        self.r = sr.Recognizer()
        print("adjunting...")
        with sr.Microphone() as source:
            self.r.adjust_for_ambient_noise(source)

        # local tts engine
        self.tts = pyttsx3.init()
        self.tts.setProperty('rate', self.TTS_RATE)
        self.tts.setProperty('volume', self.VOLUME)
        self.tts.setProperty('voice', 'spanish-latin-am')

    def speak(self, phrase):
        """Speak *phrase* aloud through the pyttsx3 engine (blocking)."""
        self.tts.say(phrase)
        self.tts.runAndWait()

    def play(self, filename):
        """Play a .wav file (via PyAudio) or an .mp3 file (via playsound)."""
        # BUG FIX: split('.')[1] picked the segment after the FIRST dot,
        # which is wrong for names/paths with more than one dot
        # (e.g. 'my.take.wav' -> 'take').  Use the last segment, as the
        # other TestVoice variant already does.
        extension = filename.split('.')[-1]
        if extension == 'wav':
            wf = wave.open(filename, 'rb')
            try:
                stream = self.audio.open(
                    format=self.audio.get_format_from_width(wf.getsampwidth()),
                    channels=wf.getnchannels(),
                    rate=wf.getframerate(),
                    output=True)
                data = wf.readframes(self.CHUNK)
                while len(data) > 0:
                    stream.write(data)
                    data = wf.readframes(self.CHUNK)
                stream.stop_stream()
                stream.close()
            finally:
                # BUG FIX: the wave file handle was previously never closed.
                wf.close()
        elif extension == 'mp3':
            playsound(filename)

    def listen(self, duration=3):
        """Record *duration* seconds of audio to FILE_NAME and return it as
        raw 16 kHz / 16-bit data suitable for PocketSphinx."""
        # NOTE(review): input_device_index values (2 on the Pi, 7 otherwise)
        # are hard-coded for a specific machine - confirm per deployment.
        if self.raspi:
            stream = self.audio.open(format=self.FORMAT,
                                     channels=1,
                                     rate=self.RATE,
                                     input_device_index=2,
                                     input=True,
                                     frames_per_buffer=self.CHUNK)
        else:
            stream = self.audio.open(format=self.FORMAT,
                                     channels=self.CHANNELS,
                                     rate=self.RATE,
                                     input_device_index=7,
                                     input=True,
                                     frames_per_buffer=self.CHUNK)

        frames = []
        for _ in range(int(self.RATE / self.CHUNK * duration)):
            frames.append(stream.read(self.CHUNK, exception_on_overflow=False))

        stream.stop_stream()
        stream.close()

        # Persist the capture so speech_recognition can reload it from disk.
        wave_file = wave.open(self.FILE_NAME, 'wb')
        try:
            wave_file.setnchannels(1 if self.raspi else self.CHANNELS)
            wave_file.setsampwidth(self.audio.get_sample_size(self.FORMAT))
            wave_file.setframerate(self.RATE)
            wave_file.writeframes(b''.join(frames))
        finally:
            wave_file.close()

        with sr.AudioFile(self.FILE_NAME) as source:
            audio = self.r.record(source)

        # PocketSphinx expects 16 kHz, 16-bit samples.
        return audio.get_raw_data(convert_rate=16000, convert_width=2)

    def echo(self):
        """Play back the last recorded scratch file."""
        self.play(self.FILE_NAME)

    def recognize(self):
        """Listen on the microphone and return the decoded phrase,
        or None when nothing could be recognized."""
        with sr.Microphone() as source:
            audio = self.r.listen(source)

        try:
            self.decoder.start_utt()
            self.decoder.process_raw(audio.frame_data, False, True)
            self.decoder.end_utt()
            hyp = self.decoder.hyp()
            # BUG FIX: Decoder.hyp() returns None when there is no
            # hypothesis; guard explicitly instead of relying on the
            # broad except below to swallow the AttributeError.
            return hyp.hypstr if hyp is not None else None
        except Exception:
            # Best effort: any decoder failure counts as "not recognized".
            return None

    def loadGrammar(self, grammar):
        """Reconfigure the decoder with the JSGF grammar *grammar*.gram."""
        grammar_file = grammar + '.gram'
        c_string = os.path.join(self.GRAMMARDIR, grammar_file)
        print(c_string)
        self.config.set_string('-jsgf', c_string)
        self.decoder.reinit(self.config)

    def close(self):
        """Release the PyAudio backend."""
        self.audio.terminate()