Example no. 1
0
def listen_for_text(context="none"):
    """Record microphone audio and stream it to IBM Watson Speech-to-Text.

    Opens a PyAudio input stream that feeds a queue-backed ``AudioSource``,
    runs the websocket recognition in a background thread, and waits until
    the module-level ``stop_now`` flag is set (presumably by the recognize
    callback — confirm against the callback's definition).

    Args:
        context: Recognition context hint; currently unused.

    Returns:
        The module-level ``text`` accumulated by the recognize callback.

    Raises:
        KeyError: if the ``IAM_APIKEY`` environment variable is not set.
    """
    global stop_now, text, q, service, audio_source
    import time  # local import: only needed for the polling sleep below
    stop_now = False
    iam_apikey = os.environ['IAM_APIKEY']
    # NOTE(review): url=/iam_apikey= is the pre-1.x watson SDK style; newer
    # SDKs use IAMAuthenticator + set_service_url (see other examples here).
    service = SpeechToTextV1(
        url='https://gateway-lon.watsonplatform.net/speech-to-text/api',
        iam_apikey=iam_apikey)
    CHUNK = 1024
    BUF_MAX_SIZE = CHUNK * 10
    q = Queue(maxsize=int(round(BUF_MAX_SIZE / CHUNK)))
    audio_source = AudioSource(q, True, True)
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 44100
    audio = pyaudio.PyAudio()
    stream = audio.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK,
                        stream_callback=pyaudio_callback,
                        start=False,
                        # assumes device index 2 is the microphone — TODO confirm
                        input_device_index=2)
    stream.start_stream()
    recognize_thread = Thread(target=recognize_using_websocket, args=())
    recognize_thread.start()
    # Sleep instead of a hot `pass` loop so we don't peg a CPU core while
    # waiting for the callback to flip stop_now.
    while not stop_now:
        time.sleep(0.05)
    stream.stop_stream()
    stream.close()
    audio.terminate()
    audio_source.completed_recording()
    return text
Example no. 2
0
def watson_streaming_stt(filename: str, lang: str, encoding: str) -> str:
    """Transcribe a WAV file with Watson STT, simulating a live stream.

    The file is chopped into chunks that are drip-fed into a buffer queue
    backing an ``AudioSource``, while a background thread runs the websocket
    recognition against that source.

    Note: ``lang`` and ``encoding`` are currently unused — presumably kept
    for interface compatibility with other transcription backends.

    Returns:
        The transcript collected by the recognize callback.
    """
    authenticator = IAMAuthenticator(WATSON_API_KEY)
    stt_service = SpeechToTextV1(authenticator=authenticator)
    stt_service.set_service_url(WATSON_STT_URL)

    # Queue-backed audio source that the recognition thread consumes from
    buffer_queue = Queue(maxsize=BUFFER_MAX_ELEMENT)
    audio_source = AudioSource(buffer_queue, True, True)

    # Collects interim and final recognition results
    callback = MyRecognizeCallback()

    # Load the audio payload and its sample rate
    buffer, rate = read_wav_file(filename)

    # Run recognition in the background while we feed the queue
    recognizer = Thread(
        target=stt_service.recognize_using_websocket,
        kwargs=dict(
            audio=audio_source,
            content_type='audio/l16; rate={}'.format(rate),
            recognize_callback=callback,
            interim_results=True,
        ),
    )
    recognizer.start()

    # Emulate a live microphone: push file chunks with a delay between them
    for chunk in simulate_stream(buffer, CHUNK_SIZE):
        buffer_queue.put(chunk)
        time.sleep(0.5)  # give a chance to callback

    # Signal end-of-audio and wait for the recognition thread to finish
    audio_source.completed_recording()
    recognizer.join()

    # send final result
    return callback.transcript
Example no. 3
0
class WatsonRecognizer:
    """Streams microphone audio to IBM Watson Speech-to-Text.

    PyAudio fills a bounded queue from the microphone; a separate process
    runs the websocket recognition against an ``AudioSource`` wrapping that
    queue, and transcripts are pushed onto ``transcription_q`` by the
    ``WatsonCallback``.
    """

    # Note: It will discard if the websocket client can't consume fast enough
    # So, increase the max size as per your choice
    CHUNK = 1024
    BUF_MAX_SIZE = CHUNK * 10

    # Variables for recording the speech
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 44100

    # SECURITY: a real credential must never live in source control. Kept
    # only as a backward-compatible default; pass api_key explicitly
    # (e.g. from an environment variable) instead.
    _DEFAULT_API_KEY = 'zPJij17cD8uAVUsaWqRgZPyGt9CH5q8XuwNGurfFhtXW'

    def __init__(self, api_key=None):
        """Set up the audio stream, the STT client and the recognition worker.

        Args:
            api_key: IBM Cloud IAM API key; falls back to the (insecure)
                hard-coded default when omitted, preserving old behavior.
        """
        # Buffer to store audio
        self.q = Queue(maxsize=self.BUF_MAX_SIZE)
        self.transcription_q = Queue()
        self.audio_source = AudioSource(self.q, True, True)
        self.callback = WatsonCallback(transcript_q=self.transcription_q, prints=True)

        # initialize speech to text service
        self.authenticator = IAMAuthenticator(api_key or self._DEFAULT_API_KEY)
        self.speech_to_text = SpeechToTextV1(authenticator=self.authenticator)

        # instantiate audio
        self.audio = pyaudio.PyAudio()

        # open stream using callback
        self.stream = self.audio.open(
            format=self.FORMAT,
            channels=self.CHANNELS,
            rate=self.RATE,
            input=True,
            frames_per_buffer=self.CHUNK,
            stream_callback=self.pyaudio_callback,
            start=False
        )

        # worker for the speech recognition
        # NOTE(review): this is a multiprocessing.Process fed from a queue
        # filled in *this* process; unless Queue is a multiprocessing queue
        # the audio may never reach the child — confirm, or use a Thread.
        self.thread = Process(target=self.speech_to_text.recognize_using_websocket, kwargs={
            "audio": self.audio_source,
            "content_type": "audio/l16; rate=44100",
            "recognize_callback": self.callback,
            "interim_results": True})

    def pyaudio_callback(self, in_data, frame_count, time_info, status):
        """PyAudio stream callback: enqueue raw frames, dropping when full.

        Uses put_nowait so a full queue raises Full and the chunk is
        discarded; a blocking put() would never raise and would stall the
        audio callback instead.
        """
        try:
            self.q.put_nowait(in_data)
        except Full:
            pass  # discard
        return None, pyaudio.paContinue

    def start(self):
        """Start the microphone stream and the recognition worker (idempotent)."""
        if not self.running:
            self.stream.start_stream()
            self.thread.start()

    def stop(self, timeout=20):
        """Stop recording and terminate the recognition worker.

        A terminated Process cannot be restarted, so start() will not work
        again after stop().
        """
        if self.running:
            self.stream.stop_stream()
            self.thread.terminate()
            self.thread.join(timeout=timeout)

    def close(self, timeout=20):
        """Tear down the worker, the audio stream and the queue source."""
        self.thread.terminate()
        self.thread.join(timeout=timeout)
        self.stream.stop_stream()
        self.stream.close()
        self.audio.terminate()
        self.audio_source.completed_recording()

    @property
    def running(self):
        """True while the recognition worker is alive."""
        return self.thread.is_alive()
import time  # for the idle wait below

# instantiate pyaudio
audio = pyaudio.PyAudio()

# open stream using callback
stream = audio.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK,
                    stream_callback=pyaudio_callback,
                    start=False)

#########################################################################
#### Start the recording and start service to recognize the stream ######
#########################################################################

print("Enter CTRL+C to end recording...")
stream.start_stream()

try:
    # NOTE(review): the target name carries a typo ("weboscket") — it must
    # match the function defined elsewhere in this script, so it is left
    # unchanged here.
    recognize_thread = Thread(target=recognize_using_weboscket, args=())
    recognize_thread.start()

    # Sleep instead of a hot `pass` loop so the main thread doesn't burn a
    # CPU core while recognition runs in the background thread.
    while True:
        time.sleep(0.1)
except KeyboardInterrupt:
    # stop recording
    stream.stop_stream()
    stream.close()
    audio.terminate()
    audio_source.completed_recording()
def start_stt():
    """Wire up the Tk mic button and stream microphone audio to Watson STT.

    Shows the mic-on button (clicking it calls ``end_stt``), binds <space>
    to synthesize Ctrl+C (which breaks the recording loop below), then
    records from the default input device into a queue consumed by the
    Watson websocket recognizer until interrupted.
    """
    import time  # local import, matching this function's local-import style

    mic_photo_on = PhotoImage(file=r"images/mic-on-50.png")
    mic_on = Button(window,
                    text="Mic",
                    image=mic_photo_on,
                    background="white",
                    activebackground="white",
                    border=0,
                    command=end_stt)
    mic_on.place(x=900, y=148)

    def space_break(event=None):
        """Simulate Ctrl+C so pressing <space> interrupts the record loop.

        Tk passes an Event object to bound handlers, so the parameter is
        required (the original zero-arg version raised TypeError on use).
        """
        keyboard = Controller()
        with keyboard.pressed(Key.control):
            keyboard.press('c')
            keyboard.release('c')

    window.bind('<space>', space_break)

    # Py2/Py3 compatible queue import
    try:
        from Queue import Queue, Full
    except ImportError:
        from queue import Queue, Full

    ###############################################
    #### Initialize the queue that stores audio ##
    ###############################################
    # If the websocket client can't keep up, excess chunks are discarded
    # (see pyaudio_callback); enlarge BUF_MAX_SIZE if that happens often.
    CHUNK = 1500
    BUF_MAX_SIZE = CHUNK * 20
    # Buffer to store audio
    q = Queue(maxsize=int(round(BUF_MAX_SIZE / CHUNK)))

    # Create an instance of AudioSource
    audio_source = AudioSource(q, True, True)

    ###############################################
    #### Prepare Speech to Text Service ########
    ###############################################

    # initialize speech to text service
    # SECURITY: credentials should come from the environment or a config
    # file, never from source control.
    speech_to_text = SpeechToTextV1(
        iam_apikey="ZAM8vwm2g3Dsnh1UPjOqyI-PloGvZ-PjSEAbjT_JHk1s",
        url="https://gateway-wdc.watsonplatform.net/speech-to-text/api")

    # define callback for the speech to text service
    class MyRecognizeCallback(RecognizeCallback):
        def __init__(self):
            RecognizeCallback.__init__(self)

        def on_transcription(self, transcript):
            print(transcript)

        def on_connected(self):
            print('Connection was successful')

        def on_error(self, error):
            print('Error received: {}'.format(error))

        def on_inactivity_timeout(self, error):
            print('Inactivity timeout: {}'.format(error))

        def on_listening(self):
            print('Service is listening')

        def on_hypothesis(self, hypothesis):
            print(hypothesis)

        def on_data(self, data):
            print(data)

        def on_close(self):
            print("Connection closed")

    # this function will initiate the recognize service and pass in the
    # AudioSource (typo "weboscket" fixed; both definition and use are local)
    def recognize_using_websocket(*args):
        mycallback = MyRecognizeCallback()
        speech_to_text.recognize_using_websocket(
            audio=audio_source,
            content_type='audio/l16; rate=44100',
            recognize_callback=mycallback,
            interim_results=True)

    ###############################################
    #### Prepare for recording using Pyaudio ##
    ###############################################

    # Variables for recording the speech
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 44100

    # define callback for pyaudio to store the recording in queue
    def pyaudio_callback(in_data, frame_count, time_info, status):
        try:
            # put_nowait so a full queue raises Full and the chunk is
            # dropped; a blocking put() never raises and would stall the
            # audio callback instead.
            q.put_nowait(in_data)
        except Full:
            pass  # discard
        return (None, pyaudio.paContinue)

    # instantiate pyaudio
    audio = pyaudio.PyAudio()

    # open stream using callback
    stream = audio.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK,
                        stream_callback=pyaudio_callback,
                        start=False)

    #########################################################################
    #### Start the recording and start service to recognize the stream ######
    #########################################################################

    print("Enter CTRL+C to end recording...")
    stream.start_stream()

    try:
        recognize_thread = Thread(target=recognize_using_websocket, args=())
        recognize_thread.start()

        # Sleep rather than spin so the process isn't pegged at 100% CPU.
        while True:
            time.sleep(0.1)
    except KeyboardInterrupt:
        # stop recording. Thread objects have no .stop() (the original call
        # raised AttributeError here); instead signal end-of-audio via
        # completed_recording() and join the worker.
        stream.stop_stream()
        stream.close()
        audio.terminate()
        audio_source.completed_recording()
        recognize_thread.join()
Example no. 6
0
def listen_to_mic(api_key, service_url):
    """Stream live microphone audio to IBM Watson Speech-to-Text.

    Records from the default input device into a bounded queue wrapped by
    an ``AudioSource``; a background thread runs the websocket recognition
    and prints final transcripts until the user presses Ctrl+C.

    Args:
        api_key: IBM Cloud IAM API key.
        service_url: Watson STT service endpoint URL.
    """
    import time  # local import: only needed for the idle wait below

    q = Queue(maxsize=int(round(BUF_MAX_SIZE / CHUNK)))
    audio_source = AudioSource(q, is_recording=True, is_buffer=True)

    # Prepare Speech to Text Service

    # initialize speech to text service
    authenticator = IAMAuthenticator(apikey=api_key)
    speech_to_text = SpeechToTextV1(authenticator=authenticator)
    speech_to_text.set_service_url(service_url)

    # define callback for the speech to text service
    class MyRecognizeCallback(RecognizeCallback):
        def __init__(self):
            RecognizeCallback.__init__(self)

        def on_transcription(self, transcript):
            print(transcript[0]['transcript'])

        def on_connected(self):
            print('Connection was successful')

        def on_error(self, error):
            print('Error received: {}'.format(error))

        def on_inactivity_timeout(self, error):
            print('Inactivity timeout: {}'.format(error))

        def on_listening(self):
            print('Service is listening')

        def on_hypothesis(self, hypothesis):
            pass  # interim hypotheses ignored

        def on_data(self, data):
            pass  # raw response payloads ignored

        def on_close(self):
            print("Connection closed")

    # this function will initiate the recognize service and pass in the AudioSource
    def recognize_using_websocket(*args):
        mycallback = MyRecognizeCallback()
        if FORMAT == pyaudio.paInt16:
            content_type = f"audio/l16; rate={RATE}"
        else:
            raise NotImplementedError(
                "only pyaudio.paInt16 format is supported")

        speech_to_text.recognize_using_websocket(audio=audio_source,
                                                 content_type=content_type,
                                                 recognize_callback=mycallback,
                                                 interim_results=True)

    # Prepare for recording using Pyaudio

    # define callback for pyaudio to store the recording in queue
    def pyaudio_callback(in_data, frame_count, time_info, status):
        try:
            # put_nowait so a full queue raises Full and the chunk is
            # dropped; a blocking put() never raises and would stall the
            # audio callback instead.
            q.put_nowait(in_data)
        except Full:
            pass  # discard
        return None, pyaudio.paContinue

    # instantiate pyaudio
    audio = pyaudio.PyAudio()

    # open stream using callback
    stream = audio.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        output=False,
                        frames_per_buffer=CHUNK,
                        stream_callback=pyaudio_callback,
                        start=False)

    # Start the recording and start service to recognize the stream
    print("Enter CTRL+C to end recording...")

    stream.start_stream()

    try:
        recognize_thread = Thread(target=recognize_using_websocket, args=())
        recognize_thread.start()

        # Sleep rather than spin so the main thread doesn't burn a CPU core.
        while True:
            time.sleep(0.1)
    except KeyboardInterrupt:
        # stop recording; completed_recording() lets the recognition thread
        # drain and exit, so it can be joined cleanly.
        stream.stop_stream()
        stream.close()
        audio.terminate()
        audio_source.completed_recording()
        recognize_thread.join()
Example no. 7
0
class ibmTranscribe:
    """Streams an audio device's buffer to IBM Watson STT, yielding responses."""

    def __init__(self, audio_device):
        """Read IBM credentials from file and prepare the websocket recognizer.

        Args:
            audio_device: source object exposing ``_streamBuff`` (queue of
                audio chunks) and ``_outputSampleRate`` — TODO confirm the
                exact contract against the device class.
        """
        self.is_supported = is_supported
        if not self.is_supported:
            return

        self.audio_device = audio_device

        # Parse KEY=VALUE pairs out of the IBM credentials file.
        APIKEY = None
        URL = None
        with open(speakreader.CONFIG.IBM_CREDENTIALS_FILE) as f:
            for line in f.read().splitlines():
                # split only once: API keys and URLs may themselves contain '='
                parm = line.split('=', 1)
                if parm[0] == 'SPEECH_TO_TEXT_APIKEY':
                    APIKEY = parm[1]
                if parm[0] == 'SPEECH_TO_TEXT_URL':
                    URL = parm[1]

        if APIKEY is None or URL is None:
            # logger.warn is deprecated (use warning); also disable this
            # transcriber instead of crashing on IAMAuthenticator(None) below.
            logger.warning('ibmTranscribe: APIKEY or URL not found in credentials file')
            self.is_supported = False
            return

        # initialize speech to text service
        self.authenticator = IAMAuthenticator(APIKEY)
        self.speech_to_text = SpeechToTextV1(authenticator=self.authenticator)
        self.speech_to_text.set_service_url(URL)
        self.mycallback = ProcessResponses()

        self.audio_source = AudioSource(audio_device._streamBuff, is_recording=True, is_buffer=True)

    def transcribe(self):
        """Generator yielding transcription responses.

        Runs the recognition in a worker thread and yields items from the
        callback's response queue until a ``None`` sentinel arrives, then
        closes the audio source and joins the worker.
        """
        if not self.is_supported:
            return
        logger.debug('ibmTranscribe.transcribe ENTER')

        recognize_thread = Thread(target=self.recognize_using_websocket, args=())
        recognize_thread.start()

        while True:
            response = self.mycallback.responseQueue.get()
            if response is None:  # sentinel: recognition finished
                break
            yield response

        self.audio_source.completed_recording()
        recognize_thread.join()
        logger.debug('ibmTranscribe.transcribe EXIT')

    # this function will initiate the recognize service and pass in the AudioSource
    def recognize_using_websocket(self, *args):
        """Blocking call that feeds the AudioSource to the Watson websocket API."""
        logger.debug("ibmTranscribe.recognize_using_websocket ENTER")
        self.speech_to_text.recognize_using_websocket(
            audio=self.audio_source,
            content_type='audio/l16; rate=%s' % self.audio_device._outputSampleRate,
            recognize_callback=self.mycallback,
            interim_results=True,
            max_alternatives=1,
            inactivity_timeout=-1,  # never time out on silence
            smart_formatting=True,
            word_alternatives_threshold=0.75,
            profanity_filter=bool(speakreader.CONFIG.ENABLE_CENSORSHIP),
        )
        logger.debug("ibmTranscribe.recognize_using_websocket EXIT")