Example #1
0
def speech_recognize_once_from_file_with_customized_model():
    """Run one-shot speech recognition on an audio file with a custom model.

    The recognizer is configured from the module-level ``speech_key`` and
    ``service_region``, reads audio from ``weatherfilename`` and is pointed at
    a customized-model endpoint. The outcome is printed; nothing is returned.
    """
    # <SpeechRecognitionUsingCustomizedModel>
    config = speechsdk.SpeechConfig(subscription=speech_key,
                                    region=service_region)

    # Placeholder: replace with the endpoint ID of your own customized (CRIS) model.
    config.endpoint_id = "YourEndpointId"

    # The audio comes from a file; the default recognition language is "en-us".
    file_input = speechsdk.audio.AudioConfig(filename=weatherfilename)
    recognizer = speechsdk.SpeechRecognizer(speech_config=config,
                                            audio_config=file_input)

    # recognize_once() returns after a single utterance: it ends on trailing
    # silence or after at most 15 seconds of audio. It therefore only suits
    # single-shot input such as a command or query; use
    # start_continuous_recognition() for long-running, multi-utterance audio.
    outcome = recognizer.recognize_once()
    reason = outcome.reason

    if reason == speechsdk.ResultReason.RecognizedSpeech:
        print(f"Recognized: {outcome.text}")
        return
    if reason == speechsdk.ResultReason.NoMatch:
        print(f"No speech could be recognized: {outcome.no_match_details}")
        return
    if reason != speechsdk.ResultReason.Canceled:
        return

    details = outcome.cancellation_details
    print(f"Speech Recognition canceled: {details.reason}")
    if details.reason == speechsdk.CancellationReason.Error:
        print(f"Error details: {details.error_details}")
def transcribe_streaming(stream_file, result_file):
    """Continuously transcribe ``stream_file`` and record every event.

    Word-level timestamps are requested, and each recognizer event is
    formatted and passed to ``store(message, result_file)``. The call blocks
    until the session stops or recognition is canceled.

    Args:
        stream_file: Path of the audio file to transcribe.
        result_file: Destination handed unchanged to ``store``.
    """
    import time

    # Microsoft authentication - put your own API key here ('uksouth' is the
    # UK region chosen by the original author).
    speech_key, service_region = "your key", "uksouth"
    config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    # Ask the service to return word timings.
    config.request_word_level_timestamps()

    source = speechsdk.AudioConfig(filename=stream_file)
    recognizer = speechsdk.SpeechRecognizer(speech_config=config, audio_config=source)

    finished = False

    def on_recognizing(evt):
        store(f'RECOGNIZING: {evt.result.json}', result_file)

    def on_recognized(evt):
        store(f'JSON: {evt.result.json}', result_file)

    def on_session_started(evt):
        store(f'SESSION STARTED: {evt}', result_file)

    def on_session_stopped(evt):
        store(f'SESSION STOPPED {evt}', result_file)

    def on_canceled(evt):
        store(f'CANCELED {evt}', result_file)

    def on_end(evt):
        """Record the closing event and signal the wait loop below to exit."""
        nonlocal finished
        store(f'CLOSING on {evt}', result_file)
        finished = True

    recognizer.recognizing.connect(on_recognizing)
    recognizer.recognized.connect(on_recognized)
    recognizer.session_started.connect(on_session_started)
    recognizer.session_stopped.connect(on_session_stopped)
    recognizer.canceled.connect(on_canceled)
    # Either a stopped session or a cancellation ends the wait.
    recognizer.session_stopped.connect(on_end)
    recognizer.canceled.connect(on_end)

    recognizer.start_continuous_recognition()
    # Poll the completion flag every 0.05s; results arrive via the callbacks.
    while not finished:
        time.sleep(.05)

    recognizer.stop_continuous_recognition()
def speech_synthesis_viseme_event():
    """Synthesize lines typed on the console and print each viseme event.

    Reads text until end-of-file, synthesizes every line with a synthesizer
    that has no audio output configured, and reports the size of the audio
    returned in the result.
    """
    config = speechsdk.SpeechConfig(subscription=speech_key,
                                    region=service_region)

    # audio_config=None: the audio is not written to any output channel and is
    # only available through the synthesis result.
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=config,
                                              audio_config=None)

    def report_viseme(evt):
        # audio_offset is expressed in ticks (1 tick = 100 nanoseconds), so
        # dividing by 10,000 yields milliseconds.
        offset_ms = evt.audio_offset / 10000
        print(f"Viseme event received: audio offset: {offset_ms}ms, "
              f"viseme id: {evt.viseme_id}.")

    synthesizer.viseme_received.connect(report_viseme)

    while True:
        print("Enter some text that you want to synthesize, Ctrl-Z to exit")
        try:
            text = input()
        except EOFError:
            break

        outcome = synthesizer.speak_text_async(text).get()
        reason = outcome.reason

        if reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            print(f"Speech synthesized for text [{text}]")
            payload = outcome.audio_data
            print(f"{len(payload)} bytes of audio data received.")
            continue
        if reason != speechsdk.ResultReason.Canceled:
            continue

        details = outcome.cancellation_details
        print(f"Speech synthesis canceled: {details.reason}")
        if details.reason == speechsdk.CancellationReason.Error:
            print(f"Error details: {details.error_details}")
Example #4
0
    def riconosci_audio():
        """Recognize a single Italian utterance, with audio cues before and after.

        A beep is played before listening starts and a second sound once
        recognize_once() has returned. The recognizer is created without an
        explicit audio_config.

        Returns:
            The recognized text, or None when nothing was recognized or the
            recognition was canceled.
        """
        pygame.mixer.init()

        # NOTE(review): the subscription key is hard-coded in source; consider
        # loading it from configuration and rotating this key.
        speech_key, service_region = "fcf5b7ab293f41daa91b9daabb2076b3", "francecentral"
        config = speechsdk.SpeechConfig(subscription=speech_key,
                                        region=service_region,
                                        speech_recognition_language="it-IT")

        # Audio cue: listening is about to start.
        pygame.mixer.music.load("./sound/beep.mp3")
        pygame.mixer.music.play()

        recognizer = speechsdk.SpeechRecognizer(speech_config=config)
        print("Sto registrando:")

        # recognize_once() returns once the first utterance has been
        # recognized, so it only fits single-shot commands or queries.
        outcome = recognizer.recognize_once()

        # Audio cue: listening is over.
        pygame.mixer.music.load("./sound/golf.mp3")
        pygame.mixer.music.play()

        reason = outcome.reason
        if reason == speechsdk.ResultReason.RecognizedSpeech:
            print(f"Recognized: {outcome.text}")
            return outcome.text
        if reason == speechsdk.ResultReason.NoMatch:
            print(f"No speech could be recognized: {outcome.no_match_details}")
            return None
        if reason == speechsdk.ResultReason.Canceled:
            details = outcome.cancellation_details
            print(f"Speech Recognition canceled: {details.reason}")
            if details.reason == speechsdk.CancellationReason.Error:
                print(f"Error details: {details.error_details}")
        return None
def speech_recognize_continuous_from_file():
    """Recognize speech continuously from ``weatherfilename``, printing every event.

    Blocks until the recognizer reports that the session stopped or that
    recognition was canceled.
    """
    # <SpeechContinuousRecognitionWithFile>
    config = speechsdk.SpeechConfig(subscription=speech_key,
                                    region=service_region)
    file_input = speechsdk.audio.AudioConfig(filename=weatherfilename)
    recognizer = speechsdk.SpeechRecognizer(speech_config=config,
                                            audio_config=file_input)

    finished = False

    def printer(template):
        """Build an event handler that prints the event using ``template``."""
        return lambda evt: print(template.format(evt))

    def on_finish(evt):
        """Stop continuous recognition and release the wait loop."""
        nonlocal finished
        print(f'CLOSING on {evt}')
        recognizer.stop_continuous_recognition()
        finished = True

    # Trace every event fired by the recognizer.
    recognizer.recognizing.connect(printer('RECOGNIZING: {}'))
    recognizer.recognized.connect(printer('RECOGNIZED: {}'))
    recognizer.session_started.connect(printer('SESSION STARTED: {}'))
    recognizer.session_stopped.connect(printer('SESSION STOPPED {}'))
    recognizer.canceled.connect(printer('CANCELED {}'))
    # A stopped session or a cancellation both end the recognition.
    recognizer.session_stopped.connect(on_finish)
    recognizer.canceled.connect(on_finish)

    recognizer.start_continuous_recognition()
    while not finished:
        time.sleep(.5)
def record_audio(ask=False):
    """Recognize one utterance and return its text in lower case.

    Args:
        ask: Accepted for callers' convenience; not used by this function.

    Returns:
        ``result.text.lower()`` for whatever result recognize_once() produced,
        including the no-match and canceled cases.
    """
    # NOTE(review): the subscription key is hard-coded in source; consider
    # loading it from configuration and rotating this key.
    speech_key, service_region = "61b8a438ab6e4afa8d7496ab6982d4e3", "eastus"
    config = speechsdk.SpeechConfig(subscription=speech_key,
                                    region=service_region)
    recognizer = speechsdk.SpeechRecognizer(speech_config=config)

    outcome = recognizer.recognize_once()
    reason = outcome.reason

    if reason == speechsdk.ResultReason.RecognizedSpeech:
        print(f"Recognized: {outcome.text}")
    elif reason == speechsdk.ResultReason.NoMatch:
        print(f"No speech could be recognized: {outcome.no_match_details}")
    elif reason == speechsdk.ResultReason.Canceled:
        details = outcome.cancellation_details
        print(f"Speech Recognition canceled: {details.reason}")
        if details.reason == speechsdk.CancellationReason.Error:
            print(f"Error details: {details.error_details}")

    return outcome.text.lower()
Example #7
0
def msft_tts(text):
    """Synthesize ``text`` to a temporary WAV file, upload it and return its URL.

    The text is truncated before synthesis. The audio is written to a named
    temporary file, which is uploaded with ``upload_blob`` under a
    timestamp-based key and removed again when the function is done with it.

    Args:
        text: The text to synthesize.

    Returns:
        The public URL of the uploaded audio file. The upload is attempted
        even when the synthesis was canceled, as in the original code.
    """
    # NOTE(review): this slice keeps 101 characters; confirm whether a limit
    # of 100 was intended.
    text = text[0:101]

    # NOTE(review): the subscription key is hard-coded in source; consider
    # loading it from configuration and rotating this key.
    speech_key, service_region = '86a1e0bee52f41e29fee0eae40f94f3c', 'uksouth'
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    # speech_config.speech_synthesis_voice_name = 'nb-NO-HuldaRUS'

    # The context manager closes (and thereby deletes) the temporary file
    # deterministically instead of relying on garbage collection.
    with tempfile.NamedTemporaryFile(suffix='.wav') as file_object:
        # Point the synthesizer's audio output at the temporary file.
        audio_output = speechsdk.AudioOutputConfig(
            filename=file_object.name,
        )
        speech_synthesizer = speechsdk.SpeechSynthesizer(
            speech_config=speech_config, audio_config=audio_output)

        # Synthesize the text and wait for the result.
        result = speech_synthesizer.speak_text_async(text).get()

        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            print("Speech synthesized to [{}] for text [{}]".format(file_object.name, text))
        elif result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = result.cancellation_details
            print("Speech synthesis canceled: {}".format(cancellation_details.reason))
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                if cancellation_details.error_details:
                    print("Error details: {}".format(cancellation_details.error_details))
            print("Did you update the subscription info?")

        # Upload while the temporary file still exists.
        gcs_key = f'audio_files/{time.time()}.wav'
        upload_blob('arabia', file_object.name, gcs_key)

    return f'https://arabia.storage.googleapis.com/{gcs_key}'
Example #8
0
def translate(text, language):
    """Synthesize ``text`` in the given language and return the audio data.

    Args:
        text: The text to synthesize.
        language: Key into the module-level ``language_dict``; the mapped
            value is set as the synthesis language.

    Returns:
        ``result.audio_data`` of the synthesis result (MP3 output format is
        requested), regardless of whether synthesis completed or was canceled.
    """
    # NOTE(review): the subscription key is hard-coded in source; consider
    # loading it from configuration and rotating this key.
    speech_key, service_region = "51efd601e17f47d9bd3bf67bf81d0755", "westeurope"
    language = language_dict[language]

    config = speechsdk.SpeechConfig(subscription=speech_key,
                                    region=service_region)
    config.set_property(
        speechsdk.PropertyId.SpeechServiceConnection_SynthLanguage, language)
    config.set_speech_synthesis_output_format(
        speechsdk.SpeechSynthesisOutputFormat.Audio16Khz128KBitRateMonoMp3)

    # No audio_config is given: the synthesized speech is expected to be heard
    # on the default speaker when speak_text_async() runs.
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=config)
    outcome = synthesizer.speak_text_async(text).get()

    reason = outcome.reason
    if reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print(f"Speech synthesized to speaker for text [{text}]")
    elif reason == speechsdk.ResultReason.Canceled:
        details = outcome.cancellation_details
        print(f"Speech synthesis canceled: {details.reason}")
        has_error = details.reason == speechsdk.CancellationReason.Error
        if has_error and details.error_details:
            print(f"Error details: {details.error_details}")
        print("Did you update the subscription info?")

    return outcome.audio_data


#print(translate("Hello world!", 'en'))
def speech_synthesis_with_auto_language_detection_to_speaker():
    """Speak console input on the default speaker with automatic language detection.

    Note: this is a preview feature, which might be updated in future versions.
    """
    config = speechsdk.SpeechConfig(subscription=speech_key,
                                    region=service_region)

    # Auto-detection is configured without naming specific languages.
    detect_config = speechsdk.languageconfig.AutoDetectSourceLanguageConfig()

    # No audio_config: the default speaker is the audio output.
    synthesizer = speechsdk.SpeechSynthesizer(
        speech_config=config,
        auto_detect_source_language_config=detect_config)

    while True:
        # Example input: "Bonjour le monde. Hello world." - "Bonjour le monde."
        # is then spoken in a French voice and "Hello world." in an English one.
        print(
            "Enter some multi lingual text that you want to speak, Ctrl-Z to exit"
        )
        try:
            text = input()
        except EOFError:
            break

        outcome = synthesizer.speak_text_async(text).get()
        reason = outcome.reason

        if reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            print(f"Speech synthesized to speaker for text [{text}]")
            continue
        if reason != speechsdk.ResultReason.Canceled:
            continue

        details = outcome.cancellation_details
        print(f"Speech synthesis canceled: {details.reason}")
        if details.reason == speechsdk.CancellationReason.Error:
            print(f"Error details: {details.error_details}")
    def process(self):
        """Transcribe Japanese audio pushed through a stream into ``self.transcript``.

        A push audio stream (16 kHz, 16-bit samples) feeds a continuous speech
        recognizer. ``self.stream_generator(stream)`` is called while
        recognition is running - presumably it writes the audio into the push
        stream; verify against its implementation. Every recognizing and
        recognized event overwrites ``self.transcript`` with the latest text.

        Recognition is stopped and the stream is closed even when
        ``stream_generator`` raises; the exception then propagates to the
        caller.
        """
        logger.info('process:Enter')
        speech_key = model.key.AZURE_SPEECH_KEY
        service_region = model.key.AZURE_SERVICE_REGION
        speech_config = speechsdk.SpeechConfig(
            subscription=speech_key,
            region=service_region,
            speech_recognition_language="ja-JP")
        # setup the audio stream
        stream = speechsdk.audio.PushAudioInputStream(
            stream_format=speechsdk.audio.AudioStreamFormat(
                samples_per_second=16000, bits_per_sample=16))
        audio_config = speechsdk.audio.AudioConfig(stream=stream)
        # instantiate the speech recognizer with push stream input
        speech_recognizer = speechsdk.SpeechRecognizer(
            speech_config=speech_config, audio_config=audio_config)

        # Connect callbacks to the events fired by the speech recognizer
        def write_transcribed_data(evt):
            """Keep the most recent (partial or final) text on the instance."""
            logger.debug(f'write_transcribed(text={evt.result.text})')
            self.transcript = evt.result.text

        speech_recognizer.recognizing.connect(write_transcribed_data)
        speech_recognizer.recognized.connect(write_transcribed_data)
        speech_recognizer.session_started.connect(
            lambda evt: logger.debug('SESSION STARTED: {}'.format(evt)))
        speech_recognizer.session_stopped.connect(
            lambda evt: logger.debug('SESSION STOPPED {}'.format(evt)))
        speech_recognizer.canceled.connect(
            lambda evt: logger.debug('CANCELED {}'.format(evt)))

        # start continuous speech recognition
        logger.info('start transcode')
        try:
            speech_recognizer.start_continuous_recognition()
            try:
                self.stream_generator(stream)
            finally:
                # Always stop recognition once it has been started.
                speech_recognizer.stop_continuous_recognition()
        finally:
            # Always release the push stream, whatever failed above.
            stream.close()
        logger.info('end transcode')
Example #11
0
def processAudio():
    """Transcribe ``converted.wav`` continuously and return the final results.

    Credentials come from the ``API_KEY`` and ``REGION`` environment
    variables, and word-level timestamps are requested.

    Returns:
        A list with the ``result`` object of every ``recognized`` event, in
        the order the events were received.
    """
    config = speechsdk.SpeechConfig(subscription=os.getenv('API_KEY'),
                                    region=os.getenv('REGION'))
    config.request_word_level_timestamps()

    source = speechsdk.AudioConfig(filename="converted.wav")
    recognizer = speechsdk.SpeechRecognizer(speech_config=config,
                                            audio_config=source)

    collected = []
    finished = False

    def on_end(evt):
        """Release the wait loop below."""
        nonlocal finished
        finished = True

    # Every final result is appended as it arrives.
    recognizer.recognized.connect(lambda evt: collected.append(evt.result))

    # A stopped session or a cancellation both end the wait.
    recognizer.session_stopped.connect(on_end)
    recognizer.canceled.connect(on_end)

    recognizer.start_continuous_recognition()
    while not finished:
        time.sleep(.5)
    recognizer.stop_continuous_recognition()

    return collected
Example #12
0
def get_response():
    """Prompt the user, recognize one utterance and return its text.

    Credentials are read from ``config.api_key`` and ``config.service_region``.

    Returns:
        The recognized text, the string ``"nomatch"`` when no speech could be
        recognized, or ``result.text`` of the result in any other case
        (including cancellation).
    """
    recognizer = speechsdk.SpeechRecognizer(
        speech_config=speechsdk.SpeechConfig(subscription=config.api_key,
                                             region=config.service_region))

    print("Say something...")

    # recognize_once() returns after a single utterance: it ends on trailing
    # silence or after at most 15 seconds of audio, so it only suits
    # single-shot input such as a command or query. For long-running,
    # multi-utterance recognition use start_continuous_recognition().
    outcome = recognizer.recognize_once()
    reason = outcome.reason

    if reason == speechsdk.ResultReason.RecognizedSpeech:
        print(f"Recognized: {outcome.text}")
        return outcome.text

    if reason == speechsdk.ResultReason.NoMatch:
        print(f"No speech could be recognized: {outcome.no_match_details}")
        return "nomatch"

    if reason == speechsdk.ResultReason.Canceled:
        details = outcome.cancellation_details
        print(f"Speech Recognition canceled: {details.reason}")
        if details.reason == speechsdk.CancellationReason.Error:
            print(f"Error details: {details.error_details}")

    return outcome.text
def speech_recognize_continuous_from_file(filepath):
    """Transcribe the audio file at ``filepath`` with continuous recognition.

    The text of every ``recognized`` event is appended to the module-level
    ``transcription`` string, which is reset at the start of each call.
    Non-empty segments are separated by a single space and the result carries
    no leading space. The call blocks until the session stops or recognition
    is canceled.

    Args:
        filepath: Path of the audio file to transcribe.

    Returns:
        The accumulated transcription.
    """
    speech_config = speechsdk.SpeechConfig(subscription=speech_key,
                                           region=service_region)
    audio_config = speechsdk.audio.AudioConfig(filename=filepath)

    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config,
                                                   audio_config=audio_config)

    # The module-level name is kept so code reading it elsewhere still works.
    global transcription
    transcription = ''

    def addToTranscription(text):
        """Append one recognized segment, skipping empty ones."""
        global transcription
        if not text:
            return
        # Only insert the separator between segments, never in front of the
        # first one.
        transcription = transcription + ' ' + text if transcription else text

    done = False

    def stop_cb(evt):
        """Stop continuous recognition and release the wait loop."""
        nonlocal done
        speech_recognizer.stop_continuous_recognition()
        done = True

    speech_recognizer.recognized.connect(
        lambda evt: addToTranscription(evt.result.text))

    # A stopped session or a cancellation both end the recognition.
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous speech recognition
    speech_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(.5)

    return transcription
def speech_synthesis_with_voice():
    """Speak lines typed on the console on the default speaker with a fixed voice."""
    config = speechsdk.SpeechConfig(subscription=speech_key,
                                    region=service_region)

    # Synthesis voice, given by its full name. The list of supported voices
    # is at https://aka.ms/csspeech/voicenames; get_voices_async can also be
    # used to list the available voices.
    voice = "Microsoft Server Speech Text to Speech Voice (en-US, GuyNeural)"
    config.speech_synthesis_voice_name = voice

    # No audio_config: the default speaker is the audio output.
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=config)

    while True:
        print("Enter some text that you want to speak, Ctrl-Z to exit")
        try:
            text = input()
        except EOFError:
            break

        outcome = synthesizer.speak_text_async(text).get()
        reason = outcome.reason

        if reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            print(f"Speech synthesized to speaker for text [{text}] with voice [{voice}]")
            continue
        if reason != speechsdk.ResultReason.Canceled:
            continue

        details = outcome.cancellation_details
        print(f"Speech synthesis canceled: {details.reason}")
        if details.reason == speechsdk.CancellationReason.Error:
            print(f"Error details: {details.error_details}")
def speech_synthesis_to_mp3_file():
    """Synthesize lines typed on the console into the file ``outputaudio.mp3``."""
    config = speechsdk.SpeechConfig(subscription=speech_key,
                                    region=service_region)

    # Request MP3 output. Supported formats are listed at
    # https://docs.microsoft.com/azure/cognitive-services/speech-service/rest-text-to-speech#audio-outputs
    config.set_speech_synthesis_output_format(
        speechsdk.SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3)

    # Audio goes to a file; replace the name with your own if needed.
    file_name = "outputaudio.mp3"
    synthesizer = speechsdk.SpeechSynthesizer(
        speech_config=config,
        audio_config=speechsdk.audio.AudioOutputConfig(filename=file_name))

    while True:
        print("Enter some text that you want to synthesize, Ctrl-Z to exit")
        try:
            text = input()
        except EOFError:
            break

        outcome = synthesizer.speak_text_async(text).get()
        reason = outcome.reason

        if reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            print(f"Speech synthesized for text [{text}], "
                  f"and the audio was saved to [{file_name}]")
            continue
        if reason != speechsdk.ResultReason.Canceled:
            continue

        details = outcome.cancellation_details
        print(f"Speech synthesis canceled: {details.reason}")
        if details.reason == speechsdk.CancellationReason.Error:
            print(f"Error details: {details.error_details}")
def speech_recognize_continuous_from_file():
    """Transcribe the file named by the global ``audiofilename`` continuously.

    The module-level ``all_results`` is reset to an empty list and receives
    the text of every ``recognized`` event. The call blocks until the session
    stops or recognition is canceled and returns nothing.
    """
    # Placeholder credentials: replace with a real key and region.
    speech_key, service_region = "api_key", "region"
    config = speechsdk.SpeechConfig(subscription=speech_key,
                                    region=service_region)
    source = speechsdk.audio.AudioConfig(
        filename=globals()["audiofilename"])
    recognizer = speechsdk.SpeechRecognizer(speech_config=config,
                                            audio_config=source)
    finished = False

    def on_finish(evt):
        """Stop continuous recognition and release the wait loop."""
        nonlocal finished
        recognizer.stop_continuous_recognition()
        finished = True

    # Results are shared with the rest of the module through a global list.
    globals()["all_results"] = []

    def collect_text(evt):
        """Store the text of one final recognition result."""
        globals()["all_results"].append(evt.result.text)

    recognizer.recognized.connect(collect_text)
    # Only final results are handled; the recognizing, session_started and
    # event-printing handlers of the original sample are intentionally not
    # connected.
    # A stopped session or a cancellation both end the recognition.
    recognizer.session_stopped.connect(on_finish)
    recognizer.canceled.connect(on_finish)

    recognizer.start_continuous_recognition()
    while not finished:
        time.sleep(.5)
def recognize_intent_once_from_file():
    """Run one-shot intent recognition on the audio file ``lampfilename``.

    The recognizer is configured from the module-level ``intent_key`` and
    ``intent_service_region`` and is given a mix of Language Understanding
    model intents and simple phrase intents. The outcome is printed; nothing
    is returned.
    """
    # <IntentRecognitionOnceWithFile>
    # Set up the config for the intent recognizer (remember that this uses the
    # Language Understanding key, not the Speech Services key)!
    intent_config = speechsdk.SpeechConfig(subscription=intent_key, region=intent_service_region)
    audio_config = speechsdk.audio.AudioConfig(filename=lampfilename)

    # Set up the intent recognizer
    intent_recognizer = speechsdk.intent.IntentRecognizer(speech_config=intent_config, audio_config=audio_config)

    # Intents to recognize: a mix of simple phrases and intents specified
    # through a LanguageUnderstanding model.
    model = speechsdk.intent.LanguageUnderstandingModel(app_id=language_understanding_app_id)
    intents = [
        (model, "HomeAutomation.TurnOn"),
        (model, "HomeAutomation.TurnOff"),
        ("This is a test.", "test"),
        # NOTE(review): the phrase below looks mistyped ("the to"); it is left
        # unchanged because it is the pattern matched at runtime - confirm.
        ("Switch the to channel 34.", "34"),
        ("what's the weather like", "weather"),
    ]
    intent_recognizer.add_intents(intents)

    # Run the intent recognizer.
    intent_result = intent_recognizer.recognize_once()

    # Check the results
    if intent_result.reason == speechsdk.ResultReason.RecognizedIntent:
        print("Recognized: \"{}\" with intent id `{}`".format(intent_result.text, intent_result.intent_id))
    elif intent_result.reason == speechsdk.ResultReason.RecognizedSpeech:
        print("Recognized: {}".format(intent_result.text))
    elif intent_result.reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized: {}".format(intent_result.no_match_details))
    elif intent_result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = intent_result.cancellation_details
        # This is intent recognition, not translation: name the right operation.
        print("Intent recognition canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))
Example #18
0
def synthesize_translations(result):
    """Print a translation result and synthesize each translation to a WAV file.

    Args:
        result: Object exposing ``text`` (the recognized text) and
            ``translations`` (a mapping of language code to translated text).

    For every language the translation is written to
    ``<language>-translation.wav`` using the voice mapped to that language;
    languages missing from the map get ``None`` as voice name.
    """
    voices = {
        "de": "de-DE-KatjaNeural",
        "en": "en-US-AriaNeural",
        "it": "it-IT-ElsaNeural",
        "pt": "pt-BR-FranciscaNeural",
        "zh-Hans": "zh-CN-XiaoxiaoNeural"
    }
    print('Recognized: "{}"'.format(result.text))

    for language, translation in result.translations.items():
        print('Translated into "{}": {}'.format(language, translation))

        # A fresh config per language, carrying that language's voice.
        config = speechsdk.SpeechConfig(subscription=speech_key,
                                        region=service_region)
        config.speech_synthesis_voice_name = voices.get(language)

        output_path = '{}-translation.wav'.format(language)
        synthesizer = speechsdk.SpeechSynthesizer(
            speech_config=config,
            audio_config=speechsdk.audio.AudioOutputConfig(filename=output_path))
        synthesizer.speak_text_async(translation).get()
Example #19
0
def llamada():
    """Recognize one Mexican-Spanish utterance and return its text.

    Returns:
        ``result.text`` of whatever result recognize_once() produced,
        including the no-match and canceled cases.
    """
    # NOTE(review): the subscription key is hard-coded in source; consider
    # loading it from configuration and rotating this key.
    speech_key, service_region = "e21c5662cc5c4e7aa983ba12c67f6a90", "eastus"
    config = speechsdk.SpeechConfig(subscription=speech_key,
                                    region=service_region)
    recognizer = speechsdk.SpeechRecognizer(speech_config=config,
                                            language="es-MX")

    print("Se ha iniciado la grabación de la llamada...")
    outcome = recognizer.recognize_once()
    reason = outcome.reason

    if reason == speechsdk.ResultReason.RecognizedSpeech:
        print(f"Recognized: {outcome.text}")
    elif reason == speechsdk.ResultReason.NoMatch:
        print(f"No speech could be recognized: {outcome.no_match_details}")
    elif reason == speechsdk.ResultReason.Canceled:
        details = outcome.cancellation_details
        print(f"Speech Recognition canceled: {details.reason}")
        if details.reason == speechsdk.CancellationReason.Error:
            print(f"Error details: {details.error_details}")

    return outcome.text
def speech2text():
    """Recognize a single utterance and return its text.

    Returns:
        The recognized text on success, the string 'error please try again'
        when nothing was matched or recognition was canceled, and None for
        any other result reason.
    """
    retry_message = 'error please try again'

    # NOTE(review): the subscription key is hard-coded in source; consider
    # loading it from configuration instead.
    key, region = "30bf61e9604041eba9e79231abfa89af", "westus"
    config = speechsdk.SpeechConfig(subscription=key, region=region)

    # No audio_config is passed, so the SDK's default audio input is used.
    recognizer = speechsdk.SpeechRecognizer(speech_config=config)
    outcome = recognizer.recognize_once()

    reason = outcome.reason
    if reason == speechsdk.ResultReason.RecognizedSpeech:
        print(f"Recognized: {outcome.text}")
        return outcome.text

    if reason == speechsdk.ResultReason.NoMatch:
        print(f"No speech could be recognized: {outcome.no_match_details}")
        return retry_message

    if reason == speechsdk.ResultReason.Canceled:
        details = outcome.cancellation_details
        print(f"Speech Recognition canceled: {details.reason}")
        if details.reason == speechsdk.CancellationReason.Error:
            print(f"Error details: {details.error_details}")
        return retry_message

    return None
Example #21
0
def speech_recognize_continuous_from_file(filename, lang):
    """Transcribe an audio file with continuous recognition.

    Args:
        filename: path of the audio file handed to ``AudioConfig``.
        lang: value assigned to ``speech_config.speech_recognition_language``.

    Returns:
        A list holding the text of every ``recognized`` event, in the order
        the events were received.
    """
    speech_config = speechsdk.SpeechConfig(
        subscription=app.config['speech_key'],
        region=app.config['service_region'])
    speech_config.speech_recognition_language = lang
    #speech_config.request_word_level_timestamps()
    audio_config = speechsdk.audio.AudioConfig(filename=filename)

    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config,
                                                   audio_config=audio_config)

    done = False
    all_res = []

    def stop_cb(evt):
        """callback that signals the waiting loop to finish upon receiving an event `evt`"""
        # Only flip the flag here. Stopping the recognizer is done by the
        # caller's thread below, so it happens exactly once and never from
        # inside an SDK callback (session_stopped and canceled both land here).
        nonlocal done
        done = True

    def handle_final_result(evt):
        """callback that collects the final text of each recognized phrase"""
        all_res.append(evt.result.text)

    # Connect callbacks to the events fired by the speech recognizer
    speech_recognizer.recognized.connect(handle_final_result)
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous speech recognition and wait for the end of the session
    speech_recognizer.start_continuous_recognition()
    try:
        while not done:
            time.sleep(.5)
    finally:
        # Always release the recognizer, even if the wait is interrupted.
        speech_recognizer.stop_continuous_recognition()

    return all_res
Example #22
0
def speech_synthesis_to_speaker():
    """Read lines from the console and speak each one through the default speaker.

    The loop ends when ``input()`` raises EOFError. The outcome of every
    synthesis request is reported on stdout.
    """
    # Subscription key and region come from the module-level settings.
    config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    # Without an audio_config the synthesizer plays on the default speaker;
    # the default spoken language is "en-us".
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=config)

    while True:
        print("Enter some text that you want to speak, Ctrl-Z to exit")
        try:
            line = input()
        except EOFError:
            return

        outcome = synthesizer.speak_text_async(line).get()
        reason = outcome.reason
        if reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            print(f"Speech synthesized to speaker for text [{line}]")
        elif reason == speechsdk.ResultReason.Canceled:
            details = outcome.cancellation_details
            print(f"Speech synthesis canceled: {details.reason}")
            if details.reason == speechsdk.CancellationReason.Error:
                print(f"Error details: {details.error_details}")
Example #23
0
def speech_synthesis_with_voice(language, gender, text_to_speech):
    """Synthesize `text_to_speech` into a randomly named file and return its name.

    Args:
        language: "ES"/"EN" (any letter case) or a key of the voice table
            ("spanish", "english").
        gender: "male" or "female" (any letter case).
        text_to_speech: the text to synthesize.

    Returns:
        The name of the output file, "<number>.mp3" with a number in 1..4999.

    Raises:
        KeyError: when the language/gender pair has no voice in the table
            (for instance english/female).
    """
    # Expand the two-letter codes; any other value is used as given.
    language = {"ES": "spanish", "EN": "english"}.get(language.upper(), language)
    gender = gender.lower()

    voices = {
        "spanish": {
            "male":
            "Microsoft Server Speech Text to Speech Voice (es-MX, JorgeNeural)",
            "female":
            "Microsoft Server Speech Text to Speech Voice (es-MX, DaliaNeural)",
        },
        "english": {
            "male":
            "Microsoft Server Speech Text to Speech Voice (en-GB, RyanNeural)",
        },
    }

    # NOTE(review): the file name ends in ".mp3" while the requested output
    # format is "Riff24Khz16BitMonoPcm" -- confirm which one is intended.
    output_name = f"{random.choice(range(1, 5000))}.mp3"
    file_output = speechsdk.audio.AudioOutputConfig(filename=output_name)

    config = speechsdk.SpeechConfig(subscription=speech_key,
                                    region=service_region)
    config.speech_synthesis_voice_name = voices[language][gender]
    config.set_speech_synthesis_output_format(
        SpeechSynthesisOutputFormat["Riff24Khz16BitMonoPcm"])

    synthesizer = speechsdk.SpeechSynthesizer(speech_config=config,
                                              audio_config=file_output)
    synthesizer.speak_text_async(text_to_speech).get()
    return output_name
Example #24
0
    def setup(self):
        """Start continuous recognition on a push audio stream and launch the feeder thread.

        Reads ``self.speech_key``, ``self.service_region`` and ``self.language``
        to build the speech configuration, stores the push stream in
        ``self.asr_stream`` and the recognizer in ``self.speech_recognizer``,
        wires the recognizer events to ``self.result_callback`` and
        ``self.stop_callback``, then starts ``self._generator`` on a new thread.
        """
        speech_config = speechsdk.SpeechConfig(
            subscription=self.speech_key,
            region=self.service_region,
            speech_recognition_language=self.language)

        # Set up the audio stream. No explicit stream format is passed, so the
        # SDK's default push-stream format applies.
        self.asr_stream = speechsdk.audio.PushAudioInputStream()
        audio_config = speechsdk.audio.AudioConfig(stream=self.asr_stream)

        # Instantiate the speech recognizer with push stream input
        self.speech_recognizer = speechsdk.SpeechRecognizer(
            speech_config=speech_config, audio_config=audio_config)

        # Connect callbacks to the events fired by the speech recognizer
        self.speech_recognizer.recognizing.connect(
            lambda evt: self.result_callback('RECOGNIZING', evt))
        self.speech_recognizer.recognized.connect(
            lambda evt: self.result_callback('RECOGNIZED', evt))
        self.speech_recognizer.session_started.connect(
            lambda evt: print('SESSION STARTED: {}'.format(evt)))
        self.speech_recognizer.session_stopped.connect(
            lambda evt: self.stop_callback('SESSION STOPPED', evt))
        self.speech_recognizer.canceled.connect(
            lambda evt: self.stop_callback('CANCELED', evt))
        self.speech_recognizer.start_continuous_recognition()

        # Run self._generator on its own thread (presumably it pushes audio
        # into self.asr_stream -- confirm against its definition).
        feeder = threading.Thread(target=self._generator)
        feeder.start()
Example #25
0
def SpeechRecog():
    """Transcribe "recorded.wav" for 30 seconds and save the recognized phrases.

    Continuous recognition runs for a fixed 30 seconds. Every final phrase is
    collected, the collected list is printed, and each phrase is written on
    its own line to 'your_file.txt'.
    """
    # Placeholder credentials; replace with a real subscription key and region.
    speech_key, service_region = "speech_key", "service_region"

    weatherfilename = "recorded.wav"

    speech_config = speechsdk.SpeechConfig(subscription=speech_key,
                                           region=service_region)
    audio_config = speechsdk.audio.AudioConfig(filename=weatherfilename)
    all_results = []

    def handle_final_result(a):
        """Collect the text of one recognized phrase."""
        all_results.append(a)

    # Creates a recognizer with the given settings
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config,
                                                   audio_config=audio_config)

    speech_recognizer.session_started.connect(
        lambda evt: print('SESSION STARTED: {}'.format(evt)))
    speech_recognizer.session_stopped.connect(
        lambda evt: print('\nSESSION STOPPED {}'.format(evt)))
    speech_recognizer.recognized.connect(
        lambda evt: handle_final_result(evt.result.text))

    speech_recognizer.start_continuous_recognition()
    try:
        time.sleep(30)
    finally:
        # Stop even if the wait is interrupted.
        speech_recognizer.stop_continuous_recognition()

    speech_recognizer.session_started.disconnect_all()
    speech_recognizer.recognized.disconnect_all()
    speech_recognizer.session_stopped.disconnect_all()

    # Show the results once recognition has finished, so the list is populated.
    print(all_results)
    with open('your_file.txt', 'w') as f:
        f.writelines("%s\n" % item for item in all_results)
def main():
    """Run the voice-driven Pedalboard session.

    Loads the speech-service settings from the environment, publishes the
    speech configuration in the module-level ``speech_config``, reads
    'Bass.wav', then keeps executing transcribed commands until the command
    'quit session.' is heard. Any exception is caught and printed.
    """
    global speech_config

    try:
        # Configuration settings come from the .env file / environment.
        load_dotenv()
        service_key = os.getenv('COG_SERVICE_KEY')
        service_region = os.getenv('COG_SERVICE_REGION')

        # Configure the speech service.
        speech_config = speech_sdk.SpeechConfig(service_key, service_region)
        print('Ready to use Pedalboard DAW:', speech_config.region)

        # Raw audio samples and their sample rate.
        audio_file, sample_rate = sf.read('Bass.wav')

        # The board starts with no plugins. Disabled examples:
        #   Compressor(threshold_db=-25, ratio=10)
        #   Limiter()
        board = Pedalboard([], sample_rate=sample_rate)

        # Execute spoken commands until the quit phrase is heard.
        while True:
            command = transcribe_command().lower()
            if command == 'quit session.':
                break
            execute_command(command, board, audio_file, sample_rate)

    except Exception as ex:
        print(ex)
Example #27
0
    def __init__(self, key=CONFIGS["stt_key"], region=CONFIGS["service_region"]):
        """Create a push-stream recognizer and start continuous recognition.

        Args:
            key: subscription key; the default is read from CONFIGS when the
                method is defined.
            region: service region; the default is read from CONFIGS when the
                method is defined.
        """
        self.speech_config = speechsdk.SpeechConfig(subscription=key, region=region)

        # Audio is supplied through a push stream kept on the instance.
        self.stream = speechsdk.audio.PushAudioInputStream()
        stream_input = speechsdk.audio.AudioConfig(stream=self.stream)

        self._reset()

        # Recognizer reading from the push stream.
        self.speech_recognizer = speechsdk.SpeechRecognizer(
            speech_config=self.speech_config, audio_config=stream_input)
        recognizer = self.speech_recognizer

        def on_recognized(evt):
            # Remember the latest final result on the instance.
            text = evt.result.text
            print(f'RECOGNIZED: {text}')
            self.recognized = True
            self.recognized_text = text

        def log_event(template):
            # Build a callback that prints the event using `template`.
            return lambda evt: print(template.format(evt))

        # Connect callbacks to the events fired by the speech recognizer.
        recognizer.recognized.connect(on_recognized)
        recognizer.session_started.connect(log_event('SESSION STARTED: {}'))
        recognizer.session_stopped.connect(log_event('SESSION STOPPED {}'))
        recognizer.canceled.connect(log_event('CANCELED {}'))
        recognizer.start_continuous_recognition()
def speech_recognize_once_from_file_with_customized_model():
    """Run one-shot recognition on an audio file against a customized model.

    The audio comes from the module-level ``weatherfilename`` and the outcome
    is reported on stdout.
    """
    # <SpeechRecognitionUsingCustomizedModel>
    config = speechsdk.SpeechConfig(subscription=speech_key,
                                    region=service_region)

    # Endpoint ID of the customized model; replace with your own CRIS endpoint ID.
    config.endpoint_id = "YourEndpointId"

    # File-based audio input; the default language is "en-us".
    file_input = speechsdk.audio.AudioConfig(filename=weatherfilename)
    recognizer = speechsdk.SpeechRecognizer(speech_config=config,
                                            audio_config=file_input)

    # `recognize_once` blocks until a single utterance has been recognized and
    # then returns, so it only suits single-shot recognition such as a command
    # or query. Use continuous recognition for long-running audio.
    outcome = recognizer.recognize_once()

    # Report the outcome.
    reason = outcome.reason
    if reason == speechsdk.ResultReason.RecognizedSpeech:
        print(f"Recognized: {outcome.text}")
    elif reason == speechsdk.ResultReason.NoMatch:
        print(f"No speech could be recognized: {outcome.no_match_details}")
    elif reason == speechsdk.ResultReason.Canceled:
        details = outcome.cancellation_details
        print(f"Speech Recognition canceled: {details.reason}")
        if details.reason == speechsdk.CancellationReason.Error:
            print(f"Error details: {details.error_details}")
def speech_recognize_once_from_mic():
    """Run one-shot recognition on the default microphone and print the outcome."""
    # <SpeechRecognitionWithMicrophone>
    config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    # With no audio_config the recognizer listens on the microphone;
    # the default language is "en-us".
    recognizer = speechsdk.SpeechRecognizer(speech_config=config)

    # `recognize_once` blocks until a single utterance has been recognized and
    # then returns, so it only suits single-shot recognition such as a command
    # or query. Use continuous recognition for long-running audio.
    outcome = recognizer.recognize_once()

    # Report the outcome.
    reason = outcome.reason
    if reason == speechsdk.ResultReason.RecognizedSpeech:
        print(f"Recognized: {outcome.text}")
    elif reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized")
    elif reason == speechsdk.ResultReason.Canceled:
        details = outcome.cancellation_details
        print(f"Speech Recognition canceled: {details.reason}")
        if details.reason == speechsdk.CancellationReason.Error:
            print(f"Error details: {details.error_details}")
Example #30
0
import os
import sys

# Extend the module search path before the imports below; presumably `model`
# and `toolkits` live in this directory -- confirm.
sys.path.append('transcript/ghostvlad')
import model as spkModel
import toolkits
import azure.cognitiveservices.speech as speechsdk
from pydub import AudioSegment
import numpy as np
import uisrnn
import librosa
# sys.path.append('visualization')
#from viewer import PlotDiar

# Creates an instance of a speech config with specified subscription key and service region.
# Replace with your own subscription key and region.
# NOTE(review): the subscription key is hard-coded in source; consider loading
# it from configuration instead.
speech_key, service_region = "3021013d1649482f91008c7df0a0d971", "centralindia"
speech_config = speechsdk.SpeechConfig(
    subscription=speech_key, region=service_region)


def pipeline(audio):
    """Run `dia_audio` on `audio`, then pass the audio and its result to `asr`.

    Returns whatever `asr` returns.
    """
    segments = dia_audio(audio)
    transcript = asr(audio, segments)
    return transcript


"""A demo script showing how to DIARIZATION ON WAV USING UIS-RNN."""

# ===========================================
#        Parse the argument
# ===========================================