Example 1
def speech_recognize(source_audio):
    """Take source audio uri and call google-cloud-speech recognize
    method. Return transcript"""

    # instantiate client
    from google.cloud import speech
    sp_client = speech.SpeechClient()
    
    # speech RecognitionConfig
    rec_config = speech.types.RecognitionConfig(
        encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=_SAMPLE_RATE,
        profanity_filter=True,
        language_code='en-US'
        # max_alternatives=...,          # not used here; the API accepts 0-30
        # enable_word_time_offsets=True, # include per-word time offsets
    )

    # speech RecognitionAudio from argument
    rec_audio = speech.types.RecognitionAudio(uri=source_audio)

    # recognition operation
    return sp_client.recognize(rec_config, rec_audio)
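This example targets the pre-2.0 google-cloud-speech library, where the config types live under speech.types/speech.enums and recognize() takes positional arguments. As a hedged sketch (not from the original), the same request under the 2.x client, where those submodules were folded into the top-level module, looks roughly like this:

def speech_recognize_v2(source_audio, sample_rate=16000):
    # Sketch against google-cloud-speech >= 2.0: types/enums are gone and
    # recognize() takes keyword arguments.
    from google.cloud import speech
    client = speech.SpeechClient()
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate,
        profanity_filter=True,
        language_code='en-US',
    )
    audio = speech.RecognitionAudio(uri=source_audio)
    return client.recognize(config=config, audio=audio)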
Example 2
def transcribe_gcs(gcs_uri):
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    import time
    start = time.time()
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        language_code='en-US')

    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    response = operation.result(timeout=9000000)

    # Derive the output name from the object name in the URI
    # (gs://bucket/name.ext -> element 3 of the '/' split); [0:-5] strips
    # a five-character extension such as ".flac".
    filename = gcs_uri.split('/')[3][0:-5]
    with open(filename+".txt","w") as gsp:
        for result in response.results:
            gsp.write(result.alternatives[0].transcript)
    end = time.time()
    print(end-start)
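The slicing above assumes a fixed URI shape and a five-character extension. A hedged, more robust variant using only the standard library (output_name_for is a hypothetical helper, not from the original):

import os
from urllib.parse import urlparse

def output_name_for(gcs_uri):
    # gs://bucket/path/to/name.flac -> "name"
    path = urlparse(gcs_uri).path        # "/path/to/name.flac"
    base = os.path.basename(path)        # "name.flac"
    return os.path.splitext(base)[0]     # "name"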
Example 3
def main():
    language_code = 'en-IN'  # a BCP-47 language tag, e.g. 'te-IN' or 'en-US'
    credentials = service_account.Credentials.from_service_account_file(
        'googleKeys.json')
    client = speech.SpeechClient(credentials=credentials)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(config=config,
                                                        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        print("inside stream")
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)

        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        listen_print_loop(responses)
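This and the other streaming examples below assume a MicrophoneStream context manager plus RATE and CHUNK constants that the snippets never define. A simplified sketch in the spirit of Google's microphone-streaming sample, assuming PyAudio is installed:

import queue
import pyaudio

RATE = 16000
CHUNK = int(RATE / 10)  # 100 ms of audio per buffer

class MicrophoneStream:
    """Context manager yielding raw LINEAR16 microphone chunks."""
    def __init__(self, rate, chunk):
        self._rate = rate
        self._chunk = chunk
        self._buff = queue.Queue()
        self.closed = True

    def __enter__(self):
        self._interface = pyaudio.PyAudio()
        self._stream = self._interface.open(
            format=pyaudio.paInt16, channels=1, rate=self._rate,
            input=True, frames_per_buffer=self._chunk,
            stream_callback=self._fill_buffer)
        self.closed = False
        return self

    def __exit__(self, *args):
        self._stream.stop_stream()
        self._stream.close()
        self.closed = True
        self._buff.put(None)  # unblock the generator
        self._interface.terminate()

    def _fill_buffer(self, in_data, frame_count, time_info, status):
        # Called by PyAudio on its own thread for each captured buffer.
        self._buff.put(in_data)
        return None, pyaudio.paContinue

    def generator(self):
        while not self.closed:
            chunk = self._buff.get()
            if chunk is None:
                return
            yield chunk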
Example 4
def listen_mic():
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'en-US'  # a BCP-47 language tag

    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(config=config,
                                                        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)

        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        voice_command = listen_return_speech(responses)
        return voice_command
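listen_return_speech is not shown anywhere in these examples; a plausible sketch (hypothetical, assuming it returns the first transcript the API marks as final):

def listen_return_speech(responses):
    # Return the first final transcript, or None if the stream ends first.
    for response in responses:
        if not response.results:
            continue
        result = response.results[0]
        if result.is_final and result.alternatives:
            return result.alternatives[0].transcript
    return None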
Example 5
def transcribe_file(speech_file):
    """Transcribe the given audio file."""
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    client = speech.SpeechClient()

    with open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        # encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='ko-KR')

    response = client.recognize(config, audio)
    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    try:
        return response.results[0].alternatives[0].transcript
    except IndexError:
        # No speech was recognized in the audio.
        return 'Out'
Example 6
def mic():

    language_code = "ko-KR"  # a BCP-47 language tag

    client = speech.SpeechClient()
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
    )

    streaming_config = speech.StreamingRecognitionConfig(config=config,
                                                         interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (speech.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)

        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        listen_print_loop(responses)
Example 7
def main(sample_rate):
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'ko-KR'  # a BCP-47 language tag

    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate,
        language_code=language_code,
        max_alternatives=1)
    streaming_config = types.StreamingRecognitionConfig(config=config,
                                                        interim_results=True)

    with MicrophoneStream(sample_rate, int(sample_rate / 10)) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)

        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        listen_print_loop(responses)
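listen_print_loop is the other helper these streaming examples assume. A trimmed sketch of the loop from Google's streaming sample, which redraws interim results in place and prints final ones:

import sys

def listen_print_loop(responses):
    num_chars_printed = 0
    for response in responses:
        if not response.results:
            continue
        result = response.results[0]
        if not result.alternatives:
            continue
        transcript = result.alternatives[0].transcript
        # Pad so a shorter interim line fully overwrites the previous one.
        overwrite_chars = ' ' * (num_chars_printed - len(transcript))
        if not result.is_final:
            sys.stdout.write(transcript + overwrite_chars + '\r')
            sys.stdout.flush()
            num_chars_printed = len(transcript)
        else:
            print(transcript + overwrite_chars)
            num_chars_printed = 0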
Example 8
def transcribe_file_with_word_time_offsets(speech_file):
    '''
    :param speech_file: path and name of the audio file to transcribe.
    :return: a list of recognized words, each with its start and end time.
    '''
    import io
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    client = speech.SpeechClient()

    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code='ko-KR',
        enable_word_time_offsets=True)

    response = client.recognize(config, audio)

    word_and_startend_time = []

    for result in response.results:
        alternative = result.alternatives[0]

        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            word_and_startend_time += [[
                word, start_time.seconds + start_time.nanos * 1e-9,
                end_time.seconds + end_time.nanos * 1e-9
            ]]

    return word_and_startend_time
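start_time and end_time are protobuf Durations here, hence the seconds + nanos arithmetic. Under the google-cloud-speech 2.x client they surface as datetime.timedelta instead, so a hedged 2.x equivalent collapses to:

def word_offsets_v2(alternative):
    # In the 2.x client, offsets are datetime.timedelta objects.
    return [[w.word, w.start_time.total_seconds(), w.end_time.total_seconds()]
            for w in alternative.words]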
Example 9
def transcribe_streaming(stream_file):
    """Streams transcription of the given audio file."""
    import io
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    client = speech.SpeechClient()

    with io.open(stream_file, 'rb') as audio_file:
        content = audio_file.read()

    # In practice, stream should be a generator yielding chunks of audio data.
    stream = [content]
    requests = (types.StreamingRecognizeRequest(audio_content=chunk)
                for chunk in stream)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='es-ES')
    streaming_config = types.StreamingRecognitionConfig(config=config)

    # streaming_recognize returns a generator.
    responses = client.streaming_recognize(streaming_config, requests)

    for response in responses:
        # Once the transcription has settled, the first result will contain the
        # is_final result. The other results will be for subsequent portions of
        # the audio.
        for result in response.results:
            print('Finished: {}'.format(result.is_final))
            print('Stability: {}'.format(result.stability))
            alternatives = result.alternatives
            # The alternatives are ordered from most likely to least.
            for alternative in alternatives:
                print('Confidence: {}'.format(alternative.confidence))
                print(u'Transcript: {}'.format(alternative.transcript))
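The comment above notes that stream should really be a generator. A hedged drop-in sketch that yields fixed-size chunks from the file instead of one monolithic request (the 32 KiB chunk size is an illustrative choice, not from the original):

def file_chunks(path, chunk_size=32 * 1024):
    # Yield successive chunks so no single streaming request gets too large.
    with open(path, 'rb') as audio_file:
        while True:
            chunk = audio_file.read(chunk_size)
            if not chunk:
                return
            yield chunk

requests = (types.StreamingRecognizeRequest(audio_content=chunk)
            for chunk in file_chunks(stream_file))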
Example 10
def transcribe_file_without_word_time_offsets(speech_file, language):
    """Transcribe the given audio file synchronously and return only the
    transcript (word time offsets are requested but not returned)."""
    import io
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    client = speech.SpeechClient(credentials=credentials)

    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code=language,
        enable_word_time_offsets=True,
        audio_channel_count=1)
    # enable_separate_recognition_per_channel=True)

    response = client.recognize(config, audio)

    # Return only the transcript of the first result.
    for result in response.results:
        alternative = result.alternatives[0]
        return alternative.transcript
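Note the early return: only the first result's transcript comes back, even though each result covers a consecutive portion of the audio. A hedged sketch that stitches all portions together instead:

def full_transcript(response):
    # Join the consecutive portions of the audio back into one string.
    return ' '.join(result.alternatives[0].transcript
                    for result in response.results
                    if result.alternatives)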
Example 11
def run_quickstart():
    # [START speech_quickstart]
    import io
    import os

    # Imports the Google Cloud client library
    # [START migration_import]
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    # [END migration_import]

    # Instantiates a client
    # [START migration_client]
    client = speech.SpeechClient()
    # [END migration_client]

    # The name of the audio file to transcribe
    file_name = os.path.join(os.path.dirname(__file__), 'resources',
                             'audio.raw')

    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='en-US')

    # Detects speech in the audio file
    response = client.recognize(config, audio)
    alternatives = response.results[0].alternatives

    for alternative in alternatives:
        print('Transcript: {}'.format(alternative.transcript))
Example 12
def performWork(name):
    client = speech.SpeechClient()
    json_arr = []
    extract(name)
    # Strip the 3-character extension and append ".flac" (note: this keeps
    # the dot, so "clip.mp3" becomes "clip..flac"; extract() is assumed to
    # write the file under the same name).
    real_name = name[0:len(name) - 3] + ".flac"
    os.remove(name)
    with io.open(real_name, 'rb') as audio_file:

        content = audio_file.read()

        audio = types.RecognitionAudio(content=content)
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
            sample_rate_hertz=16000,
            language_code='en-US',
            enable_word_time_offsets=True)

        response = client.recognize(config, audio)
        # Each result is for a consecutive portion of the audio. Iterate through
        # them to get the transcripts for the entire audio file.
        for result in response.results:
            alternative = result.alternatives[0]
            # The first alternative is the most likely one for this portion.
            #print(u'Transcript: {}'.format(result.alternatives[0].transcript))
            for word_info in alternative.words:
                word = word_info.word
                json_ret = {}
                start_time = word_info.start_time
                end_time = word_info.end_time
                json_ret['word'] = word
                json_ret['start_time'] = start_time.seconds
                json_ret['end_time'] = end_time.seconds
                json_arr.append(json_ret)

        os.remove(real_name)

        reduce((real_name, json_arr))
Example 13
def transcribe_gcs_with_word_time_offsets(gcs_uri):
    """Transcribe the given audio file asynchronously and output the word time
    offsets."""
    import csv
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='en-US',
        enable_word_time_offsets=True,
        model='video')

    operation = client.long_running_recognize(config, audio)
    response = operation.result(timeout=200)
    result_time_offsets = []
    for result in response.results:
        alternative = result.alternatives[0]
        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            start = start_time.seconds + start_time.nanos * 1e-9
            end_time = word_info.end_time
            end = end_time.seconds + end_time.nanos * 1e-9
            result_time_offsets.append((word, start, end))

    # Write the file once, after all results are collected (the original
    # reopened and rewrote it on every result).
    with open("word-time-offsets", "w") as the_file:
        csv.register_dialect("custom",
                             delimiter=" ",
                             skipinitialspace=True)
        writer = csv.writer(the_file, dialect="custom")
        for tup in result_time_offsets:
            writer.writerow(tup)
Example 14
def transcribe_gcs(gcs_uri):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    import time
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code='en-US',
        enable_word_time_offsets=True)

    operation = client.long_running_recognize(config, audio)

    # Poll manually (about 200 s at most) rather than blocking on
    # operation.result(timeout=...).
    retry_count = 100
    while retry_count > 0 and not operation.done():
        retry_count -= 1
        time.sleep(2)

    if not operation.done():
        print('Operation not complete and retry limit reached.')
        return

    alternatives = operation.result().results[0].alternatives
    for alternative in alternatives:
        print('Transcript: {}'.format(alternative.transcript))
        print('Confidence: {}'.format(alternative.confidence))

        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            print('Word: {}, start_time: {}, end_time: {}'.format(
                word, start_time.seconds + start_time.nanos * 1e-9,
                end_time.seconds + end_time.nanos * 1e-9))
Example 15
def transcribe_file(speech_file):
    """Transcribe the given audio file asynchronously."""
    import io
    from google.cloud import speech

    client = speech.SpeechClient()

    # [START speech_python_migration_async_request]
    with io.open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    """
     Note that transcription is limited to a 60 seconds audio file.
     Use a GCS file for audio longer than 1 minute.
    """
    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=44100,
        language_code="en-GB",
        audio_channel_count=2,
    )

    # [START speech_python_migration_async_response]

    operation = client.long_running_recognize(config=config, audio=audio)
    # [END speech_python_migration_async_request]

    print("Waiting for operation to complete...")
    response = operation.result(timeout=90)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(u"Transcript: {}".format(result.alternatives[0].transcript))
        print("Confidence: {}".format(result.alternatives[0].confidence))
Example 16
    def stream_speach(self):
        try:
            if get_env_value('DEVICE') == 'PI':
                from lib.PiControls import PiControls
                pi = PiControls()
                pi.flash_blue()
            else:
                print("sorry! can't blink blue you don't have pi")

            print('live speech recognition started')
            print(threading.enumerate())
            # See http://g.co/cloud/speech/docs/languages
            # for a list of supported languages.
            language_code = 'en-US'  # a BCP-47 language tag
            credentials = service_account.Credentials.from_service_account_file(
                'google-cloud.json')
            client = speech.SpeechClient(credentials=credentials)
            config = types.RecognitionConfig(
                encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
                sample_rate_hertz=RATE,
                language_code=language_code)
            streaming_config = types.StreamingRecognitionConfig(
                config=config,
                interim_results=True)

            with MicrophoneStream(RATE, CHUNK) as stream:
                audio_generator = stream.generator()
                requests = (types.StreamingRecognizeRequest(audio_content=content)
                            for content in audio_generator)

                responses = client.streaming_recognize(streaming_config, requests)

                # Now, put the transcription responses to use.
                self.listen_print_loop(responses)
        except Exception:
            # Restart the stream on failure; note this retries via recursion,
            # so a persistent error will recurse without bound.
            print('exception occurred')
            self.stream_speach()
Example 17
def transcribe_file_with_word_time_offsets(speech_file):
    """Transcribe the given audio file synchronously and output the word time
    offsets."""
    import io
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    client = speech.SpeechClient()

    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code='en-US',
        enable_word_time_offsets=True)

    response = client.recognize(config, audio)

    word_with_ts = []
    for result in response.results:
        alternative = result.alternatives[0]
        print('Transcript: {}'.format(alternative.transcript))

        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            word_with_ts.append(
                (word, start_time.seconds + start_time.nanos * 1e-9,
                 end_time.seconds + end_time.nanos * 1e-9))
    return word_with_ts
Example 18
def transcribe_gcs_return(gcs_uri):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        # encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,  # default: 16000
        language_code='en-US',  # English: en-US, Korean: ko-KR
        model='phone_call')

    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    response = operation.result(timeout=90)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    tuple_result_msg = ()
    result_msg = ''
    confidence = 0
    print('len : ', len(response.results))
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print('Transcript: {}'.format(result.alternatives[0].transcript))
        print('Confidence: {}'.format(result.alternatives[0].confidence))

        result_msg += result.alternatives[0].transcript + ' '
        confidence += result.alternatives[0].confidence

    # Return the result so it can be stored in BigQuery
    # (guard against an empty response to avoid dividing by zero).
    if response.results:
        tuple_result_msg = (result_msg, confidence / len(response.results))
    return tuple_result_msg
Example 19
def wav_conversion(filename, tgt_path):
    print(filename)

    client = speech.SpeechClient()
    corrupted_aud = "corrupted.txt"
    corrupted_log = open(corrupted_aud, "w+")

    # write the recognized text
    fh = open(tgt_path, "w+")

    try:
        # Loads the audio into memory
        with io.open(filename, 'rb') as audio_file:
            content = audio_file.read()
            audio = types.RecognitionAudio(content=content)

        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code='en-US',
            model='video')

        response = client.recognize(config, audio)
        for result in response.results:
            print(result)
            rec = result.alternatives[0].transcript
            fh.write(rec + ". ")

        # with sr.AudioFile(filename) as source:
        #         audio = r.record(source)  # read the entire audio file
        #         rec=r.recognize_google(audio)
        #         fh.write(rec+". ")

    # Catch any errors (originally sr.UnknownValueError from the
    # commented-out speech_recognition fallback above).
    except Exception:
        corrupted_log.write(filename + '\n')
        print("Could not understand audio")
    finally:
        fh.close()
        corrupted_log.close()
Example 20
def main():
    procs = []
    # Queue used to check whether a speaker is on screen and there is a
    # string to display
    q = Queue()

    # Queue that decides whether to draw a small or a large speech bubble
    bubble_q = Queue()
    # Start the frame-rendering program as a separate process
    proc = Process(target=visualize_frame, args=(
        q,
        bubble_q,
    ))
    procs.append(proc)
    proc.start()

    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'ko-KR'  # a BCP-47 language tag

    # Call the GCP Speech-to-Text API on microphone speech and store the
    # string for the speech bubble
    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(config=config,
                                                        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)

        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        listen_print_loop(responses, q, bubble_q)
Example 21
    def google_transcribe(audio_file_name):

        file_name = audio_file_name
        mp3_to_wav(file_name)

        # The name of the audio file to transcribe

        frame_rate, channels = frame_rate_channel(file_name)

        if channels > 1:
            stereo_to_mono(file_name)

        bucket_name = bucketname
        source_file_name = audio_file_name
        destination_blob_name = audio_file_name

        upload_blob(bucket_name, source_file_name, destination_blob_name)

        gcs_uri = 'gs://' + bucketname + '/' + audio_file_name
        transcript = ''

        client = speech.SpeechClient()
        audio = types.RecognitionAudio(uri=gcs_uri)

        config = types.RecognitionConfig(encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
                                         sample_rate_hertz=frame_rate, language_code='en-US',
                                         enable_automatic_punctuation=True)

        # Detects speech in the audio file
        operation = client.long_running_recognize(config, audio)
        response = operation.result(timeout=10000)

        for result in response.results:
            transcript += result.alternatives[0].transcript

        delete_blob(bucket_name, destination_blob_name)
        return transcript
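upload_blob, delete_blob, and the audio helpers (mp3_to_wav, frame_rate_channel, stereo_to_mono) are assumed here. Hedged sketches of the two GCS helpers, using the google-cloud-storage client:

from google.cloud import storage

def upload_blob(bucket_name, source_file_name, destination_blob_name):
    # Upload a local file to gs://bucket_name/destination_blob_name.
    bucket = storage.Client().bucket(bucket_name)
    bucket.blob(destination_blob_name).upload_from_filename(source_file_name)

def delete_blob(bucket_name, blob_name):
    # Remove the uploaded object once transcription is done.
    storage.Client().bucket(bucket_name).blob(blob_name).delete()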
Example 22
def get_text_from_audio():
    # os.system('export ...') would only set the variable in a throwaway
    # subshell; set it in this process's environment instead.
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = (
        'analyze_tone_from_voice/Google_cloud_key/My_First_Project_926af8a5744c.json')
    os.system(
        'ffmpeg -i analyze_tone_from_voice/input.m4a -acodec libmp3lame -ab 128k analyze_tone_from_voice/input.mp3'
    )
    os.system(
        'sox analyze_tone_from_voice/input.mp3 --rate 16k --bits 16 --channels 1 analyze_tone_from_voice/input.flac'
    )

    # Instantiates a client
    client = speech.SpeechClient()

    # The name of the audio file to transcribe
    file_name = 'analyze_tone_from_voice/input.flac'

    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code='en-US')

    # Detects speech in the audio file
    response = client.recognize(config, audio)

    # Only the first result's transcript is used.
    text = response.results[0].alternatives[0].transcript
    # remove intermediate audio files
    os.remove('analyze_tone_from_voice/input.mp3')
    os.remove('analyze_tone_from_voice/input.flac')
    return text
Example 23
def get_and_save_raw(input_path: str, bucket_name: str, out_path: str,
                     verbosity: int) -> None:
    """
    Gets raw JSON from the Google Cloud Speech-to-Text API.

    :param input_path:  Path to read files from
    :param bucket_name: Name of the GCS bucket
    :param out_path:    Directory to write the JSON output to
    :param verbosity:   Verbosity level

    :return: None
    """
    bin_print(verbosity, 1, "Reading files from", input_path)

    bin_print(verbosity, 2, "Trying to find all .flac files...")
    flac_files = [
        f for f in listdir(input_path)
        if isfile(join(input_path, f)) and f.endswith(".flac")
    ]
    bin_print(verbosity, 3, "Found flac files:", flac_files)
    bin_print(verbosity, 3, "Total flac files:", len(flac_files))

    client = speech.SpeechClient()

    bin_print(verbosity, 1, "Running Google STT...")
    for flac_file in flac_files:
        if "stadt_zuerich" in flac_file:
            bin_print(verbosity, 2, "Processing " + flac_file)
            try:
                json = get_raw("gs://" + bucket_name + "/" + flac_file, client)
                json_path = out_path + "/" + flac_file.replace(
                    ".flac", "_google_output") + ".json"
                bin_print(verbosity, 2, "Writing " + json_path)
                with open(json_path, "w") as f:
                    f.write(json)
            except _OperationNotComplete:
                bin_print(verbosity, 1, "Timeout for " + flac_file)
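get_raw and _OperationNotComplete are project helpers not shown here. A hedged sketch of what get_raw plausibly does, serializing the raw response with protobuf's JSON formatter (the language code and timeout are guesses; the file names suggest Swiss German audio):

from google.protobuf.json_format import MessageToJson

def get_raw(gcs_uri, client):
    # Run a long-running recognize job and return the raw response as JSON.
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        language_code='de-CH',  # assumption, guessed from the file names
        enable_word_time_offsets=True)
    operation = client.long_running_recognize(config, audio)
    return MessageToJson(operation.result(timeout=600))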
Example 24
def legenda_gcs(gcs_uri):
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=44100,
        language_code='pt-BR')

    operation = client.long_running_recognize(config, audio)

    print('Waiting for the operation to complete...')
    response = operation.result(timeout=3000)
    file = open(r'c:\legenda2.txt', 'w')

    for result in response.results:
        # print(u'Caption: {}'.format(result.alternatives[0].transcript))
        # print('Confidence: {}'.format(result.alternatives[0].confidence))
        file.write('{}\n'.format(result.alternatives[0].transcript))

    file.close()


# ffmpeg.exe -i "flac1.flac" -ac 1 mono1.flac

# gsutil acl ch -u AllUsers:R "gs://legenda/mono02.flac"  (make the file public in Storage)
# set GOOGLE_APPLICATION_CREDENTIALS=c:\MyProject.json
# export GOOGLE_APPLICATION_CREDENTIALS="/home/joao_sinieghi/chave.json"

## Python development environment ##
# cd your-project
# virtualenv --python python3 env
# .\env\Scripts\activate      (Windows)
# source env/bin/activate     (Unix)
Example 25
def main():
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    # The language code can be changed here.
    language_code = 'ko-KR'  # a BCP-47 language tag

    # Feed the language dataset into Snips NLU
    with io.open("./lights_dataset_train_ko.json", encoding="utf8") as f:
        sample_dataset = json.load(f)

    nlu_engine = SnipsNLUEngine(config=CONFIG_KO)
    nlu_engine = nlu_engine.fit(sample_dataset)

    print("success")

    # preprocessing done

    # start speech recognition

    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(config=config,
                                                        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)

        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        # This loop runs until interrupted.
        listen_print_loop(responses, nlu_engine)
Example 26
def transcribe_file(fileName):
    """Convert given audio file to single channel."""
    monoFileName = uploader._safe_filename('mono.wav')
    sound = AudioSegment.from_file('./' + fileName)
    sound = sound.set_channels(1)
    sound = sound.set_sample_width(2)
    duration_in_milliseconds = len(sound)
    sound.export(monoFileName, format='wav')
    """Transcribe the given audio file."""
    client = speech.SpeechClient()

    # [START migration_sync_request]
    # [START migration_audio_config_file]
    with io.open(monoFileName, 'rb') as audio_file:
        content = audio_file.read()
        gcs_uri = uploader.upload_file(content, monoFileName, 'audio/wav')
        plotGraph(monoFileName, gcs_uri)

    audio = types.cloud_speech_pb2.RecognitionAudio(content=content)
    config = types.cloud_speech_pb2.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code='en-IN')
    # [END migration_audio_config_file]

    # [START migration_sync_response]
    response = client.recognize(config, audio)
    # [END migration_sync_request]
    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    text = ''
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        if result.alternatives:
            text += u'{}'.format(result.alternatives[0].transcript)
            text += ' '
    # [END migration_sync_response]
    return [duration_in_milliseconds, text, gcs_uri]
Example 27
def transcribe_gcs_with_word_time_offsets(gcs_uri):
    """Transcribe the given audio file asynchronously and output the word time
    offsets."""
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code='en-US',
        enable_word_time_offsets=True)

    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    response = operation.result(timeout=90)

    for result in response.results:
        alternative = result.alternatives[0]
        print('Transcript: {}'.format(alternative.transcript))
        print('Confidence: {}'.format(alternative.confidence))

        for word_info in alternative.words:
            word = word_info.word
            print(word)
            start_time = word_info.start_time
            end_time = word_info.end_time
            # bad_words_list_final1 and aList are assumed module-level globals
            if word in bad_words_list_final1:
                aList.append(word)
                aList.append(start_time)
                aList.append(end_time)
            print('Word: {}, start_time: {}, end_time: {}'.format(
                word, start_time.seconds + start_time.nanos * 1e-9,
                end_time.seconds + end_time.nanos * 1e-9))
Example 28
def transcribe_file(speech_file):
    """Transcribe the given audio file asynchronously."""
    import io
    import webbrowser
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    client = speech.SpeechClient()

    # [START migration_async_request]
    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code='en-US')

    # [START migration_async_response]
    operation = client.long_running_recognize(config, audio)
    # [END migration_async_request]

    print('Waiting for operation to complete...')
    response = operation.result(timeout=90)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    f = open("speech_file.html", "w+")
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        #print(u'Transcript: {}'.format(result.alternatives[0].transcript))
        #print('Confidence: {}'.format(result.alternatives[0].confidence))
        f.write(result.alternatives[0].transcript)
    # [END migration_async_response]
    f.close()
    webbrowser.open(
        'file:///media/spritle/ACER%20DATA/Health%20NLP/audiototext/speech_file.html'
    )
Example 29
def sub_main(profanityFilterBool):
    """
    *** Code taken from Google Cloud Speech to text documentation ***
    Turns on the profanity filter so bad words are censored and not printed
    """
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'en-US'  # a BCP-47 language tag
    sp_c_cico = {"phrases": ["cico"], "boost": 20}  # speech context for "cico"
    # sp_c_kiko = {"phrases": ["Kiko"], "boost": 0}  # speech context for "Kiko"
    speech_contexts = [sp_c_cico]
    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
        enable_automatic_punctuation=True,
        profanity_filter=profanityFilterBool,
        speech_contexts=speech_contexts)

    streaming_config = types.StreamingRecognitionConfig(
        config=config,
        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:

        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)

        responses = client.streaming_recognize(streaming_config, requests)
        # Now, put the transcription responses to use.
        solution = returnResponseString(responses) #solution is the result

        append_to_file("log.txt",str(solution))

    return solution
Example 30
def main():
    client = speech.SpeechClient()
    config = speech.types.RecognitionConfig(
        encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=SAMPLE_RATE,
        language_code='en-US',
        max_alternatives=1,
        enable_word_time_offsets=True)
    streaming_config = speech.types.StreamingRecognitionConfig(
        config=config, interim_results=True)

    mic_manager = ResumableMicrophoneStream(SAMPLE_RATE, CHUNK_SIZE)

    print('Say "Quit" or "Exit" to terminate the program.')

    with mic_manager as stream:
        while not stream.closed:
            audio_generator = stream.generator()
            requests = (speech.types.StreamingRecognizeRequest(
                audio_content=content) for content in audio_generator)

            responses = client.streaming_recognize(streaming_config, requests)
            # Now, put the transcription responses to use.
            listen_print_loop(responses, stream)