def main():
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'en-US'  # a BCP-47 language tag

    # If enabled, each word in the first alternative of each result will be
    # tagged with a speaker tag to identify the speaker.
    enable_speaker_diarization = True

    # Optional. Specifies the estimated number of speakers in the conversation.
    # diarization_speaker_count = 2

    client = speech_v1p1beta1.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
        enable_speaker_diarization=enable_speaker_diarization)

    streaming_config = types.StreamingRecognitionConfig(config=config,
                                                        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)

        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        listen_print_loop(responses)
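
The snippet above relies on RATE, CHUNK, MicrophoneStream and listen_print_loop, which come from Google's streaming-microphone sample and are not shown here (listen_print_loop simply iterates the responses and prints the transcripts). A minimal sketch of a compatible MicrophoneStream, assuming PyAudio is installed:

import queue

import pyaudio

# Assumed audio parameters matching the sample above.
RATE = 16000
CHUNK = int(RATE / 10)  # 100 ms


class MicrophoneStream:
    """Opens a recording stream and yields raw audio chunks from a generator."""

    def __init__(self, rate, chunk):
        self._rate = rate
        self._chunk = chunk
        self._buff = queue.Queue()
        self.closed = True

    def __enter__(self):
        self._audio_interface = pyaudio.PyAudio()
        self._audio_stream = self._audio_interface.open(
            format=pyaudio.paInt16, channels=1, rate=self._rate,
            input=True, frames_per_buffer=self._chunk,
            stream_callback=self._fill_buffer)
        self.closed = False
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._audio_stream.stop_stream()
        self._audio_stream.close()
        self.closed = True
        self._buff.put(None)  # unblock the generator
        self._audio_interface.terminate()

    def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
        # PyAudio calls this from its own thread for every captured chunk.
        self._buff.put(in_data)
        return None, pyaudio.paContinue

    def generator(self):
        while not self.closed:
            chunk = self._buff.get()
            if chunk is None:
                return
            yield chunk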
Example 2
def google_transcribe_file(fp, bucket_name='prof-resp-trans'):
    storage_client = google.cloud.storage.Client()
    bucket = storage_client.get_bucket(bucket_name)

    client = google.cloud.speech_v1p1beta1.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
        language_code='en-US',
        enable_word_time_offsets=True,
        enable_automatic_punctuation=True)

    blob = bucket.blob(fp.name)

    if not blob.exists():
        new_print('Uploading File: {}'.format(fp.name))

        blob.upload_from_filename(str(fp.resolve()))

        new_print('Finished Uploading: {}'.format(fp.name))
    else:
        new_print('File already uploaded: {}'.format(fp.name))

    new_print('Starting transcription...')

    audio = types.RecognitionAudio(
        uri='gs://{}/{}'.format(bucket_name, fp.name))

    response = client.long_running_recognize(config, audio)
    results = response.result()

    new_print('Transcription finished')

    return results
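
new_print is used above but never defined in this snippet; it is presumably just a thin wrapper around print, for example a timestamped one (an assumption):

import datetime

def new_print(*args, **kwargs):
    # Hypothetical stand-in: prefix every message with a wall-clock timestamp.
    print(datetime.datetime.now().strftime('[%H:%M:%S]'), *args, **kwargs)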
Example 3
def read(file_name, result_file, time=50):
    # Instantiates a client
    client = speech.SpeechClient()

    # The name of the audio file to transcribe
    file_name = os.path.join(
        os.path.dirname(__file__),
        file_name)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        audio_channel_count=2,
        sample_rate_hertz=44100,
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
        language_code='ko-KR')

    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        with io.open(result_file, 'w') as f:
            FILE_BYTE = 44100 * 2
            read_byte = FILE_BYTE * time

            while True:
                content = audio_file.read(read_byte)
                if not content:
                    break
                audio = types.RecognitionAudio(content=content)
                response = client.recognize(config, audio)

                for result in response.results:
                    f.write(result.alternatives[0].transcript)
                    print(result.alternatives[0])
                    print('Transcript: {}'.format(result.alternatives[0].transcript))
Example 4
    def configureAPI(self):

        if self.title.find('.flac') != -1:
            with open(os.path.join(self.path, self.title), 'rb') as audio_file:
                content = audio_file.read()
                self.audio = types.RecognitionAudio(content=content)

        else:
            self.audio = types.RecognitionAudio(uri="gs://twiml-mp3/" +
                                                self.title + ".flac")

        self.config = types.RecognitionConfig(
            encoding=self.encoding,
            sample_rate_hertz=self.sample_rate,
            language_code=self.language_code,
            enable_automatic_punctuation=self.punctuation,
            enable_speaker_diarization=self.diarization,
            diarization_speaker_count=self.num_speakers,
            audio_channel_count=1,
            use_enhanced=self.enhanced,
            model=self.model,
            enable_word_time_offsets=self.time_offsets,
            enable_word_confidence=self.word_confidence,
            max_alternatives=self.max_alternatives,
            metadata=self.metadata,
            speech_contexts=[types.SpeechContext(phrases=self.phrases)])
Example 5
def transcribe_gcs(gcs_uri, hint_phrases, set_config):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    client = speech_v1p1beta1.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)

    # hint_phrase = []
    # set_config['enable_speaker_diarization'] = 'False'

    print(set_config.get('enable_automatic_punctuation'))
    # Set default values, check dict having each key and cast from str to each type.
    config = types.RecognitionConfig(
        encoding=eval(set_config.get('encoding', 'enums.RecognitionConfig.AudioEncoding.FLAC')),
        sample_rate_hertz=int(set_config.get('sample_rate_hertz', 16000)),
        language_code=set_config.get('language_code', 'en-US'),
        enable_automatic_punctuation=eval(set_config.get('enable_automatic_punctuation', 'True')),
        enable_speaker_diarization=eval(set_config.get('enable_speaker_diarization', 'False')),
        diarization_speaker_count=int(set_config.get('diarization_speaker_count', 1)),
        speech_contexts=[speech_v1p1beta1.types.SpeechContext(phrases=hint_phrases)])

    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    response = operation.result(timeout=900)
    return response
Example 6
    def transcribe_audio(self, audio_blob):
        phrases = [c.command_variant for c in self.commands]
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            language_code='en-US',
            audio_channel_count=audio_blob.pop('n_channels'),
            enable_word_time_offsets=True,
            model='video',
            speech_contexts=[
                dict(phrases=phrases, boost=self.commandword_bias)
            ])

        try:
            operation = self.client.long_running_recognize(config, audio_blob)
        except ResourceExhausted:
            err_msg = f"The project has run out of it's quota for today. Try again tomorrow or set up your own Google Cloud project, see '{meta_utils.install_url()}'"
            print(err_msg)
            sys.exit(1)

        print(u"Analyzing speech...")
        response = operation.result()

        words = []
        for result in response.results:
            for word in result.alternatives[0].words:
                words.append(word)
        return words
Example 7
def voice_recognize(storage_uri):
    """
    Performs synchronous speech recognition on an audio file

    Args:
      storage_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
    """

    client = speech_v1p1beta1.SpeechClient()

    # storage_uri = 'gs://cloud-samples-data/speech/brooklyn_bridge.mp3'

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.MP3,
        sample_rate_hertz=44100,
        language_code='en-US',
        # Enable automatic punctuation
        enable_automatic_punctuation=True)

    audio = {"uri": storage_uri}

    response = client.recognize(config, audio)

    result = [r.alternatives[0].transcript for r in response.results]

    return ' '.join(result)
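
A quick usage check, reusing the sample URI from the comment above:

if __name__ == '__main__':
    print(voice_recognize('gs://cloud-samples-data/speech/brooklyn_bridge.mp3'))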
Example 8
def microphone_streaming_start(wf, output_stream):
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'ko-KR'

    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True)
    #     enable_speaker_diarization=True,
    #     diarization_speaker_count=3)

    streaming_config = types.StreamingRecognitionConfig(config=config,
                                                        interim_results=True)

    with MicrophoneStream(RATE, CHUNK, wf, output_stream) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)

        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        listen_print_loop(responses)
Example 9
def transcribe_file_with_diarization():
    audio = types.RecognitionAudio(uri=args['input'])
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        # encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=22050,
        language_code='en-US',
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
        model='phone_call')

    operation = client.long_running_recognize(config, audio)
    print("Waiting on response from google cloud...")
    response = operation.result(timeout=720)  ## 360 call 01
    for result in response.results:
        print("\n\n::BEGIN TRANSCRIPT::\n")
        print("{}".format(result.alternatives[0].transcript))
        print("\n::END TRANSCRIPT::\n\n")

        print("\t\tCONFIDENCE: {} \n\n".format(
            result.alternatives[0].confidence))
        print("::BEGIN SPEAKER DIARIZATION::\n")
        words_info = result.alternatives[0].words
        for word_info in words_info:
            print("{}: '{}'".format(word_info.speaker_tag, word_info.word))
        print("\n::END SPEAKER DIARIZATION")
Example 10
def convertWAVToTranscript(fileFullPathname, split_length_inSeconds):
    pool = Pool(8) # Number of concurrent threads

    #spk = new SpeechContext { phrases : [ "લગભગ", "માત્ર","શ્રી ત્રંબકભાઈ","સોભાગભાઈ","દેહ વિલય","જ્ઞાની પુરુષ","દશા","રુવાડા","ઐશ્વર્ય","એ","જ્ઞાન","વિકલ્પ","ત્યારે","હમણાં","મુમુક્ષુ","દશા","માર્ગ","અદ્દભુત","નિશ્ચય","સ્મૃતિ" ] };
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        # sample_rate_hertz=512000,
        language_code='gu-IN',
        speech_contexts=[types.SpeechContext(phrases=[
            'લગભગ', 'માત્ર', 'શ્રી ત્રંબકભાઈ', 'સોભાગભાઈ', 'દેહ વિલય', 'જ્ઞાની પુરુષ', 'દશા',
            'રુવાડા', 'ઐશ્વર્ય', 'એ', 'જ્ઞાન', 'વિકલ્પ', 'ત્યારે', 'હમણાં'])]
        # Additional candidate phrases left commented out in the original:
        # 'મુમુક્ષુ', 'દશા', 'માર્ગ', 'અદ્દભુત', 'નિશ્ચય', 'સ્મૃતિ'
    )
    files = [f for f in listdir(fileFullPathname) if isfile(join(fileFullPathname, f))]   
    #file_Direcotry = os.path.dirname(fileFullPathname)
    
    def transcribe(data):
        idx, file = data
        num, _ = file.split('.')
        text_script = ""
        name = file
        #print(file + " - started")
        # Loads the audio into memory
        with io.open(fileFullPathname+'\\'+file, 'rb') as audio_file:
            content = audio_file.read()
            audio = types.RecognitionAudio(content=content)
        # Transcribe audio file
        # Detects speech in the audio file
        client = speech.SpeechClient()
        response = client.recognize(config, audio)
        for result in response.results:
            text_script += result.alternatives[0].transcript
            
        #print(name + " - done")
        return {
            "idx": num,
            "text": text_script
        }

    all_text = pool.map(transcribe, enumerate(files))
    pool.close()
    pool.join()

    transcript = ""
    total_seconds = 0
    for t in sorted_nicely(all_text): #sorted(all_text, key=lambda x: sorted_nicely(x['idx'])):
        #total_seconds += split_length_inSeconds
        print("Duration of file {} is {}".format(fileFullPathname+t['idx']+'.wav', math.ceil(get_duration(fileFullPathname+'\\'+t['idx']+'.wav'))))
        total_seconds += math.ceil(get_duration(fileFullPathname+t['idx']+'.wav'))
        # Cool shortcut from:
        # https://stackoverflow.com/questions/775049/python-time-seconds-to-hms
        # to get hours, minutes and seconds
        m, s = divmod(total_seconds, 60)
        h, m = divmod(m, 60)

        # Format time as h:m:s - 30 seconds of text
        transcript = transcript + "{:0>2d}:{:0>2d}:{:0>2d} {}\n".format(h, m, s, t['text'])

    #print(transcript)

    with open("transcript.txt", "w", encoding='utf-8') as f:
        f.write(transcript)
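
sorted_nicely and get_duration are referenced above but not defined; one plausible reconstruction, assuming the chunks are WAV files and the result dicts should be ordered by their numeric 'idx' key:

import re
import wave

def sorted_nicely(chunks):
    # Order the per-chunk results by their numeric 'idx' so text comes back in recording order.
    return sorted(chunks, key=lambda c: int(re.sub(r'\D', '', c['idx']) or 0))

def get_duration(wav_path):
    # Length of a WAV file in seconds, read from its header.
    with wave.open(wav_path, 'rb') as wf:
        return wf.getnframes() / float(wf.getframerate())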
Example 11
def transcribe_gcs(gcs_uri, hertz, channel):
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,  # FLAC input
        sample_rate_hertz=int(hertz),  # must match the sample rate of the audio file
        audio_channel_count=int(channel),
        language_code='ja-JP',  # Japanese audio
        enable_speaker_diarization=True,  # separate the different speakers
        enable_automatic_punctuation=True,  # punctuation
        speech_contexts=SELECTED_PHRASES  # speech adaptation boost
    )
    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    operationResult = operation.result()

    filename = gcs_uri.rsplit('/', 1)[1].split('.')[0] + ".txt"
    outputfilepath = os.path.join(OUTPUT_FOLDER, filename)
    fout = codecs.open(outputfilepath, 'a', 'utf-8')
    for result in operationResult.results:
        for alternative in result.alternatives:
            fout.write(u'{}\n'.format(alternative.transcript))
    fout.close()
Example 12
def diarized_transcribe(gcred, gcs_uri, speakercount):
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = gcred

    client = speech_v1p1beta1.SpeechClient()
    audio = beta_types.RecognitionAudio(uri=gcs_uri)

    config = beta_types.RecognitionConfig(
        encoding=beta_enums.RecognitionConfig.AudioEncoding.FLAC,
        language_code='en-US',
        enable_speaker_diarization=True,
        diarization_speaker_count=speakercount,
        enable_word_time_offsets=True,
        model='video',
        enable_automatic_punctuation=True)

    operation = client.long_running_recognize(config, audio)

    response = operation.result(timeout=3600)

    transcript = MessageToDict(response)

    transcript = transcript.get('results')
    transcript = transcript.pop()
    transcript = transcript.get('alternatives')
    transcript = transcript.pop()
    transcript = transcript.get('words')

    return transcript
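
The list returned above holds word dicts in MessageToDict's lowerCamelCase form ('word', 'speakerTag', 'startTime', ...). One possible way to fold it into per-speaker lines, not part of the original:

def words_to_lines(words):
    # Group consecutive words that share a speakerTag into one line per speaker turn.
    lines, current_tag, current_words = [], None, []
    for w in words:
        tag = w.get('speakerTag', 0)
        if current_words and tag != current_tag:
            lines.append('speaker {}: {}'.format(current_tag, ' '.join(current_words)))
            current_words = []
        current_tag = tag
        current_words.append(w['word'])
    if current_words:
        lines.append('speaker {}: {}'.format(current_tag, ' '.join(current_words)))
    return lines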
Example 13
  def run(self):
    """Called from [start]. Connects to service and begins streaming."""

    # Exit if stop event occurred.
    if self._stop_event.is_set():
      return

    # Create SSL channel.
    channel = self._create_channel()
    self.is_started = True

    # Open stream
    service = cloud_speech.SpeechClient(channel)
    streaming_config = types.StreamingRecognitionConfig(
        config=types.RecognitionConfig(
            enable_automatic_punctuation=self.punctuation,
            encoding=self.encoding,
            sample_rate_hertz=self.rate,
            language_code=self.language,),
        interim_results=self.interim_results)

    try:
      request_stream = self._request_stream()
      resp_stream = service.streaming_recognize(
          streaming_config, request_stream)
      self._handle_results(resp_stream)
    finally:
      self.stop()
Example 14
def get_client(lang='en-US',
               sample_rate=16000,
               interim_results=False,
               single_utterance=True,
               phrase_key=""):
    """
    Helper to return client and config
    """
    client = SpeechClient()
    config = types.StreamingRecognitionConfig(
        config=types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=sample_rate,
            language_code=lang,
            # Enhanced models are only available to projects that
            # opt in for audio data collection.
            use_enhanced=True,
            # A model must be specified to use enhanced model.
            model="command_and_search",
            speech_contexts=[
                types.SpeechContext(phrases=PhraseGenerator.get_phrases(
                    "app/config.json", phrase_key), )
            ]),
        interim_results=interim_results,
        single_utterance=single_utterance)
    print(str(config))
    return client, config
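
PhraseGenerator is not shown in this snippet; a minimal stand-in that reads hint phrases from a JSON file keyed by phrase_key might look like this (the config format is an assumption):

import json

class PhraseGenerator:
    @staticmethod
    def get_phrases(config_path, phrase_key):
        # Assumed format: {"<phrase_key>": ["phrase one", "phrase two", ...]}
        with open(config_path) as f:
            return json.load(f).get(phrase_key, [])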
Example 15
def transcribe_gcs(gcs_uri):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code='en-US',
        profanity_filter=True,
        speech_contexts=[
            speech.types.SpeechContext(
                phrases=['Andy', 'Wisy', 'EEP', 'Project', 'Tom', 'Jeff'], )
        ],
    )

    operation = client.long_running_recognize(config, audio)

    print('GCS -- Waiting for operation to complete...')
    response = operation.result(timeout=90)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(('Transcript: {}'.format(result.alternatives[0].transcript)))
        print(('Confidence: {}'.format(result.alternatives[0].confidence)))
Example 16
def audio_main():
    f = open(u"Nao_log.txt", u"a")
    f.write(
        u'##**************************** Audio Log File (Group 1) *********************************##'
    )
    f.close()

    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = u'en-US'  # a BCP-47 language tag

    # If enabled, each word in the first alternative of each result will be
    # tagged with a speaker tag to identify the speaker.
    enable_speaker_diarization = True

    # Optional. Specifies the estimated number of speakers in the conversation.
    #diarization_speaker_count = 2

    client = speech_v1p1beta1.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
        enable_speaker_diarization=enable_speaker_diarization)

    streaming_config = types.StreamingRecognitionConfig(config=config,
                                                        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:

        while not stream.closed:
            sys.stdout.write(YELLOW)
            sys.stdout.write(u'\n' + unicode(STREAMING_LIMIT *
                                             stream.restart_counter) +
                             u': NEW REQUEST\n')

            stream.audio_input = []
            audio_generator = stream.generator()

            requests = (types.StreamingRecognizeRequest(audio_content=content)
                        for content in audio_generator)

            responses = client.streaming_recognize(streaming_config, requests)

            # Now, put the transcription responses to use.

            listen_print_loop(responses, stream)

            if stream.result_end_time > 0:
                stream.final_request_end_time = stream.is_final_end_time
            stream.result_end_time = 0
            stream.last_audio_input = []
            stream.last_audio_input = stream.audio_input
            stream.audio_input = []
            stream.restart_counter = stream.restart_counter + 1

            if not stream.last_transcript_was_final:
                sys.stdout.write(u'\n')
            stream.new_stream = True
Example 17
def google_transcribe(audio_file_name):

    file_name = filepath + audio_file_name
    frame_rate, channels = frame_rate_channel(file_name)

    if channels > 1:
        stereo_to_mono(file_name)

    bucket_name = bucketname
    source_file_name = filepath + audio_file_name
    destination_blob_name = audio_file_name

    upload_blob(bucket_name, source_file_name, destination_blob_name)

    gcs_uri = 'gs://' + bucketname + '/' + audio_file_name
    transcript = ''

    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)

    print("Setting up configurations")
    speech_context = speech.types.SpeechContext(phrases=[
        "$OOV_CLASS_DIGIT_SEQUENCE", "$YEAR", "$PERCENT", "$MONEY", "$MONTH"
    ])
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code='en-US',
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
        speech_contexts=[speech_context],
        use_enhanced=True,
        model="phone_call")

    # Detects speech in the audio file
    print("detecting speech")
    operation = client.long_running_recognize(config, audio)
    response = operation.result(timeout=10000)
    result = response.results[-1]
    words_info = result.alternatives[0].words

    tag = 1
    speaker = ""

    print("Assembling words")
    for word_info in words_info:
        if word_info.speaker_tag == tag:
            speaker = speaker + " " + word_info.word
        else:  #Changed
            transcript += "speaker {}: {}".format(tag, speaker) + '\n'
            tag = word_info.speaker_tag
            speaker = "" + word_info.word

    transcript += "speaker {}: {}".format(tag, speaker)

    delete_blob(bucket_name, destination_blob_name)
    return transcript
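
upload_blob, delete_blob, frame_rate_channel and stereo_to_mono are referenced above but not defined. Plausible implementations, assuming WAV input and that google-cloud-storage and pydub are installed:

import wave

from google.cloud import storage
from pydub import AudioSegment

def frame_rate_channel(audio_file_name):
    # Read the sample rate and channel count from the WAV header.
    with wave.open(audio_file_name, 'rb') as wf:
        return wf.getframerate(), wf.getnchannels()

def stereo_to_mono(audio_file_name):
    # Downmix the file to mono in place; the diarization config above expects single-channel audio.
    sound = AudioSegment.from_wav(audio_file_name)
    sound.set_channels(1).export(audio_file_name, format='wav')

def upload_blob(bucket_name, source_file_name, destination_blob_name):
    # Copy the local file into the GCS bucket so long_running_recognize can read it by URI.
    storage.Client().bucket(bucket_name).blob(destination_blob_name).upload_from_filename(source_file_name)

def delete_blob(bucket_name, blob_name):
    # Remove the uploaded object once transcription is finished.
    storage.Client().bucket(bucket_name).blob(blob_name).delete()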
Example 18
def sub_main(profanityFilterBool):
    """
    *** Code taken from Google Cloud Speech to text documentation ***
    Turns on the profanity filter so bad words are censored and not printed
    """
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'en-US'  # a BCP-47 language tag
    sp_c_cico = {
        "phrases": ["Hey cico", "Hey Kiko"],
        "boost": 30.0
    }  # speech_contexts_cico
    sp_c_kiko = {
        "phrases": ["cico", "Cico", "kiko", "Kiko", "kygo", "Kitty, girl"],
        "boost": 0
    }
    movement_words = {
        "phrases" : ["move", "feet", "forward", "right", "left", "backward", "degrees", "radians", "to the left", "to the right"],
        "boost": 20.0
    }
    numbers = {
        "phrases": ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"],
        "boost": 5.0
    }
    relevant_words = {
        "phrases": ["cornell cup robotics", "and", "pick up", "grab"],
        "boost": 10.0
    }
    speech_contexts = [sp_c_cico, sp_c_kiko, movement_words, relevant_words]
    client = speech_v1p1beta1.SpeechClient()
    # print(help(types.RecognitionConfig))
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
        enable_automatic_punctuation=True,
        profanity_filter=profanityFilterBool,
        speech_contexts=speech_contexts)

    streaming_config = types.StreamingRecognitionConfig(
        config=config,
        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:

        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)

        responses = client.streaming_recognize(streaming_config, requests)
        # Now, put the transcription responses to use.
        solution = returnResponseString(responses)  # solution is the result

        append_to_file("log.txt", str(solution))

    return solution
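
returnResponseString and append_to_file are assumed helpers defined elsewhere; append_to_file is presumably nothing more than:

def append_to_file(path, text):
    # Append one line to the given log file, creating it if needed.
    with open(path, 'a') as f:
        f.write(text + '\n')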
Example 19
def convertFLACToTranscript(fileFullPathname, split_length_inSeconds):
    pool = Pool(16) # Number of concurrent threads

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        # sample_rate_hertz=512000,
        language_code='gu-IN')
    files = [f for f in listdir(fileFullPathname) if isfile(join(fileFullPathname, f))]   
    #file_Direcotry = os.path.dirname(fileFullPathname)
    
    def transcribe(data):
        idx, file = data
        num, _ = file.split('.')
        text_script = ""
        name = file
        print(file + " - started")
        # Loads the audio into memory
        with io.open(fileFullPathname+'\\'+file, 'rb') as audio_file:
            content = audio_file.read()
            audio = types.RecognitionAudio(content=content)
        # Transcribe audio file
        # Detects speech in the audio file
        client = speech.SpeechClient()
        response = client.recognize(config, audio)
        for result in response.results:
            text_script += result.alternatives[0].transcript
            
        print(name + " - done")
        return {
            "idx": num,
            "text": text_script
        }

    all_text = pool.map(transcribe, enumerate(files))
    pool.close()
    pool.join()

    transcript = ""
    total_seconds = 0
    for t in sorted_nicely(all_text): #sorted(all_text, key=lambda x: sorted_nicely(x['idx'])):
        #print("Duration of file {} is {}".format(fileFullPathname+t['idx']+'.flac', math.ceil(get_duration(fileFullPathname+'\\'+t['idx']+'.flac'))))
        total_seconds += math.ceil(get_duration(fileFullPathname+'\\'+t['idx']+'.flac'))
        # Cool shortcut from:
        # https://stackoverflow.com/questions/775049/python-time-seconds-to-hms
        # to get hours, minutes and seconds
        m, s = divmod(total_seconds, 60)
        h, m = divmod(m, 60)

        # Format time as h:m:s - 30 seconds of text
        transcript = transcript + "{:0>2d}:{:0>2d}:{:0>2d} {}\n".format(h, m, s, t['text'])

    #print(transcript)

    with open("transcript.txt", "w", encoding='utf-8') as f:
        f.write(transcript)
Example 20
def transcribe_gcs(gcs_uri):
    from google.cloud import speech_v1p1beta1 as speech
    from google.cloud.speech_v1p1beta1 import enums
    from google.cloud.speech_v1p1beta1 import types
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        sample_rate_hertz=32000,
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
        language_code='ja-JP')

    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    operationResult = operation.result()

    d = datetime.datetime.today()
    today = d.strftime("%Y%m%d-%H%M%S")
    fout = codecs.open('output{}.txt'.format(today), 'a', 'shift_jis')

    speaker_1_words = ""
    speaker_1_s = float(0)
    speaker_2_words = ""
    speaker_2_s = float(0)

    for word in operationResult.results[-1].alternatives[0].words:
        tmp_word = u'{}'.format(word.word.split("|")[0])
        start_time = float(word.start_time.seconds) + float(
            word.start_time.nanos) / 1000 / 1000 / 1000
        end_time = float(word.end_time.seconds) + float(
            word.end_time.nanos) / 1000 / 1000 / 1000

        s = end_time - start_time

        if word.speaker_tag == 1:
            speaker_1_s += s
            speaker_1_words += tmp_word
        else:
            speaker_2_s += s
            speaker_2_words += tmp_word

    fout.write('speaker_1: \n{}\n'.format(speaker_1_words))
    fout.write('s: {}\n'.format(speaker_1_s))

    fout.write('speaker_2: \n{}\n'.format(speaker_2_words))
    fout.write('s: {}\n'.format(speaker_2_s))

    #for result in operationResult.results:
    #  for alternative in result.alternatives:
    #      fout.write(u'{}\n'.format(alternative.transcript))

    fout.close()
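
The start/end time conversion above is repeated inline for every word; the same thing as a small helper (an optional refactor, not in the original):

def duration_to_seconds(d):
    # Convert a protobuf Duration-like value (seconds + nanos) to float seconds.
    return float(d.seconds) + float(d.nanos) / 1e9

# usage: s = duration_to_seconds(word.end_time) - duration_to_seconds(word.start_time)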
Example 21
def google_transcribe(audio_file_name):

    file_name = filepath + audio_file_name
    m4a_to_wav(file_name)

    # The name of the audio file to transcribe

    frame_rate, channels = frame_rate_channel(file_name)

    if channels > 1:
        stereo_to_mono(file_name)

    bucket_name = bucketname
    source_file_name = filepath + audio_file_name
    destination_blob_name = audio_file_name

    upload_blob(bucket_name, source_file_name, destination_blob_name)

    gcs_uri = 'gs://' + bucketname + '/' + audio_file_name
    transcript = ''

    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code='en-IN',
        enable_speaker_diarization=True,
        diarization_speaker_count=2)

    # Detects speech in the audio file
    operation = client.long_running_recognize(config, audio)
    response = operation.result(timeout=10000)
    result = response.results[-1]
    words_info = result.alternatives[0].words

    tag = 1
    speaker = ""

    for word_info in words_info:
        if word_info.speaker_tag == tag:
            speaker = speaker + " " + word_info.word
        else:

            transcript += f"speaker {tag}: {speaker}\n"
            tag = word_info.speaker_tag
            speaker = "" + word_info.word

    transcript += f"speaker {tag}: {speaker}"
    delete_blob(bucket_name, destination_blob_name)
    return transcript
Example 22
def transcribe_file(speech_file):
    """Transcribe the given audio file asynchronously."""
    from google.cloud import speech_v1p1beta1
    from google.cloud.speech_v1p1beta1 import enums
    from google.cloud.speech_v1p1beta1 import types
    client = speech_v1p1beta1.SpeechClient()

    # [START migration_async_request]
    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    print "Using ", speech_file, ", with the below config:"
    print ""
    print "importing speech_v1p1beta1"
    print "language_code='en-US'"
    print "use_enhanced=True"
    print "enable_automatic_punctuation=False"
    print "enable_word_time_offsets=False"
    print "profanity_filter=True"
    print "sample_rate=44100hz"
    print ""
    print "Transcript is as follows"

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code='en-US',
        use_enhanced=True,
        # A model must be specified to use enhanced model.
        model='phone_call',
        enable_automatic_punctuation=False,
        enable_word_time_offsets=False,
        profanity_filter=True,
        #        speech_contexts=[speech.types.SpeechContext(
        #           phrases=['Andy', 'Wisy', 'EEP', 'Project', 'Tom', 'Jeff'],
        #          )],
    )

    # [START migration_async_response]
    operation = client.long_running_recognize(config, audio)
    # [END migration_async_request]

    print('Waiting for operation to complete...')
    response = operation.result(timeout=90)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(u'Transcript: {}'.format(result.alternatives[0].transcript))
        print('Confidence: {}'.format(result.alternatives[0].confidence))
Example 23
def transcribe_streaming(stream_file, encoding="LINEAR16", sample_rate=16000):
    client = speech.SpeechClient()

    with io.open(stream_file, 'rb') as audio_file:
        content = audio_file.read()

    # In practice, stream should be a generator yielding chunks of audio data.
    stream = [content]

    requests = (types.StreamingRecognizeRequest(audio_content=chunk)
                for chunk in stream)

    config = types.RecognitionConfig(
        encoding=ENCODINGS[encoding],
        sample_rate_hertz=sample_rate,
        language_code='ko-KR',
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True,
        enable_speaker_diarization=True,  # not supported for Korean: every word comes back with the same speaker_tag
        diarization_speaker_count=3)
    streaming_config = types.StreamingRecognitionConfig(config=config)

    # streaming_recognize returns a generator.
    responses = client.streaming_recognize(streaming_config, requests)

    words_with_tags = []
    transcripts = []

    print("Waiting for transcribe...")
    for response in responses:
        for result in response.results:
            alternatives = result.alternatives
            for alternative in alternatives:
                print(u'Transcript: {}'.format(alternative.transcript))
                transcripts.append(
                    alternative.transcript)  # keep the punctuated sentence for later use
                for words in alternative.words:
                    word = words.word
                    start_time = round(
                        words.start_time.seconds +
                        words.start_time.nanos * 1e-9, 3)
                    end_time = round(
                        words.end_time.seconds + words.end_time.nanos * 1e-9,
                        3)
                    speaker_tag = words.speaker_tag
                    words_with_tags.append([
                        word, start_time, end_time, speaker_tag
                    ])  # [word, start_time, end_time, speaker_tag]
            print()  # newline

    return words_with_tags, transcripts
Example 24
    def process(self, loop):
        """
        Audio stream recognition and result parsing
        """
        #You can add speech contexts for better recognition
        cap_speech_context = types.SpeechContext(**self.context)
        metadata = types.RecognitionMetadata(**self.metadata)
        client = speech.SpeechClient()
        config = types.RecognitionConfig(encoding=self.encoding,
                                         sample_rate_hertz=self.rate,
                                         language_code=self.language,
                                         speech_contexts=[
                                             cap_speech_context,
                                         ],
                                         enable_automatic_punctuation=True,
                                         model=self.model,
                                         metadata=metadata)

        streaming_config = types.StreamingRecognitionConfig(
            config=config,
            interim_results=self.interim_results,
            single_utterance=self.single_utterance)
        audio_generator = self.stream_generator()
        requests = iter(
            types.StreamingRecognizeRequest(audio_content=content)
            for content in audio_generator)

        responses = client.streaming_recognize(streaming_config, requests)
        #print('process',type(responses))
        try:
            #print('process')
            for response in responses:
                #print('process received')
                if self.terminated:
                    break
                if not response.results:
                    continue
                result = response.results[0]
                if not result.alternatives:
                    continue
                speechData = MessageToDict(response)
                global_async_worker.add_task(self.async_callback(speechData))

                # debug
                transcript = result.alternatives[0].transcript

                print('>>', transcript, "(OK)" if result.is_final else "")
        except Exception as e:
            print('process excepted', e)
            self.start()
Example 25
def sample_long_running_recognize(local_file_path):
    """
    Print confidence level for individual words in a transcription of a short audio
    file
    Separating different speakers in an audio file recording

    Args:
      local_file_path Path to local audio file, e.g. /path/audio.wav
    """

    client = speech_v1p1beta1.SpeechClient()

    # local_file_path = 'audio_files/2speaker.m4a'

    # If enabled, each word in the first alternative of each result will be
    # tagged with a speaker tag to identify the speaker.
    enable_speaker_diarization = True

    # Optional. Specifies the estimated number of speakers in the conversation.
    diarization_speaker_count = 2

    # The language of the supplied audio
    language_code = "en-US"

    config = types.RecognitionConfig(
        enable_speaker_diarization=enable_speaker_diarization,
        enable_automatic_punctuation=True,
        #diarization_speaker_count=diarization_speaker_count,
        # model='phone_call',
        language_code='en-US')

    with io.open(local_file_path, "rb") as f:
        content = f.read()
    audio = types.RecognitionAudio(content=content)

    operation = client.long_running_recognize(config, audio)

    print(u"Waiting for operation to complete...")
    response = operation.result()

    print(response.results)

    for result in response.results:
        # First alternative has words tagged with speakers
        alternative = result.alternatives[0]
        print(u"Transcript: {}".format(alternative.transcript))
        # Print the speaker_tag of each word
        for word in alternative.words:
            print(u"Word: {}".format(word.word))
            print(u"Speaker tag: {}".format(word.speaker_tag))
Example 26
 def gspeech_client(self):
     """Creates the Google Speech API client, configures it, and sends/gets
     audio/text data for parsing.
     """
     language_code = 'en-US'
     # Hints for the API
     context = types.SpeechContext(phrases=self.context)
     client = speech.SpeechClient()
     # Create metadata object, helps processing
     metadata = types.RecognitionMetadata()
     # Interaction Type:
     # VOICE_SEARCH: Transcribe spoken questions and queries into text.
     # VOICE_COMMAND: Transcribe voice commands, such as for controlling a device.
     metadata.interaction_type = (
         enums.RecognitionMetadata.InteractionType.VOICE_COMMAND)
     # Microphone Distance:
     # NEARFIELD: The audio was captured from a closely placed microphone.
     # MIDFIELD: The speaker is within 3 meters of the microphone.
     # FARFIELD: The speaker is more than 3 meters away from the microphone.
     metadata.microphone_distance = (
         enums.RecognitionMetadata.MicrophoneDistance.MIDFIELD)
     # Device Type:
     # PC: Speech was recorded using a personal computer or tablet.
     # VEHICLE: Speech was recorded in a vehicle.
     # OTHER_OUTDOOR_DEVICE: Speech was recorded outdoors.
     # OTHER_INDOOR_DEVICE: Speech was recorded indoors.
     metadata.recording_device_type = (
         enums.RecognitionMetadata.RecordingDeviceType.PC)
     # Media Type:
     # AUDIO: The speech data is an audio recording.
     # VIDEO: The speech data originally recorded on a video.
     metadata.original_media_type = (
         enums.RecognitionMetadata.OriginalMediaType.AUDIO)
     config = types.RecognitionConfig(
         encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
         sample_rate_hertz=16000,
         language_code=language_code,
         speech_contexts=[context],
         use_enhanced=True,
         model='command_and_search',
         metadata=metadata)
     streaming_config = types.StreamingRecognitionConfig(
         config=config, single_utterance=False, interim_results=False)
     # Hack from Google Speech Python docs, very pythonic c:
     requests = (types.StreamingRecognizeRequest(audio_content=content)
                 for content in self.generator())
     responses = client.streaming_recognize(streaming_config, requests)
     self._listen_print_loop(responses)
Example 27
    def __init__(self):
        self.client = speech.SpeechClient()
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code='en-US',
            enable_word_time_offsets=True,
            model='video',
            diarization_speaker_count=2,
            enable_automatic_punctuation=True,
            use_enhanced=True,
            enable_speaker_diarization=True,
            speech_contexts=[speech.types.SpeechContext(phrases=[])]
        )

        self.streaming_config = types.StreamingRecognitionConfig(config=config)
Example 28
def google_transcribe(uploaded_file_path):
    print("Converting: \t" + uploaded_file_path.split("/")[-1])
    wav_file_path = mp3_to_wav(uploaded_file_path)
    print("Converted: \t" + wav_file_path.split("/")[-1])
    print("Checking frame rate: \t", wav_file_path.split("/")[-1])
    frame_rate, channels = frame_rate_channel(wav_file_path)
    wav_name = wav_file_path.split("/")[-1]

    print("Uploading blob: \t", wav_name)
    upload_blob(bucket_name, wav_file_path, wav_name)

    print("Starting Transcripting: \t", wav_name)
    gcs_uri = 'gs://' + bucket_name + '/' + wav_name
    transcript = ''
    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code=Language_code,
        enable_speaker_diarization=True,
        diarization_speaker_count=2)

    # Detects speech in the audio file
    operation = client.long_running_recognize(config, audio)
    response = operation.result(timeout=10000)
    result = response.results[-1]
    words_info = result.alternatives[0].words

    tag = 1
    speaker = ""

    for word_info in words_info:
        if word_info.speaker_tag == tag:
            speaker = speaker + " " + word_info.word
        else:
            transcript += "speaker {}: {}".format(tag, speaker) + '\n'
            tag = word_info.speaker_tag
            speaker = "" + word_info.word

    transcript += "speaker {}: {}".format(tag, speaker)

    print("Deleting blob: \t", wav_name)
    delete_blob(bucket_name, wav_name)
    return transcript
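
mp3_to_wav (and the storage helpers reused from the earlier examples) are not included here; a possible mp3_to_wav using pydub, which needs ffmpeg for MP3 decoding:

from pydub import AudioSegment

def mp3_to_wav(mp3_file_path):
    # Hypothetical helper: write a WAV next to the MP3 and return its path.
    wav_file_path = mp3_file_path.rsplit('.', 1)[0] + '.wav'
    AudioSegment.from_mp3(mp3_file_path).export(wav_file_path, format='wav')
    return wav_file_path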
Example 29
    def listen(self, language_code='ja-JP'):
        """Listen."""
        # See http://g.co/cloud/speech/docs/languages
        # for a list of supported languages.

        client = speech.SpeechClient()
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=self.rate,
            model=None,
            speech_contexts=[types.SpeechContext(
            )],
            language_code=language_code)
        streaming_config = types.StreamingRecognitionConfig(
            config=config,
            single_utterance=True,
            interim_results=True
        )

        self.callbacks.get("ready", lambda: True)()

        with MicrophoneStream(self.rate, int(self.rate/10)) as stream:

            self.callbacks.get("start", lambda: True)()

            while True:
                try:
                    audio_generator = stream.generator()
                    requests = (types.StreamingRecognizeRequest(audio_content=content)
                                for content in audio_generator)
                    responses = client.streaming_recognize(streaming_config, requests)

                    self.listen_print_loop(responses)

                except exceptions.OutOfRange:
                    print("Time exceeded.(OutOfRange)")
                except exceptions.ServiceUnavailable:
                    print("Connection closed.(ServiceUnavailable)")
                except KeyboardInterrupt:
                    print("KeyboardInterrupt.")
                    break
                except:
                    print("Unexpected error:", sys.exc_info()[0])
                    raise

            self.callbacks.get("end", lambda: True)()
Example 30
    def __init__(self, speakers, speaker_count, sample_rate, chunk, language_code, exit_command):
        self.speakers = speakers
        self.speaker_count = speaker_count
        self.sample_rate = sample_rate
        self.chunk = chunk
        self.language_code = language_code
        self.exit_command = exit_command

        self.client = speech.SpeechClient()
        self.recognition_config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=self.sample_rate,
            language_code=self.language_code,
            enable_speaker_diarization=True,
            diarization_speaker_count=self.speaker_count)
        self.streaming_config = types.StreamingRecognitionConfig(
            config=self.recognition_config,
            interim_results=True)