Example #1
def transcribe_file_with_auto_punctuation():
    """Transcribe the given audio file with auto punctuation enabled."""
    # [START speech_transcribe_auto_punctuation_beta]
    from google.cloud import speech_v1p1beta1 as speech

    client = speech.SpeechClient()

    speech_file = "resources/commercial_mono.wav"

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        # Enable automatic punctuation
        enable_automatic_punctuation=True,
    )

    response = client.recognize(config=config, audio=audio)

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print("-" * 20)
        print(u"First alternative of result {}".format(i))
        print(u"Transcript: {}".format(alternative.transcript))
Example #2
def transcribe_file_with_multichannel():
    """Transcribe the given audio file synchronously with
    multi channel."""
    # [START speech_transcribe_multichannel_beta]
    from google.cloud import speech_v1p1beta1 as speech

    client = speech.SpeechClient()

    speech_file = "resources/Google_Gnome.wav"

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
        audio_channel_count=1,
        enable_separate_recognition_per_channel=True,
    )

    response = client.recognize(config=config, audio=audio)

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print("-" * 20)
        print("First alternative of result {}".format(i))
        print(u"Transcript: {}".format(alternative.transcript))
        print(u"Channel Tag: {}".format(result.channel_tag))
Example #3
def google_transcribe(audio_file_name):
    file_name = filepath + audio_file_name
    # mp3_to_wav(file_name)

    # The name of the audio file to transcribe
    
    frame_rate, channels = frame_rate_channel(file_name)
    
    if channels > 1:
        stereo_to_mono(file_name)
    
    bucket_name = bucketname
    source_file_name = filepath + audio_file_name
    destination_blob_name = audio_file_name
    
    upload_blob(bucket_name, source_file_name, destination_blob_name)
    
    gcs_uri = 'gs://' + bucketname + '/' + audio_file_name
    transcript = ''
        
    credential_path = "/home/asheeshg01/Speech-f22e193c0063.json"
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path
    
    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcs_uri)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code='en-US',
        enable_speaker_diarization=True,
        diarization_speaker_count=2)

    # Detects speech in the audio file
    operation = client.long_running_recognize(request={"config": config, "audio": audio})
    response = operation.result(timeout=10000)

    # With diarization, the last result carries the full word list with speaker tags
    result = response.results[-1]
    words_info = result.alternatives[0].words

    tag = 1
    speaker = ""

    # Group consecutive words by speaker tag into labeled transcript lines
    for word_info in words_info:
        if word_info.speaker_tag == tag:
            speaker = speaker + " " + word_info.word
        else:
            transcript += "speaker {}: {}".format(tag, speaker) + '\n'
            tag = word_info.speaker_tag
            speaker = "" + word_info.word

    transcript += "speaker {}: {}".format(tag, speaker)
    #for result in response.results:
        #transcript += result.alternatives[0].transcript
    
    delete_blob(bucket_name, destination_blob_name)
    return transcript
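Examples #3, #4, and #14 call helper functions (frame_rate_channel, stereo_to_mono, upload_blob, delete_blob) that are defined elsewhere in their source modules. A minimal sketch of what they plausibly look like, assuming the standard-library wave module, pydub for the downmix, and google-cloud-storage for the bucket operations:

import wave

from google.cloud import storage
from pydub import AudioSegment

def frame_rate_channel(audio_file_name):
    # Read the sample rate and channel count from the WAV header
    with wave.open(audio_file_name, "rb") as wave_file:
        return wave_file.getframerate(), wave_file.getnchannels()

def stereo_to_mono(audio_file_name):
    # Downmix the file to mono in place
    sound = AudioSegment.from_wav(audio_file_name)
    sound.set_channels(1).export(audio_file_name, format="wav")

def upload_blob(bucket_name, source_file_name, destination_blob_name):
    # Copy the local file into the bucket so it is reachable via a gs:// URI
    bucket = storage.Client().bucket(bucket_name)
    bucket.blob(destination_blob_name).upload_from_filename(source_file_name)

def delete_blob(bucket_name, blob_name):
    # Remove the temporary object once transcription is done
    bucket = storage.Client().bucket(bucket_name)
    bucket.blob(blob_name).delete()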
Example #4
def google_word_details(audio_file_name):
    file_name = filepath + audio_file_name
    second_lang = "hi-IN"
    frame_rate, channels = frame_rate_channel(file_name)
    bucket_name = bucketname
    source_file_name = filepath + audio_file_name
    destination_blob_name = audio_file_name
    upload_blob(bucket_name, source_file_name, destination_blob_name)
    gcs_uri = 'gs://' + bucketname + '/' + audio_file_name
    transcript = ''
    word_details = ''
    credential_path = s.get("credential_path")
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcs_uri)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code='en-US',
        alternative_language_codes=[second_lang],
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
        enable_word_time_offsets=True)

    # Detects speech in the audio file
    #operation = client.long_running_recognize(config, audio)

    operation = client.long_running_recognize(request={
        "config": config,
        "audio": audio
    })
    response = operation.result(timeout=10000)
    result = response.results[-1]
    words_info = result.alternatives[0].words

    for word_info in words_info:
        word = word_info.word
        start_time = word_info.start_time
        end_time = word_info.end_time
        speaker1 = word_info.speaker_tag
        word_details += " Word: {} : start_time: {}: end_time: {}: speaker {}".format(
            word, start_time.total_seconds(), end_time.total_seconds(),
            speaker1)

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    word_details_filename = audio_file_name.split('.')[0] + '_word_details.txt'
    blob_word_details_file = bucket.blob(word_details_filename)
    blob_word_details_file.upload_from_string(word_details)

    #delete_blob(bucket_name, destination_blob_name)
    return word_details
Example #5
def get_transcripts_json(gcstorage_path,
                         lang,
                         phrase_hints=[],
                         speaker_count=1,
                         enhanced_model=None):
    # transcribes audio files
    def _jsonify(res):
        # helper func for simplifying gcp speech client response
        json = []
        for section in res.results:
            data = {
                'transcript': section.alternatives[0].transcript,
                'words': []
            }
            for word in section.alternatives[0].words:
                data['words'].append({
                    'word': word.word,
                    'start_time': word.start_time.total_seconds(),
                    'end_time': word.end_time.total_seconds(),
                    'speaker_tag': word.speaker_tag
                })
            json.append(data)

        return json

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcstorage_path)
    diarize = speaker_count if speaker_count > 1 else False
    print(f"Diarizing: {diarize}")
    diarizationConfig = speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=speaker_count
        if speaker_count > 1 else False, )

    # if eng only, can use the optimized video model
    if lang == 'en':
        enhanced_model = 'video'

    config = speech.RecognitionConfig(
        language_code='en-US' if lang == 'en' else lang,
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True,
        speech_contexts=[{
            'phrases': phrase_hints,
            'boost': 15
        }],
        diarization_config=diarizationConfig,
        profanity_filter=True,
        use_enhanced=True if enhanced_model else False,
        model='video' if enhanced_model else None)

    res = client.long_running_recognize(config=config, audio=audio).result()

    return _jsonify(res)
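Note that this example passes an integer (or False) to enable_speaker_diarization, relying on implicit truthiness. SpeakerDiarizationConfig in speech_v1p1beta1 documents a boolean flag plus explicit speaker-count bounds, so a more explicit construction would be the sketch below (assuming the speaker count is known exactly):

diarizationConfig = speech.SpeakerDiarizationConfig(
    enable_speaker_diarization=speaker_count > 1,
    # Bound how many distinct speaker tags the model may emit
    min_speaker_count=speaker_count,
    max_speaker_count=speaker_count,
)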
Example #6
def transcribe_file_with_multilanguage(files_path=r'D:/dirname'):
    client = speech.SpeechClient()

    first_lang = "fr-FR"
    #second_lang = "cmn-Hans-CN"

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.MP3,
        #language_code='fr-FR',
        model='command_and_search',
        enable_automatic_punctuation=True,
        sample_rate_hertz=16000,
        #audio_channel_count=2,
        #enable_speaker_diarization=True,
        language_code=first_lang,
        #alternative_language_codes=[second_lang],
        #model="video",
    )

    for f in os.listdir(files_path):
        speech_file = os.path.join(files_path, f)
        outputfile = os.path.splitext(f)[0] + '.txt'
        outputfile = os.path.join(files_path, outputfile)
        if os.path.splitext(speech_file)[-1] != '.mp3':
            continue

        if os.path.exists(outputfile):
            print(speech_file + ' already transcribed in ' + outputfile)
            continue

        print(speech_file)
        with open(speech_file, "rb") as audio_file:
            content = audio_file.read()

        audio = speech.RecognitionAudio(content=content)
        #gcs_uri = "gs://pathname.mp3"
        #audio = speech.RecognitionAudio(uri=gcs_uri)

        # recognize() is synchronous; for audio over a minute, switch to:
        #operation = client.long_running_recognize(config=config, audio=audio)
        #response = operation.result(timeout=30)
        response = client.recognize(config=config, audio=audio)
        #print(response.results)

        print('saving to ' + outputfile)
        with open(outputfile, 'w', encoding='utf-8') as out:
            for i, result in enumerate(response.results):
                alternative = result.alternatives[0]

                print("-" * 20)
                print(u"First alternative of result {}: {}".format(
                    i, alternative))
                print(u"Transcript: {}".format(alternative.transcript))
                out.write(alternative.transcript)
                out.write('\n')
Example #7
def transcribe_file(speech_file, num_speakers):
    """Transcribe the given audio file asynchronously."""

    # Imports the Google Cloud client library
    #from google.cloud import speech
    from google.cloud import speech_v1p1beta1 as speech


    # Instantiates a client
    client = speech.SpeechClient()

    # Loads the audio into memory
    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()
    
    # Construct a recognition metadata object
    metadata = speech.RecognitionMetadata()
    metadata.interaction_type = speech.RecognitionMetadata.InteractionType.DISCUSSION
    metadata.recording_device_type = (
        speech.RecognitionMetadata.RecordingDeviceType.OTHER_INDOOR_DEVICE
    )
    metadata.audio_topic = "court trial hearing" 
    metadata.original_mime_type = "audio/mp3"

    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code="en-US",
        enable_automatic_punctuation=True,
        enable_speaker_diarization=True,
        diarization_speaker_count=num_speakers,
        # Enhanced models cost more than standard models.
        use_enhanced=True,
        model="video",
        enable_word_time_offsets=True,
        # Attach the recognition metadata constructed above
        metadata=metadata,
    )

    # Detects speech in the audio file -- short audio file
    print("Waiting for operation to complete...")
    response = client.recognize(config=config, audio=audio)
    result = response.results[-1]

    words_info = result.alternatives[0].words

    # Printing out the output:
    for word_info in words_info:
        print(u"word: '{}', speaker_tag: {}, start_time: {}, end_time: {}".format(
            word_info.word, word_info.speaker_tag,
            word_info.start_time.total_seconds(),
            word_info.end_time.total_seconds()))
Example #8
def speech_to_text(gcs_URI, keypath):
    # Reference: https://cloud.google.com/speech-to-text/docs/async-recognize
    # Set up credentials from local keypath
    # Example episode: https://www.listennotes.com/e/p/ea09b575d07341599d8d5b71f205517b/
    credentials = service_account.Credentials.from_service_account_file(
        keypath)
    audio = speech.RecognitionAudio(uri=gcs_URI)
    config = speech.RecognitionConfig(
        language_code="en-US",
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True,
        encoding=speech.RecognitionConfig.AudioEncoding.MP3,
        sample_rate_hertz=16000,
    )

    client = speech.SpeechClient(credentials=credentials)
    operation = client.long_running_recognize(config=config, audio=audio)
    print("Waiting for operation to complete...")
    response = operation.result()
    i = 1
    sentence = ''
    transcript_all = ''
    start_time_offset = []
    # Building a python dict (contains start time and words) from the response:
    for result in response.results:
        best_alternative = result.alternatives[0]
        transcript = best_alternative.transcript
        if i == 1:
            transcript_all = transcript
        else:
            transcript_all += " " + transcript
        i += 1
        # Getting timestamps
        for word_info in best_alternative.words:
            start_s = word_info.start_time.total_seconds()
            word = word_info.word
            if sentence == '':
                sentence = word
                sentence_start_time = start_s
            else:
                sentence += ' ' + word
                if '.' in word:
                    start_time_offset.append({
                        'time': sentence_start_time,
                        'sentence': sentence
                    })
                    sentence = ''
    speech_to_text_data = {
        'transcript': transcript_all,
        'timestamps': start_time_offset
    }
    print('Finish transcription.')
    return speech_to_text_data
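A hypothetical follow-up that consumes the returned dict; the bucket path and key-file name below are placeholders, not values from the original code:

data = speech_to_text("gs://my-bucket/podcasts/episode.mp3", "service-account.json")
print(data['transcript'][:200])
for entry in data['timestamps']:
    # Each entry pairs a sentence with the start time of its first word
    print("{:.1f}s  {}".format(entry['time'], entry['sentence']))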
Example #9
def transcribe_gcs(gcs_uri):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    print('Process', gcs_uri)
    from google.cloud import speech_v1p1beta1 as speech
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = osp.abspath(
        configs['google_ca_dir'])

    client = speech.SpeechClient()

    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.MP3,
        sample_rate_hertz=44100,
        language_code="ja-jp",
        enable_word_time_offsets=True,
    )

    operation = client.long_running_recognize(config=config, audio=audio)

    print("Waiting for operation to complete...")
    response = operation.result(timeout=90)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.

    res = []
    for result in response.results:
        alternative = result.alternatives[0]
        # The first alternative is the most likely one for this portion.
        print(u"Transcript: {}".format(alternative.transcript))
        print("Confidence: {}".format(alternative.confidence))
        words = []
        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            words.append({
                'word': word,
                'start_time': start_time,
                'end_time': end_time
            })
            print(
                f"Word: {word}, start_time: {start_time.total_seconds()}, end_time: {end_time.total_seconds()}"
            )
        res.append({
            "Transcript": alternative.transcript,
            "Confidence": alternative.confidence,
            'word': words
        })

    with open(osp.join('res', gcs_uri[-7:-4]), 'wb') as f:
        pickle.dump(res, f)
Example #10
def my_transcribe():
    from google.cloud import speech_v1p1beta1 as speech
    client = speech.SpeechClient()

    speech_file = 'resources/voice_tom2.wav'
    # speech_file = 'resources/voice_tom_southern.wav'

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        #sample_rate_hertz=44100,
        language_code="th-TH",
        audio_channel_count=2,  # 2 (stereo), 1 (mono)
        enable_word_confidence=True,
        enable_word_time_offsets=True,
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
        model="default",
    )

    print("Waiting for operation to complete...")
    response = client.recognize(config=config, audio=audio)

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print("-" * 30)
        #print(u"Transcript: {}".format(alternative.transcript))
        print("Confidence: {}".format(alternative.confidence))
        print(u"Channel Tag: {}".format(result.channel_tag))
        ground_truth = get_ground_truth_text()
        hypothesis = str(alternative.transcript)
        print("Ground Truth: ", get_ground_truth_text())
        print("Hypothesis: ", hypothesis)

        atta = Tokenizer(model="attacut-sc")
        gt_word_tokenize = atta.tokenize(ground_truth)
        hp_word_tokenize = atta.tokenize(hypothesis)

        # gt_word_tokenize = word_tokenize(ground_truth, engine="newmm") # default=newmm, longest
        # hp_word_tokenize = word_tokenize(hypothesis, engine="newmm")

        print("Ground Truth Word Tokenize:", gt_word_tokenize)
        print("Hypothesis Word Tokenize:", hp_word_tokenize)
        error = evaluation.util.word_error_rate(hp_word_tokenize,
                                                gt_word_tokenize)
        print("WER: ", error)
Example #11
def get_speaker_diarization_results(source_file_name, speaker_count):
    client = speech.SpeechClient()

    gcs_uri = "gs://ami_corpus/meeting_files/" + source_file_name
    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=48000,
        language_code="en-US",
        enable_speaker_diarization=True,
        diarization_speaker_count=speaker_count,
    )
    operation = client.long_running_recognize(config=config, audio=audio)
    result = operation.result().results[-1]
    return result.alternatives[0].words
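The returned word list can be regrouped into speaker turns. A short sketch (the meeting file name and speaker count below are hypothetical):

words = get_speaker_diarization_results("ES2002a_meeting.wav", 4)
turns = []
for word in words:
    # Start a new turn whenever the speaker tag changes
    if turns and turns[-1]['speaker'] == word.speaker_tag:
        turns[-1]['text'] += ' ' + word.word
    else:
        turns.append({'speaker': word.speaker_tag, 'text': word.word})
for turn in turns:
    print("Speaker {}: {}".format(turn['speaker'], turn['text']))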
Example #12
def google():
    if request.method == 'POST':
        if os.path.exists("speechtotext.wav"):
            os.remove("speechtotext.wav")
        if os.path.exists("monosound.wav"):
            os.remove("monosound.wav")

        f = request.files['file']
        content = f.read()

        with open('speechtotext.wav', mode='bx') as file:
            file.write(content)

        client = speech.SpeechClient()
        speech_file = "speechtotext.wav"

        rate, data = wf.read(speech_file)
        data0 = data[:, 0]

        wf.write("monosound.wav", 48000, data0)

        with io.open("monosound.wav", "rb") as audio_file:
            content = audio_file.read()

        audio = speech.RecognitionAudio(content=content)

        ob = sf.SoundFile(speech_file)

        first_lang = "en-US"
        second_lang = "es-US"
        third_lang = "zh-cmn-Hans-CN"
        fourth_lang = "hi-IN"

        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=ob.samplerate,
            language_code="en-US",
            alternative_language_codes=[second_lang, third_lang, fourth_lang])

        response = client.recognize(config=config, audio=audio)

        text = ""
        for i, result in enumerate(response.results):
            alternative = result.alternatives[0]
            text = text + alternative.transcript + "\n"

        return jsonify({'text': text})
Example #13
def transcribe_file_with_metadata():
    """Send a request that includes recognition metadata."""
    # [START speech_transcribe_recognition_metadata_beta]
    from google.cloud import speech_v1p1beta1 as speech

    client = speech.SpeechClient()

    speech_file = "resources/commercial_mono.wav"

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    # Here we construct a recognition metadata object.
    # Most metadata fields are specified as enums that can be found
    # in speech.enums.RecognitionMetadata
    metadata = speech.RecognitionMetadata()
    metadata.interaction_type = speech.RecognitionMetadata.InteractionType.DISCUSSION
    metadata.microphone_distance = (
        speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD
    )
    metadata.recording_device_type = (
        speech.RecognitionMetadata.RecordingDeviceType.SMARTPHONE
    )

    # Some metadata fields are free form strings
    metadata.recording_device_name = "Pixel 2 XL"
    # And some are integers, for instance the 6 digit NAICS code
    # https://www.naics.com/search/
    metadata.industry_naics_code_of_audio = 519190

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        # Add this in the request to send metadata.
        metadata=metadata,
    )

    response = client.recognize(config=config, audio=audio)

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print("-" * 20)
        print(u"First alternative of result {}".format(i))
        print(u"Transcript: {}".format(alternative.transcript))
Example #14
def google_word_details(audio_file_name):
    file_name = filepath + audio_file_name
    frame_rate, channels = frame_rate_channel(file_name)
    bucket_name = bucketname
    source_file_name = filepath + audio_file_name
    destination_blob_name = audio_file_name
    upload_blob(bucket_name, source_file_name, destination_blob_name)
    gcs_uri = 'gs://' + bucketname + '/' + audio_file_name
    transcript = ''
    word_details = ''
    credential_path = "/home/asheeshg01/Speech-f22e193c0063.json"
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path
    
    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcs_uri)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code='en-US',
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
        enable_word_time_offsets=True)

    # Detects speech in the audio file
    operation = client.long_running_recognize(request={"config": config, "audio": audio})
    response = operation.result(timeout=10000)
    result = response.results[-1]
    words_info = result.alternatives[0].words

    for word_info in words_info:
        word = word_info.word
        start_time = word_info.start_time
        end_time = word_info.end_time
        speaker1 = word_info.speaker_tag
        word_details += " Word: {} : start_time: {}: end_time: {}: speaker {}".format(
            word, start_time.total_seconds(), end_time.total_seconds(), speaker1)

    delete_blob(bucket_name, destination_blob_name)
    return word_details
Example #15
def run_stt(lang):
    try:
        client = speech.SpeechClient()
        audio = speech.RecognitionAudio(uri='gs://kuza_audio/audio_file')
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.MP3,
            sample_rate_hertz=16000,
            language_code=lang,
        )
        operation = client.long_running_recognize(config=config, audio=audio)
    except Exception:
        return 'Fatal: failed to connect to the STT service.'

    try:
        response = operation.result(timeout=6000)
        text = ''
        for result in response.results:
            text += result.alternatives[0].transcript + ' '
    except Exception:
        return 'Fatal: caption generation failed (timed out).'
    return text
Example #16
def transcribe_gcs(gcs_uri, speakers_num, encoding):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    from google.cloud import speech_v1p1beta1 as speech

    output = []

    client = speech.SpeechClient()

    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=encoding,  #speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=48000,
        language_code="en-US",
        audio_channel_count=1,
        enable_automatic_punctuation=True,
        enable_speaker_diarization=True,
        diarization_speaker_count=speakers_num,
    )

    operation = client.long_running_recognize(config=config, audio=audio)

    # print("Waiting for operation to complete...")
    response = operation.result(timeout=6000)
    speaker_tagged_result_1 = response.results[-1]
    print(speaker_tagged_result_1.alternatives[0].words)
    for wordObj in speaker_tagged_result_1.alternatives[0].words:
        speaker_tag = str(wordObj.speaker_tag)
        output_item = {
            "word": wordObj.word,
            "start_time": wordObj.start_time.total_seconds(),
            "end_time": wordObj.end_time.total_seconds(),
            "speaker_tag": speaker_tag
        }
        output.append(output_item)

    output = sorted(output, key=lambda x: x['start_time'], reverse=False)
    return output
Example #17
def export_transcript_to_storage_beta(input_storage_uri, output_storage_uri,
                                      encoding, sample_rate_hertz,
                                      language_code):
    # [START speech_transcribe_with_speech_to_storage_beta]
    # input_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
    audio = speech.RecognitionAudio(uri=input_storage_uri)

    # Pass in the URI of the Cloud Storage bucket to hold the transcription
    output_config = speech.TranscriptOutputConfig(gcs_uri=output_storage_uri)

    # Speech configuration object
    config = speech.RecognitionConfig(
        encoding=encoding,
        sample_rate_hertz=sample_rate_hertz,
        language_code=language_code,
    )

    # Compose the long-running request
    request = speech.LongRunningRecognizeRequest(audio=audio,
                                                 config=config,
                                                 output_config=output_config)

    # Create the speech client
    speech_client = speech.SpeechClient()

    operation = speech_client.long_running_recognize(request=request)

    print("Waiting for operation to complete...")
    response = operation.result(timeout=90)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print("Transcript: {}".format(result.alternatives[0].transcript))
        print("Confidence: {}".format(result.alternatives[0].confidence))

    # [END speech_transcribe_with_speech_to_storage_beta]
    return response.results[0].alternatives[0].transcript
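A hypothetical invocation; the bucket paths and FLAC parameters below are placeholders:

transcript = export_transcript_to_storage_beta(
    "gs://my-bucket/audio/interview.flac",        # input audio
    "gs://my-bucket/transcripts/interview.json",  # where the transcription is written
    speech.RecognitionConfig.AudioEncoding.FLAC,
    44100,
    "en-US",
)
print(transcript)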
Example #18
def transcribe_file_with_auto_punctuation():
    from google.cloud import speech_v1p1beta1 as speech
    client = speech.SpeechClient()

    speech_file = 'resources/Google_Gnome.wav'

    with open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code='en-US',
        enable_automatic_punctuation=True)

    response = client.recognize(config=config, audio=audio)

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print('-' * 20)
        print(u'First alternative of result {}'.format(i))
        print(u'Transcript: {}'.format(alternative.transcript))
Example #19
def transcribe_file_with_diarization():
    """Transcribe the given audio file synchronously with diarization."""
    # [START speech_transcribe_diarization_beta]
    from google.cloud import speech_v1p1beta1 as speech

    client = speech.SpeechClient()

    speech_file = "resources/commercial_mono.wav"

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
    )

    print("Waiting for operation to complete...")
    response = client.recognize(config=config, audio=audio)

    # The transcript within each result is separate and sequential per result.
    # However, the words list within an alternative includes all the words
    # from all the results thus far. Thus, to get all the words with speaker
    # tags, you only have to take the words list from the last result:
    result = response.results[-1]

    words_info = result.alternatives[0].words

    # Printing out the output:
    for word_info in words_info:
        print(
            u"word: '{}', speaker_tag: {}".format(word_info.word, word_info.speaker_tag)
        )
Example #20
def main():

    client = speech_v1p1beta1.SpeechClient.from_service_account_json('key.json')

    #gcs_uri = "gs://edward-raw/audio/teste_speech_to_texto.mp3"
    gcs_uri = "gs://edward-raw/audio/test.mp3"
    audio = speech_v1p1beta1.RecognitionAudio(uri=gcs_uri)

    config = speech_v1p1beta1.RecognitionConfig(
        encoding=speech_v1p1beta1.RecognitionConfig.AudioEncoding.MP3,
        sample_rate_hertz=16000,
        language_code="pt-BR",
    )

    print("Waiting for operation to complete...")
    # Long audio
    #operation = client.long_running_recognize(config=config, audio=audio)
    #response = operation.result(timeout=100000)
    # Short audio
    response = client.recognize(config=config, audio=audio)

    for result in response.results:
        print(u"Transcript: {}".format(result.alternatives[0].transcript))
Example #21
def toText(file):
    speech_file = file
    first_lang = "he"  # Hebrew
    second_lang = "en-US"  # English US
    third_lang = "ru_RU"  # Russian
    fourth_lang = "ar"  # Arabic

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.MP3,
        sample_rate_hertz=48000,
        language_code=first_lang,
        alternative_language_codes=[second_lang, third_lang, fourth_lang],
    )

    # `client` is assumed to be a module-level speech.SpeechClient()
    response = client.recognize(config=config, audio=audio)

    # Return only the first result; callers can read result.alternatives[0]
    for result in response.results:
        return result
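When alternative_language_codes is set, each result reports which language the recognizer actually picked. A small sketch of how a caller might inspect it (result.language_code is part of the v1p1beta1 response):

response = client.recognize(config=config, audio=audio)
for result in response.results:
    # The BCP-47 tag of the detected language for this result
    print(result.language_code, result.alternatives[0].transcript)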
Example #22
    def transcribe(
        self,
        file_uri: Union[str, Path],
        phrases: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> transcript_model.Transcript:
        """
        Transcribe audio from GCS file and return a Transcript model.

        Parameters
        ----------
        file_uri: Union[str, Path]
            The GCS file uri to the audio file or caption file to transcribe.
            It should be in format 'gs://...'.
        phrases: Optional[List[str]] = None
            A list of strings to feed as targets to the model.

        Returns
        -------
        outputs: transcript_model.Transcript
            The transcript model for the supplied media file.
        """
        # Create client
        client = speech.SpeechClient.from_service_account_file(
            filename=str(self.credentials_file))

        # Create basic metadata
        metadata = speech.RecognitionMetadata()
        metadata.interaction_type = (
            speech.RecognitionMetadata.InteractionType.PHONE_CALL)
        metadata.original_media_type = (
            speech.RecognitionMetadata.OriginalMediaType.VIDEO)

        # Add phrases
        event_metadata_speech_context = speech.SpeechContext(
            phrases=self._clean_phrases(phrases))

        # Prepare for transcription
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code="en-US",
            enable_automatic_punctuation=True,
            enable_word_time_offsets=True,
            enable_spoken_punctuation=True,
            speech_contexts=[
                GOOGLE_SPEECH_ADAPTION_CLASSES,
                event_metadata_speech_context,
            ],
            metadata=metadata,
            model="video",
            use_enhanced=True,
        )
        audio = speech.RecognitionAudio(uri=file_uri)

        # Begin transcription
        log.debug(f"Beginning transcription for: {file_uri}")
        operation = client.long_running_recognize(request={
            "config": config,
            "audio": audio
        })

        # Wait for complete
        response = operation.result(timeout=10800)

        # Running totals used to compute the mean confidence across segments
        confidence_sum = 0
        segments = 0

        # Create timestamped sentences
        timestamped_sentences: List[transcript_model.Sentence] = []
        transcript_sentence_index = 0

        # Create sentence boundary pipeline
        nlp = English()
        nlp.add_pipe("sentencizer")

        for result in response.results:
            # Some portions of audio may not have text
            if len(result.alternatives) > 0:
                # Split transcript into sentences
                doc = nlp(result.alternatives[0].transcript)

                # Convert generator to list
                sentences = [str(sent) for sent in doc.sents]

                # Index holder for word results of response
                w_marker = 0
                for s_ind, _ in enumerate(sentences):
                    # Sentence text
                    s_text = sentences[s_ind]

                    num_words = len(s_text.split())

                    # Initialize sentence model
                    timestamped_sentence = transcript_model.Sentence(
                        index=transcript_sentence_index,
                        confidence=result.alternatives[0].confidence,
                        # Start and end time are placeholder values
                        start_time=0.0,
                        end_time=0.0,
                        words=[],
                        text=s_text,
                    )

                    for w_ind in range(w_marker, w_marker + num_words):
                        # Extract word from response
                        word = result.alternatives[0].words[w_ind]

                        # Nanos no longer supported, use microseconds instead
                        # https://github.com/googleapis/python-speech/issues/71
                        start_time = (word.start_time.seconds +
                                      word.start_time.microseconds * 1e-6)

                        end_time = (word.end_time.seconds +
                                    word.end_time.microseconds * 1e-6)

                        # Add start_time to Sentence if first word
                        if w_ind - w_marker == 0:
                            timestamped_sentence.start_time = start_time

                        # Add end_time to Sentence if last word
                        if (w_ind - w_marker) == (num_words - 1):
                            timestamped_sentence.end_time = end_time

                        # Create Word model
                        timestamped_word = transcript_model.Word(
                            index=w_ind - w_marker,
                            start_time=start_time,
                            end_time=end_time,
                            text=self._clean_word(word.word),
                        )

                        timestamped_sentence.words.append(timestamped_word)

                    # Increment word marker
                    w_marker += num_words

                    # Add Sentence to sentence list
                    timestamped_sentences.append(timestamped_sentence)

                    # Increment transcript sentence index
                    transcript_sentence_index += 1

                # Update confidence stats
                confidence_sum += result.alternatives[0].confidence
                segments += 1

        # Compute mean confidence
        if segments > 0:
            confidence = confidence_sum / segments
        else:
            confidence = 0.0
        log.info(
            f"Completed transcription for: {file_uri}. Confidence: {confidence}"
        )

        # Create transcript model
        transcript = transcript_model.Transcript(
            generator=f"Google Speech-to-Text -- CDP v{__version__}",
            confidence=confidence,
            session_datetime=None,
            created_datetime=datetime.utcnow().isoformat(),
            sentences=timestamped_sentences,
        )

        return transcript
Example #23
def get_transcripts_json(gcsPath,
                         langCode,
                         phraseHints=[],
                         speakerCount=1,
                         enhancedModel=None):
    """Transcribes audio files.

    Args:
        gcsPath (String): path to file in cloud storage (i.e. "gs://audio/clip.mp4")
        langCode (String): language code (i.e. "en-US", see https://cloud.google.com/speech-to-text/docs/languages)
        phraseHints (String[]): list of words that are unusual but likely to appear in the audio file.
        speakerCount (int, optional): Number of speakers in the audio. Only works on English. Defaults to 1.
        enhancedModel (String, optional): Option to use an enhanced speech model, i.e. "video"

    Returns:
        list | Operation.error
    """

    # Helper function for simplifying Google speech client response
    def _jsonify(result):
        json = []
        for section in result.results:
            data = {
                "transcript": section.alternatives[0].transcript,
                "words": []
            }
            for word in section.alternatives[0].words:
                data["words"].append({
                    "word":
                    word.word,
                    "start_time":
                    word.start_time.total_seconds(),
                    "end_time":
                    word.end_time.total_seconds(),
                    "speaker_tag":
                    word.speaker_tag
                })
            json.append(data)
        return json

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcsPath)

    diarize = speakerCount if speakerCount > 1 else False
    print(f"Diarizing: {diarize}")
    diarizationConfig = speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=speakerCount
        if speakerCount > 1 else False, )

    # In English only, we can use the optimized video model
    if langCode == "en":
        enhancedModel = "video"

    config = speech.RecognitionConfig(
        language_code="en-US" if langCode == "en" else langCode,
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True,
        speech_contexts=[{
            "phrases": phraseHints,
            "boost": 15
        }],
        diarization_config=diarizationConfig,
        profanity_filter=True,
        use_enhanced=True if enhancedModel else False,
        model="video" if enhancedModel else None)
    res = client.long_running_recognize(config=config, audio=audio).result()

    return _jsonify(res)
Example #24
def transcribe_gcs(gcs_uri, num_speakers):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""

    # Imports the Google Cloud client library
    #from google.cloud import speech
    from google.cloud import speech_v1p1beta1 as speech


    # Instantiates a client
    client = speech.SpeechClient()
    
    # Construct a recognition metadata object
    metadata = speech.RecognitionMetadata()
    metadata.interaction_type = speech.RecognitionMetadata.InteractionType.DISCUSSION
    metadata.recording_device_type = (
        speech.RecognitionMetadata.RecordingDeviceType.OTHER_INDOOR_DEVICE
    )
    metadata.audio_topic = "court trial hearing" 
    metadata.original_mime_type = "audio/mp3"

    audio = speech.RecognitionAudio(uri=gcs_uri)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code="en-US",
        enable_automatic_punctuation=True,
        enable_speaker_diarization=True,
        diarization_speaker_count=num_speakers,
        # Enhanced models cost more than standard models.
        use_enhanced=True,
        model="video",
        enable_word_time_offsets=True,
        # Attach the recognition metadata constructed above
        metadata=metadata,
    )

    # Detects speech in the audio file -- long audio file
    operation = client.long_running_recognize(config=config, audio=audio)

    print("Waiting for operation to complete...")
    response = operation.result(timeout=300)

    # Writing results to json

    result_counter = 0 
    word_counter = 0 
    output_json = {}

    for result in response.results:
        alternative = result.alternatives[0]
        output_json[f"{result_counter}_Transcript"] =  alternative.transcript
        output_json[f"{result_counter}_Confidence"] =  alternative.confidence
        result_counter += 1

        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            speaker_tag = word_info.speaker_tag

            output_json[f"{word_counter}_Word"] =  word
            output_json[f"{word_counter}_start_time"] =  start_time.total_seconds()
            output_json[f"{word_counter}_end_time"] =  end_time.total_seconds()
            output_json[f"{word_counter}_speaker_tag"] =  speaker_tag

            word_counter += 1

    with open("{}.json".format(gcs_uri.split('/')[-1][:-5]) , "w+") as file:
        json.dump(output_json, file)
    

    print("Dirized and transcribed {}".format(gcs_uri.split('/')[-1]))
Example #25
def transcribe_with_model_adaptation(
    project_id, location, storage_uri, custom_class_id, phrase_set_id
):

    """
    Create`PhraseSet` and `CustomClasses` to create custom lists of similar
    items that are likely to occur in your input data.
    """

    # Create the adaptation client
    adaptation_client = speech.AdaptationClient()

    # The parent resource where the custom class and phrase set will be created.
    parent = f"projects/{project_id}/locations/{location}"

    # Create the custom class
    custom_class_response = adaptation_client.create_custom_class(
        {
            "parent": parent,
            "custom_class_id": custom_class_id,
            "custom_class": {
                "items": [
                    {"value": "sushido"},
                    {"value": "altura"},
                    {"value": "taneda"},
                ]
            },
        }
    )

    # Create the phrase set
    phrase_set_response = adaptation_client.create_phrase_set(
        {
            "parent": parent,
            "phrase_set_id": phrase_set_id,
            "phrase_set": {
                "boost": 10,
                "phrases": [{"value": f"Visit restaurants like ${custom_class_id}"}],
            },
        }
    )

    # The next section shows how to use the newly created custom
    # class and phrase set to send a transcription request with speech adaptation

    # Speech adaptation configuration
    speech_adaptation = speech.SpeechAdaptation(
        phrase_sets=[phrase_set_response], custom_classes=[custom_class_response]
    )

    # speech configuration object
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=24000,
        language_code="en-US",
        adaptation=speech_adaptation,
    )

    # The name of the audio file to transcribe
    # storage_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]

    audio = speech.RecognitionAudio(uri=storage_uri)

    # Create the speech client
    speech_client = speech.SpeechClient()

    response = speech_client.recognize(config=config, audio=audio)

    for result in response.results:
        print("Transcript: {}".format(result.alternatives[0].transcript))

    # [END speech_transcribe_with_model_adaptation]
    return response.results[0].alternatives[0].transcript
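The custom class and phrase set persist after the request completes. A sketch of the matching cleanup calls on the same AdaptationClient, reusing the parent path and IDs from the function above (a hypothetical follow-up, not part of the original sample):

    # Delete the adaptation resources created earlier in the function
    adaptation_client.delete_phrase_set(
        name=f"{parent}/phraseSets/{phrase_set_id}"
    )
    adaptation_client.delete_custom_class(
        name=f"{parent}/customClasses/{custom_class_id}"
    )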
Example #26
# Auth
credentials = service_account.Credentials.from_service_account_file(
    PurePath(Path(__file__).resolve().parent).joinpath(
        Path(str(script_config["OPTS"]["Credentials"]))))

# Instantiate GC Speech client
client = speech.SpeechClient(credentials=credentials)

if str(script_config["OPTS"]["Mode"]) == "local":
    # Read-in audio from local file (60s limit, gs is recommended Mode)
    with io.open(
            PurePath(Path(__file__).resolve().parent).joinpath(
                str(script_config["OPTS"]["Path"])), "rb") as audio_file:
        content = audio_file.read()
        audio = speech.RecognitionAudio(content=content)
else:
    # Read-in audio from GS
    print(str(script_config["OPTS"]["Path"]))
    audio = speech.RecognitionAudio(uri=str(script_config["OPTS"]["Path"]))

# Config request
req_config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
    language_code=str(script_config["OPTS"]["Language"]),
    enable_speaker_diarization=True,
    diarization_speaker_count=int(script_config["OPTS"]["Speakers"]),
    enable_automatic_punctuation=True,
)

# Set GC Operation
Example #27
def async_transcribe(audio_file_paths,
                     bucket_name,
                     output_tsv_path,
                     sample_rate,
                     language_code,
                     speaker_count=0,
                     begin_sec=0.0):
    """Transcribe a given audio file using the async GCloud Speech-to-Text API.

  The async API has the advantage of being able to handle longer audio without
  state reset. Empirically, we've observed that the async calls lead to slightly
  better accuracy than streaming calls.

  Args:
    audio_file_paths: Paths to the audio files as a list of strings in the
      correct order.
    bucket_name: Name of GCS bucket used for holding objects temporarily.
    output_tsv_path: Path to the output TSV file.
    sample_rate: Audio sample rate.
    language_code: Language code for recognition.
    speaker_count: Number of speakers. If 0, speaker diarization will be
      disabled.
    begin_sec: Transcript begin timestamp in seconds.
  """
    tmp_audio_file = tempfile.mktemp(suffix=".flac")
    print("Temporary audio file: %s" % tmp_audio_file)
    audio_duration_s = concatenate_audio_files(audio_file_paths,
                                               tmp_audio_file)

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    destination_blob_name = os.path.basename(tmp_audio_file)
    blob = bucket.blob(destination_blob_name)
    print("Uploading %s to GCS bucket %s" % (tmp_audio_file, bucket_name))
    blob.upload_from_filename(tmp_audio_file)
    gcs_uri = "gs://%s/%s" % (bucket_name, destination_blob_name)
    print("Uploaded to GCS URI: %s" % gcs_uri)

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcs_uri)
    enable_speaker_diarization = speaker_count > 0
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=sample_rate,
        language_code=language_code,
        enable_speaker_diarization=enable_speaker_diarization,
        diarization_speaker_count=speaker_count)

    operation = client.long_running_recognize(config=config, audio=audio)
    timeout_s = int(audio_duration_s * 0.25)
    print("Waiting for async ASR operation to complete "
          "(audio duration: %.3f s; ASR timeout: %d s)..." %
          (audio_duration_s, timeout_s))
    response = operation.result(timeout=timeout_s)
    blob.delete()
    os.remove(tmp_audio_file)

    utterances = []
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        alt = result.alternatives[0]
        utterances.append(alt.transcript)
        print(u"Transcript: {}".format(alt.transcript))
        diarized_words = [(word.word, word.speaker_tag,
                           word.start_time.total_seconds(),
                           word.end_time.total_seconds())
                          for word in alt.words]
        # print("Confidence: {}".format(result.alternatives[0].confidence))

    regrouped_utterances = regroup_utterances(utterances, diarized_words)
    with open(output_tsv_path, "w" if not begin_sec else "a") as f:
        if not begin_sec:
            # Write the TSV header.
            f.write(tsv_data.HEADER + "\n")
        utterance_counter = 0
        for (regrouped_utterance, speaker_index, start_time_sec,
             end_time_sec) in regrouped_utterances:
            utterance_counter += 1
            line = "%.3f\t%.3f\t%s\t%s [U%d] [Speaker #%d]" % (
                start_time_sec + begin_sec, end_time_sec + begin_sec,
                tsv_data.SPEECH_TRANSCRIPT_TIER, regrouped_utterance,
                utterance_counter, speaker_index)
            print(line)
            f.write(line + "\n")
Example #28
from google.cloud import speech_v1p1beta1 as speech

client = speech.SpeechClient()

speech_file = "resources/commercial_mono.wav"

with open(speech_file, "rb") as audio_file:
    content = audio_file.read()

audio = speech.RecognitionAudio(content=content)

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=8000,
    language_code="en-US",
    enable_speaker_diarization=True,
    diarization_speaker_count=2,
)

print("Waiting for operation to complete...")
response = client.recognize(config=config, audio=audio)

# The transcript within each result is separate and sequential per result.
# However, the words list within an alternative includes all the words
# from all the results thus far. Thus, to get all the words with speaker
# tags, you only have to take the words list from the last result:
result = response.results[-1]

words_info = result.alternatives[0].words

# Printing out the output:
for word_info in words_info:
    print(u"word: '{}', speaker_tag: {}".format(word_info.word, word_info.speaker_tag))
Example #29
def get_transcript(speech_file,content_type):
    # google authentication
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/Users/andrewfung/Programming/Multiple Speaker Detection/multiple-speaker-detection-3ed65d50eff1.json'

    # wget -nc https://realenglishconversations.com/...

    # instantiate a speech client and declare an audio file
    client = speech.SpeechClient()

    with io.open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)

    if 'wav' in content_type:
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code="en-US",
            enable_speaker_diarization=True,
            diarization_speaker_count=2,
        )
    elif 'mpeg' in content_type:
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.MP3,
            sample_rate_hertz=16000,
            language_code="en-US",
            enable_speaker_diarization=True,
            diarization_speaker_count=2,
        )
    elif 'flac' in content_type:
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
            sample_rate_hertz=16000,
            language_code="en-US",
            enable_speaker_diarization=True,
            diarization_speaker_count=2,
        )

    print("Waiting for operation to complete...")
    response = client.recognize(config=config, audio=audio)

    result = response.results[-1]
    words_info = result.alternatives[0].words

    words_list = []
    # Printing out the output:
    for word_info in words_info:
        words_list.append(
            {
                'word': word_info.word,
                'speaker_tag': word_info.speaker_tag,
                'start_time': word_info.start_time,
                'end_time': word_info.end_time,
            }
        )
    # print(words_list)

    # create a script based on the words_list
    current_speaker = words_list[0]['speaker_tag']
    current_line = []
    script = []

    for item in words_list:
        if item['speaker_tag'] != current_speaker:
            # speaker changed: close out the previous line and start a new
            # one with this word (otherwise the boundary word is dropped)
            script.append(
                {
                    'speaker': current_speaker,
                    'line': current_line
                }
            )
            current_line = [item['word']]
            current_speaker = item['speaker_tag']
        else:
            # same speaker, add to the current line
            current_line.append(item['word'])

    script.append(
        {
            'speaker': current_speaker,
            'line': current_line
        }
    )

    script = [(f"Speaker {line['speaker']}: " + " ".join(line['line']) + "\n") for line in script]
    return script
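A hypothetical usage of the script builder above (the file name and MIME type are placeholders):

for line in get_transcript("conversation.wav", "audio/wav"):
    print(line, end="")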
Example #30
def google_api(id):
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.join(app.config['API_KEYS'],'Google_Api_Key.json')
    id=str(id)
    audio_file_name = r"interaction" + id + ".wav"
    audio_file_path = os.path.join(app.config['AUDIO_FILES'], audio_file_name)
    
    true_label_file_name = r'speaker_id_' + id + '.txt'
    true_label_path = os.path.join(app.config['TRUE_LABEL'], true_label_file_name) 
    
    # Instantiates a client
    client = speech.SpeechClient()

    # Loads the audio into memory
    with io.open(audio_file_path, "rb") as audio_file:
        content = audio_file.read()
        audio = speech.RecognitionAudio(content=content)
    
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
        enable_speaker_diarization=True,
        #diarization_speaker_count=3
    )
    
    # print("Waiting for operation to complete...\n")
    response = client.recognize(request={"config": config, "audio": audio})
    
    # The transcript within each result is separate and sequential per result.
    # However, the words list within an alternative includes all the words
    # from all the results thus far. Thus, to get all the words with speaker
    # tags, you only have to take the words list from the last result:
    result = response.results[-1]
    words_info = result.alternatives[0].words
    
    # Filling list of transcribed words
    list_of_words = []
    
    for word_info in words_info:
        list_of_words.append(word_info.word)
    
    # Join the transcribed words into a single text string
    text = " ".join(list_of_words)
    
    # Creating list of labels
    speaker_tags = []
        
    for word_info in words_info:
        speaker_tags.append(word_info.speaker_tag)   
    
    # Create new-labels dictionary for speaker tags
    speaker_tags_dict = {}
    counter = 0
    
    for tag in speaker_tags:
        if tag in speaker_tags_dict:
            continue
        else:
            speaker_tags_dict[tag] = counter
            counter += 1

    # Normalize speaker tags
    speaker_tags_normalized = [speaker_tags_dict[tag] for tag in speaker_tags]
    
    # True labels
    with open(true_label_path, 'r') as speaker_id_file:
        true_label_id = speaker_id_file.read()

    true_label_speaker_id = [int(c) for c in true_label_id.split(',')]
    
    # Setting length of label lists equal
    if len(speaker_tags_normalized) < len(true_label_speaker_id):
        length = len(speaker_tags_normalized)
        true_label_speaker_id = true_label_speaker_id[:length]
    else:
        length = len(true_label_speaker_id)
        speaker_tags_normalized = speaker_tags_normalized[:length]
    
    return text, speaker_tags_normalized, adjusted_rand_score(speaker_tags_normalized, true_label_speaker_id)