    def configureAPI(self):

        if self.title.find('.flac') != -1:
            with open(os.path.join(self.path, self.title), 'rb') as audio_file:
                content = audio_file.read()
                self.audio = types.RecognitionAudio(content=content)

        else:
            self.audio = types.RecognitionAudio(uri="gs://twiml-mp3/" +
                                                self.title + ".flac")

        self.config = types.RecognitionConfig(
            encoding=self.encoding,
            sample_rate_hertz=self.sample_rate,
            language_code=self.language_code,
            enable_automatic_punctuation=self.punctuation,
            enable_speaker_diarization=self.diarization,
            diarization_speaker_count=self.num_speakers,
            audio_channel_count=1,
            use_enhanced=self.enhanced,
            model=self.model,
            enable_word_time_offsets=self.time_offsets,
            enable_word_confidence=self.word_confidence,
            max_alternatives=self.max_alternatives,
            metadata=self.metadata,
            speech_contexts=[types.SpeechContext(phrases=self.phrases)])
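
A hedged follow-up sketch (not part of the original class): once configureAPI() has populated self.audio and self.config, they would typically be submitted the same way as the other examples on this page. The method name transcribe and the timeout are assumptions.

    def transcribe(self):
        # Hypothetical sketch: submit the audio/config built by configureAPI()
        # and block until the long-running operation finishes.
        client = speech.SpeechClient()
        operation = client.long_running_recognize(self.config, self.audio)
        return operation.result(timeout=3600)
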
def transcribe_gcs(gcs_uri, hertz, channel):
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,  # FLAC encoding
        sample_rate_hertz=int(hertz),  # must match the sample rate of the audio file
        audio_channel_count=int(channel),
        language_code='ja-JP',  # for Japanese audio
        enable_speaker_diarization=True,  # separate different speakers
        enable_automatic_punctuation=True,  # punctuation
        speech_contexts=SELECTED_PHRASES  # speech adaptation boost
    )
    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    operationResult = operation.result()

    filename = gcs_uri.rsplit('/', 1)[1].split('.')[0] + ".txt"
    outputfilepath = os.path.join(OUTPUT_FOLDER, filename)
    fout = codecs.open(outputfilepath, 'a', 'utf-8')
    for result in operationResult.results:
        for alternative in result.alternatives:
            fout.write(u'{}\n'.format(alternative.transcript))
    fout.close()
Example #3
def google_transcribe_file(fp, bucket_name='prof-resp-trans'):
    storage_client = google.cloud.storage.Client()
    bucket = storage_client.get_bucket(bucket_name)

    client = google.cloud.speech_v1p1beta1.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
        language_code='en-US',
        enable_word_time_offsets=True,
        enable_automatic_punctuation=True)

    blob = bucket.blob(fp.name)

    if not blob.exists():
        new_print('Uploading File: {}'.format(fp.name))

        blob.upload_from_filename(str(fp.resolve()))

        new_print('Finished Uploading: {}'.format(fp.name))
    else:
        new_print('File already uploaded: {}'.format(fp.name))

    new_print('Starting transcription...')

    audio = types.RecognitionAudio(
        uri='gs://{}/{}'.format(bucket_name, fp.name))

    response = client.long_running_recognize(config, audio)
    results = response.result()

    new_print('Transcription finished')

    return results
Example #4
def transcribe_file_with_diarization():
    audio = types.RecognitionAudio(uri=args['input'])
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        # encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=22050,
        language_code='en-US',
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
        model='phone_call')

    operation = client.long_running_recognize(config, audio)
    print("Waiting on response from google cloud...")
    response = operation.result(timeout=720)  ## 360 call 01
    for result in response.results:
        print("\n\n::BEGIN TRANSCRIPT::\n")
        print("{}".format(result.alternatives[0].transcript))
        print("\n::END TRANSCRIPT::\n\n")

        print("\t\tCONFIDENCE: {} \n\n".format(
            result.alternatives[0].confidence))
        print("::BEGIN SPEAKER DIARIZATION::\n")
        words_info = result.alternatives[0].words
        for word_info in words_info:
            print("{}: '{}'".format(word_info.speaker_tag, word_info.word))
        print("\n::END SPEAKER DIARIZATION")
Example #5
    def read_audio(self, filepath):
        with io.open(filepath, "rb") as f:
            content = f.read()
            audio = types.RecognitionAudio(content=content)
        #audio = {"content": content}

        return audio
Example #6
def diarized_transcribe(gcred, gcs_uri, speakercount):
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = gcred

    client = speech_v1p1beta1.SpeechClient()
    audio = beta_types.RecognitionAudio(uri=gcs_uri)

    config = beta_types.RecognitionConfig(
        encoding=beta_enums.RecognitionConfig.AudioEncoding.FLAC,
        language_code='en-US',
        enable_speaker_diarization=True,
        diarization_speaker_count=speakercount,
        enable_word_time_offsets=True,
        model='video',
        enable_automatic_punctuation=True)

    operation = client.long_running_recognize(config, audio)

    response = operation.result(timeout=3600)

    transcript = MessageToDict(response)

    transcript = transcript.get('results')
    transcript = transcript.pop()
    transcript = transcript.get('alternatives')
    transcript = transcript.pop()
    transcript = transcript.get('words')

    return transcript
Example #7
def read(file_name, result_file, time=50):
    # Instantiates a client
    client = speech.SpeechClient()

    # The name of the audio file to transcribe
    file_name = os.path.join(
        os.path.dirname(__file__),
        file_name)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        audio_channel_count=2,
        sample_rate_hertz=44100,
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
        language_code='ko-KR')

    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        with io.open(result_file, 'w') as f:
            FILE_BYTE = 44100 * 2
            read_byte = FILE_BYTE * time

            while True:
                content = audio_file.read(read_byte)
                if not content:
                    break
                audio = types.RecognitionAudio(content=content)
                response = client.recognize(config, audio)

                for result in response.results:
                    f.write(result.alternatives[0].transcript)
                    print(result.alternatives[0])
                    print('Transcript: {}'.format(result.alternatives[0].transcript))
def transcribe_gcs(gcs_uri):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code='en-US',
        profanity_filter=True,
        speech_contexts=[
            speech.types.SpeechContext(
                phrases=['Andy', 'Wisy', 'EEP', 'Project', 'Tom', 'Jeff'], )
        ],
    )

    operation = client.long_running_recognize(config, audio)

    print('GCS -- Waiting for operation to complete...')
    response = operation.result(timeout=90)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(('Transcript: {}'.format(result.alternatives[0].transcript)))
        print(('Confidence: {}'.format(result.alternatives[0].confidence)))
Example #9
def transcribe_gcs(gcs_uri, hint_phrases, set_config):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    client = speech_v1p1beta1.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)

    # hint_phrase = []
    # set_config['enable_speaker_diarization'] = 'False'

    print(set_config.get('enable_automatic_punctuation'))
    # Set default values, check dict having each key and cast from str to each type.
    config = types.RecognitionConfig(
        encoding=eval(set_config.get('encoding', 'enums.RecognitionConfig.AudioEncoding.FLAC')),
        sample_rate_hertz=int(set_config.get('sample_rate_hertz', 16000)),
        language_code=set_config.get('language_code', 'en-US'),
        enable_automatic_punctuation=eval(set_config.get('enable_automatic_punctuation', 'True')),
        enable_speaker_diarization=eval(set_config.get('enable_speaker_diarization', 'False')),
        diarization_speaker_count=int(set_config.get('diarization_speaker_count', 1)),
        speech_contexts=[speech_v1p1beta1.types.SpeechContext(phrases=hint_phrases)])

    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    response = operation.result(timeout=900)
    return response
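
Since this function casts and eval()s every setting itself, the caller passes set_config with string values throughout. A hypothetical invocation (the bucket URI, phrases, and setting values are placeholders):

set_config = {
    'encoding': 'enums.RecognitionConfig.AudioEncoding.FLAC',
    'sample_rate_hertz': '44100',
    'language_code': 'en-US',
    'enable_automatic_punctuation': 'True',
    'enable_speaker_diarization': 'True',
    'diarization_speaker_count': '2',
}
response = transcribe_gcs('gs://example-bucket/interview.flac',
                          ['Speech-to-Text', 'diarization'],
                          set_config)
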
Example #10
def google_transcribe(audio_file_name):

    file_name = filepath + audio_file_name
    frame_rate, channels = frame_rate_channel(file_name)

    if channels > 1:
        stereo_to_mono(file_name)

    bucket_name = bucketname
    source_file_name = filepath + audio_file_name
    destination_blob_name = audio_file_name

    upload_blob(bucket_name, source_file_name, destination_blob_name)

    gcs_uri = 'gs://' + bucketname + '/' + audio_file_name
    transcript = ''

    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)

    print("Setting up configurations")
    speech_context = speech.types.SpeechContext(phrases=[
        "$OOV_CLASS_DIGIT_SEQUENCE", "$YEAR", "$PERCENT", "$MONEY", "$MONTH"
    ])
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code='en-US',
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
        speech_contexts=[speech_context],
        use_enhanced=True,
        model="phone_call")

    # Detects speech in the audio file
    print("detecting speech")
    operation = client.long_running_recognize(config, audio)
    response = operation.result(timeout=10000)
    result = response.results[-1]
    words_info = result.alternatives[0].words

    tag = 1
    speaker = ""

    print("Assembling words")
    for word_info in words_info:
        if word_info.speaker_tag == tag:
            speaker = speaker + " " + word_info.word
        else:  #Changed
            transcript += "speaker {}: {}".format(tag, speaker) + '\n'
            tag = word_info.speaker_tag
            speaker = "" + word_info.word

    transcript += "speaker {}: {}".format(tag, speaker)

    delete_blob(bucket_name, destination_blob_name)
    return transcript
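
This example calls several helpers (frame_rate_channel, stereo_to_mono, upload_blob, delete_blob) that are defined elsewhere in its repository. A minimal sketch of what they might look like, assuming WAV input, pydub for the downmix, and the google-cloud-storage client; the original implementations may differ.

import wave

from pydub import AudioSegment
from google.cloud import storage


def frame_rate_channel(audio_file_name):
    # Read the sample rate and channel count from the WAV header.
    with wave.open(audio_file_name, 'rb') as wave_file:
        return wave_file.getframerate(), wave_file.getnchannels()


def stereo_to_mono(audio_file_name):
    # Downmix a stereo WAV file to mono in place.
    sound = AudioSegment.from_wav(audio_file_name)
    sound.set_channels(1).export(audio_file_name, format='wav')


def upload_blob(bucket_name, source_file_name, destination_blob_name):
    # Copy the local audio file into the Cloud Storage bucket.
    storage.Client().bucket(bucket_name).blob(destination_blob_name).upload_from_filename(source_file_name)


def delete_blob(bucket_name, blob_name):
    # Remove the uploaded audio once transcription has finished.
    storage.Client().bucket(bucket_name).blob(blob_name).delete()
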
def transcribe_file(content):

    audio = types.RecognitionAudio(content=content)
    response = client.recognize(config, audio)

    try:
        text = response.results[0].alternatives[0].transcript
        return text[0].upper() + text[1:] + "?"
    except IndexError:
        return ""
Example #12
def transcribe_gcs(gcs_uri):
    from google.cloud import speech_v1p1beta1 as speech
    from google.cloud.speech_v1p1beta1 import enums
    from google.cloud.speech_v1p1beta1 import types
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        sample_rate_hertz=32000,
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
        language_code='ja-JP')

    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    operationResult = operation.result()

    d = datetime.datetime.today()
    today = d.strftime("%Y%m%d-%H%M%S")
    fout = codecs.open('output{}.txt'.format(today), 'a', 'shift_jis')

    speaker_1_words = ""
    speaker_1_s = float(0)
    speaker_2_words = ""
    speaker_2_s = float(0)

    for word in operationResult.results[-1].alternatives[0].words:
        tmp_word = u'{}'.format(word.word.split("|")[0])
        start_time = float(word.start_time.seconds) + float(
            word.start_time.nanos) / 1000 / 1000 / 1000
        end_time = float(word.end_time.seconds) + float(
            word.end_time.nanos) / 1000 / 1000 / 1000

        s = end_time - start_time

        if word.speaker_tag == 1:
            speaker_1_s += s
            speaker_1_words += tmp_word
        else:
            speaker_2_s += s
            speaker_2_words += tmp_word

    fout.write('speaker_1: \n{}\n'.format(speaker_1_words))
    fout.write('s: {}\n'.format(speaker_1_s))

    fout.write('speaker_2: \n{}\n'.format(speaker_2_words))
    fout.write('s: {}\n'.format(speaker_2_s))

    #for result in operationResult.results:
    #  for alternative in result.alternatives:
    #      fout.write(u'{}\n'.format(alternative.transcript))

    fout.close()
def transcribe_file(speech_file):
    """Transcribe the given audio file asynchronously."""
    from google.cloud import speech_v1p1beta1
    from google.cloud.speech_v1p1beta1 import enums
    from google.cloud.speech_v1p1beta1 import types
    client = speech_v1p1beta1.SpeechClient()

    # [START migration_async_request]
    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    print "Using ", speech_file, ", with the below config:"
    print ""
    print "importing speech_v1p1beta1"
    print "language_code='en-US'"
    print "use_enhanced=True"
    print "enable_automatic_punctuation=False"
    print "enable_word_time_offsets=False"
    print "profanity_filter=True"
    print "sample_rate=44100hz"
    print ""
    print "Transcript is as follows"

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code='en-US',
        use_enhanced=True,
        # A model must be specified to use enhanced model.
        model='phone_call',
        enable_automatic_punctuation=False,
        enable_word_time_offsets=False,
        profanity_filter=True,
        #        speech_contexts=[speech.types.SpeechContext(
        #           phrases=['Andy', 'Wisy', 'EEP', 'Project', 'Tom', 'Jeff'],
        #          )],
    )

    # [START migration_async_response]
    operation = client.long_running_recognize(config, audio)
    # [END migration_async_request]

    print('Waiting for operation to complete...')
    response = operation.result(timeout=90)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(u'Transcript: {}'.format(result.alternatives[0].transcript))
        print('Confidence: {}'.format(result.alternatives[0].confidence))
Example #14
def google_transcribe(audio_file_name):

    file_name = filepath + audio_file_name
    m4a_to_wav(file_name)

    # The name of the audio file to transcribe

    frame_rate, channels = frame_rate_channel(file_name)

    if channels > 1:
        stereo_to_mono(file_name)

    bucket_name = bucketname
    source_file_name = filepath + audio_file_name
    destination_blob_name = audio_file_name

    upload_blob(bucket_name, source_file_name, destination_blob_name)

    gcs_uri = 'gs://' + bucketname + '/' + audio_file_name
    transcript = ''

    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code='en-IN',
        enable_speaker_diarization=True,
        diarization_speaker_count=2)

    # Detects speech in the audio file
    operation = client.long_running_recognize(config, audio)
    response = operation.result(timeout=10000)
    result = response.results[-1]
    words_info = result.alternatives[0].words

    tag = 1
    speaker = ""

    for word_info in words_info:
        if word_info.speaker_tag == tag:
            speaker = speaker + " " + word_info.word
        else:

            transcript += f"speaker {tag}: {speaker}\n"
            tag = word_info.speaker_tag
            speaker = "" + word_info.word

    transcript += f"speaker {tag}: {speaker}"
    delete_blob(bucket_name, destination_blob_name)
    return transcript
def sample_long_running_recognize(local_file_path):
    """
    Transcribe a short local audio file, separating the different speakers in the
    recording and tagging each word in the first alternative with a speaker tag.

    Args:
      local_file_path: Path to a local audio file, e.g. /path/audio.wav
    """

    client = speech_v1p1beta1.SpeechClient()

    # local_file_path = 'audio_files/2speaker.m4a'

    # If enabled, each word in the first alternative of each result will be
    # tagged with a speaker tag to identify the speaker.
    enable_speaker_diarization = True

    # Optional. Specifies the estimated number of speakers in the conversation.
    diarization_speaker_count = 2

    # The language of the supplied audio
    language_code = "en-US"

    config = types.RecognitionConfig(
        enable_speaker_diarization=enable_speaker_diarization,
        enable_automatic_punctuation=True,
        #diarization_speaker_count=diarization_speaker_count,
        # model='phone_call',
        language_code='en-US')

    with io.open(local_file_path, "rb") as f:
        content = f.read()
    audio = types.RecognitionAudio(content=content)

    operation = client.long_running_recognize(config, audio)

    print(u"Waiting for operation to complete...")
    response = operation.result()

    print(response.results)

    for result in response.results:
        # First alternative has words tagged with speakers
        alternative = result.alternatives[0]
        print(u"Transcript: {}".format(alternative.transcript))
        # Print the speaker_tag of each word
        for word in alternative.words:
            print(u"Word: {}".format(word.word))
            print(u"Speaker tag: {}".format(word.speaker_tag))
Example #16
def transcribe_gcs(gcs_uri: str) -> Dict[str, List[Any]]:
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    client = speech_v1p1beta1.SpeechClient()

    # The audio file stored in Cloud Storage
    audio = types.RecognitionAudio(uri=gcs_uri)
    operation = client.long_running_recognize(RECOGNITION_CONFIG, audio)

    print(dt.now(), "Waiting for operation to complete...")
    print(dt.now(), "Operation", operation.operation)
    start = time()
    response = operation.result(timeout=90000)

    print(dt.now(), "Got response in ", time() - start)

    return MessageToDict(response)
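
RECOGNITION_CONFIG is a module-level constant that is not shown; a hedged guess at its shape, consistent with the speaker-diarization examples elsewhere on this page (every value below is an assumption):

RECOGNITION_CONFIG = types.RecognitionConfig(
    encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code='en-US',
    enable_speaker_diarization=True,
    diarization_speaker_count=2,
    enable_automatic_punctuation=True)
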
Example #17
def google_transcribe(uploaded_file_path):
    print("Converting: \t" + uploaded_file_path.split("/")[-1])
    wav_file_path = mp3_to_wav(uploaded_file_path)
    print("Converted: \t" + wav_file_path.split("/")[-1])
    print("Checking frame rate: \t", wav_file_path.split("/")[-1])
    frame_rate, channels = frame_rate_channel(wav_file_path)
    wav_name = wav_file_path.split("/")[-1]

    print("Uploading blob: \t", wav_name)
    upload_blob(bucket_name, wav_file_path, wav_name)

    print("Starting Transcripting: \t", wav_name)
    gcs_uri = 'gs://' + bucket_name + '/' + wav_name
    transcript = ''
    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code=Language_code,
        enable_speaker_diarization=True,
        diarization_speaker_count=2)

    # Detects speech in the audio file
    operation = client.long_running_recognize(config, audio)
    response = operation.result(timeout=10000)
    result = response.results[-1]
    words_info = result.alternatives[0].words

    tag = 1
    speaker = ""

    for word_info in words_info:
        if word_info.speaker_tag == tag:
            speaker = speaker + " " + word_info.word
        else:
            transcript += "speaker {}: {}".format(tag, speaker) + '\n'
            tag = word_info.speaker_tag
            speaker = "" + word_info.word

    transcript += "speaker {}: {}".format(tag, speaker)

    print("Deleting blob: \t", wav_name)
    delete_blob(bucket_name, wav_name)
    return transcript
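
The mp3_to_wav helper used above is not shown; because the caller uses its return value as the new path, a sketch with pydub (ffmpeg required) might look like the following. The original implementation may differ.

from pydub import AudioSegment


def mp3_to_wav(mp3_file_path):
    # Hypothetical helper: convert the uploaded MP3 to a WAV file next to it
    # and return the new path for the caller to use.
    wav_file_path = mp3_file_path.rsplit('.', 1)[0] + '.wav'
    AudioSegment.from_mp3(mp3_file_path).export(wav_file_path, format='wav')
    return wav_file_path
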
def transcribe_interviews(gcs_uri):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    from google.cloud import speech_v1p1beta1 as speech
    from google.cloud.speech_v1p1beta1 import enums
    from google.cloud.speech_v1p1beta1 import types
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,  # this is for WAV files; other encodings are available
        sample_rate_hertz=44100,
        language_code='en-US',
        # use_enhanced=True,  # can only be used if your ethics approval allows sending data offsite
        enable_word_time_offsets=True,
        enable_automatic_punctuation=True,
        model='video')  # change this if you're doing focus groups

    operation = client.long_running_recognize(config, audio)

    print('A little man is now listening and transcribing...')
    response = operation.result(timeout=90000000)
    f = open("Interview 1.txt",
             "w")  #can change the file name that the text gets written to
    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        f.write(str(i) + '\n')
        f.write('{}'.format(
            str(
                time.strftime(
                    '%H:%M:%S',
                    time.gmtime(int(
                        alternative.words[0].start_time.seconds))))))
        f.write('  -->  ')
        f.write('{}'.format(
            str(
                time.strftime(
                    '%H:%M:%S',
                    time.gmtime(int(alternative.words[-1].end_time.seconds)))))
                + '\n')

        #f.write('speaker {} :'.format(alternative.words[0].speaker_tag))
        f.write(u'{}'.format(alternative.transcript) + '\n\n')
Example #19
def transcribe_gcs(gcs_uri, encoding="LINEAR16", sample_rate=16000):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=ENCODINGS[encoding],
        sample_rate_hertz=sample_rate,
        language_code='ko-KR',
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True,
        enable_speaker_diarization=True,  # not supported for Korean (every speaker_tag ends up classified as the same speaker)
        diarization_speaker_count=3)

    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    response = operation.result(timeout=300)

    words_with_tags = []
    transcripts = []

    # Each result is for a consecutive portion of the audio.
    # Iterate through them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(u'Transcript: {}'.format(result.alternatives[0].transcript))
        transcripts.append(result.alternatives[0].transcript)  # keep the sentence with punctuation for later use
        for words in result.alternatives[0].words:
            word = words.word
            start_time = round(
                words.start_time.seconds + words.start_time.nanos * 1e-9, 3)
            end_time = round(
                words.end_time.seconds + words.end_time.nanos * 1e-9, 3)
            speaker_tag = words.speaker_tag
            words_with_tags.append(
                [word, start_time, end_time,
                 speaker_tag])  # [word, start_time, end_time, speaker_tag]
        print()

    return words_with_tags, transcripts
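
A hypothetical way to consume the returned values, grouping consecutive words by speaker_tag into one line per speaker turn (the gs:// URI is a placeholder):

words_with_tags, transcripts = transcribe_gcs('gs://example-bucket/meeting.wav')

lines, current_tag, current_words = [], None, []
for word, start_time, end_time, tag in words_with_tags:
    if tag != current_tag and current_words:
        lines.append('speaker {}: {}'.format(current_tag, ' '.join(current_words)))
        current_words = []
    current_tag = tag
    current_words.append(word)
if current_words:
    lines.append('speaker {}: {}'.format(current_tag, ' '.join(current_words)))
print('\n'.join(lines))
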
Example #20
    def run_speaker_diarization(self, audio_uri, audio_ch, audio_sr,
                                max_speakers):
        logger.info('Performing Speaker Diarization for {}'.format(audio_uri))
        drzr_config = types.SpeakerDiarizationConfig(
            enable_speaker_diarization=True, max_speaker_count=max_speakers)
        config = speech.types.RecognitionConfig(
            language_code="en-US",
            sample_rate_hertz=int(audio_sr),
            encoding=enums.RecognitionConfig.AudioEncoding.MP3,
            audio_channel_count=int(audio_ch),
            enable_word_time_offsets=True,
            model="video",
            enable_automatic_punctuation=False,
            diarization_config=drzr_config)

        audio_file = types.RecognitionAudio(uri=audio_uri)
        operation = self.client.long_running_recognize(config=config,
                                                       audio=audio_file)
        res = operation.result()
        return res
Example #21
    def long_transcribe_gcs(self, gcs_uri):
        print("Transcribing... (This may take a while)")
        audio = types.RecognitionAudio(uri=gcs_uri)

        operation = self.config.client.long_running_recognize(self.config.recognition_config, audio)

        response = operation.result(timeout=90)

        # Each result is for a consecutive portion of the audio. Iterate through
        # them to get the transcripts for the entire audio file.
        text = ""
        for result in response.results:
            speaker_num = result.alternatives[0].words[0].speaker_tag
            speaker = self.config.speakers[speaker_num - 1]

            text += speaker + result.alternatives[0].transcript + '\n'
            # The first alternative is the most likely one for this portion.
            # print(u'Transcript: {}'.format(result.alternatives[0].transcript))
            # print('Confidence: {}'.format(result.alternatives[0].confidence))

        return text
Example #22
def speech_to_text(gcs_uri):
    #ltt_context = open('context.txt', 'r').read().split('\n')
    client = speech_v1p1beta1.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)
    #speech_contexts_element = {"phrases": ltt_context, "boost": 11}
    #speech_contexts = [speech_contexts_element]
    config = {
        "encoding": enums.RecognitionConfig.AudioEncoding.MP3,
        "sample_rate_hertz": 48000,
        "language_code": 'en-US',
        #"speech_contexts": speech_contexts,
        "max_alternatives": 11,
        "model": "video",
        "enable_word_confidence": True,
        "enable_word_time_offsets": True,
        "enable_automatic_punctuation": True
    }
    operation = client.long_running_recognize(config, audio)
    print('Speech-to-Text running.')
    response = operation.result()
    return (response)
Example #23
def transcribe_audio():
    I = 4
    try:
        alter_db(I)
    except Exception as e:
        print(e)

    conn, c = get_db(databases[I])
    conn.execute("SELECT %s FROM %s LIMIT 1 " %
                 (database_keys[I], database_names[I]))
    rows = conn.fetchall()
    for row in rows:
        #print row
        audio_file = "%s%s/%s.wav" % (DIR, FILE_NAMES[I], row[0])
        audio_file = copy_files(audio_file, row[0])
        print(audio_file)

        if os.path.isfile(audio_file):
            with io.open(audio_file, 'rb') as audio_file2:
                content = audio_file2.read()
            audio_data = types.RecognitionAudio(content=content)
            # try:
            response = client.recognize(config, audio_data)

            transcript = "-1"
            confidence = 0
            if len(response.results):
                result = response.results[0]
                if result.alternatives:
                    transcript = result.alternatives[0].transcript
                    confidence = result.alternatives[0].confidence
            print(response)

            #score = get_jaccard_sim()

            sql = "UPDATE " + database_names[
                I] + " SET transcript=?,confidence=? WHERE Filename=? "
            print(sql)
            conn.execute(sql, (transcript, confidence, row[0]))
            c.commit()
Example #24
def wordTimeOffsets(filename, phrases, flag):

    client = speech.SpeechClient()

    if flag == 0:
        speech_file = "/home/malkaiv/project/final/recordings/wav/" + str(filename) + ".wav"
    else:
        print("in flag = 1")
        speech_file = "/home/malkaiv/project/final/recordings/wordsWav/" + str(filename) +".wav"
    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)

    with wave.open(speech_file, 'rb') as wave_file:
        frame_rate = wave_file.getframerate()

    # speechText = "please call Stella ask her to bring these things with her from the store \
    #                     six spoons of fresh snow peas five thick slabs of blue cheese and maybe a snack for her \
    #                     brother Bob we also need a small plastic snake and a big toy frog for the kids she can \
    #                     scoop these things into three red bags and we will go meet her Wednesday at the train station"
    # phrases = speechText.lower().split()


    boost = 20.0
    speech_contexts_element = {"phrases": phrases, "boost": boost}
    speech_contexts = [speech_contexts_element]

    config = types.RecognitionConfig(
        speech_contexts=speech_contexts,
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code="en-US",
        enable_word_time_offsets=True,
        enable_word_confidence=True,
    )
    # print(client.recognize(config=config, audio=audio))
    # first = response.results[0].alternatives[0]
    # print("{}\n{}".format(first.transcript, first.confidence))
    return client.recognize(config=config, audio=audio)
Example #25
def transcribe(data):
    idx, file = data
    num, _ = file.split('.')
    text_script = ""
    name = file
    print(file + " - started")
    # Loads the audio into memory
    with io.open(fileFullPathname + '\\' + file, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)
    # Transcribe audio file
    # Detects speech in the audio file
    client = speech.SpeechClient()
    response = client.recognize(config, audio)
    for result in response.results:
        text_script += result.alternatives[0].transcript

    print(name + " - done")
    return {
        "idx": num,
        "text": text_script
    }
def transcribe_gcs(gcs_uri, phrase_hints=[], language_code="en-US"):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    client = speech.SpeechClient()
    phrases = phrase_hints
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code=language_code,
        enable_word_time_offsets=True,
        model = 'video',
        diarization_speaker_count=2,
        enable_automatic_punctuation=True,
        use_enhanced=True,
        enable_speaker_diarization=True,
        speech_contexts=[speech.types.SpeechContext(phrases=phrases)]
    )

    operation = client.long_running_recognize(config, audio)

    transcription_response = operation.result(timeout=90000)
    return transcription_response.results
Example #27
def transcribe_streaming_from_file(speech_file):
    """Transcribe the given audio file."""
    from google.cloud import speech_v1p1beta1 as speech
    from google.cloud.speech_v1p1beta1 import enums
    from google.cloud.speech_v1p1beta1 import types

    client = speech.SpeechClient()

    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code='en-US')

    response = client.recognize(config, audio)
    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        return result.alternatives[0].transcript
    def translate_with_timestamps(self, gs_uri, encoding, mode, hint):
        audio = types.RecognitionAudio(uri=gs_uri)
        config = types.RecognitionConfig(
            encoding=encoding,
            language_code=mode,
            enable_word_time_offsets=True,
            speech_contexts=[types.SpeechContext(phrases=hint)],
            enable_word_confidence=True)
        operation = self.client.long_running_recognize(config=config,
                                                       audio=audio)
        results = []
        for result in operation.result().results:
            alternatives = result.alternatives
            if not alternatives:
                continue
            alternative = alternatives[0]
            results.append([alternative.transcript, alternative.confidence])
            for word_info in alternative.words:
                word = word_info.word
                start_time = word_info.start_time.seconds + word_info.start_time.nanos * 1e-9
                end_time = word_info.end_time.seconds + word_info.end_time.nanos * 1e-9
                confidence = word_info.confidence
                results.append([word, start_time, end_time, confidence])
        return results
def transcribe_file(speech_file):
    """Transcribe the given audio file asynchronously."""
    from google.cloud import speech_v1p1beta1
    from google.cloud.speech_v1p1beta1 import enums
    from google.cloud.speech_v1p1beta1 import types
    client = speech_v1p1beta1.SpeechClient()

    # [START migration_async_request]
    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    print('Using ', speech_file, ', with the below config:')
    print("")
    print("importing speech_v1p1beta1")
    print("language_code='en-US'")
    print("use_enhanced=True")
    print("enable_automatic_punctuation=False")
    print("enable_word_time_offsets=False")
    print("profanity_filter=True")
    print("sample_rate=44100hz")
    print("")
    print("Transcript is as follows")
    print("Trans_Output_" + foldernametime)

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code='en-US',
        use_enhanced=True,
        # A model must be specified to use enhanced model.
        model='phone_call',
        enable_automatic_punctuation=False,
        enable_word_time_offsets=False,
        profanity_filter=True,
        #        speech_contexts=[speech.types.SpeechContext(
        #           phrases=['Andy', 'Wisy', 'EEP', 'Project', 'Tom', 'Jeff'],
        #          )],
    )

    # [START migration_async_response]
    operation = client.long_running_recognize(config, audio)
    # [END migration_async_request]

    os.chdir("Trans_Output_" + foldernametime)

    with open("output_transcription.txt", "a") as myfile:
        #print('File -- before write file')
        #myfile.write(datetime.now().strftime("%Y-%m-%d %H:%M:%S")+ "\n")

        print('File -- Waiting for operation to complete...')
        response = operation.result(timeout=90)

        # Each result is for a consecutive portion of the audio. Iterate through
        # them to get the transcripts for the entire audio file.
        for result in response.results:
            # The first alternative is the most likely one for this portion.
            print(('Transcript: {}'.format(result.alternatives[0].transcript)))
            print(('Confidence: {}'.format(result.alternatives[0].confidence)))
            with open("output_transcription.txt", "a") as myfile:
                myfile.write(datetime.now().strftime("%Y-%m-%d %H:%M:%S -"))
                myfile.write(
                    (' {}'.format(result.alternatives[0].transcript)) + "\n")
                #myfile.write(('Confidence: {}'.format(result.alternatives[0].confidence))+ "\n")
        with open("output_transcription.txt", "a") as myfile:
            myfile.write('')

    # [END migration_async_response]
    exit()
Example #30
async def speech_to_text(queue):
    """
    This function implements the translation from speech to text with online and offline services, and compute the
    emotion related to the speech
    :param queue: process shared queue
    """

    kb_client = kb.KnowledgeBaseClient(False)
    kb_ID = (kb_client.register())['details']
    kb_client.registerTags(kb_ID, { 'AV_IN_TRANSC_EMOTION' : {'desc' : 'text from audio', 'doc' : """```json\n{\n\t"tag": 'AV_IN_TRANSC_EMOTION',\n\t"timestamp": int,\n\t"ID": int,\n\t"text": string,\n\t"language": string,\n\t"valence": float,\n\t"arousal": float\n}```"""} })


    # Create new recogniers for all the services used
    r = sr.Recognizer()
    google_client = None
    try:
        google_client = speech.SpeechClient()
    except exceptions.DefaultCredentialsError as e:
        log.error("Failed to authenticate with Google Cloud Speech recognition" + str(e))
    except:
        log.error("Unexpected error. Failed to authenticate with Google Cloud Speech recognition:"+ str(sys.exc_info()[0]))

    with ThreadPoolExecutor() as executor:

        while True:

            # Data stored in the queue contain all the information needed to create AudioData object
            timestamp, channels, sampleRate, bitPerSample, data = await queue.get()

            audio = sr.AudioData(data, sampleRate, bitPerSample/8)
            audio_gc = types.RecognitionAudio(content=data)

            # Compute the transcription of the audio
            google_cloud = executor.submit(recognize, "google-cloud", audio_gc, google_client)
            google = executor.submit(recognize, "google", audio, r)
            sphinx = executor.submit(recognize, "sphinx", audio, r)

            # Compute the emotion related to the audio
            #emotion = executor.submit(sentimental_analizer.emotion_from_speech, sampleRate, audio, log)

            res = google_cloud.result()
            if res["error"] is None:
                # Add to KB Google cloud speech recognition result with timestamp and ID
                log.info("Insert into KB --> Google cloud speech recognition result: " + str(res["text"]))

            else:
                log.error("Google cloud speech recognition retrieved an error: " + str(res["error"]))
                res = google.result()
                if res["error"] is None:
                    # Add to KB Google result with timestamp and ID
                    log.info("Insert into KB --> Google result: " + str(res["text"]))
                else:
                    log.error("Google retrieved an error: " + str(res["error"]))
                    res = sphinx.result()
                    if res["error"] is None:
                        # Add to KB Sphinx result with timestamp and ID
                        log.info("Insert into KB --> Sphinx result: " + str(res["text"]))
                    else:
                        log.error("Sphinx retrieved an error: " + str(res["error"]))

            emotion = {"valence": 1, "arousal":1} #emotion.result()

            myID = 'stt'
            if res["error"] is None:
                # Add to KB that the transcription of the audio
                kb_client.addFact(kb_ID, 'AV_IN_TRANSC_EMOTION', 1, 100, {"tag": 'AV_IN_TRANSC_EMOTION',
                                                                         "timestamp": timestamp,
                                                                         "ID": timestamp,
                                                                         "text": res["text"],
                                                                         "language": res["lang"],
                                                                         "valence": emotion["valence"],
                                                                         "arousal": emotion["arousal"]
                                                                         })
                # TODO adjust "text_f_audio", 2, 50, 'false'

            else:
                # Add to KB that none of google and sphinx retrieved a result
                log.critical("Insert into KB that no Google or Sphinx result")
                kb_client.addFact(kb_ID, 'AV_IN_TRANSC_EMOTION', 1, 100, {"tag": 'AV_IN_TRANSC_EMOTION',
                                                                         "timestamp": timestamp,
                                                                         "ID": timestamp,
                                                                         "text": "",
                                                                         "language": res["lang"],
                                                                         "valence": emotion["valence"],
                                                                         "arousal": emotion["arousal"]
                                                                         })