def retrieve_transcript(identifier, language, speaker_type, service_config):
    """Transcribe ``gs://<identifier>/audio.wav`` and return the response as a dict.

    Args:
        identifier: First component of the GCS URI; the audio is read from
            ``gs://{identifier}/audio.wav``.
        language: Value passed to the recognizer as ``language_code``.
        speaker_type: ``'both'`` enables diarization with two speakers;
            ``'interviewee'`` or ``'interviewer'`` disables diarization.
        service_config: Forwarded unchanged to ``get_google_client``.

    Returns:
        The long-running recognition response converted with ``MessageToDict``.

    Raises:
        TypeError: If ``speaker_type`` is not one of the three known values.
    """
    # Validate the argument first so a bad value fails before any request
    # object is built or any client is created.
    if speaker_type == 'both':
        diarization_options = {
            'enable_speaker_diarization': True,
            'diarization_speaker_count': 2,
        }
    elif speaker_type in ('interviewee', 'interviewer'):
        diarization_options = {'enable_speaker_diarization': False}
    else:
        raise TypeError(
            'unknown speaker type: {speaker}'.format(speaker=speaker_type))

    gcs_uri = f"gs://{identifier}/audio.wav"
    audio = speech.RecognitionAudio(uri=gcs_uri)

    # Punctuation and word offsets are common to every speaker type; only the
    # diarization switches differ.
    recognition_config = speech.RecognitionConfig(
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True,
        language_code=language,
        **diarization_options)

    speech_client = get_google_client(type="speech",
                                      service_config=service_config)
    operation = speech_client.long_running_recognize(config=recognition_config,
                                                     audio=audio)
    response = operation.result()

    # Convert the wrapped response to its raw protobuf form before dict-ifying.
    return MessageToDict(response.__class__.pb(response))
def transcribe_file_with_spoken_punctuation_end_emojis():
    """Recognize the bundled sample with spoken punctuation and spoken emojis on.

    Reads ``resources/commercial_mono.wav``, sends it through the synchronous
    ``recognize`` call and prints the first alternative of every result.
    """
    # [START speech_transcribe_spoken_punctuation_emojis_beta]
    from google.cloud import speech_v1p1beta1 as speech
    from google.protobuf import wrappers_pb2

    speech_client = speech.SpeechClient()

    sample_path = "resources/commercial_mono.wav"
    with io.open(sample_path, "rb") as wav_handle:
        audio_bytes = wav_handle.read()

    # Both feature switches are passed as protobuf BoolValue wrappers.
    spoken_punctuation = wrappers_pb2.BoolValue(value=True)
    spoken_emojis = wrappers_pb2.BoolValue(value=True)

    recognition_audio = speech.RecognitionAudio(content=audio_bytes)
    recognition_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        enable_spoken_punctuation=spoken_punctuation,
        enable_spoken_emojis=spoken_emojis,
    )

    response = speech_client.recognize(config=recognition_config,
                                       audio=recognition_audio)

    for index, result in enumerate(response.results):
        top_choice = result.alternatives[0]
        print("-" * 20)
        print(u"First alternative of result {}".format(index))
        print(u"Transcript: {}".format(top_choice.transcript))
def transcribe_file_with_auto_punctuation():
    """Recognize the bundled sample with automatic punctuation switched on.

    Reads ``resources/commercial_mono.wav`` and prints the first alternative
    of every result returned by the synchronous ``recognize`` call.
    """
    # [START speech_transcribe_auto_punctuation_beta]
    from google.cloud import speech_v1p1beta1 as speech

    speech_client = speech.SpeechClient()

    sample_path = 'resources/commercial_mono.wav'
    with io.open(sample_path, 'rb') as wav_handle:
        audio_bytes = wav_handle.read()

    recognition_audio = speech.RecognitionAudio(content=audio_bytes)
    recognition_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code='en-US',
        enable_automatic_punctuation=True,
    )

    response = speech_client.recognize(config=recognition_config,
                                       audio=recognition_audio)

    for index, result in enumerate(response.results):
        top_choice = result.alternatives[0]
        print('-' * 20)
        print(u'First alternative of result {}'.format(index))
        print(u'Transcript: {}'.format(top_choice.transcript))
def get_stt_response(audio_path: str, client: Any, stt_provider: str) -> Any:
    """Send the audio at ``audio_path`` to the chosen speech-to-text provider.

    Args:
        audio_path: Path of the audio file. It is read fully into memory.
        client: Provider-specific object. For ``"google"`` and ``"ibm"`` its
            ``recognize`` method is called; for ``"azure"`` it is passed as
            the ``speech_config`` of a ``speechsdk.SpeechRecognizer``.
        stt_provider: One of ``"google"``, ``"ibm"`` or ``"azure"``.

    Returns:
        The provider's response object, unmodified.

    Raises:
        ValueError: If ``stt_provider`` is not a supported provider.
    """
    with open(audio_path, "rb") as fid:
        content = fid.read()

    if stt_provider == "google":
        audio = speech.RecognitionAudio(content=content)
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code="en-US",
            enable_word_confidence=True,
            model="default",
        )
        response = client.recognize(config=config, audio=audio)
    elif stt_provider == "ibm":
        response = client.recognize(audio=content,
                                    content_type='audio/wav',
                                    model="en-US_BroadbandModel",
                                    word_confidence=True).get_result()
    elif stt_provider == "azure":
        # Azure reads the file itself through an AudioConfig.
        audio_input = speechsdk.AudioConfig(filename=audio_path)
        speech_recognizer = speechsdk.SpeechRecognizer(
            speech_config=client, audio_config=audio_input)
        # Bind to ``response`` so this branch reaches the shared return below.
        response = speech_recognizer.recognize_once_async().get()
    else:
        raise ValueError(
            f"stt provider: {stt_provider} is unacceptable. "
            "Use 'google', 'ibm' or 'azure'."
        )
    return response
def mp3ToYML(fileName):
    """Convert an MP3 to WAV and print every recognized word with its speaker tag.

    The MP3 at ``fileName`` is exported as ``fileName + ".wav"``, the WAV is
    sent to the synchronous ``recognize`` call with two-speaker diarization,
    and the words of the last result are printed.

    NOTE(review): despite the name, nothing is written to YAML yet; the YAML
    dump was only ever present as disabled code.

    Args:
        fileName: Path of the MP3 file to convert and transcribe.
    """
    wav_path = fileName + ".wav"
    AudioSegment.from_mp3(fileName).export(wav_path, format="wav")

    client = speech.SpeechClient()

    # open() needs the path of the exported WAV, not an in-memory recording
    # object, so the bytes are read straight from disk.
    with open(wav_path, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
    )

    response = client.recognize(config=config, audio=audio)

    # Nothing recognized: there is no last result to index.
    if not response.results:
        return

    # Only the last result is read for the speaker-tagged word list.
    words_info = response.results[-1].alternatives[0].words
    for word_info in words_info:
        print(
            u"word: '{}', speaker_tag: {}".format(word_info.word,
                                                  word_info.speaker_tag)
        )
def initialize_recognition_config():
    """Build the speech recognition config from ``config.ini``.

    The language code is read from the ``[config]`` section. Speaker
    diarization is switched on, the encoding is LINEAR16 and the metadata
    comes from ``initialize_metadata()``. The ``phrases_list`` option of
    ``[speech_context]`` is read but not applied, because the speech-context
    assignment is currently disabled.

    Returns:
        The populated ``speech.RecognitionConfig``.
    """
    ini = configparser.ConfigParser()
    ini.read('config.ini')

    recognition = speech.RecognitionConfig()

    # The sample rate ([config] sample_rate) is currently not applied.
    recognition.language_code = str(ini.get('config', 'language_code'))
    recognition.enable_speaker_diarization = True
    recognition.encoding = speech.RecognitionConfig.AudioEncoding.LINEAR16

    # Still looked up, so a missing section/option raises here; the value
    # itself is unused until the speech context below is enabled:
    #   speech_context = speech.SpeechContext(phrases=phrases_list, boost=20)
    #   recognition.speech_contexts = [speech_context]
    phrases_list = ini.get('speech_context', 'phrases_list')

    recognition.metadata = initialize_metadata()

    # Per-word confidences are available via enable_word_confidence = True.
    return recognition
def transcribe_from_file(self, speech_file, frameRate=None):
    """
    Run long-running recognition on a local audio file.

    :param speech_file: str
        relative/full path of the speech file; also stored on
        ``self.speech_file``
    :param frameRate: int, optional
        sample rate of the speech file, forwarded to
        ``_get_recognition_config_params``
    :return: dict
        ``'Transcript'`` and ``'Confidence'`` of the first alternative of the
        first result, or both ``None`` when there are no results
    """
    self.speech_file = speech_file
    speech_client = speech.SpeechClient()

    with io.open(speech_file, "rb") as handle:
        raw_audio = handle.read()

    recognition_audio = speech.RecognitionAudio(content=raw_audio)
    recognition_config = speech.RecognitionConfig(
        self._get_recognition_config_params(frameRate))

    # Blocks until the long-running operation has finished.
    response = speech_client.long_running_recognize(
        config=recognition_config, audio=recognition_audio).result()

    if len(response.results) < 1:
        return {'Transcript': None, 'Confidence': None}

    return {
        'Transcript': response.results[0].alternatives[0].transcript,
        'Confidence': response.results[0].alternatives[0].confidence,
    }
def google_transcribe(audio_file_path):
    """Send a local audio file to the synchronous recognizer and return the response.

    The frame rate and channel count come from ``frame_rate_channel``. When
    the file has more than one channel, ``stereo_to_mono`` is called on it
    before the bytes are read.

    Args:
        audio_file_path: Path of the audio file to transcribe.

    Returns:
        The raw response of ``SpeechClient.recognize`` (word confidence
        enabled, language ``en-US``).
    """
    sample_rate, channel_count = frame_rate_channel(audio_file_path)
    if channel_count > 1:
        stereo_to_mono(audio_file_path)

    with io.open(audio_file_path, "rb") as wav_handle:
        audio_bytes = wav_handle.read()

    speech_client = speech.SpeechClient()
    recognition_audio = speech.RecognitionAudio(content=audio_bytes)
    recognition_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate,
        language_code='en-US',
        enable_word_confidence=True,
    )

    return speech_client.recognize(config=recognition_config,
                                   audio=recognition_audio)
def transcribe_file_with_multiple_channels():
    """Recognize ``resources/voice_tom2.wav`` with each of its two channels handled separately.

    Prints the first alternative and the channel tag of every result.
    """
    # [START speech_transcribe_audio_with_multiple_channels]
    from google.cloud import speech_v1p1beta1 as speech

    speech_client = speech.SpeechClient()

    sample_path = 'resources/voice_tom2.wav'
    with open(sample_path, "rb") as wav_handle:
        audio_bytes = wav_handle.read()

    recognition_audio = speech.RecognitionAudio(content=audio_bytes)
    # No sample_rate_hertz is passed here (44100 was previously tried).
    recognition_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code="th-TH",
        audio_channel_count=2,
        enable_separate_recognition_per_channel=True,
    )

    response = speech_client.recognize(config=recognition_config,
                                       audio=recognition_audio)

    for index, result in enumerate(response.results):
        top_choice = result.alternatives[0]
        print("-" * 20)
        print("First alternative of result {}".format(index))
        print(u"Transcript: {}".format(top_choice.transcript))
        print(u"Channel Tag: {}".format(result.channel_tag))
def transcribe_file():
    """Diarize ``output.wav`` with two speakers and print each word with its speaker tag.

    The channel count sent to the recognizer is taken from the WAV header.
    Nothing is printed when the recognizer returns no results.
    """
    client = speech.SpeechClient()
    speech_file = "output.wav"

    # Open the recording read-only to learn its channel count. Opening it in
    # write mode would truncate the file before it is sent.
    with wave.open(speech_file, "rb") as wav_reader:
        channel_count = wav_reader.getnchannels()

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code="en-US",
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
        audio_channel_count=channel_count,
    )

    print("Waiting for operation to complete...")
    response = client.recognize(config=config, audio=audio)

    # Nothing recognized: there is no last result to index.
    if not response.results:
        return

    # The transcript within each result is separate and sequential per result.
    # However, the words list within an alternative includes all the words
    # from all the results thus far. Thus, to get all the words with speaker
    # tags, you only have to take the words list from the last result.
    words_info = response.results[-1].alternatives[0].words

    for word_info in words_info:
        print(u"word: '{}', speaker_tag: {}".format(word_info.word,
                                                    word_info.speaker_tag))
def transcribe_file_with_multichannel():
    """Recognize ``resources/Google_Gnome.wav`` with per-channel recognition enabled.

    Prints the first alternative and the channel tag of every result.
    """
    # [START speech_transcribe_multichannel_beta]
    from google.cloud import speech_v1p1beta1 as speech

    speech_client = speech.SpeechClient()

    sample_path = 'resources/Google_Gnome.wav'
    with open(sample_path, 'rb') as wav_handle:
        audio_bytes = wav_handle.read()

    recognition_audio = speech.RecognitionAudio(content=audio_bytes)
    recognition_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='en-US',
        audio_channel_count=1,
        enable_separate_recognition_per_channel=True,
    )

    response = speech_client.recognize(config=recognition_config,
                                       audio=recognition_audio)

    for index, result in enumerate(response.results):
        top_choice = result.alternatives[0]
        print('-' * 20)
        print('First alternative of result {}'.format(index))
        print(u'Transcript: {}'.format(top_choice.transcript))
        print(u'Channel Tag: {}'.format(result.channel_tag))
def transcribe_file_with_diarization():
    """Recognize ``resources/commercial_mono.wav`` with two-speaker diarization.

    Prints every word of the last result together with its speaker tag.
    """
    # [START speech_transcribe_diarization_beta]
    from google.cloud import speech_v1p1beta1 as speech

    speech_client = speech.SpeechClient()

    sample_path = 'resources/commercial_mono.wav'
    with open(sample_path, 'rb') as wav_handle:
        audio_bytes = wav_handle.read()

    recognition_audio = speech.RecognitionAudio(content=audio_bytes)
    recognition_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code='en-US',
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
    )

    print('Waiting for operation to complete...')
    response = speech_client.recognize(config=recognition_config,
                                       audio=recognition_audio)

    # Each result's transcript covers only its own portion, but the word list
    # of an alternative accumulates the words of all results so far. The last
    # result therefore carries every speaker-tagged word.
    final_result = response.results[-1]
    tagged_words = final_result.alternatives[0].words

    for tagged in tagged_words:
        print(u"word: '{}', speaker_tag: {}".format(tagged.word,
                                                    tagged.speaker_tag))
def transcribe_file_with_multilanguage():
    """Recognize ``resources/multi.wav`` with ``en-US`` as primary and ``es`` as alternative language.

    Prints the first alternative (whole object and transcript) of every result.
    """
    # [START speech_transcribe_multilanguage_beta]
    from google.cloud import speech_v1p1beta1 as speech

    speech_client = speech.SpeechClient()

    sample_path = 'resources/multi.wav'
    primary_language = 'en-US'
    fallback_language = 'es'

    with open(sample_path, 'rb') as wav_handle:
        audio_bytes = wav_handle.read()

    recognition_audio = speech.RecognitionAudio(content=audio_bytes)
    recognition_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        audio_channel_count=2,
        language_code=primary_language,
        alternative_language_codes=[fallback_language],
    )

    print('Waiting for operation to complete...')
    response = speech_client.recognize(config=recognition_config,
                                       audio=recognition_audio)

    for index, result in enumerate(response.results):
        top_choice = result.alternatives[0]
        print('-' * 20)
        print(u'First alternative of result {}: {}'.format(index, top_choice))
        print(u'Transcript: {}'.format(top_choice.transcript))
def transcribe_file_with_word_level_confidence():
    """Recognize ``resources/Google_Gnome.wav`` with word-level confidence enabled.

    For every result, prints the first alternative's transcript and the first
    word together with its confidence.
    """
    # [START speech_transcribe_word_level_confidence_beta]
    from google.cloud import speech_v1p1beta1 as speech

    speech_client = speech.SpeechClient()

    sample_path = 'resources/Google_Gnome.wav'
    with open(sample_path, 'rb') as wav_handle:
        audio_bytes = wav_handle.read()

    recognition_audio = speech.RecognitionAudio(content=audio_bytes)
    recognition_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='en-US',
        enable_word_confidence=True,
    )

    response = speech_client.recognize(config=recognition_config,
                                       audio=recognition_audio)

    for index, result in enumerate(response.results):
        top_choice = result.alternatives[0]
        print('-' * 20)
        print('First alternative of result {}'.format(index))
        print(u'Transcript: {}'.format(top_choice.transcript))
        print(u'First Word and Confidence: ({}, {})'.format(
            top_choice.words[0].word, top_choice.words[0].confidence))
def transcribe_file_with_enhanced_model():
    """Recognize ``resources/commercial_mono.wav`` with the enhanced ``phone_call`` model.

    The request is passed to ``recognize`` as a single ``request`` mapping.
    Prints the first alternative of every result.
    """
    # [START speech_transcribe_enhanced_model_beta]
    from google.cloud import speech_v1p1beta1 as speech

    speech_client = speech.SpeechClient()

    sample_path = "resources/commercial_mono.wav"
    with io.open(sample_path, "rb") as wav_handle:
        audio_bytes = wav_handle.read()

    recognition_audio = speech.RecognitionAudio(content=audio_bytes)
    recognition_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        use_enhanced=True,
        # use_enhanced only takes effect together with an explicit model.
        model="phone_call",
    )

    recognize_request = {"config": recognition_config,
                         "audio": recognition_audio}
    response = speech_client.recognize(request=recognize_request)

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print("-" * 20)
        print(f"First alternative of result {i}")
        print(f"Transcript: {alternative.transcript}")
def transcribe_file_with_enhanced_model():
    """Recognize ``resources/commercial_mono.wav`` with the enhanced ``phone_call`` model.

    Config and audio are passed to ``recognize`` as separate keyword
    arguments. Prints the first alternative of every result.
    """
    # [START speech_transcribe_enhanced_model_beta]
    from google.cloud import speech_v1p1beta1 as speech

    speech_client = speech.SpeechClient()

    sample_path = 'resources/commercial_mono.wav'
    with io.open(sample_path, 'rb') as wav_handle:
        audio_bytes = wav_handle.read()

    recognition_audio = speech.RecognitionAudio(content=audio_bytes)
    recognition_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code='en-US',
        use_enhanced=True,
        # use_enhanced only takes effect together with an explicit model.
        model='phone_call',
    )

    response = speech_client.recognize(config=recognition_config,
                                       audio=recognition_audio)

    for index, result in enumerate(response.results):
        top_choice = result.alternatives[0]
        print('-' * 20)
        print(u'First alternative of result {}'.format(index))
        print(u'Transcript: {}'.format(top_choice.transcript))
def google_transcribe(audio_file_name):
    """Transcribe a local audio file via GCS and label the text by speaker.

    The file at ``filepath + audio_file_name`` is converted with
    ``stereo_to_mono`` when it has more than one channel, uploaded to the
    ``bucketname`` bucket, transcribed with two-speaker diarization through
    a long-running operation, and the uploaded blob is deleted afterwards,
    including when transcription fails.

    Args:
        audio_file_name: File name relative to ``filepath``; also used as the
            blob name in the bucket.

    Returns:
        One ``"speaker <tag>: <words>"`` line per speaker turn, joined with
        newlines. The first line is always labelled ``speaker 1``. An empty
        string is returned when the recognizer produced no results.
    """
    source_file_name = filepath + audio_file_name

    frame_rate, channels = frame_rate_channel(source_file_name)
    if channels > 1:
        stereo_to_mono(source_file_name)

    bucket_name = bucketname
    destination_blob_name = audio_file_name
    upload_blob(bucket_name, source_file_name, destination_blob_name)

    try:
        gcs_uri = 'gs://' + bucketname + '/' + audio_file_name

        # NOTE(review): machine-specific key path; consider supplying the
        # credentials through the environment instead of hard-coding them.
        credential_path = "/home/asheeshg01/Speech-f22e193c0063.json"
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path

        client = speech.SpeechClient()
        audio = speech.RecognitionAudio(uri=gcs_uri)
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=frame_rate,
            language_code='en-US',
            enable_speaker_diarization=True,
            diarization_speaker_count=2)

        operation = client.long_running_recognize(
            request={"config": config, "audio": audio})
        response = operation.result(timeout=10000)
    finally:
        # Always remove the temporary upload, even if recognition raised.
        delete_blob(bucket_name, destination_blob_name)

    if not response.results:
        return ''

    # Only the last result is read for the speaker-tagged word list.
    words_info = response.results[-1].alternatives[0].words

    # Group consecutive words with the same speaker tag into one line.
    turns = []
    tag = 1
    speaker = ""
    for word_info in words_info:
        if word_info.speaker_tag == tag:
            speaker = speaker + " " + word_info.word
        else:
            turns.append("speaker {}: {}".format(tag, speaker))
            tag = word_info.speaker_tag
            speaker = "" + word_info.word
    turns.append("speaker {}: {}".format(tag, speaker))

    return '\n'.join(turns)
def google_word_details(audio_file_name):
    """Transcribe a file via GCS and store per-word timing and speaker details.

    The file at ``filepath + audio_file_name`` is uploaded to the
    ``bucketname`` bucket and transcribed (``en-US`` with ``hi-IN`` as
    alternative language, two-speaker diarization, word time offsets). The
    word details are then uploaded to the same bucket as
    ``<name before the first dot>_word_details.txt``.

    The uploaded audio blob is left in the bucket.

    Args:
        audio_file_name: File name relative to ``filepath``; also used as the
            blob name in the bucket.

    Returns:
        The concatenated word-detail string. It is empty when the recognizer
        produced no results.
    """
    source_file_name = filepath + audio_file_name
    second_lang = "hi-IN"

    # Only the frame rate is needed; the channel count is not used here.
    frame_rate, _channels = frame_rate_channel(source_file_name)

    upload_blob(bucketname, source_file_name, audio_file_name)
    gcs_uri = 'gs://' + bucketname + '/' + audio_file_name

    credential_path = s.get("credential_path")
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code='en-US',
        alternative_language_codes=[second_lang],
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
        enable_word_time_offsets=True)

    operation = client.long_running_recognize(request={
        "config": config,
        "audio": audio
    })
    response = operation.result(timeout=10000)

    # Only the last result is read for the word list; no results means no
    # words at all.
    if response.results:
        words_info = response.results[-1].alternatives[0].words
    else:
        words_info = []

    word_details = "".join(
        " Word: {} : start_time: {}: end_time: {}: speaker {}".format(
            word_info.word,
            word_info.start_time.total_seconds(),
            word_info.end_time.total_seconds(),
            word_info.speaker_tag)
        for word_info in words_info)

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucketname)
    word_details_filename = audio_file_name.split(
        '.')[0] + '_word_details' + '.txt'
    bucket.blob(word_details_filename).upload_from_string(word_details)

    return word_details
def get_transcripts_json(gcstorage_path,
                         lang,
                         phrase_hints=None,
                         speaker_count=1,
                         enhanced_model=None):
    """Transcribe an audio file stored in GCS and return a simplified structure.

    Args:
        gcstorage_path: ``gs://`` URI of the audio file.
        lang: Language code. ``'en'`` is sent as ``'en-US'`` and always uses
            the enhanced ``video`` model.
        phrase_hints: Optional phrases sent as a speech context with boost 15.
        speaker_count: Number of speakers. Diarization is enabled only when
            this is greater than 1, with that exact speaker count.
        enhanced_model: Any truthy value selects the enhanced ``video`` model.

    Returns:
        A list with one dict per result section:
        ``{'transcript': str, 'words': [{'word', 'start_time', 'end_time',
        'speaker_tag'}, ...]}``, with times in seconds.
    """

    def _jsonify(res):
        # Flatten the client response into plain lists and dicts.
        sections = []
        for section in res.results:
            best = section.alternatives[0]
            sections.append({
                'transcript': best.transcript,
                'words': [{
                    'word': word.word,
                    'start_time': word.start_time.total_seconds(),
                    'end_time': word.end_time.total_seconds(),
                    'speaker_tag': word.speaker_tag
                } for word in best.words]
            })
        return sections

    # A fresh list per call; never share a mutable default between calls.
    phrases = list(phrase_hints) if phrase_hints else []

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcstorage_path)

    diarize = speaker_count if speaker_count > 1 else False
    print(f"Diarizing: {diarize}")

    if speaker_count > 1:
        # The flag itself is a boolean; the speaker count goes in the
        # min/max fields.
        diarization_config = speech.SpeakerDiarizationConfig(
            enable_speaker_diarization=True,
            min_speaker_count=speaker_count,
            max_speaker_count=speaker_count,
        )
    else:
        diarization_config = speech.SpeakerDiarizationConfig(
            enable_speaker_diarization=False)

    # English can use the optimized video model.
    if lang == 'en':
        enhanced_model = 'video'

    config_fields = {
        'language_code': 'en-US' if lang == 'en' else lang,
        'enable_automatic_punctuation': True,
        'enable_word_time_offsets': True,
        'speech_contexts': [{
            'phrases': phrases,
            'boost': 15
        }],
        'diarization_config': diarization_config,
        'profanity_filter': True,
        'use_enhanced': bool(enhanced_model),
    }
    # Only name a model when the enhanced one is requested.
    if enhanced_model:
        config_fields['model'] = 'video'

    config = speech.RecognitionConfig(**config_fields)
    res = client.long_running_recognize(config=config, audio=audio).result()
    return _jsonify(res)
def transcribe_file_with_multilanguage(files_path=r'D:/dirname'):
    """Transcribe every ``.mp3`` directly inside ``files_path`` into a sibling ``.txt`` file.

    Files whose ``.txt`` output already exists are skipped. Each transcript
    line is the first alternative of one recognition result.

    Args:
        files_path: Directory that is listed for ``.mp3`` files and that
            receives the ``.txt`` outputs.
    """
    speech_client = speech.SpeechClient()
    first_lang = "fr-FR"

    # Tried earlier and currently disabled: a second language
    # ("cmn-Hans-CN"), audio_channel_count=2, speaker diarization and the
    # "video" model.
    recognition_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.MP3,
        model='command_and_search',
        enable_automatic_punctuation=True,
        sample_rate_hertz=16000,
        language_code=first_lang,
    )

    for entry in os.listdir(files_path):
        speech_file = os.path.join(files_path, entry)
        outputfile = os.path.join(files_path,
                                  os.path.splitext(entry)[0] + '.txt')

        if os.path.splitext(speech_file)[-1] != '.mp3':
            continue
        if os.path.exists(outputfile):
            print(speech_file + ' already transcribed in ' + outputfile)
            continue

        print(speech_file)
        with open(speech_file, "rb") as mp3_handle:
            audio_bytes = mp3_handle.read()
        recognition_audio = speech.RecognitionAudio(content=audio_bytes)

        # Synchronous call: the response is available as soon as it returns.
        response = speech_client.recognize(config=recognition_config,
                                           audio=recognition_audio)
        print("Waiting for operation to complete...")

        print('saving to ' + outputfile)
        with open(outputfile, 'w', encoding='utf-8') as transcript_file:
            for index, result in enumerate(response.results):
                top_choice = result.alternatives[0]
                print("-" * 20)
                print(u"First alternative of result {}: {}".format(
                    index, top_choice))
                print(u"Transcript: {}".format(top_choice.transcript))
                transcript_file.write(top_choice.transcript)
                transcript_file.write('\n')
def transcribe_file(speech_file, num_speakers):
    """Transcribe a short audio file synchronously and print per-word details.

    The audio is sent as FLAC at 16 kHz to the enhanced ``video`` model with
    automatic punctuation, diarization and word time offsets. The
    recognition metadata built here is attached to the request. Nothing is
    printed when the recognizer returns no results.

    Args:
        speech_file: Path of the audio file to read.
        num_speakers: Value passed as ``diarization_speaker_count``.
    """
    from google.cloud import speech_v1p1beta1 as speech

    client = speech.SpeechClient()

    # Load the audio into memory.
    with io.open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    # Describe the recording; sent with the request via ``metadata`` below.
    metadata = speech.RecognitionMetadata()
    metadata.interaction_type = speech.RecognitionMetadata.InteractionType.DISCUSSION
    metadata.recording_device_type = (
        speech.RecognitionMetadata.RecordingDeviceType.OTHER_INDOOR_DEVICE
    )
    metadata.audio_topic = "court trial hearing"
    metadata.original_mime_type = "audio/mp3"

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code="en-US",
        enable_automatic_punctuation=True,
        enable_speaker_diarization=True,
        diarization_speaker_count=num_speakers,
        # Enhanced models cost more than standard models.
        use_enhanced=True,
        model="video",
        enable_word_time_offsets=True,
        metadata=metadata,
    )

    # Synchronous recognition, intended for short audio files.
    print("Waiting for operation to complete...")
    response = client.recognize(config=config, audio=audio)

    # Nothing recognized: there is no last result to index.
    if not response.results:
        return

    # Only the last result is read for the speaker-tagged word list.
    words_info = response.results[-1].alternatives[0].words

    for word_info in words_info:
        print(
            u"word: '{}', speaker_tag: {}, start_time:{}, end_time:{}".format(
                word_info.word,
                word_info.speaker_tag,
                word_info.start_time.total_seconds(),
                word_info.end_time.total_seconds())
        )
def testLoadAudioData_succeeds(self):
    """Loading 16000 int16 zero samples written at 16 kHz yields a buffer of length 16000 * 2."""
    wav_path = os.path.join(self.get_temp_dir(), "a1.wav")
    silence = np.zeros(16000 * 1, dtype=np.int16)
    wavfile.write(wav_path, 16000, silence)

    matching_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        audio_channel_count=1,
        language_code="en-US")

    loaded = audio_asr.load_audio_data(wav_path, matching_config)
    self.assertLen(loaded, 16000 * 2)
def testLoadAudioData_incorrecSampleRate_raiseValueError(self):
    """A config declaring 44100 Hz for a file written at 16000 Hz raises ValueError."""
    wav_path = os.path.join(self.get_temp_dir(), "a1.wav")
    silence = np.zeros(16000 * 1, dtype=np.int16)
    wavfile.write(wav_path, 16000, silence)

    with self.assertRaises(ValueError):
        # The config is built inside the context, as part of the checked call.
        mismatched_config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=44100,
            audio_channel_count=1,
            language_code="en-US")
        audio_asr.load_audio_data(wav_path, mismatched_config)
def speech_to_text(gcs_URI, keypath):
    """Transcribe an MP3 stored in GCS and split the words into timed sentences.

    Reference: https://cloud.google.com/speech-to-text/docs/async-recognize

    Args:
        gcs_URI: ``gs://`` URI of the audio file.
        keypath: Path of the service-account key file used for credentials.

    Returns:
        A dict with:
        ``'transcript'``: the first-alternative transcripts of all results,
        joined with single spaces;
        ``'timestamps'``: a list of ``{'time': seconds, 'sentence': text}``.
        A sentence ends at a word containing ``'.'``. Words left over after
        the last such word are emitted as a final entry.
    """
    credentials = service_account.Credentials.from_service_account_file(
        keypath)

    audio = speech.RecognitionAudio(uri=gcs_URI)
    config = speech.RecognitionConfig(
        language_code="en-US",
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True,
        encoding=speech.RecognitionConfig.AudioEncoding.MP3,
        sample_rate_hertz=16000,
    )

    client = speech.SpeechClient(credentials=credentials)
    operation = client.long_running_recognize(config=config, audio=audio)

    print("Waiting for operation to complete...")
    response = operation.result()

    transcripts = []
    start_time_offset = []
    # The sentence being built; it may span several results.
    sentence = ''
    sentence_start_time = None

    for result in response.results:
        best_alternative = result.alternatives[0]
        transcripts.append(best_alternative.transcript)

        for word_info in best_alternative.words:
            text = word_info.word
            if sentence == '':
                # First word of a new sentence fixes its start time.
                sentence = text
                sentence_start_time = word_info.start_time.total_seconds()
            else:
                sentence += ' ' + text

            if '.' in text:
                start_time_offset.append({
                    'time': sentence_start_time,
                    'sentence': sentence
                })
                sentence = ''

    # Keep trailing words that were not closed by a period.
    if sentence != '':
        start_time_offset.append({
            'time': sentence_start_time,
            'sentence': sentence
        })

    speech_to_text_data = {
        'transcript': " ".join(transcripts),
        'timestamps': start_time_offset
    }

    print('Finish transcription.')
    return speech_to_text_data
def transcribe_gcs(gcs_uri):
    """Asynchronously transcribe the audio file at ``gcs_uri`` and pickle the result.

    The audio is sent as MP3 at 44100 Hz, language ``ja-jp``, with word time
    offsets. For every result the transcript, confidence and word timings
    are printed and collected. The collected list is pickled to
    ``res/<gcs_uri[-7:-4]>``.

    Args:
        gcs_uri: ``gs://`` URI of the audio file. The three characters before
            its last four are used as the output file name.
    """
    print('Process', gcs_uri)
    from google.cloud import speech_v1p1beta1 as speech

    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = osp.abspath(
        configs['google_ca_dir'])

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.MP3,
        sample_rate_hertz=44100,
        language_code="ja-jp",
        enable_word_time_offsets=True,
    )

    operation = client.long_running_recognize(config=config, audio=audio)

    print("Waiting for operation to complete...")
    response = operation.result(timeout=90)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    res = []
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        alternative = result.alternatives[0]
        print(u"Transcript: {}".format(alternative.transcript))
        print("Confidence: {}".format(alternative.confidence))

        words = []
        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            words.append({
                'word': word,
                'start_time': start_time,
                'end_time': end_time
            })
            print(
                f"Word: {word}, start_time: {start_time.total_seconds()}, end_time: {end_time.total_seconds()}"
            )

        res.append({
            "Transcript": alternative.transcript,
            "Confidence": alternative.confidence,
            'word': words
        })

    # Make sure the output directory exists so the finished transcription is
    # not lost at the very last step.
    os.makedirs('res', exist_ok=True)
    with open(osp.join('res', gcs_uri[-7:-4]), 'wb') as f:
        pickle.dump(res, f)
def testTwoFiles(self):
    """The audio data generator yields two items for two one-second WAV files."""
    temp_dir_first = self.get_temp_dir()
    first_wav = os.path.join(temp_dir_first, "a1.wav")
    wavfile.write(first_wav, 16000, np.zeros(16000 * 1, dtype=np.int16))

    temp_dir_second = self.get_temp_dir()
    second_wav = os.path.join(temp_dir_second, "a2.wav")
    wavfile.write(second_wav, 16000, np.zeros(16000 * 1, dtype=np.int16))

    recognition_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        audio_channel_count=1,
        language_code="en-US")

    produced = audio_asr.audio_data_generator([first_wav, second_wav],
                                              recognition_config)
    self.assertLen(list(produced), 2)
def transcribe_audio_to_tsv(input_audio_paths,
                            output_tsv_path,
                            sample_rate,
                            language_code,
                            begin_sec=0.0):
    """Stream the input audio files through speech recognition and write a .tsv file.

    For each streaming response, the highest-confidence alternative among
    its final results is written as one row. When ``begin_sec`` is falsy the
    file is created and the TSV header is written; otherwise rows are
    appended. ``begin_sec`` is added to both timestamps of every row.

    Args:
        input_audio_paths: Paths passed to ``audio_data_generator``.
        output_tsv_path: Destination .tsv path.
        sample_rate: Sample rate in Hz declared to the recognizer.
        language_code: Language code declared to the recognizer.
        begin_sec: Offset in seconds added to the written timestamps.
    """
    speech_client = speech.SpeechClient()
    recognition_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate,
        audio_channel_count=1,
        language_code=language_code)
    streaming_config = speech.StreamingRecognitionConfig(
        config=recognition_config, interim_results=False)
    requests = audio_data_generator(input_audio_paths, recognition_config)
    responses = speech_client.streaming_recognize(streaming_config, requests)

    file_mode = "w" if not begin_sec else "a"
    with open(output_tsv_path, file_mode) as tsv_file:
        if not begin_sec:
            # A fresh file starts with the TSV header.
            tsv_file.write(tsv_data.HEADER + "\n")

        for response in responses:
            if not response.results:
                continue

            final_results = [
                candidate for candidate in response.results
                if candidate.is_final
            ]

            # Pick the alternative with the strictly highest confidence.
            top_confidence = -1
            top_transcript = None
            top_end_time = None
            for final_result in final_results:
                for option in final_result.alternatives:
                    if option.confidence > top_confidence:
                        top_confidence = option.confidence
                        top_transcript = option.transcript.strip()
                        top_end_time = final_result.result_end_time
            if not top_transcript:
                continue

            end_time_sec = top_end_time.total_seconds()
            # TODO(cais): The default transcript result doesn't include the start
            # time stamp, so we currently pretend that each recognizer output phrase
            # is exactly 1 second.
            # TODO(cais): Should we use absolute timestamps such as epoch time, instead of
            # time relative to the beginning of the first file?
            start_time_sec = end_time_sec - 1

            row = "%.3f\t%.3f\t%s\t%s" % (
                start_time_sec + begin_sec, end_time_sec + begin_sec,
                tsv_data.SPEECH_TRANSCRIPT_TIER, top_transcript)
            print(row)
            tsv_file.write(row + "\n")
def my_transcribe():
    """Transcribe ``resources/voice_tom2.wav`` (Thai) and print the word error rate per result.

    Each result's first-alternative transcript is tokenized together with the
    ground-truth text using the ``attacut-sc`` tokenizer, and the word error
    rate between the two token lists is printed.
    """
    from google.cloud import speech_v1p1beta1 as speech

    speech_client = speech.SpeechClient()

    # Alternative sample: 'resources/voice_tom_southern.wav'
    sample_path = 'resources/voice_tom2.wav'
    with open(sample_path, "rb") as wav_handle:
        audio_bytes = wav_handle.read()

    recognition_audio = speech.RecognitionAudio(content=audio_bytes)
    # No sample_rate_hertz is passed here (44100 was previously tried).
    recognition_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code="th-TH",
        audio_channel_count=2,  # 2 (stereo), 1 (mono)
        enable_word_confidence=True,
        enable_word_time_offsets=True,
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
        model="default",
    )

    print("Waiting for operation to complete...")
    response = speech_client.recognize(config=recognition_config,
                                       audio=recognition_audio)

    for index, result in enumerate(response.results):
        top_choice = result.alternatives[0]
        print("-" * 30)
        print("Confidence: {}".format(top_choice.confidence))
        print(u"Channel Tag: {}".format(result.channel_tag))

        ground_truth = get_ground_truth_text()
        hypothesis = str(top_choice.transcript)
        print("Ground Truth: ", get_ground_truth_text())
        print("Hypothesis: ", hypothesis)

        # Alternative tokenizer: word_tokenize(..., engine="newmm").
        tokenizer = Tokenizer(model="attacut-sc")
        gt_word_tokenize = tokenizer.tokenize(ground_truth)
        hp_word_tokenize = tokenizer.tokenize(hypothesis)
        print("Ground Truth Word Tokenize:", gt_word_tokenize)
        print("Hypothesis Word Tokenize:", hp_word_tokenize)

        error = evaluation.util.word_error_rate(hp_word_tokenize,
                                                gt_word_tokenize)
        print("WER: ", error)
def get_speaker_diarization_results(source_file_name, speaker_count):
    """Return the speaker-tagged words of a meeting recording stored in GCS.

    Args:
        source_file_name: File name appended to
            ``gs://ami_corpus/meeting_files/``.
        speaker_count: Value passed as ``diarization_speaker_count``.

    Returns:
        The ``words`` of the first alternative of the last result.
    """
    speech_client = speech.SpeechClient()

    meeting_uri = "gs://ami_corpus/meeting_files/" + source_file_name
    recognition_audio = speech.RecognitionAudio(uri=meeting_uri)
    diarization_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=48000,
        language_code="en-US",
        enable_speaker_diarization=True,
        diarization_speaker_count=speaker_count,
    )

    # long_running_recognize returns an operation; result() waits for it.
    operation = speech_client.long_running_recognize(
        config=diarization_config, audio=recognition_audio)
    final_result = operation.result().results[-1]
    return final_result.alternatives[0].words
def google():
    """Handle an uploaded WAV file: transcribe its first channel and return the text as JSON.

    The upload (form field ``file``) is saved as ``speechtotext.wav``. Its
    first channel is written to ``monosound.wav`` at the upload's own sample
    rate and sent to the synchronous recognizer (``en-US`` with ``es-US``,
    ``zh-cmn-Hans-CN`` and ``hi-IN`` as alternative languages).

    NOTE(review): the two working files have fixed names in the current
    directory, so concurrent requests would overwrite each other's audio.

    Returns:
        ``jsonify({'text': ...})`` with one transcript line per result, or a
        405 JSON error for non-POST requests.
    """
    if request.method != 'POST':
        return jsonify({'error': 'only POST is supported'}), 405

    upload_path = "speechtotext.wav"
    mono_path = "monosound.wav"

    # Clear leftovers of an earlier request; mode 'bx' below refuses to
    # overwrite an existing file.
    for stale_path in (upload_path, mono_path):
        if os.path.exists(stale_path):
            os.remove(stale_path)

    content = request.files['file'].read()
    with open(upload_path, mode='bx') as file:
        file.write(content)

    client = speech.SpeechClient()

    # Presumably wf is scipy.io.wavfile: read() returns (rate, samples) —
    # TODO confirm.
    rate, data = wf.read(upload_path)
    # Keep the first channel; mono input is already one-dimensional.
    first_channel = data[:, 0] if data.ndim > 1 else data
    # Write the mono file at the upload's real rate so the header and the
    # rate declared to the recognizer agree.
    wf.write(mono_path, rate, first_channel)

    with io.open(mono_path, "rb") as audio_file:
        content = audio_file.read()
    audio = speech.RecognitionAudio(content=content)

    second_lang = "es-US"
    third_lang = "zh-cmn-Hans-CN"
    fourth_lang = "hi-IN"
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=rate,
        language_code="en-US",
        alternative_language_codes=[second_lang, third_lang, fourth_lang])

    response = client.recognize(config=config, audio=audio)

    text = "".join(result.alternatives[0].transcript + "\n"
                   for result in response.results)
    return jsonify({'text': text})