def get_transcripts_json(gcstorage_path, lang, phrase_hints=None, speaker_count=1, enhanced_model=None):
    """Transcribe an audio file stored in Google Cloud Storage.

    Args:
        gcstorage_path (str): GCS URI of the audio file (e.g. "gs://bucket/clip.mp4").
        lang (str): language code (e.g. "en", "en-US"); "en" is mapped to "en-US".
        phrase_hints (list[str], optional): unusual words likely to appear in the
            audio, used to bias recognition. Defaults to no hints.
        speaker_count (int, optional): number of speakers; diarization is enabled
            only when > 1. Defaults to 1.
        enhanced_model (str, optional): enhanced speech model name (e.g. "video").
            Forced to "video" for English input.

    Returns:
        list: one dict per result section with keys "transcript" and "words".
    """
    # Avoid the mutable-default-argument pitfall: a shared [] would persist
    # across calls if it were ever mutated.
    if phrase_hints is None:
        phrase_hints = []

    def _jsonify(res):
        # Helper: flatten the GCP speech client response into plain dicts.
        json = []
        for section in res.results:
            data = {
                'transcript': section.alternatives[0].transcript,
                'words': []
            }
            # BUGFIX: was `section.alternative[0].words` — the field is
            # `alternatives` (plural); the old code raised AttributeError.
            for word in section.alternatives[0].words:
                data['words'].append({
                    'word': word.word,
                    'start_time': word.start_time.total_seconds(),
                    'end_time': word.end_time.total_seconds(),
                    'speaker_tag': word.speaker_tag
                })
            json.append(data)
        return json

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcstorage_path)
    diarize = speaker_count if speaker_count > 1 else False
    print(f"Diarizing: {diarize}")
    diarizationConfig = speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=speaker_count if speaker_count > 1 else False,
    )

    # If English only, we can use the optimized video model.
    if lang == 'en':
        enhanced_model = 'video'

    config = speech.RecognitionConfig(
        # BUGFIX: the RecognitionConfig field is `language_code`, not
        # `lang_code` — the old keyword raised a ValueError/TypeError.
        language_code='en-US' if lang == 'en' else lang,
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True,
        speech_contexts=[{
            'phrases': phrase_hints,
            'boost': 15
        }],
        diarization_config=diarizationConfig,
        profanity_filter=True,
        use_enhanced=True if enhanced_model else False,
        model='video' if enhanced_model else None)
    # Long-running recognize is required for audio longer than ~1 minute;
    # .result() blocks until the operation completes.
    res = client.long_running_recognize(config=config, audio=audio).result()
    return _jsonify(res)
def get_transcripts_json(gcsPath, langCode, phraseHints=None, speakerCount=1, enhancedModel=None):
    """Transcribes audio files.

    Args:
        gcsPath (String): path to file in cloud storage (i.e. "gs://audio/clip.mp4")
        langCode (String): language code (i.e. "en-US", see
            https://cloud.google.com/speech-to-text/docs/languages)
        phraseHints (String[], optional): list of words that are unusual but
            likely to appear in the audio file. Defaults to no hints.
        speakerCount (int, optional): Number of speakers in the audio. Only
            works on English. Defaults to 1 (no diarization).
        enhancedModel (String, optional): Option to use an enhanced speech
            model, i.e. "video". Forced to "video" when langCode is "en".

    Returns:
        list | Operation.error
    """
    # BUGFIX: default was the mutable `phraseHints=[]`, which is shared
    # across all calls; use None as sentinel instead (backward compatible).
    if phraseHints is None:
        phraseHints = []

    # Helper function for simplifying Google speech client response
    def _jsonify(result):
        json = []
        for section in result.results:
            data = {
                "transcript": section.alternatives[0].transcript,
                "words": []
            }
            for word in section.alternatives[0].words:
                data["words"].append({
                    "word": word.word,
                    "start_time": word.start_time.total_seconds(),
                    "end_time": word.end_time.total_seconds(),
                    "speaker_tag": word.speaker_tag
                })
            json.append(data)
        return json

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcsPath)
    diarize = speakerCount if speakerCount > 1 else False
    print(f"Diarizing: {diarize}")
    diarizationConfig = speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=speakerCount if speakerCount > 1 else False,
    )

    # In English only, we can use the optimized video model
    if langCode == "en":
        enhancedModel = "video"

    config = speech.RecognitionConfig(
        language_code="en-US" if langCode == "en" else langCode,
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True,
        speech_contexts=[{
            "phrases": phraseHints,
            "boost": 15
        }],
        diarization_config=diarizationConfig,
        profanity_filter=True,
        use_enhanced=True if enhancedModel else False,
        model="video" if enhancedModel else None)
    # Use the asynchronous API (required for audio > ~1 min) and block
    # until the transcription operation finishes.
    res = client.long_running_recognize(config=config, audio=audio).result()
    return _jsonify(res)
from google.cloud import speech_v1p1beta1 as speech

client = speech.SpeechClient()

# BUGFIX: the path was wrapped in literal single quotes inside the double
# quotes ("'/home/...'"), so open() looked for a file whose name began
# with a quote character and raised FileNotFoundError.
speech_file = "/home/serkhane/Repositories/AI/DATA/youtube_data_taflowtron/en/jocko_podcast_shortlist/v4_concate_removesilence/1HhXDprzf5I/clips/1HhXDprzf5I_trim_0.0_6180.0.wav"

# Read the whole clip into memory and hand the raw bytes to the API.
with open(speech_file, "rb") as audio_file:
    content = audio_file.read()

audio = speech.RecognitionAudio(content=content)

# Diarization: let the service infer between 2 and 10 distinct speakers.
diarization_config = speech.SpeakerDiarizationConfig(
    enable_speaker_diarization=True,
    min_speaker_count=2,
    max_speaker_count=10,
)

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=8000,
    language_code="en-US",
    diarization_config=diarization_config,
)

print("Waiting for operation to complete...")
# NOTE(review): synchronous recognize() only accepts ~60 seconds of audio;
# the filename suggests a ~6180 s clip, which would require
# long_running_recognize() with a GCS URI instead — confirm clip length.
response = client.recognize(config=config, audio=audio)

# The transcript within each result is separate and sequential per result.
# However, the words list within an alternative includes all the words
# from all the results thus far. Thus, to get all the words with speaker
# tags, you only have to take the words list from the last result: