def get_transcript_long(content: bytes = None, audio_path: str = None):
    """Transcribe a long audio file using asynchronous recognition.

    Args:
        content (bytes): Raw audio bytes, if available.
        audio_path (str): Path or URI of the audio file; used when
            content is None.

    Returns:
        object: The long-running recognition response.

    Raises:
        ValueError: If both arguments are None.
    """
    if content is None and audio_path is None:
        raise ValueError('At least one parameter cannot be None.')
    # Inline bytes take precedence over the URI form.
    if content is not None:
        audio = speech.RecognitionAudio(content=content)
    else:
        audio = speech.RecognitionAudio(uri=audio_path)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code="en-US",
    )
    operation = client.long_running_recognize(config=config, audio=audio)
    print("Waiting for operation to complete...")
    response = operation.result(timeout=90)
    # Each result covers a consecutive slice of the audio; print them all.
    for portion in response.results:
        best = portion.alternatives[0]  # most likely alternative
        print(u"Transcript: {}".format(best.transcript))
        print("Confidence: {}".format(best.confidence))
    return response
def get_transcript(content: bytes = None, audio_path: str = None):
    """Synchronously transcribe a short audio file.

    Args:
        content (bytes): Raw audio bytes, if available.
        audio_path (str): Path or URI of the audio file; used when
            content is None.

    Returns:
        object: The recognition response.

    Raises:
        ValueError: If both arguments are None.
    """
    if content is None and audio_path is None:
        raise ValueError("At least one parameter cannot be None.")
    # Inline bytes take precedence over the URI form.
    if content is not None:
        audio = speech.RecognitionAudio(content=content)
    else:
        audio = speech.RecognitionAudio(uri=audio_path)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    )
    # Detects speech in the audio file.
    response = client.recognize(config=config, audio=audio)
    for portion in response.results:
        print(f"Transcript: {portion.alternatives[0].transcript}")
    return response
def transcribe_gcs(gcs_uri):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    from google.cloud import speech

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=44100,
        audio_channel_count=2,
        language_code="ja-JP",
    )
    operation = client.long_running_recognize(config=config, audio=audio)
    print("Waiting for operation to complete...")
    response = operation.result(timeout=300)
    # Each result covers a consecutive slice of the audio; print them all.
    for portion in response.results:
        best = portion.alternatives[0]  # most likely alternative
        print(u"Transcript: {}".format(best.transcript))
        print("Confidence: {}".format(best.confidence))
def transcribe_file(speech_file):
    """Transcribe the given audio file."""
    from google.cloud import speech
    import io

    client = speech.SpeechClient()
    # [START speech_python_migration_sync_request]
    # [START speech_python_migration_config]
    # Read the whole file into memory and wrap it for the API.
    with io.open(speech_file, "rb") as handle:
        audio = speech.RecognitionAudio(content=handle.read())
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=44100,
        language_code="en-GB",
        audio_channel_count=2,
    )
    # [END speech_python_migration_config]
    # [START speech_python_migration_sync_response]
    response = client.recognize(config=config, audio=audio)
    # [END speech_python_migration_sync_request]
    # Each result covers a consecutive slice of the audio.
    for portion in response.results:
        # The first alternative is the most likely one.
        print(u"Transcript: {}".format(portion.alternatives[0].transcript))
def speech_to_text(gcs_uri):
    """Transcribe audio stored in GCS and return the concatenated transcript.

    Args:
        gcs_uri (str): "gs://..." URI of a LINEAR16, 16 kHz audio file.

    Returns:
        str: All result transcripts joined together.
    """
    # Instantiates a client
    client = speech.SpeechClient()
    # The name of the audio file to transcribe, e.g.
    # gs://cloud-samples-data/speech/brooklyn_bridge.raw
    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    )
    # Detects speech in the audio file
    response = client.recognize(config=config, audio=audio)
    text_all = ""
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        text_all += result.alternatives[0].transcript
        print("Transcript: {}".format(result.alternatives[0].transcript))
    print(text_all)
    # Removed dead commented-out code; return without redundant parentheses.
    return text_all
def speechToText(extention):
    """Transcribe the bundled example audio file with the given extension.

    Args:
        extention (str): File extension of resources/exemple.<extention>.

    Returns:
        str: Transcript of the last recognition result, or "" when no
            speech was recognized.
    """
    # Instantiates a client
    client = speech.SpeechClient()
    # The name of the audio file to transcribe
    file_name = os.path.join(
        os.path.dirname(__file__), "resources", "exemple." + extention)
    # Loads the audio into memory
    with io.open(file_name, "rb") as audio_file:
        content = audio_file.read()
    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    )
    # Detects speech in the audio file
    response = client.recognize(config=config, audio=audio)
    # Bug fix: initialize before the loop — previously `resultat` was only
    # bound inside the loop, so an empty result list raised NameError.
    resultat = ""
    for result in response.results:
        resultat = "{}".format(result.alternatives[0].transcript)
    return resultat
def generate_transcript(language_code="ro-RO"):
    """Transcribe cache/recording.wav with Google Speech-to-Text.

    Args:
        language_code (str): BCP-47 language code of the recording.

    Returns:
        object: The recognition response.
    """
    # Creates google client
    client = speech.SpeechClient()
    # Full path of the audio file, Replace with your file name
    file_name = os.path.join(os.path.dirname(__file__), "cache/recording.wav")
    # Bug fix: use wave.open() in a context manager instead of instantiating
    # wave.Wave_read directly and never closing it (file-handle leak).
    with wave.open(file_name, "rb") as wav_file:
        ch = wav_file.getnchannels()
    # Loads the audio file into memory
    with io.open(file_name, "rb") as audio_file:
        content = audio_file.read()
    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        audio_channel_count=ch,
        language_code=language_code,
    )
    # Sends the request to google to transcribe the audio
    response = client.recognize(request={"config": config, "audio": audio})
    return response
def transcribe_file(speech_file):
    """Transcribe the given audio file asynchronously.

    Args:
        speech_file (str): Path to a LINEAR16, 16 kHz audio file.
    """
    print("Before Imports")
    from google.cloud import speech

    client = speech.SpeechClient()
    with io.open(speech_file, "rb") as audio_file:
        content = audio_file.read()
    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    )
    # [START speech_python_migration_async_response]
    # Bug fix: long_running_recognize was called twice (request-dict form
    # immediately followed by keyword form), submitting two identical jobs
    # and discarding the first operation. Start it exactly once.
    operation = client.long_running_recognize(
        request={"config": config, "audio": audio})
    print("Waiting for operation to complete...")
    response = operation.result(timeout=90)
    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(u"Transcript: {}".format(result.alternatives[0].transcript))
        print("Confidence: {}".format(result.alternatives[0].confidence))
def transcribe_file(speech_file):
    """Transcribe the given audio file.

    Args:
        speech_file: Open binary file-like object containing the audio.

    Returns:
        str | None: Transcript of the first result, or None when nothing
            was recognized (previously this raised IndexError).
    """
    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(content=speech_file.read())
    config = speech.RecognitionConfig(
        # ENCODING_UNSPECIFIED lets the service infer the format (.mp3).
        encoding=speech.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
        sample_rate_hertz=16000,
        language_code="uk-UA",
    )
    response = client.recognize(config=config, audio=audio)
    # Bug fix: guard against an empty result list instead of indexing
    # results[0] unconditionally ("if response:" did not protect this).
    if not response.results:
        return None
    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        logger.info(u"Transcript: {}".format(
            result.alternatives[0].transcript))
    return response.results[0].alternatives[0].transcript
def run(self, file_name='default'):
    """Recognize Korean speech from a payload file.

    Args:
        file_name (str): Payload file name; the text before the first "_"
            is treated as the caller's IP and returned with the transcript.

    Returns:
        tuple[str, str]: (ip prefix, concatenated transcript).
    """
    client = speech.SpeechClient()
    # Removed unused local `file_names` (dead assignment).
    ip = file_name.split('_')[0]
    file_name = os.path.join(os.path.dirname(__file__), "payload", file_name)
    # Loads the audio into memory
    with io.open(file_name, "rb") as audio_file:
        content = audio_file.read()
    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.MULAW,
        sample_rate_hertz=8000,
        audio_channel_count=1,
        language_code="ko-KR",
        enable_automatic_punctuation=True)
    # Detects speech in the audio file
    response = client.recognize(config=config, audio=audio)
    text = ""
    # Removed unused enumerate index; concatenate the top alternative of
    # each consecutive result.
    for result in response.results:
        alternative = result.alternatives[0]
        print("-" * 20)
        print("Transcript: {}".format(alternative.transcript))
        text += alternative.transcript
    # [END speech_quickstart]
    return ip, text
def transcribe_file(speech_file):
    """Transcribe the given audio file.

    Args:
        speech_file (str): Path to a stereo FLAC audio file.

    Returns:
        list[str]: Transcript of each consecutive audio portion.
    """
    from google.cloud import speech
    import io

    client = speech.SpeechClient()
    with io.open(speech_file, "rb") as audio_file:
        content = audio_file.read()
    audio = speech.RecognitionAudio(content=content)
    # Removed dead commented-out code (sample_rate_hertz, print(response),
    # trailing example call).
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        language_code="en-US",
        audio_channel_count=2)
    response = client.recognize(config=config, audio=audio)
    transcription = []
    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(u"Transcript: {}".format(result.alternatives[0].transcript))
        transcription.append(result.alternatives[0].transcript)
    return transcription
def transcribe_file_with_multichannel(speech_file):
    """Transcribe the given audio file synchronously with multi channel."""
    # [START speech_transcribe_multichannel]
    from google.cloud import speech

    client = speech.SpeechClient()
    # Load the file contents and wrap them for the API.
    with open(speech_file, "rb") as handle:
        audio = speech.RecognitionAudio(content=handle.read())
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code="en-US",
        audio_channel_count=2,
        enable_separate_recognition_per_channel=True,
    )
    response = client.recognize(request={"config": config, "audio": audio})
    for index, portion in enumerate(response.results):
        best = portion.alternatives[0]
        print("-" * 20)
        print("First alternative of result {}".format(index))
        print(u"Transcript: {}".format(best.transcript))
        print(u"Channel Tag: {}".format(portion.channel_tag))
def stt(request):
    """Django view: transcribe an OGG_OPUS Korean recording sent as the
    raw request body and return the transcript as an HttpResponse.
    """
    credential_path = '..//sa-spoiler-4897b3e764af.json'
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credential_path
    # Persist the raw body to disk, then read it back for recognition.
    # Use context managers so the handles are always closed.
    with open("chatbot/file.ogg", 'wb') as f:
        f.write(request.body)
    with io.open("chatbot/file.ogg", "rb") as audio_file:
        content = audio_file.read()
    try:
        client = speech.SpeechClient()
        audio = speech.RecognitionAudio(content=content)
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.OGG_OPUS,
            sample_rate_hertz=48000,
            language_code="ko-KR",
        )
        response = client.recognize(config=config, audio=audio)
        stringList = []
        for result in response.results:
            stringList.append(result.alternatives[0].transcript)
        resultStr = ''.join(stringList)
    except DefaultCredentialsError:
        logging.warning('DefaultCredentaials error. check api key')
        resultStr = "stt 오류입니다. 관리자에게 문의하세요 (DefaultCredentalsError)"
    except Exception:
        # Bug fix: this fallback lived in an `else:` clause, which executes
        # on SUCCESS — every successful transcription was overwritten with
        # the "undefined error" message. It belongs in a catch-all except.
        resultStr = "undefined error. 관리자에게 문의하세요"
    return HttpResponse(resultStr)
def __call__(self, data):
    # Recognize speech in raw audio bytes, retrying transient failures.
    #
    # Args:
    #     data: raw audio content handed to speech.RecognitionAudio.
    #
    # Returns the first transcript found, or None on Ctrl-C.
    # NOTE(review): when all retries are exhausted without a transcript the
    # function falls off the end and implicitly returns None as well —
    # confirm callers handle that.
    try:
        for _ in range(self.retries):
            audio = speech.RecognitionAudio(content=data)
            config = speech.RecognitionConfig(
                encoding=speech.RecognitionConfig.AudioEncoding.
                ENCODING_UNSPECIFIED,
                sample_rate_hertz=self.rate,
                language_code=self.language,
                model=self.model,
            )
            response = self.client.recognize(config=config, audio=audio)
            # return
            for result in response.results:
                try:
                    # First alternative of the first non-empty result wins.
                    return result.alternatives[0].transcript
                except IndexError:  # no result
                    continue
                except JSONDecodeError:
                    # presumably a transient malformed response — skip and
                    # keep scanning/retrying; TODO confirm where this is
                    # actually raised from.
                    continue
    except KeyboardInterrupt:
        return None
def sync_recognize_with_multi_region_gcs():
    """Recognize a GCS sample against the EU regional Speech endpoint."""
    # [START speech_multi_region]
    # Imports the Google Cloud client library
    from google.cloud import speech
    from google.api_core import client_options

    # Instantiates a client
    # [START speech_multi_region_client]
    # Pass an additional argument, ClientOptions, to specify the new endpoint.
    # (Bound to a fresh name so the imported module is not shadowed.)
    options = client_options.ClientOptions(
        api_endpoint="eu-speech.googleapis.com")
    client = speech.SpeechClient(client_options=options)
    # [END speech_multi_region_client]

    # The name of the audio file to transcribe
    gcs_uri = "gs://cloud-samples-data/speech/brooklyn_bridge.raw"
    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    )

    # Detects speech in the audio file
    response = client.recognize(config=config, audio=audio)
    for portion in response.results:
        print("Transcript: {}".format(portion.alternatives[0].transcript))
def transcribe_file(content, channels=2):
    """Transcribe an audio stream into SpeechParagraph records.

    Args:
        content: Open binary file-like object with the audio data.
        channels (int): Number of audio channels in the recording.

    Returns:
        list: One SpeechParagraph (transcript, confidence, timed words)
            per recognition result.
    """
    audio = speech.RecognitionAudio(content=content.read())
    config = speech.RecognitionConfig(
        language_code="en-US",
        enable_word_time_offsets=True,
        enable_automatic_punctuation=True,
        audio_channel_count=channels)
    response = client.recognize(request={"config": config, "audio": audio})
    paragraphs = []
    for portion in response.results:
        best = portion.alternatives[0]
        # Convert per-word offsets to seconds and wrap them.
        timed_words = [
            SpeechWord(info.word,
                       info.start_time.total_seconds(),
                       info.end_time.total_seconds())
            for info in best.words
        ]
        paragraphs.append(
            SpeechParagraph(best.transcript, best.confidence, timed_words))
    return paragraphs
def transcribe(self):
    """Transcribe data/recording.wav and return the rendered results."""
    # Full path of the audio file, Replace with your file name
    file_name = "data/recording.wav"
    # Read the entire recording into memory.
    with io.open(file_name, "rb") as audio_source:
        payload = audio_source.read()
    audio = speech.RecognitionAudio(content=payload)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        audio_channel_count=2,
        language_code="en-US",
        sample_rate_hertz=16000,
    )
    # Sends the request to google to transcribe the audio
    response = self.client.recognize(
        request={"config": config, "audio": audio})
    self.numChars = 20
    self.calcLines()
    self.generateResults()
    return self.results, self.numLines
def transcribe_context_classes(storage_uri):
    """Provides "hints" to the speech recognizer to favor specific classes
    of words in the results."""
    # [START speech_context_classes]
    from google.cloud import speech

    client = speech.SpeechClient()

    # storage_uri = 'gs://YOUR_BUCKET_ID/path/to/your/file.wav'
    audio = speech.RecognitionAudio(uri=storage_uri)

    # SpeechContext: to configure your speech_context see:
    # https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
    # Full list of supported phrases (class tokens) here:
    # https://cloud.google.com/speech-to-text/docs/class-tokens
    hints = speech.SpeechContext(phrases=['$TIME'])

    # RecognitionConfig: to configure your encoding and sample_rate_hertz, see:
    # https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#recognitionconfig
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        speech_contexts=[hints],
    )

    response = client.recognize(config=config, audio=audio)
    for index, portion in enumerate(response.results):
        best = portion.alternatives[0]
        print("-" * 20)
        print("First alternative of result {}".format(index))
        print("Transcript: {}".format(best.transcript))
def transcribe_file_with_auto_punctuation(path):
    """Transcribe the given audio file with auto punctuation enabled."""
    # [START speech_transcribe_auto_punctuation]
    from google.cloud import speech

    client = speech.SpeechClient()
    # path = 'resources/commercial_mono.wav'
    with io.open(path, "rb") as handle:
        audio = speech.RecognitionAudio(content=handle.read())
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        # Enable automatic punctuation
        enable_automatic_punctuation=True,
    )
    response = client.recognize(request={"config": config, "audio": audio})
    for index, portion in enumerate(response.results):
        best = portion.alternatives[0]
        print("-" * 20)
        print("First alternative of result {}".format(index))
        print("Transcript: {}".format(best.transcript))
def transcribe_gcs_with_multichannel(gcs_uri):
    """Transcribe the given audio file on GCS with multi channel."""
    # [START speech_transcribe_multichannel_gcs]
    from google.cloud import speech

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code="en-US",
        audio_channel_count=2,
        enable_separate_recognition_per_channel=True,
    )
    response = client.recognize(config=config, audio=audio)
    for index, portion in enumerate(response.results):
        best = portion.alternatives[0]
        print("-" * 20)
        print("First alternative of result {}".format(index))
        print(u"Transcript: {}".format(best.transcript))
        print(u"Channel Tag: {}".format(portion.channel_tag))
def run_quickstart():
    """Quickstart: transcribe a local Korean stereo WAV file."""
    # [START speech_quickstart]
    import io
    import os

    # Imports the Google Cloud client library
    # [START migration_import]
    from google.cloud import speech
    # [END migration_import]

    # Instantiates a client
    # [START migration_client]
    client = speech.SpeechClient()
    # [END migration_client]

    # The name of the audio file to transcribe
    file_name = os.path.join(os.path.dirname(__file__), 'kor_audio.wav')

    # Loads the audio into memory
    with io.open(file_name, 'rb') as handle:
        audio = speech.RecognitionAudio(content=handle.read())

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        audio_channel_count=2,
        language_code='ko-KR')

    # Detects speech in the audio file
    response = client.recognize(config=config, audio=audio)
    for portion in response.results:
        print('Transcript: {}'.format(portion.alternatives[0].transcript))
async def speech2text(content: bytes, language_code: str = 'ru-RU',
                      channels: int = 1, rate: int = 16000,
                      encoding: str = 'LINEAR16') -> str:
    """Recognize speech in audio bytes, with a redis result cache.

    Args:
        content: Raw audio bytes.
        language_code: BCP-47 language of the audio.
        channels: Number of audio channels.
        rate: Sample rate in hertz.
        encoding: AudioEncoding enum name (case-insensitive).

    Returns:
        The first transcript, or '' when nothing was recognized.

    Raises:
        KeyError: If `encoding` is not a valid AudioEncoding name.
    """
    global CLIENT
    # Cache key covers every parameter that affects the result.
    hash_file: str = hashlib.sha512(content).hexdigest()
    cache_key: str = f'speech2text_{hash_file}_{language_code}_{channels}_{rate}_{encoding}'
    cached_response: str = await redis.get(key=cache_key)
    if cached_response:
        return cached_response
    audio = speech.RecognitionAudio(content=content)
    try:
        encoding = speech.RecognitionConfig.AudioEncoding[encoding.upper()]
    except KeyError:
        raise
    config = speech.RecognitionConfig(
        encoding=encoding,
        sample_rate_hertz=rate,
        language_code=language_code,
        audio_channel_count=channels,
    )
    response = await CLIENT.recognize(config=config, audio=audio)
    # Bug fix: results[0] was indexed unconditionally (twice), raising
    # IndexError when the service recognized nothing. Guard and index once.
    if not response.results:
        return ''
    transcript = response.results[0].alternatives[0].transcript
    await redis.setex(key=cache_key, timeout=TTL_CACHE, value=transcript)
    return transcript
def run_quickstart():
    """Quickstart: transcribe a sample file stored in GCS."""
    # [START speech_quickstart]
    # Imports the Google Cloud client library
    # [START speech_python_migration_imports]
    from google.cloud import speech
    # [END speech_python_migration_imports]

    # Instantiates a client
    # [START speech_python_migration_client]
    client = speech.SpeechClient()
    # [END speech_python_migration_client]

    # The name of the audio file to transcribe
    gcs_uri = "gs://cloud-samples-data/speech/brooklyn_bridge.raw"
    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    )

    # Detects speech in the audio file
    response = client.recognize(config=config, audio=audio)
    for portion in response.results:
        print("Transcript: {}".format(portion.alternatives[0].transcript))
def submit_speech_api_request(file: bytes, language_code: str) -> dict:
    """
    Function to submit a speech-to-text API request.

    :return: transcripts extracted from the speech file, in json format
    """
    # FUTURE: Avoid scoping lifetime of the client to function?
    client = speech.SpeechClient()
    # Source: https://cloud.google.com/speech-to-text/docs/sync-recognize
    # Did not dig into all the API options, as the assignment
    # was to develop a small wrapper around the actual Speech-to-Text API.
    audio = speech.RecognitionAudio(content=file)
    # TODO Extract 'audio_channel_count' from the input file itself
    # https://stackoverflow.com/questions/47905083/how-to-check-number-of-channels-in-my-audio-wav-file-using-ffmpeg-command
    config = speech.RecognitionConfig(
        audio_channel_count=2, language_code=language_code)
    response = client.recognize(config=config, audio=audio)
    # FUTURE: Additional error handling may be appropriate.
    # Each result is a consecutive portion of the audio; the first
    # alternative is the most likely one for that portion.
    # FUTURE: Check if alternatives[0] always exists
    transcripts = [
        f"{portion.alternatives[0].transcript}"
        for portion in response.results
    ]
    return {"transcripts": transcripts}
def transcribe_file(speech_file):
    """Transcribe the given audio file.

    Args:
        speech_file (str): Path to a LINEAR16, 48 kHz audio file.

    Returns:
        str: Concatenation of the transcripts of all results.
    """
    from google.cloud import speech
    import os
    import io

    client = speech.SpeechClient()
    with io.open(speech_file, "rb") as audio_file:
        content = audio_file.read()
    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=48000,
        language_code="en-US",
        enable_automatic_punctuation=True,
    )
    response = client.recognize(config=config, audio=audio)
    # Each result is for a consecutive portion of the audio; collect the
    # most likely alternative of each and join once (avoids repeated
    # string concatenation).
    pieces = []
    for result in response.results:
        print(u"Transcript: {}".format(result.alternatives[0].transcript))
        pieces.append(result.alternatives[0].transcript)
    return "".join(pieces)
def transcribe_file():
    """Transcribe proken.wav (Japanese) and write the text to proken.txt."""
    from google.cloud import speech
    import io

    client = speech.SpeechClient()
    with io.open('proken.wav', 'rb') as audio_file:
        content = audio_file.read()
    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code='ja-JP',
        enable_separate_recognition_per_channel=True
    )
    # Bug fix: long_running_recognize was called twice (request-dict form
    # immediately followed by keyword form), submitting two identical jobs
    # and discarding the first operation. Start it exactly once.
    operation = client.long_running_recognize(
        request={"config": config, "audio": audio}
    )
    response = operation.result(timeout=90)
    with io.open("proken.txt", "w", encoding="utf-8") as f:
        for result in response.results:
            f.write(u'{}'.format(result.alternatives[0].transcript))
def __init__(self):
    # Transcribe a fixed FLAC file from GCS with per-word time offsets.
    # NOTE(review): nothing is assigned to self, and the per-word
    # start/end times computed below are overwritten each iteration and
    # discarded — confirm whether they were meant to be collected.
    # Instantiates a client
    client = speech.SpeechClient()
    # The name of the audio file to transcribe
    gcs_uri = "gs://gradclip1-audio/small.flac"
    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        language_code="en-AU",
        audio_channel_count = 2,
        enable_word_time_offsets = True
    )
    # Detects speech in the audio file
    operation = client.long_running_recognize(config=config, audio=audio)
    print("Waiting for operation to complete...")
    response = operation.result(timeout=9000)
    for result in response.results:
        for word in result.alternatives[0].words:
            # NOTE(review): .microseconds is only the sub-second component
            # of the offset, not the total elapsed time —
            # total_seconds() is likely what was intended; verify before
            # relying on these values.
            start_time = word.start_time.microseconds
            end_time = word.end_time.microseconds
def transcribe_gcs(gcs_uri):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code="en-US",
        enable_automatic_punctuation=True,
    )
    operation = client.long_running_recognize(config=config, audio=audio)
    print("Waiting for operation to complete...")
    response = operation.result(timeout=100000)
    # Accumulate the transcript across all consecutive audio portions,
    # separating each portion with a leading space.
    transcript = ""
    for portion in response.results:
        best = portion.alternatives[0]  # most likely alternative
        print(u"Transcript: {}".format(best.transcript))
        transcript += " " + best.transcript
        print("Confidence: {}".format(best.confidence))
    return transcript
def transcribe_audio(speech_file):
    """Transcribe a UK-English 8 kHz LINEAR16 audio file.

    Args:
        speech_file (str): Path to the audio file.

    Returns:
        str: Transcript of the last result, or "" when credentials are
            missing or nothing was recognized.
    """
    retval = ""
    try:
        transcription_client = speech.SpeechClient.from_service_account_file(
            my_credentials_file_path)
    except Exception:
        # Narrowed from a bare `except:` — which also swallowed SystemExit
        # and KeyboardInterrupt — while keeping the best-effort behaviour.
        logger.info("Not using Google speech to text as credz file not at : " +
                    my_credentials_file_path)
        return ""
    with io.open(speech_file, "rb") as audio_file:
        content = audio_file.read()
    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-GB",
    )
    response = transcription_client.recognize(config=config, audio=audio)
    # NOTE: retval keeps only the final portion's transcript — preserved
    # from the original behaviour.
    for result in response.results:
        retval = result.alternatives[0].transcript
    return retval
def get_transcript(uri):
    """
    Get transcript for a audio WAVE file stored in GCS bucket

    @param: uri - File location in the google cloud storage bucket
    @return: (transcript, mean confidence); confidence is 0.0 when the
        service returns no results
    """
    # create speech client
    client = speech.SpeechClient()
    # actual audio file to recognize
    audio = speech.RecognitionAudio(uri=uri)
    # recognition config
    config = speech.RecognitionConfig(language_code="en-US")
    # execute the recognition API
    operation = client.long_running_recognize(config=config, audio=audio)
    print("Waiting for operation to complete...")
    # wait for 90 seconds max
    response = operation.result(timeout=90)
    confidences = []
    transcript = ""
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        confidences.append(result.alternatives[0].confidence)
        transcript = transcript + result.alternatives[0].transcript
    # Bug fix: guard the mean — an empty result list previously raised
    # ZeroDivisionError.
    confidence = sum(confidences) / len(confidences) if confidences else 0.0
    return transcript, confidence