def transcribe_gcs(gcs_uri):
    """Asynchronously transcribes the audio file at gcs_uri, writes the
    transcript to a local .txt file, and prints the elapsed time."""
    from google.cloud import speech
    from google.cloud.speech import types
    import time

    start = time.time()
    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(language_code='en-US')

    operation = client.long_running_recognize(config, audio)
    print('Waiting for operation to complete...')
    response = operation.result(timeout=9000000)

    # Derive the output name from the object name in the gs:// URI,
    # dropping a 5-character extension such as '.flac'.
    filename = gcs_uri.split('/')[3][0:-5]
    with open(filename + '.txt', 'w') as gsp:
        for result in response.results:
            gsp.write(result.alternatives[0].transcript)

    end = time.time()
    print(end - start)

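# A minimal, hypothetical usage sketch for transcribe_gcs above; the
# bucket and object names are placeholders, not from the original source.
if __name__ == '__main__':
    transcribe_gcs('gs://my-bucket/recording.flac')
    # Writes 'recording.txt' to the working directory and prints the
    # elapsed time in seconds.
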
def transcribe_gcs(gcs_uri, language_code):
    """Transcribes the audio file specified by the gcs_uri."""
    from collections import Counter
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code=language_code,
        enable_word_time_offsets=True)

    operation = client.long_running_recognize(config, audio)
    print('Waiting for operation to complete...')
    response = operation.result(timeout=120)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        words = [w.word for w in result.alternatives[0].words]
        word_counts = Counter(words)
        print(word_counts)

def main():
    # Assumes module-level imports of speech/enums/types from
    # google.cloud.speech, plus RATE, CHUNK, MicrophoneStream and
    # listen_print_loop defined elsewhere in the module.
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'en-US'  # a BCP-47 language tag

    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(
        config=config,
        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)

        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        listen_print_loop(responses)

def transcribe_file(speech_file):
    """Transcribe the given audio file."""
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    client = speech.SpeechClient()
    with open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        # encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='ko-KR')

    response = client.recognize(config, audio)

    # Each result is for a consecutive portion of the audio. Return the
    # transcript of the first portion, or a sentinel if nothing was
    # recognized.
    try:
        return response.results[0].alternatives[0].transcript
    except IndexError:
        return "Out"

def __init__(self, callback):
    # Assumes this class subclasses threading.Thread and that speech,
    # enums, types and RATE are available at module level.
    Thread.__init__(self)
    self.callback = callback
    self.client = speech.SpeechClient()

    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'en-US'  # a BCP-47 language tag
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    self.streaming_config = types.StreamingRecognitionConfig(
        config=config,
        interim_results=True)

    # Note: this loop never returns, so __init__ blocks forever.
    while True:
        print("Listen again?")
        self.listen()

def googleApiCall(path):
    # Assumes io, speech, enums and types are imported at module level and
    # that formatResponse() is defined elsewhere.
    # Instantiates a client
    client = speech.SpeechClient()

    # The name of the audio file to transcribe
    file_name = path  # e.g. '/home/sanghs3/Capstone/umm.flac'

    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=48000,
        language_code='en-US',
        enable_word_time_offsets=True)

    # Detects speech in the audio file
    response = client.recognize(config, audio)
    response = formatResponse(response)
    return response

def transcribe_gcs(gcs_uri):
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code='en-US')

    operation = client.long_running_recognize(config, audio)
    print('Waiting for operation to complete...')
    response = operation.result(timeout=100)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(u'Transcript: {}'.format(result.alternatives[0].transcript))
        print('Confidence: {}'.format(result.alternatives[0].confidence))

def transcribe_file(speech_file):
    """Transcribe the given audio file."""
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    import io
    import json

    client = speech.SpeechClient()
    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code='en-US')

    response = client.recognize(config, audio)

    # Print the first alternative of all the consecutive results.
    # print(json.dumps(response))
    for result in response.results:
        print('Transcript: {}'.format(result.alternatives[0].transcript))

def transcribe_gcs(gcs_uri):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        language_code='pt-BR')

    operation = client.long_running_recognize(config, audio)
    print('Waiting for operation to complete...')
    response = operation.result(timeout=1800)
    print('Done with operation')

    # Print the first alternative of all the consecutive results.
    for result in response.results:
        print('Transcript: {}'.format(result.alternatives[0].transcript))
        print('Confidence: {}'.format(result.alternatives[0].confidence))

def audio_to_text():
    import io
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    # Instantiates a client
    client = speech.SpeechClient()

    file_name = "/Users/amirulislam/Downloads/announce.wav"
    print("File is ", file_name)

    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=48000,
        language_code='ar-QA',
        audio_channel_count=1)

    # Detects speech in the audio file
    response = client.recognize(config, audio)
    for result in response.results:
        print('Transcript: {}'.format(result.alternatives[0].transcript))

    return response.results[0].alternatives[0].transcript

def __init__(self, credentials=None, RATE=16000, CHUNK=1024,
             language_code='en-US'):
    # Assumes module-level imports: speech, enums and types from
    # google.cloud.speech, plus google.oauth2.service_account.
    self.RATE = RATE
    self.CHUNK = CHUNK
    self.language_code = language_code

    if credentials is not None:
        credentials = service_account.Credentials.from_service_account_file(
            credentials)
    self.client = speech.SpeechClient(credentials=credentials)

    # TODO: speech_contexts -> https://cloud.google.com/speech-to-text/docs/basics#phrase-hints
    self.config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
        model='command_and_search',
        enable_automatic_punctuation=True)
    self.streaming_config = types.StreamingRecognitionConfig(
        config=self.config,
        interim_results=True,
        single_utterance=True)

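# A hedged sketch (not from the original source) of how the recognizer
# configured in __init__ above might be driven: wrap raw LINEAR16 chunks
# in StreamingRecognizeRequest messages and yield final transcripts. The
# `recognizer` instance and `audio_chunks` iterable are hypothetical.
def stream_transcripts(recognizer, audio_chunks):
    from google.cloud.speech import types

    requests = (types.StreamingRecognizeRequest(audio_content=chunk)
                for chunk in audio_chunks)
    responses = recognizer.client.streaming_recognize(
        recognizer.streaming_config, requests)
    for response in responses:
        for result in response.results:
            # With interim_results=True, skip the partial hypotheses.
            if result.is_final:
                yield result.alternatives[0].transcript
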
def transcribe_gcs(gcs_uri):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code='en-US',
        enable_word_time_offsets=True)

    operation = client.long_running_recognize(config, audio)
    print('Waiting for operation to complete...')
    response = operation.result(timeout=90)

    # Print every alternative of the first result.
    alternatives = response.results[0].alternatives
    for alternative in alternatives:
        print('Transcript: {}'.format(alternative.transcript))
        print('Confidence: {}'.format(alternative.confidence))

def get_text(file_name):
    import io
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        # sample_rate_hertz=16000,
        language_code='en-US')

    # Instantiate a client
    client = speech.SpeechClient()

    # Detects speech in the audio file
    response = client.recognize(config, audio)

    text = ""
    for result in response.results:
        text += result.alternatives[0].transcript
        # print('Transcript: {}'.format(result.alternatives[0].transcript))
    return text

def main():
    # Assumes module-level speech/enums/types imports plus RATE, CHUNK,
    # MicrophoneStream and listen_print_loop defined elsewhere.
    language_code = 'zh-TW'

    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(
        config=config,
        # My own experiment:
        # single_utterance=True,
        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)

        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        listen_print_loop(responses)

def google_stt_streaming(self, socket_action):
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'ko-KR'  # a BCP-47 language tag

    # for content in comuni.get_data(client_record):
    #     print("Type >> {}".format(type(content)))

    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(
        config=config,
        interim_results=True)

    requests = (types.StreamingRecognizeRequest(audio_content=content)
                for content in socket_action.get_data())
    responses = client.streaming_recognize(streaming_config, requests)

    # Now, put the transcription responses to use.
    stt_text = self.listen_print_loop(responses)
    return stt_text

def recognize():
    import io
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    global text
    global conf

    # Instantiates a client
    client = speech.SpeechClient()

    # The name of the audio file to transcribe
    file_name = 'output.wav'

    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='en-US')

    # Detects speech in the audio file
    response = client.recognize(config, audio)
    for result in response.results:
        text = str(result.alternatives[0].transcript)
        conf = str(result.alternatives[0].confidence)
    return text, conf

def run(self):
    # Assumes module-level: speech/enums/types imports, RATE, a `done`
    # threading.Event, `stt_buffer` and `text_out` queues, and the
    # datetime/queue imports; items in stt_buffer are numpy arrays.
    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code='en-US')

    while not done.is_set():
        try:
            data = stt_buffer.get(timeout=0.1)
        except queue.Empty:
            continue

        print(datetime.datetime.now(),
              "Requesting transcription of %d data frames" % data.shape[0])
        audio = types.RecognitionAudio(content=data.tobytes())
        response = client.recognize(config, audio)
        print(datetime.datetime.now(), "Transcription complete: %r" % response)

        for result in response.results:
            text = result.alternatives[0].transcript
            print(datetime.datetime.now(), "Text: %r" % text)
            if text:
                text_out.put(text)
            else:
                print(datetime.datetime.now(),
                      'Decoding complete. No text returned')

def transcribe_large(full_audio_name, frame_rate, language_code):
    # Assumes ntpath, const and the speech client modules are imported at
    # module level, and that upload_to_bucket / delete_from_bucket are
    # defined elsewhere (see the sketch below).
    audio_file_name = ntpath.basename(full_audio_name)
    upload_to_bucket(const.TRANSCRIPTION_GOOGLE_BUCKET, full_audio_name,
                     audio_file_name)
    gcs_uri = 'gs://' + const.TRANSCRIPTION_GOOGLE_BUCKET + '/' + audio_file_name

    client = speech.SpeechClient()
    audio = speech_types.RecognitionAudio(uri=gcs_uri)
    config = speech_types.RecognitionConfig(
        encoding=speech_enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code=language_code,
    )

    # Detects speech in the audio file
    operation = client.long_running_recognize(config, audio)
    response = operation.result(timeout=10000)

    delete_from_bucket(const.TRANSCRIPTION_GOOGLE_BUCKET, audio_file_name)
    return response

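# transcribe_large above calls upload_to_bucket and delete_from_bucket,
# which are defined elsewhere. A minimal sketch of what they might look
# like with the google-cloud-storage client; the signatures are inferred
# from the call sites, not taken from the original source.
def upload_to_bucket(bucket_name, local_path, blob_name):
    from google.cloud import storage

    client = storage.Client()
    bucket = client.bucket(bucket_name)
    bucket.blob(blob_name).upload_from_filename(local_path)


def delete_from_bucket(bucket_name, blob_name):
    from google.cloud import storage

    client = storage.Client()
    client.bucket(bucket_name).blob(blob_name).delete()
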
def main():
    # Assumes module-level speech/enums/types imports, RATE, CHUNK,
    # MicrophoneStream and listen_print_loop, plus
    # google.oauth2.service_account.
    # language_code = 'en-US'  # a BCP-47 language tag (te-IN, en-IN)
    language_code = 'en-IN'  # a BCP-47 language tag (te-IN, en-IN)

    credentials = service_account.Credentials.from_service_account_file(
        'googleKeys.json')
    client = speech.SpeechClient(credentials=credentials)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(
        config=config,
        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        print("inside stream")
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)

        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        listen_print_loop(responses)

def analyze_text(self):
    """Transcribes the speech file and returns the text as a string. Can
    only be called after self.load(); the file must be a mono .wav."""
    import io
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    client = speech.SpeechClient()
    with io.open(self.filename, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        # sample_rate_hertz=44100,
        language_code='en-US')

    response = client.recognize(config, audio)

    text = ""
    for result in response.results:
        # print('Transcript: {}'.format(result.alternatives[0].transcript))
        text += '{}'.format(result.alternatives[0].transcript)
    return text

def start(self, window):
    # Assumes module-level speech/enums/types imports plus RATE, CHUNK and
    # MicrophoneStream defined elsewhere.
    self.window = window
    language_code = 'ko-KR'  # a BCP-47 language tag

    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(
        config=config,
        interim_results=True)

    # To run indefinitely, wrap the block below in 'while True:'.
    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)

        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        self.listen_print_loop(responses)

def _transcribe(self, uri, audio):
    # Assumes time, logger and the Time helper are available at module
    # level, and that self.lang and self.TIMEOUT are set on the instance.
    client = speech.SpeechClient()

    # NOTE -- according to https://cloud.google.com/speech/quotas we are
    # limited to ~180 minutes.
    recog = types.RecognitionAudio(uri=uri)

    # https://cloud.google.com/speech/reference/rest/v1/RecognitionConfig
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.OGG_OPUS,
        sample_rate_hertz=audio.frame_rate,
        language_code=self.lang,
        enable_word_time_offsets=True)

    # https://cloud.google.com/speech/docs/async-recognize
    # https://googlecloudplatform.github.io/google-cloud-python/latest/speech/index.html#asynchronous-recognition
    operation = client.long_running_recognize(config, recog)

    start = time.time()
    logger.info('Beginning transcription')
    response = operation.result(timeout=self.TIMEOUT)
    stop = time.time()
    logger.info("Transcription finished in {}".format(Time(stop - start)))

    return response

def translate_with_timestamps(self, gs_uri):
    # Assumes self.client is a speech.SpeechClient and that enums/types
    # are imported at module level.
    audio = types.RecognitionAudio(uri=gs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        language_code='en-US',
        # sample_rate_hertz=44100,
        enable_word_time_offsets=True)

    operation = self.client.long_running_recognize(config=config, audio=audio)

    results = []
    for result in operation.result().results:
        alternatives = result.alternatives
        if len(alternatives) == 0:
            continue
        alternative = alternatives[0]
        for word_info in alternative.words:
            word = word_info.word
            # Combine the seconds and nanos fields into a float, rounded
            # to one decimal place.
            start_time = word_info.start_time.seconds + round(
                word_info.start_time.nanos * 1e-9, 1)
            end_time = word_info.end_time.seconds + round(
                word_info.end_time.nanos * 1e-9, 1)
            results.append([word, start_time, end_time])
    return results

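# A hypothetical usage sketch for translate_with_timestamps above; the
# `transcriber` instance and the gs:// URI passed to it are placeholders,
# not from the original source.
def print_word_timings(transcriber, gs_uri):
    for word, start_s, end_s in transcriber.translate_with_timestamps(gs_uri):
        print('{:6.1f}s  {:6.1f}s  {}'.format(start_s, end_s, word))
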
def transcribe_gcs(uri_gs):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=uri_gs)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        language_code='ja-JP')

    operation = client.long_running_recognize(config, audio)
    print('Waiting for operation to complete...')
    response = operation.result(timeout=9000)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    with open('{}.txt'.format(uri_gs.split('/')[-1]), 'w') as f:
        for result in response.results:
            # The first alternative is the most likely one for this portion.
            f.write(result.alternatives[0].transcript)

def audio_to_text(fname):
    # Assumes io, speech, enums and types are imported at module level;
    # mp3_to_flac is defined elsewhere (see the sketch below).
    flacfile = mp3_to_flac(fname)
    client = speech.SpeechClient()

    transcript = ''
    with io.open(flacfile, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=22050,
        language_code='en-US')

    # Detects speech in the audio file
    response = client.recognize(config, audio)

    # Concatenate the portions rather than keeping only the last one.
    for result in response.results:
        transcript += result.alternatives[0].transcript
        # print('hs', transcript)
    return transcript

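# audio_to_text above depends on an mp3_to_flac helper defined elsewhere.
# One possible implementation using pydub (which requires ffmpeg); this is
# a sketch under those assumptions, not the original author's code.
def mp3_to_flac(fname):
    from pydub import AudioSegment

    flacfile = fname.rsplit('.', 1)[0] + '.flac'
    sound = AudioSegment.from_mp3(fname)
    # Cloud Speech expects mono audio; 22050 Hz matches the config above.
    sound = sound.set_channels(1).set_frame_rate(22050)
    sound.export(flacfile, format='flac')
    return flacfile
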
def __init__(self, engine, args, loop):
    # Assumes this class subclasses threading.Thread and that speech,
    # enums, types and RATE are available at module level.
    Thread.__init__(self)
    self.client = speech.SpeechClient()
    self.args = args
    self.engine = engine
    self.stop_recognition = False
    self.loop = loop
    self.role = "UNKNOWN"
    self.crash = False

    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'en-US'  # a BCP-47 language tag
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    self.streaming_config = types.StreamingRecognitionConfig(
        config=config,
        interim_results=True)

def transcribe_file(speech_file):
    import io
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    client = speech.SpeechClient()
    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code='en-US')

    operation = client.long_running_recognize(config, audio)
    print('Waiting for operation to complete...')
    response = operation.result(timeout=90)

    # Print each transcript and write all of them to the temp file, rather
    # than only the last result.
    with open("./test.tmp", 'w') as f:
        for result in response.results:
            print(u'{}'.format(result.alternatives[0].transcript))
            f.write(result.alternatives[0].transcript)

def transcribe_file(speech_file, textFile):
    """Transcribe the given audio file."""
    import io
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    client = speech.SpeechClient()
    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code='es-MX')

    response = client.recognize(config, audio)

    # Write the transcripts as UTF-8 text rather than the str() of an
    # encoded bytes object.
    with io.open(textFile, 'w', encoding='utf-8') as fileContent:
        for result in response.results:
            transcript = result.alternatives[0].transcript
            print(transcript)
            fileContent.write(transcript)

def transcribe_file(self, file_name):
    # Assumes io, enums and types are imported at module level and that
    # self.client is a speech.SpeechClient.
    client = self.client

    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code='en-US')

    # Detects speech in the audio file
    response = client.recognize(config, audio)

    transcripts = []
    for result in response.results:
        if result.alternatives[0].transcript:
            transcripts.append(result.alternatives[0].transcript)
    return transcripts

def transcribe_streaming(stream_file):
    """Streams transcription of the given audio file."""
    import io
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    client = speech.SpeechClient()
    with io.open(stream_file, 'rb') as audio_file:
        content = audio_file.read()

    # In practice, stream should be a generator yielding chunks of audio
    # data (see the sketch below).
    stream = [content]
    requests = (types.StreamingRecognizeRequest(audio_content=chunk)
                for chunk in stream)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='es-ES')
    streaming_config = types.StreamingRecognitionConfig(config=config)

    # streaming_recognize returns a generator.
    responses = client.streaming_recognize(streaming_config, requests)

    for response in responses:
        # Once the transcription has settled, the first result will contain
        # the is_final result. The other results will be for subsequent
        # portions of the audio.
        for result in response.results:
            print('Finished: {}'.format(result.is_final))
            print('Stability: {}'.format(result.stability))
            alternatives = result.alternatives
            # The alternatives are ordered from most likely to least.
            for alternative in alternatives:
                print('Confidence: {}'.format(alternative.confidence))
                print(u'Transcript: {}'.format(alternative.transcript))

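# A sketch of the chunked generator the comment above alludes to; the
# function name and chunk size are choices of this sketch, not from the
# original source. Reading the file in small pieces keeps each streaming
# request well under the API's per-request size limit.
def file_chunks(path, chunk_size=32 * 1024):
    import io

    with io.open(path, 'rb') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                return
            yield chunk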