def speech_recognize(source_audio):
    """Take a source audio URI, call the google-cloud-speech recognize method
    and return the transcript."""
    # Instantiate client
    from google.cloud import speech
    sp_client = speech.SpeechClient()

    # Speech RecognitionConfig
    rec_config = speech.types.RecognitionConfig(
        encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=_SAMPLE_RATE,
        profanity_filter=True,
        language_code='en-US'
        # max_alternatives,               # Not using but available (0-30)
        # enable_word_time_offsets=True,  # provide time offsets
    )

    # Speech RecognitionAudio from argument
    rec_audio = speech.types.RecognitionAudio(uri=source_audio)

    # Recognition operation
    return sp_client.recognize(rec_config, rec_audio)
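# Most snippets in this section target google-cloud-speech < 2.0, where config
# classes live under speech.types / speech.enums and recognize() takes
# positional (config, audio) arguments. In 2.x those modules were merged into
# the top-level speech package and the methods take keyword arguments. A
# minimal sketch of the same synchronous call against the 2.x surface (the
# sample rate here is a placeholder assumption):
def speech_recognize_v2(source_audio):
    from google.cloud import speech
    client = speech.SpeechClient()
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        profanity_filter=True,
        language_code='en-US')
    audio = speech.RecognitionAudio(uri=source_audio)
    return client.recognize(config=config, audio=audio)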
def transcribe_gcs(gcs_uri):
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    import time

    start = time.time()
    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(language_code='en-US')

    operation = client.long_running_recognize(config, audio)
    print('Waiting for operation to complete...')
    response = operation.result(timeout=9000000)

    # Derive the output name from the GCS object name,
    # e.g. gs://bucket/name.flac -> name.txt.
    filename = gcs_uri.split('/')[3][0:-5]
    with open(filename + ".txt", "w") as gsp:
        for result in response.results:
            gsp.write(result.alternatives[0].transcript)
    end = time.time()
    print(end - start)
def main():
    # language_code = 'en-US'  # a BCP-47 language tag, e.g. te-IN, en-IN
    language_code = 'en-IN'  # a BCP-47 language tag, e.g. te-IN, en-IN

    credentials = service_account.Credentials.from_service_account_file(
        'googleKeys.json')
    client = speech.SpeechClient(credentials=credentials)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(config=config,
                                                        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        print("inside stream")
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)
        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        listen_print_loop(responses)
def listen_mic():
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'en-US'  # a BCP-47 language tag

    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(config=config,
                                                        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)
        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        voice_command = listen_return_speech(responses)
    return voice_command
def transcribe_file(speech_file): """Transcribe the given audio file.""" from google.cloud import speech from google.cloud.speech import enums from google.cloud.speech import types client = speech.SpeechClient() with open(speech_file, 'rb') as audio_file: content = audio_file.read() audio = types.RecognitionAudio(content=content) config = types.RecognitionConfig( # encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=16000, language_code='ko-KR') response = client.recognize(config, audio) # Each result is for a consecutive portion of the audio. Iterate through # them to get the transcripts for the entire audio file. try: return (response.results[0].alternatives[0].transcript) except (IndexError): return ("Out")
def mic():
    language_code = "ko-KR"  # a BCP-47 language tag

    client = speech.SpeechClient()
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
    )
    streaming_config = speech.StreamingRecognitionConfig(config=config,
                                                         interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (speech.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)
        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        listen_print_loop(responses)
def main(sample_rate):
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'ko-KR'  # a BCP-47 language tag

    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate,
        language_code=language_code,
        max_alternatives=1)
    streaming_config = types.StreamingRecognitionConfig(config=config,
                                                        interim_results=True)

    with MicrophoneStream(sample_rate, int(sample_rate / 10)) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)
        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        listen_print_loop(responses)
def transcribe_file_with_word_time_offsets(speech_file):
    '''
    :param speech_file: Path and name of the input audio file.
    :return: A list of the words the user spoke, each with its start and end time.
    '''
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    client = speech.SpeechClient()
    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code='ko-KR',
        enable_word_time_offsets=True)

    response = client.recognize(config, audio)

    word_and_startend_time = []
    for result in response.results:
        alternative = result.alternatives[0]
        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            word_and_startend_time += [[
                word,
                start_time.seconds + start_time.nanos * 1e-9,
                end_time.seconds + end_time.nanos * 1e-9
            ]]
    return word_and_startend_time
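# Note on the seconds/nanos arithmetic above: in google-cloud-speech 2.x the
# word offsets are datetime.timedelta values rather than protobuf Durations,
# so the equivalent (a sketch, not part of the original function) collapses to:
#     start = word_info.start_time.total_seconds()
#     end = word_info.end_time.total_seconds()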
def transcribe_streaming(stream_file):
    """Streams transcription of the given audio file."""
    import io
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    client = speech.SpeechClient()
    with io.open(stream_file, 'rb') as audio_file:
        content = audio_file.read()

    # In practice, stream should be a generator yielding chunks of audio data.
    stream = [content]
    requests = (types.StreamingRecognizeRequest(audio_content=chunk)
                for chunk in stream)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='es-ES')
    streaming_config = types.StreamingRecognitionConfig(config=config)

    # streaming_recognize returns a generator.
    responses = client.streaming_recognize(streaming_config, requests)

    for response in responses:
        # Once the transcription has settled, the first result will contain
        # the is_final result. The other results will be for subsequent
        # portions of the audio.
        for result in response.results:
            print('Finished: {}'.format(result.is_final))
            print('Stability: {}'.format(result.stability))
            alternatives = result.alternatives
            # The alternatives are ordered from most likely to least.
            for alternative in alternatives:
                print('Confidence: {}'.format(alternative.confidence))
                print(u'Transcript: {}'.format(alternative.transcript))
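# The comment in transcribe_streaming() notes that `stream` should really be a
# generator yielding chunks rather than one in-memory blob. A minimal sketch of
# such a generator; the chunk size is an arbitrary assumption, not something
# the snippet specifies:
def chunked_audio(stream_file, chunk_size=32 * 1024):
    import io
    with io.open(stream_file, 'rb') as audio_file:
        while True:
            chunk = audio_file.read(chunk_size)
            if not chunk:
                break
            yield chunk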
def transcribe_file_without_word_time_offsets(speech_file, language):
    """Transcribe the given audio file synchronously and return the transcript."""
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    client = speech.SpeechClient(credentials=credentials)
    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code=language,
        enable_word_time_offsets=True,
        audio_channel_count=1)
        # enableSeparateRecognitionPerChannel=True)

    response = client.recognize(config, audio)

    # Return only the transcript of the first result.
    for result in response.results:
        alternative = result.alternatives[0]
        return alternative.transcript
def run_quickstart():
    # [START speech_quickstart]
    import io
    import os

    # Imports the Google Cloud client library
    # [START migration_import]
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    # [END migration_import]

    # Instantiates a client
    # [START migration_client]
    client = speech.SpeechClient()
    # [END migration_client]

    # The name of the audio file to transcribe
    file_name = os.path.join(os.path.dirname(__file__), 'resources',
                             'audio.raw')

    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='en-US')

    # Detects speech in the audio file
    response = client.recognize(config, audio)

    alternatives = response.results[0].alternatives
    for alternative in alternatives:
        print('Transcript: {}'.format(alternative.transcript))
def performWork(name):
    client = speech.SpeechClient()
    json_arr = []
    extract(name)
    # Derive the .flac name produced by extract() and remove the source file.
    real_name = name[0:len(name) - 3] + ".flac"
    os.remove(name)

    with io.open(real_name, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code='en-US',
        enable_word_time_offsets=True)

    response = client.recognize(config, audio)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        alternative = result.alternatives[0]
        # print(u'Transcript: {}'.format(result.alternatives[0].transcript))
        for word_info in alternative.words:
            word = word_info.word
            json_ret = {}
            start_time = word_info.start_time
            end_time = word_info.end_time
            json_ret['word'] = word
            json_ret['start_time'] = start_time.seconds
            json_ret['end_time'] = end_time.seconds
            json_arr.append(json_ret)

    os.remove(real_name)
    reduce((real_name, json_arr))
def transcribe_gcs_with_word_time_offsets(gcs_uri): """Transcribe the given audio file asynchronously and output the word time offsets.""" from google.cloud import speech from google.cloud.speech import enums from google.cloud.speech import types client = speech.SpeechClient() audio = types.RecognitionAudio(uri=gcs_uri) config = types.RecognitionConfig( encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=16000, language_code='en-US', enable_word_time_offsets=True, model='video') operation = client.long_running_recognize(config, audio) result = operation.result(timeout=200) result_time_offsets = [] for result in result.results: alternative = result.alternatives[0] # print(u'Transcript: {}'.format(alternative.transcript)) # print('Confidence: {}'.format(alternative.confidence)) for word_info in alternative.words: word = word_info.word start_time = word_info.start_time start = start_time.seconds + start_time.nanos * 1e-9 end_time = word_info.end_time end = end_time.seconds + end_time.nanos * 1e-9 result_time_offsets.append((word, start, end)) with open("word-time-offsets", "w") as the_file: csv.register_dialect("custom", delimiter=" ", skipinitialspace=True) writer = csv.writer(the_file, dialect="custom") for tup in result_time_offsets: writer.writerow(tup)
def transcribe_gcs(gcs_uri): """Asynchronously transcribes the audio file specified by the gcs_uri.""" from google.cloud import speech from google.cloud.speech import enums from google.cloud.speech import types client = speech.SpeechClient() audio = types.RecognitionAudio(uri=gcs_uri) config = types.RecognitionConfig( encoding=enums.RecognitionConfig.AudioEncoding.FLAC, sample_rate_hertz=16000, language_code='en-US', enable_word_time_offsets=True) operation = client.long_running_recognize(config, audio) retry_count = 100 while retry_count > 0 and not operation.done(): retry_count -= 1 time.sleep(2) if not operation.done(): print('Operation not complete and retry limit reached.') return alternatives = operation.result().results[0].alternatives for alternative in alternatives: print('Transcript: {}'.format(alternative.transcript)) print('Confidence: {}'.format(alternative.confidence)) for word_info in alternative.words: word = word_info.word start_time = word_info.start_time end_time = word_info.end_time print('Word: {}, start_time: {}, end_time: {}'.format( word, start_time.seconds + start_time.nanos * 1e-9, end_time.seconds + end_time.nanos * 1e-9))
def transcribe_file(speech_file): """Transcribe the given audio file asynchronously.""" from google.cloud import speech client = speech.SpeechClient() # [START speech_python_migration_async_request] with io.open(speech_file, "rb") as audio_file: content = audio_file.read() """ Note that transcription is limited to a 60 seconds audio file. Use a GCS file for audio longer than 1 minute. """ audio = speech.RecognitionAudio(content=content) config = speech.RecognitionConfig( encoding=speech.RecognitionConfig.AudioEncoding.FLAC, sample_rate_hertz=44100, language_code="en-GB", audio_channel_count = 2, ) # [START speech_python_migration_async_response] operation = client.long_running_recognize(config=config, audio=audio) # [END speech_python_migration_async_request] print("Waiting for operation to complete...") response = operation.result(timeout=90) # Each result is for a consecutive portion of the audio. Iterate through # them to get the transcripts for the entire audio file. for result in response.results: # The first alternative is the most likely one for this portion. print(u"Transcript: {}".format(result.alternatives[0].transcript)) print("Confidence: {}".format(result.alternatives[0].confidence))
def stream_speach(self):
    try:
        if get_env_value('DEVICE') == 'PI':
            from lib.PiControls import PiControls
            pi = PiControls()
            pi.flash_blue()
        else:
            print("sorry! can't blink blue, you don't have a pi")
        print('live speech recognition started')
        print(threading.enumerate())

        # See http://g.co/cloud/speech/docs/languages
        # for a list of supported languages.
        language_code = 'en-US'  # a BCP-47 language tag

        credentials = service_account.Credentials.from_service_account_file(
            'google-cloud.json')
        client = speech.SpeechClient(credentials=credentials)
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=RATE,
            language_code=language_code)
        streaming_config = types.StreamingRecognitionConfig(
            config=config,
            interim_results=True)

        with MicrophoneStream(RATE, CHUNK) as stream:
            audio_generator = stream.generator()
            requests = (types.StreamingRecognizeRequest(audio_content=content)
                        for content in audio_generator)
            responses = client.streaming_recognize(streaming_config, requests)

            # Now, put the transcription responses to use.
            self.listen_print_loop(responses)
    except Exception:
        # Restart the stream on any error; note this recurses with no
        # backoff, so a persistent failure will keep retrying.
        print('exception occurred')
        self.stream_speach()
def transcribe_file_with_word_time_offsets(speech_file): """Transcribe the given audio file synchronously and output the word time offsets.""" from google.cloud import speech from google.cloud.speech import enums from google.cloud.speech import types client = speech.SpeechClient() with io.open(speech_file, 'rb') as audio_file: content = audio_file.read() audio = types.RecognitionAudio(content=content) config = types.RecognitionConfig( encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16, language_code='en-US', enable_word_time_offsets=True) response = client.recognize(config, audio) word_with_ts = [] for result in response.results: #print result alternative = result.alternatives[0] print('Transcript: {}'.format(alternative.transcript)) for word_info in alternative.words: word = word_info.word start_time = word_info.start_time end_time = word_info.end_time word_with_ts.append( (word, start_time.seconds + start_time.nanos * 1e-9, end_time.seconds + end_time.nanos * 1e-9)) #print('Word: {}, start_time: {}, end_time: {}'.format( # word, # start_time.seconds + start_time.nanos * 1e-9, # end_time.seconds + end_time.nanos * 1e-9)) return word_with_ts
def transcribe_gcs_return(gcs_uri):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        # encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,  # default: 16000
        language_code='en-US',   # English: en-US, Korean: ko-KR
        model='phone_call')

    operation = client.long_running_recognize(config, audio)
    print('Waiting for operation to complete...')
    response = operation.result(timeout=90)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    result_msg = ''
    confidence = 0
    print('len : ', len(response.results))
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print('Transcript: {}'.format(result.alternatives[0].transcript))
        print('Confidence: {}'.format(result.alternatives[0].confidence))
        result_msg += result.alternatives[0].transcript + ' '
        confidence += result.alternatives[0].confidence

    # Return the result so it can be stored in BigQuery.
    tuple_result_msg = (result_msg, confidence / len(response.results))
    return tuple_result_msg
def wav_conversion(filename, tgt_path):
    print(filename)
    client = speech.SpeechClient()
    corrupted_aud = "corrupted.txt"
    corrupted_log = open(corrupted_aud, "w+")
    # Write the recognized text.
    fh = open(tgt_path, "w+")
    try:
        # Loads the audio into memory
        with io.open(filename, 'rb') as audio_file:
            content = audio_file.read()
        audio = types.RecognitionAudio(content=content)
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code='en-US',
            model='video')

        response = client.recognize(config, audio)
        for result in response.results:
            print(result)
            rec = result.alternatives[0].transcript
            fh.write(rec + ". ")
        # with sr.AudioFile(filename) as source:
        #     audio = r.record(source)  # read the entire audio file
        #     rec = r.recognize_google(audio)
        #     fh.write(rec + ". ")
    # Catch any errors (the original also caught sr.UnknownValueError here,
    # but left it without a body, which is a syntax error).
    except Exception:
        corrupted_log.write(filename + '\n')
        print("Could not understand audio")
    finally:
        fh.close()
        corrupted_log.close()
def main():
    procs = []
    # Queue used to check whether a speaker is on screen and there is a
    # string to display.
    q = Queue()
    # Queue that decides whether to show a small or a large speech bubble.
    bubble_q = Queue()
    # Start the frame-rendering program in a separate process.
    proc = Process(target=visualize_frame, args=(q, bubble_q,))
    procs.append(proc)
    proc.start()

    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'ko-KR'  # a BCP-47 language tag

    # Send microphone speech to the Google Cloud Speech-to-Text API and store
    # the transcript in the speech bubble.
    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(config=config,
                                                        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)
        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        listen_print_loop(responses, q, bubble_q)
def google_transcribe(audio_file_name):
    file_name = audio_file_name
    mp3_to_wav(file_name)

    # The name of the audio file to transcribe
    frame_rate, channels = frame_rate_channel(file_name)

    if channels > 1:
        stereo_to_mono(file_name)

    bucket_name = bucketname
    source_file_name = audio_file_name
    destination_blob_name = audio_file_name

    upload_blob(bucket_name, source_file_name, destination_blob_name)

    gcs_uri = 'gs://' + bucketname + '/' + audio_file_name
    transcript = ''

    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code='en-US',
        enable_automatic_punctuation=True)

    # Detects speech in the audio file
    operation = client.long_running_recognize(config, audio)
    response = operation.result(timeout=10000)

    for result in response.results:
        transcript += result.alternatives[0].transcript

    delete_blob(bucket_name, destination_blob_name)
    return transcript
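# google_transcribe() above leans on helpers (mp3_to_wav, frame_rate_channel,
# stereo_to_mono, upload_blob, delete_blob) defined elsewhere. A minimal
# sketch of frame_rate_channel, assuming the input is a WAV file, using only
# the standard-library wave module:
def frame_rate_channel(audio_file_name):
    import wave
    with wave.open(audio_file_name, "rb") as wave_file:
        frame_rate = wave_file.getframerate()
        channels = wave_file.getnchannels()
    return frame_rate, channels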
def get_text_from_audio():
    # os.system('export ...') would only set the variable in a throwaway
    # subshell; set it in this process's environment instead so the client
    # library can pick it up.
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = (
        'analyze_tone_from_voice/Google_cloud_key/My_First_Project_926af8a5744c.json')
    os.system(
        'ffmpeg -i analyze_tone_from_voice/input.m4a -acodec libmp3lame '
        '-ab 128k analyze_tone_from_voice/input.mp3')
    os.system(
        'sox analyze_tone_from_voice/input.mp3 --rate 16k --bits 16 '
        '--channels 1 analyze_tone_from_voice/input.flac')

    # Instantiates a client
    client = speech.SpeechClient()

    # The name of the audio file to transcribe
    file_name = 'analyze_tone_from_voice/input.flac'

    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
    audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code='en-US')

    # Detects speech in the audio file
    response = client.recognize(config, audio)
    text = response.results[0].alternatives[0].transcript
    # print('Transcript: {}'.format(text))

    # Remove the intermediate audio files.
    os.system('rm analyze_tone_from_voice/input.mp3')
    os.system('rm analyze_tone_from_voice/input.flac')
    return text
def get_and_save_raw(input_path: str, bucket_name: str, out_path: str,
                     verbosity: int) -> None:
    """
    Gets raw JSON from the Google Cloud Speech-to-Text API.

    :param input_path: Path to read files from
    :param bucket_name: Name of the GCS bucket
    :param out_path: Path to write the JSON output to
    :param verbosity: Verbosity level
    :return: None
    """
    bin_print(verbosity, 1, "Reading files from", input_path)
    bin_print(verbosity, 2, "Trying to find all .flac files...")
    flac_files = [
        f for f in listdir(input_path)
        if isfile(join(input_path, f)) and f.endswith(".flac")
    ]
    bin_print(verbosity, 3, "Found flac files:", flac_files)
    bin_print(verbosity, 3, "Total flac files:", len(flac_files))

    client = speech.SpeechClient()

    bin_print(verbosity, 1, "Running Google STT...")
    for flac_file in flac_files:
        if "stadt_zuerich" in flac_file:
            bin_print(verbosity, 2, "Processing " + flac_file)
            try:
                json = get_raw("gs://" + bucket_name + "/" + flac_file, client)
                json_path = out_path + "/" + flac_file.replace(
                    ".flac", "_google_output") + ".json"
                bin_print(verbosity, 2, "Writing " + json_path)
                f = open(json_path, "w")
                f.write(json)
                f.close()
            except _OperationNotComplete:
                bin_print(verbosity, 1, "Timeout for " + flac_file)
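# get_and_save_raw() above calls a get_raw() helper that is not shown. A
# minimal sketch under the same pre-2.0 API, serializing the long-running
# response to JSON via the protobuf helpers; the config values (FLAC encoding,
# de-CH, the 90 s timeout) are assumptions, not taken from the original:
def get_raw(gcs_uri, client):
    from google.protobuf.json_format import MessageToJson
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        language_code='de-CH')
    operation = client.long_running_recognize(config, audio)
    response = operation.result(timeout=90)
    return MessageToJson(response)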
def legenda_gcs(gcs_uri):
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=44100,
        language_code='pt-BR')

    operation = client.long_running_recognize(config, audio)
    print('Waiting for the operation to complete...')
    response = operation.result(timeout=3000)

    file = open(r'c:\legenda2.txt', 'w')
    for result in response.results:
        # print(u'Caption: {}'.format(result.alternatives[0].transcript))
        # print('Confidence: {}'.format(result.alternatives[0].confidence))
        file.write('{}\n'.format(result.alternatives[0].transcript))
    file.close()

# ffmpeg.exe -i "flac1.flac" -ac 1 mono1.flac
# gsutil acl ch -u AllUsers:R "gs://legenda/mono02.flac"  # make the file public in Storage
# set GOOGLE_APPLICATION_CREDENTIALS=c:\MyProject.json
# export GOOGLE_APPLICATION_CREDENTIALS="/home/joao_sinieghi/chave.json"

## Python development environment ##
# cd your-project
# virtualenv --python python3 env
# .\env\Scripts\activate
# source env/bin/activate
def main():
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    # The language code below can be changed.
    language_code = 'ko-KR'  # a BCP-47 language tag

    # Load the language dataset into Snips NLU.
    with io.open("./lights_dataset_train_ko.json", encoding="utf8") as f:
        sample_dataset = json.load(f)
    nlu_engine = SnipsNLUEngine(config=CONFIG_KO)
    nlu_engine = nlu_engine.fit(sample_dataset)
    print("Success")
    # Preprocessing done.

    # Start speech recognition.
    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code)
    streaming_config = types.StreamingRecognitionConfig(config=config,
                                                        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)
        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        # This loops until the stream ends.
        listen_print_loop(responses, nlu_engine)
def transcribe_file(fileName):
    """Convert given audio file to single channel."""
    monoFileName = uploader._safe_filename('mono.wav')
    sound = AudioSegment.from_file('./' + fileName)
    sound = sound.set_channels(1)
    sound = sound.set_sample_width(2)
    duration_in_milliseconds = len(sound)
    sound.export(monoFileName, format='wav')

    # Transcribe the given audio file.
    client = speech.SpeechClient()

    # [START migration_sync_request]
    # [START migration_audio_config_file]
    with io.open(monoFileName, 'rb') as audio_file:
        content = audio_file.read()

    gcs_uri = uploader.upload_file(content, monoFileName, 'audio/wav')
    plotGraph(monoFileName, gcs_uri)

    audio = types.cloud_speech_pb2.RecognitionAudio(content=content)
    config = types.cloud_speech_pb2.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code='en-IN')
    # [END migration_audio_config_file]

    # [START migration_sync_response]
    response = client.recognize(config, audio)
    # [END migration_sync_request]

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    text = ''
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        text += u'{}'.format(result.alternatives[0].transcript)
        if len(result.alternatives) > 0:
            text += ' '
    # [END migration_sync_response]
    return [duration_in_milliseconds, text, gcs_uri]
def transcribe_gcs_with_word_time_offsets(gcs_uri): """Transcribe the given audio file asynchronously and output the word time offsets.""" from google.cloud import speech from google.cloud.speech import enums from google.cloud.speech import types client = speech.SpeechClient() audio = types.RecognitionAudio(uri=gcs_uri) config = types.RecognitionConfig( encoding=enums.RecognitionConfig.AudioEncoding.FLAC, sample_rate_hertz=16000, language_code='en-US', enable_word_time_offsets=True) operation = client.long_running_recognize(config, audio) print('Waiting for operation to complete...') result = operation.result(timeout=90) for result in result.results: alternative = result.alternatives[0] print('Transcript: {}'.format(alternative.transcript)) print('Confidence: {}'.format(alternative.confidence)) for word_info in alternative.words: word = word_info.word print(word) start_time = word_info.start_time end_time = word_info.end_time if word in bad_words_list_final1: aList.append(word) aList.append(start_time) aList.append(end_time) print('Word: {}, start_time: {}, end_time: {}'.format( word, start_time.seconds + start_time.nanos * 1e-9, end_time.seconds + end_time.nanos * 1e-9))
def transcribe_file(speech_file): """Transcribe the given audio file asynchronously.""" from google.cloud import speech from google.cloud.speech import enums from google.cloud.speech import types client = speech.SpeechClient() # [START migration_async_request] with io.open(speech_file, 'rb') as audio_file: content = audio_file.read() audio = types.RecognitionAudio(content=content) config = types.RecognitionConfig( encoding=enums.RecognitionConfig.AudioEncoding.FLAC, sample_rate_hertz=16000, language_code='en-US') # [START migration_async_response] operation = client.long_running_recognize(config, audio) # [END migration_async_request] print('Waiting for operation to complete...') response = operation.result(timeout=90) # Each result is for a consecutive portion of the audio. Iterate through # them to get the transcripts for the entire audio file. f = open("speech_file.html", "w+") for result in response.results: # The first alternative is the most likely one for this portion. #print(u'Transcript: {}'.format(result.alternatives[0].transcript)) #print('Confidence: {}'.format(result.alternatives[0].confidence)) f.write(format(result.alternatives[0].transcript)) # [END migration_async_response] f.close() webbrowser.open( 'file:///media/spritle/ACER%20DATA/Health%20NLP/audiototext/speech_file.html' )
def sub_main(profanityFilterBool):
    """
    *** Code taken from Google Cloud Speech-to-Text documentation ***
    Turns on the profanity filter so bad words are censored and not printed.
    """
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = 'en-US'  # a BCP-47 language tag

    sp_c_cico = {"phrases": ["cico"], "boost": 20}  # speech_contexts_cico
    # sp_c_kiko = {"phrases": ["Kiko"], "boost": 0}  # speech_contexts_kiko
    speech_contexts = [sp_c_cico]

    client = speech.SpeechClient()
    # print(help(types.RecognitionConfig))
    # Note: profanityFilterBool is never applied to the config below.
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
        enable_automatic_punctuation=True,
        speech_contexts=speech_contexts)
    streaming_config = types.StreamingRecognitionConfig(
        config=config,
        interim_results=True)

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)
        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        solution = returnResponseString(responses)  # solution is the result
        append_to_file("log.txt", str(solution))
        return solution
def main():
    client = speech.SpeechClient()
    config = speech.types.RecognitionConfig(
        encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=SAMPLE_RATE,
        language_code='en-US',
        max_alternatives=1,
        enable_word_time_offsets=True)
    streaming_config = speech.types.StreamingRecognitionConfig(
        config=config,
        interim_results=True)
    mic_manager = ResumableMicrophoneStream(SAMPLE_RATE, CHUNK_SIZE)

    print('Say "Quit" or "Exit" to terminate the program.')

    with mic_manager as stream:
        while not stream.closed:
            audio_generator = stream.generator()
            requests = (speech.types.StreamingRecognizeRequest(
                audio_content=content) for content in audio_generator)
            responses = client.streaming_recognize(streaming_config, requests)

            # Now, put the transcription responses to use.
            listen_print_loop(responses, stream)