def convert_sound_file(filename, language='en-US', wait=1200, keep_on_gs=False):
    """Convert a sound file to a transcript using the Google Speech API.

    Files are converted to FLAC encoding if they are not already.
    Large files have to be stored on a gs bucket to avoid timeouts.

    Args:
        filename (string): name of the file to convert
        language (string): language of voices in file (default: en-US)
        wait (int): time in seconds to wait for long-running operations
            (if 0, return immediately; audio should then be under 1 min)
        keep_on_gs (bool): keep the uploaded file on the gs bucket

    Returns:
        Sound file transcript (first alternative of each result) if wait > 0,
        the operation name if wait == 0.
    """
    working_filename = filename

    # Convert the file to FLAC if needed
    _, file_extension = os.path.splitext(filename)
    if file_extension != '.flac':  # "is not" compared identity, not equality
        working_filename = _convert_with_ffmpeg(filename)

    # Upload the file if necessary.
    # Optimal size threshold to be determined, always upload for now.
    size = os.path.getsize(working_filename)
    uploaded = (size > 0)
    if uploaded:
        upload_uri = _upload_to_gs(working_filename,
                                   delete=(filename != working_filename))

    # Instantiate a Speech client using credentials
    client = speech.SpeechClient(credentials=_get_credentials())

    # Load the audio, either from the bucket or into memory
    if uploaded:
        audio = types.RecognitionAudio(uri=upload_uri)
        print("URI:", upload_uri)  # only defined when uploaded
    else:
        with io.open(working_filename, 'rb') as audio_file:
            content = audio_file.read()
        audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        language_code=language)

    # Detect speech in the audio file (long-running operation)
    operation = client.long_running_recognize(config, audio)
    operation_name = operation.operation_name()
    if wait > 0:
        print("Operation:", operation_name)
        retry_count = wait // 10 + 1
        while retry_count > 0 and not operation.done():
            retry_count -= 1
            time.sleep(10)
            progress = operation.metadata().progress_percent
            print("Progress:", progress)
        if not operation.done():
            raise TimeoutError("Conversion not completed before end of retries")
        response = operation.result()
        transcript = ''
        for result in response.results:
            # Several alternatives could be proposed,
            # but generally only one is available
            transcript += result.alternatives[0].transcript + '\n'
        if uploaded and not keep_on_gs:
            _delete_from_gs(upload_uri)
        return transcript
    else:
        return operation_name
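# Usage sketch: 'meeting.mp3' is a hypothetical local file; the module-level
# helpers (_convert_with_ffmpeg, _upload_to_gs, _get_credentials,
# _delete_from_gs) must be configured for this to run.
if __name__ == '__main__':
    text = convert_sound_file('meeting.mp3', language='en-US', wait=600)
    print(text)  # with wait=0, the operation name is returned instead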
def main(name):
    # [START speech_quickstart]
    import io
    import os
    from pydub import AudioSegment

    # Imports the Google Cloud client library
    # [START migration_import]
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    # AudioSegment.ffmpeg = "C:\\ffmpeg\\ffmpeg\\bin\\ffmpeg.exe"
    # AudioSegment.ffprobe = "C:\\ffmpeg\\ffmpeg\\bin\\ffprobe.exe"
    # [END migration_import]

    # Instantiates a client
    # [START migration_client]
    client = speech.SpeechClient()
    # [END migration_client]

    images_dir = "C:/Users/multicampus/Documents/s03p23d107/backend/AI/captioning/test/images/"

    # The name of the audio file to transcribe
    file_name = name
    print(file_name)
    str_name = str(file_name).split('.')[-1]       # extension
    str_firstname = str(file_name).split('.')[-2]  # base name
    filename1 = images_dir + str(file_name)

    def recognize_wav(wav_path):
        """Run synchronous recognition on a mono WAV file and return the transcript."""
        sound = AudioSegment.from_wav(wav_path)
        frames_per_second = sound.frame_rate
        print(frames_per_second)

        # Loads the audio into memory
        with io.open(wav_path, 'rb') as audio_file:
            content = audio_file.read()
        audio = types.RecognitionAudio(content=content)
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=frames_per_second,
            language_code='en')

        # Detects speech in the audio file
        response = client.recognize(config, audio)
        os.remove(wav_path)
        stt = None
        for result in response.results:
            print('Transcript: {}'.format(result.alternatives[0].transcript))
            stt = result.alternatives[0].transcript
        return stt

    # The original repeated near-identical m4a/mp4/mp3 branches (and used
    # inconsistent "_trans"/"trans" file names); they are folded together here.
    if str_name in ("m4a", "mp4", "mp3"):
        print(str_name)
        AudioSegment.converter = "C:/ffmpeg-4.3.1-2020-09-21-full_build/bin/ffmpeg.exe"
        sound1 = AudioSegment.from_file(filename1, str_name)
        os.remove(filename1)
        # Convert to a mono WAV next to the source file
        filename_t = images_dir + str_firstname + "_trans_{}.wav".format(str_name)
        sound = sound1.set_channels(1)
        sound.export(filename_t, format="wav")
        return recognize_wav(filename_t)

    if str_name == "wav":
        print("wav")
        sound = AudioSegment.from_wav(filename1)
        sound = sound.set_channels(1)
        sound.export(filename1, format="wav")  # overwrite in place with mono audio
        return recognize_wav(filename1)
    # [END speech_quickstart]
def run_google_speech(filename):
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.join(
        CONFIG('project_directory'), 'testing_webpage',
        'google_cloud_speech_key.json')

    client = speech.SpeechClient()

    # The name of the audio file to transcribe
    filename = common.get_media_path(filename)

    # TODO: remove? Several down-sampling attempts were disabled here:
    # test_module.down_sample_wave(...), librosa.core.resample(...),
    # scipy-based test_module.DownSample(...), and a wavfile.read/write pass.

    describe_wav(filename)  # log the current WAV properties

    # Loads the audio into memory
    with io.open(filename, 'rb') as audio_file:
        content = audio_file.read()
    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code='es-CO')

    # Detects speech in the audio file
    response = client.recognize(config, audio)
    for result in response.results:
        print('Transcript: {}'.format(result.alternatives[0].transcript))
    return response.results[0].alternatives[0].transcript
def transcribe_file(speech_file, sample_rate, parser):
    # authenticate with Google using credentials in the JSON file
    credentials = GoogleCredentials.get_application_default()
    client = speech.SpeechClient()

    # open audio file
    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    # send audio file to recognizer
    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate,
        language_code='en-US')

    utterances = []
    marker_coordinates = []

    # send audio to recognizer
    response = client.recognize(config, audio)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        application.logger.info('Transcript: {}'.format(
            result.alternatives[0].transcript))

        # send transcribed text to the geo parse service
        lat, long, address, geo_json = geo_parse(
            result.alternatives[0].transcript, parser)

        # if we get back any geo parse results, store them as markers for the
        # map and append them to the utterance text
        if bool(address) and lat is not None:
            my_point = Point((float(long), float(lat)))
            my_feature = Feature(
                geometry=my_point,
                properties={
                    'title': 'Geo Location: {}'.format(address),
                    'description': 'Transcript: {}'.format(
                        result.alternatives[0].transcript),
                    'marker-size': 'large',
                    'marker-color': '#FF0000',
                    'marker-symbol': 'police'
                })

            # Insert record into DB
            row = [
                str(uuid.uuid4()),
                float(long),
                float(lat),
                'Geo Location: {}'.format(address),
                'Transcript: {}'.format(result.alternatives[0].transcript),
                str(datetime.datetime.now())
            ]
            db.InsertRow(tablename='security_events', row=row)
            # use this line to temporarily export the table for debugging
            # db.ExportCSV(tablename='security_events')

            # store lat/long as marker coordinates for the map
            marker_coordinates.append(my_feature)

            # store the utterance with the geo-parsed address appended for display
            utterances.append(
                'Transcript: {}'.format(result.alternatives[0].transcript) +
                ' ( ' + '<em style="color:LightGray;">' +
                'Geo Location: {}'.format(address) + '</em>' + ' )')
        elif bool(address) and lat is None:
            utterances.append(
                'Transcript: {}'.format(result.alternatives[0].transcript) +
                ' ( ' + '<em style="color:LightGray;">' +
                'Geo Location: {}'.format(address) + '</em>' + ' )')
        # if there are no geo-parsed results, just add the text without an address
        else:
            utterances.append('Transcript: {}'.format(
                result.alternatives[0].transcript))

    return utterances, marker_coordinates
def text_recognition(path, config):
    root, ext = os.path.splitext(path)
    txt_path = root + ".txt"
    if os.path.exists(txt_path):
        with open(txt_path) as f:
            out = json.loads(f.read())
        return out

    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    out = {}
    error_count = 0
    tmp_path = os.path.splitext(path)[0] + ".tmp.wav"

    # Reuse a single client: creating one per attempt caused
    # "10060 max retries exceeded" OAuth errors.
    client = speech.SpeechClient()
    while True:
        try:
            content = load_audio(
                path,
                pre_silence_length=config.pre_silence_length,
                post_silence_length=config.post_silence_length)

            max_duration = config.max_duration - \
                config.pre_silence_length - config.post_silence_length
            audio_duration = get_duration(content)
            if audio_duration >= max_duration:
                print(" [!] Skip {} because of duration: {} > {}".
                      format(path, audio_duration, max_duration))
                return {}

            content = resample_audio(content, config.sample_rate)
            save_audio(content, tmp_path, config.sample_rate)

            with io.open(tmp_path, 'rb') as f:
                audio = types.RecognitionAudio(content=f.read())

            # Use a separate name so the `config` argument is still usable on
            # a retry (the original rebound `config` here and would crash).
            recognition_config = types.RecognitionConfig(
                encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
                sample_rate_hertz=config.sample_rate,
                language_code='ko-KR')
            response = client.recognize(recognition_config, audio)

            if len(response.results) > 0:
                alternatives = response.results[0].alternatives
                results = [
                    alternative.transcript for alternative in alternatives
                ]
                assert len(results) == 1, "More than 1 results: {}".format(
                    results)
                out = {path: "" if len(results) == 0 else results[0]}
                print(path, results[0])
            break
        except Exception as err:
            # Re-raising here (as the original did) made the retry loop dead
            # code; log and retry instead.
            print("OS error: {0}".format(err))
            error_count += 1
            print("Skip warning for {} for {} times".
                  format(path, error_count))
            if error_count > 5:
                break
            else:
                continue

    remove_file(tmp_path)
    with open(txt_path, 'w') as f:
        json.dump(out, f, indent=2, ensure_ascii=False)
    return out
def transcribe(gcs_uri, apikey, language='en-US', confidences=False):
    """Asynchronously transcribe an audio file uploaded to Google Cloud Storage.

    Parameters
    -----------
    gcs_uri: str
        URI file path consisting of bucket name and filename.
        See: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage
    apikey: str
        Path to the .json file with the Google Cloud API key.
        See: https://cloud.google.com/docs/authentication/api-keys
    language: str (default: 'en-US')
        Passes the language code argument to the client. Many languages are
        available. See: https://cloud.google.com/speech-to-text/docs/languages
    confidences: bool (default: False)
        Also writes a JSON file with transcripts and confidence levels if True.

    Returns
    -----------
    None. Writes a text file with the transcription and (optionally) a JSON
    file with the transcription and confidence levels.
    """
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = apikey
    client = speech.SpeechClient()

    # For optimal results, the file sample rate should be at least 16000 Hz
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        language_code=language)

    operation = client.long_running_recognize(config, audio)
    print('PROCESSING...')
    response = operation.result(timeout=None)

    transcript_list = []
    result_dict = {}
    for n, result in enumerate(response.results):
        # Note that results return alternative transcriptions with varying
        # degrees of confidence, with the zeroth alternative the most likely.
        transcript = result.alternatives[0].transcript
        confidence = result.alternatives[0].confidence
        result_value = {'transcript': transcript, 'confidence': confidence}
        result_key = 'result_{}'.format(n)
        result_dict[result_key] = result_value
        transcript_list.append(transcript)
    transcript_str = ''.join(transcript_list)

    # write files to the file system
    audio_name = gcs_uri.split('/')[3]
    writer(transcript_str, '{}-transcript.txt'.format(audio_name))
    if confidences:
        writer(result_dict, '{}-transcript_confidences.json'.format(audio_name))
    return None
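# Usage sketch: the bucket path and key file here are hypothetical; the audio
# must be FLAC-encoded to match the config above. Writes the transcript (and,
# with confidences=True, a JSON of per-result confidences) via writer().
transcribe('gs://my-bucket/interview.flac', 'service_account.json',
           language='en-US', confidences=True)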
def speech_to_text_in_a_min(
        doc_title='範例1_一分鐘內雲端運算',
        title_pattern='nlpno.wav',
        wd='/home/slave1/git/Speech2Text_workshop/record',
        json_os='/home/slave1/git/Speech2Text_workshop/speech2text-3de4444fd46a.json',
        sample_rate_hertz=48000):
    '''
    * json_os: path to the credentials file
    * title_pattern: filename pattern of the recordings
    * sample_rate_hertz: sampling rate of the recordings
    * doc_title: name of the docx document
    * wd: working directory
    '''
    # Start the timer
    start_time = time.time()

    # Authenticate the Python client against the cloud speech2text service
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = json_os
    client = speech.SpeechClient()

    file_list = os.listdir(wd)

    # Select the recordings matching title_pattern
    select_wav = []
    for i in file_list:
        if title_pattern in i:
            select_wav.append(wd + '/' + i)

    aa = pd.DataFrame()
    for music in select_wav:
        # Read in the audio recording
        with io.open(music, 'rb') as audio_file:
            content = audio_file.read()

        # Wrap the recording in the format Google understands
        audio = types.RecognitionAudio(content=content)

        # Configure the recording format
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=sample_rate_hertz,
            language_code='cmn-Hant-TW',
            enable_word_time_offsets=True)

        # Machine-learning speech recognition (speech2text)
        print('')
        response = client.recognize(config, audio)

        transcript_list = []
        transcript_confidence = []
        timerecored = []
        # Each result is for a consecutive portion of the audio. Iterate through
        # them to get the transcripts for the entire audio file.
        for result in response.results:
            alternative = result.alternatives[0]
            # The first alternative is the most likely one for this portion.
            transcript_list.append(alternative.transcript)
            transcript_confidence.append(alternative.confidence)
            print('Transcript: {}'.format(alternative.transcript))
            print('Confidence: {}'.format(alternative.confidence))

            # beginning and end time of a sentence
            sentence_start_time = alternative.words[0].start_time
            sentence_end_time = alternative.words[len(alternative.words) -
                                                  1].end_time
            # convert to seconds
            sentence_start_time = round(sentence_start_time.seconds +
                                        sentence_start_time.nanos * 1e-9)
            sentence_end_time = round(sentence_end_time.seconds +
                                      sentence_end_time.nanos * 1e-9)
            # format as h:mm:ss
            sentence_start_time = str(
                datetime.timedelta(seconds=sentence_start_time))
            sentence_end_time = str(
                datetime.timedelta(seconds=sentence_end_time))
            timerecored.append([sentence_start_time, sentence_end_time])

        # Build the confidence-level table with pandas
        transcript_df = pd.DataFrame(transcript_list, columns=['文章段句'])
        confidence_df = pd.DataFrame(transcript_confidence,
                                     columns=['機器認字信心水準'])
        confidence_df['機器認字信心水準'] = round(confidence_df['機器認字信心水準'], 2)
        time_df = pd.DataFrame(timerecored, columns=['start', 'end'])
        correctness_summary_df = pd.concat(
            [transcript_df, confidence_df, time_df], axis=1)
        correctness_summary_df = correctness_summary_df.sort_values(
            ['機器認字信心水準'])
        correctness_summary_df['改善順序'] = range(
            1, len(correctness_summary_df) + 1)

        timer_translist = []
        for hah, timer in zip(transcript_list, timerecored):
            timer_translist.append(hah + ' ' + '【' + ' to '.join(timer) + '】')

        aa = pd.concat([aa, correctness_summary_df])

    # Build the word cloud
    from speech2text import make_worldcould_report, text_freq
    cut_text = make_worldcould_report(data=aa,
                                      pd_text_col='文章段句',
                                      mask_pic=False,
                                      filename='wordcloud',
                                      pic_name='test.png')
    words_counts = text_freq(cut_text)

    # Compute word importance (avoid shadowing the max/mean builtins)
    max_count = words_counts['counts'].describe()['max']
    mean_count = words_counts['counts'].describe()['mean']
    # Keep only words with counts between the mean and the max
    words_counts = words_counts[(words_counts['counts'] <= max_count)
                                & (words_counts['counts'] >= mean_count)]
    df_count_all = pd.DataFrame()
    for index, i in words_counts.iterrows():
        df_count = correctness_summary_df[
            correctness_summary_df['文章段句'].str.contains(i['word'])]
        if not df_count.empty:
            df_count['重要性'] = i['counts']
            df_count_all = pd.concat([df_count_all, df_count])

    # group by
    correctness_summary_df = df_count_all.groupby(
        ['文章段句', '機器認字信心水準', 'start', 'end', '改善順序'],
        as_index=False)['重要性'].mean().round(2)

    # save to docx
    document = Document()
    document.add_heading(doc_title, 0)
    document.add_paragraph(
        '機器認字信心水準' +
        str(round(correctness_summary_df['機器認字信心水準'].mean(), 2)) + '\n\n' +
        '\n\n'.join(timer_translist))
    document.add_picture('wordcloud.png', width=Cm(15), height=Cm(13))
    document.save(doc_title + '_文章逐字稿.docx')
    print('Done')
    print('Check the working directory for two files: the full docx transcript and the xlsx matrix.')
    print("--- %s seconds ---" % (round(time.time() - start_time, 2)))
    return correctness_summary_df.to_excel(doc_title + '_文章認字信心矩陣.xlsx')
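# Usage sketch: transcribes every recording under wd whose name contains
# title_pattern, then writes the docx transcript and the xlsx confidence
# matrix. The default credential path and directory above are machine-specific.
speech_to_text_in_a_min(title_pattern='nlpno.wav', sample_rate_hertz=48000)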
print "finished recording" # stop Recording stream.stop_stream() stream.close() audio1.terminate() file = open("newfile.raw", "w") file.write(b''.join(frames)) file.close() client = speech.SpeechClient() with io.open('newfile.raw', 'rb') as audio_file: content = audio_file.read() audio = types.RecognitionAudio(content=content) config = types.RecognitionConfig( encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=16000, language_code='en-US') response = client.recognize(config, audio) for result in response.results: print('Transcript: {}'.format(result.alternatives[0].transcript)) if result.alternatives[0].transcript == "lights on": s = "1" elif result.alternatives[0].transcript == "orange": s = "2" elif result.alternatives[0].transcript == "kitchen":
def startSTT(end):
    form_1 = pyaudio.paInt16  # 16-bit resolution
    chans = 1  # 1 channel
    samp_rate = 44100  # 44.1kHz sampling rate
    chunk = 4096  # 2^12 samples for buffer
    record_secs = 3600  # seconds to record
    dev_index = 2  # device index found by p.get_device_info_by_index(ii)
    threshold = 15000
    sliding_window = deque(maxlen=15)

    client = speech.SpeechClient()
    # send_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    # send_socket.connect(('10.0.0.194', 8002))

    pa = pyaudio.PyAudio()  # create pyaudio instantiation
    # create pyaudio stream
    stream = pa.open(format=form_1, rate=samp_rate, channels=chans,
                     input_device_index=dev_index, input=True,
                     frames_per_buffer=chunk)
    print("recording")
    predata = deque(maxlen=10)

    while end.value == 0:
        frames = []
        started = False
        # loop through stream and append audio chunks to frame array
        for ii in range(0, int((samp_rate / chunk) * record_secs)):
            data = stream.read(chunk, exception_on_overflow=False)
            predata.append(data)
            rms = audioop.rms(data, 2)
            print(rms)
            if rms > threshold and started is False:
                started = True
                print('started')
            if started:
                frames.append(data)
                sliding_window.append(rms)
                if sum(ii < threshold for ii in sliding_window) >= 15:
                    print("ending")
                    break
        print("finished recording")

        # prepend the buffered chunks captured just before the threshold trigger
        for i in range(len(predata)):
            print('added a frame')
            frames.insert(0, predata.pop())

        # use a separate name: the original rebound `audio` here, which broke
        # audio.terminate() after the loop
        recognition_audio = types.RecognitionAudio(content=b''.join(frames))
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=44100,
            language_code='en-US')
        response = client.recognize(config, recognition_audio)
        for result in response.results:
            print('Transcript: {}'.format(result.alternatives[0].transcript))
            # send result to server
            # send_socket.send((result.alternatives[0].transcript).encode())
            print("sent")

    # stop the stream, close it, and terminate the pyaudio instantiation
    stream.stop_stream()
    stream.close()
    pa.terminate()
]
# On macOS this differs; a filename pattern should be used instead. Remember
# that indices start at 0, so the 2nd value is index 1.
music = [i for i in music if 'wav' in i]
# The above picks up nlpno.wav
music = music[0]  # grab the matching file

# Read in the audio recording
with io.open(music, 'rb') as audio_file:
    content = audio_file.read()

# Peek at how the content is represented inside Python
content[0:100]

# Wrap the recording in the format Google understands
audio = types.RecognitionAudio(content=content)
# If your machine is fast enough, you can try evaluating `audio` below
# audio

##################################################################
# Google speech analysis
##################################################################
# Configure the recording format
config = types.RecognitionConfig(
    encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,  # LINEAR16 for audio
    sample_rate_hertz=sample_rate_hertz,
    language_code='cmn-Hant-TW',  # Taiwan locale
    enable_word_time_offsets=True)  # include per-word time offsets
def validate_dataset(yt_uri, matching, in_stage, out_stage):
    # Use vid as the directory name for download and processing
    vids = parse_qs(urlparse(yt_uri).query, keep_blank_values=True).get('v')
    vid = None if vids is None else vids[0]
    v_dir = os.path.join(data_path, vid)
    in_dir = os.path.join(v_dir, in_stage)
    out_dir = os.path.join(v_dir, out_stage)
    ext_dir = os.path.join(v_dir, out_stage + 'ext')

    # Get information on the YouTube content
    try:
        yt = YouTube(yt_uri)
    except:
        e = sys.exc_info()[0]
        print("Exception: {}".format(e))
        sys.exit(1)

    # Build the array of wav files
    files = []
    for file in os.listdir(in_dir):
        if file.endswith('.wav'):
            files.append(file)
    files.sort()

    os.makedirs(out_dir, exist_ok=True)
    os.makedirs(ext_dir, exist_ok=True)

    # Speech client
    client = speech.SpeechClient()

    for file in files:
        event_no = os.path.splitext(os.path.basename(file))[0]
        subtitle = os.path.join(in_dir, event_no + '.txt')
        transcript = os.path.join(in_dir, event_no + 't.txt')
        if not Path(subtitle).exists():
            continue

        # Transcribe each file and test it against the existing subtitle
        try:
            file_path = os.path.join(in_dir, file)
            print(file_path)
            audio_file = io.open(file_path, 'rb')
            audio_content = audio_file.read()
            audio_file.close()

            audio = types.RecognitionAudio(content=audio_content)
            config = types.RecognitionConfig(
                encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
                speech_contexts=[{
                    "phrases": build_phrase_hint(subtitle)
                }],
                language_code='ko-KR')
            response = client.recognize(config, audio)

            subtitle_file = io.open(subtitle, 'r')
            transcript_file = io.open(transcript, 'w')

            # Determine the appropriateness of the existing subtitle
            result_script = ""
            print(u"Subtitle: {}".format(subtitle_file.read()))
            for result in response.results:
                print(u"Response: {}".format(
                    result.alternatives[0].transcript))
                print("Confidence: {}".format(
                    result.alternatives[0].confidence))
                result_script += result.alternatives[0].transcript
            print(u"Transcript: {}".format(result_script))

            try:
                transcript_file.write(result_script)
            except:
                exc_type, exc_obj, exc_tb = sys.exc_info()
                exc_file = os.path.split(
                    exc_tb.tb_frame.f_code.co_filename)[1]
                print(exc_type, exc_file, exc_tb.tb_lineno)
                sys.exit(1)
            subtitle_file.close()
            transcript_file.close()

            score = similarity_score(subtitle, transcript)

            # Move appropriate files to the output pipeline stage
            if matching == 'exact':
                result = exact_match(subtitle, transcript)
            elif matching == 'similarity':
                result = score >= 0.9
            else:  # matching == 'subs' or anything else
                result = substring_match(subtitle, transcript)

            if result:
                shutil.move(file_path, out_dir)
                shutil.move(subtitle, out_dir)
                shutil.move(transcript, out_dir)
                message = "Matched"
            elif score >= 0.95:
                shutil.move(file_path, ext_dir)
                shutil.move(subtitle, ext_dir)
                shutil.move(transcript, ext_dir)
                message = "Matched (Similar)"
            else:
                message = "Not Matched"
            print("Result: {}, Score: {}".format(message, score))
            print("")
        except:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            exc_file = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            print(exc_type, exc_file, exc_tb.tb_lineno)
            sys.exit(1)
def transcript(content):
    audio = types.RecognitionAudio(content=content)
    response = client.recognize(config, audio)
    return response.results
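# A minimal sketch of the module-level objects transcript() assumes; the
# sample rate and language code below are placeholders, not from the original.
client = speech.SpeechClient()
config = types.RecognitionConfig(
    encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,   # placeholder
    language_code='en-US')     # placeholder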
#! pip install webapp2
#! pip install cloudstorage
#! pip install GoogleAppEngineCloudStorageClient

from google.cloud import storage

client = storage.Client()
bucket = client.get_bucket('storagexxx')
blob = bucket.get_blob('Info/1-NotSolved_No_Silence.wav')
# upload_from_filename returns None, so its result cannot be passed as audio
# content; reference the uploaded object by its gs:// URI instead
blob.upload_from_filename(filename='1-NotSolved_No_Silence.wav')

from google.cloud import speech_v1p1beta1 as speech

client = speech.SpeechClient()
audio = speech.types.RecognitionAudio(
    uri='gs://storagexxx/Info/1-NotSolved_No_Silence.wav')
config = speech.types.RecognitionConfig(
    encoding=speech.enums.RecognitionConfig.AudioEncoding.MULAW,
    sample_rate_hertz=8000,
    enable_word_time_offsets=True,
    language_code='pt-BR',
    enable_automatic_punctuation=True,
    use_enhanced=True,
    speech_contexts=[speech.types.SpeechContext(phrases=['computador', 'wi-fi'])],
    enable_speaker_diarization=True,
    diarization_speaker_count=2,
def speech_to_text():
    print('button pressed')
    os.system('python D:/Hackathons/VirtualPatient/src/record.py')
    file_name = "D:/Hackathons/VirtualPatient/output.wav"

    # An earlier emotion-recognition path (librosa MFCC features fed to
    # loaded_model.predict) was tried here and is left disabled.

    # Instantiates a client
    client = speech.SpeechClient()

    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code='en-US')

    try:
        # Detects speech in the audio file
        response = client.recognize(config, audio)
        print(response)
        for result in response.results:
            data = {'msg': result.alternatives[0].transcript, 'counter': 1}
            resp = Response(json.dumps(data), status=200,
                            mimetype='application/json')
            resp.headers['Access-Control-Allow-Origin'] = '*'
            return resp
    except:
        data = {
            'msg': "Your message was not picked up, please try again.",
            'counter': 0
        }
        resp = Response(json.dumps(data), status=200,
                        mimetype='application/json')
        resp.headers['Access-Control-Allow-Origin'] = '*'
        return resp
def speech_to_text(
        gcs_uri='gs://speechfashion/Acc.wav',
        doc_title='範例2_一分鐘以上雲端運算',
        timeout=None,
        json_os='/home/slave1/git/Speech2Text/damnhow-db8d83229dd4.json',
        sample_rate_hertz=96000):
    '''
    1. Produce the recognition-confidence matrix (xlsx), giving editors the
       order in which sentences should be corrected.
    2. Produce the docx transcript, including the overall recognition
       confidence level, for editors to revise.
    '''
    # Start the timer
    start_time = time.time()
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = json_os
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate_hertz,
        language_code='cmn-Hant-TW',
        enable_word_time_offsets=True)

    # config = types.StreamingRecognitionConfig(config=config)
    # stream = [audio]
    # requests = (types.StreamingRecognizeRequest(audio_content=chunk)
    #             for chunk in stream)
    # responses = client.streaming_recognize(config, requests)

    operation = client.long_running_recognize(config, audio)
    print('Running speech recognition...')
    response = operation.result(timeout=timeout)

    aa = pd.DataFrame()  # was commented out, but `aa` is used below
    transcript_list = []
    transcript_confidence = []
    timerecored = []
    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        alternative = result.alternatives[0]
        # The first alternative is the most likely one for this portion.
        transcript_list.append(alternative.transcript)
        transcript_confidence.append(alternative.confidence)
        print('Transcript: {}'.format(alternative.transcript))
        print('Confidence: {}'.format(alternative.confidence))

        # beginning and end time of a sentence
        sentence_start_time = alternative.words[0].start_time
        sentence_end_time = alternative.words[len(alternative.words) -
                                              1].end_time
        # convert to seconds
        sentence_start_time = round(sentence_start_time.seconds +
                                    sentence_start_time.nanos * 1e-9)
        sentence_end_time = round(sentence_end_time.seconds +
                                  sentence_end_time.nanos * 1e-9)
        # format as h:mm:ss
        sentence_start_time = str(
            datetime.timedelta(seconds=sentence_start_time))
        sentence_end_time = str(datetime.timedelta(seconds=sentence_end_time))
        timerecored.append([sentence_start_time, sentence_end_time])

    # Build the confidence-level table with pandas
    transcript_df = pd.DataFrame(transcript_list, columns=['文章段句'])
    confidence_df = pd.DataFrame(transcript_confidence, columns=['機器認字信心水準'])
    confidence_df['機器認字信心水準'] = round(confidence_df['機器認字信心水準'], 2)
    time_df = pd.DataFrame(timerecored, columns=['start', 'end'])
    correctness_summary_df = pd.concat(
        [transcript_df, confidence_df, time_df], axis=1)
    correctness_summary_df = correctness_summary_df.sort_values(['機器認字信心水準'])
    correctness_summary_df['改善順序'] = range(1, len(correctness_summary_df) + 1)

    timer_translist = []
    for hah, timer in zip(transcript_list, timerecored):
        timer_translist.append(hah + ' ' + '【' + ' to '.join(timer) + '】')

    aa = pd.concat([aa, correctness_summary_df])

    # Build the word cloud
    from speech2text import make_worldcould_report, text_freq
    cut_text = make_worldcould_report(data=aa,
                                      pd_text_col='文章段句',
                                      mask_pic=False,
                                      filename='wordcloud',
                                      pic_name='test.png')
    words_counts = text_freq(cut_text)

    # Compute word importance (avoid shadowing the max/mean builtins)
    max_count = words_counts['counts'].describe()['max']
    mean_count = words_counts['counts'].describe()['mean']
    # Keep only words with counts between the mean and the max
    words_counts = words_counts[(words_counts['counts'] <= max_count)
                                & (words_counts['counts'] >= mean_count)]
    df_count_all = pd.DataFrame()
    for index, i in words_counts.iterrows():
        df_count = correctness_summary_df[
            correctness_summary_df['文章段句'].str.contains(i['word'])]
        if not df_count.empty:
            df_count['重要性'] = i['counts']
            df_count_all = pd.concat([df_count_all, df_count])

    # group by
    correctness_summary_df = df_count_all.groupby(
        ['文章段句', '機器認字信心水準', 'start', 'end', '改善順序'],
        as_index=False)['重要性'].mean().round(2)

    # save to docx
    document = Document()
    document.add_heading(doc_title, 0)
    document.add_paragraph(
        '機器認字信心水準' +
        str(round(correctness_summary_df['機器認字信心水準'].mean(), 2)) + '\n\n' +
        '\n\n'.join(timer_translist))
    document.add_picture('wordcloud.png', width=Cm(15), height=Cm(13))
    document.save(doc_title + '_文章逐字稿.docx')
    print('Done')
    print('Check the working directory for two files: the full docx transcript and the xlsx matrix.')
    print("--- %s seconds ---" % (round(time.time() - start_time, 2)))
    return correctness_summary_df.to_excel(doc_title + '_文章認字信心矩陣.xlsx')
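# Usage sketch: point gcs_uri at your own LINEAR16 WAV on Cloud Storage
# ('gs://speechfashion/Acc.wav' is the author's bucket), match
# sample_rate_hertz to the recording, and pass timeout=None to wait
# indefinitely for the long-running operation.
speech_to_text(gcs_uri='gs://speechfashion/Acc.wav',
               sample_rate_hertz=96000,
               timeout=None)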
storage_client = storage.Client()

# variables
source_bucket_name = '911-calls'
source_bucket = storage_client.bucket(source_bucket_name)
bucket_prefix = 'audio-recordings-wav'

# create a csv file
with open('data.csv', 'a') as csvfile:
    csvfile.write('audio_gcs_uri, transcript' + '\n')

# for each audio file ...
for file in list(source_bucket.list_blobs(prefix=bucket_prefix)):
    audio_gcs_uri = "gs://" + source_bucket_name + "/" + file.name
    audio = types.RecognitionAudio(uri=audio_gcs_uri)
    print(audio)
    config = types.RecognitionConfig(
        encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
        # use_enhanced=True,   # for phone audio
        # model='phone_call',  # model must be specified for enhanced model
        language_code='en-US')
    operation = client.long_running_recognize(config, audio)
    print('Waiting for operation to complete...')
    response = operation.result(timeout=3600)
    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
client = speech.SpeechClient()

# The name of the audio file to transcribe
audio_file = sys.argv[1]  # 'SchoolOfAI.wav'
file_to_mono = 'audio_mono.wav'
sound = AudioSegment.from_wav(audio_file)
sound = sound.set_channels(1)  # convert audio file to mono
sound.export(file_to_mono, format="wav")

# Loads the audio into memory - for when not storing the file on Google Cloud Storage
# with io.open(file_to_mono, 'rb') as audio_file:
#     content = audio_file.read()
# audio = types.RecognitionAudio(content=content)

# Loads the audio from Google Cloud Storage
audio = types.RecognitionAudio(uri='gs://ctrlfboilermake/' + audio_file)
config = types.RecognitionConfig(
    # encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
    # sample_rate_hertz=16000,
    language_code='en-US',
    enable_word_time_offsets=True,
    enable_automatic_punctuation=True)

# Detects speech in the audio file
# response = client.recognize(config, audio)
operation = client.long_running_recognize(config, audio)
result = operation.result(timeout=90)

transcript = ""
list_of_times = []
def transcribe_file(speech_file):
    """Transcribe the given audio file."""
    print(speech_file)
    client = speech.SpeechClient()
    now = datetime.datetime.now()
    speech_file = os.getcwd() + '/' + speech_file

    # [START speech_python_migration_sync_request]
    # [START speech_python_migration_config]
    outstr = ''
    try:
        if debug:
            print('Python::opening file ' + speech_file)
        with io.open(speech_file, 'rb') as audio_file:
            content = audio_file.read()

        audio = types.RecognitionAudio(content=content)
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code='ko-KR')
        # [END speech_python_migration_config]

        # [START speech_python_migration_sync_response]
        if debug:
            print('GCP::requesting : ' + speech_file)
        response = client.recognize(config, audio)
        # [END speech_python_migration_sync_request]

        # Each result is for a consecutive portion of the audio. Iterate
        # through them to get the transcripts for the entire audio file.
        # The first alternative is the most likely one for each portion.
        if debug:
            print('opening response file')
        outfile = open('response.txt', 'a')
        for result in response.results:
            # the posix and non-posix branches were identical; collapsed
            outstr = result.alternatives[0].transcript
            if debug:
                print('GCP::response : ' + outstr)
            outfile.write(now.strftime("%Y-%m-%d %H:%M") + " : " + outstr)
            outfile.write('\n')
            print("GCP-response::" + outstr)
        outfile.close()
    except:
        outfile = open('response.txt', 'a')
        outfile.write(now.strftime("%Y-%m-%d %H:%M") + " : " + "ERROR\n")
        outfile.close()
        outstr = 'ERROR::exception occurred in GCP-Speech'
        print(outstr)
    # [END speech_python_migration_sync_response]
    # [END speech_transcribe_sync]

    if outstr == "":
        return "<내용 없음>"
    return outstr
def get_args():  # implied by the call below; the fragment began mid-function
    parser = argparse.ArgumentParser(
        description='Prints the transcript of an audio file')
    parser.add_argument(dest="audio", help='GCS URI of an audio file')
    return parser.parse_args()


if __name__ == "__main__":
    args = get_args()
    # api_key = os.environ.get('GCP_API_KEY')
    # project_name = os.environ.get('PROJECT_NAME')
    # the original used `==` here, comparing instead of assigning
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'service_account.json'
    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=args.audio)
    config = types.RecognitionConfig(language_code='en-US')

    operation = client.long_running_recognize(config, audio)
    print('Waiting for operation to complete...')
    response = operation.result(timeout=90)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(u'Transcript: {}'.format(result.alternatives[0].transcript))
        print('Confidence: {}'.format(result.alternatives[0].confidence))