def send_speech(self, File):
    model_path = get_model_path()
    # Resample to 16 kHz, which the acoustic model expects, and overwrite the file.
    y, s = librosa.load(File, sr=16000)
    sf.write(File, y, s)
    config = {
        'lm': False,
        'audio_file': File,
        'hmm': constants.POCKET_SPHINX_MODEL_FILEPATH,
        'dict': os.path.join(model_path, 'cmudict-en-us.dict')
    }
    yes_result = 0
    no_result = 0
    # Count keyword-spotting hits against the "yes" and "no" keyword lists.
    audio = AudioFile(kws=constants.YES_WORDS_FILEPATH, **config)
    for phrase in audio:
        yes_result += 1
    audio = AudioFile(kws=constants.NO_WORDS_FILEPATH, **config)
    for phrase in audio:
        no_result += 1
    os.remove(File)
    if yes_result == 0 and no_result == 0:
        result = "unsure"
    elif yes_result > no_result:
        result = "yes"
    else:
        result = "no"
    sys.stdout.write('SET VARIABLE GoogleUtterance "%s"\n' % result)
    sys.stdout.flush()
    sys.stdout.write('EXEC "NOOP" "%s"\n' % result)
    sys.stdout.flush()
def extract_keywords(self, file_name, sample_rate=16000, window_ms=1000, hop_ms=500):
    kws_results = []
    files = [file_name]
    for fname, transcription in zip(files, quartznet.transcribe(paths2audio_files=files)):
        print(f"[NeMo] Audio in {fname} was recognized as: {transcription}")
    self.kws_config['audio_file'] = file_name
    audio = AudioFile(audio_file=file_name)
    print(f"Printing all audio segments in {file_name}")
    for phrase in audio:
        for s in phrase.seg():
            print(s.start_frame, s.end_frame, s.word)
            # 160 samples per frame at 16 kHz with the default 100 fps frame rate
            print(transcribe(file_name, s.start_frame * 160, s.end_frame * 160))
    print("Done printing segments")
    audio = AudioFile(**self.kws_config)
    for phrase in audio:
        result = phrase.segments(detailed=True)
        # TODO:: confirm that when multiple keywords are detected, every detection is valid
        if len(result) == 1:
            start_time = result[0][2] * 10
            end_time = result[0][3] * 10
            # print('%4sms ~ %4sms' % (start_time, end_time))
            text = transcribe(file_name, start_time * 16, end_time * 16)
            if self.keyword not in text.lower():
                continue
            print("Pruning")
            while not good_start(text, self.keyword) and start_time < end_time:
                start_time += 100
                text = transcribe(file_name, start_time * 16, end_time * 16)
            while not good_end(text, self.keyword) and start_time < end_time:
                end_time -= 100
                text = transcribe(file_name, start_time * 16, end_time * 16)
            if text == self.keyword:
                print("MATCH", file_name)
                kws_results.append((start_time, end_time))
    return kws_results
def get_subtitles(self, input_file_path, output_dir) -> SubtitleStageResult:
    phrases = []
    log = ""
    with contextlib.closing(wave.open(input_file_path, 'r')) as f:
        rate = f.getframerate()
        frames = f.getnframes()
        duration = frames / float(rate)
        log += f"rate: {rate}, frames: {frames}, duration: {duration}\n"
    # Decoder segments are indexed in decoder frames (100 per second by
    # default), not in audio sample frames, so convert with the frame rate.
    frate = 100.0
    for phrase in AudioFile(audio_file=input_file_path):
        start = sys.maxsize
        end = 0
        # A phrase spans from the first to the last of its word segments.
        for seg in phrase.seg():
            start = min(start, seg.start_frame)
            end = max(end, seg.end_frame)
        log += f"{phrase}: {start}~{end}\n"
        phrases.append(Subtitle("", str(phrase), start / frate, end / frate))
    result = SubtitleStageResult()
    result.success = True
    result.log = log
    result.subtitles = phrases
    return result
def extract_keywords(self, file_name, sample_rate=16000, window_ms=1000, hop_ms=500):
    kws_results = []
    self.kws_config['audio_file'] = file_name
    audio = AudioFile(**self.kws_config)
    for phrase in audio:
        result = phrase.segments(detailed=True)
        if len(result) == 0:
            continue
        start_time = result[0][2] * 10
        end_time = result[0][3] * 10
        # print('%4sms ~ %4sms' % (start_time, end_time))
        if len(result) > 1:
            print(result)
            raise ValueError('Result has more than one entry')
        kws_results.append((start_time, end_time))
    return kws_results
def test_kws(self):
    segments = []
    for phrase in AudioFile(lm=False, keyphrase='forward', kws_threshold=1e+20):
        segments = phrase.segments(detailed=True)
    self.assertEqual(segments, [('forward', -617, 63, 121)])
def get_words_from_file(file_path):
    """
    :param file_path: audio file (must be raw 16khz 16bit)
    :return: a list of phrases made of words
    """
    model_path = get_model_path()
    data_path = get_data_path()
    config = {
        'verbose': False,
        'audio_file': file_path,
        'buffer_size': 2048,
        'no_search': False,
        'full_utt': False,
        'hmm': os.path.join(model_path, 'en-us'),
        'lm': os.path.join(model_path, 'en-us.lm.bin'),
        'dict': os.path.join(model_path, 'cmudict-en-us.dict')
    }
    audio = AudioFile(**config)
    phrases = []
    for phrase in audio:
        phrases.append(str(phrase))
    return phrases
def get_phonemes_from_file(file_path):
    """
    :param file_path: audio file (must be raw 16khz 16bit)
    :return: a list of phrases made of phonemes
    """
    model_path = get_model_path()
    data_path = get_data_path()
    config = {
        'verbose': False,
        'audio_file': file_path,
        'buffer_size': 2048,
        'no_search': False,
        'full_utt': False,
        'hmm': os.path.join(model_path, 'en-us'),
        'allphone': os.path.join(model_path, 'en-us/en-us-phone.lm.dmp'),
        'beam': 1e-20,
        'pbeam': 1e-20,
        'lw': 2.0
    }
    audio = AudioFile(**config)
    phrases = []
    for phrase in audio:
        phrases.append(str(phrase))
    return phrases
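# A minimal driver for the two helpers above. This is an illustrative sketch,
# assuming the sample recording goforward.raw that the pocketsphinx package
# ships under get_data_path().
import os
from pocketsphinx import get_data_path

sample = os.path.join(get_data_path(), 'goforward.raw')
for p in get_words_from_file(sample):
    print('words:', p)
for p in get_phonemes_from_file(sample):
    print('phonemes:', p)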
def transcribe(self, audio_file: Path):
    self.config['audio_file'] = audio_file
    transcription = ''
    # Only the first decoded phrase is kept.
    for phrase in AudioFile(**self.config):
        transcription = str(phrase)
        break
    return transcription
def decode_with_time_stamp(config):
    # with time axis
    fps = config['frate']
    for phrase in AudioFile(**config):
        print('-' * 29)
        print('| %5s | %5s | %9s |' % ('start', 'end', 'word'))
        print('-' * 29)
        for s in phrase.seg():
            print('| %4ss | %4ss | %9s |' % (s.start_frame / fps, s.end_frame / fps, s.word))
        print('-' * 29)
def keyword_spotting(config):
    # keyword spotting
    if not config.get('keyphrase'):
        raise ValueError('no keyphrase given for spotting')
    fps = config['frate']
    config['lm'] = False
    config['kws_threshold'] = 1e-20
    audio = AudioFile(**config)
    for phrase in audio:
        for s in phrase.seg():
            print(s.start_frame / fps, s.end_frame / fps, s.word)
def keyword_list_spotting(config):
    # uses a file for inputting the keywords
    if not config.get('kws'):
        raise ValueError('no keywords file given for spotting')
    if not os.path.isfile(config['kws']):
        raise IOError('keywords file does not exist %s' % config['kws'])
    fps = config['frate']
    config['lm'] = False
    audio = AudioFile(**config)
    for phrase in audio:
        for s in phrase.seg():
            print(s.start_frame / fps, s.end_frame / fps, s.word, s.prob)
def grammar_search(config):
    # search via jsgf queries
    # for Java Speech Grammar Format see https://www.w3.org/TR/jsgf/
    if not config.get('jsgf'):
        raise ValueError('no jsgf file given for grammar search')
    if not os.path.isfile(config['jsgf']):
        raise IOError('grammar file does not exist %s' % config['jsgf'])
    fps = config['frate']
    config['lm'] = False
    config['keyphrase'] = None
    audio = AudioFile(**config)
    for phrase in audio:
        for s in phrase.seg():
            print(s.start_frame / fps, s.end_frame / fps, s.word)
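# The four helpers above consume a shared config dict that must already carry
# 'audio_file' and 'frate'. A minimal sketch of such a dict, assuming the stock
# en-us model shipped with pocketsphinx; the audio file name is a placeholder.
import os
from pocketsphinx import get_model_path

model_path = get_model_path()
config = {
    'verbose': False,
    'audio_file': 'utterance.wav',  # placeholder path
    'frate': 100,  # frames per second; 100 is the pocketsphinx default
    'hmm': os.path.join(model_path, 'en-us'),
    'lm': os.path.join(model_path, 'en-us.lm.bin'),
    'dict': os.path.join(model_path, 'cmudict-en-us.dict'),
}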
def test(update):
    downloadVoiceFile(update)
    model_path = get_model_path()
    config = {
        'audio_file': os.path.join("files", LAST_VOICE_FILE),
        'verbose': False,
        'buffer_size': 2048,
        'no_search': False,
        'full_utt': False,
        'hmm': os.path.join(model_path, 'en-us'),
        'lm': os.path.join(model_path, 'en-us.lm.bin'),
        'dict': os.path.join(model_path, 'cmudict-en-us.dict')
    }
    for phrase in AudioFile(**config):
        print(phrase)
def phone():
    config = {
        'verbose': False,
        'logfn': os.devnull,  # portable null device ('/dev/null' on POSIX, 'nul' on Windows)
        'audio_file': 'tovmok.wav',
        'audio_device': None,
        'sampling_rate': 16000,
        'buffer_size': 2048,
        'no_search': False,
        'full_utt': False,
        'hmm': 'ASRProject/model_parameters/iot.ci_cont',
        'lm': 'ASRProject/etc/iot.lm.DMP',
        'dict': 'ASRProject/etc/iot.dic',
    }
    audio = AudioFile(**config)
    for phrase in audio:
        print(phrase)
def detect(self, file_name):
    kws_results = []
    self.kws_config['audio_file'] = file_name
    audio = AudioFile(**self.kws_config)
    for phrase in audio:
        result = phrase.segments(detailed=True)
        # TODO:: confirm that when multiple keywords are detected, every detection is valid
        if len(result) == 1:
            start_time = result[0][2] * 10
            end_time = result[0][3] * 10
            if self.verbose:
                print('%4sms ~ %4sms' % (start_time, end_time))
            kws_results.append((start_time, end_time))
    return kws_results
def transcribe(myword):
    dic = {}
    fps = 100
    print(os.getcwd() + "/audios")
    for file in os.listdir(os.getcwd() + "/audios"):
        if file.endswith("wav"):
            afile = file.split(".")[0] + ".mp4"
            dic[afile] = []
            print(file)
            for phrase in AudioFile(audio_file=os.getcwd() + "/audios/" + file, full_utt=False):
                for s in phrase.seg():
                    # if '<' not in s.word:
                    print('| %4ss | %4ss | %8s |' % (s.start_frame / fps, s.end_frame / fps, s.word))
                    if myword in s.word:
                        occ = [s.start_frame / fps, s.end_frame / fps]
                        dic[afile].append(occ)
    return dic
def get_sphinx_text(self, audio_file, sample_rate, lang):
    text = ''
    try:
        config = {
            'verbose': False,
            'audio_file': audio_file,
            'buffer_size': 2048,
            'no_search': False,
            'full_utt': False,
            'hmm': os.path.join(self.model_path, 'en-us'),
            'lm': os.path.join(self.model_path, 'en-us.lm.bin'),
            'dict': os.path.join(self.model_path, 'cmudict-en-us.dict')
        }
        config_in = {
            'verbose': False,
            'audio_file': audio_file,
            'buffer_size': 2048,
            'no_search': False,
            'full_utt': False,
            'hmm': os.path.join(self.model_path, 'en_in'),
            'lm': os.path.join(self.model_path, 'en-in.lm.bin'),
            'dict': os.path.join(self.model_path, 'en-in.dict')
        }
        print(config_in)
        print(config)
        audio = AudioFile(**config)
        for phrase in audio:
            text += str(phrase)
        print('CMU text:' + text)
        del audio
    except Exception as e:
        print('Error processing sphinx:', str(e))
    print('processed sphinx STT')
    return text
def up_ps_audio(wavfile):
    """Local offline wake-word detection with the PocketSphinx library."""
    model_path = get_model_path()
    # For swapping in custom lm and dict files, see https://blog.51cto.com/feature09/2300352
    # The required resource files live in yiwa/asr/resources/ and must be copied
    # into your /site-packages/pocketsphinx/model/ directory.
    config = {
        'verbose': False,
        'audio_file': wavfile,
        'buffer_size': 2048,
        'no_search': False,
        'full_utt': False,
        'hmm': os.path.join(model_path, 'zh_cn'),
        'lm': os.path.join(model_path, '3603.lm'),
        'dict': os.path.join(model_path, '3603.dic')
    }
    # Decode the audio file; see https://pypi.org/project/pocketsphinx/
    audio = AudioFile(**config)
    for phrase in audio:
        return phrase
    return None
def extract_keywords(self, file_name, sample_rate=16000, window_ms=1000, hop_ms=500):
    kws_results = []
    self.kws_config['audio_file'] = file_name
    audio = AudioFile(**self.kws_config)
    for phrase in audio:
        result = phrase.segments(detailed=True)
        # TODO:: confirm that when multiple keywords are detected, every detection is valid
        if len(result) == 1:
            start_time = result[0][2] * 10
            end_time = result[0][3] * 10
            # print('%4sms ~ %4sms' % (start_time, end_time))
            kws_results.append((start_time, end_time))
    return kws_results
def processAudio(file):
    audio = AudioFile(audio_file=file, buffer_size=1024)
    for phrase in audio:
        print(phrase)
import os

from pocketsphinx import AudioFile

audio = AudioFile(
    audio_file=r'C:\Users\BZT\Desktop\speech_segment\speech_segment\Ses01F_impro01_M013.wav',
    keyphrase='yeah')
fps = 100  # frate (default=100)
for phrase in audio:
    # print('-' * 28)
    # print('| %5s | %3s | %4s |' % ('start', 'end', 'word'))
    # print('-' * 28)
    for s in phrase.seg():
        print('%4ss\t%4ss\t%8s' % (s.start_frame / fps, s.end_frame / fps, s.word))
    # print('-' * 28)

# from pocketsphinx import Pocketsphinx
#
# ps = Pocketsphinx(verbose=True, logfn='pocketsphinx.log')
# ps.decode()
#
# print(ps.hypothesis())
badwords = open("badwords.txt")
open("words.csv", "w").close()
f = open('words.csv', 'r+')
writer = csv.writer(f)
writer.writerow(['start', 'end', 'word'])
data = list(badwords)  # materialize the lines; the file object is exhausted after this
r = []
fps = 100
config = {
    'verbose': False,
    'buffer_size': 2048,
    'audio_file': os.path.join(data_path, aF),
    'frate': fps,
    'no_search': False
}
audio = AudioFile(**config)
for phrase in audio:
    print(phrase)
    for s in phrase.seg():
        # writing timestamps of words to csv
        writer.writerow([s.start_frame / fps, s.end_frame / fps, s.word])
f.seek(0)  # rewind before re-reading the rows just written
mycsv = list(csv.reader(f))
for i in data:  # iterate the saved lines, not the exhausted file object
    for row in mycsv:
        if row and row[2] == i.strip():
            writer.writerow(row)

# Code Wasteland
# I leave code here and come back to it if I need to
# print('| %4ss | %4ss | %8s |' % (s.start_frame / fps, s.end_frame / fps, s.word))
# writer.writerow(['{0}'.format(s.start_frame), '{0}'.format(s.end_frame), '{0}'.format(s.word)])
#   this line of code was so long it didn't fit on one line in brackets; generally it was
#   a bad idea, I fixed it though, and it shall lay here in the code wasteland
def simple_decode(config):
    # simple decode
    audio = AudioFile(**config)
    for phrase in audio:
        print(phrase)
from pocketsphinx import AudioFile

audio = AudioFile(lm=False, keyphrase='forward', kws_threshold=1e+20)
for phrase in audio:
    print(phrase.segments(detailed=True))  # => "[('forward', -617, 63, 121)]"
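# As the tuple above shows, segments(detailed=True) yields
# (word, prob, start_frame, end_frame) entries; at the default frate of
# 100 frames per second each frame is 10 ms, which is where the "* 10" in
# the keyword extractors above comes from. segment_to_ms is an illustrative
# helper, not part of pocketsphinx.
def segment_to_ms(segment, frate=100):
    word, prob, start_frame, end_frame = segment
    ms_per_frame = 1000 // frate
    return word, start_frame * ms_per_frame, end_frame * ms_per_frame

print(segment_to_ms(('forward', -617, 63, 121)))  # ('forward', 630, 1210)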
    remtime = time.strftime('%H:%M:%S', time.gmtime(x))
    res = remtime + ',' + mil
    return res


f = open('sub.srt', 'w')
f.close()
fps = 100
counter = 1
z = 0
h = ''
start = ''
end = ''
f = open('sub.srt', 'a')
for phrase in AudioFile(audio_file='audio_filtered_final.wav', frate=fps):  # frate (default=100)
    for s in phrase.segments(detailed=True):
        # Skip sentence markers; remember the first real word as the subtitle start.
        if z == 0 and s[0] != '<s>' and s[0] != '</s>':
            start = timeconv(s[2] / fps)
            z = 1
        if s[0] != '</s>' and s[0] != '<s>':
            end = timeconv(s[3] / fps)
        if s[0] != '</s>' and s[0] != '<s>' and s[0] != '<sil>':
            if s[0] != '[SPEECH]':
                # Strip the pronunciation-variant suffix, e.g. "word(2)" -> "word".
                if s[0].find('(') < 0:
                    h = h + s[0]
                else:
                    h = h + s[0][:s[0].find('(')]
            else:
def test_audiofile(self):
    hypothesis = ''
    for phrase in AudioFile():
        hypothesis = str(phrase)
    self.assertEqual(hypothesis, 'go forward ten meters')
import os, time

from pocketsphinx import AudioFile, get_model_path

model_path = get_model_path()
exmpl_path = os.getcwd()
exmpl_path = os.path.join(exmpl_path, 'examples')

start = time.process_time()
print('start of init')
speech = AudioFile(
    verbose=False,
    audio_file=os.path.join(exmpl_path, 'coming_home_red_16000.raw'),
    buffer_size=2048,
    no_search=False,
    full_utt=False,
    hmm=os.path.join(model_path, 'zero_ru.cd_cont_4000'),
    lm=os.path.join(model_path, 'ru.lm'),
    dic=os.path.join(model_path, 'my_dict.dic')
    # dic=os.path.join(model_path, 'ru.dic')
)
stop = time.process_time()
print('time of init - ' + str(stop - start))

# digits_16000
start = time.process_time()
for _ in speech:
    pass
stop = time.process_time()
print('time of recognizing - ' + str(stop - start))
print(str(speech))
def listenRoutine(self):
    r = sr.Recognizer()
    for each_file in files:
        print(each_file)
        model_path = get_model_path()
        data_path = get_data_path()
        config = {
            'verbose': False,
            'audio_file': os.path.join(os.getcwd(), 'audioFiles', each_file),
            'buffer_size': 2048,
            'no_search': False,
            'full_utt': False,
            'hmm': os.path.join(model_path, 'en-us'),
            'lm': os.path.join(os.getcwd(), "TAR9991/TAR9991/9991.lm"),
            'dict': os.path.join(os.getcwd(), "TAR9991/TAR9991/9991.dic")
        }
        # print(config)
        audio = AudioFile(**config)
        for phrase in audio:
            print(phrase)
        with sr.AudioFile(os.path.join(os.getcwd(), "audioFiles", each_file)) as source2:
            recording = r.record(source2)
            print(r.recognize_google(recording, language="en-EN", show_all=True))
        exit()
        if 0:
            eFile = sr.AudioFile(each_file)
            with eFile as source:
                audio = r.record(source)
            print(each_file, type(audio))
            print(r.recognize_google(audio, language="en-EN", show_all=True))
            # print(r.recognize_sphinx(audio, grammar="TAR9991/TAR9991/"))
            exit()
    print("\r\n\r\n*****\r\nr", r)
    list_text = [
        'a lumpy', 'hey Lumpy', 'lamp', 'Halen', 'Hayden', 'listen',
        'Listen', 'Lampe', 'lampe'
    ]
    stop_flag = True
    duration = 5
    while stop_flag:
        config = {
            'color': {
                'hue': self.hue,
                'saturation': self.saturation
            },
            'brightness': self.brightness,
            'on': self.on_off,
            'client': 'local'
        }
        print(" - mqtt saved:", config)
        our_device = getaudiodevices()
        print("Detected our mic:", our_device)
        with sr.Microphone(device_index=our_device, sample_rate=48000) as source:
            print("Microphone source:", source, source.__dict__.keys(), source.device_index)
            print(" - Call lampi (", duration, "seconds ) ...")
            print("Set minimum energy threshold to {}".format(r.energy_threshold))
            r.adjust_for_ambient_noise(source)
            audio_data = r.record(source, duration=duration)
            # print(type(audio_data))
            filename = "pre_filtered_" + datetime.now().strftime("%H:%M:%S") + ".wav"
            with open(filename, "wb") as audio_file:
                audio_file.write(audio_data.get_wav_data())
            exit()
            # print(" - Recognizing...")
            # convert speech to text
            # text = r.recognize_google(audio_data)
            try:
                text = r.recognize_google(audio_data, language="en-EN")
                print(" - heard: ", text)
                text = text.split(" ")
                for item in text:
                    # print(list_text[i])
                    if item in list_text:
                        print(" - LAMPI detected")
                        pygame.init()
                        pygame.mixer.music.load('this_is_lampi.mp3')
                        pygame.mixer.music.play()
                        time.sleep(3)
                        pygame.mixer.music.fadeout(5)
                        # stop_flag = False
                        self.commandRoutine()
                        break
            except Exception:
                print(" - no word recognized!")
def fsg_search(self, text_snippet, audio_snippet, offset_seconds,
               operation='beginning', option='safe'):
    # create grammar file for the fsg search
    fsg_file = self.generate_fsg(text_snippet, operation)
    # store the name of the file which stores the search results
    fsg_result_file = fsg_file.replace('.jsgf', '.yaml')
    self.fsg_result_files.append(fsg_result_file)
    CONFIG['jsgf'] = fsg_file
    CONFIG['audio_file'] = audio_snippet
    audio = AudioFile(**CONFIG)
    result_sequence = []
    for phrase in audio:
        for s in phrase.seg():
            start_time = s.start_frame / CONFIG['frate']
            end_time = s.end_frame / CONFIG['frate']
            if start_time != end_time and s.word != '<sil>':
                # getting rid of NULL elements and silences
                result_sequence.append((start_time, end_time, s.word))
    with open(fsg_result_file, 'w') as out:
        yaml.dump(result_sequence, out)
    self.remove_file(fsg_file)
    self.remove_file(audio_snippet)
    # should return the best match text snippet with beginning end
    if operation == 'beginning':
        search_snippet = copy(text_snippet)
        match_result, search_snippet_ind = self.find_match(result_sequence, search_snippet)
        # assert that offset_seconds is zero
        if match_result:
            result_seconds = offset_seconds + match_result[0]
            search_snippet = search_snippet[search_snippet_ind[0]:search_snippet_ind[1]]
            self.beginning_word_index = search_snippet_ind[0]
        else:
            if option == 'safe':
                result_seconds, search_snippet = None, []
                self.beginning_word_index = None
            else:
                result_seconds = offset_seconds
                search_snippet = text_snippet
                self.beginning_word_index = 0
    elif operation == 'ending':
        search_snippet = copy(text_snippet)[::-1]
        match_result, search_snippet_ind = self.find_match(result_sequence[::-1], search_snippet)
        if match_result:
            result_seconds = offset_seconds + match_result[1]
            search_snippet = search_snippet[search_snippet_ind[0]:search_snippet_ind[1]][::-1]
            if search_snippet_ind[0] == 0:
                self.ending_word_index = None
            else:
                self.ending_word_index = -1 * search_snippet_ind[0]
        else:
            if option == 'safe':
                result_seconds, search_snippet = None, []
                self.ending_word_index = 0
            else:
                # get result second from the audio_snippet filename
                m = re.search(r'.+_\d+\.\d+_(\d+\.\d+)\.wav', audio_snippet)
                if m:
                    result_seconds = float(m.groups()[0])
                else:
                    result_seconds = end_time  # input total duration
                self.ending_word_index = None
                search_snippet = text_snippet
    else:
        raise ValueError('operation %s not known' % operation)
    return result_seconds, search_snippet
from pocketsphinx import AudioFile

for phrase in AudioFile():
    print(phrase)