import json

from vosk import Model, KaldiRecognizer
from pyaudio import PyAudio, paInt16

# Models: https://github.com/alphacep/vosk-api/blob/master/doc/models.md
P = PyAudio()
stream = P.open(format=paInt16, channels=1, rate=16000, input=True,
                frames_per_buffer=8000)
stream.start_stream()

model = Model('_model')
rec = KaldiRecognizer(model, 16000)

print('Привет, я Баря!')
while True:
    data = stream.read(2000)
    if not len(data):
        break
    if rec.AcceptWaveform(data):
        # json.loads is the safe way to parse the recognizer's JSON output
        # (the original used eval, which executes arbitrary code)
        text = json.loads(rec.Result())
        if text.get('text'):
            print(text.get('text'))
    # else:
    #     text = json.loads(rec.PartialResult())
    #     if text.get('partial'):
    #         print(text['partial'])
# print(rec.FinalResult())
def __init__(self):
    self.model = Model(vosk_model_path)
import json
import os
import sys
import wave

import numpy as np
from vosk import Model, SpkModel, KaldiRecognizer

if not os.path.exists(model_path):
    print("Please download the model from https://github.com/alphacep/vosk-api/blob/master/doc/models.md and unpack as {} in the current folder.".format(model_path))
    exit(1)
if not os.path.exists(spk_model_path):
    print("Please download the speaker model from https://github.com/alphacep/vosk-api/blob/master/doc/models.md and unpack as {} in the current folder.".format(spk_model_path))
    exit(1)

wf = wave.open(sys.argv[1], "rb")
if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
    print("Audio file must be WAV format mono PCM.")
    exit(1)

# Large vocabulary free form recognition
model = Model(model_path)
spk_model = SpkModel(spk_model_path)
rec = KaldiRecognizer(model, spk_model, wf.getframerate())

# We compare speakers with cosine distance. We can keep one or several
# fingerprints for the speaker in a database to distinguish among users.
spk_sig = [
    5.64308, 4.23898, 1.119433, -0.810904, 2.115443, 2.328436,
    6.135152, 1.348195, 2.60771, 1.020717, 4.324225, -0.873012,
    6.123375, 4.903791, 0.064803, 4.66212, 3.502724, 2.535861,
    5.452417, 7.081769, -0.823969, -5.167974, 8.568919, 4.159035,
    5.314441, 3.688272, 5.730379, 4.463213, 7.227232, 3.538961,
    3.316218, 1.269628, -1.902378, 3.512679, -1.947611, -1.520158,
    3.80928, -2.721601, 5.359588, 2.942463, -7.474174, 3.788054,
    0.303426, 4.951366, 1.72281, -1.867125, -3.574615, 3.622509,
    4.803109, 2.829714, 1.528521, 6.408293, 0.820131, 5.066522,
    2.836125, 2.867029, 3.725267, 0.505927, 1.462984, 5.001863,
    -3.838309, -2.45902, 3.992581, 4.451616, 2.865211, -1.148313,
    4.996399, -3.473454, 2.876967, 3.940124, 7.553079, 0.373356,
    1.396561, 2.686691, 2.094895, 0.913796, -0.286909, 3.540179,
    4.904687, 0.84554, 7.585956, 1.017081, 0.168355, 6.672327,
    4.092033, -4.240158, -2.017081, -0.813043, 6.468298, 4.115041,
    2.231936, 2.370055, 4.972295, 5.58382, 6.022872, 2.706988,
    5.248096, -1.918003, 8.259204, -0.900911, 1.961962, 2.349709,
    3.290093, 3.344172, 3.307027, 4.203372, -0.315103, 5.61919,
    -3.229496, 3.777309, 4.328595, 1.461014, 2.622894, 0.315525,
    5.447259, 5.407609, 5.339016, 1.604555, 5.359932, 0.090242,
    0.535306, 4.724705, 4.692502, 0.5783, -5.436688, -4.915511,
    1.959807, 2.825248
]


def cosine_dist(x, y):
    nx = np.array(x)
    ny = np.array(y)
    return 1 - np.dot(nx, ny) / np.linalg.norm(nx) / np.linalg.norm(ny)


while True:
    data = wf.readframes(4000)
    if len(data) == 0:
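        break
    # The original snippet is cut off above. What follows is a hedged sketch
    # based on the vosk speaker-identification example: compare the "spk"
    # x-vector of each result against the stored fingerprint with cosine_dist.
    if rec.AcceptWaveform(data):
        res = json.loads(rec.Result())
        print("Text:", res.get("text", ""))
        if "spk" in res:
            # a smaller cosine distance means "more likely the same speaker"
            print("Speaker distance:", cosine_dist(spk_sig, res["spk"]))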
import asyncio
import concurrent.futures
import os
import sys

from vosk import Model, SpkModel

vosk_sample_rate = float(os.environ.get('VOSK_SAMPLE_RATE', 8000))
spk_model_path = os.environ.get('VOSK_SPK_PATH', '/opt/vosk-model-es/model-spk')

if len(sys.argv) > 1:
    vosk_model_path = sys.argv[1]

# Gpu part, uncomment if vosk-api has gpu support
#
# from vosk import GpuInit, GpuInstantiate
# GpuInit()
# def thread_init():
#     GpuInstantiate()
# pool = concurrent.futures.ThreadPoolExecutor(initializer=thread_init)

model = Model(vosk_model_path)
spk_model = SpkModel(spk_model_path)
pool = concurrent.futures.ThreadPoolExecutor((os.cpu_count() or 1))
loop = asyncio.get_event_loop()


def process_chunk(rec, message):
    if message == '{"eof" : 1}':
        return rec.FinalResult(), True
    elif rec.AcceptWaveform(message):
        return rec.Result(), False
    else:
        return rec.PartialResult(), False


async def recognize(websocket, path):
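    # The original snippet ends at this def. A hedged sketch of the handler
    # body, loosely following the vosk websocket server example: each client
    # gets its own recognizer, chunks are decoded on the thread pool, and the
    # '{"eof" : 1}' sentinel closes the session. Details are assumptions.
    rec = KaldiRecognizer(model, vosk_sample_rate)
    while True:
        message = await websocket.recv()
        response, stop = await loop.run_in_executor(pool, process_chunk, rec, message)
        await websocket.send(response)
        if stop:
            break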
def __init__(self):
    self.model_path = "./model"
    self.set_up()
    self.model = Model(self.model_path)
spk_model_path = "model-spk" if not os.path.exists(spk_model_path): print( "Please download the speaker model from https://alphacephei.com/vosk/models and unpack as {} in the current folder." .format(spk_model_path)) exit(1) wf = wave.open(sys.argv[1], "rb") if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype( ) != "NONE": print("Audio file must be WAV format mono PCM.") exit(1) # Large vocabulary free form recognition model = Model(lang="en-us") spk_model = SpkModel(spk_model_path) #rec = KaldiRecognizer(model, wf.getframerate(), spk_model) rec = KaldiRecognizer(model, wf.getframerate()) rec.SetSpkModel(spk_model) # We compare speakers with cosine distance. We can keep one or several fingerprints for the speaker in a database # to distingusih among users. spk_sig = [ -1.110417, 0.09703002, 1.35658, 0.7798632, -0.305457, -0.339204, 0.6186931, -0.4521213, 0.3982236, -0.004530723, 0.7651616, 0.6500852, -0.6664245, 0.1361499, 0.1358056, -0.2887807, -0.1280468, -0.8208137, -1.620276, -0.4628615, 0.7870904, -0.105754, 0.9739769, -0.3258137, -0.7322628, -0.6212429, -0.5531687, -0.7796484, 0.7035915, 1.056094, -0.4941756, -0.6521456, -0.2238328, -0.003737517, 0.2165709, 1.200186, -0.7737719, 0.492015, 1.16058, 0.6135428, -0.7183084, 0.3153541, 0.3458071, -1.418189,
def main(args):
    if len(args) != 2:
        sys.stderr.write('Usage: analyze.py <path to audio file> <n_clusters>\n')
        sys.exit(1)
    """
    Initialize Config
    input:
        n_clusters: Integer set by a user
        text_processor: by default it is set to nltk.stem.snowball.SnowballStemmer
        sample_rate: by default set to 16 kHz due to ASR model specs
        aggressivness: required for VAD, by default set to maximum=3 as audiofiles are long
    """
    config = Config(n_clusters=int(args[1]))

    print("If you want to check any specific target vocabulary, please type them\n",
          "Ex.: train, dog, work, seventeen, Brazil\n",
          "Otherwise, hit enter to skip")
    try:
        lesson_vocabulary = input().lower()
    except SyntaxError:
        pass

    lesson = LessonSegment(
        lesson_vocabulary,  # target_vocabulary
        read_audio(args[0], config.sample_rate)  # audio to get pcm_data
    )
    # update lesson dictionary to collect statistics
    lesson.update_dictionary(config.text_processor)

    # VAD
    vad = webrtcvad.Vad(config.aggressivness)
    frames = frame_generator(30, lesson.bytes, config.sample_rate)
    frames = list(frames)
    segments = vad_collector(config.sample_rate, 10, 150, vad, frames)

    # ASR
    asr = KaldiRecognizer(Model("model"), config.sample_rate)

    # store LessonSegment instances
    lesson_segments = []
    # store static tempo and pitch of each LessonSegment
    features = []
    for segment in segments:
        seg = LessonSegment('', segment)
        seg.transcribe(asr)
        features.append(seg.get_features(config.sample_rate))
        lesson_segments.append(seg)

    # Clustering
    features = MinMaxScaler().fit_transform(np.array(features))
    cl = GaussianMixture(n_components=config.n_clusters, covariance_type='full')
    clusters = cl.fit_predict(features)

    # Resegmentation - create empty n*LessonSegments
    segments = [LessonSegment('', b'') for n in range(config.n_clusters)]
    for i, cluster in enumerate(clusters):
        cluster = int(cluster)
        segments[cluster].bytes += lesson_segments[i].bytes
        segments[cluster].transcript.extend(lesson_segments[i].transcript)

    [segment.get_staistics(lesson.dictionary) for segment in segments]

    for i, segment in enumerate(segments):
        path = 'resegmentation/cluster-%002d.mp3' % (i, )
        print('Writing %s' % (path, ))
        write_audio(path, segment.bytes, config.sample_rate)
        print("\n", segment.statistics, "\n")
# View & Screen example view = View() stream = open('queue.json', 'wt') p = pyaudio.PyAudio() CHANNELS = 1 RATE = 16000 CHUNK = 8000 audio_stream = p.open(format=pyaudio.paInt16, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK) audio_stream.start_stream() model = Model("models/ru") rec = KaldiRecognizer(model, RATE) phrase = [] def get_product_id(name): for key in product_codes: if fuzz.ratio(key, name) >= 70: return product_codes[key] return -1 while True: data = audio_stream.read(CHUNK) if len(data) == 0: break
def main():
    flag_verbose = True
    model_path_kaldi_local = os.path.join('data', 'models', 'kaldi-ru-0.6')
    folder_record = os.path.join('/', 'mnt', 'monitor', '2020-01-28')

    #### dict users #########################
    users = {'903*******': 'user1', '903*******': 'user2'}
    ########################################

    path_cities_all = os.path.join('data', 'pop_cities_all_2019.xlsx')
    path_privet_wav = os.path.join('data', 'sounds', 'privet.wav')
    path_repeat_answer_wav = os.path.join('data', 'sounds', 'repeat_answer.wav')
    path_what_city_answer_wav = os.path.join('data', 'sounds', 'what_city.wav')
    path_bye_wav = os.path.join('data', 'sounds', 'bye.wav')

    step_duration = 0.5
    sample_rate, sample_size_bytes = 8000, 2
    model_vosk = Model(os.path.join(
        'data', 'models', 'alphacep-model-android-ru-0.3'))  # although the model is 16 kHz only
    kaldi_rec_vosk = KaldiRecognizer(model_vosk, sample_rate)
    energy_threshold_in = 100  # for the person answering the call
    energy_threshold_out = 1  # recording or Google TTS

    output_folder = 'output'
    if not os.path.exists(output_folder):
        os.mkdir(output_folder)
    writer = pd.ExcelWriter(
        os.path.join(output_folder, folder_record.split('/')[-1] + '_.xlsx'))

    wav_info = WavTools()
    # get the durations of the audio greetings, accurate to step_duration
    privet_dur = wav_info.get_duration_wav(path_privet_wav, step_duration)
    repeat_answer_dur = wav_info.get_duration_wav(path_repeat_answer_wav, step_duration)
    what_city_answer_dur = wav_info.get_duration_wav(path_what_city_answer_wav, step_duration)
    bye_dur = wav_info.get_duration_wav(path_bye_wav, step_duration)

    df_citi_all = pd.read_excel(path_cities_all)
    lst_cities = df_citi_all['gor'].values.tolist()
    lst_sub = df_citi_all['sub'].values.tolist()
    del df_citi_all
    # keep only Cyrillic characters
    lst_cities = [re_only_text.sub(' ', re_cyr_only.sub(' ', i)).lower()
                  for i in lst_cities]
    # normalize city names to their base (dictionary) form
    lst_cities = [' '.join([morph.normal_forms(j)[0] for j in i.split()])
                  for i in lst_cities]
    lst_cities = np.unique(lst_cities).tolist()
    lst_sub = [re_only_text.sub(' ', re_cyr_only.sub(' ', i)).lower()
               for i in lst_sub]
    lst_sub = [' '.join([morph.normal_forms(j)[0] for j in i.split()])
               for i in lst_sub]
    lst_sub = np.unique(lst_sub).tolist()

    if flag_verbose:
        print(100 * '#')
        print(f'duration of the sound files in seconds, rounded to {step_duration} seconds:')
        print(f'privet.wav = {privet_dur}')
        print(f'repeat_answer.wav = {repeat_answer_dur}')
        print(f'what_city.wav = {what_city_answer_dur}')
        print(f'bye.wav = {bye_dur}')
        print(100 * '#')
        print('list of cities:')
        print(', '.join(lst_cities))

    for address, dirs, files in os.walk(folder_record):
        lst_wav = []
        for file_wav in files:
            if file_wav.endswith('wav'):
                lst_wav.append(file_wav)
        sheet_name = address.split('/')[-1]
        if lst_wav:
            lst_wav = sorted(lst_wav)
            lst_wav = [(lst_wav[i], lst_wav[i + 1])
                       for i in range(0, len(lst_wav), 2)]
        else:
            continue

        df_info_wav = pd.DataFrame(columns=[
            'id', 'user', 'number', 'time', 'duration', 'privet_question',
            'privet_answer', 'repeat_question', 'repeat_answer',
            'city_question', 'city_answer', 'bye', 'simultaneously_with_bot',
            'Результат', 'Update', 'Комментарий'
        ])
        df_city_answer = pd.DataFrame(columns=[
            'id', 'user', 'number', 'time', 'kaldi_docker', 'kaldi_local',
            'kaldi_vosk', 'google', 'city_isincluded_kaldi_docker',
            'city_isincluded_kaldi_local', 'city_isincluded_kaldi_vosk',
            'city_isincluded_google'
        ])

        for item in lst_wav:
            temp_dict = {k: '' for k in df_info_wav.columns.tolist()}
            temp_dict_city = {k: '' for k in df_city_answer.columns.tolist()}
            abonent = item[0].split('-')[2][1:]
            temp_dict['id'] = '-'.join(item[0].split('-')[:-1])
            temp_dict['user'] = users[abonent]
            temp_dict['number'] = abonent
            part_data = ''.join(item[0].split('-')[:2])
            date_time_str = (part_data[:4] + '-' + part_data[4:6] + '-' +
                             part_data[6:8] + ' ' + part_data[8:10] + ':' +
                             part_data[10:12] + ':' + part_data[12:14])
            date_time_obj = datetime.strptime(date_time_str, '%Y-%m-%d %H:%M:%S')
            temp_dict['time'] = date_time_obj
            in_wav = item[0]
            out_wav = item[1]
            in_dur = wav_info.get_duration_wav(os.path.join(address, in_wav),
                                               step_duration)
            temp_dict['duration'] = in_dur
            # seconds when the caller was speaking
            lst_time_say_in = wav_info.get_energy_say(
                os.path.join(address, in_wav), sample_rate, sample_size_bytes,
                step_duration, energy_threshold_in)
            # seconds when the bot was asking a question
            lst_time_say_out = wav_info.get_energy_say(
                os.path.join(address, out_wav), sample_rate, sample_size_bytes,
                step_duration, energy_threshold_out)
            # convert the timestamps into intervals when the caller was speaking
            period_say_in = wav_info.find_period_say(lst_time_say_in, step_duration)
            # convert the timestamps into intervals when the bot was speaking
            period_say_out = wav_info.find_period_say(lst_time_say_out, step_duration)

            out_dict = {}
            count_none = 0
            for item in period_say_out:
                if (item[1] - item[0]) == privet_dur:
                    # the greeting played in full
                    out_dict['privet'] = item
                elif (item[1] - item[0]) == what_city_answer_dur or \
                        (item[1] - item[0]) == int(what_city_answer_dur) + 1:
                    # the "what city" question played in full
                    out_dict['what_city'] = item
                elif (item[1] - item[0]) == repeat_answer_dur or \
                        (item[1] - item[0]) == int(repeat_answer_dur) + 1:
                    # the "please repeat" prompt played in full
                    out_dict['repeat'] = item
                elif (item[1] - item[0]) == bye_dur:
                    # the goodbye played in full
                    out_dict['bye'] = item
                else:
                    # unrecognized interval
                    out_dict['none_' + str(count_none)] = item
                    count_none += 1
            del count_none
            # dictionary of the bot's time intervals, sorted by start time
            out_dict = dict(sorted(out_dict.items(), key=lambda x: x[1]))
            # number of bytes in one step_duration step
            count_bytes = int(sample_rate * sample_size_bytes * step_duration)

            if 'privet' in out_dict.keys():
                temp_dict['privet_question'] = 1
                # time when the greeting ended
                count_start_answer = out_dict['privet'][1] / step_duration
                # start time of the question following the greeting
                count_stop_answer = out_dict[list(out_dict.keys())[
                    list(out_dict.keys()).index('privet') + 1]][0] / step_duration
                temp_dict['privet_answer'] = rcgn_kaldi_docker(
                    path_wav=os.path.join(address, in_wav),
                    count=count_bytes,
                    count_start_answer=count_start_answer,
                    count_stop_answer=count_stop_answer)
            else:
                temp_dict['privet_question'] = 0

            if 'repeat' in out_dict.keys():
                temp_dict['repeat_question'] = 1
                # time when the "please repeat" prompt ended
                count_start_answer = out_dict['repeat'][1] / step_duration
                # start time of the next question
                count_stop_answer = out_dict[list(out_dict.keys())[
                    list(out_dict.keys()).index('repeat') + 1]][0] / step_duration
                temp_dict['repeat_answer'] = rcgn_kaldi_docker(
                    path_wav=os.path.join(address, in_wav),
                    count=count_bytes,
                    count_start_answer=count_start_answer,
                    count_stop_answer=count_stop_answer)
            else:
                temp_dict['repeat_question'] = 0

            if 'what_city' in out_dict.keys():
                temp_dict['city_question'] = 1
                # there may be several phrases:
                #   count_start_answer - always right after the bot's speech
                #   count_stop_answer  - taken from period_say_in
                # time when the "what city" question ended
                count_start_answer = out_dict['what_city'][1] / step_duration
                # count_start_answer = int([i for i in period_say_in if i[0] >= out_dict['what_city'][1]][0][0] / step_duration)
                # [i for i in period_say_in if i[0] >= out_dict['what_city'][1]] - all periods
                count_stop_answer = out_dict[list(out_dict.keys())[
                    list(out_dict.keys()).index('what_city') + 1]][0] / step_duration
                # count_stop_answer = int([i for i in period_say_in if i[0] >= out_dict['what_city'][1]][0][-1] / step_duration) + 2
                temp_dict['city_answer'] = rcgn_kaldi_docker(
                    path_wav=os.path.join(address, in_wav),
                    count=count_bytes,
                    count_start_answer=count_start_answer,
                    count_stop_answer=count_stop_answer)
                temp_dict_city['id'] = temp_dict['id']
                temp_dict_city['user'] = temp_dict['user']
                temp_dict_city['time'] = temp_dict['time']
                temp_dict_city['kaldi_docker'] = temp_dict['city_answer']
                frame_data = wav_info.get_part_wav_bytes(
                    path_wav=os.path.join(address, in_wav),
                    count=count_bytes,
                    count_start_answer=count_start_answer,
                    count_stop_answer=count_stop_answer)
                temp_dict_city['kaldi_vosk'] = rcgn_kaldi_vosk(
                    frame_data=frame_data, kaldi_rec_vosk=kaldi_rec_vosk)
                temp_dict_city['kaldi_local'] = rcgn_kaldi_local(
                    model_path=model_path_kaldi_local,
                    frame_data=frame_data,
                    sample_rate=sample_rate,
                    sample_size_bytes=sample_size_bytes)
                temp_dict_city['google'] = rcgn_google(
                    frame_data=frame_data,
                    sample_rate=sample_rate,
                    sample_size_bytes=sample_size_bytes)
                ###### city is included #####
                temp_dict_city['city_isincluded_kaldi_docker'] = check_city_isincluded(
                    temp_dict_city['kaldi_docker'], lst_cities, lst_sub)
                temp_dict_city['city_isincluded_kaldi_local'] = check_city_isincluded(
                    temp_dict_city['kaldi_local'], lst_cities, lst_sub)
                temp_dict_city['city_isincluded_kaldi_vosk'] = check_city_isincluded(
                    temp_dict_city['kaldi_vosk'], lst_cities, lst_sub)
                temp_dict_city['city_isincluded_google'] = check_city_isincluded(
                    temp_dict_city['google'], lst_cities, lst_sub)
            else:
                temp_dict['city_question'] = 0

            if 'bye' in out_dict.keys():
                temp_dict['bye'] = 1
            else:
                temp_dict['bye'] = 0

            # caller intervals that fall entirely inside the bot's intervals
            period_speak_bot = [
                item_in for item_in in period_say_in
                if any(item_out[0] <= item_in[0] and item_out[1] >= item_in[1]
                       for item_out in period_say_out)
            ]
            if period_speak_bot:
                speak_bot_text = []
                for item in period_speak_bot:
                    count_start_answer = item[0] / step_duration
                    count_stop_answer = (item[1] / step_duration) + 2
                    speak_bot_text.append(
                        rcgn_kaldi_docker(
                            path_wav=os.path.join(address, in_wav),
                            count=count_bytes,
                            count_start_answer=count_start_answer,
                            count_stop_answer=count_stop_answer))
                speak_bot_text = [item for item in speak_bot_text if item != str()]
                if speak_bot_text:
                    temp_dict['simultaneously_with_bot'] = ';'.join(speak_bot_text)
                else:
                    temp_dict['simultaneously_with_bot'] = 1
            else:
                temp_dict['simultaneously_with_bot'] = 0

            df_info_wav = df_info_wav.append(temp_dict, ignore_index=True)
            if temp_dict['city_question'] == 1:
                df_city_answer = df_city_answer.append(temp_dict_city,
                                                       ignore_index=True)

        df_info_wav.to_excel(writer, index=False, sheet_name=str(sheet_name))
        df_city_answer.to_excel(writer, index=False,
                                sheet_name='city_answer' + str(sheet_name))
    writer.save()
    writer.close()
from vosk import Model, KaldiRecognizer
from pathlib import Path
import argparse
import wave
import json
from os import remove as del_file

from speach_text.smart_search import find_text_in_models
import speach_text.global_settings as GST

model = Model(GST.voice_model_folder)


class ErrWav(Exception):
    def __init__(self, text):
        self.txt = text


if __name__ == '__main__':
    # 'python demon_voice.py --string demon' - start endless processing
    parser = argparse.ArgumentParser()
    parser.add_argument('--string', type=str, default='', help='')
    opt = parser.parse_args()
    while True:
        all_wav_file = Path(GST.voice_wav_folder).rglob('*.{}'.format(GST.wav_extension))
        for file_wav in all_wav_file:
            file_txt = str(file_wav.name).replace(GST.wav_extension, GST.text_extension)
def main():
    configuration = Configuration("config/config.yaml")
    if not os.path.exists("model/" + configuration.config_list["language"]):
        print("Please download the model from https://github.com/alphacep/vosk-api/blob/master/doc/models.md and unpack as 'model' in the current folder.")
        exit(1)

    configuration.generate_nlu_file()

    ## HOTWORD
    hotword = Hotword(configuration.config_list["hotword"])

    ## TEXT TO SPEECH
    tts = Tts()
    tts.setVoice(configuration.config_list["voice_id"])

    ## PYAUDIO
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True,
                    frames_per_buffer=8000)
    stream.start_stream()

    ## VOSK
    model = Model("model/" + configuration.config_list["language"])
    rec = KaldiRecognizer(model, 16000)

    ## SNIPS
    nlu = Nlu("nlu/" + configuration.config_list["language"] + "/dataset.json")

    # Load plugins
    plugin_directories = [os.path.normpath('plugins')]
    plugins_list = PluginList(plugin_directories)
    plugins_list.find_plugins()

    while True:
        data = stream.read(8000, exception_on_overflow=False)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            rec_result = json.loads(rec.Result())
            if rec_result["text"].count(hotword.getWord()) > 0:
                tts.speak(configuration.config_list["sentence_welcome"])
                hotword.setState(True)
            if hotword.getState() == True:
                if rec_result["text"] != "":
                    parsing = nlu.parse(rec_result["text"])
                    if parsing["intent"]["probability"] >= configuration.config_list["min_probability"]:
                        for plugin in plugins_list._plugins:
                            plugin_object = plugins_list._plugins[plugin].plugin_class
                            if plugin_object.has_intent(parsing["intent"]["intentName"]) == True:
                                response = plugin_object.get_response(
                                    parsing["intent"]["intentName"], parsing["slots"])
                                tts.speak(response)
                                hotword.setState(False)
                    elif parsing["intent"]["intentName"] == None:
                        hotword.setState(True)
                    else:
                        tts.speak("je ne suis pas sur d'avoir compris, peux-tu répéter?")
from ..dirs import MARKUP_TXT
from vosk import Model, KaldiRecognizer, SetLogLevel
import sys
import wave
import json

SetLogLevel(-1)

model = Model("models/kaldi_vosk")


def write_file(data, name):
    with open(f'{MARKUP_TXT}/{name}.txt', 'w', encoding='utf-8') as f:
        for word in data:
            f.write(str([*word.values()])[1:-1] + '\n')


def write_file_text(data, name):
    with open(f'{MARKUP_TXT}/{name}.txt', 'w', encoding='utf-8') as f:
        for word in data:
            f.write(word['word'] + ' ')


def parse_json(data):
    data = json.loads(data)
    for sample in data['result']:
        del sample['conf']
    return data['result']


def creat_text(path):
    wf = wave.open(path, "rb")
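    # The original function is cut off above. A hedged continuation based on
    # the standard vosk file-transcription loop; SetWords(True) makes Result()
    # carry the per-word 'result' entries that parse_json() expects. This
    # completion is an assumption, not the original code.
    rec = KaldiRecognizer(model, wf.getframerate())
    rec.SetWords(True)
    words = []
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            res = rec.Result()
            if 'result' in json.loads(res):
                words.extend(parse_json(res))
    final = rec.FinalResult()
    if 'result' in json.loads(final):
        words.extend(parse_json(final))
    return words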
def transcribe_to_sql(self, duration, side, original_file_name, rec_date,
                      src, dst, linkedid):
    trans_start = time.time()  # datetime.datetime.now()
    if self.source_id == self.sources['master']:
        original_file_name = linkedid + ('-in.wav' if side == 0 else '-out.wav')
    transcribation_date = datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
    print('transcribing', self.temp_file_path + self.temp_file_name)

    # read file
    wf = wave.open(self.temp_file_path + self.temp_file_name, "rb")

    # read model
    model = Model(self.model_path)
    rec = KaldiRecognizer(model, wf.getframerate())

    # recognizing
    phrases_count = 0
    confidences = []
    while True:
        conf_score = []
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            accept = json.loads(rec.Result())
            if accept['text'] != '':
                accept_start = str(accept['result'][0]['start'])
                accept_end = accept['result'][-1:][0]['end']
                accept_text = str(accept['text'])
                for result_rec in accept['result']:
                    conf_score.append(float(result_rec['conf']))
                conf_mid = str(sum(conf_score) / len(conf_score))
                confidences.append(sum(conf_score) / len(conf_score))
                # conf_score = []
                self.save_result(duration, accept_text, accept_start,
                                 accept_end, side, transcribation_date,
                                 conf_mid, original_file_name, rec_date,
                                 src, dst, linkedid)
                phrases_count += 1

    if len(confidences):
        self.confidence_of_file = sum(confidences) / len(confidences)
    else:
        self.confidence_of_file = 0

    trans_end = time.time()  # datetime.datetime.now()
    self.perf_log(2, trans_start, trans_end, duration, linkedid)

    if phrases_count == 0:
        self.save_result(duration, '', '0', '0', side, transcribation_date,
                         0, original_file_name, rec_date, src, dst, linkedid)
import ffmpeg
import json
import os
import shutil
import time
from vosk import Model, KaldiRecognizer
import wave

PROJECT_PATH = os.getcwd()
AUDIO_RECORDINGS = PROJECT_PATH + "/audio_examples/"
REPORT_PATH = PROJECT_PATH + "/VOSK Speech Recognition"
# https://alphacephei.com/vosk/models
MODEL = Model("vosk-model-ru-0.10")
REPORT_LINE_WIDTH = 100

# Variables for the resulting report
number_audio = 0
total_time = 0
total_recognized_words = 0

# Creating a folder with the final transcription report
shutil.rmtree(REPORT_PATH, ignore_errors=True)
os.makedirs(REPORT_PATH)


# Formatting the transcribed text for the report
def recognition_report():
    count_word = 0
    for word in transcript.split():
        count_word += 1
        if (len(word) + recognition_report.length > REPORT_LINE_WIDTH):
            audio_report.write('\n')
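            # The original is cut off above. A hedged continuation of the
            # apparent word-wrapping logic: reset the running line length
            # (kept in the recognition_report.length function attribute)
            # after a line break, then emit the word. The initialization of
            # recognition_report.length elsewhere is an assumption.
            recognition_report.length = 0
        audio_report.write(word + ' ')
        recognition_report.length += len(word) + 1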
#!/usr/bin/env python3

from vosk import Model, KaldiRecognizer
import pyaudio

model = Model('model')
rec = KaldiRecognizer(model, 16000)

p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True,
                frames_per_buffer=8000)
stream.start_stream()

while True:
    data = stream.read(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        print(rec.Result())
    else:
        print(rec.PartialResult())

print(rec.FinalResult())
def trigger_microphone(n_clicks):
    if n_clicks == 0:
        return ''
    print('trigger microphone %d' % n_clicks)

    import termux
    termux.Microphone.stop()
    pwd = os.environ['PWD']
    aac_file = "%s/microphone.aac" % pwd
    wave_file = "%s/microphone.wave" % pwd
    if os.path.exists(aac_file):
        os.remove(aac_file)
    termux.Microphone.record(aac_file, encoder='aac', limit=5, count=2)

    import time
    time.sleep(6)
    os.system('faad -o %s %s' % (wave_file, aac_file))

    if False:
        import speech_recognition as sr
        r = sr.Recognizer()
        with sr.WavFile(wave_file) as source:
            audio = r.record(source)
        text = r.recognize_sphinx(audio)
    else:
        from vosk import Model, KaldiRecognizer, SetLogLevel
        import wave
        import json
        import numpy as np

        model_name = 'vosk-model-small-en-us-0.15'
        if not os.path.exists(model_name):
            os.system('wget http://alphacephei.com/vosk/models/%s.zip' % model_name)
            os.system('unzip %s.zip' % model_name)
        wf = wave.open(wave_file, "rb")
        model = Model(model_name)
        rec = KaldiRecognizer(model, wf.getframerate())

        # extract the first channel from the (possibly multi-channel) wav
        nch = wf.getnchannels()
        depth = wf.getsampwidth()
        typ = {1: np.uint8, 2: np.uint16, 4: np.uint32}.get(depth)
        sdata = wf.readframes(64000)
        data = np.frombuffer(sdata, dtype=typ)
        ch_data = data[0::nch]
        sdata = ch_data.tobytes()

        if True:
            outwav = wave.open('good.wave', 'w')
            outwav.setparams(wf.getparams())
            outwav.setnchannels(1)
            outwav.writeframes(ch_data.tobytes())
            outwav.close()

        if rec.AcceptWaveform(sdata):
            result = json.loads(rec.Result())
            text = result['text']
        else:
            result = json.loads(rec.PartialResult())
            text = result['partial']
        result = json.loads(rec.FinalResult())
        text += result['text']

    print('finish microphone')
    print('text:%s' % text)
    return text
def my_link():
    time.sleep(15)
    c = '0' + '.wav'
    counter = 0
    conn = connect()
    model = Model("vosk-model-small-en-in-0.4")
    pth = os.listdir(audio_path)
    while (c in pth):
        wf = wave.open(audio_path + '/' + c, 'rb')
        rec = KaldiRecognizer(model, wf.getframerate())
        while True:
            data = wf.readframes(CHUNK)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                pass
        # changing the string to dictionary
        result_dict = ast.literal_eval(rec.FinalResult())
        print(c)
        # print(result_dict["text"])
        s = result_dict["text"]
        print(s)

        # Text classification starts
        temp = remove_punct(s)
        temp = tknz_text(temp)
        temp = remove_stopwords(temp)
        temp = stmng(temp)

        # for removing punctuation
        puncs = set(['"', '(', ')', '.', ',', '-', '<', '>', '/', '\',%',
                     '\\x', '!', '?', "'", 's'])
        temp2 = []
        for i in temp:
            if i[0].isalpha() == True:
                temp2.append(i)
        # for removing spaces
        temp1 = []
        for i in temp2:
            if i not in ("", '', " ", ' '):
                temp1.append(i)
        print(temp1)

        fg, word = check_list(conn, temp1)
        if fg == 1:
            print("Abusive Detected")
        else:
            print("Normal Text")
        print()
        # close_the_connection(conn)
        time.sleep(0.01)
        counter = counter + 1
        c = '0' + ' ' + '(' + str(counter) + ')' + '.wav'
        pth = os.listdir(audio_path)
        # print(pth, "-->", c)

    time.sleep(5)
    try:
        delete()
    except:
        return render_template('index.html')
    return render_template('index.html')
print(f'Voice model: {config.model}')
print(f'Full voice model: {config.fullModel}')
if bool(config.spkModel):
    print(f'Speaker identification model: {config.spkModel}')
else:
    print('Speaker identification disabled')

#region Load models
model = None
gModel = None
spkModel = None

if bool(config.model):
    print()
    print("=========== Loading the main voice model ===========")
    model = Model(config.model)
    if model == None:
        fatalError(f'Error loading the voice model {config.model}')

if bool(config.gModel):
    if config.gModel == config.model:
        gModel = model
    else:
        print()
        print("===== Loading the model for vocabulary-constrained recognition ======")
        gModel = Model(config.gModel)
        if gModel == None:
            fatalError(f'Error loading the voice model for vocabulary-constrained recognition {config.gModel}')
import os
import sys
import wave

from vosk import Model, KaldiRecognizer

if not os.path.exists("model-en"):
    print("Please download the model from https://github.com/alphacep/kaldi-android-demo/releases and unpack as 'model' in the current folder.")
    exit(1)

wf = wave.open(sys.argv[1], "rb")
if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
    print("Audio file must be WAV format mono PCM.")
    exit(1)

model = Model("model-en")
# You can also specify the possible word list
rec = KaldiRecognizer(model, wf.getframerate(),
                      "zero oh one two three four five six seven eight nine")

while True:
    data = wf.readframes(1000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        print(rec.Result())
    else:
        print(rec.PartialResult())

print(rec.FinalResult())
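# Note (an assumption based on the current vosk-api examples, not part of the
# original snippet): recent vosk releases pass the word list as a JSON array
# string, with "[unk]" catching out-of-vocabulary words, e.g.:
#
# rec = KaldiRecognizer(
#     model, wf.getframerate(),
#     '["zero oh one two three four five six seven eight nine", "[unk]"]')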
def __init__(self): self.model = Model("model")
# elif 'курс биткоин' in zadanie:
#     response = requests.get("https://api.coinmarketcap.com/v1/ticker/bitcoin/")
#     response_json = response.json()
#     talk(response_json[0]['price_usd'].split('.')[0] + " долларов")
# elif 'погода питер' in zadanie:
#     res = requests.get("http://api.openweathermap.org/data/2.5/find?q=Petersburg,RU&type=like&APPID=f98abe5235a919f50fc6536fbaa383ca")
#     data = res.json()
#     cities = ["{} ({})".format(d['name'], d['sys']['country'])
#               for d in data['list']]
#     print("city:", cities)
# elif 'имя' in zadanie:
#     talk("Меня зовут Курису.")

if __name__ == "__main__":
    model = Model("models/model-ru")
    rec = KaldiRecognizer(model, 16000)
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True,
                    frames_per_buffer=8000)
    stream.start_stream()

    while True:
        data = stream.read(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            commander_start(rec)
        # else:
        #     print(rec.PartialResult())
        }
        color_chosen = switcher.get(text, "grey")
        if color_chosen != "grey":
            self['bg'] = color_chosen


def listen_microphone():
    while True:
        data = stream.read(8000, exception_on_overflow=False)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            result = json.loads(rec.Result())
            color_change.change_background_color(result["text"])


if __name__ == "__main__":
    color_change = ColorChange()

    ## PYAUDIO
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True,
                    frames_per_buffer=8000)
    stream.start_stream()

    ## VOSK
    model = Model("model/fr_FR")
    rec = KaldiRecognizer(model, 16000)

    thread = threading.Thread(target=listen_microphone)
    thread.daemon = True
    thread.start()

    color_change.mainloop()
def __init__(self, model_path, text_processor=None):
    SetLogLevel(-1)
    self.vosk_model = Model(model_path)
    self.text_processor = text_processor
    self.sample_rate = 16000
def __init__(self): model = Model("/home/pi/Documents/DOORS/modules/model") self.rec = KaldiRecognizer(model, 8000)
    if useLM == True:
        from huggingsound import ParlanceLMDecoder
        LmModelFolder = HSSttModelFolder + "/language_model/"
        lm_path = LmModelFolder + "lm.binary"
        unigrams_path = LmModelFolder + "unigrams.txt"
        # To use this decoder you'll need to install Parlance's ctcdecode first
        # (https://github.com/parlance/ctcdecode)
        print('Starting to load LM file %s in ParlanceLMDecoder ...' % lm_path)
        decoder = ParlanceLMDecoder(HSmodel.token_set, lm_path=lm_path,
                                    alpha=2, beta=1, beam_width=100)
        # decoder = KenshoLMDecoder(model.token_set, lm_path=lm_path,
        #                           unigrams_path=unigrams_path,
        #                           alpha=2, beta=1, beam_width=100)
        print("Finished loading Language Model")
except Exception as e:
    print('Could not load acoustic model. Failed with message: %s' % e)
    sys.exit(-1)

print('Loading Vosk Model from folder: %s' % VoskModelFolder)
VoskModel = Model(VoskModelFolder)
rec = KaldiRecognizer(VoskModel, sampleRate)
rec.SetWords(True)

# COMMAND ----------

!ls -R /dbfs/FileStore/output/Spanish/Spanish_Conversational_Speech_Corpus/CTM/
!rm -rf /dbfs/FileStore/output/Spanish/Spanish_Conversational_Speech_Corpus/CTM/
!mkdir /dbfs/FileStore/output/Spanish/Spanish_Conversational_Speech_Corpus/CTM/

# COMMAND ----------

HSOutFolder = outFolder + "/wav2vec2"
VoskOutFolder = outFolder + "/vosk"
if not os.path.exists(HSOutFolder):
    os.mkdir(HSOutFolder)
from vosk import Model, KaldiRecognizer
import os
import urllib.request
import uuid
from http.server import HTTPServer, BaseHTTPRequestHandler
from io import BytesIO

MODEL_PATH = "model/"
AUDIO_PATH = "audio/"
SAMPLE_FREQUENCY = 16000

model = Model(MODEL_PATH)


def download_temp_file(url, dir_name):
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    filename = os.path.join(dir_name, str(uuid.uuid4()))
    urllib.request.urlretrieve(url, filename=filename)
    return filename


def remove_temp_file(filename):
    if os.path.exists(filename):
        os.remove(filename)
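# Hedged sketch (not in the original): one way the helpers above might be
# combined to transcribe a remote file with the module-level model. The
# function name and the mono 16-bit WAV input assumption are illustrative,
# not taken from the source.
import json
import wave


def transcribe_url(url):
    filename = download_temp_file(url, AUDIO_PATH)
    try:
        wf = wave.open(filename, "rb")
        rec = KaldiRecognizer(model, wf.getframerate())
        parts = []
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                parts.append(json.loads(rec.Result())["text"])
        parts.append(json.loads(rec.FinalResult())["text"])
        return " ".join(p for p in parts if p)
    finally:
        remove_temp_file(filename)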
from vosk import Model, KaldiRecognizer, SetLogLevel
import sys
import os
import wave
import json

SetLogLevel(0)

if not os.path.exists("model"):
    print("Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder.")
    exit(1)

wf = wave.open(sys.argv[1], "rb")
if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
    print("Audio file must be WAV format mono PCM.")
    exit(1)

model = Model("model")
rec = KaldiRecognizer(model, wf.getframerate())
rec.SetMaxAlternatives(10)

while True:
    data = wf.readframes(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        print(json.loads(rec.Result()))
    else:
        print(json.loads(rec.PartialResult()))

print(json.loads(rec.FinalResult()))
def my_link(): print("entered into function for processing") time.sleep(15) c = '0' + '.wav' counter = 0 conn = connect() model = Model("vosk-model-small-en-in-0.4") pth = os.listdir(audio_path) #print(pth) print("entering into while loop") while (c in pth): #sound = AudioSegment.from_wav('C:/Users/admin/Downloads/' + i ) #sound = sound.set_channels(1) # To make it MONO Channel #sound = sound.set_frame_rate(44100) # Sample Frame rate taken here = 44,100 Hz #sound.export('C:/Users/admin/Downloads/' + i , format="wav") wf = wave.open(audio_path + '/' + c, 'rb') rec = KaldiRecognizer(model, wf.getframerate()) while True: data = wf.readframes(CHUNK) if len(data) == 0: break if rec.AcceptWaveform(data): pass dict = ast.literal_eval( rec.FinalResult()) #changing the string to dictionary print(c) s = dict["text"] print(s) #integrate tc model temp = remove_punct(s) temp = tknz_text(temp) temp = remove_stopwords(temp) temp = stmng(temp) #for removing punctions puncs = set([ '"', '(', ')', '.', ',', '-', '<', '>', '/', '\',%', '\\x', '!', '?', "'", 's' ]) temp2 = [] for i in temp: if i[0].isalpha() == True: temp2.append(i) #for removing spaces temp1 = [] for i in temp2: if i not in ("", '', " ", ' '): temp1.append(i) fg, word = check_list(conn, temp1) if fg == 1: #flash("Abusive Detected") print("Abusive Detected") else: print("Normal Text") print() #close_the_connection(conn) time.sleep(0.01) counter = counter + 1 c = '0' + ' ' + '(' + str(counter) + ')' + '.wav' pth = os.listdir(audio_path) time.sleep(5) print("exiting while loop") delete() return redirect('http://127.0.0.1:5000/')
import os
import sys
import subprocess
import codecs
import datetime

from vosk import Model, KaldiRecognizer, SetLogLevel

SetLogLevel(-1)

os.chdir(sys.argv[1])
if not os.path.exists(sys.argv[2]):
    print("Please download the model from https://alphacephei.com/vosk/models and unpack as ",
          sys.argv[2], " in the current folder.")
    exit(1)

sample_rate = 16000
model = Model(sys.argv[2])
rec = KaldiRecognizer(model, sample_rate)
rec.SetWords(True)

# zone rendering
if len(sys.argv) > 4 and (float(sys.argv[4]) > 0 or float(sys.argv[5]) > 0):
    process = subprocess.Popen([
        'ffmpeg', '-loglevel', 'quiet', '-i', sys.argv[3],
        '-ss', sys.argv[4], '-t', sys.argv[5],
        '-ar', str(sample_rate), '-ac', '1', '-f', 's16le', '-'
    ], stdout=subprocess.PIPE)
else:
    process = subprocess.Popen([
        'ffmpeg', '-loglevel', 'quiet', '-i', sys.argv[3],
        '-ar', str(sample_rate), '-ac', '1', '-f', 's16le', '-'
    ], stdout=subprocess.PIPE)
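# Hedged sketch (the original snippet cuts off after the Popen setup): the
# standard vosk decode loop over the ffmpeg pipe, as in the vosk-api ffmpeg
# example. Printing the JSON results is an assumption about what the original
# did with them.
import json

while True:
    data = process.stdout.read(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        print(json.loads(rec.Result()))

print(json.loads(rec.FinalResult()))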
def detectKeywords(libpath):
    audio_stream = AudiostreamSource()
    extractor = FeatureExtractor(libpath)
    detector = AudioRecognition(libpath)

    framerate = 16000
    model = Model("model")
    # Let's define a custom dictionary
    rec = KaldiRecognizer(
        model, framerate,
        '["oh one two three four five six seven eight nine zero", "[unk]"]')

    extactor_gain = 1.0

    # Add one or more keyword models
    keywordIdAlexa = detector.addModel('../../models/Hotword/alexa_v3.0.35.premium', 0.85)

    bufsize = detector.getInputDataSize()
    print("Audio Recognition Version: " + detector.getVersionString())

    command_started = False
    audio_stream.start()
    try:
        while (True):
            # Wakeword loop
            if (not command_started):
                frame = audio_stream.read(bufsize * 2, bufsize * 2)
                if (not frame):
                    time.sleep(0.01)
                    continue
                features = extractor.signalToMel(frame, extactor_gain)
                prediction = detector.runDetection(features)
                if (prediction != 0):
                    now = datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")
                    if (prediction == keywordIdAlexa):
                        print("Alexa detected:" + now)
                    os.system(play_command + " ../resources/ding.wav")
                    command_started = True
            # vosk loop
            else:
                frame = audio_stream.read(4000, 4000)
                if (not frame):
                    time.sleep(0.01)
                    continue
                if rec.AcceptWaveform(bytes(frame)):
                    print(rec.Result())
                    command_started = False
                    print(rec.FinalResult())
    except KeyboardInterrupt:
        print("Terminating")
        audio_stream.stop()
        sys.exit(0)