def ShiBie_ZiRanYuYan():
    """Recognize free-form speech from 'yuyin.wav' and return the cleaned text.

    Uses the Vosk model in the local 'model' directory. The returned text has
    spaces, double quotes and newlines removed.

    :return: recognized text (str), possibly empty
    """
    wenben = ""
    # Context manager so the WAV file is always closed (the original leaked it).
    with wave.open('yuyin.wav', "rb") as wf:
        model = Model("model")
        rec = KaldiRecognizer(model, wf.getframerate())
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                # Keep the text of the most recently completed utterance.
                res = json.loads(rec.Result())
                wenben = res['text']
        # No complete utterance was accepted: flush the recognizer buffer.
        if wenben == "":
            res = json.loads(rec.FinalResult())
            wenben = res['text']
    # Strip spaces, quotes and newlines in one C-level pass; the original
    # repeatedly called str.replace while iterating over the same string.
    return wenben.translate(str.maketrans('', '', ' "\n'))
def speech_to_text(args):
    """Transcribe every WAV file in the data folder with Vosk, writing one
    hypothesis .txt file per input.

    :param args: namespace with .model (model dir name), .data (folder of
                 WAV files) and .hypothesis (output folder) attributes
    """
    if not os.path.exists(os.path.join('models', args.model)):
        print(
            "Please download the model from https://alphacephei.com/vosk/models and unpack to 'models' folder.")
        exit(1)
    # Load the model once, not once per file.
    model = Model(args.model)
    for filepath in glob.iglob(os.path.join(os.getcwd(), args.data, '*.wav')):
        print(filepath)
        # BUG FIX: the original opened args.data (the folder argument)
        # instead of the current globbed file.
        with wave.open(filepath, "rb") as wf:
            if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
                print("Audio file must be WAV format mono PCM.")
                exit(1)
            rec = KaldiRecognizer(model, wf.getframerate())
            while True:
                data = wf.readframes(4000)
                if len(data) == 0:
                    break
                if rec.AcceptWaveform(data):
                    print(rec.Result())
                else:
                    print(rec.PartialResult())
        # BUG FIX: FinalResult() was called twice; the second call returned
        # an empty result, so the file written to disk was empty.
        final = rec.FinalResult()
        print(final)
        # BUG FIX: filepath is absolute, so os.path.join(args.hypothesis, filepath)
        # discarded the hypothesis folder entirely; use the base name only.
        base = os.path.splitext(os.path.basename(filepath))[0]
        hypothesis_path = os.path.join(args.hypothesis, base + '.txt')
        with open(hypothesis_path, 'w') as hypothesis:
            hypothesis.write(final)
def ShiBie_ZiFu():
    """Recognize speech from 'yuyin.wav' against a restricted grammar.

    The grammar (ZiFuJi) depends on the module-level PanDuan flag: the full
    command vocabulary when PanDuan is empty, a short yes/no set otherwise.

    :return: recognized text (str)
    """
    if PanDuan == "":
        ZiFuJi = "继 续 检 搜 索 全 部 无 损 听 歌 播 放 音 乐 停 止 诗 词 单 曲 专 辑 循 环 顺 序 随 相 声 评 书 讲 坛 朗 读 关 机 复 制 上 下 一 个 从 头 添 加 收 藏 中 文 日 语 英 更 新 升 级 清 空 谁 多 少 什 么 唱 名 叫 他 的"
    else:
        ZiFuJi = "对 是 嗯 没 错"
    wenben = ""
    model = Model("model")
    rec = KaldiRecognizer(model, 16000, ZiFuJi)
    # Context manager so the file is always closed (the original leaked it).
    with open("yuyin.wav", "rb") as WaveWenJian:
        # Skip the canonical 44-byte WAV header — assumes a canonical header;
        # TODO confirm the recorder always writes one.
        WaveWenJian.read(44)
        while True:
            data = WaveWenJian.read(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                res = json.loads(rec.Result())
                wenben = res['text']
                print("识别结果是: " + res['text'])
    # Flush the recognizer; fall back to the final result if nothing was
    # accepted during streaming.
    res = json.loads(rec.FinalResult())
    if wenben == "":
        wenben = res['text']
    print("最终结果是: " + wenben)
    return wenben
def recognizer_process(queue_audio, queue_text):
    """Worker loop: drain audio chunks from queue_audio, feed them to Vosk,
    and put raw JSON result strings (suitable for json.loads()) on
    queue_text. Exits after 60 seconds without any incoming audio.
    """
    print('Worker started')
    # Recognizer bound to the module-level `model`; audio is expected at 8 kHz.
    rec = KaldiRecognizer(model, 8000)
    last_received = datetime.datetime.now()  # time of the last audio chunk seen
    partial = True  # True until at least one full utterance has been emitted
    while True:
        # Drain everything currently queued into one buffer before feeding
        # the recognizer (the buffer may be empty if no audio arrived).
        queue_bytes = b''
        while not queue_audio.empty():
            last_received = datetime.datetime.now()
            queue_bytes += queue_audio.get()
        if rec.AcceptWaveform(queue_bytes):
            res = rec.Result()
            partial = False
            queue_text.put(res)
        # Stop the worker after 60 s of silence on the input queue.
        if datetime.datetime.now() - datetime.timedelta(
                seconds=60) > last_received:
            if partial:
                # No complete utterance was ever accepted: flush what we have.
                queue_text.put(rec.FinalResult())
            print(f'Worker stopped ')
            time.sleep(1)
            return
        time.sleep(1)
def listen(model: Model,
           spk_model: SpkModel = None,
           speech_chunk_sec: float = 0.5,
           buffer_sec: float = 1):
    """Continuously read microphone audio and log Vosk recognition results.

    Logs a full result each time an utterance completes and a partial
    hypothesis otherwise. Runs until interrupted.

    :param model: loaded Vosk model
    :param spk_model: optional speaker model passed to the recognizer
    :param speech_chunk_sec: seconds of audio read per iteration
    :param buffer_sec: seconds of audio buffered by the input stream
    """
    with ExitStack() as cleanup:
        sample_rate = model.SampleFrequency()
        # Include the speaker model in the recognizer only when supplied.
        recognizer = (KaldiRecognizer(model, spk_model, sample_rate)
                      if spk_model else KaldiRecognizer(model, sample_rate))
        audio = cleanup.enter_context(_pyaudio())
        stream = cleanup.enter_context(
            _pyaudio_open_stream(audio,
                                 format=paInt16,
                                 channels=1,
                                 rate=sample_rate,
                                 input=True,
                                 frames_per_buffer=int(sample_rate * buffer_sec)))
        while True:
            chunk = stream.read(int(sample_rate * speech_chunk_sec))
            raw = (recognizer.Result() if recognizer.AcceptWaveform(chunk)
                   else recognizer.PartialResult())
            logging.info(json.loads(raw))
def _command_check(self):
    # Enter command mode: stream microphone audio into Vosk until the
    # interrupt predicate fires, dispatching each recognized command line.
    print('+ Switch to command mode')
    self._cmd_start_t = time.time()  # timestamp when command mode began
    speech_recognizer = KaldiRecognizer(self._vosk_model, 16000)
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=16000,
                    input=True,
                    frames_per_buffer=8000)
    stream.start_stream()
    self._cmd_start_f()  # user hook: command mode started
    while not self._command_interrupt_check():
        data = stream.read(4000)
        if len(data) == 0:
            break
        if speech_recognizer.AcceptWaveform(data):
            jdata = json.loads(speech_recognizer.Result())
            # NOTE(review): len(cmd) would raise if "text" were missing
            # (cmd=None); Vosk results appear to always include "text" — confirm.
            cmd = jdata.get("text")
            print("CMD:", cmd, end=f"\n-----{'-'*len(cmd)}\n")
            if cmd:
                self._handle_command(cmd)
    stream.stop_stream()
    stream.close()
    self._cmd_stop_f()  # user hook: command mode finished
def audio_speech_recognition(model_path, audio_path):
    """Recognize text from a 16 kHz mono PCM WAV file.

    :param model_path: path to the Vosk model directory
    :param audio_path: path to the WAV file to transcribe
    :return: the full recognized text (str)
    """
    result = ""
    with open(audio_path, 'rb') as wf:
        model = Model(model_path)
        rec = KaldiRecognizer(model, 16000)
        # Skip the canonical 44-byte WAV header — assumes a canonical
        # header; TODO confirm for all inputs.
        wf.read(44)
        while True:
            data = wf.read(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                res = json.loads(rec.Result())
                print(res['text'])
                result = result + res['text']
        # BUG FIX: flush the recognizer; without this the trailing
        # (unfinalized) utterance was silently dropped.
        res = json.loads(rec.FinalResult())
        result = result + res['text']
    print(result)
    return result
def audio_to_txt(file_name):
    """Convert an audio file to text: ffmpeg re-encodes it to WAV, Vosk
    transcribes it, and the temporary WAV is removed.

    :param file_name: path to the input audio file
    :return: the concatenated transcript (str)
    """
    import subprocess
    # Argument list with shell=False so odd filenames can't inject shell
    # commands (the original interpolated file_name into os.system).
    subprocess.run(['ffmpeg', '-i', file_name, 'out.wav'], check=True)
    model = Model("model")  # large-vocabulary free-form recognition
    with wave.open('out.wav', "rb") as wf:
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print("Audio file must be WAV format mono PCM.")
            exit(1)
        # BUG FIX: the original built a throwaway recognizer at 16000 Hz
        # before this one; create a single recognizer at the real rate.
        rec = KaldiRecognizer(model, wf.getframerate())
        transcript = ''
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                transcript += json.loads(rec.Result())['text']
        # BUG FIX: flush the recognizer so the final utterance isn't dropped.
        transcript += json.loads(rec.FinalResult())['text']
    os.remove('out.wav')  # portable replacement for os.system('rm out.wav')
    return transcript
def speech_to_text(file):
    """Transcribe a WAV file and return parallel lists of utterance start
    times and recognized texts.

    :param file: path to the WAV file
    :return: (timestamps, texts) — lists of floats and strings
    """
    SetLogLevel(0)
    timestamps = []
    texts = []
    # Open the audio file for the duration of the recognition pass.
    with wave.open(file, "rb") as wav_file:
        # Build the recognizer at the file's own sample rate.
        recognizer = KaldiRecognizer(Model("model"), wav_file.getframerate())
        # Feed 4000-frame chunks until the file is exhausted.
        while chunk := wav_file.readframes(4000):
            if recognizer.AcceptWaveform(chunk):
                payload = json.loads(recognizer.Result())
                # Record start time and text only when words were produced.
                if 'result' in payload:
                    timestamps.append(payload['result'][0]['start'])
                    texts.append(payload['text'])
    return timestamps, texts
def upload_voice_input(request):
    """Django view: save an uploaded voice file, transcribe it with Vosk and
    append the recognized phrase to the 'voicerec' config section if new.

    :param request: Django HttpRequest (expects POST with a 'myfile' upload)
    :return: HttpResponse describing the outcome
    """
    if request.method == "POST":
        myFile = request.FILES.get("myfile", None)
        if not myFile:
            print("no files for upload!")
            return HttpResponse("no files for upload!")
        # Context manager so the upload target is closed even on error
        # (the original left the handle open until GC).
        with open(os.path.join("media/voice", myFile.name), 'wb+') as destination:
            for chunk in myFile.chunks():
                destination.write(chunk)
        rec = KaldiRecognizer(vosk_model, 16000)
        # NOTE(review): reads a hard-coded file name, not the uploaded one —
        # presumably the client always uploads 'voicehome.wav'; confirm.
        with wave.open(BASE_DIR + '/media/voice/voicehome.wav', "rb") as wf:
            while True:
                data = wf.readframes(4000)
                if len(data) == 0:
                    break
                if rec.AcceptWaveform(data):
                    rec.Result()  # intermediate results discarded on purpose
        data = json.loads(rec.FinalResult())
        voicetext = data['text']
        print(voicetext)
        selectitem = dragon_cf['voicerec'][voice_section]
        item_value_array = selectitem.split(',')
        if voicetext not in item_value_array:
            newvalue = selectitem + ',' + voicetext
            dragon_cf.set('voicerec', voice_section, newvalue)
            # BUG FIX: the original leaked the config file handle via
            # dragon_cf.write(open(...)).
            with open(voice_rec_config, 'w') as cfg:
                dragon_cf.write(cfg)
            return HttpResponse("voice added added added")
    return HttpResponse("voice already exist")
def process_file(self, file_name):
    """
    Run the Vosk model on the input file
    :param file_name: Input wav or mp3 file
    :return: List of dictionaries containing: confidence, start time, end time and the predicted word
    """
    logger.info(f'Recognising speech for {file_name}')
    # Context manager guarantees the file is closed on every path — the
    # original leaked the handle, including when raising on a bad format.
    with wave.open(file_name, "rb") as wf:
        # The recognizer only handles mono 16-bit PCM WAV input.
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            raise Exception(f'Invalid file format for {file_name}')
        rec = KaldiRecognizer(self.model, wf.getframerate())
        results = []
        while True:
            data = wf.readframes(config.frame_to_read)
            # Empty read means we reached the end of the file
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                result = json.loads(rec.Result())
                # Result can contain an empty text string but no result list
                if len(result['text']) > 0:
                    # Accepted translation of a section of text
                    results.extend(result['result'])
        # Flush the final buffered utterance
        result = json.loads(rec.FinalResult())
        if len(result['text']) > 0:
            results.extend(result['result'])
    logger.info(f'Processed speech, captured {len(results)} results')
    return results
def translate_file(filename="last5.wav"):
    """Transcribe a mono PCM WAV file with the local Vosk model.

    :param filename: WAV file name in the current directory
    :return: the recognized text from the recognizer's final JSON result
    """
    SetLogLevel(0)
    if not os.path.exists("model"):
        print(
            "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder."
        )
        exit(1)
    filepath = "./" + filename
    # Context manager so the WAV file is closed even on the error path
    # (the original leaked the handle).
    with wave.open(filepath, "rb") as wf:
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print("Audio file must be WAV format mono PCM.")
            exit(1)
        model = Model("./model")
        rec = KaldiRecognizer(model, wf.getframerate())
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                print(rec.Result())
            else:
                print(rec.PartialResult())
        results = rec.FinalResult()
    # Only the final utterance's text is returned; intermediate results
    # above are printed, not collected.
    return json.loads(results)[
        "text"]  #["results"] for confidence of each word
def meu_comando():
    """Listen on the microphone and return the spoken phrase as a clean string.

    :return: recognized phrase with only word characters, space-separated
    """
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=16000,
                    input=True,
                    frames_per_buffer=8000)
    stream.start_stream()
    # Portuguese recognition model folder
    model = Model("vosk-model-small-pt-0.3")
    rec = KaldiRecognizer(model, 16000)
    print("Fale algo")
    while True:
        data = stream.read(2000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            # BUG FIX: parse the JSON result instead of splitting the raw
            # string on "text", which was fragile and kept JSON punctuation.
            comando = json.loads(rec.Result())['text']
            stream.stop_stream()
            stream.close()
            p.terminate()
            # Keep only word characters, then rebuild a clean string
            # (preserves the original's regex-based cleanup).
            return " ".join(re.findall(r'\w+', comando))
def use_offline_recognition():
    """
    Switch to offline speech recognition.
    :return: the recognized phrase (empty string on failure)
    """
    recognized_data = ""
    try:
        # Check that a model for the assistant's language exists locally
        if not os.path.exists("models/vosk-model-small-" + assistant.speech_language + "-0.4"):
            print(colored("Please download the model from:\n"
                          "https://alphacephei.com/vosk/models and unpack as 'model' in the current folder.",
                          "red"))
            exit(1)
        # Analyze the audio previously recorded from the microphone
        # (avoids asking the user to repeat the phrase)
        wave_audio_file = wave.open("microphone-results.wav", "rb")
        model = Model("models/vosk-model-small-" + assistant.speech_language + "-0.4")
        offline_recognizer = KaldiRecognizer(model, wave_audio_file.getframerate())
        # Read the whole file at once and feed it to the recognizer
        data = wave_audio_file.readframes(wave_audio_file.getnframes())
        wave_audio_file.close()  # BUG FIX: the original never closed the file
        if len(data) > 0:
            if offline_recognizer.AcceptWaveform(data):
                recognized_data = offline_recognizer.Result()
                # Extract the recognized text from the JSON string
                recognized_data = json.loads(recognized_data)
                recognized_data = recognized_data["text"]
    except Exception:
        # BUG FIX: the bare `except:` also swallowed the SystemExit raised by
        # exit(1) above, so the missing-model branch never actually exited.
        traceback.print_exc()
        print(colored("Sorry, speech service is unavailable. Try again later", "red"))
    return recognized_data
def get_transcript(wav_filename):
    """
    From a WAV filename, use vosk to generate a Transcript object

    See example code
    https://github.com/alphacep/vosk-api/blob/master/python/example/test_simple.py
    """
    transcript_text = []
    transcript = Transcript()
    # Context manager closes the file on every path — the original leaked the
    # handle, including when raising on a bad format.
    with wave.open(wav_filename, "rb") as wav_file:
        if wav_file.getnchannels() != 1 or wav_file.getsampwidth() != 2 or wav_file.getcomptype() != "NONE":
            print("Audio file must be WAV format mono PCM.")
            raise Exception("Audio file must be WAV format mono PCM.")
        model = Model(MODEL_DIR)
        rec = KaldiRecognizer(model, wav_file.getframerate())
        rec.SetWords(True)  # request per-word timing/confidence
        while True:
            data = wav_file.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                json_result = json.loads(rec.Result())
                # Collect one Item per recognized word
                if exists(json_result, 'result'):
                    for word in json_result['result']:
                        item = Item()
                        item.start_time = word['start']
                        item.end_time = word['end']
                        item.confidence = word['conf']
                        item.content = word['word']
                        transcript.items.append(item)
                if exists(json_result, 'text'):
                    transcript_text.append(json_result['text'])
    transcript.text = ' '.join(transcript_text)
    return transcript
def wav2str(filename, sample_rate=16000, foldername="voskmodel"):
    """Transcribe a mono PCM WAV file and return the recognized words.

    :param filename: path to the WAV file
    :param sample_rate: rate passed to the recognizer
        (NOTE(review): the file's own frame rate is not consulted — confirm
        inputs are actually recorded at this rate)
    :param foldername: Vosk model folder name
    :return: flat list of recognized word strings
    """
    model = Model(foldername)
    rec = KaldiRecognizer(model, sample_rate)
    # Context manager so the WAV file is always closed (original leaked it).
    with wave.open(filename, "rb") as wf:
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print("Audio file must be WAV format mono PCM.")
            exit(1)
        results = []
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                results.append(rec.Result())
        results.append(rec.FinalResult())
    # Flatten the per-utterance results into a single word list.
    # (Removed the unused `subs` list and index-based inner loop.)
    Strings = []
    for res in results:
        jres = json.loads(res)
        for entry in jres.get('result', []):
            Strings.append(entry['word'])
    return Strings
def _get_data_in_audio(self, audio_wav_path: str):
    """Transcribe a mono 16-bit PCM WAV file into a list of text segments.

    :param audio_wav_path: path to the WAV file
    :return: list of recognized text strings (also stored in self.raw_data),
             or None if the file is not mono 16-bit PCM
    """
    wf = wave.open(audio_wav_path, "rb")
    try:
        # Reject anything that is not mono 16-bit PCM.
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            # BUG FIX: the original returned here without closing the file.
            return None
        # Accumulate per-utterance recognition results.
        result = list()
        # getframerate() -> the file's own sample rate
        rec = KaldiRecognizer(self.model, wf.getframerate())
        while True:
            data = wf.readframes(1000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                # Result is a JSON string; keep only its 'text' field.
                jsonData = json.loads(rec.Result())
                result.append(jsonData['text'])
        # FinalResult may be empty: only append when it produced words.
        jsonData = json.loads(rec.FinalResult())
        if 'result' in jsonData:
            result.append(jsonData.get('text'))
    finally:
        wf.close()
    self.raw_data = result
    return result
def takeCommand():
    """Listen on the microphone and return the first recognized phrase,
    lower-cased. Returns None if the stream ends before anything is heard.
    """
    if not os.path.exists("model1"):
        print(
            "Please download the model from https://github.com/alphacep/vosk-api/blob/master/doc/models.md and unpack as 'model' in the current folder.")
        exit(1)
    model = Model("model1")
    rec = KaldiRecognizer(model, 16000)
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=16000,
                    input=True,
                    frames_per_buffer=8000)
    stream.start_stream()
    print("Start SPEAKING:-")
    subprocess.call(['/usr/bin/canberra-gtk-play', '--id', 'bell'])
    try:
        while True:
            data = stream.read(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                # BUG FIX: json.loads instead of eval() on recognizer output —
                # eval executes arbitrary code and fails on JSON null/true.
                final_string = json.loads(rec.Result())['text']
                print(final_string)
                return final_string.lower()
    finally:
        # BUG FIX: the original never released the stream or PyAudio.
        stream.stop_stream()
        stream.close()
        p.terminate()
class VoskVoiceToTextCalculator(Calculator):
    """Calculator node that converts AudioData inputs to text via Vosk.

    Output 0 carries final recognition results, output 1 partial hypotheses,
    both wrapped in VoiceTextData.
    """

    def __init__(self, name, s, options=None):
        # Imported lazily so the vosk dependency is only required when this
        # calculator is actually instantiated.
        from vosk import Model, KaldiRecognizer
        super().__init__(name, s, options)
        self.model = Model("model")  # model directory named 'model'
        self.rec = KaldiRecognizer(self.model, 16000)  # assumes 16 kHz input — TODO confirm
        self.output_data = [None, None]

    def process(self):
        # Pull the next input; only AudioData payloads are handled.
        audio = self.get(0)
        if isinstance(audio, AudioData):
            if self.rec.AcceptWaveform(audio.audio):
                result = self.rec.Result()
                try:
                    result_json = json.loads(result)
                except json.decoder.JSONDecodeError as e:
                    # Malformed recognizer output: log and drop this frame.
                    print("Voice2Text: Failed to parse voice json:", e)
                    print(result)
                else:
                    if 'text' in result_json:
                        text = result_json['text']
                        if text:
                            print("Voice2Text:", repr(text), result_json)
                            self.set_output(0, VoiceTextData(text, audio.timestamp, info=result_json))
            else:
                # Utterance not finished: publish the partial hypothesis.
                partial_result = self.rec.PartialResult()
                partial_json = json.loads(partial_result)
                if 'partial' in partial_json:
                    text = partial_json['partial']
                    if text:
                        print("Voice2Text (partial): ", repr(text))
                        self.set_output(1, VoiceTextData(text, audio.timestamp, info=partial_json))
            return True
        return False
def myChat():
    """Listen on the microphone and return the recognized phrase.

    :return: the recognized text (str)
    """
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=16000,
                    input=True,
                    frames_per_buffer=8000)
    stream.start_stream()
    model = Model("model")
    rec = KaldiRecognizer(model, 16000)
    while True:
        data = stream.read(2000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            # BUG FIX: parse the JSON result instead of splitting the raw
            # string on "text" — the split returned the phrase wrapped in
            # leftover JSON punctuation ('" : "...}').
            chatter = json.loads(rec.Result())['text']
            stream.stop_stream()
            stream.close()
            p.terminate()
            return chatter
def recognize(self):
    """Downmix self.file_folder (a WAV file) to mono and transcribe it.

    :return: recognized text; accepted segments separated by spaces,
             with the final flushed utterance appended
    """
    if not os.path.exists("Speech_Recognition/model"):
        print("Please create speech model as 'model' in the current folder.")
        exit(1)
    # Vosk only accepts single-channel PCM, so downmix first.
    sound = AudioSegment.from_wav(self.file_folder)
    sound = sound.set_channels(1)
    sound.export("path.wav", format="wav")
    # Context manager closes the file (the original leaked the handle).
    with wave.open('path.wav', "rb") as wf:
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print("Audio file must be WAV format mono PCM.")
            exit(1)
        model = Model("Speech_Recognition/model")
        rec = KaldiRecognizer(model, wf.getframerate())
        result = ''
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            # (Removed the dead `else: pass` branch.)
            if rec.AcceptWaveform(data):
                x = json.loads(rec.Result())
                result += x['text'] + ' '
        # Flush the remaining buffered audio.
        result += json.loads(rec.FinalResult())['text']
    return result
class WakeWordDetector:
    """Wake-word detector: a thin wrapper around
    `vosk-api <https://github.com/alphacep/vosk-api>`_. The default wake
    words are `'阿Q'` and `'R-Cute'`.

    To customize the wake words, see
    https://github.com/alphacep/vosk-api/blob/master/python/example/test_words.py
    """
    def __init__(
            self,
            sr=16000,
            lang='en',
            grammar='[ "a b c d e f g h i j k l m n o p q r s t u v w x y z key cute", "[unk]" ]'
    ):
        # Ensure the language model is cached, then build a grammar-restricted
        # recognizer on top of it.
        self.load(lang)
        self._det = KaldiRecognizer(util.cache[f'vosk.{lang}'], sr, grammar)

    def _detected(self, text):
        # Map raw recognizer text to canonical wake-word names; implicitly
        # returns None when no wake word matched.
        if text == 'r q':
            return '阿Q'
        elif text == 'r cute':
            return 'R-Cute'

    def load(self, lang='en'):
        """Load the language model in advance (cached in util.cache)."""
        model = util.cache.get(f'vosk.{lang}',
                               Model(util.data_file(f'vosk/{lang}')))
        util.cache[f'vosk.{lang}'] = model

    def detect(self, source, timeout=None):
        """Start detection.

        :param source: the audio source
        :param timeout: maximum detection time in seconds; `None` (default)
            means no timeout — block until a wake word is detected
        :type timeout: float, optional
        :return: the detected wake word, or `None` on timeout/cancel
        :rtype: str
        """
        self._cancel = False  # possible race condition?
        if timeout:
            count = 0.0
        self._det.FinalResult()  # clear buffer
        while True:
            segment = source.read()
            # Check both finalized and partial hypotheses so a wake word is
            # caught as early as possible.
            if self._det.AcceptWaveform(segment.raw_data):
                p = self._detected(json.loads(self._det.Result())['text'])
            else:
                p = self._detected(
                    json.loads(self._det.PartialResult())['partial'])
            if p:
                return p
            if self._cancel:
                return  # raise RuntimeError('Hotword detection cancelled by another thread')
            elif timeout:
                # Accumulate elapsed audio time and give up past the deadline.
                count += segment.duration_seconds
                if count > timeout:
                    return  # self._detected(self._det.FinalResult()['text'])

    def cancel(self):
        """Stop detection (sets a flag polled by detect())."""
        self._cancel = True
def next_sentence(self, process):
    """Yield a formatted result for each utterance decoded from the
    process's stdout stream, ending with the flushed final result.
    """
    recognizer = KaldiRecognizer(self.vosk_model, self.sample_rate)
    # Read 8000-byte chunks until the stream is exhausted.
    while chunk := process.stdout.read(8000):
        if recognizer.AcceptWaveform(chunk):
            yield self.format_result(recognizer.Result())
    # Flush whatever audio is still buffered in the recognizer.
    yield self.format_result(recognizer.FinalResult(), final=True)
def main():
    """CLI entry point: transcribe an audio file with a Vosk model.

    Flags: -f/--file_name <audio file>, -m/--model_path <model directory>.
    Prints the full transcription to stdout.
    """
    argv = sys.argv[1:]
    model_path = "./model"
    filename = ""
    try:
        opts, _ = getopt.getopt(argv, "f:m:", ["file_name =", "model_path ="])
    except getopt.GetoptError:
        # BUG FIX: catch the specific getopt error instead of a bare except
        # that also swallowed KeyboardInterrupt/SystemExit.
        print("Error with arguments")
        return
    for opt, arg in opts:
        if opt in ['-f', '--file_name']:
            filename = arg
        elif opt in ['-m', '--model_path']:
            model_path = arg
    print("FILE: ", filename, " MODEL: ", model_path)
    if not os.path.exists(model_path):
        print(
            "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder."
        )
        return
    SetLogLevel(-1)
    sample_rate = 16000
    model = Model(model_path)
    rec = KaldiRecognizer(model, sample_rate)
    # ffmpeg converts any input format to 16-bit mono PCM on stdout.
    process = subprocess.Popen([
        'ffmpeg', '-loglevel', 'quiet', '-i', filename, '-ar',
        str(sample_rate), '-ac', '1', '-f', 's16le', '-'
    ], stdout=subprocess.PIPE)
    result = ""
    while True:
        chunk = process.stdout.read(4000)
        if len(chunk) == 0:
            break
        if rec.AcceptWaveform(chunk):
            # Renamed from `data` so the raw byte buffer and the parsed
            # dict no longer share one name.
            segment = json.loads(rec.Result())
            result += segment['text']
    # Flush the recognizer for the trailing utterance.
    segment = json.loads(rec.FinalResult())
    result += segment['text']
    process.wait()  # BUG FIX: reap the ffmpeg child process
    print("\n")
    print(result)
def StreamingRecognize(self, request_iterator, context):
    """gRPC streaming recognition.

    The first message carries the config (sample rate, partial-results
    flag); every following message carries audio. Yields a response per
    accepted utterance, partial hypotheses when requested, and the final
    flushed result at end of stream.
    """
    config_msg = next(request_iterator)
    want_partial = config_msg.config.specification.partial_results
    recognizer = KaldiRecognizer(
        self.model, config_msg.config.specification.sample_rate_hertz)
    for message in request_iterator:
        accepted = recognizer.AcceptWaveform(message.audio_content)
        if accepted:
            yield self.get_response(recognizer.Result())
        elif want_partial:
            yield self.get_response(recognizer.PartialResult())
    yield self.get_response(recognizer.FinalResult())
class Recognizer:
    # Microphone speech recognizer wrapping Vosk + PyAudio.

    def __init__(self, pathToModel):
        self.answer = "None"        # last recognized text
        self.modelFlag = False      # True once the Vosk model is loaded
        self.pyAudioFlag = False    # True while the input stream is running
        self.pathToModel = pathToModel

    def setupModel(self):
        """Load the Vosk model from pathToModel and start the microphone."""
        if not os.path.exists(self.pathToModel):
            print(
                "Please download the model from https://github.com/alphacep/vosk-api/blob/master/doc/models.md and unpack as 'model' in the current folder."
            )
            exit(1)
        self.model = Model(self.pathToModel)
        self.rec = KaldiRecognizer(self.model, 16000)
        self.modelFlag = True
        self.startPyaudio()

    def startPyaudio(self):
        # Open and start a 16 kHz mono microphone input stream.
        self.p = pyaudio.PyAudio()
        self.stream = self.p.open(format=pyaudio.paInt16,
                                  channels=1,
                                  rate=16000,
                                  input=True,
                                  frames_per_buffer=8000)
        self.stream.start_stream()
        self.pyAudioFlag = True

    def stopPyaudio(self):
        # NOTE(review): the stream is stopped but never closed, and PyAudio is
        # never terminated — resources stay allocated until process exit.
        self.stream.stop_stream()
        self.pyAudioFlag = False

    def runTimedRecognition(self):
        """Listen until a phrase is recognized or ~1000 read cycles elapse.

        Returns the recognized text and also stores it in self.answer.
        """
        n = 1000  # remaining read cycles before giving up
        while True:
            data = self.stream.read(4000)
            if len(data) == 0:
                break
            if self.rec.AcceptWaveform(data):
                result = self.rec.Result()
                print(result)
                d = json.loads(str(result))
                myStr = d["text"]
                print(myStr)
                self.answer = myStr
                return myStr
            else:
                # No full utterance yet: show the partial hypothesis and
                # count down toward the timeout.
                print(self.rec.PartialResult())
                if n == 0:
                    break
                print(n)
                n -= 1
def reconize(model_path, process):
    """Yield formatted word-level recognition results decoded from a
    process's stdout, ending with the flushed final result.

    :param model_path: path to the Vosk model directory
    :param process: subprocess whose stdout streams raw PCM audio
    """
    recognizer = KaldiRecognizer(Model(model_path), sample_rate)
    recognizer.SetWords(True)  # request per-word output
    # Consume the audio stream in 8000-byte chunks.
    while buf := process.stdout.read(8000):
        if recognizer.AcceptWaveform(buf):
            yield format_result(recognizer.Result())
    yield format_result(recognizer.FinalResult())
class Decoder:
    """Decode speech from 8 kHz mono WAV files / raw socket streams with Vosk."""

    def __init__(self):
        model = Model("/home/pi/Documents/DOORS/modules/model")
        self.rec = KaldiRecognizer(model, 8000)

    def decode_file(self, aud_file):
        """Transcribe a WAV file.

        :param aud_file: path to a mono 16-bit PCM WAV file
        :return: list of raw JSON result strings, one per accepted utterance
        """
        SetLogLevel(0)
        # Context manager closes the file (the original leaked the handle).
        with wave.open(aud_file, "rb") as wf:
            if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
                print("Audio aud_file must be WAV format mono PCM.")
                exit(1)
            results = []
            while True:
                data = wf.readframes(4000)
                if len(data) == 0:
                    break
                if self.rec.AcceptWaveform(data):
                    results.append(self.rec.Result())
        for i in results:
            y = json.loads(i)
            print("---VOSK TEXT---", y["text"])
        print("results:", results)
        return results

    def decode_stream(self, socket, initData):
        """Repeatedly write incoming audio to a temp WAV file and decode it.

        BUG FIXES vs. the original (which could not run at all):
        setchannels -> setnchannels, setsampwidth called with its argument,
        close() actually invoked (was `obj.close`), `while true` -> True,
        the decode result assigned to `results`, and the counter formatted
        into the message instead of `str + int` concatenation.

        :param socket: object with a read(n) method yielding raw audio bytes
        :param initData: initial raw audio bytes to decode first
        """
        fname = 'temp.wav'
        cur = 1
        with wave.open(fname, 'wb') as obj:
            obj.setnchannels(1)  # mono
            obj.setsampwidth(2)  # 16-bit samples
            obj.setframerate(8000)
            obj.writeframesraw(initData)
        results = self.decode_file(fname)
        print(f"results {cur}: {results}")
        while True:
            with wave.open(fname, 'wb') as obj:
                obj.setnchannels(1)  # mono
                obj.setsampwidth(2)
                obj.setframerate(8000)
                obj.writeframesraw(socket.read(1024))
            results = self.decode_file(fname)
            print(f"results {cur}: {results}")
            cur += 1
class VoskSpeechDetector(SpeechDetector):
    """Detects speech segments in an audio file using a Vosk model."""

    def __init__(self, model: Model, sample_rate):
        self._model = model
        self._sample_rate = sample_rate
        self._bytes_per_iter = 4096  # chunk size read from the ffmpeg decoder
        self._recognizer: Optional[KaldiRecognizer] = None

    def detect(self, file) -> Generator[Speech, None, None]:
        """Yield a Speech object for every recognized utterance in `file`."""
        self._recognizer = KaldiRecognizer(self._model, self._sample_rate)
        for data in self._read_audio(file):
            if self._recognizer.AcceptWaveform(data):
                speech = self._parse_speech()
                if speech:
                    yield speech
        # BUG FIX: flush with FinalResult() at end of stream; calling
        # Result() here left the buffered tail undecoded, dropping the
        # last utterance.
        speech = self._parse_speech(final=True)
        if speech:
            yield speech

    def _parse_speech(self, final: bool = False) -> Optional[Speech]:
        # final=True flushes the recognizer's remaining buffered audio.
        raw = (self._recognizer.FinalResult()
               if final else self._recognizer.Result())
        result = json.loads(raw)
        if result['text'] == '':
            return None
        # Speech boundaries come from the first and last recognized word.
        first_phrase = result['result'][0]
        last_phrase = result['result'][-1]
        begin_of_speech = timedelta(seconds=first_phrase['start'])
        end_of_speech = timedelta(seconds=last_phrase['end'])
        return Speech(result['text'], begin_of_speech, end_of_speech)

    def _read_audio(self, file):
        # Decode any input format to 16-bit mono PCM at the target rate.
        command = [
            'ffmpeg', '-loglevel', 'quiet', '-i', file, '-ar',
            str(self._sample_rate), '-ac', '1', '-f', 's16le', '-'
        ]
        process = subprocess.Popen(command, stdout=subprocess.PIPE)
        while True:
            data = process.stdout.read(self._bytes_per_iter)
            if len(data) == 0:
                break
            yield data
        process.wait()  # BUG FIX: reap the ffmpeg child process
def video2data(self, url):
    """Download a video's audio with youtube-dl and transcribe it with Vosk.

    :param url: video URL
    :return: (full_text, video_description) tuple
    """
    current_dir = os.getcwd()
    os.chdir(self.path)
    try:
        ydl_opts = {
            'format': 'bestaudio/best',
            'writeinfojson': 'info',
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }],
            'progress_hooks': [self._catch_filename],
        }
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        # NOTE(review): fixed wait for post-processing to finish — fragile;
        # prefer detecting the 'finished' status in the progress hook.
        time.sleep(20)
        video_description = self._downloaded_data()
        model = Model(self.kaldi_path)
        rec = KaldiRecognizer(model, 16000)
        # ffmpeg converts the downloaded mp3 to 16 kHz mono PCM on stdout.
        process = subprocess.Popen([
            'ffmpeg', '-loglevel', 'quiet', '-i',
            os.path.join(self.path, self.filename), '-ar',
            str(16_000), '-ac', '1', '-f', 's16le', '-'
        ], stdout=subprocess.PIPE)
        full_text = ''
        while True:
            data = process.stdout.read(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                res = json.loads(rec.Result())
                full_text += ' ' + res['text']
        full_text += ' ' + json.loads(rec.FinalResult())['text']
        process.wait()  # BUG FIX: reap the ffmpeg child process
        # Clean up the downloaded artifacts.
        os.remove(os.path.join(self.path, self.description_file))
        os.remove(os.path.join(self.path, self.filename))
    finally:
        # BUG FIX: restore the working directory even when download or
        # decoding raises (the original left the process chdir'd).
        os.chdir(current_dir)
    return full_text, video_description