def record(filename, duration):
    mic = MutableMicrophone()
    recognizer = Recognizer()
    with mic as source:
        audio = recognizer.record(source, duration=duration)
    with open(filename, 'wb') as f:
        f.write(audio.get_wav_data())

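# Usage sketch for record() above. MutableMicrophone is not defined in this
# snippet (it looks like a project-specific Microphone subclass); the stock
# speech_recognition.Microphone is assumed to behave the same way here.
import speech_recognition as sr

def record_from_stock_mic(filename, duration):
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        # record() reads from any AudioSource, including a live microphone
        audio = recognizer.record(source, duration=duration)
    with open(filename, 'wb') as f:
        f.write(audio.get_wav_data())  # AudioData -> WAV bytes

record_from_stock_mic('sample.wav', duration=5)  # capture five seconds
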
def Audio_file_Read(filename):
    universal_dict = {}
    cnt = {}
    gantu = [0, 0, 0, 0]  # counts of the fillers "어" (uh), "아니" (no), "근데" (but), "이제" (now)
    analysis = {}
    token = Tokenizer()  # unused; text_to_word_sequence() does the splitting
    recog = Recognizer()
    try:
        audioFile = sr.AudioFile(filename)
        with audioFile as source:
            audio = recog.record(source)
        recognized = recog.recognize_google(audio, language="ko-KR")
        res = text_to_word_sequence(recognized)
        cnt = collections.Counter(res)
        universal_dict = dict(cnt)
        if "어" in universal_dict:
            gantu[0] = universal_dict["어"]
        if "아니" in universal_dict:
            gantu[1] = universal_dict["아니"]
        if "근데" in universal_dict:
            gantu[2] = universal_dict["근데"]
        if "이제" in universal_dict:
            gantu[3] = universal_dict["이제"]
        text = recognized
        analysis['text'] = text
        analysis['data'] = gantu
        return analysis
    except UnknownValueError:
        analysis['text'] = "당신이 말한 문장이 없습니다."  # "No spoken sentence was detected."
        analysis['data'] = [0, 0, 0, 0]
        return analysis

def GetTextInAudio(audio_msg, chat_id, bot):
    from speech_recognition import AudioFile, Recognizer, UnknownValueError
    rec = Recognizer()
    file_id = audio_msg['file_id']
    tempAudio = dirLoc + file_id + '.' + audio_msg['mime_type'].split('/')[1]
    bot.download_file(file_id, tempAudio)

    from pydub import AudioSegment
    filename = file_id + '.wav'
    file_loc = dirLoc + filename
    sound = AudioSegment.from_file(tempAudio)
    sound.export(file_loc, format='wav')

    with AudioFile(file_loc) as AudioSrc:
        content = rec.record(AudioSrc)
    try:
        text = rec.recognize_google(audio_data=content, language='ja-JP')
    except UnknownValueError:
        bot.sendMessage(chat_id, 'None')
        return  # note: tempAudio and file_loc are not removed on this path
    mainText = t_j2k(text)  # external translation helper (defined elsewhere)
    bot.sendMessage(chat_id, mainText)
    remove(tempAudio)
    remove(file_loc)

def wav_to_text(wav_file_path, language="es-ES", show_all=False):
    r = Recognizer()
    with WavFile(wav_file_path) as source:  # WavFile is the legacy alias of AudioFile
        audio = r.record(source)
    try:
        return r.recognize_google(audio_data=audio, language=language, show_all=show_all)
    except UnknownValueError:
        raise GolemException("Could not understand audio")

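# Usage sketch for wav_to_text() above ('saludo.wav' is a placeholder path).
# With show_all=True, recognize_google() returns Google's raw response (falsy
# when nothing was recognized) instead of a single transcript string:
result = wav_to_text('saludo.wav', show_all=True)
if result:
    best_guess = result['alternative'][0]['transcript']
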
def dothis(message):
    """
    From speech to text
    :param message:
    :return: text
    """
    session = message.get_session()
    ans = ''
    current_cmd = message.get_setting(session, 'active')
    if message.attachments['sound']:
        try:
            r = Recognizer()
            mode = 'google'
            lang = 'ru-RU'  # 'ru-RUS' is not a valid language tag
            ans = ''
            for attachment in message.attachments['sound']:
                ext = attachment[1]
                path = os.path.abspath(os.curdir)
                fname = time.strftime("%Y%m%d-%H%M%S") + '.'
                dir = path + '/temp/' + fname
                urllib.request.urlretrieve(attachment[0], dir + ext)  # getting file
                if ext != 'wav':
                    subprocess.run(['ffmpeg', '-i', dir + ext, dir + 'wav'])
                    os.remove(dir + ext)
                with AudioFile(dir + 'wav') as source:
                    song = r.record(source)
                os.remove(dir + 'wav')
                if "en" in message.params:
                    lang = 'en-US'  # 'en-EN' is not a valid language tag
                if 'wit' in message.params:
                    mode = 'wit'
                recg = (r.recognize_google(song, language=lang)
                        if mode == 'google' else r.recognize_wit(song, witkey))
                ans += f">>>>>>{recg}\n\n"
                yield ans
        except Exception as f:
            # "An unexpected error occurred: "
            ans += "Произошла непредвиденная ошибка: " + str(f) + "\n"
        finally:
            if current_cmd:
                message.delete_active(session)
        yield str(ans)
    elif 'Выход' in message.params and current_cmd:  # 'Выход' = "Exit"
        message.delete_active(session)
        yield {'msg': 'Успешно!', 'keyboard': [[], False]}  # "Success!"
    else:
        if current_cmd is None:
            message.add_setting(session, 'active', 'stt')
        # "Attach an audio file or type Выход (Exit)"
        yield {'msg': 'Прикрепите аудио или напишите Выход',
               'keyboard': [[[('Выход', 'negative')]], False]}

def speech_recog(file_name="input_sample/audio/introduction_ml.mp3", duration=10):
    # Convert the requested MP3, then open the .wav written next to it.
    convert_mp3_to_wav(file_name)
    wav_name = file_name.rsplit('.', 1)[0] + '.wav'
    class_audio = AudioFile(wav_name)
    print(type(class_audio))
    recognizer = Recognizer()
    with class_audio as src_audio:
        audio = recognizer.record(src_audio, duration=duration)
    print(recognizer.recognize_google(audio))

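# convert_mp3_to_wav() is not shown above. A minimal sketch of a compatible
# helper using pydub (assumes ffmpeg is on PATH), writing the .wav next to
# the source file with the same stem, as speech_recog() expects:
from pydub import AudioSegment

def convert_mp3_to_wav(mp3_path):
    wav_path = mp3_path.rsplit('.', 1)[0] + '.wav'
    AudioSegment.from_mp3(mp3_path).export(wav_path, format='wav')
    return wav_path
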
def gat_with_breakdown(path_to_file, recognizer: sr.Recognizer,
                       breakdown_duration=10, tempfile_name='temp.wav'):
    """
    gat = get audio transcript

    This breakdown is effective for writing down SRT(s).
    """
    for start, end, total in chop_chop(path_to_file, breakdown_duration,
                                       output_file=tempfile_name):
        last_yield = start
        with sr.AudioFile(tempfile_name) as source:
            try:
                if (content := recognizer.recognize_google(recognizer.record(source))):
                    yield {'start': format_to_ffmpeg_duration(start),
                           'end': format_to_ffmpeg_duration(end),
                           'text': content,
                           'total': total,
                           'delta': end - last_yield}
                    last_yield = end
            except sr.UnknownValueError:
                pass

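# chop_chop() and format_to_ffmpeg_duration() are not shown. Minimal sketches
# of compatible implementations, assuming chop_chop yields
# (start_seconds, end_seconds, total_seconds) while exporting each chunk to
# output_file, and that timestamps are formatted as HH:MM:SS.mmm:
from pydub import AudioSegment

def chop_chop(path_to_file, chunk_duration, output_file):
    sound = AudioSegment.from_file(path_to_file)
    total = len(sound) / 1000  # pydub works in milliseconds
    step = int(chunk_duration * 1000)
    for start_ms in range(0, len(sound), step):
        chunk = sound[start_ms:start_ms + step]
        chunk.export(output_file, format='wav')  # overwritten every iteration
        yield start_ms / 1000, min((start_ms + step) / 1000, total), total

def format_to_ffmpeg_duration(seconds):
    hours, rest = divmod(seconds, 3600)
    minutes, secs = divmod(rest, 60)
    return f"{int(hours):02d}:{int(minutes):02d}:{secs:06.3f}"
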
async def process_audio(chat_id, msg):
    await bot.download_file(msg['voice']['file_id'], "./dest.ogg")
    filename = "dest.ogg"
    dest = "dest.flac"
    r = Recognizer()
    sound = AudioSegment.from_ogg(filename)
    os.unlink(filename)
    sound.export(dest, format="flac")
    with AudioFile(dest) as source:
        # listen for the data (load audio to memory)
        audio_data = r.record(source)
    # recognize (convert from speech to text)
    try:
        text = r.recognize_google(audio_data)
        print(f"VOICE LOG - {msg['from']['first_name']}: {text}")
        await process_result(chat_id, text)
    except UnknownValueError:
        await bot.sendMessage(chat_id, 'This audio is too short or corrupted, retry!')
    try:
        os.unlink(dest)
    except PermissionError:
        pass

def SpeechToText():
    try:
        rec = Recognizer()
        with AudioFile(WAVE_OUTPUT_FILENAME) as AudioSrc:
            content = rec.record(AudioSrc)
        text = rec.recognize_google(audio_data=content, language='ko-KR')
        mainText = str(text)
        print('main-> ', mainText)
        return mainText
        # for sW in START_WORDS:
        #     if mainText.startswith(sW):
        #         return mainText.split(sW)[1]
    except UnknownValueError:
        print("SPEECH ERROR!")
        return False

async def speech_to_text(event):
    """
    Note: telethon may borrow a different DC id to download audio
    """
    if event.reply_to_msg_id:
        msg = await event.get_reply_message()
    else:
        await event.edit(msgRep.REPLY_TO_VM)
        return
    filename, file_format = (None,) * 2
    voice_note = False
    if msg.media and hasattr(msg.media, "document") and \
            isinstance(msg.media.document, Document) and \
            msg.media.document.mime_type.startswith("audio"):
        for attribute in msg.media.document.attributes:
            if isinstance(attribute, DocumentAttributeAudio):
                if not voice_note:  # set only if not True already
                    voice_note = attribute.voice
            if isinstance(attribute, DocumentAttributeFilename):
                if not file_format:  # set only if None
                    string = attribute.file_name.split(".")
                    file_format = string[-1]
        if not voice_note:
            await event.edit(msgRep.WORKS_WITH_VM_ONLY)
            return
        if not file_format:  # alternative way
            file_format = msg.media.document.mime_type.split("/")[1]
        filename = join(TEMP_DL_DIR, "audio." + file_format)
        await event.edit(msgRep.CONVERT_STT)
        try:
            await msg.download_media(file=filename)
        except Exception as e:
            log.warning(e)
            await event.edit(msgRep.FAILED_LOAD_AUDIO)
            return
    else:
        await event.edit(msgRep.REPLY_TO_VM)
        return
    try:
        audio_file = AudioSegment.from_file(filename, file_format)
        audio_wav = join(TEMP_DL_DIR, "audio.wav")
        audio_file.export(audio_wav, "wav")
        r = Recognizer()
        with AudioFile(audio_wav) as source:
            audio = r.record(source)
        result = r.recognize_google(audio)
        text = f"**{msgRep.STT}**\n\n"
        text += f"{msgRep.STT_TEXT}:\n"
        text += f"__{result}__"
        await event.edit(text)
    except UnknownValueError:
        await event.edit(msgRep.STT_NOT_RECOGNIZED)
    except RequestError:
        await event.edit(msgRep.STT_REQ_FAILED)
    except MessageTooLongError:
        await event.edit(msgRep.STT_OUTPUT_TOO_LONG)
    except Exception as e:
        log.warning(e)
        await event.edit(msgRep.UNABLE_TO_STT)
    try:
        remove(filename)
        remove(audio_wav)
    except Exception as e:
        log.warning(f"Unable to delete audio(s): {e}")
    return

def gettingWordsFromAudio():
    print(version)
    r = Recognizer()

    print("captures any speech")
    harvard = AudioFile('harvard.wav')
    with harvard as source:
        audio = r.record(source)
    print(type(audio))
    print(r.recognize_google(audio))
    print("")
    print("")

    print("captures any speech in the first four seconds of the file")
    with harvard as source:
        audio = r.record(source, duration=4)
    print(r.recognize_google(audio))
    print("")
    print("")

    print("The record() method, when used inside a with block, always moves ahead in the file stream.")
    with harvard as source:
        audio1 = r.record(source, duration=4)
        audio2 = r.record(source, duration=4)
    print(r.recognize_google(audio1))
    print(r.recognize_google(audio2))
    print("")
    print("")

    print("To capture only the second phrase in the file, you could start with an offset of four seconds and record for, say, three seconds.")
    with harvard as source:
        audio = r.record(source, offset=4, duration=3)
    print(r.recognize_google(audio))
    print("")
    print("")

    print("****************")
    print("noisy audio")
    jackhammer = AudioFile('jackhammer.wav')
    with jackhammer as source:
        audio = r.record(source)
    print(r.recognize_google(audio))
    print("")
    print("")

    print("The adjust_for_ambient_noise() method reads the first second of the file stream and calibrates the recognizer to the noise level of the audio.")
    with jackhammer as source:
        r.adjust_for_ambient_noise(source, duration=1)
        audio = r.record(source)
    print(r.recognize_google(audio))
    print("")
    print("")

    print("Prints all json alternatives")
    print(r.recognize_google(audio, show_all=True))

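# adjust_for_ambient_noise() works by raising recognizer.energy_threshold to
# sit above the sampled noise floor; the threshold can also be set by hand
# (the value below is illustrative, not a recommendation):
r = Recognizer()
r.energy_threshold = 4000  # higher values make the recognizer less sensitive
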
def get_audio_transcript(path_to_file, recognizer: sr.Recognizer):
    with sr.AudioFile(path_to_file) as source:
        return recognizer.recognize_google(recognizer.record(source))

import os
import subprocess

from speech_recognition import (
    AudioFile,
    Recognizer,
)

unique_id = 'file_name'
ogg_file_path = f'{unique_id}.ogg'
wav_file_path = f'{unique_id}.wav'

process = subprocess.run(['ffmpeg', '-i', ogg_file_path, wav_file_path])

recognizer = Recognizer()
with AudioFile(wav_file_path) as audio_file:
    audio_content = recognizer.record(audio_file)

print(recognizer.recognize_google(audio_content, language="ru-RU"))
os.remove(wav_file_path)

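# subprocess.run() above does not raise if ffmpeg fails; recognition would
# then die on a missing .wav. A stricter variant of the same call:
subprocess.run(['ffmpeg', '-i', ogg_file_path, wav_file_path],
               check=True)  # raises CalledProcessError on a non-zero exit
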
def process_voice(update: Update, context: CallbackContext):
    voice = update.message.voice
    chat_id = update.message.chat_id
    username = update.message.from_user.username
    user, _ = TelegramUser.objects.get_or_create(
        chat_id=chat_id,
        defaults={
            'username': username,
        }
    )
    if user.is_active != TelegramUser.ACTIVE:
        return update.message.reply_text(
            text="You are not registered!\n\nContact the administrator."
        )
    file_ = voice.get_file()
    unique_id = file_.file_unique_id
    ogg_file_path = f'{settings.VOICE_TEMP_FOLDER}/{unique_id}.ogg'
    wav_file_path = f'{settings.VOICE_TEMP_FOLDER}/{unique_id}.wav'
    file_.download(ogg_file_path)
    subprocess.run(['ffmpeg', '-i', ogg_file_path, wav_file_path])

    recognizer = Recognizer()
    with AudioFile(wav_file_path) as audio_file:
        audio_content = recognizer.record(audio_file)
    os.remove(ogg_file_path)
    os.remove(wav_file_path)

    try:
        result = recognizer.recognize_google(audio_content,
                                             language=settings.LANGUAGE_OF_RECOGNION)
    except UnknownValueError:
        return update.message.reply_text(text=REPEAT_MESSAGE)

    # Pick the command whose key is most similar to the recognized text.
    optimal_id, total_score = max(
        {command.id: calculate_score(result, command.key)
         for command in CommandModel.objects.all()}.items(),
        key=operator.itemgetter(1))
    if total_score < 0.8:
        return update.message.reply_text(text=REPEAT_MESSAGE)
    command = CommandModel.objects.get(id=optimal_id)
    # broadlink.setup(
    #     os.environ.get('BROADLINK_WIFI_SSID'),
    #     os.environ.get('BROADLINK_WIFI_PASSWORD'),
    #     int(os.environ.get('BROADLINK_WIFI_SECURITY_MODE')),
    # )
    # device = broadlink.discover(
    #     timeout=5,
    #     discover_ip_address=os.environ.get('BROADLINK_WIFI_DISCOVER_IP_ADDRESS')
    # )[0]
    # device.auth()
    # device.send_data(
    #     binascii.unhexlify(command.signal_to_broadlink.encode())
    # )
    update.message.reply_text(text=command.responce_message)

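# calculate_score() is not shown above. A minimal sketch, assuming it returns
# a similarity in [0, 1] between the recognized text and a command key, here
# built on difflib from the standard library:
from difflib import SequenceMatcher

def calculate_score(recognized_text, command_key):
    return SequenceMatcher(None, recognized_text.lower(),
                           command_key.lower()).ratio()
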
try:
    if x['params']['type'] == 'Media':
        lastaudio = x['params']['request']['url']
except:
    pass

with open('1.mpeg', 'wb') as f:
    f.write(get(lastaudio).content)

sound = AudioSegment.from_mp3("1.mpeg")
sound.export("transcript.wav", format="wav")

r = Recognizer()
with AudioFile('transcript.wav') as source:
    audio = r.record(source)
transcript = r.recognize_google(audio)
print("Transcription: " + transcript)

driver.switch_to.default_content()
iframe = driver.find_elements_by_tag_name('iframe')[-1]
driver.switch_to.frame(iframe)
driver.find_elements_by_tag_name('input')[1].send_keys(transcript + Keys.ENTER)
sleep(5)
driver.switch_to.default_content()
driver.find_element_by_xpath('/html/body/div[1]/form/fieldset/ul/li[6]/input').click()

def googleRecognition(audioname):
    harvard_audio = sr.AudioFile(audioname)
    r = Recognizer()
    with harvard_audio as source:
        audio = r.record(source)
    return r.recognize_google(audio)  # return the transcription of the captured audio

def recognize_wav(filename, language="en-US", show_all=True):
    # SpeechRecognition 3.x moved the language parameter from the Recognizer
    # constructor to the recognize_*() calls, and renamed WavFile/recognize()
    # to AudioFile/recognize_google().
    recognizer = Recognizer()
    with AudioFile(filename) as source:
        audio_data = recognizer.record(source)
    return recognizer.recognize_google(audio_data, language=language, show_all=show_all)

class GoogleSTTEndpoint(PublicEndpoint):
    """Endpoint to send a flac audio file with voice and get back an utterance"""

    def __init__(self):
        super(GoogleSTTEndpoint, self).__init__()
        self.google_stt_key = self.config['GOOGLE_STT_KEY']
        self.recognizer = Recognizer()
        self.account = None
        self.account_shares_data = False

    def post(self):
        self._authenticate()
        self._get_account()
        self._check_for_open_dataset_agreement()
        self._write_flac_audio_file()
        stt_response = self._call_google_stt()
        response = self._build_response(stt_response)
        self._write_stt_result_file(response)
        return response, HTTPStatus.OK

    def _get_account(self):
        if self.device_id is not None:
            account_repo = AccountRepository(self.db)
            self.account = account_repo.get_account_by_device_id(self.device_id)

    def _check_for_open_dataset_agreement(self):
        for agreement in self.account.agreements:
            if agreement.type == OPEN_DATASET:
                self.account_shares_data = True

    def _write_flac_audio_file(self):
        """Save the audio file for STT tagging"""
        self._write_open_dataset_file(self.request.data, file_type='flac')

    def _write_stt_result_file(self, stt_result):
        """Save the STT results for tagging."""
        file_contents = '\n'.join(stt_result)
        self._write_open_dataset_file(file_contents.encode(), file_type='stt')

    def _write_open_dataset_file(self, content, file_type):
        if self.account is not None:
            file_name = '{account_id}_{time}.{file_type}'.format(
                account_id=self.account.id, file_type=file_type, time=time())
            file_path = os.path.join(SELENE_DATA_DIR, file_name)
            with open(file_path, 'wb') as flac_file:
                flac_file.write(content)

    def _call_google_stt(self):
        """Use the audio data from the request to call the Google STT API

        We need to replicate the first 16 bytes of the audio due to a bug in
        the Google speech recognition library that removes the first 16 bytes
        from the flac file we are sending.
        """
        lang = self.request.args['lang']
        audio = self.request.data
        with AudioFile(BytesIO(audio[:16] + audio)) as source:
            recording = self.recognizer.record(source)
        response = self.recognizer.recognize_google(
            recording, key=self.google_stt_key, language=lang, show_all=True)
        return response

    def _build_response(self, stt_response):
        """Build the response to return to the device.

        Return the n transcripts with the highest confidence. This is useful
        when an ambiguous voice file is sent and the correct utterance is not
        the one the API scored highest.
        """
        limit = int(self.request.args['limit'])
        if isinstance(stt_response, dict):
            alternative = stt_response.get("alternative")
            if alternative and 'confidence' in alternative[0]:
                # sort by confidence; alternatives without a score sort last
                alternative = sorted(alternative,
                                     key=lambda alt: alt.get('confidence', 0),
                                     reverse=True)
                alternative = [alt['transcript'] for alt in alternative]
                # the client is interested in testing the utterances found
                if len(alternative) <= limit:
                    response = alternative
                else:
                    response = alternative[:limit]
            else:
                response = [alternative[0]['transcript']]
        else:
            response = []

        return response

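# Illustrative walk-through of _build_response() with a made-up payload and
# limit=1 (transcripts and score are examples only). In Google's responses
# typically only the first alternative carries a confidence score, which is
# why the sort key falls back to alt.get('confidence', 0):
stt_response = {
    'alternative': [
        {'transcript': 'turn on the light', 'confidence': 0.93},
        {'transcript': 'turn on the flight'},
    ],
    'final': True,
}
# _build_response() would return ['turn on the light'] here.
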
def solve(self):
    try:
        sleep(3)
        # Scroll down the page
        pyautogui.scroll(-1000)
        # Locate and click the captcha button on the page
        cap = pyautogui.locateCenterOnScreen("files/captcha.png")
        pyautogui.click(cap)
        try:
            sleep(3)
            # Locate and click the headphones button on the page
            voi = pyautogui.locateCenterOnScreen("files/voice.png")
            pyautogui.click(voi)
        except:
            try:
                # Move the mouse away from the voice button
                pyautogui.moveTo(200, 200)
                # Locate the voice2 button that is slightly gray
                pyautogui.locateCenterOnScreen("files/voice2.png")
                return self.error
            except:
                return self.done
    except:
        try:
            sleep(2)
            # Move the mouse away from the voice button
            pyautogui.moveTo(200, 200)
            # Locate the voice2 button that is slightly gray
            pyautogui.locateCenterOnScreen("files/voice2.png")
            self.error[1] = "Captcha"
            return self.error
        except:
            return self.done
    sleep(2)
    try:
        # Locate and right-click the download button
        down = pyautogui.locateCenterOnScreen("files/down.png")
        pyautogui.rightClick(down)
        sleep(1)
        # Press the down arrow five times
        pyautogui.press(['down'] * 5)
        sleep(1)
        # Press Enter (the cursor will be on "Copy Link Address")
        pyautogui.press('enter')
        # Download and save the audio file from the link copied above
        with open('files/audio.mp3', 'wb') as file:
            r = ge(paste())
            file.write(r.content)
        sleep(2)
        if path.exists('files/audio.mp3'):
            # Convert the mp3 file to wav using ffmpeg
            call(['files/ffmpeg.exe', '-i', 'files/audio.mp3',
                  '-y', 'files/audio.wav'])
            sleep(2)
            AUDIO_FILE = 'files/audio.wav'
            # Send that audio file to Google and recognize the speech
            r = Recognizer()
            with AudioFile(AUDIO_FILE) as source:
                audio = r.record(source)
            try:
                # Get the recognized text
                capSolved = r.recognize_google(audio)
                # Go to the text field and type it there
                pyautogui.hotkey('shift', 'tab')
                pyautogui.typewrite(capSolved)
                pyautogui.press('enter')
                sleep(5)
                return self.done
            except UnknownValueError:
                return self.error
            except RequestError:
                return self.error
    except:
        self.error[1] = "Captcha"
        return self.error

driver.find_element_by_xpath("/html/body/div/div/div[3]/div/button").click()
try:
    src = driver.find_element_by_id("audio-source").get_attribute("src")
    print(src)
    urllib.request.urlretrieve(src, path + "\\audio.mp3")
    sound = pydub.AudioSegment.from_mp3(
        path + "\\audio.mp3").export(path + "\\audio.wav", format="wav")
    recognizer = Recognizer()
    recaptcha_audio = AudioFile(path + "\\audio.wav")
    with recaptcha_audio as source:
        audio = recognizer.record(source)
    text = recognizer.recognize_google(audio, language="de-DE")
    print(text)
    inputfield = driver.find_element_by_id("audio-response")
    inputfield.send_keys(text.lower())
    inputfield.send_keys(Keys.ENTER)
    sleep(10)
    print("Success")
    driver.quit()
except NameError:
    print("Failed")