def synthesize(self, text: str, solution=None) -> sr.AudioData:
    # Fall back to the default solution
    if solution is None:
        solution = self._solution
    if solution["name"] == SOLUTION_RHVOICE["name"]:
        voice = solution["voice"]
        synthesizer = RHVoice(default_voice=voice)
        return sr.AudioData(synthesizer.get_audio(text), sample_rate=24000, sample_width=2)
    elif solution["name"] == SOLUTION_GOOGLE["name"]:
        with io.BytesIO() as audiodata:
            # Fetch the MP3 audio from Google Translate
            gTTS(text, lang="ru").write_to_fp(audiodata)
            # Convert it to raw format so it is compatible with the speech_recognition library
            audiodata.seek(0)
            song = AudioSegment.from_mp3(audiodata)
            with io.BytesIO() as audiodata:
                song.export(audiodata, format="raw")
                audiodata.seek(0)
                return sr.AudioData(audiodata.read(), sample_rate=song.frame_rate, sample_width=song.sample_width)
    else:
        raise Exception("There is no such solution.")
def read_file(self, fname):
    if fname == '':
        return
    self.w = sr.WavFile(fname)
    with self.w as inf:
        data = inf.stream.read()
        ad = sr.AudioData(data, inf.SAMPLE_RATE, inf.SAMPLE_WIDTH)
        # np.fromstring is deprecated and would misread 16-bit PCM as float64;
        # read the raw frames with frombuffer and an explicit dtype instead
        self.np_arr = np.frombuffer(ad.get_raw_data(), dtype=np.int16)
        np.save('C:/Users/Brett/Desktop/data.np', self.np_arr)
        # AudioData expects raw bytes, so convert the array back before recognition
        ad = sr.AudioData(self.np_arr.tobytes(), inf.SAMPLE_RATE, inf.SAMPLE_WIDTH)
        words = self.r.recognize_google(ad)
        print(words)
        return words
async def record_recog(self, ctx):
    """Speech recognize the current voice recording.
    Usage: recording recog"""
    or_check_perms(ctx, ['manage_server', 'manage_channels', 'move_members'])
    with assert_msg(ctx, '**The bot owner has not set up this feature!**'):
        check(self.opus_decoder is not None)
    with assert_msg(ctx, '**This server does not have a recording!**'):
        check(ctx.message.server.id in self.bot.pcm_data)
    status = await self.bot.say('Hmm, let me think... 🌚')
    pg_task = self.loop.create_task(
        asyncio.wait_for(self.progress(status, 'Hmm, let me think'),
                         timeout=30, loop=self.loop))
    sr_data = sr.AudioData(self.recording_data[ctx.message.server.id], 48000, 2)
    try:
        with async_timeout.timeout(16):
            final = await self.loop.run_in_executor(
                None, r.recognize_sphinx, sr_data)
    except asyncio.TimeoutError:
        pg_task.cancel()
        await self.bot.edit_message(
            status, '**It took too long to recognize your recording!**')
        return
    pg_task.cancel()
    await self.bot.edit_message(status, 'I think you said: ' + final[:2000])
def callback(self, data):
    r = sr.Recognizer()
    with sr.Microphone() as source2:
        numpydata = data.data
        audio = sr.AudioData(numpydata.tobytes(), source2.SAMPLE_RATE,
                             source2.SAMPLE_WIDTH)
        try:
            HOUNDIFY_CLIENT_ID = "INSERT HOUNDIFY CLIENT ID HERE"
            HOUNDIFY_CLIENT_KEY = "INSERT HOUNDIFY CLIENT KEY HERE"
            self.text = r.recognize_houndify(
                audio,
                client_id=HOUNDIFY_CLIENT_ID,
                client_key=HOUNDIFY_CLIENT_KEY)
        except sr.UnknownValueError:
            print("Houndify could not understand audio")
        except sr.RequestError as e:
            print("Could not request results from Houndify service; {0}".format(e))
        else:
            self.text_pub.publish(str(self.text))
def callback(self, data):
    r = sr.Recognizer()
    with sr.Microphone() as source2:
        numpydata = data.data
        audio = sr.AudioData(numpydata.tobytes(), source2.SAMPLE_RATE,
                             source2.SAMPLE_WIDTH)
        try:
            GOOGLE_CLOUD_SPEECH_CREDENTIALS = r"""INSERT THE CONTENTS OF THE GOOGLE CLOUD SPEECH JSON CREDENTIALS FILE HERE"""
            self.text = r.recognize_google_cloud(
                audio, credentials_json=GOOGLE_CLOUD_SPEECH_CREDENTIALS)
        except sr.UnknownValueError:
            print("Google Cloud Speech could not understand audio")
        except sr.RequestError as e:
            print("Could not request results from Google Cloud Speech service; {0}".format(e))
        else:
            self.text_pub.publish(str(self.text))
def recognize(self, va):
    with noalsaerr():
        p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE,
                    input=True, output=True, frames_per_buffer=CHUNK)
    try:
        data = stream.read(CHUNK)
        audio = None
        while data != b'':
            rms = audioop.rms(data, 2)
            if rms >= THRESHOLD:
                audio = data
                silence_counter = 0
                while silence_counter < SILENCE_DETECTION:
                    data = stream.read(CHUNK)
                    if LISTENING:
                        stream.write(data, CHUNK)
                    audio = audio + data
                    rms = audioop.rms(data, 2)
                    if rms < THRESHOLD:
                        silence_counter += 1
                    else:
                        silence_counter = 0
                stream.stop_stream()
                audio_data = sr.AudioData(audio, RATE, p.get_sample_size(FORMAT))
                try:
                    com = self.recognizer.recognize_google(audio_data)
                    t = Thread(target=va.command, args=(com,))
                    t.start()
                    t.join()
                except sr.UnknownValueError:
                    print('Google Speech Recognition could not understand audio')
                except sr.RequestError as e:
                    print(f'Could not request results from Google Speech Recognition service; {e}')
                stream.start_stream()
                self.reset()
            data = stream.read(CHUNK)
            if LISTENING:
                stream.write(data, CHUNK)
    except KeyboardInterrupt:
        stream.stop_stream()
        stream.close()
        p.terminate()
        raise KeyboardInterrupt
def audio_cb(self, msg):
    # if not self.is_sound_init:
    #     self.init_sound()
    if self.is_canceling:
        rospy.loginfo("Speech is cancelled")
        return
    data = SR.AudioData(msg.data, self.sample_rate, self.sample_width)
    try:
        rospy.loginfo("Waiting for result %d" % len(data.get_raw_data()))
        result = self.recognizer.asr(data.get_raw_data(), 'pcm', 16000, {
            'dev_pid': 1936,  # 1536
        })
        if result['err_no']:
            # rospy.loginfo(result["err_msg"])
            return
        rospy.loginfo(";".join(result["result"]))
        # result = self.recognizer.recognize_google(
        #     data, language=self.language)
        # msg = SpeechRecognitionCandidates(transcript=[result])
        # self.pub_speech.publish(msg)
    except SR.UnknownValueError as e:
        rospy.logerr("Failed to recognize: %s" % str(e))
    except SR.RequestError as e:
        rospy.logerr("Failed to recognize: %s" % str(e))
async def recognize(self, data):
    mono = audioop.tomono(data, self.SAMPLE_WIDTH, 1, 0)
    audio = speech_recognition.AudioData(mono, self.SAMPLE_RATE, self.SAMPLE_WIDTH)
    return await self.loop.run_in_executor(
        None, self._recognizer.recognize_google_cloud, audio, self.CREDENTIALS)
def storeWav(self, fileName=None, maxDuration=None, useCycleBufferLen=0, response=None):
    if maxDuration is not None:
        maxBufferLen = round(maxDuration / self.seconds_per_buffer)
        if maxBufferLen <= 0 or maxBufferLen >= len(self.frames):
            maxBufferLen = None
    else:
        maxBufferLen = None
    if maxDuration is None or maxBufferLen is None:
        temp = self.frames[:]
    else:
        temp = self.frames[:maxBufferLen]
    frame_data = getByteArray(temp)
    audio = sr.AudioData(frame_data, self.sample_rate, self.sample_width)
    if useCycleBufferLen > 0:
        pluginEcho.echoStoreWavCycleBuffer(audio, fileName, "./", useCycleBufferLen, response)
    else:
        pluginEcho.echoStoreWav(audio, fileName, response)
def audio(meta_data):
    audio = sr.AudioData(meta_data["wav blob"], meta_data["sampleRate"], 2)
    text = r.recognize_sphinx(audio, language=meta_data["language"])
    if text:
        text = punctate(text)
        text = text[0].upper() + text[1:]
        socketio.emit("textarea_text", text)
def record(self, stream, rate):
    threshold = 10
    short_normalize = (1.0 / 32768.0)
    chunk = 1024
    width = 2
    timeout_length = 1
    while True:
        input = stream.read(chunk)
        rms_val = self.rms(input, width, short_normalize)
        if rms_val > threshold:
            break
    print('Noise detected, recording started')
    logging.info('Noise detected, recording started')
    if self.tray_interface is not None:
        self.tray_interface.set_correct()
    rec = collections.deque()
    current = time.time()
    end = time.time() + timeout_length
    while current <= end and len(rec) <= 124:
        data = stream.read(chunk)
        if self.rms(data, width, short_normalize) >= threshold:
            end = time.time() + timeout_length
        current = time.time()
        rec.append(data)
    if self.tray_interface is not None:
        self.tray_interface.set_default()
    logging.info('Recording stopped')
    return sr.AudioData(b"".join(rec), rate, width)
def add_data(self, data):
    ## Need to turn the supplied data into a valid audio stream that can be passed
    ## through Sphinx or some other speech-to-text parser.
    # get the recognizer
    r = sr.Recognizer()
    ## The data does not need to go through the microphone, as it already came from it.
    ## Maybe convert data to a temp wav file to pass into speech rec.
    ## This should convert it to an audio object that can be passed through Sphinx.
    audio = sr.AudioData(data, self.audioSampleRate, self.audioSampleSize)
    # pass through Sphinx to get the text
    ## If Sphinx is not good enough, chop out to a different handler
    ## (maybe even the Google Speech engine API - limits apply).
    text = ""  # default so the comparison below still works if recognition fails
    try:
        text = r.recognize_sphinx(audio)
        print("Sphinx thinks you said " + text)
    except sr.UnknownValueError:
        print("Sphinx could not understand audio")
    except sr.RequestError as e:
        print("Sphinx error; {0}".format(e))
    # if not self.have_phrase:
    if text == self.phrase:
        logger.info("clap detected")
        self.have_phrase = True
        self.callback()
    ## Probably don't need this, as it is used for detecting a sharp clap sound.
    self.prev_sample = audio[-1]
def callback(self, data):
    r = sr.Recognizer()
    with sr.Microphone() as source2:
        numpydata = data.data
        audio = sr.AudioData(numpydata.tobytes(), source2.SAMPLE_RATE,
                             source2.SAMPLE_WIDTH)
        try:
            BING_KEY = "INSERT BING API KEY HERE"
            self.text = r.recognize_bing(audio, key=BING_KEY)
        except sr.UnknownValueError:
            print("Microsoft Bing Voice Recognition could not understand audio")
        except sr.RequestError as e:
            print("Could not request results from Microsoft Bing Voice Recognition service; {0}".format(e))
        else:
            self.text_pub.publish(str(self.text))
def build(self, file_name=''):
    # initialize variables
    self.chunk = 1024
    self.format = pyaudio.paInt16
    self.channels = 1
    self.rate = 44100
    if file_name:
        self.file_name = file_name + ".wav"
    else:
        self.file_name = re.sub(r'[:punc:]|\.', '',
                                str(pd.datetime.today())) + ".wav"
    self.news_article, self.news_article_label = get_url_content()
    # Deselect useless titles and stuff
    self.news_article = [x for x in self.news_article if len(x) > 80]
    self.random_article = 0
    self.build_init = True
    self.clean_text = self.news_article[self.random_article]
    # initialize classes
    self.port_audio = pyaudio.PyAudio()
    self.record_seconds = int()
    self.sound_data = []
    self.audio_data = sr.AudioData(b'', 1, 1)
    self.recog = sr.Recognizer()
def recognize(stream_text):
    global args

    def logger(s):
        f = open('radio_log.txt', 'a+', encoding='utf-8')
        f.write(datetime.datetime.now().strftime("[ %d-%b-%Y %H:%M:%S ] "))
        f.write(s)
        f.write("\x0A")
        f.close()

    # print('sync')
    audio_data = sr.AudioData(stream_text, audio_rate, 2)
    try:
        # result = recognizer.recognize_sphinx(audio_data)
        result = recognizer.recognize_google(audio_data, language=args.lang)
        print(result + " kk")
        command1 = "hello satellite"
        counter1 = result.find(command1)
        if counter1 >= 0:  # str.find() returns -1 when the phrase is absent
            print("<<<<<<Hello Satellite was found>>>>>>>")
            logger(result)
    except sr.UnknownValueError:
        pass
    except sr.RequestError as e:
        print("Could not request results from GSR service; {0}".format(e))
def analiseSpeech(audioWav):
    # Instantiate the speech recognition module
    r = sr.Recognizer()
    # Retrieve the Google credentials from S3 and put them in a string
    credentials_json = retrieveS3File(
        "apneasleepbucket", "googleCredentials/ApenaSleep-c58f74b11fb6.json")
    credentialsStr = credentials_json['Body']._raw_stream.data.decode()
    try:
        # Get the WAV audio bytes and instantiate an AudioData object
        samplerate, data = wavfile.read(io.BytesIO(audioWav.getvalue()))
        audio = sr.AudioData(audioWav.getvalue(), samplerate, data.dtype.itemsize)
        print("Converting audio to text ..... ")
        # Call the Google Cloud API to try to find speech in it
        result = r.recognize_google_cloud(audio,
                                          credentials_json=credentialsStr,
                                          language="pt-BR",
                                          show_all=True)
        hasSpoken = True
        if len(result) == 0:
            hasSpoken = False
        # Return a JSON string of the speech analysis result
        return result, hasSpoken
    except Exception as e:
        print(e)
        print("Error: ", e)
        return jsonify({'errorMessage': str(e)}), 400
def convert():
    language = request.form['language']
    hot_words = [(hot_word, ACCURACY)
                 for hot_word in json.loads(request.form['hot_words'])]
    recognizer = sr.Recognizer()
    frame_data_base64 = request.form['frame_data_base64']
    frame_data = base64.b64decode(frame_data_base64)
    sample_rate = int(request.form['sample_rate'])
    sample_width = int(request.form['sample_width'])
    logger.debug(
        f'Received request. Language: {language}. Hot words: {hot_words}.')
    audio = sr.AudioData(frame_data, sample_rate, sample_width)
    try:
        transcript = recognizer.recognize_sphinx(audio,
                                                 language=language,
                                                 keyword_entries=hot_words)
        logger.debug(f'Transcript: {transcript}')
        return Response(transcript, 200)
    except sr.UnknownValueError:
        logger.debug('Unknown value')
        return Response('', 200)
    except Exception as err:
        logger.error(f'Error while recognizing. Error: {err}')
        return Response('', 500)
def performAnalysis(self, expectedStr):
    # Format the expected result to remove words that can't be recognized correctly
    formatter = STTFormatter()
    expectedStr = self.convertSentenceToWords(expectedStr)
    expectedStr = formatter.performOperations(expectedStr)
    frame_data = self.frames.getvalue()
    # Create the audio data from the frames
    recievedData = sr.AudioData(frame_data, self.m.SAMPLE_RATE, self.m.SAMPLE_WIDTH)
    # Generate the raw data for processing
    rd = self.r.getRawData(recievedData)
    # Decode
    self.r.decodeAudio(rd)
    val = self.r.genHypothesis()
    print(val)
    self.numword = len(val.split())
    # Test the accuracy
    val = self.convertSentenceToWords(val)
    self.compareSentences(expectedStr, val)
    # Check to ensure WPM defaults to the lowest possible value
    if len(expectedStr) < len(val):
        self.numword = len(expectedStr)
    else:
        self.numword = len(val)
    wpm = self.getWordsperMin()
    # Return the accuracy and WPM
    return self.percentageAccuracy, wpm
def __hotword_detected(self, hotword_index):
    """
    Hotword detected process:
        1) record audio until timeout
        2) convert audio to AudioData instance
        3) send AudioData to STT provider
        4) finally send event to raspiot

    Args:
        hotword_index (int): indicates which voice_model was detected in case of
            multiple models (not implemented yet)
    """
    # record audio until timeout
    self.logger.debug('Recording audio for %d seconds...' % self.record_duration)
    start = time.time()
    recorded_data = bytes()
    while True:
        data = self.buffer.get()
        if len(data) > 0:
            # append buffer
            recorded_data += data
        # check timeout
        if time.time() > (start + self.record_duration):
            # stop recording
            self.logger.debug(u'Stop recording')
            break
        else:
            # pause
            time.sleep(0.03)
    self.logger.debug(u'Recording finished')

    # check recorded data
    if len(recorded_data) == 0:
        self.logger.debug('Nothing said during recording')
    elif self.logger.getEffectiveLevel() == logging.DEBUG:
        # write audio to file only during debug
        self.__write_wav(recorded_data, '/tmp/recording.wav')

    # speech to text recording
    command = None  # default so the event below can be sent even if recognition fails
    try:
        # create audiodata object to use speechrecognition library
        audio = speechrecognition.AudioData(recorded_data, self.rate, 2)
        # recognize audio
        self.logger.debug(u'Recognizing audio...')
        command = self.recognizer.recognize_bing(audio, key=self.provider_token, language='fr-FR')
        self.logger.debug(u'Done: command=%s' % command)
    except speechrecognition.UnknownValueError:
        self.logger.warning(u'STT provider doesn\'t understand command')
    except speechrecognition.RequestError as e:
        self.logger.error(u'STT provider service seems to be unreachable: %s' % str(e))

    # send event
    params = {
        u'hotword': 'hello',
        u'command': command,
    }
    self.command_event.send(params=params)
def callback(self, data):
    r = sr.Recognizer()
    with sr.Microphone() as source2:
        numpydata = data.data
        audio = sr.AudioData(numpydata.tobytes(), source2.SAMPLE_RATE,
                             source2.SAMPLE_WIDTH)
        try:
            IBM_USERNAME = "******"
            IBM_PASSWORD = "******"
            self.text = r.recognize_ibm(audio, username=IBM_USERNAME,
                                        password=IBM_PASSWORD)
        except sr.UnknownValueError:
            print("IBM Speech to Text could not understand audio")
        except sr.RequestError as e:
            print("Could not request results from IBM Speech to Text service; {0}".format(e))
        else:
            self.text_pub.publish(str(self.text))
def recognize(stream_text):
    global args
    date_of_file = datetime.datetime.now().strftime("[ %d-%b-%Y %H:%M:%S ] ")

    def logger(s):
        date = datetime.datetime.now().strftime("[ %d-%b-%Y %H:%M:%S ] ")
        f = open(date + 'radio_log.txt', 'a+', encoding='utf-8')
        # f.write(datetime.datetime.now().strftime("[ %d-%b-%Y %H:%M:%S ] "))
        f.write(date)
        f.write(s)
        f.write("\x0A")
        f.close()

    # print('sync')
    audio_data = sr.AudioData(stream_text, audio_rate, 2)
    try:
        # result = recognizer.recognize_sphinx(audio_data)
        result = recognizer.recognize_google(audio_data, language=args.lang)
        print(result)
        logger(result)
    except sr.UnknownValueError:
        pass
    except sr.RequestError as e:
        print("Could not request results from GSR service; {0}".format(e))
    # print('done')
    with open(date_of_file + "recording_of_radio_station.wav", "wb") as f:
        f.write(audio_data.get_wav_data())
def speech_recog(audio, sr):
    r = sprecog.Recognizer()
    audio = sprecog.AudioData(audio.tobytes(), sr, 2)
    try:
        text = r.recognize_sphinx(audio, language=args.sr_lan, show_all=False)
    except sprecog.UnknownValueError:
        text = ""
    return text
async def recognize(self, data):
    mono = audioop.tomono(data, self.SAMPLE_WIDTH, 1, 0)
    audio = speech_recognition.AudioData(mono, self.SAMPLE_RATE, self.SAMPLE_WIDTH)
    func = functools.partial(self._recognizer.recognize_google, audio, key=self.KEY)
    return await self.loop.run_in_executor(None, func)
def raw_speech_to_audio_data(self, raw_data):
    """
    The frame width is assumed here to be 2 bytes, based on examples found in
    the library's source code. That assumption may not hold for every input, so
    it should be verified; according to the `AudioData` source code, the only
    valid values are 1-4 bytes.
    """
    return speech.AudioData(frame_data=raw_data,
                            sample_rate=self.rate,
                            sample_width=2)
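# A minimal, hypothetical sketch (not part of the snippet above): when the raw
# stream comes from a known pyaudio capture, the frame width can be derived from
# the capture format instead of being hard-coded to 2 bytes.
import pyaudio

pa = pyaudio.PyAudio()
sample_width = pa.get_sample_size(pyaudio.paInt16)  # 2 bytes per sample for 16-bit PCM
pa.terminate()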
def callback(self, data):
    r = sr.Recognizer()
    with sr.Microphone() as source2:
        numpydata = data.data
        audio = sr.AudioData(numpydata.tobytes(), source2.SAMPLE_RATE,
                             source2.SAMPLE_WIDTH)
        data, samplerate = sf.read('myfile.raw', channels=1, samplerate=44100,
                                   subtype='FLOAT')
def speech_recognition(samples, rate=16000):
    is_it_speech = 0
    recognizer = sr.Recognizer()
    harvard = sr.AudioData(samples.tobytes(), sample_rate=rate,
                           sample_width=samples.dtype.itemsize)
    text_output = recognizer.recognize_google(harvard, show_all=True)
    if len(text_output) != 0:
        is_it_speech = 1
    return is_it_speech
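# A minimal, hypothetical usage sketch for the function above (not part of the
# original snippet): it assumes a mono 16 kHz WAV file at a placeholder path and
# network access for the Google recognizer.
import soundfile as sf

samples, rate = sf.read("utterance.wav", dtype="int16")  # int16 -> dtype.itemsize == 2
print(speech_recognition(samples, rate=rate))  # 1 if any hypothesis was returned, else 0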
def test_live(message):
    speech = sr.AudioData(
        base64.b64decode(message['data']),
        message['sample_rate'],
        message['sample_width'],
    )
    value = r.recognize_google(speech, language='ms-MY')
    emit('speech_update', {'text': value}, broadcast=True)
def recognize(self, frames, begin, end):
    result = ''
    frame_data = b''.join(frames[begin:end])
    audio_data = sr.AudioData(frame_data, self.rate, self.width)
    try:
        result = self._recognize(audio_data, language=self.language)
    except sr.UnknownValueError:
        result = '###'
    return self.position2time(begin), self.position2time(end), result
def rcgn_google(frame_data, sample_rate, sample_size_bytes):
    text = ''
    r = sr.Recognizer()
    audio = sr.AudioData(frame_data, sample_rate, sample_size_bytes)
    try:
        text = r.recognize_google(audio, language='ru-RU', show_all=False)
    except Exception as e:
        pass
        # print("Exception: " + str(e))
    return text
def get_msg(self, data, rate, sample_width, number_channels):
    audio_data = sr.AudioData(data, rate, sample_width)
    msg = self.recognizer.recognize_google(audio_data, key=self._api_key,
                                           show_all=True)
    if not msg:
        msg = self.recognizer.recognize_sphinx(audio_data)
    return msg