def label_stream(libpath):
    """Read audio from an AudiostreamSource and pass its features on.

    Each chunk read from the stream is converted with
    FeatureExtractor.signalToMel and handed to send_features.

    Args:
        libpath: Library path given to FeatureExtractor.

    The loop runs until Ctrl+C; the handler then prints a message, stops the
    stream and exits the process with status 0.
    """
    source = AudiostreamSource()
    mel_extractor = FeatureExtractor(libpath)
    mel_gain = 16.0

    # FIXME: This recognizer is only used to query the input buffer size.
    # NOTE(review): it is built from default_libpath rather than libpath --
    # confirm that this is intended.
    size_probe = AudioRecognition(default_libpath)
    read_size = size_probe.getInputDataSize() * 2

    source.start()
    try:
        while True:
            chunk = source.read(read_size, read_size)
            if chunk:
                send_features(mel_extractor.signalToMel(chunk, mel_gain))
            else:
                # Nothing was returned; wait briefly before polling again.
                time.sleep(0.01)
    except KeyboardInterrupt:
        print("Terminating")
        source.stop()
        sys.exit(0)
def label_stream(labels, libpath, graph, sensitivity):
    """Run detection on streamed audio and announce every prediction.

    Args:
        labels: Labels argument forwarded to AudioRecognition.
        libpath: Library path forwarded to AudioRecognition.
        graph: Graph argument forwarded to AudioRecognition.
        sensitivity: Value passed to SetSensitivity.

    For every truthy prediction the label and a timestamp are printed and
    ./ding.wav is played through a shell command. Ctrl+C stops the stream
    and exits the process with status 0.
    """
    source = AudiostreamSource()

    recognizer = AudioRecognition(libpath, graph, labels)
    recognizer.SetSensitivity(sensitivity)
    recognizer.SetGain(1)
    recognizer.RemoveDC(False)
    read_size = recognizer.GetInputDataSize()

    # Pick the playback command by platform.
    if platform.system() == "Darwin":
        player = "play -q"
    else:
        player = "aplay"

    print("Audio Recognition Version: " + recognizer.GetVersionString())

    source.start()
    try:
        while True:
            chunk = source.read(read_size, read_size)
            if not chunk:
                # Nothing was returned; wait briefly before polling again.
                time.sleep(0.01)
                continue

            hit = recognizer.RunDetection(chunk)
            if not hit:
                continue

            stamp = datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")
            print(recognizer.GetPredictionLabel(hit) + " " + stamp)
            os.system(player + " ./ding.wav")
    except KeyboardInterrupt:
        print("Terminating")
        source.stop()
        sys.exit(0)
def __init__(self, libpath, timeout=40):
    """Set up the initial state and create the AudioRecognition instance.

    Args:
        libpath: Library path stored on the instance and passed to
            AudioRecognition.
        timeout: Value stored as self.timeout (default 40).
    """
    # Configuration.
    self.libpath = libpath
    self.timeout = timeout
    self.max_last_frames = 5

    # Counters.
    self.current_index = 0
    self.number_detectors = 0
    self.countdown = 0

    # Containers, all empty to begin with.
    self.commands = []
    self.history = []
    self.last_frames = []
    self.keyword_map = {}

    # The attribute exists (as None) before the recognizer is constructed.
    self.detector = None
    self.detector = AudioRecognition(self.libpath)
def detectKeywords(libpath):
    """Print "Speech detected" whenever the continuous VAD model fires.

    Args:
        libpath: Library path passed to FeatureExtractor and
            AudioRecognition.

    After each detection pass, element 1 of the continuous model's result
    is compared against a threshold of 0.2. Ctrl+C stops the stream and
    exits the process with status 0.
    """
    source = AudiostreamSource()
    mel_extractor = FeatureExtractor(libpath)
    recognizer = AudioRecognition(libpath)

    mel_gain = 1.0
    speech_threshold = 0.2

    vad_model = recognizer.addContinousModel(
        '../../models/Hotword/vad_16.premium')
    read_size = recognizer.getInputDataSize() * 2

    print("Audio Recognition Version: " + recognizer.getVersionString())

    source.start()
    try:
        while True:
            chunk = source.read(read_size, read_size)
            if not chunk:
                # Nothing was returned; wait briefly before polling again.
                time.sleep(0.01)
                continue

            mel = mel_extractor.signalToMel(chunk, mel_gain)
            # The return value is ignored; only the continuous result
            # queried below is used.
            recognizer.runDetection(mel)

            vad_result = recognizer.getContinousResult(vad_model)
            if vad_result[1] > speech_threshold:
                print("Speech detected")
    except KeyboardInterrupt:
        print("Terminating")
        source.stop()
        sys.exit(0)
def label_stream(labels, libpath, graph, sensitivity):
    """Run detection on extracted features and announce every prediction.

    Args:
        labels: Labels argument forwarded to AudioRecognition.
        libpath: Library path for FeatureExtractor and AudioRecognition.
        graph: Graph argument forwarded to AudioRecognition.
        sensitivity: Value passed to SetSensitivity.

    For every truthy prediction the label and a timestamp are printed and
    ./resources/ding.wav is played with the module-level play_command.
    Ctrl+C stops the stream and exits the process with status 0.
    """
    source = AudiostreamSource()
    mel_extractor = FeatureExtractor(libpath)
    mel_gain = 1.0

    recognizer = AudioRecognition(libpath, graph, labels)
    recognizer.SetSensitivity(sensitivity)
    read_size = recognizer.GetInputDataSize() * 2

    print("Audio Recognition Version: " + recognizer.GetVersionString())

    source.start()
    try:
        while True:
            chunk = source.read(read_size, read_size)
            if not chunk:
                # Nothing was returned; wait briefly before polling again.
                time.sleep(0.01)
                continue

            mel = mel_extractor.signal_to_mel(chunk, mel_gain)
            hit = recognizer.RunDetection(mel)
            if not hit:
                continue

            stamp = datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")
            print(recognizer.GetPredictionLabel(hit) + " " + stamp)
            os.system(play_command + " ./resources/ding.wav")
    except KeyboardInterrupt:
        print("Terminating")
        source.stop()
        sys.exit(0)
def detectKeywords(libpath):
    """Listen for the Firefox, Sheila, Marvin and Alexa keyword models.

    Args:
        libpath: Library path passed to FeatureExtractor and
            AudioRecognition.

    Each non-zero prediction prints which keyword matched (if any) with a
    timestamp and then plays ../resources/ding.wav with the module-level
    play_command. Ctrl+C stops the stream and exits with status 0.
    """
    source = AudiostreamSource()
    mel_extractor = FeatureExtractor(libpath)
    recognizer = AudioRecognition(libpath)
    mel_gain = 1.0

    # Add one or more keyword models
    firefox_id = recognizer.addModel(
        '../../models/Hotword/firefox_v1.4.5.premium', 0.6)
    sheila_id = recognizer.addModel(
        '../../models/Hotword/sheila_v1.4.5.premium', 0.6)
    marvin_id = recognizer.addModel(
        '../../models/Hotword/marvin_v1.4.5.premium', 0.6)
    alexa_id = recognizer.addModel(
        '../../models/Hotword/alexa_v1.4.5.premium', 0.6)

    # Checked in this order; the first matching id wins.
    announcements = (
        (firefox_id, "Firefox detected:"),
        (sheila_id, "Sheila detected:"),
        (marvin_id, "Marvin detected:"),
        (alexa_id, "Alexa detected:"),
    )

    read_size = recognizer.getInputDataSize() * 2
    print("Audio Recognition Version: " + recognizer.getVersionString())

    source.start()
    try:
        while True:
            chunk = source.read(read_size, read_size)
            if not chunk:
                # Nothing was returned; wait briefly before polling again.
                time.sleep(0.01)
                continue

            mel = mel_extractor.signalToMel(chunk, mel_gain)
            hit = recognizer.runDetection(mel)
            if hit == 0:
                continue

            stamp = datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")
            for keyword_id, message in announcements:
                if hit == keyword_id:
                    print(message + stamp)
                    break

            # The ding is played for every non-zero prediction.
            os.system(play_command + " ../resources/ding.wav")
    except KeyboardInterrupt:
        print("Terminating")
        source.stop()
        sys.exit(0)
#!/usr/bin/env python
"""TCP server that runs keyword detection on data sent by one client.

Listens on port 9999, accepts a single connection and passes every chunk
received from it to AudioRecognition.runDetection, printing a message for
each non-zero prediction. The script ends when the client disconnects.
"""
import socket
import sys
import os

sys.path.append('../../python/src')

from libnyumaya import AudioRecognition
from auto_platform import default_libpath

serversocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
serversocket.bind(('', 9999))
serversocket.listen(5)  # become a server socket, maximum 5 connections

detector = AudioRecognition(default_libpath)
keywordIdFirefox = detector.addModel(
    '../../models/Hotword/firefox_v2.0.23.premium', 0.8)

connection, address = serversocket.accept()
try:
    while True:
        # NOTE(review): recv() may return fewer than 640 bytes on a TCP
        # stream; confirm whether runDetection needs complete 640-byte
        # frames.
        buf = connection.recv(640)
        if not buf:
            # An empty result means the peer closed the connection. Without
            # this check the loop would spin forever without blocking.
            break
        prediction = detector.runDetection(buf)
        if prediction != 0:
            print("Keyword detected")
finally:
    # Release both sockets however the loop ends.
    connection.close()
    serversocket.close()
def recordActivations(libpath):
    """Save the audio preceding each model activation to a WAV file.

    Args:
        libpath: Library path passed to FeatureExtractor and to every
            AudioRecognition instance.

    One recognizer is created per entry of the module-level models list.
    For each model a rolling buffer of the most recent audio is kept; when
    the model's detection returns non-zero, that buffer is written via
    save_wav into saveDirectory. Ctrl+C stops the stream and exits with
    status 0.
    """
    source = AudiostreamSource()
    mel_extractor = FeatureExtractor(libpath)

    recognizers = {}
    pre_buffers = {}
    post_buffers = {}

    mel_gain = 1.0
    seconds_before = 2.5  # Seconds before the activation
    seconds_after = 0.5  # Seconds after the activation
    saved = 0

    ensure_dir(saveDirectory)

    pre_limit = int(seconds_before * bytesPerSample * framesPerSecond)
    post_limit = int(seconds_after * bytesPerSample * framesPerSecond)

    for model_path, model_sens, model_name in models:
        recognizer = AudioRecognition(libpath)
        recognizer.addModel(model_path, model_sens)
        recognizers[model_name] = recognizer
        pre_buffers[model_name] = bytearray()
        post_buffers[model_name] = bytearray()

    # Size and version are taken from the recognizer created last.
    read_size = recognizer.getInputDataSize() * 2
    print("Audio Recognition Version: " + recognizer.getVersionString())

    source.start()
    try:
        while True:
            chunk = source.read(read_size, read_size)
            if not chunk:
                # Nothing was returned; wait briefly before polling again.
                time.sleep(0.01)
                continue

            # Fill audio before the activation, keeping only the newest
            # pre_limit bytes per model.
            for model_name in recognizers:
                grown = pre_buffers[model_name] + chunk
                if len(grown) > pre_limit:
                    grown = grown[-pre_limit:]
                pre_buffers[model_name] = grown

            mel = mel_extractor.signalToMel(chunk, mel_gain)

            for model_name, recognizer in recognizers.items():
                if recognizer.runDetection(mel) == 0:
                    continue

                # FIXME: Record after is currently ignored. post_buffers and
                # post_limit are prepared but no audio following the
                # activation is collected or saved.
                target = saveDirectory + "/activation_{}_{}_{}.wav".format(
                    model_name, saved, time.time_ns())
                save_wav(pre_buffers[model_name], target)
                print("Saving Activation to {}".format(target))
                saved += 1
    except KeyboardInterrupt:
        print("Terminating")
        source.stop()
        sys.exit(0)
# Command-line options for the evaluation run. The parser object itself and
# the options that provide FLAGS.graph and FLAGS.labels are defined outside
# this fragment.
parser.add_argument(
    '--good_folder',
    type=str,
    default='./good_files/',
    help='Path to good files.')
parser.add_argument(
    '--noise_folders',
    type=str,
    default='./demand/',
    help='Path to noise files.')
parser.add_argument(
    '--bad_folders',
    type=str,
    default='',
    help='Path to additional bad folders seperated by comma.')
parser.add_argument(
    '--libpath',
    type=str,
    default='../lib/linux/libnyumaya.so',
    help='Path to nyumaya_library')

# Unrecognised arguments are collected in `unparsed` instead of raising.
FLAGS, unparsed = parser.parse_known_args()

# Sensitivity values to evaluate.
sensitivities = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.95,0.99]
detector = AudioRecognition(FLAGS.libpath,FLAGS.graph,FLAGS.labels)
# Each sensitivity is evaluated first without and then with noise.
addnoise = [False,True]
# Result lists; only results_noisy is appended to in this fragment.
results_clean = []
results_noisy = []
results_false = []
print(FLAGS.graph + "\n")
for noise in addnoise:
    for sensitivity in sensitivities:
        wrong_predictions, good_predictions,missed_predictions,samples = run_good_predictions(detector,FLAGS.good_folder,FLAGS.noise_folders,noise,sensitivity)
        result = {}
        result["sensitivity"] = sensitivity
        # Accuracy is the share of samples neither missed nor wrongly
        # predicted.
        # NOTE(review): this divides by `samples`; confirm it cannot be 0.
        result["accuracy"] = 1-(missed_predictions+wrong_predictions)/samples
        if(noise):
            results_noisy.append(result)
def add_detector(self,graph,labels,sensitivity):
    """Create a recognizer for the given graph and labels and register it.

    The AudioRecognition instance is built from self.libpath, graph and
    labels, configured through SetSensitivity(sensitivity) and appended to
    self.detectors.
    """
    recognizer = AudioRecognition(self.libpath, graph, labels)
    recognizer.SetSensitivity(sensitivity)
    self.detectors.append(recognizer)
def label_stream():
    """Two-stage detection: wait for the hotword, then for an action word.

    While idle, every frame goes to the hotword recognizer and the label of
    its prediction is printed. When the label is 'light' the function
    switches to listening mode for up to 20 frames, during which frames go
    to the action recognizer. An action prediction prints a message for
    "on" or "off" and returns to idle.

    Reads the module-level libpath, action_graph, action_labels,
    hotword_graph and hotword_labels. Ctrl+C stops the stream and exits the
    process with status 0.
    """
    hotword_label = 'light'
    listen_frames = 20  # frames to stay in listening mode after the hotword

    hotword_detected = False
    countdown = 0

    audio_stream = AudiostreamSource()
    action_detector = AudioRecognition(libpath, action_graph, action_labels)
    hotword_detector = AudioRecognition(libpath, hotword_graph,
                                        hotword_labels)

    hotword_detector.SetSensitivity(0.5)
    action_detector.SetSensitivity(0.55)
    bufsize = hotword_detector.GetInputDataSize()

    audio_stream.start()
    print("Audio Recognition Version: " + hotword_detector.GetVersionString())

    try:
        while True:
            frame = audio_stream.read(bufsize, bufsize)
            if not frame:
                # Nothing was returned; wait briefly before polling again.
                time.sleep(0.01)
                continue

            # Leave listening mode once the countdown runs out.
            if countdown > 0:
                countdown -= 1
                if countdown == 0:
                    hotword_detected = False
                    print("Stopped Listening")

            if not hotword_detected:
                prediction = hotword_detector.RunDetection(frame)
                # Look the label up once and reuse it for both the print
                # and the comparison.
                label = hotword_detector.GetPredictionLabel(prediction)
                print(label)
                if prediction and label == hotword_label:
                    hotword_detected = True
                    countdown = listen_frames
                    print("Listening")
            else:
                prediction = action_detector.RunDetection(frame)
                if prediction:
                    label = action_detector.GetPredictionLabel(prediction)
                    if label == "on":
                        print("Turning light on")
                    if label == "off":
                        print("Turning light off")
                    # Any action prediction ends the listening window.
                    countdown = 0
                    hotword_detected = False
    except KeyboardInterrupt:
        print("Terminating")
        audio_stream.stop()
        sys.exit(0)
def detectKeywords(libpath):
    """Wait for the Alexa keyword, then pass the following audio to vosk.

    Args:
        libpath: Library path passed to FeatureExtractor and
            AudioRecognition.

    The loop alternates between two modes. In wake-word mode frames go
    through the keyword recognizer. After a detection, audio is read in
    requests of 4000 and fed to a KaldiRecognizer until AcceptWaveform
    returns true, at which point the results are printed and wake-word mode
    resumes. Ctrl+C stops the stream and exits with status 0.
    """
    source = AudiostreamSource()
    mel_extractor = FeatureExtractor(libpath)
    recognizer = AudioRecognition(libpath)

    sample_rate = 16000
    vosk_model = Model("model")

    # Let's define a custom dictionary
    transcriber = KaldiRecognizer(
        vosk_model, sample_rate,
        '["oh one two three four five six seven eight nine zero", "[unk]"]')

    mel_gain = 1.0

    # Add one or more keyword models
    alexa_id = recognizer.addModel(
        '../../models/Hotword/alexa_v3.0.35.premium', 0.85)

    read_size = recognizer.getInputDataSize() * 2
    print("Audio Recognition Version: " + recognizer.getVersionString())

    awaiting_command = False

    source.start()
    try:
        while True:
            if awaiting_command:
                # vosk loop
                chunk = source.read(4000, 4000)
                if not chunk:
                    time.sleep(0.01)
                    continue

                if transcriber.AcceptWaveform(bytes(chunk)):
                    print(transcriber.Result())
                    awaiting_command = False
                    print(transcriber.FinalResult())
                continue

            # Wakeword loop
            chunk = source.read(read_size, read_size)
            if not chunk:
                time.sleep(0.01)
                continue

            mel = mel_extractor.signalToMel(chunk, mel_gain)
            hit = recognizer.runDetection(mel)
            if hit == 0:
                continue

            stamp = datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")
            if hit == alexa_id:
                print("Alexa detected:" + stamp)

            os.system(play_command + " ../resources/ding.wav")
            awaiting_command = True
    except KeyboardInterrupt:
        print("Terminating")
        source.stop()
        sys.exit(0)
class MultiDetector():
    """Recognise multi-word commands with a single AudioRecognition instance.

    Commands are comma-separated word sequences registered with
    add_command. Words are bound to models with add_word. As words are
    detected they are appended to a history; only the words that can
    continue a registered command given that history are kept active in the
    detector. When the history equals a command, its callback is invoked.
    The history is cleared again after `timeout` frames without progress.
    """

    # Optional callbacks. They stay None until registered, so check_timeout
    # and run_frame can test them without raising AttributeError.
    history_callback = None
    detected_callback = None

    def __init__(self, libpath, timeout=40):
        """Create the detector.

        Args:
            libpath: Library path passed to AudioRecognition.
            timeout: Number of run_frame calls after a detection before the
                history is reset.
        """
        self.current_index = 0
        self.number_detectors = 0
        self.countdown = 0
        self.timeout = timeout
        self.commands = []
        self.libpath = libpath
        self.history = []
        self.last_frames = []
        self.max_last_frames = 5
        # Defined up front so run_frame works before any add_command call.
        self.possible_words = []
        self.keyword_map = {}
        self.detector = AudioRecognition(self.libpath)

    #Given the current history which words are we checking for?
    def get_possible_words(self, history):
        """Return the distinct next words of all commands matching history.

        command_starts_with_history supplies, per command, the index of the
        next expected word (negative when the command does not match). An
        index past the end of a command is reported and yields [].
        """
        words = []
        for cmd in self.commands:
            sequence = cmd['command']
            index = command_starts_with_history(sequence, history)
            if index >= len(sequence):
                print("Error index out of range:")
                print("Command: " + str(cmd))
                print("Index: " + str(index))
                print("History: " + str(history))
                return []
            if index >= 0:
                word = sequence[index]
                if word not in words:
                    words.append(word)
        return words

    def UpdateLastFrames(self, frame):
        """Remember frame, keeping at most max_last_frames recent frames."""
        self.last_frames.append(frame)
        if len(self.last_frames) > self.max_last_frames:
            self.last_frames.pop(0)

    def add_command(self, command, callback_function):
        """Register a comma-separated word sequence and its callback.

        An empty or blank command is rejected with a message. The previous
        check, len(command.split(",")) == 0, could never be true because
        split always returns at least one element.
        """
        if not command or not command.strip():
            print("No valid command")
            return
        self.commands.append({
            'command': command.split(","),
            'function': callback_function
        })
        self.update_word_and_detector()

    def add_word(self, graph, name, sensitivity):
        """Load a model and map the id returned by addModel to name."""
        keyword_id = self.detector.addModel(graph, sensitivity)
        self.keyword_map[keyword_id] = name

    def add_reset_history_callback(self, callback_function):
        """Set the callback invoked when the history times out."""
        self.history_callback = callback_function

    def add_detected_callback(self, callback_function):
        """Set the callback invoked after each accepted word."""
        self.detected_callback = callback_function

    def GetInputDataSize(self):
        """Return the detector's input data size."""
        return self.detector.getInputDataSize()

    def maby_execute(self):
        """Run every command whose word list equals the current history.

        Returns:
            True if at least one command callback was invoked. Executing a
            command clears the history, the countdown and the stored frames.
        """
        executed_cmd = False
        for cmd in self.commands:
            if cmd['command'] == self.history:
                cmd['function']()
                self.history = []
                self.countdown = 0
                self.last_frames = []
                executed_cmd = True
        return executed_cmd

    def check_timeout(self):
        """Advance the countdown; reset the history when it reaches zero."""
        if self.countdown > 0:
            self.countdown -= 1
            if self.countdown == 0:
                self.history = []
                self.update_word_and_detector()
                if self.history_callback:
                    self.history_callback()

    def update_word_and_detector(self):
        """Recompute possible_words and (de)activate each model to match."""
        self.possible_words = self.get_possible_words(self.history)
        #Set possible words active
        #Set impossible words inactive
        print(self.possible_words)
        for keyword_id, name in self.keyword_map.items():
            self.detector.setActive(keyword_id, name in self.possible_words)

    def run_frame(self, frame, update_frames=True):
        """Feed one frame to the detector and update the command state.

        Args:
            frame: Input passed to the detector's runDetection.
            update_frames: When True the frame is also stored for replay;
                replayed frames pass False so they are not stored again.
        """
        if update_frames:
            self.UpdateLastFrames(frame)
        self.check_timeout()
        prediction = self.detector.runDetection(frame)
        if prediction:
            label = self.keyword_map[prediction]
            if label in self.possible_words:
                print("Got prediction: " + label)
                self.countdown = self.timeout
                self.history.append(label)
                result = self.maby_execute()
                self.update_word_and_detector()
                if self.detected_callback:
                    self.detected_callback()
                #Command hasn't finished so run last frames in next detectors
                if not result:
                    self.run_last_frames()

    def run_last_frames(self):
        """Replay the stored recent frames through run_frame."""
        for frame in self.last_frames:
            self.run_frame(frame, update_frames=False)

    def print_commands(self):
        """Print every registered command entry."""
        for cmd in self.commands:
            print(cmd)
def label_stream(labels, libpath, verification_path, graph, sensitivity):
    """Detect keywords and run speaker verification on each detection.

    Args:
        labels: Labels argument forwarded to AudioRecognition.
        libpath: Library path for FeatureExtractor, AudioRecognition and
            SpeakerVerification.
        verification_path: Second argument of SpeakerVerification.
        graph: Graph argument forwarded to AudioRecognition.
        sensitivity: Value passed to SetSensitivity.

    The last ten feature frames are kept. On a detection they are
    concatenated and passed to VerifySpeaker. While the module-level
    fingerprints list holds fewer than `enrolling` entries the result is
    appended to it. The result is then compared against the averaged
    fingerprint, if one is available. Ctrl+C stops the stream and exits
    with status 0.
    """
    # Keyword spotting has 200ms frames, Verifiyer takes 2 seconds of audio
    history_limit = 10
    recent_features = []

    source = AudiostreamSource()
    mel_extractor = FeatureExtractor(libpath)

    recognizer = AudioRecognition(libpath, graph, labels)
    recognizer.SetSensitivity(sensitivity)

    verifier = SpeakerVerification(libpath, verification_path)

    input_size = recognizer.GetInputDataSize()
    read_size = input_size * 2

    print("Bufsize: " + str(input_size))
    print("Audio Recognition Version: " + recognizer.GetVersionString())

    # Usage notes, printed one line at a time.
    notice = (
        "WARNING EXPERIMENTAL: The voice verification module can be use to verify if",
        "A command is issued by a certian speaker. It processes speech signals with a",
        "two second length. This experimental version isn't very good yet.",
        "\n\n During enrolling a fingerprint of your voice is caputred. By default 5 samples",
        "Will be captured and averaged. The progam will output a similarity score between 0 and 1",
        "A value of 1 means totally similar, 0 means different.",
        "Currently a threshold of 0.95 seems good",
        "This module should not be run on a Pi Zero, as it uses excessive CPU",
        "Verification can also be helpful to reduce false positives of non speech signals",
    )
    for line in notice:
        print(line)

    source.start()
    try:
        while True:
            chunk = source.read(read_size, read_size)
            if not chunk:
                # Nothing was returned; wait briefly before polling again.
                time.sleep(0.01)
                continue

            mel = mel_extractor.signal_to_mel(chunk)

            # Keep only the newest history_limit feature frames.
            recent_features.append(mel)
            if len(recent_features) > history_limit:
                recent_features.pop(0)

            hit = recognizer.RunDetection(mel)
            if not hit:
                continue

            stamp = datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")
            print(recognizer.GetPredictionLabel(hit) + " " + stamp)
            os.system(play_command + " ./resources/ding.wav")

            # Concatenate the stored frames into one verification input.
            window = bytearray()
            for piece in recent_features:
                window.extend(piece)

            print("Running Verification")
            voiceprint = verifier.VerifySpeaker(window)

            if len(fingerprints) < enrolling:
                print("Enrolling")
                fingerprints.append(voiceprint)
            else:
                print("Completed")

            print(voiceprint)
            reference = get_averaged_fingerprint()
            if reference:
                score = cosine_similarity(voiceprint, reference)
                print("Similarity: " + str(score))
            print("Verification Done")
    except KeyboardInterrupt:
        print("Terminating")
        source.stop()
        sys.exit(0)