def extract_features(df, label2ix, spec_kwargs, vad_kwargs, stacksize=1,
                     frate=100, return_y=False):
    if return_y:
        return_y = 'label' in df
    X = {}
    if return_y:
        y = {}
    spectrum_encoder = Spectral(**spec_kwargs)
    vad_encoder = VAD(**vad_kwargs)
    for ix, fname in enumerate(df.filename.unique()):
        sig, fs = wavread(fname)
        if fs != spec_kwargs['fs']:
            raise ValueError('expected samplerate {}, got {}'.format(
                spec_kwargs['fs'], fs))
        spec = spectrum_encoder.transform(sig)
        spec = (spec - spec.mean(0)) / spec.std(0)
        if stacksize > 1:
            spec = roll_array(spec, stacksize)
        vad = vad_encoder.activations(sig)
        vad = vad.reshape(vad.shape[0], -1)
        if stacksize > 1:
            vad = roll_array(vad, stacksize)
        X_curr = []
        if return_y:
            y_curr = []
        rows_iter = df[df.filename == fname].iterrows()
        for _, row in rows_iter:
            start = row.start
            end = row.end
            start_fr = int(start * frate)
            end_fr = int(end * frate)
            feat = np.hstack((spec[start_fr:end_fr], vad[start_fr:end_fr]))
            X_curr.append(feat.astype(np.float32))
            if return_y:
                y_curr.append(np.ones(feat.shape[0], dtype=np.uint8) *
                              label2ix[row.label])
        X[fname] = np.vstack(X_curr)
        if return_y:
            y[fname] = np.hstack(y_curr)
    if return_y:
        return X, y
    else:
        return X
def get_voice_times(frames, sample_rate, threshold=0, win_size=0.05,
                    hop_size=0.025):
    '''
    Get the times in which the VAD (Voice Activity Detector) detected voice

    Returns: a list of tuples, each containing a voice segment's boundaries
    '''
    detector = VAD(fs=sample_rate, win_size_sec=win_size,
                   win_hop_sec=hop_size)
    decisions = list(detector.detect_speech(frames, threshold=threshold))

    # Smooth the binary hard-decisions vector with a sliding average
    slide_size = int(SLIDING_AVERAGE_WINDOW_SIZE / 2)
    smooth_decisions = []
    for i in range(len(decisions)):
        if (i < slide_size) or (i >= len(decisions) - slide_size):
            smooth_decisions.append(False)
            continue
        # Majority vote
        smooth_decisions.append(
            decisions[i - slide_size:i + slide_size + 1].count(True) > slide_size)

    # Extract speech segments from the hard decisions
    voice_times = []
    old_dec = False
    current_start = 0
    for i, dec in enumerate(decisions):
        if dec and not old_dec:
            # We want to ignore short non-speech segments, so if the previous
            # speech-end is too close to this speech-start, remove the last
            # speech segment and keep searching
            if voice_times and ((i * hop_size) - voice_times[-1][1]) < MIN_SILENT_SEGMENT_LEN_SEC:
                current_start = voice_times[-1][0]
                voice_times = voice_times[:-1]
            else:
                current_start = i * hop_size
        if old_dec and not dec:
            voice_times.append((current_start, i * hop_size))
        old_dec = dec
    return voice_times
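# --- Usage sketch (illustrative, not from the original source) ---
# A minimal example of calling get_voice_times() on a wav file. It assumes the
# module-level constants referenced above are defined; the values and the file
# name below are placeholders, not taken from the original code.
SLIDING_AVERAGE_WINDOW_SIZE = 5   # assumed: number of frames in the majority vote
MIN_SILENT_SEGMENT_LEN_SEC = 0.3  # assumed: gaps shorter than this are merged

if __name__ == '__main__':
    import soundfile as sf
    frames, sample_rate = sf.read('example.wav')  # hypothetical input file
    segments = get_voice_times(frames, sample_rate, threshold=0)
    for seg_start, seg_end in segments:
        print('speech from %.2fs to %.2fs' % (seg_start, seg_end))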
def decode_live(source, volume, aggressiveness, url, topic, broker):

    from pulserecorder import PulseRecorder
    from vad import VAD, BUFFER_DURATION

    stream_id = get_uuid()

    # create requests session for saving cookies
    session = requests.Session()

    try:
        # pulseaudio recorder
        rec = PulseRecorder(source_name=source, volume=volume)
        vad = VAD(aggressiveness=aggressiveness)

        rec.start_recording()
        logging.info("Start talking.")

        while True:
            samples = rec.get_samples()
            audio, finalize = vad.process_audio(samples)
            if not audio:
                continue

            data = {'audio'      : audio,
                    'do_finalize': finalize,
                    'topic'      : topic,
                    'broker'     : broker,
                    'id'         : stream_id,
                    'sample_rate': 16000}

            response = session.post(url, json=data)
            if not response.ok:
                logging.error(response.text)
            else:
                logging.info("\tPrediction : %s - %f"
                             % (response.json()['hstr'],
                                response.json()['confidence']))

    except KeyboardInterrupt:
        logging.info("Keyboard Interrupt: stopping service")
        rec.stop_recording()
        session.close()

    except Exception as e:
        logging.critical(e)
        session.close()
        sys.exit(1)
def remove_silence(signal):
    """
    Detects the speech regions and removes the non-speech regions using VAD
    (Voice Activity Detection)

    :param signal: np.ndarray (n by 1): input audio signal from one speaker
    :return: without_silence: np.ndarray (n by 1): original signal with the
        silence regions removed
    """
    regions = VAD(signal, int(df.sample_rate.iloc[0]), nFFT=512,
                  win_length=0.02, hop_length=0.01, threshold=0.65)
    without_silence = np.array([signal[160 * i: 160 * (i + 1)]
                                for i in range(regions.shape[0])
                                if regions[i] > 0])
    without_silence = without_silence.flatten()
    return without_silence
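# --- Usage sketch (illustrative, not from the original source) ---
# remove_silence() reads the sample rate from a module-level pandas DataFrame
# named df, as the snippet above implies. The 160-sample slices correspond to
# the 0.01 s hop at a 16 kHz sample rate. The DataFrame layout and wav path
# below are placeholder assumptions.
import pandas as pd
import soundfile as sf

df = pd.DataFrame({'sample_rate': [16000]})  # assumed layout
signal, _ = sf.read('speaker.wav')           # hypothetical input file
voiced_only = remove_silence(signal)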
import flask
import uuid
import time
import json
import io
import logging

from flask import Response
from flask import Flask
from flask import request
from flask_cors import CORS, cross_origin

import numpy as np

app = Flask(__name__)
CORS(app)

detector = VAD(frame_duration=0.5, model_path='models/vad')
SAMPLING_RATE = 44100


@app.route("/")
def homepage():
    return "Welcome to the REST API!"


@app.route("/predict", methods=["POST"])
def predict():
    # initialize the data dictionary that will be returned from the view
    result = {"success": False}

    frames = flask.request.data
    array_frames = np.frombuffer(frames, dtype=np.int16)
    array_frames = array_frames.astype(np.float32, order='C') / 32768.0
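# --- Client-side usage sketch (illustrative, not from the original source) ---
# The /predict route above reads flask.request.data as raw little-endian int16
# PCM samples, so a caller can POST the bytes of a mono 16-bit buffer directly.
# The host/port and wav path here are placeholder assumptions.
def _example_client(wav_path='example.wav',
                    endpoint='http://localhost:5000/predict'):
    import requests
    import soundfile as sf
    audio, sr = sf.read(wav_path, dtype='int16')  # 16-bit PCM samples
    resp = requests.post(endpoint, data=audio.tobytes())
    return resp.json()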
class MyForm(QtGui.QMainWindow):
    """ Main form class. """

    def __init__(self, parent=None):
        """ Class constructor. """
        QtGui.QWidget.__init__(self, parent)

        # UI setup
        self.ui = Ui_MusicSplitter()
        self.ui.setupUi(self)
        self.ui.browseButton.clicked.connect(self.HandleBrowseButton)
        self.ui.processButton.clicked.connect(self.HandleProcessButton)
        self.ui.saveButton.clicked.connect(self.HandleSaveButton)
        self.ui.tableView.resizeColumnsToContents()
        self.ui.tableView.setSelectionBehavior(QtGui.QTableView.SelectRows)
        self.ui.tableView.clicked.connect(self.HandleTableClicked)
        self.ui.tableView.setHorizontalScrollBarPolicy(
            QtCore.Qt.ScrollBarAlwaysOff)
        self.tableModel = None

        # VAD class
        self.vad = VAD()
        bandStart = self.ui.bandStartBox.value()
        self.vad.music_start_band = bandStart
        bandEnd = self.ui.bandEndBox.value()
        self.vad.music_end_band = bandEnd
        minSongLen = self.ui.songLenBox.value()
        self.vad.min_song_len = minSongLen
        self.foundSongs = None

        # Class members
        self.inputFile = None

    def HandleTableClicked(self, clickedIndex):
        """ Handle clicked table. """
        index = clickedIndex.row()
        form = EditForm(self)
        song = copy.deepcopy(self.foundSongs[index])
        song[1] = os.path.join(os.path.dirname(str(self.inputFile)),
                               str(song[1]))
        form.Init(song, self.vad.data, self.vad.bitrate)
        form.exec_()
        if form.result == []:
            self.foundSongs.pop(index)
            self.SetupTable(self.foundSongs)
        elif form.result is None:
            pass
        else:
            form.result[1] = os.path.join(
                os.path.dirname(str(self.inputFile)), form.result[1])
            form.result[1] = os.path.relpath(
                form.result[1], os.path.dirname(str(self.inputFile)))
            self.foundSongs[index] = form.result
            self.SetupTable(self.foundSongs)

    def HandleBrowseButton(self):
        """ Handle browse button, select input wav file and open it. """
        f = QtGui.QFileDialog.getOpenFileName(self)
        if f:
            try:
                self.inputFile = open(f, 'r')
                self.inputFile.close()
                self.inputFile = f
                self.ui.chosenFile.setText(f)
            except IOError:
                self.ShowInformationDialog('Can not open input file!')
                self.inputFile = None

    def HandleProcessButton(self):
        """ Handle button for audio processing. """
        self.InitVAD()
        songs_list = self.vad.ProcessFile(self.inputFile,
                                          self.ui.mediancheckBox.isChecked())
        # self.emit(QtCore.SIGNAL('BAR_PROGRESS'), None)
        if songs_list is None:
            self.ShowInformationDialog('Can not process input file!')
        elif not songs_list:
            self.ShowInformationDialog('No songs found in input wav file!')
        else:
            self.SetupTable(songs_list)

    def InitVAD(self):
        """ Init VAD members when processing.

        :returns: True if it was possible to initialize, False otherwise
        :rtype: bool
        """
        if self.ui.levelBox.value() > 0 and self.ui.lengthBox.value() > 0 \
                and self.inputFile is not None \
                and self.ui.bandStartBox.value() < self.ui.bandEndBox.value():
            self.foundSongs = None
            self.vad.thr = self.ui.levelBox.value()
            self.vad.sil_len = self.ui.lengthBox.value()
            self.vad.music_start_band = self.ui.bandStartBox.value()
            self.vad.music_end_band = self.ui.bandEndBox.value()
            self.vad.min_song_len = self.ui.songLenBox.value()
            self.vad.coresNum = self.ui.coresNumBox.value()
        else:
            self.ShowInformationDialog(
                'Please, set input file and appropriate parameters!')

    def HandleSaveButton(self):
        """ Handle save all files button. """
        if self.foundSongs is not None:
            for x in self.foundSongs:
                name = os.path.join(os.path.dirname(str(self.inputFile)), x[1])
                start = EditForm.Time2Frames(x[2], self.vad.bitrate)
                end = EditForm.Time2Frames(x[3], self.vad.bitrate)
                wf.write(name, self.vad.bitrate, self.vad.data[start:end])
        else:
            MyForm.ShowInformationDialog('No results to save!')

    def SetupTable(self, data):
        """ Initialize table with data.

        :param data: table data
        :type data: list
        """
        self.foundSongs = data
        header = ['Num', 'File Name', 'Start', 'End']
        self.tableModel = MyTableModel(data, header, self)
        self.ui.tableView.setModel(self.tableModel)
        MyTableModel.setupColumns(self.ui.tableView)

    @staticmethod
    def ShowInformationDialog(text):
        """ Show simple information dialog. """
        msg = QtGui.QMessageBox()
        msg.setIcon(QtGui.QMessageBox.Information)
        msg.setText(text)
        msg.setStandardButtons(QtGui.QMessageBox.Ok)
        msg.exec_()
parser.add_argument('--window', action='store', dest='window', type=float,
                    required=True)
parser.add_argument('--median-filter', action='store_true', dest='median',
                    required=False)
parser.add_argument('--no-median-filter', action='store_false', dest='median',
                    required=False)
r = parser.parse_args()

vad = VAD()
bandStart = 50
vad.music_start_band = bandStart
minSongLen = 120
vad.min_song_len = minSongLen
vad.music_end_band = r.band_end
vad.sil_len = r.sil_len
vad.thr = r.thr
vad.frame_window = r.window
vad.frame_overlap = r.window / 2
class SoundAnalyser:

    def __init__(self, session, nSampleRate=16000, datatype=np.int16,
                 nNbrChannel=4, nEnergyThreshold=300,
                 bActivateSpeechRecognition=True, bUseAnonymous=True,
                 strUseLang="", bKeepAudioFiles=False, rTimePreVAD=0.150,
                 rTimePostVAD=0.500):
        """
        analyse chunk of data (must have no specific to robot nor naoqi method)
        - nSampleRate: the sample rate of your sound
        - datatype: the way your sound is stored
        - nNbrChannel: ...
        - nEnergyThreshold: threshold for the sound to be analysed for sound reco
        - rVadThreshold: threshold for confidence of the VAD: currently not used
        - bActivateSpeechRecognition: do we send the interesting sound to the speech recognition?
        - bActivateSoundRecognition:
        - strUseLang: lang to use for speech recognition, eg: "fr-FR";
          if left as "": use the language currently set in the tts
        """
        self.session = session
        self.nSampleRate = nSampleRate
        self.datatype = datatype
        self.nNbrChannel = nNbrChannel
        self.bActivateSpeechRecognition = bActivateSpeechRecognition
        self.bUseAnonymous = bUseAnonymous
        self.rEnergyThreshold = nEnergyThreshold  # 60 # 10
        self.strUseLang = strUseLang
        self.rTimePreVAD = rTimePreVAD
        self.rTimePostVAD = rTimePostVAD
        self.rMfccWindowStepInSec = 0.01
        self.nSizePreBuffer = int(
            self.rTimePreVAD * nSampleRate)  # conversion from time to samples
        self.bStoringSpeech = False  # are we currently storing for speech reco?
        self.bStoringNoise = False  # are we currently storing for sound reco?
        self.aRecognizedSpeech = None
        self.bSpeechDetected = False
        self.bVisualFeedback = True
        self.bSpeechAnalysed = False
        # all sound buffers will be stored in mono channel
        self.aStoredDataSpeech = np.array(
            [], dtype=self.datatype)  # a numpy int16 array storing current sound
        self.aStoredDataNoise = np.array([], dtype=self.datatype)
        self.aStoredMfccSound = np.array([], dtype=np.float64)
        self.aStoredSoundPreBuffer = np.array([], dtype=self.datatype)
        self.createdFiles = []
        self.timeLastBufferReceived = time.time()
        self.timeLastPeak = time.time() - 1000
        self.timeLastVAD = self.timeLastPeak
        self.strLastRecognized = ""
        self.strDstPath = "/tmp/"
        self.debug_fileAllSpeech = None
        self.bKeepAudioFiles = bKeepAudioFiles
        self.bIsOnRobot = runner.is_on_robot()
        home = os.path.expanduser("~")
        self.storeDir = home + "/.abcdk/prevWavs"
        if not os.path.isdir(self.storeDir):
            os.makedirs(self.storeDir)
        self.vad = VAD(self.rTimePreVAD, self.rTimePostVAD)
        self.fs = freespeech.FreeSpeech(self.session)
        self.mem = self.session.service("ALMemory")
        self.leds = LedsDcm.LedsDcm(self.session)
        self.leds.createProxy()
        self.leds.createAliases()
        self.rEndLedLockTime = time.time()
        self.touch = self.mem.subscriber("TouchChanged")
        self.id_touch = self.touch.signal.connect(
            functools.partial(self.onTouch, "TouchChanged"))
        self.touched = False
        self.runningThread = True
        thread.start_new_thread(self.asrOnFile, ())

    def __del__(self):
        self.stop()
        if self.debug_fileAllSpeech != None:
            self.debug_fileAllSpeech.write(self.strDstPath +
                                           "concatenated_speechs.wav")

    def stop(self):
        self.runningThread = False

    def pause(self):
        # stop the current recording of audio
        self.bStoringSpeech = False
        # reset the stored audio buffer
        self.aStoredDataSpeech = np.array([], dtype=self.datatype)

    def setKeepAudioFiles(self, bNewState):
        self.bKeepAudioFiles = bNewState

    def setVisualFeedback(self, bNewState):
        self.bVisualFeedback = bNewState

    def _writeBufferToFile(self, datas, strFilename):
        wavFile = wav.Wav()
        wavFile.new(nSamplingRate=self.nSampleRate, nNbrChannel=1,
                    nNbrBitsPerSample=16)
        wavFile.addData(datas)
        bRetWrite = wavFile.write(strFilename)
        if not self.bIsOnRobot:
            if self.debug_fileAllSpeech == None:
                self.debug_fileAllSpeech = wav.Wav()
                self.debug_fileAllSpeech.new(nSamplingRate=self.nSampleRate,
                                             nNbrChannel=1,
                                             nNbrBitsPerSample=16)
            self.debug_fileAllSpeech.addData(datas)
            self.debug_fileAllSpeech.addData(
                np.zeros(self.nSampleRate / 2, self.datatype))
        return bRetWrite

    def _sendToSpeechReco(self, strFilename):
        """
        Send a file to the speech recognition engine.
        Return: the string of recognized text, + confidence,
                or None if nothing recognized
        """
        logging.info("_sendToSpeechReco: sending to speech reco " + strFilename
                     + " to detect it in " + self.strUseLang)
        retVal = None
        timeBegin = timeit.default_timer()
        retVal = self.fs.analyse_anonymous(
            strFilename, strUseLang=self.strUseLang) if self.bUseAnonymous \
            else self.fs.analyse(strFilename, strUseLang=self.strUseLang)
        rProcessDuration = timeit.default_timer() - timeBegin
        logging.debug(
            "SoundAnalyser._sendToSpeechReco: freeSpeech analysis processing takes: %5.2fs"
            % rProcessDuration)
        if (0):  # disabled
            self.rSkipBufferTime = rProcessDuration  # if we're here, it's already at zero
        if retVal != None:
            retVal = [retVal[0][0], retVal[0][1]]
            txtForRenameFile = retVal[0]
        else:
            txtForRenameFile = "Not_Recognized"
        if self.bKeepAudioFiles:
            newfilename = strFilename.replace(
                ".wav", "__%s.wav" % self.convertForFilename(txtForRenameFile))
            baseFilename = os.path.basename(strFilename)
            newBaseFilename = os.path.basename(newfilename)
            newfilename = self.storeDir + "/" + newBaseFilename
            shutil.move(strFilename, newfilename)
            logging.info("Saved wav file in " + newfilename)
        return retVal

    def convertForFilename(self, strTxt):
        """
        convert a text to be usable as a filename
        "toto is happy" => "toto_is_happy"
        """
        s = strTxt
        s = s.replace(" ", "_")
        s = s.replace("'", "_")
        s = s.replace("\"", "_")
        s = s.replace(",", "_")
        s = s.replace(":", "_")
        s = s.replace("/", "_")
        s = s.replace("\\", "_")
        s = s.replace("-", "_")
        return s
    # convertForFilename - end

    def _sendFileToRemoteSoundRecognition(self, strFilename):
        """
        ugly send to the sound recognition in background from a remote computer
        (just for ears project)
        """
        #~ os.system( "scp -q %s nao@%s:%s" % (strFilename, self.strNaoIP, strFilename) ) # assume folder exists at destination
        #~ if( self.mem == None ):
        #~     import naoqi
        #~     self.mem = naoqi.ALProxy( "ALMemory", self.strNaoIP, 9559 )
        #~ if( self.mem != None ):
        #~     self.mem.raiseMicroEvent( "SoundRecoAnalyseFilename", strFilename )
        pass

    def _sendFileToSoundRecognition(self, buffer):
        pass

    def setInputFile(self, strFilename):
        """
        analyse a file, return all voice segments
        """
        s = wav.Wav(strFilename)
        if not s.isOpen():
            raise OSError("File not found: " + strFilename)
        timeBegin = time.time()
        aMixedSoundData = s.data
        if s.nNbrChannel > 1:
            aMixedSoundData = aMixedSoundData[0::s.nNbrChannel] \
                + aMixedSoundData[1::s.nNbrChannel]
        self.nSampleRate = s.nSamplingRate
        nNbrOfSamplesByChannel = len(s.data) / s.nNbrChannel
        rSoundDataDuration = nNbrOfSamplesByChannel / float(self.nSampleRate)
        return self.analyseBuffer(aMixedSoundData, rSoundDataDuration)

    def setInputBuffer(self, aInterlacedSoundData, bVerbose=False):
        """
        This is THE method that receives all the sound buffers from the
        "ALAudioDevice" module, the sound card or a file
        - aSoundData: an interlaced chunk of wav of various length
        Return [bNoiseDetected, bSpeechDetected, aRecognizedNoise,
                aRecognizedSpeech, aRecognizedUser]
        - aRecognizedXxx: a pair [strText, rConfidence, rDuration]
            - strText: the recognized text, or name of the recognized sound
            - rConfidence: [0..1]
            - rDuration: duration of the recognized sound
          or [] if nothing recognized, or None if nothing analysed
        - aRecognizedUser: to be defined later
        """
        self.bVerbose = bVerbose
        nNbrOfSamplesByChannel = len(aInterlacedSoundData) / self.nNbrChannel
        rSoundDataDuration = nNbrOfSamplesByChannel / float(self.nSampleRate)
        logging.debug(
            "Receiving a buffer of len: %s (equivalent to %5.3fs) (shape:%s)"
            % (len(aInterlacedSoundData), rSoundDataDuration,
               str(aInterlacedSoundData.shape)))
        if time.time() > self.timeLastBufferReceived + 0.7:
            # our buffer is now too old (for instance the robot was speaking
            # and we were inhibited)
            logging.info(
                "Clearing buffer after gap of %5.2fs (after inhibition...)"
                % (time.time() - self.timeLastBufferReceived))
            self.bStoringSpeech = False
            self.aStoredDataSpeech = np.array([], dtype=self.datatype)  # reset
        self.timeLastBufferReceived = time.time()
        aSoundData = np.reshape(aInterlacedSoundData,
                                (self.nNbrChannel, nNbrOfSamplesByChannel), 'F')
        # sum of two mics
        aMixedSoundData = aSoundData[0] + aSoundData[1]
        self.vadSplitBuffer(aMixedSoundData, rSoundDataDuration)

    def vadSplitBuffer(self, aMixedSoundData, rSoundDataDuration):
        global recordedFilesMutex
        start = timeit.default_timer()
        computedMfcc = base.mfcc(aMixedSoundData, samplerate=self.nSampleRate,
                                 winstep=self.rMfccWindowStepInSec)
        stop = timeit.default_timer()
        logging.debug("Time for mfcc: " + str(stop - start))
        start = timeit.default_timer()
        aVadStateChange = self.vad.computeFromMfcc(computedMfcc,
                                                   self.rMfccWindowStepInSec)
        stop = timeit.default_timer()
        logging.debug("aVadStateChange: %s" % aVadStateChange)
        logging.debug("Time for vad " + str(stop - start))
        if len(aVadStateChange) > 0:
            nColor = 0x0000ff  # blue
            rTime = 0.5
        else:
            nColor = 0x808080  # grey
            rTime = 0.0
        self.visualFeedback(rTime, nColor)

        # analyse VAD results
        bStoringSpeechDone = False
        self.bSpeechDetected = False
        self.bSpeechAnalysed = False
        for i in range(len(aVadStateChange)):
            b, t = aVadStateChange[i]
            if b:
                if t < 0.:
                    # take data from preBuffer
                    nNbrSamples = int(-t * self.nSampleRate)
                    self.aStoredDataSpeech = self.aStoredSoundPreBuffer[-nNbrSamples:]
                    t = 0.
                # add data up to the next change:
                if i + 1 < len(aVadStateChange):
                    rDuration = aVadStateChange[i + 1][1] - t
                else:
                    rDuration = rSoundDataDuration - t
                nStart = int(t * self.nSampleRate)
                nDuration = int(rDuration * self.nSampleRate)
                self.aStoredDataSpeech = np.concatenate(
                    (self.aStoredDataSpeech,
                     aMixedSoundData[nStart:nStart + nDuration]))
                self.bStoringSpeech = True
                bStoringSpeechDone = True
            else:
                self.bStoringSpeech = False
                self.mem.raiseMicroEvent("Audio/SpeechDetected", True)
                strFilename = self.strDstPath + datetime.datetime.now(
                ).strftime("%Y_%m_%d-%Hh%Mm%Ss%fms") + "_speech.wav"
                logging.debug("Outputting speech to file: '%s'" % strFilename)
                rDuration = len(self.aStoredDataSpeech) / float(self.nSampleRate)
                start = timeit.default_timer()
                bRetWrite = self._writeBufferToFile(self.aStoredDataSpeech,
                                                    strFilename)
                stop = timeit.default_timer()
                logging.debug("Writing buffer to file: " + str(stop - start))
                if bRetWrite:
                    recordedFilesMutex.acquire()
                    self.createdFiles.append((strFilename, rDuration))
                    recordedFilesMutex.release()
                # reset the stored speech
                self.aStoredDataSpeech = np.array([], dtype=self.datatype)
                logging.debug("Length aStoredDataSpeech: '%i'"
                              % len(self.aStoredDataSpeech))

        if self.bStoringSpeech and not bStoringSpeechDone:
            self.aStoredDataSpeech = np.concatenate(
                (self.aStoredDataSpeech, aMixedSoundData))
            self.bSpeechDetected = True
            logging.debug("recording!")

        if self.bStoringSpeech:
            if len(self.aStoredDataSpeech) > self.nSampleRate * 8 or self.touched:
                # if more than 14 sec, keep only the last 10 sec
                # logging.warning( "SoundAnalyser.analyse: buffer too long, keeping only 10 last seconds..." )
                # self.aStoredDataSpeech = self.aStoredDataSpeech[self.nSampleRate*4:]  # removing by chunks of 4 sec
                if self.touched:
                    logging.warning(
                        "SoundAnalyser.analyse: Pepper touched, forcing analysis")
                    self.touched = False
                else:
                    logging.warning(
                        "SoundAnalyser.analyse: buffer too long, forcing analysis")
                # to be added -> send audio and start reco again
                self.bStoringSpeech = False
                self.mem.raiseMicroEvent("Audio/SpeechDetected", True)
                strFilename = self.strDstPath + datetime.datetime.now(
                ).strftime("%Y_%m_%d-%Hh%Mm%Ss%fms") + "_speech.wav"
                logging.debug("Outputting speech to file: '%s'" % strFilename)
                rDuration = len(self.aStoredDataSpeech) / float(self.nSampleRate)
                bRetWrite = self._writeBufferToFile(self.aStoredDataSpeech,
                                                    strFilename)
                logging.debug("Writing buffer to file: ")
                if bRetWrite:
                    recordedFilesMutex.acquire()
                    self.createdFiles.append((strFilename, rDuration))
                    recordedFilesMutex.release()
                self.aStoredDataSpeech = np.array([], dtype=self.datatype)
                nNbrSamples = int(0.3 * self.nSampleRate)
                self.aStoredDataSpeech = self.aStoredSoundPreBuffer[-nNbrSamples:]
                self.bStoringSpeech = True
                # rDuration = rSoundDataDuration-t
                # nStart = int(-0.3*self.nSampleRate)
                # nDuration = int(rDuration*self.nSampleRate)
                # self.aStoredDataSpeech = np.concatenate( (self.aStoredDataSpeech, aMixedSoundData[nStart:nStart+nDuration]) )

        # prebuffer store and offsetting
        self.aStoredSoundPreBuffer = np.concatenate(
            (self.aStoredSoundPreBuffer, aMixedSoundData))
        self.aStoredSoundPreBuffer = self.aStoredSoundPreBuffer[-self.nSizePreBuffer:]

    def asrOnFile(self):
        global recordedFilesMutex
        while self.runningThread == True:
            recordedFilesMutex.acquire()
            if len(self.createdFiles) > 0:
                inputFile, rDuration = self.createdFiles.pop(0)
            else:
                inputFile = ""
            recordedFilesMutex.release()
            if inputFile != "":
                ret = self._sendToSpeechReco(inputFile)
                if ret != None:
                    self.aRecognizedSpeech = [ret[0], ret[1], rDuration]
                else:
                    self.aRecognizedSpeech = None
                self.bSpeechDetected = False
                self.bSpeechAnalysed = True
                if self.aRecognizedSpeech != None:
                    nColor = 0x00ff00  # green
                    self.mem.raiseMicroEvent("Audio/RecognizedWords", [[
                        self.aRecognizedSpeech[0], self.aRecognizedSpeech[1]
                    ]])
                else:
                    nColor = 0xff0000  # red
                    self.mem.raiseMicroEvent("Audio/RecognizedWords", [])
                rTime = 1.0
                self.visualFeedback(rTime, nColor)
            time.sleep(0.1)

    def onTouch(self, msg, value):
        self.touched = True

    def visualFeedback(self, rTime, nColor):
        if self.bVisualFeedback == True:
            ledsMutex.acquire()
            if time.time() > self.rEndLedLockTime:
                logging.debug("Actually do the feedback")
                self.leds.setEyesOneLed(1, .0, nColor)
                self.leds.setChestColor(.0, nColor)
                self.rEndLedLockTime = time.time() + rTime
                logging.debug("End lock time: " + str(self.rEndLedLockTime))
            ledsMutex.release()

    def computeEnergyBestNumpy(self, aSample):
        """
        Compute sound energy on a mono channel sample;
        aSample contains signed int from -32000 to 32000 (in fact any signed value)
        """
        if (len(aSample) < 1):
            return 0
        diff = np.diff(aSample)
        diff = np.array(diff, dtype=np.int32)
        diff *= diff
        rEnergy = np.mean(diff)
        nEnergyFinal = int(math.sqrt(rEnergy))
        return nEnergyFinal
print('|{0:>15}|{1:<30.2f}|'.format('', phi))
print('+{0:->15}+{1:-<30}+'.format('-', '-'))

for wavPath in filePathes:
    t = time.time()
    head, fileName = os.path.split(wavPath)
    clean, sr = sf.read(wavPath)
    maxi = max(abs(clean))
    clean = [samp * (0.99 / maxi) for samp in clean]  # magnitude normalization.
    noiseExt = np.resize(noise, len(clean))
    vad = VAD(np.asarray(clean[19960521:19960521 + 360 * sr]), sr,
              threshold=0.95)
    ratio = list(vad).count(1) / len(vad)
    magWave = np.sum(np.square(clean)) / (ratio * len(clean))
    # print(ratio, len(clean[19960521:19960521+120*sr]), len(vad), list(vad).count(1))
    print('|{0:>15}|{1:<30}|'.format('file name', fileName))
    print('|{0:>15}|{1:<30.4f}|'.format('vad ratio', ratio))
    print('|{0:>15}|{1:<30}|'.format('conv', 'start'))
    fakeMic1 = np.convolve(conv1, clean)
    fakeMic2 = np.convolve(conv2, clean)
    maxi = max(abs(fakeMic1))
    fakeMic1 = [samp * (0.99 / maxi) for samp in fakeMic1]  # magnitude normalization.
    maxi = max(abs(fakeMic2))