def extract_features(df, label2ix, spec_kwargs, vad_kwargs, stacksize=1,
                     frate=100, return_y=False):
    if return_y:
        return_y = 'label' in df
    X = {}
    if return_y:
        y = {}
    spectrum_encoder = Spectral(**spec_kwargs)
    vad_encoder = VAD(**vad_kwargs)
    for ix, fname in enumerate(df.filename.unique()):
        sig, fs = wavread(fname)
        if fs != spec_kwargs['fs']:
            raise ValueError('expected samplerate {}, got {}'.format(
                spec_kwargs['fs'], fs))
        spec = spectrum_encoder.transform(sig)
        spec = (spec - spec.mean(0)) / spec.std(0)
        if stacksize > 1:
            spec = roll_array(spec, stacksize)
        vad = vad_encoder.activations(sig)
        vad = vad.reshape(vad.shape[0], -1)
        if stacksize > 1:
            vad = roll_array(vad, stacksize)
        X_curr = []
        if return_y:
            y_curr = []
        rows_iter = df[df.filename == fname].iterrows()
        for _, row in rows_iter:
            start = row.start
            end = row.end
            start_fr = int(start * frate)
            end_fr = int(end * frate)
            feat = np.hstack((spec[start_fr:end_fr], vad[start_fr:end_fr]))
            X_curr.append(feat.astype(np.float32))
            if return_y:
                y_curr.append(np.ones(feat.shape[0], dtype=np.uint8) *
                              label2ix[row.label])
        X[fname] = np.vstack(X_curr)
        if return_y:
            y[fname] = np.hstack(y_curr)
    if return_y:
        return X, y
    else:
        return X
def get_voice_times(frames, sample_rate, threshold=0, win_size=0.05,
                    hop_size=0.025):
    '''
    Get the times in which the VAD (Voice Activity Detector) detected voice

    Returns: a list of tuples, each containing a voice segment's boundaries
    '''
    detector = VAD(fs=sample_rate, win_size_sec=win_size,
                   win_hop_sec=hop_size)
    decisions = list(detector.detect_speech(frames, threshold=threshold))

    # Smooth the binary hard-decisions vector with a sliding average
    slide_size = int(SLIDING_AVERAGE_WINDOW_SIZE / 2)
    smooth_decisions = []
    for i in range(len(decisions)):
        if (i < slide_size) or (i >= len(decisions) - slide_size):
            smooth_decisions.append(False)
            continue
        # Majority vote
        smooth_decisions.append(
            decisions[i - slide_size:i + slide_size + 1].count(True) > slide_size)

    # Extract speech segments from the hard decisions
    voice_times = []
    old_dec = False
    current_start = 0
    for i, dec in enumerate(decisions):
        if dec and not old_dec:
            # We want to ignore short non-speech segments, so if the previous
            # speech-end is too close to this speech-start, remove the last
            # speech segment and keep searching
            if voice_times and ((i * hop_size) - voice_times[-1][1]) < MIN_SILENT_SEGMENT_LEN_SEC:
                current_start = voice_times[-1][0]
                voice_times = voice_times[:-1]
            else:
                current_start = i * hop_size
        if old_dec and not dec:
            voice_times.append((current_start, i * hop_size))
        old_dec = dec
    return voice_times
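# --- Usage sketch (illustrative, not from the original source) ---
# A minimal example of calling get_voice_times() on a wav file. It assumes the
# module-level constants referenced above are defined; the values and the file
# name below are placeholders, not taken from the original code.
SLIDING_AVERAGE_WINDOW_SIZE = 5   # assumed: number of frames in the majority vote
MIN_SILENT_SEGMENT_LEN_SEC = 0.3  # assumed: gaps shorter than this are merged

if __name__ == '__main__':
    import soundfile as sf
    frames, sample_rate = sf.read('example.wav')  # hypothetical input file
    segments = get_voice_times(frames, sample_rate, threshold=0)
    for seg_start, seg_end in segments:
        print('speech from %.2fs to %.2fs' % (seg_start, seg_end))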
def decode_live(source, volume, aggressiveness, url, topic, broker):

    from pulserecorder import PulseRecorder
    from vad import VAD, BUFFER_DURATION

    stream_id = get_uuid()

    # create requests session for saving cookies
    session = requests.Session()

    try:
        # pulseaudio recorder
        rec = PulseRecorder(source_name=source, volume=volume)
        vad = VAD(aggressiveness=aggressiveness)

        rec.start_recording()
        logging.info("Start talking.")

        while True:
            samples = rec.get_samples()
            audio, finalize = vad.process_audio(samples)
            if not audio:
                continue

            data = {'audio'      : audio,
                    'do_finalize': finalize,
                    'topic'      : topic,
                    'broker'     : broker,
                    'id'         : stream_id,
                    'sample_rate': 16000}

            response = session.post(url, json=data)
            if not response.ok:
                logging.error(response.text)
            else:
                logging.info("\tPrediction : %s - %f"
                             % (response.json()['hstr'],
                                response.json()['confidence']))

    except KeyboardInterrupt:
        logging.info("Keyboard Interrupt: stopping service")
        rec.stop_recording()
        session.close()

    except Exception as e:
        logging.critical(e)
        session.close()
        sys.exit(1)
def remove_silence(signal):
    """
    Detects the speech regions and removes the non-speech regions using VAD
    (Voice Activity Detection)

    :param signal: np.ndarray (n by 1): input audio signal from one speaker
    :return: without_silence: np.ndarray (n by 1): original signal with the
        silence regions removed
    """
    regions = VAD(signal, int(df.sample_rate.iloc[0]), nFFT=512,
                  win_length=0.02, hop_length=0.01, threshold=0.65)
    without_silence = np.array([signal[160 * i: 160 * (i + 1)]
                                for i in range(regions.shape[0])
                                if regions[i] > 0])
    without_silence = without_silence.flatten()
    return without_silence
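# --- Usage sketch (illustrative, not from the original source) ---
# remove_silence() reads the sample rate from a module-level pandas DataFrame
# named df, as the snippet above implies. The 160-sample slices correspond to
# the 0.01 s hop at a 16 kHz sample rate. The DataFrame layout and wav path
# below are placeholder assumptions.
import pandas as pd
import soundfile as sf

df = pd.DataFrame({'sample_rate': [16000]})  # assumed layout
signal, _ = sf.read('speaker.wav')           # hypothetical input file
voiced_only = remove_silence(signal)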
import flask
import uuid
import time
import json
import io
import logging

from flask import Response
from flask import Flask
from flask import request
from flask_cors import CORS, cross_origin

import numpy as np

app = Flask(__name__)
CORS(app)

detector = VAD(frame_duration=0.5, model_path='models/vad')
SAMPLING_RATE = 44100


@app.route("/")
def homepage():
    return "Welcome to the REST API!"


@app.route("/predict", methods=["POST"])
def predict():
    # initialize the data dictionary that will be returned from the view
    result = {"success": False}

    frames = flask.request.data
    array_frames = np.frombuffer(frames, dtype=np.int16)
    array_frames = array_frames.astype(np.float32, order='C') / 32768.0
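# --- Client-side usage sketch (illustrative, not from the original source) ---
# The /predict route above reads flask.request.data as raw little-endian int16
# PCM samples, so a caller can POST the bytes of a mono 16-bit buffer directly.
# The host/port and wav path here are placeholder assumptions.
def _example_client(wav_path='example.wav',
                    endpoint='http://localhost:5000/predict'):
    import requests
    import soundfile as sf
    audio, sr = sf.read(wav_path, dtype='int16')  # 16-bit PCM samples
    resp = requests.post(endpoint, data=audio.tobytes())
    return resp.json()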
class MyForm(QtGui.QMainWindow):
    """ Main form class. """

    def __init__(self, parent=None):
        """ Class constructor. """
        QtGui.QWidget.__init__(self, parent)

        # UI setup
        self.ui = Ui_MusicSplitter()
        self.ui.setupUi(self)
        self.ui.browseButton.clicked.connect(self.HandleBrowseButton)
        self.ui.processButton.clicked.connect(self.HandleProcessButton)
        self.ui.saveButton.clicked.connect(self.HandleSaveButton)
        self.ui.tableView.resizeColumnsToContents()
        self.ui.tableView.setSelectionBehavior(QtGui.QTableView.SelectRows)
        self.ui.tableView.clicked.connect(self.HandleTableClicked)
        self.ui.tableView.setHorizontalScrollBarPolicy(
            QtCore.Qt.ScrollBarAlwaysOff)
        self.tableModel = None

        # VAD class
        self.vad = VAD()
        bandStart = self.ui.bandStartBox.value()
        self.vad.music_start_band = bandStart
        bandEnd = self.ui.bandEndBox.value()
        self.vad.music_end_band = bandEnd
        minSongLen = self.ui.songLenBox.value()
        self.vad.min_song_len = minSongLen
        self.foundSongs = None

        # Class members
        self.inputFile = None

    def HandleTableClicked(self, clickedIndex):
        """ Handle clicked table. """
        index = clickedIndex.row()
        form = EditForm(self)
        song = copy.deepcopy(self.foundSongs[index])
        song[1] = os.path.join(os.path.dirname(str(self.inputFile)),
                               str(song[1]))
        form.Init(song, self.vad.data, self.vad.bitrate)
        form.exec_()
        if form.result == []:
            self.foundSongs.pop(index)
            self.SetupTable(self.foundSongs)
        elif form.result is None:
            pass
        else:
            form.result[1] = os.path.join(
                os.path.dirname(str(self.inputFile)), form.result[1])
            form.result[1] = os.path.relpath(
                form.result[1], os.path.dirname(str(self.inputFile)))
            self.foundSongs[index] = form.result
            self.SetupTable(self.foundSongs)

    def HandleBrowseButton(self):
        """ Handle browse button, select input wav file and open it. """
        f = QtGui.QFileDialog.getOpenFileName(self)
        if f:
            try:
                self.inputFile = open(f, 'r')
                self.inputFile.close()
                self.inputFile = f
                self.ui.chosenFile.setText(f)
            except IOError:
                self.ShowInformationDialog('Can not open input file!')
                self.inputFile = None

    def HandleProcessButton(self):
        """ Handle button for audio processing. """
        self.InitVAD()
        songs_list = self.vad.ProcessFile(self.inputFile,
                                          self.ui.mediancheckBox.isChecked())
        # self.emit(QtCore.SIGNAL('BAR_PROGRESS'), None)
        if songs_list is None:
            self.ShowInformationDialog('Can not process input file!')
        elif not songs_list:
            self.ShowInformationDialog('No songs found in input wav file!')
        else:
            self.SetupTable(songs_list)

    def InitVAD(self):
        """ Init VAD members when processing.

        :returns: True if it was possible to initialize, False otherwise
        :rtype: bool
        """
        if self.ui.levelBox.value() > 0 and self.ui.lengthBox.value() > 0 \
                and self.inputFile is not None \
                and self.ui.bandStartBox.value() < self.ui.bandEndBox.value():
            self.foundSongs = None
            self.vad.thr = self.ui.levelBox.value()
            self.vad.sil_len = self.ui.lengthBox.value()
            self.vad.music_start_band = self.ui.bandStartBox.value()
            self.vad.music_end_band = self.ui.bandEndBox.value()
            self.vad.min_song_len = self.ui.songLenBox.value()
            self.vad.coresNum = self.ui.coresNumBox.value()
        else:
            self.ShowInformationDialog(
                'Please, set input file and appropriate parameters!')

    def HandleSaveButton(self):
        """ Handle save all files button. """
        if self.foundSongs is not None:
            for x in self.foundSongs:
                name = os.path.join(os.path.dirname(str(self.inputFile)), x[1])
                start = EditForm.Time2Frames(x[2], self.vad.bitrate)
                end = EditForm.Time2Frames(x[3], self.vad.bitrate)
                wf.write(name, self.vad.bitrate, self.vad.data[start:end])
        else:
            MyForm.ShowInformationDialog('No results to save!')

    def SetupTable(self, data):
        """ Initialize table with data.

        :param data: table data
        :type data: list
        """
        self.foundSongs = data
        header = ['Num', 'File Name', 'Start', 'End']
        self.tableModel = MyTableModel(data, header, self)
        self.ui.tableView.setModel(self.tableModel)
        MyTableModel.setupColumns(self.ui.tableView)

    @staticmethod
    def ShowInformationDialog(text):
        """ Show simple information dialog. """
        msg = QtGui.QMessageBox()
        msg.setIcon(QtGui.QMessageBox.Information)
        msg.setText(text)
        msg.setStandardButtons(QtGui.QMessageBox.Ok)
        msg.exec_()
parser.add_argument('--window', action='store', dest='window', type=float,
                    required=True)
parser.add_argument('--median-filter', action='store_true', dest='median',
                    required=False)
parser.add_argument('--no-median-filter', action='store_false', dest='median',
                    required=False)
r = parser.parse_args()

vad = VAD()
bandStart = 50
vad.music_start_band = bandStart
minSongLen = 120
vad.min_song_len = minSongLen
vad.music_end_band = r.band_end
vad.sil_len = r.sil_len
vad.thr = r.thr
vad.frame_window = r.window
vad.frame_overlap = r.window / 2
class SoundAnalyser:

    def __init__(self, session, nSampleRate=16000, datatype=np.int16,
                 nNbrChannel=4, nEnergyThreshold=300,
                 bActivateSpeechRecognition=True, bUseAnonymous=True,
                 strUseLang="", bKeepAudioFiles=False, rTimePreVAD=0.150,
                 rTimePostVAD=0.500):
        """
        analyse chunk of data (must have no specific to robot nor naoqi method)
        - nSampleRate: the sample rate of your sound
        - datatype: the way your sound is stored
        - nNbrChannel: ...
        - nEnergyThreshold: threshold for the sound to be analysed for sound reco
        - rVadThreshold: threshold for confidence of the VAD: currently not used
        - bActivateSpeechRecognition: do we send the interesting sound to the speech recognition?
        - bActivateSoundRecognition:
        - strUseLang: lang to use for speech recognition, eg: "fr-FR";
          if left as "": use the language currently set in the tts
        """
        self.session = session
        self.nSampleRate = nSampleRate
        self.datatype = datatype
        self.nNbrChannel = nNbrChannel
        self.bActivateSpeechRecognition = bActivateSpeechRecognition
        self.bUseAnonymous = bUseAnonymous
        self.rEnergyThreshold = nEnergyThreshold  # 60 # 10
        self.strUseLang = strUseLang
        self.rTimePreVAD = rTimePreVAD
        self.rTimePostVAD = rTimePostVAD
        self.rMfccWindowStepInSec = 0.01
        self.nSizePreBuffer = int(
            self.rTimePreVAD * nSampleRate)  # conversion from time to samples
        self.bStoringSpeech = False  # are we currently storing for speech reco?
        self.bStoringNoise = False  # are we currently storing for sound reco?
        self.aRecognizedSpeech = None
        self.bSpeechDetected = False
        self.bVisualFeedback = True
        self.bSpeechAnalysed = False
        # all sound buffers will be stored in mono channel
        self.aStoredDataSpeech = np.array(
            [], dtype=self.datatype)  # a numpy int16 array storing current sound
        self.aStoredDataNoise = np.array([], dtype=self.datatype)
        self.aStoredMfccSound = np.array([], dtype=np.float64)
        self.aStoredSoundPreBuffer = np.array([], dtype=self.datatype)
        self.createdFiles = []
        self.timeLastBufferReceived = time.time()
        self.timeLastPeak = time.time() - 1000
        self.timeLastVAD = self.timeLastPeak
        self.strLastRecognized = ""
        self.strDstPath = "/tmp/"
        self.debug_fileAllSpeech = None
        self.bKeepAudioFiles = bKeepAudioFiles
        self.bIsOnRobot = runner.is_on_robot()
        home = os.path.expanduser("~")
        self.storeDir = home + "/.abcdk/prevWavs"
        if not os.path.isdir(self.storeDir):
            os.makedirs(self.storeDir)
        self.vad = VAD(self.rTimePreVAD, self.rTimePostVAD)
        self.fs = freespeech.FreeSpeech(self.session)
        self.mem = self.session.service("ALMemory")
        self.leds = LedsDcm.LedsDcm(self.session)
        self.leds.createProxy()
        self.leds.createAliases()
        self.rEndLedLockTime = time.time()
        self.touch = self.mem.subscriber("TouchChanged")
        self.id_touch = self.touch.signal.connect(
            functools.partial(self.onTouch, "TouchChanged"))
        self.touched = False
        self.runningThread = True
        thread.start_new_thread(self.asrOnFile, ())

    def __del__(self):
        self.stop()
        if self.debug_fileAllSpeech != None:
            self.debug_fileAllSpeech.write(self.strDstPath +
                                           "concatenated_speechs.wav")

    def stop(self):
        self.runningThread = False

    def pause(self):
        # stop the current recording of audio
        self.bStoringSpeech = False
        # reset the stored audio buffer
        self.aStoredDataSpeech = np.array([], dtype=self.datatype)

    def setKeepAudioFiles(self, bNewState):
        self.bKeepAudioFiles = bNewState

    def setVisualFeedback(self, bNewState):
        self.bVisualFeedback = bNewState

    def _writeBufferToFile(self, datas, strFilename):
        wavFile = wav.Wav()
        wavFile.new(nSamplingRate=self.nSampleRate, nNbrChannel=1,
                    nNbrBitsPerSample=16)
        wavFile.addData(datas)
        bRetWrite = wavFile.write(strFilename)
        if not self.bIsOnRobot:
            if self.debug_fileAllSpeech == None:
                self.debug_fileAllSpeech = wav.Wav()
                self.debug_fileAllSpeech.new(nSamplingRate=self.nSampleRate,
                                             nNbrChannel=1,
                                             nNbrBitsPerSample=16)
            self.debug_fileAllSpeech.addData(datas)
            self.debug_fileAllSpeech.addData(
                np.zeros(self.nSampleRate / 2, self.datatype))
        return bRetWrite

    def _sendToSpeechReco(self, strFilename):
        """
        Send a file to the speech recognition engine.
        Return: the string of recognized text, + confidence,
                or None if nothing recognized
        """
        logging.info("_sendToSpeechReco: sending to speech reco " + strFilename
                     + " to detect it in " + self.strUseLang)
        retVal = None
        timeBegin = timeit.default_timer()
        retVal = self.fs.analyse_anonymous(
            strFilename, strUseLang=self.strUseLang) if self.bUseAnonymous \
            else self.fs.analyse(strFilename, strUseLang=self.strUseLang)
        rProcessDuration = timeit.default_timer() - timeBegin
        logging.debug(
            "SoundAnalyser._sendToSpeechReco: freeSpeech analysis processing takes: %5.2fs"
            % rProcessDuration)
        if (0):  # disabled
            self.rSkipBufferTime = rProcessDuration  # if we're here, it's already at zero
        if retVal != None:
            retVal = [retVal[0][0], retVal[0][1]]
            txtForRenameFile = retVal[0]
        else:
            txtForRenameFile = "Not_Recognized"
        if self.bKeepAudioFiles:
            newfilename = strFilename.replace(
                ".wav", "__%s.wav" % self.convertForFilename(txtForRenameFile))
            baseFilename = os.path.basename(strFilename)
            newBaseFilename = os.path.basename(newfilename)
            newfilename = self.storeDir + "/" + newBaseFilename
            shutil.move(strFilename, newfilename)
            logging.info("Saved wav file in " + newfilename)
        return retVal

    def convertForFilename(self, strTxt):
        """
        convert a text to be usable as a filename
        "toto is happy" => "toto_is_happy"
        """
        s = strTxt
        s = s.replace(" ", "_")
        s = s.replace("'", "_")
        s = s.replace("\"", "_")
        s = s.replace(",", "_")
        s = s.replace(":", "_")
        s = s.replace("/", "_")
        s = s.replace("\\", "_")
        s = s.replace("-", "_")
        return s
    # convertForFilename - end

    def _sendFileToRemoteSoundRecognition(self, strFilename):
        """
        ugly send to the sound recognition in background from a remote computer
        (just for ears project)
        """
        #~ os.system( "scp -q %s nao@%s:%s" % (strFilename, self.strNaoIP, strFilename) ) # assume folder exists at destination
        #~ if( self.mem == None ):
        #~     import naoqi
        #~     self.mem = naoqi.ALProxy( "ALMemory", self.strNaoIP, 9559 )
        #~ if( self.mem != None ):
        #~     self.mem.raiseMicroEvent( "SoundRecoAnalyseFilename", strFilename )
        pass

    def _sendFileToSoundRecognition(self, buffer):
        pass

    def setInputFile(self, strFilename):
        """
        analyse a file, return all voice segments
        """
        s = wav.Wav(strFilename)
        if not s.isOpen():
            raise OSError("File not found: " + strFilename)
        timeBegin = time.time()
        aMixedSoundData = s.data
        if s.nNbrChannel > 1:
            aMixedSoundData = aMixedSoundData[0::s.nNbrChannel] \
                + aMixedSoundData[1::s.nNbrChannel]
        self.nSampleRate = s.nSamplingRate
        nNbrOfSamplesByChannel = len(s.data) / s.nNbrChannel
        rSoundDataDuration = nNbrOfSamplesByChannel / float(self.nSampleRate)
        return self.analyseBuffer(aMixedSoundData, rSoundDataDuration)

    def setInputBuffer(self, aInterlacedSoundData, bVerbose=False):
        """
        This is THE method that receives all the sound buffers from the
        "ALAudioDevice" module, the sound card or a file
        - aSoundData: an interlaced chunk of wav of various length
        Return [bNoiseDetected, bSpeechDetected, aRecognizedNoise,
                aRecognizedSpeech, aRecognizedUser]
        - aRecognizedXxx: a pair [strText, rConfidence, rDuration]
            - strText: the recognized text, or name of the recognized sound
            - rConfidence: [0..1]
            - rDuration: duration of the recognized sound
          or [] if nothing recognized, or None if nothing analysed
        - aRecognizedUser: to be defined later
        """
        self.bVerbose = bVerbose
        nNbrOfSamplesByChannel = len(aInterlacedSoundData) / self.nNbrChannel
        rSoundDataDuration = nNbrOfSamplesByChannel / float(self.nSampleRate)
        logging.debug(
            "Receiving a buffer of len: %s (equivalent to %5.3fs) (shape:%s)"
            % (len(aInterlacedSoundData), rSoundDataDuration,
               str(aInterlacedSoundData.shape)))
        if time.time() > self.timeLastBufferReceived + 0.7:
            # our buffer is now too old (for instance the robot was speaking
            # and we were inhibited)
            logging.info(
                "Clearing buffer after gap of %5.2fs (after inhibition...)"
                % (time.time() - self.timeLastBufferReceived))
            self.bStoringSpeech = False
            self.aStoredDataSpeech = np.array([], dtype=self.datatype)  # reset
        self.timeLastBufferReceived = time.time()
        aSoundData = np.reshape(aInterlacedSoundData,
                                (self.nNbrChannel, nNbrOfSamplesByChannel), 'F')
        # sum of two mics
        aMixedSoundData = aSoundData[0] + aSoundData[1]
        self.vadSplitBuffer(aMixedSoundData, rSoundDataDuration)

    def vadSplitBuffer(self, aMixedSoundData, rSoundDataDuration):
        global recordedFilesMutex
        start = timeit.default_timer()
        computedMfcc = base.mfcc(aMixedSoundData, samplerate=self.nSampleRate,
                                 winstep=self.rMfccWindowStepInSec)
        stop = timeit.default_timer()
        logging.debug("Time for mfcc: " + str(stop - start))
        start = timeit.default_timer()
        aVadStateChange = self.vad.computeFromMfcc(computedMfcc,
                                                   self.rMfccWindowStepInSec)
        stop = timeit.default_timer()
        logging.debug("aVadStateChange: %s" % aVadStateChange)
        logging.debug("Time for vad " + str(stop - start))
        if len(aVadStateChange) > 0:
            nColor = 0x0000ff  # blue
            rTime = 0.5
        else:
            nColor = 0x808080  # grey
            rTime = 0.0
        self.visualFeedback(rTime, nColor)

        # analyse VAD results
        bStoringSpeechDone = False
        self.bSpeechDetected = False
        self.bSpeechAnalysed = False
        for i in range(len(aVadStateChange)):
            b, t = aVadStateChange[i]
            if b:
                if t < 0.:
                    # take data from preBuffer
                    nNbrSamples = int(-t * self.nSampleRate)
                    self.aStoredDataSpeech = self.aStoredSoundPreBuffer[-nNbrSamples:]
                    t = 0.
                # add data up to the next change:
                if i + 1 < len(aVadStateChange):
                    rDuration = aVadStateChange[i + 1][1] - t
                else:
                    rDuration = rSoundDataDuration - t
                nStart = int(t * self.nSampleRate)
                nDuration = int(rDuration * self.nSampleRate)
                self.aStoredDataSpeech = np.concatenate(
                    (self.aStoredDataSpeech,
                     aMixedSoundData[nStart:nStart + nDuration]))
                self.bStoringSpeech = True
                bStoringSpeechDone = True
            else:
                self.bStoringSpeech = False
                self.mem.raiseMicroEvent("Audio/SpeechDetected", True)
                strFilename = self.strDstPath + datetime.datetime.now(
                ).strftime("%Y_%m_%d-%Hh%Mm%Ss%fms") + "_speech.wav"
                logging.debug("Outputting speech to file: '%s'" % strFilename)
                rDuration = len(self.aStoredDataSpeech) / float(self.nSampleRate)
                start = timeit.default_timer()
                bRetWrite = self._writeBufferToFile(self.aStoredDataSpeech,
                                                    strFilename)
                stop = timeit.default_timer()
                logging.debug("Writing buffer to file: " + str(stop - start))
                if bRetWrite:
                    recordedFilesMutex.acquire()
                    self.createdFiles.append((strFilename, rDuration))
                    recordedFilesMutex.release()
                # reset the stored speech
                self.aStoredDataSpeech = np.array([], dtype=self.datatype)
                logging.debug("Length aStoredDataSpeech: '%i'"
                              % len(self.aStoredDataSpeech))

        if self.bStoringSpeech and not bStoringSpeechDone:
            self.aStoredDataSpeech = np.concatenate(
                (self.aStoredDataSpeech, aMixedSoundData))
            self.bSpeechDetected = True
            logging.debug("recording!")

        if self.bStoringSpeech:
            if len(self.aStoredDataSpeech) > self.nSampleRate * 8 or self.touched:
                # if more than 14 sec, keep only the last 10 sec
                # logging.warning( "SoundAnalyser.analyse: buffer too long, keeping only 10 last seconds..." )
                # self.aStoredDataSpeech = self.aStoredDataSpeech[self.nSampleRate*4:]  # removing by chunks of 4 sec
                if self.touched:
                    logging.warning(
                        "SoundAnalyser.analyse: Pepper touched, forcing analysis")
                    self.touched = False
                else:
                    logging.warning(
                        "SoundAnalyser.analyse: buffer too long, forcing analysis")
                # to be added -> send audio and start reco again
                self.bStoringSpeech = False
                self.mem.raiseMicroEvent("Audio/SpeechDetected", True)
                strFilename = self.strDstPath + datetime.datetime.now(
                ).strftime("%Y_%m_%d-%Hh%Mm%Ss%fms") + "_speech.wav"
                logging.debug("Outputting speech to file: '%s'" % strFilename)
                rDuration = len(self.aStoredDataSpeech) / float(self.nSampleRate)
                bRetWrite = self._writeBufferToFile(self.aStoredDataSpeech,
                                                    strFilename)
                logging.debug("Writing buffer to file: ")
                if bRetWrite:
                    recordedFilesMutex.acquire()
                    self.createdFiles.append((strFilename, rDuration))
                    recordedFilesMutex.release()
                self.aStoredDataSpeech = np.array([], dtype=self.datatype)
                nNbrSamples = int(0.3 * self.nSampleRate)
                self.aStoredDataSpeech = self.aStoredSoundPreBuffer[-nNbrSamples:]
                self.bStoringSpeech = True
                # rDuration = rSoundDataDuration-t
                # nStart = int(-0.3*self.nSampleRate)
                # nDuration = int(rDuration*self.nSampleRate)
                # self.aStoredDataSpeech = np.concatenate( (self.aStoredDataSpeech, aMixedSoundData[nStart:nStart+nDuration]) )

        # prebuffer store and offsetting
        self.aStoredSoundPreBuffer = np.concatenate(
            (self.aStoredSoundPreBuffer, aMixedSoundData))
        self.aStoredSoundPreBuffer = self.aStoredSoundPreBuffer[-self.nSizePreBuffer:]

    def asrOnFile(self):
        global recordedFilesMutex
        while self.runningThread == True:
            recordedFilesMutex.acquire()
            if len(self.createdFiles) > 0:
                inputFile, rDuration = self.createdFiles.pop(0)
            else:
                inputFile = ""
            recordedFilesMutex.release()
            if inputFile != "":
                ret = self._sendToSpeechReco(inputFile)
                if ret != None:
                    self.aRecognizedSpeech = [ret[0], ret[1], rDuration]
                else:
                    self.aRecognizedSpeech = None
                self.bSpeechDetected = False
                self.bSpeechAnalysed = True
                if self.aRecognizedSpeech != None:
                    nColor = 0x00ff00  # green
                    self.mem.raiseMicroEvent("Audio/RecognizedWords", [[
                        self.aRecognizedSpeech[0], self.aRecognizedSpeech[1]
                    ]])
                else:
                    nColor = 0xff0000  # red
                    self.mem.raiseMicroEvent("Audio/RecognizedWords", [])
                rTime = 1.0
                self.visualFeedback(rTime, nColor)
            time.sleep(0.1)

    def onTouch(self, msg, value):
        self.touched = True

    def visualFeedback(self, rTime, nColor):
        if self.bVisualFeedback == True:
            ledsMutex.acquire()
            if time.time() > self.rEndLedLockTime:
                logging.debug("Actually do the feedback")
                self.leds.setEyesOneLed(1, .0, nColor)
                self.leds.setChestColor(.0, nColor)
                self.rEndLedLockTime = time.time() + rTime
                logging.debug("End lock time: " + str(self.rEndLedLockTime))
            ledsMutex.release()

    def computeEnergyBestNumpy(self, aSample):
        """
        Compute sound energy on a mono channel sample;
        aSample contains signed int from -32000 to 32000 (in fact any signed value)
        """
        if (len(aSample) < 1):
            return 0
        diff = np.diff(aSample)
        diff = np.array(diff, dtype=np.int32)
        diff *= diff
        rEnergy = np.mean(diff)
        nEnergyFinal = int(math.sqrt(rEnergy))
        return nEnergyFinal
print('|{0:>15}|{1:<30.2f}|'.format('', phi))
print('+{0:->15}+{1:-<30}+'.format('-', '-'))

for wavPath in filePathes:
    t = time.time()
    head, fileName = os.path.split(wavPath)
    clean, sr = sf.read(wavPath)
    maxi = max(abs(clean))
    clean = [samp * (0.99 / maxi) for samp in clean]  # magnitude normalization.
    noiseExt = np.resize(noise, len(clean))
    vad = VAD(np.asarray(clean[19960521:19960521 + 360 * sr]), sr,
              threshold=0.95)
    ratio = list(vad).count(1) / len(vad)
    magWave = np.sum(np.square(clean)) / (ratio * len(clean))
    # print(ratio, len(clean[19960521:19960521+120*sr]), len(vad), list(vad).count(1))
    print('|{0:>15}|{1:<30}|'.format('file name', fileName))
    print('|{0:>15}|{1:<30.4f}|'.format('vad ratio', ratio))
    print('|{0:>15}|{1:<30}|'.format('conv', 'start'))
    fakeMic1 = np.convolve(conv1, clean)
    fakeMic2 = np.convolve(conv2, clean)
    maxi = max(abs(fakeMic1))
    fakeMic1 = [samp * (0.99 / maxi) for samp in fakeMic1]  # magnitude normalization.
    maxi = max(abs(fakeMic2))