def extractEnergyFeatures(signal, sampleRate=16000, windowSize=30, windowStep=10):
	"""Extract energy features for each audio window.

	Builds an energy signal that is read through consecutive windows. For each
	window, 11 representative metrics are computed, as defined in the
	speechTools.features.computeLocalFeatures function. The result is a feature
	matrix where each time-ordered row refers to a window and each column
	represents a single metric.

	Args:
		signal (numpy.array): a mono PCM16 audio signal
		sampleRate (int): the audio signal sample rate (values other than 16000 may cause trouble)
		windowSize (float): the size of the aggregating window in seconds
		windowStep (float): the duration between 2 consecutive windows (overlapping is allowed)

	Returns:
		numpy.ndarray: the energy features matrix

	"""

	energies = st.collectEnergies(signal)
	# Zero out energy values where no voice activity was detected
	energies *= st.getVadMask(signal, sampleRate)
	energyReader = st.getSignalReader(energies, 100, windowSize, windowStep)
	# One row of 11 local metrics per window, in time order
	featureRows = [computeLocalFeatures(window) for window in energyReader]
	return np.array(featureRows)
def extractSnrFeatures(signal, sampleRate=16000, windowSize=30, windowStep=10):
	"""Extract SNR features for each audio window.

	Builds an SNR signal that is read through consecutive windows. For each
	window, 11 representative metrics are computed, as defined in the
	speechTools.features.computeLocalFeatures function. The result is a feature
	matrix where each time-ordered row refers to a window and each column
	represents a single metric. Here the SNR is a speech signal-to-noise ratio:
	it separates the main voice signal from background audio using the WADA
	algorithm definition.

	Args:
		signal (numpy.array): a mono PCM16 audio signal
		sampleRate (int): the audio signal sample rate (values other than 16000 may cause trouble)
		windowSize (float): the size of the aggregating window in seconds
		windowStep (float): the duration between 2 consecutive windows (overlapping is allowed)

	Returns:
		numpy.ndarray: the SNR features matrix

	"""

	snrs = st.getSnrs(signal, sampleRate)
	floorSnr = np.min(snrs)
	vadMask = st.getVadMask(signal, sampleRate)
	# Clamp the SNR to its floor value on mostly-silent stretches:
	# each 1s mask window (0.5s step) maps to one SNR value by position
	maskReader = st.getSignalReader(vadMask, 100, 1, 0.5)
	for position, maskWindow in enumerate(maskReader):
		if np.mean(maskWindow) <= 0.5:
			snrs[position] = floorSnr
	snrReader = st.getSignalReader(snrs, 2, windowSize, windowStep)
	# One row of 11 local metrics per window, in time order
	featureRows = [computeLocalFeatures(window, 2, floorSnr) for window in snrReader]
	return np.array(featureRows)
def extractPitchFeatures(signal, sampleRate=16000, pitchesFile=None, windowSize=30, windowStep=10):
	"""Extract pitch features for each audio window.

	Builds a pitch signal that is read through consecutive windows. For each
	window, 11 representative metrics are computed, as defined in the
	speechTools.features.computeLocalFeatures function. The result is a feature
	matrix where each time-ordered row refers to a window and each column
	represents a single metric. If the pitchesFile param is specified, the pitch
	signal is not computed from the given audio signal but loaded directly from
	a .f0 file.

	Args:
		signal (numpy.array): a mono PCM16 audio signal
		sampleRate (int): the audio signal sample rate (values other than 16000 may cause trouble)
		pitchesFile (str): the path to the precomputed pitch .f0 file
		windowSize (float): the size of the aggregating window in seconds
		windowStep (float): the duration between 2 consecutive windows (overlapping is allowed)

	Returns:
		numpy.ndarray: the pitch features matrix

	"""

	# If precomputed pitches are available load them, otherwise compute them
	if pitchesFile:
		with open(pitchesFile, "rb") as pitchData:
			pitches = pickle.load(pitchData)
	else:
		pitches = st.getPitches(signal, sampleRate)
	# Keep positive pitches only where voice activity is detected
	nonZeros = np.nonzero(pitches)[0]
	vadMask = st.getVadMask(signal, sampleRate)
	pitches, vadMask = st.equalizeShapes(pitches, vadMask)
	pitches[nonZeros] *= vadMask[nonZeros]
	# Pitch conversion to semitones for macro-melody quantification
	# and getting spectral linearity
	nonZeros = np.nonzero(pitches)[0]
	semitoneScale = st.getSemitoneScale()
	pitches[nonZeros] = st.quantifyValues(pitches[nonZeros], semitoneScale)
	semitones = np.zeros(pitches.size)
	semitones[nonZeros] = st.pitches2semitones(pitches[nonZeros], semitoneScale)
	# For each semitone window compute the statistical features
	pitchFeatures = []
	semitoneReader = st.getSignalReader(semitones, 100, windowSize, windowStep)
	for window in semitoneReader:
		localFeatures = computeLocalFeatures(window)
		pitchFeatures.append(localFeatures)
	# Bug fix: the array was previously assigned to a misspelled name
	# ("pitcheFeatures"), so the function returned a plain list instead of
	# the documented numpy.ndarray
	pitchFeatures = np.array(pitchFeatures)
	return pitchFeatures
def extractSpectralFeatures(signal, sampleRate, windowSize=30, windowStep=10):
	"""Extract spectral features for each audio window.

	Builds 2 spectral signals that are read through consecutive windows. For
	each window and each signal, 11 representative metrics are computed, as
	defined in the speechTools.features.computeLocalFeatures function. The
	result is 2 feature matrices where each time-ordered row refers to a window
	and each column represents a single metric. The 2 signals represent
	respectively: spectral centroid and spectral flatness.

	Args:
		signal (numpy.array): a mono PCM16 audio signal
		sampleRate (int): the audio signal sample rate (values other than 16000 may cause trouble)
		windowSize (float): the size of the aggregating window in seconds
		windowStep (float): the duration between 2 consecutive windows (overlapping is allowed)

	Returns:
		numpy.ndarray, numpy.ndarray: the spectral centroid features matrix, the spectral flatness features matrix

	"""

	floatSignal = np.float32(signal)
	vadMask = st.getVadMask(signal, sampleRate)
	# Spectral centroid track, masked by voice activity
	spectralCentroids = librosa.feature.spectral_centroid(floatSignal, sampleRate, n_fft=512, hop_length=160, center=False).flatten()
	spectralCentroids, vadMask = st.equalizeShapes(spectralCentroids, vadMask)
	spectralCentroids *= vadMask
	# Quantify non-zero centroids on a semitone scale, like a pitch track
	nonZeros = np.nonzero(spectralCentroids)[0]
	semitoneScale = st.getSemitoneScale()
	spectralCentroids[nonZeros] = st.quantifyValues(spectralCentroids[nonZeros], semitoneScale)
	semitones = np.zeros(spectralCentroids.size)
	semitones[nonZeros] = st.pitches2semitones(spectralCentroids[nonZeros], semitoneScale)
	spectralCentroidFeatures = np.array([
		computeLocalFeatures(window)
		for window in st.getSignalReader(semitones, 100, windowSize, windowStep)])
	# Spectral flatness track, masked by voice activity
	spectralFlatnesses = librosa.feature.spectral_flatness(floatSignal, n_fft=512, hop_length=160, center=False).flatten()
	spectralFlatnesses, vadMask = st.equalizeShapes(spectralFlatnesses, vadMask)
	spectralFlatnesses *= vadMask
	spectralFlatnessFeatures = np.array([
		computeLocalFeatures(window)
		for window in st.getSignalReader(spectralFlatnesses, 100, windowSize, windowStep)])
	return spectralCentroidFeatures, spectralFlatnessFeatures
def detectVocalActivity(signal, sampleRate, segments, aggressiveness=3, aloudError=0.25, reductionFactor=0.8):
	"""Find respectively speech, plosive and silence segments in a sequence of stable audio segments.

	Reads a sequence of stable audio segments and returns separately and
	respectively the segments corresponding to speech, plosives and silence.
	Each segment is a list of the form: [start time in seconds, end time in
	seconds]. The affectations use a WebRTC VAD object with a given
	aggressiveness. The stable segments are phonologically supra-segmental
	units, smaller than phonemes and not differentiated as those. They are
	provided by the speechTools.speech.getFBSegments function, using the
	forward-backward algorithm. The result is 3 lists representing the segments
	containing speech, those containing plosives (less than 150ms silences like
	in the sounds "P, T, C"), and long silence segments containing no speech.

	Args:
		signal (numpy.array): mono int16 audio signal
		sampleRate (int): audio signal sample rate (should be equal to 16000 to avoid trouble)
		segments (numpy.ndarray): the time-ordered stable audio segments
		aggressiveness (int): the aggressiveness of the Vad instance, one of [1, 2, 3] with 3 the most aggressive value
		aloudError (float): the allowed error between 0 and 1 for quantifying speech quantity in each segment
		reductionFactor (float): the symmetrical length reduction applied to each segment to avoid segment transition perturbations

	Returns:
		numpy.array, numpy.array, numpy.array: list of speech segments, list of plosive segments, list of silence segments

	"""

	speechSegments, plosiveSegments, silenceSegments = [], [], []
	vad = webrtcvad.Vad(aggressiveness)
	for segmentStart, segmentEnd in segments:
		# Map the segment boundaries to sample indices
		windowStart = int(segmentStart * sampleRate)
		windowEnd = int(segmentEnd * sampleRate)
		windowSize = windowEnd - windowStart
		# Symmetrically shrink the window to exclude the parts that could be
		# impacted by the neighbouring windows
		reducedSize = int(windowSize * reductionFactor)
		reducedStart = int(windowStart + (windowSize * (1 - reductionFactor) * 0.5))
		reducedWindow = signal[reducedStart:reducedStart + reducedSize]
		# VAD mask for the reduced window
		vadMask = st.getVadMask(reducedWindow, sampleRate, vad=vad, windowWidth=0.01, windowStep=0.005)
		# Speech quantity: proportion of 1s in the mask (empty mask counts as full speech)
		if vadMask.size > 0:
			speechQuantity = vadMask[vadMask == 1].size / vadMask.size
		else:
			speechQuantity = 1
		segment = [segmentStart, segmentEnd]
		# Enough speech -> speech segment; very short silence -> plosive; else silence
		if speechQuantity > (1 - aloudError):
			speechSegments.append(segment)
		elif (segmentEnd - segmentStart) < 0.15:
			plosiveSegments.append(segment)
		else:
			silenceSegments.append(segment)
	return np.array(speechSegments), np.array(plosiveSegments), np.array(silenceSegments)