def silenceRemoval(x, Fs, stWin, stStep, smoothWindow=0.5, Weight=0.5,
                   plot=False):
    '''
    Event Detection (silence removal)

    A binary SVM is trained on the signal's own lowest-energy and
    highest-energy short-term frames. Every frame is then scored with the
    SVM, the scores are smoothed, thresholded, and the surviving frames are
    grouped into segments.

    ARGUMENTS:
         - x:                the input audio signal
         - Fs:               sampling freq
         - stWin, stStep:    window size and step in seconds
         - smoothWindow:     (optional) smooth window (in seconds)
         - Weight:           (optional) weight factor (0 < Weight < 1)
                             the higher, the more strict
         - plot:             (optional) True if results are to be plotted
    RETURNS:
         - segmentLimits:    list of segment limits in seconds
                             (e.g [[0.1, 0.9], [1.4, 3.0]] means that the
                             resulting segments are (0.1 - 0.9) seconds and
                             (1.4, 3.0) seconds). Only segments longer than
                             0.2 seconds are returned.
    '''
    # Values outside the open interval (0, 1) are pulled back inside it.
    if Weight >= 1:
        Weight = 0.99
    if Weight <= 0:
        Weight = 0.01

    # Step 1: feature extraction
    x = audioBasicIO.stereo2mono(x)        # convert to mono
    # window and step are handed over in samples (seconds * Fs)
    ShortTermFeatures = aF.stFeatureExtraction(x, Fs, stWin * Fs,
                                               stStep * Fs)

    # Step 2: train binary SVM classifier of low vs high energy frames
    # keep only the energy short-term sequence (2nd feature)
    EnergySt = ShortTermFeatures[1, :]
    E = numpy.sort(EnergySt)
    # number of frames that make up 10% of the short-term windows
    L1 = int(len(E) / 10)
    # mean energy of the lowest 10% of the frames
    T1 = numpy.mean(E[0:L1])
    # mean energy of the highest 10% of the frames
    # NOTE: the slice [-L1:-1] leaves out the single largest value
    T2 = numpy.mean(E[-L1:-1])
    # feature vectors of the low-energy and of the high-energy frames
    Class1 = ShortTermFeatures[:, numpy.where(EnergySt < T1)[0]]
    Class2 = ShortTermFeatures[:, numpy.where(EnergySt > T2)[0]]
    # form the binary classification task, normalize and train the
    # respective SVM probabilistic model (ONSET vs SILENCE)
    featuresSS = [Class1.T, Class2.T]
    [featuresNormSS, MEANSS, STDSS] = aT.normalizeFeatures(featuresSS)
    SVM = aT.trainSVM(featuresNormSS, 1.0)

    # Step 3: compute onset probability based on the trained SVM.
    # Each frame is normalized with the training statistics and entry [1]
    # of the returned probabilities (the ONSET class) is kept.
    ProbOnset = numpy.array([
        SVM.pred_probability((ShortTermFeatures[:, i] - MEANSS) / STDSS)[1]
        for i in range(ShortTermFeatures.shape[1])])
    # smooth probability
    ProbOnset = smoothMovingAvg(ProbOnset, smoothWindow / stStep)

    # Step 4A: detect onset frame indices.
    # The threshold is a weighted average of the mean of the lowest 10% and
    # the mean of the highest 10% of the smoothed probabilities.
    ProbOnsetSorted = numpy.sort(ProbOnset)
    # Floor division keeps Nt an integer on Python 3 as well; a float here
    # cannot be used as a slice bound.
    Nt = ProbOnsetSorted.shape[0] // 10
    T = (numpy.mean((1 - Weight) * ProbOnsetSorted[0:Nt]) +
         Weight * numpy.mean(ProbOnsetSorted[-Nt::]))
    # indices of the frames that satisfy the thresholding
    MaxIdx = numpy.where(ProbOnset > T)[0]

    # Step 4B: group frame indices to onset segments.
    # Neighbouring detections that are at most 2 frames apart belong to the
    # same segment.
    segmentLimits = []
    if len(MaxIdx) > 0:
        segStart = segEnd = MaxIdx[0]
        for idx in MaxIdx[1:]:
            if idx - segEnd <= 2:
                segEnd = idx
            else:
                segmentLimits.append([segStart * stStep, segEnd * stStep])
                segStart = segEnd = idx
        segmentLimits.append([segStart * stStep, segEnd * stStep])

    # Step 5: Post process: remove very small segments
    minDuration = 0.2
    segmentLimits = [s for s in segmentLimits if s[1] - s[0] > minDuration]

    if plot:
        # Build the time axes from integer index ranges so that they always
        # have exactly as many points as the data they are plotted against.
        timeX = numpy.arange(x.shape[0]) / float(Fs)
        timeProb = numpy.arange(ProbOnset.shape[0]) * stStep

        plt.subplot(2, 1, 1)
        plt.plot(timeX, x)
        for s in segmentLimits:
            plt.axvline(x=s[0])
            plt.axvline(x=s[1])
        plt.title('Signal')

        plt.subplot(2, 1, 2)
        plt.plot(timeProb, ProbOnset)
        for s in segmentLimits:
            plt.axvline(x=s[0])
            plt.axvline(x=s[1])
        plt.title('SVM Probability')
        plt.show()

    return segmentLimits
def silenceRemoval(x, fs, st_win, st_step, smoothWindow=0.5, weight=0.5,
                   plot=False):
    '''
    Event Detection (silence removal)

    A binary svm is trained on the signal's own lowest-energy and
    highest-energy short-term frames. Every frame is then scored with the
    svm, the scores are smoothed, thresholded, and the surviving frames are
    grouped into segments.

    ARGUMENTS:
         - x:                the input audio signal
         - fs:               sampling freq
         - st_win, st_step:  window size and step in seconds
         - smoothWindow:     (optional) smooth window (in seconds)
         - weight:           (optional) weight factor (0 < weight < 1)
                             the higher, the more strict
         - plot:             (optional) True if results are to be plotted
    RETURNS:
         - seg_limits:       list of segment limits in seconds
                             (e.g [[0.1, 0.9], [1.4, 3.0]] means that the
                             resulting segments are (0.1 - 0.9) seconds and
                             (1.4, 3.0) seconds). Only segments longer than
                             0.2 seconds are returned.
    '''
    # Values outside the open interval (0, 1) are pulled back inside it.
    if weight >= 1:
        weight = 0.99
    if weight <= 0:
        weight = 0.01

    # Step 1: feature extraction
    x = audioBasicIO.stereo2mono(x)
    # window and step are handed over in samples (seconds * fs); only the
    # first element of the returned pair is used here
    st_feats, _ = aF.stFeatureExtraction(x, fs, st_win * fs, st_step * fs)

    # Step 2: train binary svm classifier of low vs high energy frames
    # keep only the energy short-term sequence (2nd feature)
    st_energy = st_feats[1, :]
    en = numpy.sort(st_energy)
    # number of frames that make up 10% of the short-term windows
    l1 = int(len(en) / 10)
    # A tiny offset is added to both thresholds before the comparisons.
    eps = 0.000000000000001
    # mean energy of the lowest 10% of the frames
    t1 = numpy.mean(en[0:l1]) + eps
    # mean energy of the highest 10% of the frames
    # NOTE: the slice [-l1:-1] leaves out the single largest value
    t2 = numpy.mean(en[-l1:-1]) + eps
    # feature vectors of the low-energy and of the high-energy frames
    class1 = st_feats[:, numpy.where(st_energy <= t1)[0]]
    class2 = st_feats[:, numpy.where(st_energy >= t2)[0]]
    # form the binary classification task, normalize and train the
    # respective svm probabilistic model (ONSET vs SILENCE)
    feats_s = [class1.T, class2.T]
    [feats_s_norm, means_s, stds_s] = aT.normalizeFeatures(feats_s)
    svm = aT.trainSVM(feats_s_norm, 1.0)

    # Step 3: compute onset probability based on the trained svm.
    # Each frame is normalized with the training statistics, passed as a
    # single-row matrix, and the second probability (ONSET class) is kept.
    prob_on_set = numpy.array([
        svm.predict_proba(
            ((st_feats[:, i] - means_s) / stds_s).reshape(1, -1))[0][1]
        for i in range(st_feats.shape[1])])
    # smooth probability
    prob_on_set = smoothMovingAvg(prob_on_set, smoothWindow / st_step)

    # Step 4A: detect onset frame indices.
    # The threshold is a weighted average of the mean of the lowest 10% and
    # the mean of the highest 10% of the smoothed probabilities.
    prob_on_set_sort = numpy.sort(prob_on_set)
    Nt = int(prob_on_set_sort.shape[0] / 10)
    T = (numpy.mean((1 - weight) * prob_on_set_sort[0:Nt]) +
         weight * numpy.mean(prob_on_set_sort[-Nt::]))
    # indices of the frames that satisfy the thresholding
    max_idx = numpy.where(prob_on_set > T)[0]

    # Step 4B: group frame indices to onset segments.
    # Neighbouring detections that are at most 2 frames apart belong to the
    # same segment.
    seg_limits = []
    if len(max_idx) > 0:
        seg_start = seg_end = max_idx[0]
        for idx in max_idx[1:]:
            if idx - seg_end <= 2:
                seg_end = idx
            else:
                seg_limits.append([seg_start * st_step, seg_end * st_step])
                seg_start = seg_end = idx
        seg_limits.append([seg_start * st_step, seg_end * st_step])

    # Step 5: Post process: remove very small segments
    min_dur = 0.2
    seg_limits = [s for s in seg_limits if s[1] - s[0] > min_dur]

    if plot:
        # Build the time axes from integer index ranges so that they always
        # have exactly as many points as the data they are plotted against.
        timeX = numpy.arange(x.shape[0]) / float(fs)
        time_prob = numpy.arange(prob_on_set.shape[0]) * st_step

        plt.subplot(2, 1, 1)
        plt.plot(timeX, x)
        for s in seg_limits:
            plt.axvline(x=s[0])
            plt.axvline(x=s[1])
        plt.title('Signal')

        plt.subplot(2, 1, 2)
        plt.plot(time_prob, prob_on_set)
        for s in seg_limits:
            plt.axvline(x=s[0])
            plt.axvline(x=s[1])
        plt.title('svm Probability')
        plt.show()

    return seg_limits