def fileChromagramWrapper(wav_file): if not os.path.isfile(wav_file): raise Exception("Input audio file not found!") [fs, x] = audioBasicIO.readAudioFile(wav_file) x = audioBasicIO.stereo2mono(x) specgram, TimeAxis, FreqAxis = aF.stChromagram(x, fs, round(fs * 0.040), round(fs * 0.040), True)
def POST(self): x = web.input(myfile={}) filename = 'tmp/'+uuid.uuid4().hex+'.wav' file = open(filename, 'w+') file.seek(0) file.write(x['myfile'].value) file.close() [Fs, x] = audioBasicIO.readAudioFile(filename); #os.remove(filename) x = audioBasicIO.stereo2mono(x) [F, _] = audioFeatureExtraction.mtFeatureExtraction(x, Fs, round(Fs*1.0), round(Fs * 1.0), round(Fs * 0.050), round(Fs * 0.050)) F = F.transpose() for vec in F: results={} current_highest = "" current_highest_value = 0 vec = numpy.around(vec.astype(numpy.float), 6) current = model.getNN(vec) result = current[0][1].partition("_")[0] if result in results: results[result] = results[result]+1 else: results[result] = 1 if results[result] > current_highest_value: current_highest_value = results[result] current_highest = result print results print current_highest raise web.seeother('/')
def dirWavFeatureExtractionNoAveraging(dirName, mt_win, mt_step, st_win, st_step): """ This function extracts the mid-term features of the WAVE files of a particular folder without averaging each file. ARGUMENTS: - dirName: the path of the WAVE directory - mt_win, mt_step: mid-term window and step (in seconds) - st_win, st_step: short-term window and step (in seconds) RETURNS: - X: A feature matrix - Y: A matrix of file labels - filenames: """ all_mt_feats = numpy.array([]) signal_idx = numpy.array([]) process_times = [] types = ('*.wav', '*.aif', '*.aiff', '*.ogg') wav_file_list = [] for files in types: wav_file_list.extend(glob.glob(os.path.join(dirName, files))) wav_file_list = sorted(wav_file_list) for i, wavFile in enumerate(wav_file_list): [fs, x] = audioBasicIO.readAudioFile(wavFile) if isinstance(x, int): continue x = audioBasicIO.stereo2mono(x) [mt_term_feats, _, _] = mtFeatureExtraction(x, fs, round(mt_win * fs), round(mt_step * fs), round(fs * st_win), round(fs * st_step)) mt_term_feats = numpy.transpose(mt_term_feats) if len(all_mt_feats) == 0: # append feature vector all_mt_feats = mt_term_feats signal_idx = numpy.zeros((mt_term_feats.shape[0], )) else: all_mt_feats = numpy.vstack((all_mt_feats, mt_term_feats)) signal_idx = numpy.append(signal_idx, i * numpy.ones((mt_term_feats.shape[0], ))) return (all_mt_feats, signal_idx, wav_file_list)
def mtFeatureExtractionToFile(fileName, midTermSize, midTermStep, shortTermSize, shortTermStep, outPutFile, storeStFeatures=False, storeToCSV=False, PLOT=False): """ This function is used as a wrapper to: a) read the content of a WAV file b) perform mid-term feature extraction on that signal c) write the mid-term feature sequences to a numpy file """ [fs, x] = audioBasicIO.readAudioFile(fileName) x = audioBasicIO.stereo2mono(x) if storeStFeatures: [mtF, stF, _] = mtFeatureExtraction(x, fs, round(fs * midTermSize), round(fs * midTermStep), round(fs * shortTermSize), round(fs * shortTermStep)) else: [mtF, _, _] = mtFeatureExtraction(x, fs, round(fs*midTermSize), round(fs * midTermStep), round(fs * shortTermSize), round(fs * shortTermStep)) # save mt features to numpy file numpy.save(outPutFile, mtF) if PLOT: print("Mid-term numpy file: " + outPutFile + ".npy saved") if storeToCSV: numpy.savetxt(outPutFile+".csv", mtF.T, delimiter=",") if PLOT: print("Mid-term CSV file: " + outPutFile + ".csv saved") if storeStFeatures: # save st features to numpy file numpy.save(outPutFile+"_st", stF) if PLOT: print("Short-term numpy file: " + outPutFile + "_st.npy saved") if storeToCSV: # store st features to CSV file numpy.savetxt(outPutFile+"_st.csv", stF.T, delimiter=",") if PLOT: print("Short-term CSV file: " + outPutFile + "_st.csv saved")
from pyAudioAnalysis import audioFeatureExtraction from pyAudioAnalysis import audioTrainTest as aT from pyAudioAnalysis import audioSegmentation as aS import matplotlib.pyplot as plt root_data_path = "/Users/tyiannak/ResearchData/Audio Dataset/pyAudioAnalysisData/" print("\n\n\n * * * TEST 1 * * * \n\n\n") [Fs, x] = audioBasicIO.readAudioFile(root_data_path + "pyAudioAnalysis/data/count.wav"); F, f_names = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs, 0.025*Fs); plt.subplot(2,1,1); plt.plot(F[0,:]); plt.xlabel('Frame no'); plt.ylabel(f_names[0]); plt.subplot(2,1,2); plt.plot(F[1,:]); plt.xlabel('Frame no'); plt.ylabel(f_names[1]); plt.show() print("\n\n\n * * * TEST 2 * * * \n\n\n") [Fs, x] = audioBasicIO.readAudioFile(root_data_path + "pyAudioAnalysis/data/doremi.wav") x = audioBasicIO.stereo2mono(x) specgram, TimeAxis, FreqAxis = audioFeatureExtraction.stSpectogram(x, Fs, round(Fs * 0.040), round(Fs * 0.040), True) print("\n\n\n * * * TEST 3 * * * \n\n\n") [Fs, x] = audioBasicIO.readAudioFile(root_data_path + "pyAudioAnalysis/data/doremi.wav") x = audioBasicIO.stereo2mono(x) specgram, TimeAxis, FreqAxis = audioFeatureExtraction.stChromagram(x, Fs, round(Fs * 0.040), round(Fs * 0.040), True) print("\n\n\n * * * TEST 4 * * * \n\n\n") aT.featureAndTrain([root_data_path +"SM/speech",root_data_path + "SM/music"], 1.0, 1.0, 0.2, 0.2, "svm", "temp", True) print("\n\n\n * * * TEST 5 * * * \n\n\n") [flagsInd, classesAll, acc, CM] = aS.mtFileClassification(root_data_path + "pyAudioAnalysis/data//scottish.wav", root_data_path + "pyAudioAnalysis/data/svmSM", "svm", True, root_data_path + 'pyAudioAnalysis/data/scottish.segments') print("\n\n\n * * * TEST 6 * * * \n\n\n") aS.trainHMM_fromFile(root_data_path + 'radioFinal/train/bbc4A.wav', root_data_path + 'radioFinal/train/bbc4A.segments', 'hmmTemp1', 1.0, 1.0)
def fileClassification(inputFile, modelName, modelType): # Load classifier: if not os.path.isfile(modelName): print("fileClassification: input modelName not found!") return (-1, -1, -1) if not os.path.isfile(inputFile): print("fileClassification: wav file not found!") return (-1, -1, -1) if (modelType) == 'svm' or (modelType == 'svm_rbf'): [ Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT ] = loadSVModel(modelName) elif modelType == 'knn': [ Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT ] = loadKNNModel(modelName) elif modelType == 'randomforest': [ Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT ] = loadRandomForestModel(modelName) elif modelType == 'gradientboosting': [ Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT ] = loadGradientBoostingModel(modelName) elif modelType == 'extratrees': [ Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT ] = loadExtraTreesModel(modelName) # read audio file and convert to mono [Fs, x] = audioBasicIO.readAudioFile(inputFile) x = audioBasicIO.stereo2mono(x) if isinstance(x, int): # audio file IO problem return (-1, -1, -1) if x.shape[0] / float(Fs) <= mtWin: return (-1, -1, -1) # feature extraction: [MidTermFeatures, s] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stStep)) # long term averaging of mid-term statistics MidTermFeatures = MidTermFeatures.mean(axis=1) if computeBEAT: [beat, beatConf] = aF.beatExtraction(s, stStep) MidTermFeatures = numpy.append(MidTermFeatures, beat) MidTermFeatures = numpy.append(MidTermFeatures, beatConf) curFV = (MidTermFeatures - MEAN) / STD # normalization [Result, P] = classifierWrapper(Classifier, modelType, curFV) # classification return Result, P, classNames
def speakerDiarization(fileName, numOfSpeakers, mtSize=2.0, mtStep=0.2, stWin=0.05, LDAdim=35, PLOT=False): ''' ARGUMENTS: - fileName: the name of the WAV file to be analyzed - numOfSpeakers the number of speakers (clusters) in the recording (<=0 for unknown) - mtSize (opt) mid-term window size - mtStep (opt) mid-term window step - stWin (opt) short-term window size - LDAdim (opt) LDA dimension (0 for no LDA) - PLOT (opt) 0 for not plotting the results 1 for plottingy ''' [Fs, x] = audioBasicIO.readAudioFile(fileName) x = audioBasicIO.stereo2mono(x) Duration = len(x) / Fs [ Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1 ] = aT.load_model_knn( os.path.join(os.path.dirname(os.path.realpath(__file__)), "data", "knnSpeakerAll")) [ Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2 ] = aT.load_model_knn( os.path.join(os.path.dirname(os.path.realpath(__file__)), "data", "knnSpeakerFemaleMale")) [MidTermFeatures, ShortTermFeatures] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stWin * 0.5)) MidTermFeatures2 = numpy.zeros( (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2), MidTermFeatures.shape[1])) for i in range(MidTermFeatures.shape[1]): curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1 curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2 [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1) [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2) MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i] MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0] + len(classNames1), i] = P1 + 0.0001 MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1)::, i] = P2 + 0.0001 MidTermFeatures = MidTermFeatures2 # TODO # SELECT FEATURES: #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20]; # SET 0A #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100]; # SET 0B #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96, # 97,98, 99,100]; # SET 0C iFeaturesSelect = [ 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53 ] # SET 1A #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100]; # SET 1B #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]; # SET 1C #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53]; # SET 2A #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100]; # SET 2B #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]; # SET 2C #iFeaturesSelect = range(100); # SET 3 #MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010 MidTermFeatures = MidTermFeatures[iFeaturesSelect, :] (MidTermFeaturesNorm, MEAN, STD) = aT.normalizeFeatures([MidTermFeatures.T]) MidTermFeaturesNorm = MidTermFeaturesNorm[0].T numOfWindows = MidTermFeatures.shape[1] # remove outliers: DistancesAll = numpy.sum(distance.squareform( distance.pdist(MidTermFeaturesNorm.T)), axis=0) MDistancesAll = numpy.mean(DistancesAll) iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0] # TODO: Combine energy threshold for outlier removal: #EnergyMin = numpy.min(MidTermFeatures[1,:]) #EnergyMean = numpy.mean(MidTermFeatures[1,:]) #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0 #iNonOutLiers = numpy.nonzero(MidTermFeatures[1,:] > Thres)[0] #print iNonOutLiers perOutLier = (100.0 * (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows MidTermFeaturesNormOr = MidTermFeaturesNorm MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers] # LDA dimensionality reduction: if LDAdim > 0: #[mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, stWin * Fs, round(Fs*stWin), round(Fs*stWin)); # extract mid-term features with minimum step: mtWinRatio = int(round(mtSize / stWin)) mtStepRatio = int(round(stWin / stWin)) mtFeaturesToReduce = [] numOfFeatures = len(ShortTermFeatures) numOfStatistics = 2 #for i in range(numOfStatistics * numOfFeatures + 1): for i in range(numOfStatistics * numOfFeatures): mtFeaturesToReduce.append([]) for i in range(numOfFeatures): # for each of the short-term features: curPos = 0 N = len(ShortTermFeatures[i]) while (curPos < N): N1 = curPos N2 = curPos + mtWinRatio if N2 > N: N2 = N curStFeatures = ShortTermFeatures[i][N1:N2] mtFeaturesToReduce[i].append(numpy.mean(curStFeatures)) mtFeaturesToReduce[i + numOfFeatures].append( numpy.std(curStFeatures)) curPos += mtStepRatio mtFeaturesToReduce = numpy.array(mtFeaturesToReduce) mtFeaturesToReduce2 = numpy.zeros( (mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2), mtFeaturesToReduce.shape[1])) for i in range(mtFeaturesToReduce.shape[1]): curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1 curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2 [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1) [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2) mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i] mtFeaturesToReduce2[ mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0] + len(classNames1), i] = P1 + 0.0001 mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] + len(classNames1)::, i] = P2 + 0.0001 mtFeaturesToReduce = mtFeaturesToReduce2 mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :] #mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010 (mtFeaturesToReduce, MEAN, STD) = aT.normalizeFeatures([mtFeaturesToReduce.T]) mtFeaturesToReduce = mtFeaturesToReduce[0].T #DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0) #MDistancesAll = numpy.mean(DistancesAll) #iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0*MDistancesAll)[0] #mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2] Labels = numpy.zeros((mtFeaturesToReduce.shape[1], )) LDAstep = 1.0 LDAstepRatio = LDAstep / stWin #print LDAstep, LDAstepRatio for i in range(Labels.shape[0]): Labels[i] = int(i * stWin / LDAstepRatio) clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis( n_components=LDAdim) clf.fit(mtFeaturesToReduce.T, Labels) MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T if numOfSpeakers <= 0: sRange = range(2, 10) else: sRange = [numOfSpeakers] clsAll = [] silAll = [] centersAll = [] for iSpeakers in sRange: k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers) k_means.fit(MidTermFeaturesNorm.T) cls = k_means.labels_ means = k_means.cluster_centers_ # Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T)) clsAll.append(cls) centersAll.append(means) silA = [] silB = [] for c in range(iSpeakers ): # for each speaker (i.e. for each extracted cluster) clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float( len(cls)) if clusterPerCent < 0.020: silA.append(0.0) silB.append(0.0) else: MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c] # get subset of feature vectors Yt = distance.pdist( MidTermFeaturesNormTemp.T ) # compute average distance between samples that belong to the cluster (a values) silA.append(numpy.mean(Yt) * clusterPerCent) silBs = [] for c2 in range( iSpeakers ): # compute distances from samples of other clusters if c2 != c: clusterPerCent2 = numpy.nonzero( cls == c2)[0].shape[0] / float(len(cls)) MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:, cls == c2] Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T) silBs.append( numpy.mean(Yt) * (clusterPerCent + clusterPerCent2) / 2.0) silBs = numpy.array(silBs) silB.append( min(silBs) ) # ... and keep the minimum value (i.e. the distance from the "nearest" cluster) silA = numpy.array(silA) silB = numpy.array(silB) sil = [] for c in range(iSpeakers): # for each cluster (speaker) sil.append((silB[c] - silA[c]) / (max(silB[c], silA[c]) + 0.00001)) # compute silhouette silAll.append(numpy.mean(sil)) # keep the AVERAGE SILLOUETTE #silAll = silAll * (1.0/(numpy.power(numpy.array(sRange),0.5))) imax = numpy.argmax(silAll) # position of the maximum sillouette value nSpeakersFinal = sRange[imax] # optimal number of clusters # generate the final set of cluster labels # (important: need to retrieve the outlier windows: this is achieved by giving them the value of their nearest non-outlier window) cls = numpy.zeros((numOfWindows, )) for i in range(numOfWindows): j = numpy.argmin(numpy.abs(i - iNonOutLiers)) cls[i] = clsAll[imax][j] # Post-process method 1: hmm smoothing for i in range(1): startprob, transmat, means, cov = trainHMM_computeStatistics( MidTermFeaturesNormOr, cls) hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], "diag") # hmm training hmm.startprob_ = startprob hmm.transmat_ = transmat hmm.means_ = means hmm.covars_ = cov cls = hmm.predict(MidTermFeaturesNormOr.T) # Post-process method 2: median filtering: cls = scipy.signal.medfilt(cls, 13) cls = scipy.signal.medfilt(cls, 11) sil = silAll[imax] # final sillouette classNames = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)] # load ground-truth if available gtFile = fileName.replace('.wav', '.segments') # open for annotated file if os.path.isfile(gtFile): # if groundturh exists [segStart, segEnd, segLabels] = readSegmentGT(gtFile) # read GT data flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels, mtStep) # convert to flags if PLOT: fig = plt.figure() if numOfSpeakers > 0: ax1 = fig.add_subplot(111) else: ax1 = fig.add_subplot(211) ax1.set_yticks(numpy.array(range(len(classNames)))) ax1.axis((0, Duration, -1, len(classNames))) ax1.set_yticklabels(classNames) ax1.plot(numpy.array(range(len(cls))) * mtStep + mtStep / 2.0, cls) if os.path.isfile(gtFile): if PLOT: ax1.plot( numpy.array(range(len(flagsGT))) * mtStep + mtStep / 2.0, flagsGT, 'r') purityClusterMean, puritySpeakerMean = evaluateSpeakerDiarization( cls, flagsGT) print("{0:.1f}\t{1:.1f}".format(100 * purityClusterMean, 100 * puritySpeakerMean)) if PLOT: plt.title( "Cluster purity: {0:.1f}% - Speaker purity: {1:.1f}%".format( 100 * purityClusterMean, 100 * puritySpeakerMean)) if PLOT: plt.xlabel("time (seconds)") #print sRange, silAll if numOfSpeakers <= 0: plt.subplot(212) plt.plot(sRange, silAll) plt.xlabel("number of clusters") plt.ylabel("average clustering's sillouette") plt.show() return cls
return x[15:17] def getFourth(val): return val[3] os.chdir('C:/Users/konst_000/Desktop/Σχολή/6ο Εξάμηνο/ΨΕΣ/Speech Emotion Recognition/Audio Database/Complete') fileList = os.listdir('C:/Users/konst_000/Desktop/Σχολή/6ο Εξάμηνο/ΨΕΣ/Speech Emotion Recognition/Audio Database/Complete') featureList = [] #list of lists used to store the extracted features of each training sample labelListAct = [] #list of strings used to store the labels(emotions) for each training sample labelListVal = [] speakerList = [] #list of strings used to store the speaker identity for f in fileList: label = getEmotionLabel(f) [Fs, sample] = audioBasicIO.readAudioFile(f) sample = audioBasicIO.stereo2mono(sample) #feature extraction can be performed only on mono signals speaker = getSpeakerLabel(f) features = emoFeatExtract(sample, Fs, 0.050*Fs, 0.025*Fs) featureList.append(features) #Binary Activation Labels if (label == '01' or label == '02' or label == '04' or label == '07'): labelListAct.append('Low') else: labelListAct.append('High') if (label == '04' or label == '05' or label == '06' or label == '07'): labelListVal.append('Negative') else: labelListVal.append('Positive') speakerList.append(speaker) final = []
import matplotlib.pyplot as plt from pyAudioAnalysis import audioBasicIO import numpy import cPickle import audioTrainTest # [features, classNames, fileNames] = audioFeatureExtraction.dirsWavFeatureExtraction(['train/bumps', 'train/door', # 'train/steps', 'train/speech', # 'train/specificDoor', # 'train/background'], # 0.25, 0.25, 0.02, 0.02) [Fs1, x1] = audioBasicIO.readAudioFile("backgr5.wav") x1 = audioBasicIO.stereo2mono(x1) [Fs2, x2] = audioBasicIO.readAudioFile("boots10.wav") x2 = audioBasicIO.stereo2mono(x2) [Fs3, x3] = audioBasicIO.readAudioFile("door12.wav") x3 = audioBasicIO.stereo2mono(x3) [Fs4, x4] = audioBasicIO.readAudioFile("drop_key1.wav") x4 = audioBasicIO.stereo2mono(x4) [Fs5, x5] = audioBasicIO.readAudioFile("eng6.wav") x5 = audioBasicIO.stereo2mono(x5) [Fs6, x6] = audioBasicIO.readAudioFile("man_scream1.wav") x6 = audioBasicIO.stereo2mono(x6) [Fs7, x7] = audioBasicIO.readAudioFile("door19.wav") x7 = audioBasicIO.stereo2mono(x7) print Fs1 print x1 # print(Fs1*0.040) # print(len(x1)) # print(Fs1,Fs2,Fs3,Fs4,Fs5,Fs6,Fs7)
def evaluateSpeechMusic(fileName, modelName, method="svm", postProcess=0, postProcessModelName="", PLOT=False): # load grount truth file (matlab annotation) matFile = fileName.replace(".wav", "_true.mat") if os.path.isfile(matFile): matfile = loadmat(matFile) segs_gt = matfile["segs_r"] classes_gt1 = matfile["classes_r"] classes_gt = [] for c in classes_gt1[0]: if c == "M": classes_gt.append("music") if c == "S" or c == "E": classes_gt.append("speech") flagsIndGT, classesAllGT = audioSegmentation.segs2flags( [s[0] for s in segs_gt], [s[1] for s in segs_gt], classes_gt, 1.0) if method == "svm" or method == "randomforest" or method == "gradientboosting" or method == "extratrees": # speech-music segmentation: [flagsInd, classesAll, acc, CM] = audioSegmentation.mtFileClassification(fileName, modelName, method, False, '') elif method == "hmm": [flagsInd, classesAll, _, _] = audioSegmentation.hmmSegmentation(fileName, modelName, PLOT=False, gtFileName="") elif method == "cnn": WIDTH_SEC = 2.4 [Fs, x] = io.readAudioFile(fileName) x = io.stereo2mono(x) [flagsInd, classesAll, CNNprobs] = mtCNN_classification(x, Fs, WIDTH_SEC, 1.0, RGB_singleFrame_net, SOUND_mean_RGB, transformer_RGB, classNamesCNN) for i in range(flagsIndGT.shape[0]): flagsIndGT[i] = classesAll.index(classesAllGT[flagsIndGT[i]]) #plt.plot(flagsIndGT, 'r') #plt.plot(flagsInd) #plt.show() #print classesAllGT, classesAll if postProcess >= 1: # medfilt here! flagsInd = scipy.signal.medfilt(flagsInd, 11) if postProcess >= 2: #load HMM try: fo = open(postProcessModelName, "rb") except IOError: print "didn't find file" return try: hmm = cPickle.load(fo) classesAll = cPickle.load(fo) except: fo.close() #Features = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs, 0.050*Fs); # feature extraction #[Features, _] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * 0.050), round(Fs * 0.050)) flagsInd = hmm.predict(CNNprobs) flagsInd = scipy.signal.medfilt(flagsInd, 3) if PLOT: plt.plot(flagsInd + 0.01) plt.plot(flagsIndGT, 'r') plt.show() CM = np.zeros((2, 2)) for i in range(min(flagsInd.shape[0], flagsIndGT.shape[0])): CM[int(flagsIndGT[i]), int(flagsInd[i])] += 1 print CM return CM, classesAll
def musicThumbnailing(x, fs, short_term_size=1.0, short_term_step=0.5, thumb_size=10.0, limit_1=0, limit_2=1): ''' This function detects instances of the most representative part of a music recording, also called "music thumbnails". A technique similar to the one proposed in [1], however a wider set of audio features is used instead of chroma features. In particular the following steps are followed: - Extract short-term audio features. Typical short-term window size: 1 second - Compute the self-silimarity matrix, i.e. all pairwise similarities between feature vectors - Apply a diagonal mask is as a moving average filter on the values of the self-similarty matrix. The size of the mask is equal to the desirable thumbnail length. - Find the position of the maximum value of the new (filtered) self-similarity matrix. The audio segments that correspond to the diagonial around that position are the selected thumbnails ARGUMENTS: - x: input signal - fs: sampling frequency - short_term_size: window size (in seconds) - short_term_step: window step (in seconds) - thumb_size: desider thumbnail size (in seconds) RETURNS: - A1: beginning of 1st thumbnail (in seconds) - A2: ending of 1st thumbnail (in seconds) - B1: beginning of 2nd thumbnail (in seconds) - B2: ending of 2nd thumbnail (in seconds) USAGE EXAMPLE: import audioFeatureExtraction as aF [fs, x] = basicIO.readAudioFile(input_file) [A1, A2, B1, B2] = musicThumbnailing(x, fs) [1] Bartsch, M. A., & Wakefield, G. H. (2005). Audio thumbnailing of popular music using chroma-based representations. Multimedia, IEEE Transactions on, 7(1), 96-104. ''' x = audioBasicIO.stereo2mono(x) # feature extraction: st_feats, _ = aF.stFeatureExtraction(x, fs, fs * short_term_size, fs * short_term_step) # self-similarity matrix S = selfSimilarityMatrix(st_feats) # moving filter: M = int(round(thumb_size / short_term_step)) B = numpy.eye(M, M) S = scipy.signal.convolve2d(S, B, 'valid') # post-processing (remove main diagonal elements) min_sm = numpy.min(S) for i in range(S.shape[0]): for j in range(S.shape[1]): if abs(i - j) < 5.0 / short_term_step or i > j: S[i, j] = min_sm # find max position: S[0:int(limit_1 * S.shape[0]), :] = min_sm S[:, 0:int(limit_1 * S.shape[0])] = min_sm S[int(limit_2 * S.shape[0])::, :] = min_sm S[:, int(limit_2 * S.shape[0])::] = min_sm maxVal = numpy.max(S) [I, J] = numpy.unravel_index(S.argmax(), S.shape) #plt.imshow(S) #plt.show() # expand: i1 = I i2 = I j1 = J j2 = J while i2 - i1 < M: if i1 <= 0 or j1 <= 0 or i2 >= S.shape[0] - 2 or j2 >= S.shape[1] - 2: break if S[i1 - 1, j1 - 1] > S[i2 + 1, j2 + 1]: i1 -= 1 j1 -= 1 else: i2 += 1 j2 += 1 return short_term_step * i1, short_term_step * i2, \ short_term_step * j1, short_term_step * j2, S
def silenceRemoval(x, fs, st_win, st_step, minDuration=0.2, maxDuration=0, smoothWindow=1.0, weight=0.5, plot=False): ''' Event Detection (silence removal) ARGUMENTS: - x: the input audio signal - fs: sampling freq - st_win, st_step: window size and step in seconds - minDuration: (optional) minium duration in seconds of segments to keep - maxDuration: (optional) maximum duration in seconds of segments to keep (0 is no limit) - smoothWindow: (optional) smooth window (in seconds) - weight: (optional) weight factor (0 < weight < 1) the higher, the more strict - plot: (optional) True if results are to be plotted RETURNS: - seg_limits: list of segment limits in seconds (e.g [[0.1, 0.9], [1.4, 3.0]] means that the resulting segments are (0.1 - 0.9) seconds and (1.4, 3.0) seconds ''' if weight >= 1: weight = 0.99 if weight <= 0: weight = 0.01 # Step 1: feature extraction x = audioBasicIO.stereo2mono(x) st_feats, _ = aF.stFeatureExtraction(x, fs, st_win * fs, st_step * fs) # Step 2: train binary svm classifier of low vs high energy frames # keep only the energy short-term sequence (2nd feature) st_energy = st_feats[1, :] en = numpy.sort(st_energy) # number of 10% of the total short-term windows l1 = int(len(en) / 10) # compute "lower" 10% energy threshold t1 = numpy.mean(en[0:l1]) + 0.000000000000001 # compute "higher" 10% energy threshold t2 = numpy.mean(en[-l1:-1]) + 0.000000000000001 # get all features that correspond to low energy class1 = st_feats[:, numpy.where(st_energy <= t1)[0]] # get all features that correspond to high energy class2 = st_feats[:, numpy.where(st_energy >= t2)[0]] # form the binary classification task and ... faets_s = [class1.T, class2.T] # normalize and train the respective svm probabilistic model # (ONSET vs SILENCE) [faets_s_norm, means_s, stds_s] = aT.normalizeFeatures(faets_s) svm = aT.trainSVM(faets_s_norm, 1.0) # Step 3: compute onset probability based on the trained svm prob_on_set = [] for i in range(st_feats.shape[1]): # for each frame cur_fv = (st_feats[:, i] - means_s) / stds_s # get svm probability (that it belongs to the ONSET class) prob_on_set.append(svm.predict_proba(cur_fv.reshape(1, -1))[0][1]) prob_on_set = numpy.array(prob_on_set) # smooth probability: prob_on_set = smoothMovingAvg(prob_on_set, smoothWindow / st_step) # Step 4A: detect onset frame indices: prog_on_set_sort = numpy.sort(prob_on_set) # find probability Threshold as a weighted average # of top 10% and lower 10% of the values Nt = int(prog_on_set_sort.shape[0] / 10) T = (numpy.mean((1 - weight) * prog_on_set_sort[0:Nt]) + weight * numpy.mean(prog_on_set_sort[-Nt::])) max_idx = numpy.where(prob_on_set > T)[0] # get the indices of the frames that satisfy the thresholding i = 0 time_clusters = [] seg_limits = [] # Step 4B: group frame indices to onset segments while i < len(max_idx): # for each of the detected onset indices cur_cluster = [max_idx[i]] if i == len(max_idx) - 1: break while max_idx[i + 1] - cur_cluster[-1] <= 2: cur_cluster.append(max_idx[i + 1]) i += 1 if i == len(max_idx) - 1: break i += 1 time_clusters.append(cur_cluster) seg_limits.append( [cur_cluster[0] * st_step, cur_cluster[-1] * st_step]) # Step 5: Post process: remove segments that are outside the min and max segment bounds: seg_limits_2 = [] for s in seg_limits: if s[1] - s[0] > minDuration and ( (maxDuration == None or maxDuration == 0) or (s[1] - s[0] < maxDuration)): seg_limits_2.append(s) seg_limits = seg_limits_2 if plot: timeX = numpy.arange(0, x.shape[0] / float(fs), 1.0 / fs) plt.subplot(2, 1, 1) plt.plot(timeX, x) for s in seg_limits: plt.axvline(x=s[0]) plt.axvline(x=s[1]) plt.subplot(2, 1, 2) plt.plot( numpy.arange(0, prob_on_set.shape[0] * st_step, st_step)[:len(prob_on_set)], prob_on_set) plt.title('Signal') for s in seg_limits: plt.axvline(x=s[0]) plt.axvline(x=s[1]) plt.title('svm Probability') plt.show() return seg_limits
"""! @brief Example 29 @details: Music segmentation example @author Theodoros Giannakopoulos {[email protected]} """ import os, readchar, sklearn.cluster from pyAudioAnalysis.audioFeatureExtraction import mtFeatureExtraction as mT from pyAudioAnalysis.audioBasicIO import readAudioFile, stereo2mono from pyAudioAnalysis.audioSegmentation import flags2segs from pyAudioAnalysis.audioTrainTest import normalizeFeatures if __name__ == '__main__': # read signal and get normalized segment features: input_file = "../data/song1.mp3" fs, x = readAudioFile(input_file) x = stereo2mono(x) mt_size, mt_step, st_win = 5, 0.5, 0.05 [mt_feats, st_feats, _] = mT(x, fs, mt_size * fs, mt_step * fs, round(fs * st_win), round(fs * st_win * 0.5)) (mt_feats_norm, MEAN, STD) = normalizeFeatures([mt_feats.T]) mt_feats_norm = mt_feats_norm[0].T # perform clustering (k = 4) n_clusters = 4 k_means = sklearn.cluster.KMeans(n_clusters=n_clusters) k_means.fit(mt_feats_norm.T) cls = k_means.labels_ segs, c = flags2segs(cls, mt_step) # convert flags to segment limits for sp in range(n_clusters): # play each cluster's segment for i in range(len(c)): if c[i] == sp and segs[i, 1] - segs[i, 0] > 5: # play long segments of current cluster (only win_to_play seconds)
print('Converting...') sound = pydub.AudioSegment.from_mp3( os.path.join('C:\\', 'Users', 'akauf', 'Desktop', 'song.mp3')) sound.export(os.path.join('E:\\', 'Python_Projects', 'Audio_engine', 'temp.wav'), format="wav") print('Converted File to wav!') print('Horaay!') print('I am coolguy') #Loads audio into bits file print('Extracting Data...') [Fs, x] = audioBasicIO.readAudioFile( os.path.join('E:\\', 'Python_Projects', 'Audio_engine', 'temp.wav')) x = audioBasicIO.stereo2mono(x) # Collapses to mono signal F = audioFeatureExtraction.stFeatureExtraction( x, Fs, 0.050 * Fs, 0.025 * Fs) #Creates an array of features per frame print('Extracted!') N = [] #Array of features that we are going to use and modify harmonic_p = [] #Item for array N, power of higher frequencies percussive_p = [] #Item for array N, power of lower frequencies arc_p = [] #Item for array N, total track power spec_center_p = [] chaos_p = [] #PHASE THIS OUT IN FUTURE VERSIONS perc_colors = [0, 10879, 5461] #List of hue values for percparse to use
def silenceCounter(x, fs, st_win, st_step, smoothWindow=0.5, weight=0.5, plot=False): if weight >= 1: weight = 0.99 if weight <= 0: weight = 0.01 # Step 1: feature extraction x = audioBasicIO.stereo2mono(x) st_feats, _ = aF.stFeatureExtraction(x, fs, st_win * fs, st_step * fs) # Step 2: train binary svm classifier of low vs high energy frames # keep only the energy short-term sequence (2nd feature) st_energy = st_feats[1, :] en = numpy.sort(st_energy) # number of 10% of the total short-term windows l1 = int(len(en) / 10) # compute "lower" 10% energy threshold t1 = numpy.mean(en[0:l1]) + 0.000000000000001 # compute "higher" 10% energy threshold t2 = numpy.mean(en[-l1:-1]) + 0.000000000000001 # get all features that correspond to low energy class1 = st_feats[:, numpy.where(st_energy <= t1)[0]] # get all features that correspond to high energy class2 = st_feats[:, numpy.where(st_energy >= t2)[0]] # form the binary classification task and ... # change the order of the array # faets_s = [class1.T, class2.T] # changing order gives the segmens with silence faets_s = [class2.T, class1.T] # normalize and train the respective svm probabilistic model # (SILENCE vs ONSET) [faets_s_norm, means_s, stds_s] = aT.normalizeFeatures(faets_s) svm = aT.trainSVM(faets_s_norm, 1.0) # Step 3: compute onset probability based on the trained svm prob_on_set = [] for i in range(st_feats.shape[1]): # for each frame cur_fv = (st_feats[:, i] - means_s) / stds_s # get svm probability (that it belongs to the ONSET class) prob_on_set.append(svm.predict_proba(cur_fv.reshape(1, -1))[0][1]) prob_on_set = numpy.array(prob_on_set) # smooth probability: prob_on_set = smoothMovingAvg(prob_on_set, smoothWindow / st_step) # Step 4A: detect onset frame indices: prog_on_set_sort = numpy.sort(prob_on_set) # find probability Threshold as a weighted average # of top 10% and lower 10% of the values Nt = int(prog_on_set_sort.shape[0] / 10) T = (numpy.mean((1 - weight) * prog_on_set_sort[0:Nt]) + weight * numpy.mean(prog_on_set_sort[-Nt::])) max_idx = numpy.where(prob_on_set > T)[0] # get the indices of the frames that satisfy the thresholding i = 0 time_clusters = [] seg_limits = [] # Step 4B: group frame indices to onset segments while i < len(max_idx): # for each of the detected onset indices cur_cluster = [max_idx[i]] if i == len(max_idx) - 1: break while max_idx[i + 1] - cur_cluster[-1] <= 2: cur_cluster.append(max_idx[i + 1]) i += 1 if i == len(max_idx) - 1: break i += 1 time_clusters.append(cur_cluster) seg_limits.append( [cur_cluster[0] * st_step, cur_cluster[-1] * st_step]) # Step 5: Post process: remove very small segments: min_dur = 0.2 seg_limits_2 = [] for s in seg_limits: if s[1] - s[0] > min_dur: seg_limits_2.append(s) print(f"SEGMENTS 0.2: {seg_limits_2}") print(F"SEGMENTS: {seg_limits}")
def dirWavFeatureExtraction(dirName, mtWin, mtStep, stWin, stStep, computeBEAT=False): """ This function extracts the mid-term features of the WAVE files of a particular folder. The resulting feature vector is extracted by long-term averaging the mid-term features. Therefore ONE FEATURE VECTOR is extracted for each WAV file. ARGUMENTS: - dirName: the path of the WAVE directory - mtWin, mtStep: mid-term window and step (in seconds) - stWin, stStep: short-term window and step (in seconds) """ allMtFeatures = numpy.array([]) processingTimes = [] types = ('*.wav', '*.aif', '*.aiff', '*.mp3', '*.au') wavFilesList = [] for files in types: wavFilesList.extend(glob.glob(os.path.join(dirName, files))) wavFilesList = sorted(wavFilesList) wavFilesList2 = [] for i, wavFile in enumerate(wavFilesList): print("Analyzing file {0:d} of " "{1:d}: {2:s}".format(i + 1, len(wavFilesList), wavFile)) if os.stat(wavFile).st_size == 0: print(" (EMPTY FILE -- SKIPPING)") continue [Fs, x] = audioBasicIO.readAudioFile(wavFile) # read file if isinstance(x, int): continue t1 = time.clock() x = audioBasicIO.stereo2mono(x) # convert stereo to mono if x.shape[0] < float(Fs) / 5: print(" (AUDIO FILE TOO SMALL - SKIPPING)") continue wavFilesList2.append(wavFile) if computeBEAT: # mid-term feature extraction for current file [MidTermFeatures, stFeatures] = mtFeatureExtraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs), round(Fs * stWin), round(Fs * stStep)) [beat, beatConf] = beatExtraction(stFeatures, stStep) else: [MidTermFeatures, _] = mtFeatureExtraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs), round(Fs * stWin), round(Fs * stStep)) MidTermFeatures = numpy.transpose(MidTermFeatures) MidTermFeatures = MidTermFeatures.mean( axis=0) # long term averaging of mid-term statistics if (not numpy.isnan(MidTermFeatures).any()) and ( not numpy.isinf(MidTermFeatures).any()): if computeBEAT: MidTermFeatures = numpy.append(MidTermFeatures, beat) MidTermFeatures = numpy.append(MidTermFeatures, beatConf) if len(allMtFeatures) == 0: # append feature vector allMtFeatures = MidTermFeatures else: allMtFeatures = numpy.vstack((allMtFeatures, MidTermFeatures)) t2 = time.clock() duration = float(len(x)) / Fs processingTimes.append((t2 - t1) / duration) if len(processingTimes) > 0: print("Feature extraction complexity ratio: " "{0:.1f} x realtime".format( (1.0 / numpy.mean(numpy.array(processingTimes))))) return (allMtFeatures, wavFilesList2)
def speakerDiarization(fileName, sRange=xrange(2, 10), mtSize=2.0, mtStep=0.2, stWin=0.05, LDAdim=35): Fs, x = audioBasicIO.readAudioFile(fileName) x = audioBasicIO.stereo2mono(x) duration = len(x) / Fs Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1 = aT.loadKNNModel( os.path.join( '/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data', 'knnSpeakerAll')) Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2 = aT.loadKNNModel( os.path.join( '/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data', 'knnSpeakerFemaleMale')) MidTermFeatures, ShortTermFeatures = aF.mtFeatureExtraction( x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stWin * 0.5)) MidTermFeatures2 = numpy.zeros( (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2), MidTermFeatures.shape[1])) for i in range(MidTermFeatures.shape[1]): curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1 curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2 Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1) Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2) MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i] MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0] + len(classNames1), i] = P1 + 0.0001 MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1):, i] = P2 + 0.0001 MidTermFeatures = MidTermFeatures2 iFeaturesSelect = range(8, 21) + range(41, 54) MidTermFeatures = MidTermFeatures[iFeaturesSelect, :] MidTermFeaturesNorm, MEAN, STD = aT.normalizeFeatures([MidTermFeatures.T]) MidTermFeaturesNorm = MidTermFeaturesNorm[0].T numOfWindows = MidTermFeatures.shape[1] DistancesAll = numpy.sum(distance.squareform( distance.pdist(MidTermFeaturesNorm.T)), axis=0) MDistancesAll = numpy.mean(DistancesAll) iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0] perOutLier = (100.0 * (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows MidTermFeaturesNormOr = MidTermFeaturesNorm MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers] if LDAdim > 0: mtWinRatio, mtStepRatio, mtFeaturesToReduce, numOfFeatures, numOfStatistics = int( round(mtSize / stWin)), int(round( stWin / stWin)), list(), len(ShortTermFeatures), 2 for i in range(numOfStatistics * numOfFeatures): mtFeaturesToReduce.append(list()) for i in range(numOfFeatures): curPos = 0 N = len(ShortTermFeatures[i]) while (curPos < N): N1, N2 = curPos, curPos + mtWinRatio if N2 > N: N2 = N curStFeatures = ShortTermFeatures[i][N1:N2] mtFeaturesToReduce[i].append(numpy.mean(curStFeatures)) mtFeaturesToReduce[i + numOfFeatures].append( numpy.std(curStFeatures)) curPos += mtStepRatio mtFeaturesToReduce = numpy.array(mtFeaturesToReduce) mtFeaturesToReduce2 = numpy.zeros( (mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2), mtFeaturesToReduce.shape[1])) for i in range(mtFeaturesToReduce.shape[1]): curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1 curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2 Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1) Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2) mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i] mtFeaturesToReduce2[ mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0] + len(classNames1), i] = P1 + 0.0001 mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] + len(classNames1):, i] = P2 + 0.0001 mtFeaturesToReduce = mtFeaturesToReduce2 mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :] mtFeaturesToReduce, MEAN, STD = aT.normalizeFeatures( [mtFeaturesToReduce.T]) mtFeaturesToReduce = mtFeaturesToReduce[0].T Labels = numpy.zeros((mtFeaturesToReduce.shape[1], )) LDAstep = 1.0 LDAstepRatio = LDAstep / stWin for i in range(Labels.shape[0]): Labels[i] = int(i * stWin / LDAstepRatio) clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis( n_components=LDAdim) clf.fit(mtFeaturesToReduce.T, Labels) MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T clsAll, silAll, centersAll = list(), list(), list() for iSpeakers in sRange: k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers) k_means.fit(MidTermFeaturesNorm.T) cls = k_means.labels_ means = k_means.cluster_centers_ clsAll.append(cls) centersAll.append(means) silA, silB = list(), list() for c in range(iSpeakers): clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float( len(cls)) if clusterPerCent < 0.02: silA.append(0.0) silB.append(0.0) else: MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c] Yt = distance.pdist(MidTermFeaturesNormTemp.T) silA.append(numpy.mean(Yt) * clusterPerCent) silBs = list() for c2 in range(iSpeakers): if c2 != c: clusterPerCent2 = numpy.nonzero( cls == c2)[0].shape[0] / float(len(cls)) MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:, cls == c2] Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T) silBs.append( numpy.mean(Yt) * (clusterPerCent + clusterPerCent2) / 2.0) silBs = numpy.array(silBs) silB.append(min(silBs)) silA, silB, sil = numpy.array(silA), numpy.array(silB), list() for c in range(iSpeakers): sil.append((silB[c] - silA[c]) / (max(silB[c], silA[c]) + 0.00001)) silAll.append(numpy.mean(sil)) imax = numpy.argmax(silAll) nSpeakersFinal = sRange[imax] cls = numpy.zeros((numOfWindows, )) for i in range(numOfWindows): j = numpy.argmin(numpy.abs(i - iNonOutLiers)) cls[i] = clsAll[imax][j] startprob, transmat, means, cov = trainHMM(MidTermFeaturesNormOr, cls) hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], 'diag') hmm.startprob_ = startprob hmm.transmat_ = transmat hmm.means_ = means hmm.covars_ = cov cls = hmm.predict(MidTermFeaturesNormOr.T) cls = scipy.signal.medfilt(cls, 13) cls = scipy.signal.medfilt(cls, 11) sil = silAll[imax] classNames = ['SPEAKER{0:d}'.format(c) for c in range(nSpeakersFinal)] return cls, classNames, duration, mtStep, silAll
def main(argv): dirName = argv[1] types = ('*.wav', ) filesList = [] for files in types: filesList.extend(glob.glob(os.path.join(dirName, files))) filesList = sorted(filesList) WIDTH_SEC = 2.4 stWin = 0.020 stStep = 0.015 WIDTH = WIDTH_SEC / stStep for f in filesList: [Fs, x] = audioBasicIO.readAudioFile(f) print(Fs) x = audioBasicIO.stereo2mono(x) specgramOr, TimeAxis, FreqAxis = aF.stSpectogram( x, Fs, round(Fs * stWin), round(Fs * stStep), False) if specgramOr.shape[0] > WIDTH: specgram = specgramOr[int(specgramOr.shape[0] / 2) - WIDTH / 2:int(specgramOr.shape[0] / 2) + WIDTH / 2, :] specgram = scipy.misc.imresize(specgram, float(227.0) / float(specgram.shape[0]), interp='bilinear') print specgram.shape im = Image.fromarray(numpy.uint8( matplotlib.cm.jet(specgram) * 255)) #plt.imshow(im) scipy.misc.imsave(f.replace(".wav", ".jpg"), im) if int(specgramOr.shape[0] / 2) - WIDTH / 2 - int( (0.2) / stStep) > 0: specgram = specgramOr[ int(specgramOr.shape[0] / 2) - WIDTH / 2 - int((0.2) / stStep):int(specgramOr.shape[0] / 2) + WIDTH / 2 - int((0.2) / stStep), :] specgram = scipy.misc.imresize(specgram, float(227.0) / float(specgram.shape[0]), interp='bilinear') im = Image.fromarray( numpy.uint8(matplotlib.cm.jet(specgram) * 255)) print specgram.shape scipy.misc.imsave(f.replace(".wav", "_02A.jpg"), im) specgram = specgramOr[ int(specgramOr.shape[0] / 2) - WIDTH / 2 + int((0.2) / stStep):int(specgramOr.shape[0] / 2) + WIDTH / 2 + int((0.2) / stStep), :] specgram = scipy.misc.imresize(specgram, float(227.0) / float(specgram.shape[0]), interp='bilinear') print specgram.shape im = Image.fromarray( numpy.uint8(matplotlib.cm.jet(specgram) * 255)) scipy.misc.imsave(f.replace(".wav", "_02B.jpg"), im) # ONLY FOR SPEECH (fewer samples). Must comment for music """specgram = specgramOr[int(specgramOr.shape[0]/2) - WIDTH/2 - int((0.1) / stStep):int(specgramOr.shape[0]/2) + WIDTH/2 - int((0.1) / stStep), :]
def speakerDiarization(fileName, sRange = xrange(2, 10), mtSize = 2.0, mtStep = 0.2, stWin = 0.05, LDAdim = 35): Fs, x = audioBasicIO.readAudioFile(fileName) x = audioBasicIO.stereo2mono(x) duration = len(x) / Fs Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1 = aT.loadKNNModel(os.path.join('/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data', 'knnSpeakerAll')) Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2 = aT.loadKNNModel(os.path.join('/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data', 'knnSpeakerFemaleMale')) MidTermFeatures, ShortTermFeatures = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stWin * 0.5)) MidTermFeatures2 = numpy.zeros((MidTermFeatures.shape[0] + len(classNames1) + len(classNames2), MidTermFeatures.shape[1])) for i in range(MidTermFeatures.shape[1]): curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1 curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2 Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1) Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2) MidTermFeatures2[0: MidTermFeatures.shape[0], i] = MidTermFeatures[:, i] MidTermFeatures2[MidTermFeatures.shape[0]: MidTermFeatures.shape[0] + len(classNames1), i] = P1 + 0.0001 MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1):, i] = P2 + 0.0001 MidTermFeatures = MidTermFeatures2 iFeaturesSelect = range(8, 21) + range(41, 54) MidTermFeatures = MidTermFeatures[iFeaturesSelect, :] MidTermFeaturesNorm, MEAN, STD = aT.normalizeFeatures([MidTermFeatures.T]) MidTermFeaturesNorm = MidTermFeaturesNorm[0].T numOfWindows = MidTermFeatures.shape[1] DistancesAll = numpy.sum(distance.squareform(distance.pdist(MidTermFeaturesNorm.T)), axis = 0) MDistancesAll = numpy.mean(DistancesAll) iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0] perOutLier = (100.0 * (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows MidTermFeaturesNormOr = MidTermFeaturesNorm MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers] if LDAdim > 0: mtWinRatio, mtStepRatio, mtFeaturesToReduce, numOfFeatures, numOfStatistics = int(round(mtSize / stWin)), int(round(stWin / stWin)), list(), len(ShortTermFeatures), 2 for i in range(numOfStatistics * numOfFeatures): mtFeaturesToReduce.append(list()) for i in range(numOfFeatures): curPos = 0 N = len(ShortTermFeatures[i]) while (curPos < N): N1, N2 = curPos, curPos + mtWinRatio if N2 > N: N2 = N curStFeatures = ShortTermFeatures[i][N1: N2] mtFeaturesToReduce[i].append(numpy.mean(curStFeatures)) mtFeaturesToReduce[i + numOfFeatures].append(numpy.std(curStFeatures)) curPos += mtStepRatio mtFeaturesToReduce = numpy.array(mtFeaturesToReduce) mtFeaturesToReduce2 = numpy.zeros((mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2), mtFeaturesToReduce.shape[1])) for i in range(mtFeaturesToReduce.shape[1]): curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1 curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2 Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1) Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2) mtFeaturesToReduce2[0: mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i] mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]: mtFeaturesToReduce.shape[0] + len(classNames1), i] = P1 + 0.0001 mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] + len(classNames1):, i] = P2 + 0.0001 mtFeaturesToReduce = mtFeaturesToReduce2 mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :] mtFeaturesToReduce, MEAN, STD = aT.normalizeFeatures([mtFeaturesToReduce.T]) mtFeaturesToReduce = mtFeaturesToReduce[0].T Labels = numpy.zeros((mtFeaturesToReduce.shape[1], )) LDAstep = 1.0 LDAstepRatio = LDAstep / stWin for i in range(Labels.shape[0]): Labels[i] = int(i * stWin / LDAstepRatio) clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(n_components = LDAdim) clf.fit(mtFeaturesToReduce.T, Labels) MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T clsAll, silAll, centersAll = list(), list(), list() for iSpeakers in sRange: k_means = sklearn.cluster.KMeans(n_clusters = iSpeakers) k_means.fit(MidTermFeaturesNorm.T) cls = k_means.labels_ means = k_means.cluster_centers_ clsAll.append(cls) centersAll.append(means) silA, silB = list(), list() for c in range(iSpeakers): clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float(len(cls)) if clusterPerCent < 0.02: silA.append(0.0) silB.append(0.0) else: MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c] Yt = distance.pdist(MidTermFeaturesNormTemp.T) silA.append(numpy.mean(Yt) * clusterPerCent) silBs = list() for c2 in range(iSpeakers): if c2 != c: clusterPerCent2 = numpy.nonzero(cls == c2)[0].shape[0] / float(len(cls)) MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:, cls == c2] Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T) silBs.append(numpy.mean(Yt) * (clusterPerCent+clusterPerCent2) / 2.0) silBs = numpy.array(silBs) silB.append(min(silBs)) silA, silB, sil = numpy.array(silA), numpy.array(silB), list() for c in range(iSpeakers): sil.append((silB[c] - silA[c]) / (max(silB[c], silA[c]) + 0.00001)) silAll.append(numpy.mean(sil)) imax = numpy.argmax(silAll) nSpeakersFinal = sRange[imax] cls = numpy.zeros((numOfWindows, )) for i in range(numOfWindows): j = numpy.argmin(numpy.abs(i - iNonOutLiers)) cls[i] = clsAll[imax][j] startprob, transmat, means, cov = trainHMM(MidTermFeaturesNormOr, cls) hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], 'diag') hmm.startprob_ = startprob hmm.transmat_ = transmat hmm.means_ = means hmm.covars_ = cov cls = hmm.predict(MidTermFeaturesNormOr.T) cls = scipy.signal.medfilt(cls, 13) cls = scipy.signal.medfilt(cls, 11) sil = silAll[imax] classNames = ['SPEAKER{0:d}'.format(c) for c in range(nSpeakersFinal)] return cls, classNames, duration, mtStep, silAll
def dirWavFeatureExtraction(dirName, mt_win, mt_step, st_win, st_step, feats, compute_beat=False): """ This function extracts the mid-term features of the WAVE files of a particular folder. The resulting feature vector is extracted by long-term averaging the mid-term features. Therefore ONE FEATURE VECTOR is extracted for each WAV file. ARGUMENTS: - dirName: the path of the WAVE directory - mt_win, mt_step: mid-term window and step (in seconds) - st_win, st_step: short-term window and step (in seconds) """ all_mt_feats = numpy.array([]) process_times = [] types = ('*.wav', '*.aif', '*.aiff', '*.mp3', '*.au', '*.ogg') wav_file_list = [] for files in types: wav_file_list.extend(glob.glob(os.path.join(dirName, files))) wav_file_list = sorted(wav_file_list) wav_file_list2, mt_feature_names = [], [] for i, wavFile in enumerate(wav_file_list): print("Analyzing file {0:d} of " "{1:d}: {2:s}".format(i + 1, len(wav_file_list), wavFile)) if os.stat(wavFile).st_size == 0: print(" (EMPTY FILE -- SKIPPING)") continue [fs, x] = audioBasicIO.readAudioFile(wavFile) if isinstance(x, int): continue t1 = time.clock() x = audioBasicIO.stereo2mono(x) if x.shape[0] < float(fs) / 5: print(" (AUDIO FILE TOO SMALL - SKIPPING)") continue wav_file_list2.append(wavFile) if compute_beat: [mt_term_feats, st_features, mt_feature_names] = \ mtFeatureExtraction(x, fs, round(mt_win * fs), round(mt_step * fs), round(fs * st_win), round(fs * st_step), feats) [beat, beat_conf] = beatExtraction(st_features, st_step) else: [mt_term_feats, _, mt_feature_names] = \ mtFeatureExtraction(x, fs, round(mt_win * fs), round(mt_step * fs), round(fs * st_win), round(fs * st_step), feats) mt_term_feats = numpy.transpose(mt_term_feats) mt_term_feats = mt_term_feats.mean(axis=0) # long term averaging of mid-term statistics if (not numpy.isnan(mt_term_feats).any()) and \ (not numpy.isinf(mt_term_feats).any()): if compute_beat: mt_term_feats = numpy.append(mt_term_feats, beat) mt_term_feats = numpy.append(mt_term_feats, beat_conf) if len(all_mt_feats) == 0: # append feature vector all_mt_feats = mt_term_feats else: all_mt_feats = numpy.vstack((all_mt_feats, mt_term_feats)) t2 = time.clock() duration = float(len(x)) / fs process_times.append((t2 - t1) / duration) if len(process_times) > 0: print("Feature extraction complexity ratio: " "{0:.1f} x realtime".format( (1.0 / numpy.mean(numpy.array(process_times))))) return (all_mt_feats, wav_file_list2, mt_feature_names)
def mtFileClassification(inputFile, modelName, modelType, plotResults=False, gtFile=""): ''' This function performs mid-term classification of an audio stream. Towards this end, supervised knowledge is used, i.e. a pre-trained classifier. ARGUMENTS: - inputFile: path of the input WAV file - modelName: name of the classification model - modelType: svm or knn depending on the classifier type - plotResults: True if results are to be plotted using matplotlib along with a set of statistics RETURNS: - segs: a sequence of segment's endpoints: segs[i] is the endpoint of the i-th segment (in seconds) - classes: a sequence of class flags: class[i] is the class ID of the i-th segment ''' if not os.path.isfile(modelName): print("mtFileClassificationError: input modelType not found!") return (-1, -1, -1, -1) # Load classifier: if (modelType == 'svm') or (modelType == 'svm_rbf'): [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadSVModel(modelName) elif modelType == 'knn': [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadKNNModel(modelName) elif modelType == 'randomforest': [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadRandomForestModel(modelName) elif modelType == 'gradientboosting': [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadGradientBoostingModel(modelName) elif modelType == 'extratrees': [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadExtraTreesModel(modelName) if computeBEAT: print("Model " + modelName + " contains long-term music features (beat etc) and cannot be used in segmentation") return (-1, -1, -1, -1) [Fs, x] = audioBasicIO.readAudioFile(inputFile) # load input file if Fs == -1: # could not read file return (-1, -1, -1, -1) # convert stereo (if) to mono x = audioBasicIO.stereo2mono(x) Duration = len(x) / Fs # mid-term feature extraction: [MidTermFeatures, _] = aF.mtFeatureExtraction( x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stStep)) flags = [] Ps = [] flagsInd = [] # for each feature vector (i.e. for each fix-sized segment): for i in range(MidTermFeatures.shape[1]): # normalize current feature vector curFV = (MidTermFeatures[:, i] - MEAN) / STD [Result, P] = aT.classifierWrapper( Classifier, modelType, curFV) # classify vector flagsInd.append(Result) # update class label matrix flags.append(classNames[int(Result)]) # update probability matrix Ps.append(numpy.max(P)) flagsInd = numpy.array(flagsInd) # 1-window smoothing for i in range(1, len(flagsInd) - 1): if flagsInd[i - 1] == flagsInd[i + 1]: flagsInd[i] = flagsInd[i + 1] # convert fix-sized flags to segments and classes (segs, classes) = flags2segs(flags, mtStep) segs[-1] = len(x) / float(Fs) # Load grount-truth: if os.path.isfile(gtFile): [segStartGT, segEndGT, segLabelsGT] = readSegmentGT(gtFile) flagsGT, classNamesGT = segs2flags( segStartGT, segEndGT, segLabelsGT, mtStep) flagsIndGT = [] for j, fl in enumerate(flagsGT): # "align" labels with GT if classNamesGT[flagsGT[j]] in classNames: flagsIndGT.append(classNames.index(classNamesGT[flagsGT[j]])) else: flagsIndGT.append(-1) flagsIndGT = numpy.array(flagsIndGT) CM = numpy.zeros((len(classNamesGT), len(classNamesGT))) for i in range(min(flagsInd.shape[0], flagsIndGT.shape[0])): CM[int(flagsIndGT[i]), int(flagsInd[i])] += 1 else: CM = [] flagsIndGT = numpy.array([]) acc = plotSegmentationResults( flagsInd, flagsIndGT, classNames, mtStep, not plotResults) if acc >= 0: print("Overall Accuracy: {0:.3f}".format(acc)) return (flagsInd, classNamesGT, acc, CM) else: return (flagsInd, classNames, acc, CM)
feeling_list.append('male_surprised') elif (item[6:8]=='08' and int(item[18:20])%2==0) or (item[:3]=='sor' and item[4]=='f'): feeling_list.append('female_surprised') elif item[:1] == 'd' or (item[:3]=='dis' and item[4]=='m') or (item[6:8]=='07' and int(item[18:20])%2==1): feeling_list.append('male_disgust') elif (item[:3]=='dis' and item[4]=='f') or (item[6:8]=='07' and int(item[18:20])%2==0): feeling_list.append('female_disgust') stdout.write('\033[F') labels = np.array(feeling_list) np.save(EMOTION_LABEL_PICKLE, labels) bookmark=0 for file_name in fileList: print('[INFO] Extracting features - {} / {} - {}'.format(bookmark + 1, len(fileList), file_name)) [Fs, x] = audioBasicIO.readAudioFile(args['dataset'] + '/' + file_name) x = audioBasicIO.stereo2mono(x) features, feature_names = audioFeatureExtraction.stFeatureExtraction(x, Fs, FRAME_SIZE * Fs, FRAME_SIZE / 2 * Fs); # npArray = np.array([np.array(feature, copy=False) for feature in features], copy=False, ndmin=3) npArray = np.array(features, ndmin=3) if (bookmark == 0): # soundData = pd.DataFrame(data=npArray, columns=feature_names) soundData = npArray columns = np.array(feature_names) else: if soundData.shape[2] > npArray.shape[2]: npArray = np.pad(npArray, ( (0, 0), (0, 0), (0, soundData.shape[2] - npArray.shape[2]) ), 'constant') else: soundData = np.pad(soundData, ( (0, 0), (0, 0), (0, npArray.shape[2] - soundData.shape[2]) ), 'constant') soundData = np.concatenate((soundData, npArray)) bookmark=bookmark+1 stdout.write('\033[F')
import time #set chunk size in seconds to work with chunk_size = 3 start_time = time.time() Fs = wf.getframerate() CHUNK = Fs * chunk_size #read next chunk from audio file data = wf.readframes(CHUNK) #iterate over track while data != '': #stream.write(data) array = _wav2array(wf.getnchannels(), wf.getsampwidth(), data) array = audioBasicIO.stereo2mono(array) #extract features MidTermFeatures = aF.mtFeatureExtraction(array, Fs, mtWin * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stStep)) MidTermFeatures = MidTermFeatures[0] #classify chunks to speech/music flags = [] Ps = [] flagsInd = [] for i in range( MidTermFeatures[0].shape[0] ): # for each feature vector (i.e. for each fix-sized segment): curFV = (MidTermFeatures[:, i] - MEAN) / STD # normalize current feature vector
def mtFileClassification(input_file, model_name, model_type, plot_results=False, gt_file=""): ''' This function performs mid-term classification of an audio stream. Towards this end, supervised knowledge is used, i.e. a pre-trained classifier. ARGUMENTS: - input_file: path of the input WAV file - model_name: name of the classification model - model_type: svm or knn depending on the classifier type - plot_results: True if results are to be plotted using matplotlib along with a set of statistics RETURNS: - segs: a sequence of segment's endpoints: segs[i] is the endpoint of the i-th segment (in seconds) - classes: a sequence of class flags: class[i] is the class ID of the i-th segment ''' if not os.path.isfile(model_name): print("mtFileClassificationError: input model_type not found!") return (-1, -1, -1, -1) # Load classifier: if model_type == "knn": [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win, st_step, compute_beat] = \ aT.load_model_knn(model_name) else: [ classifier, MEAN, STD, class_names, mt_win, mt_step, st_win, st_step, compute_beat ] = aT.load_model(model_name) if compute_beat: print("Model " + model_name + " contains long-term music features " "(beat etc) and cannot be used in " "segmentation") return (-1, -1, -1, -1) [fs, x] = audioBasicIO.readAudioFile(input_file) # load input file if fs == -1: # could not read file return (-1, -1, -1, -1) x = audioBasicIO.stereo2mono(x) # convert stereo (if) to mono duration = len(x) / fs # mid-term feature extraction: [mt_feats, _, _] = aF.mtFeatureExtraction(x, fs, mt_win * fs, mt_step * fs, round(fs * st_win), round(fs * st_step)) flags = [] Ps = [] flags_ind = [] for i in range( mt_feats.shape[1] ): # for each feature vector (i.e. for each fix-sized segment): cur_fv = (mt_feats[:, i] - MEAN) / STD # normalize current feature vector [res, P] = aT.classifierWrapper(classifier, model_type, cur_fv) # classify vector flags_ind.append(res) flags.append(class_names[int(res)]) # update class label matrix Ps.append(numpy.max(P)) # update probability matrix flags_ind = numpy.array(flags_ind) # 1-window smoothing for i in range(1, len(flags_ind) - 1): if flags_ind[i - 1] == flags_ind[i + 1]: flags_ind[i] = flags_ind[i + 1] # convert fix-sized flags to segments and classes (segs, classes) = flags2segs(flags, mt_step) segs[-1] = len(x) / float(fs) # Load grount-truth: if os.path.isfile(gt_file): [seg_start_gt, seg_end_gt, seg_l_gt] = readSegmentGT(gt_file) flags_gt, class_names_gt = segs2flags(seg_start_gt, seg_end_gt, seg_l_gt, mt_step) flags_ind_gt = [] for j, fl in enumerate(flags_gt): # "align" labels with GT if class_names_gt[flags_gt[j]] in class_names: flags_ind_gt.append( class_names.index(class_names_gt[flags_gt[j]])) else: flags_ind_gt.append(-1) flags_ind_gt = numpy.array(flags_ind_gt) cm = numpy.zeros((len(class_names_gt), len(class_names_gt))) for i in range(min(flags_ind.shape[0], flags_ind_gt.shape[0])): cm[int(flags_ind_gt[i]), int(flags_ind[i])] += 1 else: cm = [] flags_ind_gt = numpy.array([]) acc = plotSegmentationResults(flags_ind, flags_ind_gt, class_names, mt_step, not plot_results) if acc >= 0: print("Overall Accuracy: {0:.3f}".format(acc)) return (flags_ind, class_names_gt, acc, cm) else: return (flags_ind, class_names, acc, cm)
def main(argv): if argv[2] == 'full': dirName = argv[1] types = ('*.wav', ) filesList = [] for files in types: filesList.extend(glob.glob(os.path.join(dirName, files))) filesList = sorted(filesList) filesListIrr = [] filesListIrr = sorted(filesListIrr) stWin = 0.020 stStep = 0.015 for f in filesList: [Fs, x] = audioBasicIO.readAudioFile(f) x = audioBasicIO.stereo2mono(x) createSpectrogramFile(x, Fs, f.replace(".wav", ".png"), stWin, stStep) else: dirName = argv[1] dirNameIrrelevant = argv[2] types = ('*.wav', ) filesList = [] for files in types: filesList.extend(glob.glob(os.path.join(dirName, files))) filesList = sorted(filesList) filesListIrr = [] for files in types: filesListIrr.extend( glob.glob(os.path.join(dirNameIrrelevant, files))) filesListIrr = sorted(filesListIrr) print filesListIrr WIDTH_SEC = 1.5 stWin = 0.040 stStep = 0.005 WIDTH = WIDTH_SEC / stStep for f in filesList: print f [Fs, x] = audioBasicIO.readAudioFile(f) x = audioBasicIO.stereo2mono(x) x = x.astype(float) / x.max() for i in range(3): if x.shape[0] > WIDTH_SEC * Fs + 200: randStartSignal = random.randrange( 0, int(x.shape[0] - WIDTH_SEC * Fs - 200)) x2 = x[randStartSignal:randStartSignal + int((WIDTH_SEC + stStep) * Fs)] createSpectrogramFile(x2, Fs, f.replace(".wav", ".png"), stWin, stStep) # ORIGINAL if len(dirNameIrrelevant) > 0: # AUGMENTED randIrrelevant = random.randrange(0, len(filesListIrr)) [Fs, xnoise] = audioBasicIO.readAudioFile( filesListIrr[randIrrelevant]) xnoise = xnoise.astype(float) / xnoise.max() randStartNoise = random.randrange( 0, xnoise.shape[0] - WIDTH_SEC * Fs - 200) R = 5 xN = (R * x2.astype(float) + xnoise[randStartNoise:randStartNoise + x2.shape[0]].astype(float)) / float(R + 1) wavfile.write( f.replace(".wav", "_rnoise{0:d}1.wav".format(i)), Fs, (16000 * xN).astype('int16')) createSpectrogramFile( xN, Fs, f.replace(".wav", "_rnoise{0:d}1.png".format(i)), stWin, stStep) randStartNoise = random.randrange( 0, xnoise.shape[0] - WIDTH_SEC * Fs - 200) R = 4 xN = (R * x2.astype(float) + xnoise[randStartNoise:randStartNoise + x2.shape[0]].astype(float)) / float(R + 1) wavfile.write( f.replace(".wav", "_rnoise{0:d}2.wav".format(i)), Fs, (16000 * xN).astype('int16')) createSpectrogramFile( xN, Fs, f.replace(".wav", "_rnoise{0:d}2.png".format(i)), stWin, stStep) randStartNoise = random.randrange( 0, xnoise.shape[0] - WIDTH_SEC * Fs - 200) R = 3 xN = (R * x2.astype(float) + xnoise[randStartNoise:randStartNoise + x2.shape[0]].astype(float)) / float(R + 1) wavfile.write( f.replace(".wav", "_rnoise{0:d}3.wav".format(i)), Fs, (16000 * xN).astype('int16')) createSpectrogramFile( xN, Fs, f.replace(".wav", "_rnoise{0:d}3.png".format(i)), stWin, stStep) #specgramOr, TimeAxis, FreqAxis = aF.stSpectogram(x2, Fs, round(Fs * stWin), round(Fs * stStep), False) #im2 = Image.fromarray(numpy.uint8(matplotlib.cm.jet(specgram)*255)) #plt.subplot(2,1,1) #plt.imshow(im1) #plt.subplot(2,1,2) #plt.imshow(im2) #plt.show() '''
def dirWavFeatureExtraction(dirName, mt_win, mt_step, st_win, st_step, compute_beat=False): """ This function extracts the mid-term features of the WAVE files of a particular folder. The resulting feature vector is extracted by long-term averaging the mid-term features. Therefore ONE FEATURE VECTOR is extracted for each WAV file. ARGUMENTS: - dirName: the path of the WAVE directory - mt_win, mt_step: mid-term window and step (in seconds) - st_win, st_step: short-term window and step (in seconds) """ all_mt_feats = numpy.array([]) process_times = [] types = ('*.wav', '*.aif', '*.aiff', '*.mp3', '*.au', '*.ogg') wav_file_list = [] for files in types: wav_file_list.extend(glob.glob(os.path.join(dirName, files))) wav_file_list = sorted(wav_file_list) wav_file_list2, mt_feature_names = [], [] for i, wavFile in enumerate(wav_file_list): print("Analyzing file {0:d} of " "{1:d}: {2:s}".format(i+1, len(wav_file_list), wavFile)) if os.stat(wavFile).st_size == 0: print(" (EMPTY FILE -- SKIPPING)") continue [fs, x] = audioBasicIO.readAudioFile(wavFile) if isinstance(x, int): continue t1 = time.clock() x = audioBasicIO.stereo2mono(x) if x.shape[0]<float(fs)/5: print(" (AUDIO FILE TOO SMALL - SKIPPING)") continue wav_file_list2.append(wavFile) if compute_beat: [mt_term_feats, st_features, mt_feature_names] = \ mtFeatureExtraction(x, fs, round(mt_win * fs), round(mt_step * fs), round(fs * st_win), round(fs * st_step)) [beat, beat_conf] = beatExtraction(st_features, st_step) else: [mt_term_feats, _, mt_feature_names] = \ mtFeatureExtraction(x, fs, round(mt_win * fs), round(mt_step * fs), round(fs * st_win), round(fs * st_step)) mt_term_feats = numpy.transpose(mt_term_feats) mt_term_feats = mt_term_feats.mean(axis=0) # long term averaging of mid-term statistics if (not numpy.isnan(mt_term_feats).any()) and \ (not numpy.isinf(mt_term_feats).any()): if compute_beat: mt_term_feats = numpy.append(mt_term_feats, beat) mt_term_feats = numpy.append(mt_term_feats, beat_conf) if len(all_mt_feats) == 0: # append feature vector all_mt_feats = mt_term_feats else: all_mt_feats = numpy.vstack((all_mt_feats, mt_term_feats)) t2 = time.clock() duration = float(len(x)) / fs process_times.append((t2 - t1) / duration) if len(process_times) > 0: print("Feature extraction complexity ratio: " "{0:.1f} x realtime".format((1.0 / numpy.mean(numpy.array(process_times))))) return (all_mt_feats, wav_file_list2, mt_feature_names)
def speakerDiarization(filename, n_speakers, mt_size=2.0, mt_step=0.2, st_win=0.05, lda_dim=35, plot_res=False): ''' ARGUMENTS: - filename: the name of the WAV file to be analyzed - n_speakers the number of speakers (clusters) in the recording (<=0 for unknown) - mt_size (opt) mid-term window size - mt_step (opt) mid-term window step - st_win (opt) short-term window size - lda_dim (opt) LDA dimension (0 for no LDA) - plot_res (opt) 0 for not plotting the results 1 for plottingy ''' [fs, x] = audioBasicIO.readAudioFile(filename) x = audioBasicIO.stereo2mono(x) duration = len(x) / fs [ classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1 ] = aT.load_model_knn( os.path.join(os.path.dirname(os.path.realpath(__file__)), "data", "knnSpeakerAll")) [ classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2 ] = aT.load_model_knn( os.path.join(os.path.dirname(os.path.realpath(__file__)), "data", "knnSpeakerFemaleMale")) [mt_feats, st_feats, _] = aF.mtFeatureExtraction(x, fs, mt_size * fs, mt_step * fs, round(fs * st_win), round(fs * st_win * 0.5)) MidTermFeatures2 = numpy.zeros( (mt_feats.shape[0] + len(classNames1) + len(classNames2), mt_feats.shape[1])) for i in range(mt_feats.shape[1]): cur_f1 = (mt_feats[:, i] - MEAN1) / STD1 cur_f2 = (mt_feats[:, i] - MEAN2) / STD2 [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1) [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2) MidTermFeatures2[0:mt_feats.shape[0], i] = mt_feats[:, i] MidTermFeatures2[mt_feats.shape[0]:mt_feats.shape[0] + len(classNames1), i] = P1 + 0.0001 MidTermFeatures2[mt_feats.shape[0] + len(classNames1)::, i] = P2 + 0.0001 mt_feats = MidTermFeatures2 # TODO iFeaturesSelect = [ 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53 ] mt_feats = mt_feats[iFeaturesSelect, :] (mt_feats_norm, MEAN, STD) = aT.normalizeFeatures([mt_feats.T]) mt_feats_norm = mt_feats_norm[0].T n_wins = mt_feats.shape[1] # remove outliers: dist_all = numpy.sum(distance.squareform(distance.pdist(mt_feats_norm.T)), axis=0) m_dist_all = numpy.mean(dist_all) i_non_outliers = numpy.nonzero(dist_all < 1.2 * m_dist_all)[0] # TODO: Combine energy threshold for outlier removal: #EnergyMin = numpy.min(mt_feats[1,:]) #EnergyMean = numpy.mean(mt_feats[1,:]) #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0 #i_non_outliers = numpy.nonzero(mt_feats[1,:] > Thres)[0] #print i_non_outliers perOutLier = (100.0 * (n_wins - i_non_outliers.shape[0])) / n_wins mt_feats_norm_or = mt_feats_norm mt_feats_norm = mt_feats_norm[:, i_non_outliers] # LDA dimensionality reduction: if lda_dim > 0: #[mt_feats_to_red, _, _] = aF.mtFeatureExtraction(x, fs, mt_size * fs, st_win * fs, round(fs*st_win), round(fs*st_win)); # extract mid-term features with minimum step: mt_win_ratio = int(round(mt_size / st_win)) mt_step_ratio = int(round(st_win / st_win)) mt_feats_to_red = [] num_of_features = len(st_feats) num_of_stats = 2 #for i in range(num_of_stats * num_of_features + 1): for i in range(num_of_stats * num_of_features): mt_feats_to_red.append([]) for i in range( num_of_features): # for each of the short-term features: curPos = 0 N = len(st_feats[i]) while (curPos < N): N1 = curPos N2 = curPos + mt_win_ratio if N2 > N: N2 = N curStFeatures = st_feats[i][N1:N2] mt_feats_to_red[i].append(numpy.mean(curStFeatures)) mt_feats_to_red[i + num_of_features].append( numpy.std(curStFeatures)) curPos += mt_step_ratio mt_feats_to_red = numpy.array(mt_feats_to_red) mt_feats_to_red_2 = numpy.zeros( (mt_feats_to_red.shape[0] + len(classNames1) + len(classNames2), mt_feats_to_red.shape[1])) for i in range(mt_feats_to_red.shape[1]): cur_f1 = (mt_feats_to_red[:, i] - MEAN1) / STD1 cur_f2 = (mt_feats_to_red[:, i] - MEAN2) / STD2 [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1) [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2) mt_feats_to_red_2[0:mt_feats_to_red.shape[0], i] = mt_feats_to_red[:, i] mt_feats_to_red_2[ mt_feats_to_red.shape[0]:mt_feats_to_red.shape[0] + len(classNames1), i] = P1 + 0.0001 mt_feats_to_red_2[mt_feats_to_red.shape[0] + len(classNames1)::, i] = P2 + 0.0001 mt_feats_to_red = mt_feats_to_red_2 mt_feats_to_red = mt_feats_to_red[iFeaturesSelect, :] #mt_feats_to_red += numpy.random.rand(mt_feats_to_red.shape[0], mt_feats_to_red.shape[1]) * 0.0000010 (mt_feats_to_red, MEAN, STD) = aT.normalizeFeatures([mt_feats_to_red.T]) mt_feats_to_red = mt_feats_to_red[0].T #dist_all = numpy.sum(distance.squareform(distance.pdist(mt_feats_to_red.T)), axis=0) #m_dist_all = numpy.mean(dist_all) #iNonOutLiers2 = numpy.nonzero(dist_all < 3.0*m_dist_all)[0] #mt_feats_to_red = mt_feats_to_red[:, iNonOutLiers2] Labels = numpy.zeros((mt_feats_to_red.shape[1], )) LDAstep = 1.0 LDAstepRatio = LDAstep / st_win #print LDAstep, LDAstepRatio for i in range(Labels.shape[0]): Labels[i] = int(i * st_win / LDAstepRatio) clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis( n_components=lda_dim) clf.fit(mt_feats_to_red.T, Labels) mt_feats_norm = (clf.transform(mt_feats_norm.T)).T if n_speakers <= 0: s_range = range(2, 10) else: s_range = [n_speakers] clsAll = [] sil_all = [] centersAll = [] for iSpeakers in s_range: k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers) k_means.fit(mt_feats_norm.T) cls = k_means.labels_ means = k_means.cluster_centers_ # Y = distance.squareform(distance.pdist(mt_feats_norm.T)) clsAll.append(cls) centersAll.append(means) sil_1 = [] sil_2 = [] for c in range(iSpeakers): # for each speaker (i.e. for each extracted cluster) clust_per_cent = numpy.nonzero(cls == c)[0].shape[0] / \ float(len(cls)) if clust_per_cent < 0.020: sil_1.append(0.0) sil_2.append(0.0) else: # get subset of feature vectors mt_feats_norm_temp = mt_feats_norm[:, cls == c] # compute average distance between samples # that belong to the cluster (a values) Yt = distance.pdist(mt_feats_norm_temp.T) sil_1.append(numpy.mean(Yt) * clust_per_cent) silBs = [] for c2 in range(iSpeakers): # compute distances from samples of other clusters if c2 != c: clust_per_cent_2 = numpy.nonzero(cls == c2)[0].shape[0] /\ float(len(cls)) MidTermFeaturesNormTemp2 = mt_feats_norm[:, cls == c2] Yt = distance.cdist(mt_feats_norm_temp.T, MidTermFeaturesNormTemp2.T) silBs.append( numpy.mean(Yt) * (clust_per_cent + clust_per_cent_2) / 2.0) silBs = numpy.array(silBs) # ... and keep the minimum value (i.e. # the distance from the "nearest" cluster) sil_2.append(min(silBs)) sil_1 = numpy.array(sil_1) sil_2 = numpy.array(sil_2) sil = [] for c in range(iSpeakers): # for each cluster (speaker) compute silhouette sil.append( (sil_2[c] - sil_1[c]) / (max(sil_2[c], sil_1[c]) + 0.00001)) # keep the AVERAGE SILLOUETTE sil_all.append(numpy.mean(sil)) imax = numpy.argmax(sil_all) # optimal number of clusters nSpeakersFinal = s_range[imax] # generate the final set of cluster labels # (important: need to retrieve the outlier windows: # this is achieved by giving them the value of their # nearest non-outlier window) cls = numpy.zeros((n_wins, )) for i in range(n_wins): j = numpy.argmin(numpy.abs(i - i_non_outliers)) cls[i] = clsAll[imax][j] # Post-process method 1: hmm smoothing for i in range(1): # hmm training start_prob, transmat, means, cov = \ trainHMM_computeStatistics(mt_feats_norm_or, cls) hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag") hmm.startprob_ = start_prob hmm.transmat_ = transmat hmm.means_ = means hmm.covars_ = cov cls = hmm.predict(mt_feats_norm_or.T) # Post-process method 2: median filtering: cls = scipy.signal.medfilt(cls, 13) cls = scipy.signal.medfilt(cls, 11) sil = sil_all[imax] class_names = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)] # load ground-truth if available gt_file = filename.replace('.wav', '.segments') # if groundturh exists if os.path.isfile(gt_file): [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file) flags_gt, class_names_gt = segs2flags(seg_start, seg_end, seg_labs, mt_step) if plot_res: fig = plt.figure() if n_speakers > 0: ax1 = fig.add_subplot(111) else: ax1 = fig.add_subplot(211) ax1.set_yticks(numpy.array(range(len(class_names)))) ax1.axis((0, duration, -1, len(class_names))) ax1.set_yticklabels(class_names) ax1.plot(numpy.array(range(len(cls))) * mt_step + mt_step / 2.0, cls) if os.path.isfile(gt_file): if plot_res: ax1.plot( numpy.array(range(len(flags_gt))) * mt_step + mt_step / 2.0, flags_gt, 'r') purity_cluster_m, purity_speaker_m = \ evaluateSpeakerDiarization(cls, flags_gt) print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m, 100 * purity_speaker_m)) if plot_res: plt.title("Cluster purity: {0:.1f}% - " "Speaker purity: {1:.1f}%".format( 100 * purity_cluster_m, 100 * purity_speaker_m)) if plot_res: plt.xlabel("time (seconds)") #print s_range, sil_all if n_speakers <= 0: plt.subplot(212) plt.plot(s_range, sil_all) plt.xlabel("number of clusters") plt.ylabel("average clustering's sillouette") plt.show() return cls
def silenceRemoval(x, Fs, stWin, stStep, smoothWindow=0.5, Weight=0.5, plot=False): ''' Event Detection (silence removal) ARGUMENTS: - x: the input audio signal - Fs: sampling freq - stWin, stStep: window size and step in seconds - smoothWindow: (optinal) smooth window (in seconds) - Weight: (optinal) weight factor (0 < Weight < 1) the higher, the more strict - plot: (optinal) True if results are to be plotted RETURNS: - segmentLimits: list of segment limits in seconds (e.g [[0.1, 0.9], [1.4, 3.0]] means that the resulting segments are (0.1 - 0.9) seconds and (1.4, 3.0) seconds ''' if Weight >= 1: Weight = 0.99 if Weight <= 0: Weight = 0.01 # Step 1: feature extraction x = audioBasicIO.stereo2mono(x) # convert to mono ShortTermFeatures = aF.stFeatureExtraction( x, Fs, stWin * Fs, stStep * Fs) # extract short-term features # Step 2: train binary SVM classifier of low vs high energy frames EnergySt = ShortTermFeatures[ 1, :] # keep only the energy short-term sequence (2nd feature) E = numpy.sort(EnergySt) # sort the energy feature values: L1 = int(len(E) / 10) # number of 10% of the total short-term windows T1 = numpy.mean( E[0:L1]) + 0.000000000000001 # compute "lower" 10% energy threshold T2 = numpy.mean( E[-L1:-1]) + 0.000000000000001 # compute "higher" 10% energy threshold Class1 = ShortTermFeatures[:, numpy.where( EnergySt <= T1)[0]] # get all features that correspond to low energy Class2 = ShortTermFeatures[:, numpy.where( EnergySt >= T2)[0]] # get all features that correspond to high energy featuresSS = [Class1.T, Class2.T] # form the binary classification task and ... [featuresNormSS, MEANSS, STDSS] = aT.normalizeFeatures(featuresSS) # normalize and ... SVM = aT.trainSVM( featuresNormSS, 1.0) # train the respective SVM probabilistic model (ONSET vs SILENCE) # Step 3: compute onset probability based on the trained SVM ProbOnset = [] for i in range(ShortTermFeatures.shape[1]): # for each frame curFV = (ShortTermFeatures[:, i] - MEANSS) / STDSS # normalize feature vector ProbOnset.append( SVM.predict_proba(curFV.reshape(1, -1))[0] [1]) # get SVM probability (that it belongs to the ONSET class) ProbOnset = numpy.array(ProbOnset) ProbOnset = smoothMovingAvg(ProbOnset, smoothWindow / stStep) # smooth probability # Step 4A: detect onset frame indices: ProbOnsetSorted = numpy.sort( ProbOnset ) # find probability Threshold as a weighted average of top 10% and lower 10% of the values Nt = int(ProbOnsetSorted.shape[0] / 10) T = (numpy.mean((1 - Weight) * ProbOnsetSorted[0:Nt]) + Weight * numpy.mean(ProbOnsetSorted[-Nt::])) MaxIdx = numpy.where(ProbOnset > T)[ 0] # get the indices of the frames that satisfy the thresholding i = 0 timeClusters = [] segmentLimits = [] # Step 4B: group frame indices to onset segments while i < len(MaxIdx): # for each of the detected onset indices curCluster = [MaxIdx[i]] if i == len(MaxIdx) - 1: break while MaxIdx[i + 1] - curCluster[-1] <= 2: curCluster.append(MaxIdx[i + 1]) i += 1 if i == len(MaxIdx) - 1: break i += 1 timeClusters.append(curCluster) segmentLimits.append([curCluster[0] * stStep, curCluster[-1] * stStep]) # Step 5: Post process: remove very small segments: minDuration = 0.2 segmentLimits2 = [] for s in segmentLimits: if s[1] - s[0] > minDuration: segmentLimits2.append(s) segmentLimits = segmentLimits2 if plot: timeX = numpy.arange(0, x.shape[0] / float(Fs), 1.0 / Fs) plt.subplot(2, 1, 1) plt.plot(timeX, x) for s in segmentLimits: plt.axvline(x=s[0]) plt.axvline(x=s[1]) plt.subplot(2, 1, 2) plt.plot(numpy.arange(0, ProbOnset.shape[0] * stStep, stStep), ProbOnset) plt.title('Signal') for s in segmentLimits: plt.axvline(x=s[0]) plt.axvline(x=s[1]) plt.title('SVM Probability') plt.show() return segmentLimits
def trainMetaClassifier(dirName, outputmodelName, modelName, method="svm", postProcess=0, PLOT=False): types = ('*.wav', ) wavFilesList = [] for files in types: wavFilesList.extend(glob.glob(os.path.join(dirName, files))) wavFilesList = sorted(wavFilesList) flagsAll = np.array([]) for ifile, wavFile in enumerate( wavFilesList): # for each wav file in folder print "{0:s}, {1:d} file of {2:d}".format(wavFile, ifile + 1, len(wavFilesList)) matFile = wavFile.replace(".wav", "_true.mat") # load current ground truth if os.path.isfile(matFile): matfile = loadmat(matFile) segs_gt = matfile["segs_r"] classes_gt1 = matfile["classes_r"] classes_gt = [] for c in classes_gt1[0]: if c == "M": classes_gt.append("music") if c == "S" or c == "E": classes_gt.append("speech") flagsIndGT, classesAllGT = audioSegmentation.segs2flags( [s[0] for s in segs_gt], [s[1] for s in segs_gt], classes_gt, 1.0) #if method == "svm": # speech-music segmentation: # [flagsInd, classesAll, acc] = audioSegmentation.mtFileClassification(fileName, modelName, "svm", False, '') if method == "cnn": # apply the CNN on the current WAV WIDTH_SEC = 2.4 [Fs, x] = io.readAudioFile(wavFile) # read the WAV x = io.stereo2mono(x) [flagsInd, classesAll, P] = mtCNN_classification( x, Fs, WIDTH_SEC, 1.0, RGB_singleFrame_net, SOUND_mean_RGB, transformer_RGB, classNamesCNN) # apply the CNN mid-term classifier print len( flagsIndGT ), P.shape # append the current ground truth labels AND estimated probabilities (either from the CNN or the SVM) on the global arrays lenF = P.shape[0] lenL = len(flagsIndGT) MIN = min(lenF, lenL) P = P[0:MIN, :] flagsIndGT = flagsIndGT[0:MIN] flagsNew = [] for j, fl in enumerate(flagsIndGT): # append features and labels flagsNew.append(classesAll.index(classesAllGT[flagsIndGT[j]])) flagsAll = np.append(flagsAll, np.array(flagsNew)) if ifile == 0: Fall = P else: Fall = np.concatenate((Fall, P), axis=0) print Fall.shape print flagsAll.shape startprob, transmat, means, cov = audioSegmentation.trainHMM_computeStatistics( Fall.T, flagsAll) # compute HMM statistics hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], "diag") # train HMM hmm.startprob_ = startprob hmm.transmat_ = transmat hmm.means_ = means hmm.covars_ = cov fo = open(outputmodelName, "wb") # save HMM model cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL) cPickle.dump(classesAll, fo, protocol=cPickle.HIGHEST_PROTOCOL) fo.close() return hmm, classesAll