def fileChromagramWrapper(wavFileName):
    """
    Read an audio file, down-mix it to mono and run aF.stChromagram on it.

    The chromagram is computed with a window and a step of 40 msec each
    (expressed in samples) and with the last argument of aF.stChromagram
    set to True (presumably the plot flag -- confirm against aF.stChromagram).

    ARGUMENTS:
     - wavFileName:    path of the audio file to analyse

    RAISES:
     - Exception if the file does not exist or cannot be decoded
    """
    if not os.path.isfile(wavFileName):
        raise Exception("Input audio file not found!")

    [Fs, x] = audioBasicIO.readAudioFile(wavFileName)
    x = audioBasicIO.stereo2mono(x)
    if isinstance(x, int):
        # Other functions of this module treat an integer "signal" as an
        # audio I/O failure; fail with a clear message instead of crashing
        # inside the feature extractor.
        raise Exception("Input audio file could not be read!")

    winSamples = round(Fs * 0.040)
    aF.stChromagram(x, Fs, winSamples, winSamples, True)
def fileClassification(inputFile, modelName, modelType):
    """
    Classify a single audio file with a stored 'svm' or 'knn' model.

    The mid-term features of the file are averaged over time, optionally
    extended with the two beat values, normalised with the MEAN / STD stored
    in the model and passed to classifierWrapper.

    ARGUMENTS:
     - inputFile:    path of the audio file to classify
     - modelName:    path of the stored model
     - modelType:    'svm' or 'knn'

    RETURNS:
     - (Result, P, classNames) as produced by classifierWrapper and the model
       loader, or (-1, -1, -1) if the model or audio file is missing, the
       model type is not supported, or the audio cannot be read
    """
    if not os.path.isfile(modelName):
        print("fileClassification: input modelName not found!")
        return (-1, -1, -1)

    if not os.path.isfile(inputFile):
        print("fileClassification: wav file not found!")
        return (-1, -1, -1)

    # Load classifier:
    if modelType == 'svm':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadSVModel(modelName)
    elif modelType == 'knn':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadKNNModel(modelName)
    else:
        # Without this branch every model variable would be unbound below.
        print("fileClassification: unsupported modelType!")
        return (-1, -1, -1)

    # read audio file and convert to mono
    [Fs, x] = audioBasicIO.readAudioFile(inputFile)
    x = audioBasicIO.stereo2mono(x)
    if isinstance(x, int):                                 # audio file IO problem
        return (-1, -1, -1)

    # feature extraction:
    [MidTermFeatures, s] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs,
                                                  round(Fs * stWin), round(Fs * stStep))
    MidTermFeatures = MidTermFeatures.mean(axis=1)         # long term averaging of mid-term statistics
    if computeBEAT:
        [beat, beatConf] = aF.beatExtraction(s, stStep)
        MidTermFeatures = numpy.append(MidTermFeatures, beat)
        MidTermFeatures = numpy.append(MidTermFeatures, beatConf)

    curFV = (MidTermFeatures - MEAN) / STD                 # normalization
    [Result, P] = classifierWrapper(Classifier, modelType, curFV)    # classification
    return Result, P, classNames
def mtFeatureExtractionToFile(fileName, midTermSize, midTermStep, shortTermSize, shortTermStep, outPutFile,
                              storeStFeatures=False, storeToCSV=False, PLOT=False):
    """
    This function is used as a wrapper to:
    a) read the content of a WAV file
    b) perform mid-term feature extraction on that signal
    c) write the mid-term feature sequences to a numpy file

    ARGUMENTS:
     - fileName:                      path of the audio file
     - midTermSize, midTermStep:      mid-term window and step (multiplied by Fs before use)
     - shortTermSize, shortTermStep:  short-term window and step (multiplied by Fs before use)
     - outPutFile:                    output path prefix; numpy.save is called with it, and the
                                      suffixes "_st" / ".csv" / "_st.csv" are used for the other files
     - storeStFeatures:               also store the short-term features
     - storeToCSV:                    also store each stored matrix (transposed) as CSV
     - PLOT:                          print a message for every file written
    """
    [Fs, x] = audioBasicIO.readAudioFile(fileName)            # read the wav file
    x = audioBasicIO.stereo2mono(x)                           # convert to MONO if required

    # The extraction call is the same whether or not the short-term features
    # are stored, so it is performed once.
    [mtF, stF] = mtFeatureExtraction(x, Fs, round(Fs * midTermSize), round(Fs * midTermStep),
                                     round(Fs * shortTermSize), round(Fs * shortTermStep))

    numpy.save(outPutFile, mtF)                               # save mt features to numpy file
    if PLOT:
        print("Mid-term numpy file: " + outPutFile + ".npy saved")
    if storeToCSV:
        numpy.savetxt(outPutFile + ".csv", mtF.T, delimiter=",")
        if PLOT:
            print("Mid-term CSV file: " + outPutFile + ".csv saved")

    if storeStFeatures:
        numpy.save(outPutFile + "_st", stF)                   # save st features to numpy file
        if PLOT:
            print("Short-term numpy file: " + outPutFile + "_st.npy saved")
        if storeToCSV:
            numpy.savetxt(outPutFile + "_st.csv", stF.T, delimiter=",")    # store st features to CSV file
            if PLOT:
                print("Short-term CSV file: " + outPutFile + "_st.csv saved")
def dirWavFeatureExtraction(dirName, mtWin, mtStep, stWin, stStep, computeBEAT=False):
    """
    This function extracts the mid-term features of the audio files of a particular folder.

    The resulting feature vector is extracted by long-term averaging the mid-term features.
    Therefore ONE FEATURE VECTOR is extracted for each audio file that is kept.
    Empty files, unreadable files, signals shorter than Fs / 10 samples and files whose
    averaged features contain NaN / inf are skipped.

    ARGUMENTS:
        - dirName:        the path of the WAVE directory
        - mtWin, mtStep:  mid-term window and step (in seconds)
        - stWin, stStep:  short-term window and step (in seconds)
        - computeBEAT:    if True, the two values returned by beatExtraction are appended
                          to every feature vector

    RETURNS:
        - allMtFeatures:  the feature vectors stacked row-wise (a single 1-D vector when only
                          one file was kept, an empty array when none was)
        - wavFilesList2:  the paths of the files that contributed a feature vector, in row order
    """
    allMtFeatures = numpy.array([])
    processingTimes = []
    # time.clock() no longer exists on Python >= 3.8; use perf_counter when available.
    timer = getattr(time, "perf_counter", None) or time.clock

    types = ('*.wav', '*.aif', '*.aiff', '*.mp3', '*.au')
    wavFilesList = []
    for files in types:
        wavFilesList.extend(glob.glob(os.path.join(dirName, files)))
    wavFilesList = sorted(wavFilesList)
    wavFilesList2 = []

    for i, wavFile in enumerate(wavFilesList):
        # Only a (Python 2) unicode path needs encoding before being formatted;
        # encoding a native string would fail on non-ASCII bytes (Python 2)
        # or produce bytes that "{:s}" rejects (Python 3).
        shownName = wavFile if isinstance(wavFile, str) else wavFile.encode('utf-8')
        print("Analyzing file {0:d} of {1:d}: {2:s}".format(i + 1, len(wavFilesList), shownName))
        if os.stat(wavFile).st_size == 0:
            print(" (EMPTY FILE -- SKIPPING)")
            continue
        [Fs, x] = audioBasicIO.readAudioFile(wavFile)            # read file
        if isinstance(x, int):                                   # file could not be read
            continue

        t1 = timer()
        x = audioBasicIO.stereo2mono(x)                          # convert stereo to mono
        if x.shape[0] < float(Fs) / 10:
            print(" (AUDIO FILE TOO SMALL - SKIPPING)")
            continue

        # mid-term feature extraction for current file
        [MidTermFeatures, stFeatures] = mtFeatureExtraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs),
                                                            round(Fs * stWin), round(Fs * stStep))
        if computeBEAT:
            [beat, beatConf] = beatExtraction(stFeatures, stStep)

        MidTermFeatures = numpy.transpose(MidTermFeatures)
        MidTermFeatures = MidTermFeatures.mean(axis=0)           # long term averaging of mid-term statistics
        if numpy.isnan(MidTermFeatures).any() or numpy.isinf(MidTermFeatures).any():
            continue

        if computeBEAT:
            MidTermFeatures = numpy.append(MidTermFeatures, beat)
            MidTermFeatures = numpy.append(MidTermFeatures, beatConf)
        if len(allMtFeatures) == 0:                              # append feature vector
            allMtFeatures = MidTermFeatures
        else:
            allMtFeatures = numpy.vstack((allMtFeatures, MidTermFeatures))
        # Register the file only once its vector has been stored, so that the
        # returned names stay aligned with the returned feature rows.
        wavFilesList2.append(wavFile)

        t2 = timer()
        duration = float(len(x)) / Fs
        processingTimes.append((t2 - t1) / duration)

    if len(processingTimes) > 0:
        print("Feature extraction complexity ratio: {0:.1f} x realtime".format(
            (1.0 / numpy.mean(numpy.array(processingTimes)))))
    return (allMtFeatures, wavFilesList2)
def thumbnailWrapper(inputFile, thumbnailWrapperSize):
    """
    Extract two thumbnail segments from an audio file, store them and plot the
    self-similarity matrix returned by aS.musicThumbnailing.

    The two segments are written with wavfile.write next to the input file as
    <name>_thumb1<ext> and <name>_thumb2<ext>, where <ext> is the extension of
    the input file.

    ARGUMENTS:
     - inputFile:             path of the audio file
     - thumbnailWrapperSize:  thumbnail size, forwarded to aS.musicThumbnailing

    RETURNS:
     - None (also when the file could not be read)

    RAISES:
     - Exception if the input file does not exist
    """
    stWindow = 1.0
    stStep = 1.0
    if not os.path.isfile(inputFile):
        raise Exception("Input audio file not found!")

    [Fs, x] = audioBasicIO.readAudioFile(inputFile)
    if Fs == -1:    # could not read file
        return

    [A1, A2, B1, B2, Smatrix] = aS.musicThumbnailing(x, Fs, stWindow, stStep, thumbnailWrapperSize)

    # write thumbnailWrappers to WAV files:
    # splitext only touches the real extension (str.replace would rewrite every
    # ".wav" / ".mp3" occurring in the path) and also covers other extensions,
    # for which the output names used to stay undefined.
    baseName, extension = os.path.splitext(inputFile)
    thumbnailWrapperFileName1 = baseName + "_thumb1" + extension
    thumbnailWrapperFileName2 = baseName + "_thumb2" + extension
    wavfile.write(thumbnailWrapperFileName1, Fs, x[int(Fs * A1):int(Fs * A2)])
    wavfile.write(thumbnailWrapperFileName2, Fs, x[int(Fs * B1):int(Fs * B2)])
    print("1st thumbnailWrapper (stored in file {0:s}): {1:4.1f}sec"
          " -- {2:4.1f}sec".format(thumbnailWrapperFileName1, A1, A2))
    print("2nd thumbnailWrapper (stored in file {0:s}): {1:4.1f}sec"
          " -- {2:4.1f}sec".format(thumbnailWrapperFileName2, B1, B2))

    # Plot self-similarity matrix:
    fig = plt.figure()
    ax = fig.add_subplot(111, aspect="auto")
    plt.imshow(Smatrix)
    # Plot best-similarity diagonal:
    Xcenter = (A1 / stStep + A2 / stStep) / 2.0
    Ycenter = (B1 / stStep + B2 / stStep) / 2.0

    e1 = matplotlib.patches.Ellipse((Ycenter, Xcenter), thumbnailWrapperSize * 1.4, 3,
                                    angle=45, linewidth=3, fill=False)
    ax.add_patch(e1)

    plt.plot([B1, Smatrix.shape[0]], [A1, A1], color="k", linestyle="--", linewidth=2)
    plt.plot([B2, Smatrix.shape[0]], [A2, A2], color="k", linestyle="--", linewidth=2)
    plt.plot([B1, B1], [A1, Smatrix.shape[0]], color="k", linestyle="--", linewidth=2)
    plt.plot([B2, B2], [A2, Smatrix.shape[0]], color="k", linestyle="--", linewidth=2)

    plt.xlim([0, Smatrix.shape[0]])
    plt.ylim([Smatrix.shape[1], 0])

    ax.yaxis.set_label_position("right")
    ax.yaxis.tick_right()

    plt.xlabel("frame no")
    plt.ylabel("frame no")
    plt.title("Self-similarity matrix")

    plt.show()
def trainHMM_fromDir(dirPath, hmmModelName, mtWin, mtStep):
    '''
    This function trains a HMM model for segmentation-classification using a directory
    where WAV files and .segments (ground-truth) files are stored.
    WAV files without a matching .segments file are skipped.

    ARGUMENTS:
     - dirPath:        the path of the data directory
     - hmmModelName:   the name of the HMM model to be stored
     - mtWin:          mid-term window size
     - mtStep:         mid-term window step
    RETURNS:
     - hmm:            an object to the resulting HMM
     - classNames:     a list of classNames
    RAISES:
     - Exception if the directory holds no annotated WAV file

    After training, hmm, classNames, along with the mtWin and mtStep values are stored in the hmmModelName file
    '''
    flagsAll = numpy.array([])
    classesAll = []
    featureBlocks = []      # one feature matrix per annotated file

    for wavFile in glob.glob(dirPath + os.sep + '*.wav'):        # for each WAV file
        gtFile = os.path.splitext(wavFile)[0] + '.segments'      # annotation file next to the WAV file
        if not os.path.isfile(gtFile):                           # no annotation -> skip
            continue
        [segStart, segEnd, segLabels] = readSegmentGT(gtFile)                  # read GT data
        flags, classNames = segs2flags(segStart, segEnd, segLabels, mtStep)    # convert to flags
        for c in classNames:                                     # update classnames:
            if c not in classesAll:
                classesAll.append(c)
        [Fs, x] = audioBasicIO.readAudioFile(wavFile)            # read audio data
        [F, _] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs,
                                        round(Fs * 0.050), round(Fs * 0.050))  # feature extraction

        # keep the same number of feature columns and labels
        MIN = min(F.shape[1], len(flags))
        F = F[:, 0:MIN]
        flags = flags[0:MIN]

        # map the per-file label indices to indices in the global class list
        flagsNew = [classesAll.index(classNames[fl]) for fl in flags]
        flagsAll = numpy.append(flagsAll, numpy.array(flagsNew))
        # Collecting the matrices (instead of initialising on the first loop
        # index) keeps this working when the first WAV file has no annotation.
        featureBlocks.append(F)

    if not featureBlocks:
        raise Exception("trainHMM_fromDir: no annotated WAV files found!")
    Fall = numpy.concatenate(featureBlocks, axis=1)

    startprob, transmat, means, cov = trainHMM_computeStatistics(Fall, flagsAll)    # compute HMM statistics
    # hmmlearn's third and fourth positional constructor arguments are not the
    # start / transition probabilities, so the parameters are set as attributes.
    hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], "diag")
    hmm.startprob_ = startprob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov

    with open(hmmModelName, "wb") as fo:                         # save HMM model
        cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(classesAll, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mtWin, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mtStep, fo, protocol=cPickle.HIGHEST_PROTOCOL)

    return hmm, classesAll
def beatExtractionWrapper(wavFileName, plot):
    """
    Print the two values returned by aF.beatExtraction for an audio file.

    Short-term features are extracted with a window and step of 0.050 * Fs
    samples and passed to aF.beatExtraction; its first result is printed as an
    integer "bpm" value and its second one as a ratio.

    ARGUMENTS:
     - wavFileName:    path of the audio file
     - plot:           forwarded unchanged as third argument of aF.beatExtraction

    RAISES:
     - Exception if the input file does not exist
    """
    if not os.path.isfile(wavFileName):
        raise Exception("Input audio file not found!")
    [Fs, x] = audioBasicIO.readAudioFile(wavFileName)
    F = aF.stFeatureExtraction(x, Fs, 0.050 * Fs, 0.050 * Fs)
    BPM, ratio = aF.beatExtraction(F, 0.050, plot)
    print("Beat: {0:d} bpm ".format(int(BPM)))
    print("Ratio: {0:.2f} ".format(ratio))
def silenceRemovalWrapper(inputFile, smoothingWindow, weight):
    """
    Split an audio file into the segments returned by aS.silenceRemoval and
    store every segment as a separate WAV file.

    Each segment is written to "<input path without extension>_<start>-<end>.wav",
    with the limits formatted with three decimals.

    ARGUMENTS:
     - inputFile:        path of the audio file
     - smoothingWindow:  forwarded to aS.silenceRemoval
     - weight:           forwarded to aS.silenceRemoval

    RAISES:
     - Exception if the input file does not exist
    """
    if not os.path.isfile(inputFile):
        raise Exception("Input audio file not found!")

    [Fs, x] = audioBasicIO.readAudioFile(inputFile)                                        # read audio signal
    segmentLimits = aS.silenceRemoval(x, Fs, 0.05, 0.05, smoothingWindow, weight, True)    # get onsets
    # splitext instead of a fixed [0:-4] slice: same result for three-letter
    # extensions, and correct for longer ones such as ".aiff".
    baseName = os.path.splitext(inputFile)[0]
    for s in segmentLimits:
        strOut = "{0:s}_{1:.3f}-{2:.3f}.wav".format(baseName, s[0], s[1])
        wavfile.write(strOut, Fs, x[int(Fs * s[0]):int(Fs * s[1])])
def classifyFolderWrapper(inputFolder, modelType, modelName, outputMode=False):
    """
    Classify every WAV file of a folder and aggregate the class probabilities.

    Each file is classified with aT.fileClassification; its probability vector
    is weighted by the duration of the file and accumulated. Files that cannot
    be read or classified are skipped.

    ARGUMENTS:
     - inputFolder:  a directory, or a path prefix to which "*.wav" is appended
     - modelType:    'svm' or 'knn'
     - modelName:    path of the stored model
     - outputMode:   if True, print per-file results and the class histogram and
                     show a bar plot of the aggregated probabilities

    RETURNS:
     - (classNames, PsAll): class names and normalised duration-weighted
       probabilities, or None when no WAV file matches

    RAISES:
     - Exception if the model file does not exist or modelType is not supported
    """
    if not os.path.isfile(modelName):
        raise Exception("Input modelName not found!")

    if modelType == 'svm':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadSVModel(modelName)
    elif modelType == 'knn':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadKNNModel(modelName)
    else:
        # classNames would otherwise be unbound a few lines below
        raise Exception("Input modelType not supported!")

    PsAll = numpy.zeros((len(classNames), ))

    files = "*.wav"
    if os.path.isdir(inputFolder):
        strFilePattern = os.path.join(inputFolder, files)
    else:
        strFilePattern = inputFolder + files

    wavFilesList = sorted(glob.glob(strFilePattern))
    if len(wavFilesList) == 0:
        print("No WAV files found!")
        return

    Results = []
    for wavFile in wavFilesList:
        [Fs, x] = audioBasicIO.readAudioFile(wavFile)
        if isinstance(x, int):                      # file could not be read
            continue
        signalLength = x.shape[0] / float(Fs)
        [Result, P, fileClassNames] = aT.fileClassification(wavFile, modelName, modelType)
        if isinstance(fileClassNames, int):
            # fileClassification signals failure with (-1, -1, -1); adding that
            # to the totals would corrupt both PsAll and classNames.
            continue
        classNames = fileClassNames
        PsAll += (numpy.array(P) * signalLength)
        Result = int(Result)
        Results.append(Result)
        if outputMode:
            print("{0:s}\t{1:s}".format(wavFile, classNames[Result]))
    Results = numpy.array(Results)

    # print distribution of classes:
    [Histogram, _] = numpy.histogram(Results, bins=numpy.arange(len(classNames) + 1))
    if outputMode:
        for i, h in enumerate(Histogram):
            print("{0:20s}\t\t{1:d}".format(classNames[i], h))

    total = numpy.sum(PsAll)
    if total > 0:                                   # nothing to normalise if every file was skipped
        PsAll = PsAll / total

    if outputMode:
        fig = plt.figure()
        ax = fig.add_subplot(111)
        plt.title("Classes percentage " + inputFolder.replace('Segments', ''))
        ax.axis((0, len(classNames) + 1, 0, 1))
        ax.set_xticks(numpy.array(range(len(classNames) + 1)))
        ax.set_xticklabels([" "] + classNames)
        ax.bar(numpy.array(range(len(classNames))) + 0.5, PsAll)
        plt.show()
    return classNames, PsAll
def dirWavFeatureExtraction(dirName, mtWin, mtStep, stWin, stStep, computeBEAT=False):
    """
    This function extracts the mid-term features of the WAVE files of a particular folder.

    The resulting feature vector is extracted by long-term averaging the mid-term features.
    Therefore ONE FEATURE VECTOR is extracted for each WAV file.
    Files that cannot be read are skipped.

    ARGUMENTS:
        - dirName:        the path of the WAVE directory
        - mtWin, mtStep:  mid-term window and step (in seconds)
        - stWin, stStep:  short-term window and step (in seconds)
        - computeBEAT:    if True, the two values returned by beatExtraction are appended
                          to every feature vector

    RETURNS:
        - allMtFeatures:  the feature vectors stacked row-wise (a single 1-D vector when only
                          one file was processed, an empty array when none was)
        - a list with the paths of the processed files, in row order
    """
    allMtFeatures = numpy.array([])
    processingTimes = []
    # time.clock() no longer exists on Python >= 3.8; use perf_counter when available.
    timer = getattr(time, "perf_counter", None) or time.clock

    types = ('*.wav', '*.aif', '*.aiff')
    wavFilesList = []
    for files in types:
        wavFilesList.extend(glob.glob(os.path.join(dirName, files)))
    wavFilesList = sorted(wavFilesList)

    processedFiles = []
    for wavFile in wavFilesList:
        [Fs, x] = audioBasicIO.readAudioFile(wavFile)            # read file
        if isinstance(x, int):
            # unreadable file: skip it and leave it out of the returned list so
            # that names and feature rows stay aligned
            continue
        t1 = timer()
        x = audioBasicIO.stereo2mono(x)                          # convert stereo to mono

        # mid-term feature extraction for current file
        [MidTermFeatures, stFeatures] = mtFeatureExtraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs),
                                                            round(Fs * stWin), round(Fs * stStep))
        if computeBEAT:
            [beat, beatConf] = beatExtraction(stFeatures, stStep)

        MidTermFeatures = numpy.transpose(MidTermFeatures)
        MidTermFeatures = MidTermFeatures.mean(axis=0)           # long term averaging of mid-term statistics
        if computeBEAT:
            MidTermFeatures = numpy.append(MidTermFeatures, beat)
            MidTermFeatures = numpy.append(MidTermFeatures, beatConf)
        if len(allMtFeatures) == 0:                              # append feature vector
            allMtFeatures = MidTermFeatures
        else:
            allMtFeatures = numpy.vstack((allMtFeatures, MidTermFeatures))
        processedFiles.append(wavFile)

        t2 = timer()
        duration = float(len(x)) / Fs
        processingTimes.append((t2 - t1) / duration)

    if len(processingTimes) > 0:
        print("Feature extraction complexity ratio: {0:.1f} x realtime".format(
            (1.0 / numpy.mean(numpy.array(processingTimes)))))
    return (allMtFeatures, processedFiles)
def fileRegression(inputFile, modelName, modelType):
    """
    Apply every stored regression model "<modelName>_<name>" to an audio file.

    Model files are found with glob (modelName + "_*"); files whose name ends
    in "MEANS" are ignored. The feature extraction parameters are taken from
    the first model found.

    ARGUMENTS:
     - inputFile:    path of the audio file
     - modelName:    common path prefix of the regression models
     - modelType:    'svm' or 'knn'

    RETURNS:
     - (R, regressionNames): one regressionWrapper output per model and the
       model names (the part of each file name after the last "_"), or
       (-1, -1, -1) if the audio file or the models are missing, the model
       type is not supported, or the audio cannot be read
    """
    if not os.path.isfile(inputFile):
        print("fileRegression: wav file not found!")
        return (-1, -1, -1)

    regressionModels = [r for r in glob.glob(modelName + "_*") if r[-5::] != "MEANS"]
    regressionNames = [r[r.rfind("_") + 1::] for r in regressionModels]
    if not regressionModels:
        # indexing the first model below would raise an IndexError
        print("fileRegression: input modelName not found!")
        return (-1, -1, -1)

    # FEATURE EXTRACTION
    # LOAD ONLY THE FIRST MODEL (for mtWin, etc)
    if modelType == 'svm':
        [_, _, _, mtWin, mtStep, stWin, stStep, computeBEAT] = loadSVModel(regressionModels[0], True)
    elif modelType == 'knn':
        [_, _, _, mtWin, mtStep, stWin, stStep, computeBEAT] = loadKNNModel(regressionModels[0], True)
    else:
        print("fileRegression: unsupported modelType!")
        return (-1, -1, -1)

    # read audio file and convert to mono
    [Fs, x] = audioBasicIO.readAudioFile(inputFile)
    x = audioBasicIO.stereo2mono(x)
    if isinstance(x, int):                                 # audio file IO problem
        return (-1, -1, -1)

    # feature extraction:
    [MidTermFeatures, s] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs,
                                                  round(Fs * stWin), round(Fs * stStep))
    MidTermFeatures = MidTermFeatures.mean(axis=1)         # long term averaging of mid-term statistics
    if computeBEAT:
        [beat, beatConf] = aF.beatExtraction(s, stStep)
        MidTermFeatures = numpy.append(MidTermFeatures, beat)
        MidTermFeatures = numpy.append(MidTermFeatures, beatConf)

    # REGRESSION
    R = []
    for r in regressionModels:
        if not os.path.isfile(r):
            print("fileRegression: input modelName not found!")
            return (-1, -1, -1)
        if modelType == 'svm':
            [Model, MEAN, STD, _, _, _, _, _] = loadSVModel(r, True)
        else:
            [Model, MEAN, STD, _, _, _, _, _] = loadKNNModel(r, True)
        curFV = (MidTermFeatures - MEAN) / STD             # normalization
        R.append(regressionWrapper(Model, modelType, curFV))
    return R, regressionNames
def hmmSegmentation(wavFileName, hmmModelName, PLOT=False, gtFileName=""):
    """
    Segment an audio file with a stored HMM and, when a ground-truth file is
    given, compare the result against it.

    ARGUMENTS:
     - wavFileName:    path of the audio file
     - hmmModelName:   path of the stored model (hmm, class names, mtWin and
                       mtStep pickled in that order)
     - PLOT:           its negation is passed as last argument of plotSegmentationResults
     - gtFileName:     optional path of a ground-truth segment file

    RETURNS:
     - None if the model file cannot be opened
     - (flagsInd, classNamesGT, acc, CM) when plotSegmentationResults reports
       an accuracy >= 0
     - (flagsInd, classesAll, -1, -1) otherwise
    """
    [Fs, x] = audioBasicIO.readAudioFile(wavFileName)        # read audio data

    try:
        fo = open(hmmModelName, "rb")
    except IOError:
        print("didn't find file")
        return

    # NOTE: unpickling executes arbitrary code; only load trusted model files.
    # A corrupt model now raises the unpickling error itself instead of being
    # swallowed and resurfacing as a NameError further down.
    with fo:
        hmm = cPickle.load(fo)
        classesAll = cPickle.load(fo)
        mtWin = cPickle.load(fo)
        mtStep = cPickle.load(fo)

    # feature extraction
    [Features, _] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs,
                                           round(Fs * 0.050), round(Fs * 0.050))
    flagsInd = hmm.predict(Features.T)                       # apply model

    if os.path.isfile(gtFileName):
        [segStart, segEnd, segLabels] = readSegmentGT(gtFileName)
        flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels, mtStep)
        flagsGTNew = []
        for fl in flagsGT:                                   # "align" labels with GT
            if classNamesGT[fl] in classesAll:
                flagsGTNew.append(classesAll.index(classNamesGT[fl]))
            else:
                flagsGTNew.append(-1)
        # NOTE(review): CM is sized by the ground-truth classes but indexed with
        # model class indices -- confirm both class lists always agree.
        CM = numpy.zeros((len(classNamesGT), len(classNamesGT)))
        flagsIndGT = numpy.array(flagsGTNew)
        for i in range(min(flagsInd.shape[0], flagsIndGT.shape[0])):
            if flagsIndGT[i] < 0:
                # label unknown to the model: -1 would otherwise be counted in
                # the last row of the matrix
                continue
            CM[int(flagsIndGT[i]), int(flagsInd[i])] += 1
    else:
        flagsIndGT = numpy.array([])

    acc = plotSegmentationResults(flagsInd, flagsIndGT, classesAll, mtStep, not PLOT)
    if acc >= 0:
        print("Overall Accuracy: {0:.2f}".format(acc))
        return (flagsInd, classNamesGT, acc, CM)
    else:
        return (flagsInd, classesAll, -1, -1)
def dirWavFeatureExtractionNoAveraging(dirName, mtWin, mtStep, stWin, stStep):
    """
    This function extracts the mid-term features of the WAVE files of a particular folder
    without averaging each file.

    ARGUMENTS:
        - dirName:          the path of the WAVE directory
        - mtWin, mtStep:    mid-term window and step (in seconds)
        - stWin, stStep:    short-term window and step (in seconds)
    RETURNS:
        - X:                a feature matrix with one row per mid-term window
        - Y:                for every row of X, the index (in filenames) of the file it comes from
        - filenames:        the sorted list of all matched files (unreadable ones included)
    """
    allMtFeatures = numpy.array([])
    signalIndices = numpy.array([])

    types = ('*.wav', '*.aif', '*.aiff')
    wavFilesList = []
    for files in types:
        wavFilesList.extend(glob.glob(os.path.join(dirName, files)))
    wavFilesList = sorted(wavFilesList)

    for i, wavFile in enumerate(wavFilesList):
        [Fs, x] = audioBasicIO.readAudioFile(wavFile)            # read file
        if isinstance(x, int):                                   # file could not be read
            continue
        x = audioBasicIO.stereo2mono(x)                          # convert stereo to mono
        [MidTermFeatures, _] = mtFeatureExtraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs),
                                                   round(Fs * stWin), round(Fs * stStep))    # mid-term feature
        MidTermFeatures = numpy.transpose(MidTermFeatures)

        # Label the rows with the real file index: the first file that is kept
        # is not necessarily file 0 when earlier files were skipped.
        fileIndices = i * numpy.ones((MidTermFeatures.shape[0], ))
        if len(allMtFeatures) == 0:                              # append feature vector
            allMtFeatures = MidTermFeatures
            signalIndices = fileIndices
        else:
            allMtFeatures = numpy.vstack((allMtFeatures, MidTermFeatures))
            signalIndices = numpy.append(signalIndices, fileIndices)

    return (allMtFeatures, signalIndices, wavFilesList)
def fileClassification(inputFile, modelName, modelType):
    """
    Classify a single audio file with a stored 'svm' or 'knn' model.

    The mid-term features of the file are averaged over time, optionally
    extended with the two beat values, normalised with the MEAN / STD stored
    in the model and passed to classifierWrapper.

    ARGUMENTS:
     - inputFile:    path of the audio file to classify
     - modelName:    path of the stored model
     - modelType:    'svm' or 'knn'

    RETURNS:
     - (Result, P, classNames) as produced by classifierWrapper and the model
       loader, or (-1, -1, -1) if the model or audio file is missing, the
       model type is not supported, or the audio cannot be read
    """
    if not os.path.isfile(modelName):
        print("fileClassification: input modelName not found!")
        return (-1, -1, -1)

    if not os.path.isfile(inputFile):
        print("fileClassification: wav file not found!")
        return (-1, -1, -1)

    # Load classifier:
    if modelType == 'svm':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadSVModel(modelName)
    elif modelType == 'knn':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadKNNModel(modelName)
    else:
        # Without this branch every model variable would be unbound below.
        print("fileClassification: unsupported modelType!")
        return (-1, -1, -1)

    # read audio file and convert to mono
    [Fs, x] = audioBasicIO.readAudioFile(inputFile)
    x = audioBasicIO.stereo2mono(x)
    if isinstance(x, int):                                 # audio file IO problem
        return (-1, -1, -1)

    # feature extraction:
    [MidTermFeatures, s] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs,
                                                  round(Fs * stWin), round(Fs * stStep))
    MidTermFeatures = MidTermFeatures.mean(axis=1)         # long term averaging of mid-term statistics
    if computeBEAT:
        [beat, beatConf] = aF.beatExtraction(s, stStep)
        MidTermFeatures = numpy.append(MidTermFeatures, beat)
        MidTermFeatures = numpy.append(MidTermFeatures, beatConf)

    curFV = (MidTermFeatures - MEAN) / STD                 # normalization
    [Result, P] = classifierWrapper(Classifier, modelType, curFV)    # classification
    return Result, P, classNames
def hmmSegmentation(wavFileName, hmmModelName, PLOT=False, gtFileName=""):
    """
    Segment an audio file with a stored HMM and, when a ground-truth file is
    given, compare the result against it.

    ARGUMENTS:
     - wavFileName:    path of the audio file
     - hmmModelName:   path of the stored model (hmm, class names, mtWin and
                       mtStep pickled in that order)
     - PLOT:           its negation is passed as last argument of plotSegmentationResults
     - gtFileName:     optional path of a ground-truth segment file

    RETURNS:
     - None if the model file cannot be opened
     - (flagsInd, classesAll, acc) otherwise, where acc is the value returned
       by plotSegmentationResults
    """
    [Fs, x] = audioBasicIO.readAudioFile(wavFileName)        # read audio data

    try:
        fo = open(hmmModelName, "rb")
    except IOError:
        print("didn't find file")
        return

    # NOTE: unpickling executes arbitrary code; only load trusted model files.
    # A corrupt model now raises the unpickling error itself instead of being
    # swallowed and resurfacing as a NameError further down.
    with fo:
        hmm = cPickle.load(fo)
        classesAll = cPickle.load(fo)
        mtWin = cPickle.load(fo)
        mtStep = cPickle.load(fo)

    # feature extraction
    [Features, _] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs,
                                           round(Fs * 0.050), round(Fs * 0.050))
    flagsInd = hmm.predict(Features.T)                       # apply model

    if os.path.isfile(gtFileName):
        [segStart, segEnd, segLabels] = readSegmentGT(gtFileName)
        flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels, mtStep)
        flagsGTNew = []
        for fl in flagsGT:                                   # "align" labels with GT
            if classNamesGT[fl] in classesAll:
                flagsGTNew.append(classesAll.index(classNamesGT[fl]))
            else:
                flagsGTNew.append(-1)                        # label unknown to the model
        flagsIndGT = numpy.array(flagsGTNew)
    else:
        flagsIndGT = numpy.array([])

    acc = plotSegmentationResults(flagsInd, flagsIndGT, classesAll, mtStep, not PLOT)
    if acc >= 0:
        print("Overall Accuracy: {0:.2f}".format(acc))

    return flagsInd, classesAll, acc
def dirWavFeatureExtractionNoAveraging(dirName, mtWin, mtStep, stWin, stStep):
    """
    This function extracts the mid-term features of the WAVE files of a particular folder
    without averaging each file.

    ARGUMENTS:
        - dirName:          the path of the WAVE directory
        - mtWin, mtStep:    mid-term window and step (in seconds)
        - stWin, stStep:    short-term window and step (in seconds)
    RETURNS:
        - X:                a feature matrix with one row per mid-term window
        - Y:                for every row of X, the index (in filenames) of the file it comes from
        - filenames:        the sorted list of all matched files (unreadable ones included)
    """
    allMtFeatures = numpy.array([])
    signalIndices = numpy.array([])

    types = ('*.wav', '*.aif', '*.aiff')
    wavFilesList = []
    for files in types:
        wavFilesList.extend(glob.glob(os.path.join(dirName, files)))
    wavFilesList = sorted(wavFilesList)

    for i, wavFile in enumerate(wavFilesList):
        [Fs, x] = audioBasicIO.readAudioFile(wavFile)            # read file
        if isinstance(x, int):
            # unreadable file: skip it instead of crashing in the extractor
            continue
        x = audioBasicIO.stereo2mono(x)                          # convert stereo to mono
        [MidTermFeatures, _] = mtFeatureExtraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs),
                                                   round(Fs * stWin), round(Fs * stStep))    # mid-term feature
        MidTermFeatures = numpy.transpose(MidTermFeatures)

        # Label the rows with the real file index, so that the labels remain
        # valid when earlier files were skipped.
        fileIndices = i * numpy.ones((MidTermFeatures.shape[0], ))
        if len(allMtFeatures) == 0:                              # append feature vector
            allMtFeatures = MidTermFeatures
            signalIndices = fileIndices
        else:
            allMtFeatures = numpy.vstack((allMtFeatures, MidTermFeatures))
            signalIndices = numpy.append(signalIndices, fileIndices)

    return (allMtFeatures, signalIndices, wavFilesList)
def fileClassification(inputFile, modelName, modelType):
    """
    Classify a single audio file with a stored model.

    The mid-term features of the file are averaged over time, optionally
    extended with the two beat values, normalised with the MEAN / STD stored
    in the model and passed to classifierWrapper.

    ARGUMENTS:
     - inputFile:    path of the audio file to classify
     - modelName:    path of the stored model
     - modelType:    'svm', 'svm_rbf', 'knn', 'randomforest', 'gradientboosting'
                     or 'extratrees'

    RETURNS:
     - (Result, P, classNames) as produced by classifierWrapper and the model
       loader, or (-1, -1, -1) if the model or audio file is missing, the
       model type is not supported, the audio cannot be read, or the signal
       is not longer than one mid-term window
    """
    if not os.path.isfile(modelName):
        print("fileClassification: input modelName not found!")
        return (-1, -1, -1)

    if not os.path.isfile(inputFile):
        print("fileClassification: wav file not found!")
        return (-1, -1, -1)

    # Load classifier:
    if modelType in ('svm', 'svm_rbf'):
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadSVModel(modelName)
    elif modelType == 'knn':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadKNNModel(modelName)
    elif modelType == 'randomforest':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadRandomForestModel(modelName)
    elif modelType == 'gradientboosting':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadGradientBoostingModel(modelName)
    elif modelType == 'extratrees':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadExtraTreesModel(modelName)
    else:
        # Without this branch every model variable would be unbound below.
        print("fileClassification: unsupported modelType!")
        return (-1, -1, -1)

    # read audio file and convert to mono
    [Fs, x] = audioBasicIO.readAudioFile(inputFile)
    x = audioBasicIO.stereo2mono(x)

    if isinstance(x, int):                                 # audio file IO problem
        return (-1, -1, -1)
    if x.shape[0] / float(Fs) <= mtWin:                    # signal not longer than one mid-term window
        return (-1, -1, -1)

    # feature extraction:
    [MidTermFeatures, s] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs,
                                                  round(Fs * stWin), round(Fs * stStep))
    MidTermFeatures = MidTermFeatures.mean(axis=1)         # long term averaging of mid-term statistics
    if computeBEAT:
        [beat, beatConf] = aF.beatExtraction(s, stStep)
        MidTermFeatures = numpy.append(MidTermFeatures, beat)
        MidTermFeatures = numpy.append(MidTermFeatures, beatConf)

    curFV = (MidTermFeatures - MEAN) / STD                 # normalization
    [Result, P] = classifierWrapper(Classifier, modelType, curFV)    # classification
    return Result, P, classNames
def fileClassification(inputFile, modelName, modelType):
    """
    Classify a single audio file with a stored model.

    The mid-term features of the file are averaged over time, optionally
    extended with the two beat values, normalised with the MEAN / STD stored
    in the model and passed to classifierWrapper.

    ARGUMENTS:
     - inputFile:    path of the audio file to classify
     - modelName:    path of the stored model
     - modelType:    'svm', 'svm_rbf', 'knn', 'randomforest', 'gradientboosting'
                     or 'extratrees'

    RETURNS:
     - (Result, P, classNames) as produced by classifierWrapper and the model
       loader, or (-1, -1, -1) if the model or audio file is missing, the
       model type is not supported, the audio cannot be read, or the signal
       is not longer than one mid-term window
    """
    if not os.path.isfile(modelName):
        print("fileClassification: input modelName not found!")
        return (-1, -1, -1)

    if not os.path.isfile(inputFile):
        print("fileClassification: wav file not found!")
        return (-1, -1, -1)

    # Load classifier:
    if modelType in ('svm', 'svm_rbf'):
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadSVModel(modelName)
    elif modelType == 'knn':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadKNNModel(modelName)
    elif modelType == 'randomforest':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadRandomForestModel(modelName)
    elif modelType == 'gradientboosting':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadGradientBoostingModel(modelName)
    elif modelType == 'extratrees':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadExtraTreesModel(modelName)
    else:
        # Without this branch every model variable would be unbound below.
        print("fileClassification: unsupported modelType!")
        return (-1, -1, -1)

    # read audio file and convert to mono
    [Fs, x] = audioBasicIO.readAudioFile(inputFile)
    x = audioBasicIO.stereo2mono(x)

    if isinstance(x, int):                                 # audio file IO problem
        return (-1, -1, -1)
    if x.shape[0] / float(Fs) <= mtWin:                    # signal not longer than one mid-term window
        return (-1, -1, -1)

    # feature extraction:
    [MidTermFeatures, s] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs,
                                                  round(Fs * stWin), round(Fs * stStep))
    MidTermFeatures = MidTermFeatures.mean(axis=1)         # long term averaging of mid-term statistics
    if computeBEAT:
        [beat, beatConf] = aF.beatExtraction(s, stStep)
        MidTermFeatures = numpy.append(MidTermFeatures, beat)
        MidTermFeatures = numpy.append(MidTermFeatures, beatConf)

    curFV = (MidTermFeatures - MEAN) / STD                 # normalization
    [Result, P] = classifierWrapper(Classifier, modelType, curFV)    # classification
    return Result, P, classNames
def trainHMM_fromFile(wavFile, gtFile, hmmModelName, mtWin, mtStep):
    """
    This function trains a HMM model for segmentation-classification using a single annotated audio file

    ARGUMENTS:
     - wavFile:        the path of the audio filename
     - gtFile:         the path of the ground truth filename
                       (a csv file of the form <segment start in seconds>,<segment end in seconds>,<segment label> in each row
     - hmmModelName:   the name of the HMM model to be stored
     - mtWin:          mid-term window size
     - mtStep:         mid-term window step
    RETURNS:
     - hmm:            an object to the resulting HMM
     - classNames:     a list of classNames

    After training, hmm, classNames, along with the mtWin and mtStep values are stored in the hmmModelName file
    """
    [segStart, segEnd, segLabels] = readSegmentGT(gtFile)                  # read ground truth data
    flags, classNames = segs2flags(segStart, segEnd, segLabels, mtStep)    # convert to fix-sized sequence of flags

    [Fs, x] = audioBasicIO.readAudioFile(wavFile)                          # read audio data
    [F, _] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs,
                                    round(Fs * 0.050), round(Fs * 0.050))  # feature extraction
    # compute HMM statistics (priors, transition matrix, etc)
    startprob, transmat, means, cov = trainHMM_computeStatistics(F, flags)

    # NOTE(review): this variant builds the model with sklearn.hmm, while other
    # functions of this module use hmmlearn -- confirm which package is installed.
    hmm = sklearn.hmm.GaussianHMM(startprob.shape[0], "diag", startprob, transmat)    # hmm training
    hmm.means_ = means
    hmm.covars_ = cov

    # "with" closes the model file even if pickling fails half-way
    with open(hmmModelName, "wb") as fo:                                   # output to file
        cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mtWin, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mtStep, fo, protocol=cPickle.HIGHEST_PROTOCOL)

    return hmm, classNames
def mtFeatureExtractionToFile(fileName, midTermSize, midTermStep, shortTermSize, shortTermStep, outPutFile,
                              storeStFeatures=False, storeToCSV=False, PLOT=False,
                              mtCSVDir="/Users/somyagoel/Flask/basic_app/music/mt/",
                              stCSVDir="/Users/somyagoel/Flask/basic_app/music/st/"):
    """
    This function is used as a wrapper to:
    a) read the content of a WAV file
    b) perform mid-term feature extraction on that signal
    c) write the mid-term feature sequences to a numpy file

    ARGUMENTS:
     - fileName:                      path of the audio file
     - midTermSize, midTermStep:      mid-term window and step (multiplied by Fs before use)
     - shortTermSize, shortTermStep:  short-term window and step (multiplied by Fs before use)
     - outPutFile:                    output path prefix of the numpy files; its last
                                      "/"-separated component names the CSV files
     - storeStFeatures:               also store the short-term features
     - storeToCSV:                    also store each stored matrix (transposed) as CSV
     - PLOT:                          print a message for every file written
     - mtCSVDir:                      directory receiving the mid-term CSV file
     - stCSVDir:                      directory receiving the short-term CSV file

    The two CSV directories default to the locations that used to be hard-coded
    in this function, so existing callers keep their behaviour.
    """
    [Fs, x] = audioBasicIO.readAudioFile(fileName)            # read the wav file
    x = audioBasicIO.stereo2mono(x)                           # convert to MONO if required

    # The extraction call is the same whether or not the short-term features
    # are stored, so it is performed once.
    [mtF, stF] = mtFeatureExtraction(x, Fs, round(Fs * midTermSize), round(Fs * midTermStep),
                                     round(Fs * shortTermSize), round(Fs * shortTermStep))

    csvBaseName = outPutFile.split("/")[-1]

    numpy.save(outPutFile, mtF)                               # save mt features to numpy file
    if PLOT:
        print("Mid-term numpy file: " + outPutFile + ".npy saved")
    if storeToCSV:
        mtCSVPath = os.path.join(mtCSVDir, csvBaseName + ".csv")
        numpy.savetxt(mtCSVPath, mtF.T, delimiter=",")
        if PLOT:
            # report the path that was really written
            print("Mid-term CSV file: " + mtCSVPath + " saved")

    if storeStFeatures:
        numpy.save(outPutFile + "_st", stF)                   # save st features to numpy file
        if PLOT:
            print("Short-term numpy file: " + outPutFile + "_st.npy saved")
        if storeToCSV:
            stCSVPath = os.path.join(stCSVDir, csvBaseName + "_st.csv")
            numpy.savetxt(stCSVPath, stF.T, delimiter=",")    # store st features to CSV file
            if PLOT:
                print("Short-term CSV file: " + stCSVPath + " saved")
def trainHMM_fromFile(wavFile, gtFile, hmmModelName, mtWin, mtStep):
    '''
    This function trains a HMM model for segmentation-classification using a single annotated audio file

    ARGUMENTS:
     - wavFile:        the path of the audio filename
     - gtFile:         the path of the ground truth filename
                       (a csv file of the form <segment start in seconds>,<segment end in seconds>,<segment label> in each row
     - hmmModelName:   the name of the HMM model to be stored
     - mtWin:          mid-term window size
     - mtStep:         mid-term window step
    RETURNS:
     - hmm:            an object to the resulting HMM
     - classNames:     a list of classNames

    After training, hmm, classNames, along with the mtWin and mtStep values are stored in the hmmModelName file
    '''
    [segStart, segEnd, segLabels] = readSegmentGT(gtFile)                  # read ground truth data
    flags, classNames = segs2flags(segStart, segEnd, segLabels, mtStep)    # convert to fix-sized sequence of flags

    [Fs, x] = audioBasicIO.readAudioFile(wavFile)                          # read audio data
    [F, _] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs,
                                    round(Fs * 0.050), round(Fs * 0.050))  # feature extraction
    # compute HMM statistics (priors, transition matrix, etc)
    startprob, transmat, means, cov = trainHMM_computeStatistics(F, flags)

    # the model parameters are set directly from the computed statistics
    hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], "diag")
    hmm.startprob_ = startprob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov

    # "with" closes the model file even if pickling fails half-way
    with open(hmmModelName, "wb") as fo:                                   # output to file
        cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mtWin, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mtStep, fo, protocol=cPickle.HIGHEST_PROTOCOL)

    return hmm, classNames
def annotation2files(wavFile, csvFile):
    '''
    Break an audio stream to segments of interest, defined by a csv file

    - wavFile:    path to input wavfile
    - csvFile:    path to csvFile of segment limits

    Input CSV file must be of the format <T1>\t<T2>\t<Label>
    (a decimal comma is accepted in T1 and T2). Every segment is written to
    "<wavFile>_<Label>_<T1>_<T2>.wav", with spaces replaced by underscores.
    Empty lines of the CSV file are ignored.
    '''
    import sys

    [Fs, x] = audioBasicIO.readAudioFile(wavFile)

    # The csv module expects a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        csvHandle = open(csvFile, 'rb')
    else:
        csvHandle = open(csvFile, 'r', newline='')

    with csvHandle as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='|')
        for row in reader:
            if not row:                 # blank line (e.g. at the end of the file)
                continue
            T1 = float(row[0].replace(",", "."))
            T2 = float(row[1].replace(",", "."))
            label = "%s_%s_%.2f_%.2f.wav" % (wavFile, row[2], T1, T2)
            label = label.replace(" ", "_")
            xtemp = x[int(round(T1 * Fs)):int(round(T2 * Fs))]
            # one space-separated line, identical on Python 2 and Python 3
            print("{0} {1} {2} {3}".format(T1, T2, label, xtemp.shape))
            wavfile.write(label, Fs, xtemp)
def annotation2files(wavFile, csvFile):
    '''
    Break an audio stream to segments of interest, defined by a csv file.

     - wavFile:    path to input wavfile
     - csvFile:    path to csvFile of segment limits

    Each row of the input file must hold three tab-separated fields:
    <T1> <T2> <Label> (a decimal comma is accepted in T1 and T2).
    Every segment is written to "<wavFile>_<Label>_<T1>_<T2>.wav", with
    blanks in the name replaced by underscores.
    '''
    import sys

    [Fs, x] = audioBasicIO.readAudioFile(wavFile)

    # csv.reader needs a text-mode file on Python 3 (a binary handle raises
    # "iterator should return strings, not bytes"), while Python 2 expects
    # binary mode; pick the mode that matches the running interpreter.
    if sys.version_info[0] >= 3:
        handle = open(csvFile, 'r', newline='')
    else:
        handle = open(csvFile, 'rb')

    with handle as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='|')
        for row in reader:
            T1 = float(row[0].replace(",", "."))
            T2 = float(row[1].replace(",", "."))
            label = "%s_%s_%.2f_%.2f.wav" % (wavFile, row[2], T1, T2)
            label = label.replace(" ", "_")
            # segment limits are converted from seconds to sample indices
            xtemp = x[int(round(T1 * Fs)):int(round(T2 * Fs))]
            print(T1, T2, label, xtemp.shape)
            wavfile.write(label, Fs, xtemp)
def trainHMM_fromFile(wav_file, gt_file, hmm_model_name, mt_win, mt_step):
    '''
    Train an HMM for joint segmentation-classification from a single
    annotated audio file and store it on disk.

    ARGUMENTS:
     - wav_file:        the path of the audio filename
     - gt_file:         the path of the ground truth filename
                        (a csv file of the form
                        <segment start in seconds>,<segment end in seconds>,<segment label>
                        in each row)
     - hmm_model_name:  the name of the HMM model to be stored
     - mt_win:          mid-term window size
     - mt_step:         mid-term window step
    RETURNS:
     - hmm:             an object to the resulting HMM
     - class_names:     a list of class_names

    After training, hmm, class_names, along with the mt_win and mt_step
    values are pickled (in that order) into the hmm_model_name file.
    '''
    # ground truth -> fix-sized sequence of flags
    [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
    flags, class_names = segs2flags(seg_start, seg_end, seg_labs, mt_step)

    # mid-term features with a 50 msec short-term window and step
    [fs, x] = audioBasicIO.readAudioFile(wav_file)
    [F, _, _] = aF.mtFeatureExtraction(x, fs, mt_win * fs, mt_step * fs,
                                       round(fs * 0.050), round(fs * 0.050))

    start_prob, transmat, means, cov = trainHMM_computeStatistics(F, flags)

    # the HMM parameters are assigned directly from the statistics above
    hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
    hmm.startprob_ = start_prob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov

    # the context manager guarantees the file is closed even if a dump fails
    with open(hmm_model_name, "wb") as fo:
        for item in (hmm, class_names, mt_win, mt_step):
            cPickle.dump(item, fo, protocol=cPickle.HIGHEST_PROTOCOL)

    return hmm, class_names
def main(argv):
    """
    Split a recording into per-speaker-turn WAV files.

    ARGUMENTS:
     - argv[1]: path of the input WAV file
     - argv[2]: directory that receives the exported segments

    The file is diarized for two speakers (once per experiment, nExp times);
    a new segment starts wherever the label of two consecutive windows
    differs. Segment k is exported as "seg<k>_speaker<label>.wav".
    """
    # filename = "diarizationExample.wav"
    filename = argv[1]
    out_dir = argv[2]

    for _ in range(nExp):
        audioBasicIO.readAudioFile(filename)
        tcls = aS.speakerDiarization(filename, 2, LDAdim=0, PLOT=False)
        audio = AudioSegment.from_wav(filename)
        audio_length = len(audio)
        n_wins = len(tcls)

        # with fewer than two windows there is nothing to compare; the
        # original exported nothing in that case either
        if n_wins < 2:
            continue

        # window indices after which the speaker label changes
        cuts = [j for j in range(n_wins - 1) if tcls[j] != tcls[j + 1]]

        # one label per segment: the label right before every cut, plus the
        # label of the very last window for the trailing segment
        speakers = [int(tcls[j]) for j in cuts] + [int(tcls[-1])]

        # segment limits on the audio time axis; integer division keeps the
        # slice bounds integral on both Python 2 and Python 3
        bounds = [0]
        bounds.extend(audio_length * j // n_wins for j in cuts)
        bounds.append(audio_length)

        # also covers the single-speaker case (no cuts), which previously
        # raised IndexError: the whole recording becomes segment 0
        for num, speaker in enumerate(speakers):
            seg_audio = audio[bounds[num]:bounds[num + 1]]
            seg_audio.export(
                os.path.join(out_dir,
                             "seg{0}_speaker{1}.wav".format(num, speaker)),
                format="wav")
def mtFeatureExtraction(fileName, midTermSize, midTermStep, shortTermSize, shortTermStep):
    """
    This function is used as a wrapper to:
    a) read the content of a WAV file (converted to mono if required)
    b) perform short-term feature extraction on that signal
    c) compute mid-term statistics (mean and standard deviation) of every
       short-term feature sequence

    ARGUMENTS:
     - fileName:       path of the WAV file
     - midTermSize:    mid-term window size
     - midTermStep:    mid-term window step
     - shortTermSize:  short-term window size
     - shortTermStep:  short-term window step
       (window sizes and steps share one unit; short-term values are
       multiplied by the sampling rate before use)
    RETURNS:
     - a tuple (mtFeatures, stFeatures, Fs, x) where mtFeatures is a numpy
       array whose first len(stFeatures) rows hold the window means and the
       following len(stFeatures) rows the window standard deviations
    RAISES:
     - ValueError if the mid-term step is shorter than half a short-term
       step (the window would never advance)
    """
    # NOTE: changes numpy's global print options as a side effect
    numpy.set_printoptions(suppress=True)

    [Fs, x] = audioBasicIO.readAudioFile(fileName)  # read the wav file
    x = audioBasicIO.stereo2mono(x)                 # convert to MONO if required

    # mid-term window size / step expressed in short-term frames
    mtWinRatio = int(round(midTermSize / shortTermStep))
    mtStepRatio = int(round(midTermStep / shortTermStep))
    if mtStepRatio < 1:
        # the original while-loop never terminated in this case
        raise ValueError("midTermStep must be at least one short-term step")

    stFeatures = stFeatureExtraction(x, Fs, shortTermSize * Fs, shortTermStep * Fs)

    means = []
    stds = []
    for featSeq in stFeatures:  # for each of the short-term features:
        curMeans = []
        curStds = []
        for start in range(0, len(featSeq), mtStepRatio):
            # slicing clips the last window at the end of the sequence
            curStFeatures = featSeq[start:start + mtWinRatio]
            curMeans.append(numpy.mean(curStFeatures))
            curStds.append(numpy.std(curStFeatures))
        means.append(curMeans)
        stds.append(curStds)

    return (numpy.array(means + stds), stFeatures, Fs, x)
def getMusicSegmentsFromFile(inputFile):
    """
    Extract the music segments of an audio file into separate WAV files.

    The file is classified per mid-term window with the fixed model
    "data/svmMovies8classes"; every resulting segment labelled "Music" that
    lasts at least minDuration is written to the directory
    "<inputFile without its last 4 characters>_musicSegments", which is
    deleted and re-created on every call.

    ARGUMENTS:
     - inputFile: path of the input audio file
    """
    modelType = "svm"
    modelName = "data/svmMovies8classes"

    # start from an empty output directory
    dirOutput = inputFile[0:-4] + "_musicSegments"
    if dirOutput != "." and os.path.exists(dirOutput):
        shutil.rmtree(dirOutput)
    os.makedirs(dirOutput)

    [Fs, x] = audioBasicIO.readAudioFile(inputFile)

    # the model is loaded here only for its mid-term step (mtStep)
    if modelType == 'svm':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
         compute_beat] = aT.load_model(modelName)
    elif modelType == 'knn':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
         compute_beat] = aT.load_model_knn(modelName)

    flagsInd, classNames, acc, CM = aS.mtFileClassification(inputFile,
                                                            modelName,
                                                            modelType,
                                                            plotResults=False,
                                                            gtFile="")
    segs, classes = aS.flags2segs(flagsInd, mtStep)

    outPrefix = dirOutput + os.sep
    for i, s in enumerate(segs):
        isMusic = classNames[int(classes[i])] == "Music"
        if isMusic and (s[1] - s[0] >= minDuration):
            strOut = "{0:s}{1:.3f}-{2:.3f}.wav".format(outPrefix, s[0], s[1])
            # segment limits are converted from seconds to sample indices
            firstSample = int(Fs * s[0])
            lastSample = int(Fs * s[1])
            wavfile.write(strOut, Fs, x[firstSample:lastSample])
def getMusicSegmentsFromFile(inputFile):
    """
    Extract the music segments of an audio file into separate WAV files.

    The file is classified per mid-term window with the fixed model
    "data/svmMovies8classes"; every resulting segment labelled "Music" that
    lasts at least minDuration is written to the directory
    "<inputFile without its last 4 characters>_musicSegments", which is
    deleted and re-created on every call.

    ARGUMENTS:
     - inputFile: path of the input audio file
    """
    modelType = "svm"
    modelName = "data/svmMovies8classes"

    # start from an empty output directory
    dirOutput = inputFile[0:-4] + "_musicSegments"
    if dirOutput != "." and os.path.exists(dirOutput):
        shutil.rmtree(dirOutput)
    os.makedirs(dirOutput)

    [Fs, x] = audioBasicIO.readAudioFile(inputFile)

    # the model is loaded here only for its mid-term step (mtStep)
    if modelType == 'svm':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
         computeBEAT] = aT.loadSVModel(modelName)
    elif modelType == 'knn':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
         computeBEAT] = aT.loadKNNModel(modelName)

    flagsInd, classNames, acc = aS.mtFileClassification(inputFile,
                                                        modelName,
                                                        modelType,
                                                        plotResults=False,
                                                        gtFile="")
    segs, classes = aS.flags2segs(flagsInd, mtStep)

    outPrefix = dirOutput + os.sep
    for i, s in enumerate(segs):
        isMusic = classNames[int(classes[i])] == "Music"
        if isMusic and (s[1] - s[0] >= minDuration):
            strOut = "{0:s}{1:.3f}-{2:.3f}.wav".format(outPrefix, s[0], s[1])
            # segment limits are converted from seconds to sample indices
            firstSample = int(Fs * s[0])
            lastSample = int(Fs * s[1])
            wavfile.write(strOut, Fs, x[firstSample:lastSample])
def thumbnailWrapper(inputFile, thumbnailWrapperSize):
    """
    Find the two thumbnail segments of an audio file, store them as WAV
    files and plot the self-similarity matrix.

    ARGUMENTS:
     - inputFile:            path of the input audio file
     - thumbnailWrapperSize: thumbnail size passed to
                             aS.musicThumbnailing
    RETURNS:
     - None (returns early, without output, if the file cannot be read)
    RAISES:
     - Exception if inputFile does not exist

    The thumbnails are written next to the input as
    "<name without extension>_thumb1.wav" and "..._thumb2.wav".
    """
    stWindow = 1.0
    stStep = 1.0
    if not os.path.isfile(inputFile):
        raise Exception("Input audio file not found!")

    [Fs, x] = audioBasicIO.readAudioFile(inputFile)  # read file
    if Fs == -1:  # could not read file
        return

    # find thumbnail endpoints
    [A1, A2, B1, B2, Smatrix] = aS.musicThumbnailing(
        x, Fs, stWindow, stStep, thumbnailWrapperSize)

    # Build the output names from the path without its extension. The former
    # str.replace(".wav", ...) left the name unchanged for inputs without a
    # literal ".wav" (e.g. ".WAV"), so the thumbnail overwrote the input
    # file, and it also rewrote ".wav" inside directory names.
    baseName = os.path.splitext(inputFile)[0]
    thumbnailWrapperFileName1 = baseName + "_thumb1.wav"
    thumbnailWrapperFileName2 = baseName + "_thumb2.wav"

    # write thumbnails to WAV files:
    wavfile.write(thumbnailWrapperFileName1, Fs, x[int(Fs * A1):int(Fs * A2)])
    wavfile.write(thumbnailWrapperFileName2, Fs, x[int(Fs * B1):int(Fs * B2)])
    print("1st thumbnailWrapper (stored in file {0:s}): {1:4.1f}sec -- {2:4.1f}sec".format(
        thumbnailWrapperFileName1, A1, A2))
    print("2nd thumbnailWrapper (stored in file {0:s}): {1:4.1f}sec -- {2:4.1f}sec".format(
        thumbnailWrapperFileName2, B1, B2))

    # Plot self-similarity matrix:
    fig = plt.figure()
    ax = fig.add_subplot(111, aspect="auto")
    plt.imshow(Smatrix)

    # Plot best-similarity diagonal:
    Xcenter = (A1 / stStep + A2 / stStep) / 2.0
    Ycenter = (B1 / stStep + B2 / stStep) / 2.0
    e1 = matplotlib.patches.Ellipse((Ycenter, Xcenter),
                                    thumbnailWrapperSize * 1.4,
                                    3,
                                    angle=45,
                                    linewidth=3,
                                    fill=False)
    ax.add_patch(e1)

    # dashed guide lines from the thumbnail limits to the matrix border
    guideStyle = dict(color="k", linestyle="--", linewidth=2)
    plt.plot([B1, Smatrix.shape[0]], [A1, A1], **guideStyle)
    plt.plot([B2, Smatrix.shape[0]], [A2, A2], **guideStyle)
    plt.plot([B1, B1], [A1, Smatrix.shape[0]], **guideStyle)
    plt.plot([B2, B2], [A2, Smatrix.shape[0]], **guideStyle)

    plt.xlim([0, Smatrix.shape[0]])
    plt.ylim([Smatrix.shape[1], 0])
    ax.yaxis.set_label_position("right")
    ax.yaxis.tick_right()
    plt.xlabel("frame no")
    plt.ylabel("frame no")
    plt.title("Self-similarity matrix")
    plt.show()
import audioBasicIO
import audioFeatureExtraction
import matplotlib.pyplot as plt
import numpy

# Script: compare rows 9..20 of the short-term feature matrices of two
# recordings (50 msec window, 25 msec step) by correlation and by distance.

print("COUNT 1\n")
[Fs1, x1] = audioBasicIO.readAudioFile("data/practice.wav")
F1 = audioFeatureExtraction.stFeatureExtraction(x1, Fs1, 0.050 * Fs1,
                                                0.025 * Fs1)
# rows 9..20 of the first feature matrix (rows x frames)
rows1 = F1[9:21]
print(len(rows1), len(rows1[0]))

print("\n\nCOUNT 2\n")
[Fs2, x2] = audioBasicIO.readAudioFile("data/practice2.wav")
F2 = audioFeatureExtraction.stFeatureExtraction(x2, Fs2, 0.050 * Fs2,
                                                0.025 * Fs2)
rows2 = F2[9:21]
print(len(rows2), len(rows2[0]))

# both matrices are truncated to the shorter number of frames
size = min(len(rows2[0]), len(rows1[0]))
print(size, "\n")

cut1 = F1[9:21, 0:size]
cut2 = F2[9:21, 0:size]

print("\n\nCORRCOEF\n")
# correlation coefficients are rescaled from [-1, 1] to [0, 1]
print(numpy.corrcoef(cut1, cut2) * 0.5 + 0.5)

print("\n\nE Distance\n")
print(numpy.linalg.norm(cut1 - cut2))
def speakerDiarization(filename, n_speakers, mt_size=2.0, mt_step=0.2,
                       st_win=0.05, lda_dim=35, plot_res=False):
    '''
    Assign a speaker (cluster) label to every mid-term window of a WAV file.

    Pipeline, as implemented below: mid-term feature extraction, appending
    the class probabilities of two pre-trained kNN models, selection of a
    fixed subset of feature rows, normalization, outlier removal, optional
    LDA projection, k-means clustering for every candidate number of
    speakers (the candidate with the highest average silhouette-style score
    is kept), HMM smoothing and two median filters.

    ARGUMENTS:
        - filename:        the name of the WAV file to be analyzed
        - n_speakers       the number of speakers (clusters) in the
                           recording (<=0 for unknown: 2 to 9 are tried)
        - mt_size (opt)    mid-term window size
        - mt_step (opt)    mid-term window step
        - st_win  (opt)    short-term window size
        - lda_dim (opt)    LDA dimension (0 for no LDA)
        - plot_res (opt)   False for not plotting the results, True for
                           plotting (the figure is saved to
                           'output/outImg.jpg')
    RETURNS:
        - cls:             array with one cluster label per mid-term window

    If a "<filename with .wav replaced by .segments>" ground-truth file
    exists, cluster purity and speaker purity are printed.
    '''
    [fs, x] = audioBasicIO.readAudioFile(filename)
    x = audioBasicIO.stereo2mono(x)
    duration = len(x) / fs

    # Alternative (disabled): load the models relative to this module:
    # [classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.load_model_knn(os.path.join(os.path.dirname(os.path.realpath(__file__)), "data", "knnSpeakerAll"))
    # [classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.load_model_knn(os.path.join(os.path.dirname(os.path.realpath(__file__)), "data", "knnSpeakerFemaleMale"))
    # NOTE: the model paths below are relative to the current working directory
    [
        classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1,
        stStep1, computeBEAT1
    ] = aT.load_model_knn("data/knnSpeakerAll")
    [
        classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2,
        stStep2, computeBEAT2
    ] = aT.load_model_knn("data/knnSpeakerFemaleMale")

    # short-term step is half the short-term window
    [mt_feats, st_feats, _] = aF.mtFeatureExtraction(x, fs, mt_size * fs,
                                                     mt_step * fs,
                                                     round(fs * st_win),
                                                     round(fs * st_win * 0.5))

    # extended feature matrix: original features followed by the class
    # probabilities of the two kNN models (one column per mid-term window)
    MidTermFeatures2 = numpy.zeros(
        (mt_feats.shape[0] + len(classNames1) + len(classNames2),
         mt_feats.shape[1]))

    for i in range(mt_feats.shape[1]):
        # each model uses its own normalization statistics
        cur_f1 = (mt_feats[:, i] - MEAN1) / STD1
        cur_f2 = (mt_feats[:, i] - MEAN2) / STD2
        [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
        [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
        MidTermFeatures2[0:mt_feats.shape[0], i] = mt_feats[:, i]
        # a small constant is added to the probabilities before storing them
        MidTermFeatures2[mt_feats.shape[0]:mt_feats.shape[0] +
                         len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[mt_feats.shape[0] +
                         len(classNames1)::, i] = P2 + 0.0001

    mt_feats = MidTermFeatures2  # TODO
    # indices of the feature rows kept for clustering
    iFeaturesSelect = [
        8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45,
        46, 47, 48, 49, 50, 51, 52, 53
    ]

    mt_feats = mt_feats[iFeaturesSelect, :]

    (mt_feats_norm, MEAN, STD) = aT.normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    n_wins = mt_feats.shape[1]

    # remove outliers: a window is kept when the sum of its distances to all
    # other windows is below 1.2 times the mean of those sums
    dist_all = numpy.sum(distance.squareform(distance.pdist(mt_feats_norm.T)),
                         axis=0)
    m_dist_all = numpy.mean(dist_all)
    i_non_outliers = numpy.nonzero(dist_all < 1.2 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = numpy.min(mt_feats[1,:])
    #EnergyMean = numpy.mean(mt_feats[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #i_non_outliers = numpy.nonzero(mt_feats[1,:] > Thres)[0]
    #print i_non_outliers

    # percentage of windows flagged as outliers (not used further below)
    perOutLier = (100.0 * (n_wins - i_non_outliers.shape[0])) / n_wins
    # keep the full normalized matrix: it is needed again for HMM smoothing
    mt_feats_norm_or = mt_feats_norm
    mt_feats_norm = mt_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        #[mt_feats_to_red, _, _] = aF.mtFeatureExtraction(x, fs, mt_size * fs, st_win * fs, round(fs*st_win), round(fs*st_win));
        # extract mid-term features with minimum step:
        mt_win_ratio = int(round(mt_size / st_win))
        mt_step_ratio = int(round(st_win / st_win))  # always 1
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        #for i in range(num_of_stats * num_of_features + 1):
        for i in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        for i in range(
                num_of_features):  # for each of the short-term features:
            curPos = 0
            N = len(st_feats[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mt_win_ratio
                if N2 > N:
                    N2 = N
                curStFeatures = st_feats[i][N1:N2]
                # row i holds the means, row i + num_of_features the stds
                mt_feats_to_red[i].append(numpy.mean(curStFeatures))
                mt_feats_to_red[i + num_of_features].append(
                    numpy.std(curStFeatures))
                curPos += mt_step_ratio
        mt_feats_to_red = numpy.array(mt_feats_to_red)
        # same kNN-probability extension as applied to mt_feats above
        mt_feats_to_red_2 = numpy.zeros(
            (mt_feats_to_red.shape[0] + len(classNames1) + len(classNames2),
             mt_feats_to_red.shape[1]))
        for i in range(mt_feats_to_red.shape[1]):
            cur_f1 = (mt_feats_to_red[:, i] - MEAN1) / STD1
            cur_f2 = (mt_feats_to_red[:, i] - MEAN2) / STD2
            [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
            [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0],
                              i] = mt_feats_to_red[:, i]
            mt_feats_to_red_2[
                mt_feats_to_red.shape[0]:mt_feats_to_red.shape[0] +
                len(classNames1), i] = P1 + 0.0001
            mt_feats_to_red_2[mt_feats_to_red.shape[0] +
                              len(classNames1)::, i] = P2 + 0.0001
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[iFeaturesSelect, :]
        #mt_feats_to_red += numpy.random.rand(mt_feats_to_red.shape[0], mt_feats_to_red.shape[1]) * 0.0000010
        (mt_feats_to_red, MEAN,
         STD) = aT.normalizeFeatures([mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        #dist_all = numpy.sum(distance.squareform(distance.pdist(mt_feats_to_red.T)), axis=0)
        #m_dist_all = numpy.mean(dist_all)
        #iNonOutLiers2 = numpy.nonzero(dist_all < 3.0*m_dist_all)[0]
        #mt_feats_to_red = mt_feats_to_red[:, iNonOutLiers2]
        # LDA training labels are derived from the window index alone
        Labels = numpy.zeros((mt_feats_to_red.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / st_win
        #print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * st_win / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, Labels)
        # project the (outlier-free) clustering features with the fitted LDA
        mt_feats_norm = (clf.transform(mt_feats_norm.T)).T

    # candidate numbers of speakers
    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    clsAll = []
    sil_all = []
    centersAll = []

    for iSpeakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(mt_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        # Y = distance.squareform(distance.pdist(mt_feats_norm.T))
        clsAll.append(cls)
        centersAll.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(iSpeakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = numpy.nonzero(cls == c)[0].shape[0] / \
                float(len(cls))
            # clusters holding less than 2% of the windows get zero scores
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mt_feats_norm[:, cls == c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                Yt = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(numpy.mean(Yt) * clust_per_cent)
                silBs = []
                for c2 in range(iSpeakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = numpy.nonzero(cls == c2)[0].shape[0] /\
                            float(len(cls))
                        MidTermFeaturesNormTemp2 = mt_feats_norm[:, cls == c2]
                        Yt = distance.cdist(mt_feats_norm_temp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(
                            numpy.mean(Yt) *
                            (clust_per_cent + clust_per_cent_2) / 2.0)
                silBs = numpy.array(silBs)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                # NOTE(review): silBs is empty when iSpeakers == 1, in which
                # case min() raises - confirm callers never pass n_speakers=1
                sil_2.append(min(silBs))
        sil_1 = numpy.array(sil_1)
        sil_2 = numpy.array(sil_2)
        sil = []
        for c in range(iSpeakers):
            # for each cluster (speaker) compute silhouette
            # (the small constant keeps the denominator non-zero)
            sil.append(
                (sil_2[c] - sil_1[c]) / (max(sil_2[c], sil_1[c]) + 0.00001))
        # keep the AVERAGE SILLOUETTE
        sil_all.append(numpy.mean(sil))

    imax = numpy.argmax(sil_all)
    # optimal number of clusters
    nSpeakersFinal = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = numpy.zeros((n_wins, ))
    for i in range(n_wins):
        j = numpy.argmin(numpy.abs(i - i_non_outliers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):  # a single smoothing pass
        # hmm training
        start_prob, transmat, means, cov = \
            trainHMM_computeStatistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = sil_all[imax]
    class_names = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]

    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if ground truth exists
    if os.path.isfile(gt_file):
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start, seg_end, seg_labs,
                                              mt_step)

    if plot_res:
        fig = plt.figure()
        # a second subplot is reserved for the silhouette curve when the
        # number of speakers is unknown
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(numpy.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        # labels are drawn at the centre of each mid-term step
        ax1.plot(numpy.array(range(len(cls))) * mt_step + mt_step / 2.0, cls)

    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(
                numpy.array(range(len(flags_gt))) * mt_step + mt_step / 2.0,
                flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
            evaluateSpeakerDiarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(
                          100 * purity_cluster_m, 100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        #print s_range, sil_all
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's sillouette")
        #plt.show()
        # the figure is written to disk instead of being shown
        plt.savefig('output/outImg.jpg')
    return cls
def mtFileClassification(input_file, model_name, model_type,
                         plot_results=False, gt_file=""):
    '''
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used, i.e. a pre-trained
    classifier.

    ARGUMENTS:
        - input_file:        path of the input WAV file
        - model_name:        name of the classification model
        - model_type:        "knn" for a kNN model; any other value loads
                             the model with aT.load_model
        - plot_results:      True if results are to be plotted using
                             matplotlib along with a set of statistics
        - gt_file:           (opt) path of a ground-truth segment file
    RETURNS:
        A 4-tuple (flags_ind, class_names, acc, cm):
        - flags_ind:         array with one (smoothed) class index per
                             mid-term window
        - class_names:       the ground-truth class names when an accuracy
                             could be computed (acc >= 0), otherwise the
                             class names of the model
        - acc:               value returned by plotSegmentationResults
        - cm:                confusion matrix (rows: ground truth, columns:
                             prediction), or [] without ground truth
        (-1, -1, -1, -1) is returned if the model file is missing, the model
        uses beat features, or the audio file cannot be read.
    '''
    if not os.path.isfile(model_name):
        # it is the model *file* (model_name) that is missing here
        print("mtFileClassificationError: input model_name not found!")
        return (-1, -1, -1, -1)

    # Load classifier:
    if model_type == "knn":
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win,
         st_step, compute_beat] = aT.load_model_knn(model_name)
    else:
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win,
         st_step, compute_beat] = aT.load_model(model_name)

    if compute_beat:
        print("Model " + model_name + " contains long-term music features "
              "(beat etc) and cannot be used in "
              "segmentation")
        return (-1, -1, -1, -1)

    [fs, x] = audioBasicIO.readAudioFile(input_file)  # load input file
    if fs == -1:  # could not read file
        return (-1, -1, -1, -1)
    x = audioBasicIO.stereo2mono(x)  # convert stereo (if) to mono

    # mid-term feature extraction:
    [mt_feats, _, _] = aF.mtFeatureExtraction(x, fs, mt_win * fs,
                                              mt_step * fs,
                                              round(fs * st_win),
                                              round(fs * st_step))
    flags = []
    Ps = []
    flags_ind = []
    # for each feature vector (i.e. for each fix-sized segment):
    for i in range(mt_feats.shape[1]):
        # normalize current feature vector
        cur_fv = (mt_feats[:, i] - MEAN) / STD
        # classify vector
        [res, P] = aT.classifierWrapper(classifier, model_type, cur_fv)
        flags_ind.append(res)
        flags.append(class_names[int(res)])  # update class label matrix
        Ps.append(numpy.max(P))              # update probability matrix
    flags_ind = numpy.array(flags_ind)

    # 1-window smoothing: a window whose two neighbours agree takes
    # their class
    for i in range(1, len(flags_ind) - 1):
        if flags_ind[i - 1] == flags_ind[i + 1]:
            flags_ind[i] = flags_ind[i + 1]

    # convert fix-sized flags to segments and classes
    # NOTE: kept as in the original; the result is not used below
    (segs, classes) = flags2segs(flags, mt_step)
    segs[-1] = len(x) / float(fs)

    # Load ground-truth:
    if os.path.isfile(gt_file):
        [seg_start_gt, seg_end_gt, seg_l_gt] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start_gt, seg_end_gt,
                                              seg_l_gt, mt_step)
        flags_ind_gt = []
        for fl in flags_gt:
            # "align" labels with GT: map every ground-truth label to its
            # index in the model's class list (-1 if the model lacks it)
            gt_name = class_names_gt[fl]
            if gt_name in class_names:
                flags_ind_gt.append(class_names.index(gt_name))
            else:
                flags_ind_gt.append(-1)
        flags_ind_gt = numpy.array(flags_ind_gt)
        cm = numpy.zeros((len(class_names_gt), len(class_names_gt)))
        for i in range(min(flags_ind.shape[0], flags_ind_gt.shape[0])):
            cm[int(flags_ind_gt[i]), int(flags_ind[i])] += 1
    else:
        cm = []
        flags_ind_gt = numpy.array([])

    acc = plotSegmentationResults(flags_ind, flags_ind_gt, class_names,
                                  mt_step, not plot_results)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc))
        return (flags_ind, class_names_gt, acc, cm)
    else:
        return (flags_ind, class_names, acc, cm)
import audioBasicIO
import audioFeatureExtraction
import matplotlib.pyplot as plt
import audioTrainTest as aT

# Script: extract short-term features from one recording (optionally plot
# the first four of them) and classify the recording by music genre.

plot = False

[Fs, x] = audioBasicIO.readAudioFile("data/Heavy.wav")
# 50 msec window, 25 msec step
F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050 * Fs, 0.025 * Fs)

if plot:
    # one subplot per feature row, in a 2 x 2 grid.
    # ZCR: The rate of sign-changes of the signal during the duration of a
    # particular frame.
    panels = ('ZCR', 'Energy', 'Entropy of Energy', 'Spectral Centroid')
    for row, ylabel in enumerate(panels):
        plt.subplot(2, 2, row + 1)
        plt.plot(F[row, :])
        plt.xlabel('Frame no')
        plt.ylabel(ylabel)
    plt.show()

Result, P, classNames = aT.fileClassification("data/Heavy.wav",
                                              "data/svmMusicGenre3", "svm")
print(Result)
def classifyFolderWrapper(inputFolder, modelType, modelName, outputMode=False):
    """
    Classify every WAV file of a folder and aggregate the results.

    ARGUMENTS:
     - inputFolder: a directory, or a path prefix to which "*.wav" is
                    appended
     - modelType:   'svm' or 'knn'
     - modelName:   path of the classification model
     - outputMode:  if True, print the per-file decisions and the class
                    histogram, and show a bar plot of the class percentages
    RETURNS:
     - (classNames, PsAll), where PsAll holds the class probabilities of all
       files weighted by file duration and normalized to sum to 1;
       None if no WAV file is found
    RAISES:
     - Exception if modelName does not exist
    """
    if not os.path.isfile(modelName):
        raise Exception("Input modelName not found!")

    # the model is loaded here only for its class names
    if modelType == 'svm':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
         compute_beat] = aT.load_model(modelName)
    elif modelType == 'knn':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
         compute_beat] = aT.load_model_knn(modelName)

    PsAll = numpy.zeros((len(classNames), ))

    files = "*.wav"
    if os.path.isdir(inputFolder):
        strFilePattern = os.path.join(inputFolder, files)
    else:
        strFilePattern = inputFolder + files

    wavFilesList = sorted(glob.glob(strFilePattern))
    if not wavFilesList:
        print("No WAV files found!")
        return

    Results = []
    for wavFile in wavFilesList:
        [Fs, x] = audioBasicIO.readAudioFile(wavFile)
        # duration in seconds, used to weight the file's probabilities
        signalLength = x.shape[0] / float(Fs)
        [Result, P, classNames] = aT.file_classification(wavFile, modelName,
                                                         modelType)
        PsAll += (numpy.array(P) * signalLength)
        Result = int(Result)
        Results.append(Result)
        if outputMode:
            print("{0:s}\t{1:s}".format(wavFile, classNames[Result]))
    Results = numpy.array(Results)

    # print distribution of classes:
    numOfClasses = len(classNames)
    [Histogram, _] = numpy.histogram(Results,
                                     bins=numpy.arange(numOfClasses + 1))
    if outputMode:
        for className, count in zip(classNames, Histogram):
            print("{0:20s}\t\t{1:d}".format(className, count))

    PsAll = PsAll / numpy.sum(PsAll)

    if outputMode:
        fig = plt.figure()
        ax = fig.add_subplot(111)
        plt.title("Classes percentage " + inputFolder.replace('Segments', ''))
        ax.axis((0, len(classNames) + 1, 0, 1))
        ax.set_xticks(numpy.array(range(len(classNames) + 1)))
        ax.set_xticklabels([" "] + classNames)
        ax.bar(numpy.array(range(len(classNames))) + 0.5, PsAll)
        plt.show()

    return classNames, PsAll
def speakerDiarization(fileName, numOfSpeakers, mtSize = 2.0, mtStep=0.2, stWin=0.05, LDAdim = 35, PLOT = False):
    '''
    Assign a speaker (cluster) label to every mid-term window of a WAV file.

    Pipeline, as implemented below: mid-term feature extraction, appending
    the class probabilities of two pre-trained kNN models, selection of a
    fixed subset of feature rows, normalization, outlier removal, optional
    LDA projection, k-means clustering for every candidate number of
    speakers (the candidate with the highest average silhouette-style score
    is kept), HMM smoothing and two median filters.

    ARGUMENTS:
        - fileName:        the name of the WAV file to be analyzed
        - numOfSpeakers    the number of speakers (clusters) in the
                           recording (<=0 for unknown: 2 to 9 are tried)
        - mtSize (opt)     mid-term window size
        - mtStep (opt)     mid-term window step
        - stWin  (opt)     short-term window size
        - LDAdim (opt)     LDA dimension (0 for no LDA)
        - PLOT   (opt)     False for not plotting the results, True for
                           plotting

    This variant has no return statement (it returns None). If a
    "<fileName with .wav replaced by .segments>" ground-truth file exists,
    cluster purity and speaker purity are printed.
    '''
    [Fs, x] = audioBasicIO.readAudioFile(fileName)
    x = audioBasicIO.stereo2mono(x);
    Duration = len(x) / Fs

    # NOTE: the model paths below are relative to the current working directory
    [Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.loadKNNModel("data/knnSpeakerAll")
    [Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.loadKNNModel("data/knnSpeakerFemaleMale")

    # short-term step is half the short-term window
    [MidTermFeatures, ShortTermFeatures] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, mtStep * Fs, round(Fs*stWin), round(Fs*stWin*0.5));

    # extended feature matrix: original features followed by the class
    # probabilities of the two kNN models (one column per mid-term window)
    MidTermFeatures2 = numpy.zeros( (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2), MidTermFeatures.shape[1] ) )

    for i in range(MidTermFeatures.shape[1]):
        # each model uses its own normalization statistics
        curF1 = (MidTermFeatures[:,i] - MEAN1) / STD1
        curF2 = (MidTermFeatures[:,i] - MEAN2) / STD2
        [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
        [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
        MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
        # a small constant is added to the probabilities before storing them
        MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0]+len(classNames1), i] = P1 + 0.0001;
        MidTermFeatures2[MidTermFeatures.shape[0]+len(classNames1)::, i] = P2 + 0.0001;

    MidTermFeatures = MidTermFeatures2    # TODO
    # SELECT FEATURES (indices of the feature rows kept for clustering; only
    # SET 1A is active, the other sets are disabled alternatives):
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20];                                                                                         # SET 0A
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100];                                                                                 # SET 0B
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 0C
    iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53];                                                   # SET 1A
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];                                          # SET 1B
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 1C
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53];             # SET 2A
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];     # SET 2B
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 2C
    #iFeaturesSelect = range(100);                                                                                                    # SET 3
    #MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010

    MidTermFeatures = MidTermFeatures[iFeaturesSelect,:]

    (MidTermFeaturesNorm, MEAN, STD) = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]

    # remove outliers: a window is kept when the sum of its distances to all
    # other windows is below 1.2 times the mean of those sums
    DistancesAll = numpy.sum(distance.squareform(distance.pdist(MidTermFeaturesNorm.T)), axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2*MDistancesAll)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = numpy.min(MidTermFeatures[1,:])
    #EnergyMean = numpy.mean(MidTermFeatures[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #iNonOutLiers = numpy.nonzero(MidTermFeatures[1,:] > Thres)[0]
    #print iNonOutLiers

    # percentage of windows flagged as outliers (not used further below)
    perOutLier = (100.0*(numOfWindows-iNonOutLiers.shape[0])) / numOfWindows
    # keep the full normalized matrix: it is needed again for HMM smoothing
    MidTermFeaturesNormOr = MidTermFeaturesNorm
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

    # LDA dimensionality reduction:
    if LDAdim > 0:
        #[mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, stWin * Fs, round(Fs*stWin), round(Fs*stWin));
        # extract mid-term features with minimum step:
        mtWinRatio  = int(round(mtSize  / stWin));
        mtStepRatio = int(round(stWin / stWin));  # always 1
        mtFeaturesToReduce = []
        numOfFeatures = len(ShortTermFeatures)
        numOfStatistics = 2;
        #for i in range(numOfStatistics * numOfFeatures + 1):
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append([])

        for i in range(numOfFeatures):        # for each of the short-term features:
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos<N):
                N1 = curPos
                N2 = curPos + mtWinRatio
                if N2 > N:
                    N2 = N
                curStFeatures = ShortTermFeatures[i][N1:N2]
                # row i holds the means, row i + numOfFeatures the stds
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i+numOfFeatures].append(numpy.std(curStFeatures))
                curPos += mtStepRatio
        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
        # same kNN-probability extension as applied to MidTermFeatures above
        mtFeaturesToReduce2 = numpy.zeros( (mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2), mtFeaturesToReduce.shape[1] ) )
        for i in range(mtFeaturesToReduce.shape[1]):
            curF1 = (mtFeaturesToReduce[:,i] - MEAN1) / STD1
            curF2 = (mtFeaturesToReduce[:,i] - MEAN2) / STD2
            [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
            [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
            mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i]
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0]+len(classNames1), i] = P1 + 0.0001;
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]+len(classNames1)::, i] = P2 + 0.0001;
        mtFeaturesToReduce = mtFeaturesToReduce2
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect,:]
        #mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
        (mtFeaturesToReduce, MEAN, STD) = aT.normalizeFeatures([mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T
        #DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
        #MDistancesAll = numpy.mean(DistancesAll)
        #iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0*MDistancesAll)[0]
        #mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]
        # LDA training labels are derived from the window index alone
        Labels = numpy.zeros((mtFeaturesToReduce.shape[1],));
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin
        #print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i*stWin/LDAstepRatio);
        clf = LDA(n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels, tol=0.000001)
        # project the (outlier-free) clustering features with the fitted LDA
        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

    # candidate numbers of speakers
    if numOfSpeakers<=0:
        sRange = range(2,10)
    else:
        sRange = [numOfSpeakers]
    clsAll = []; silAll = []; centersAll = []

    for iSpeakers in sRange:
        cls, means, steps = mlpy.kmeans(MidTermFeaturesNorm.T, k=iSpeakers, plus=True)        # perform k-means clustering

        #YDist =   distance.pdist(MidTermFeaturesNorm.T, metric='euclidean')
        #print distance.squareform(YDist).shape
        #hc = mlpy.HCluster()
        #hc.linkage(YDist)
        #cls = hc.cut(14.5)
        #print cls

        # Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
        clsAll.append(cls)
        centersAll.append(means)
        silA = []; silB = []
        for c in range(iSpeakers):                                # for each speaker (i.e. for each extracted cluster)
            clusterPerCent = numpy.nonzero(cls==c)[0].shape[0] / float(len(cls))
            # clusters holding less than 2% of the windows get zero scores
            if clusterPerCent < 0.020:
                silA.append(0.0)
                silB.append(0.0)
            else:
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:,cls==c]            # get subset of feature vectors
                Yt = distance.pdist(MidTermFeaturesNormTemp.T)                # compute average distance between samples that belong to the cluster (a values)
                silA.append(numpy.mean(Yt)*clusterPerCent)
                silBs = []
                for c2 in range(iSpeakers):                        # compute distances from samples of other clusters
                    if c2!=c:
                        clusterPerCent2 = numpy.nonzero(cls==c2)[0].shape[0] / float(len(cls))
                        MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:,cls==c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T)
                        silBs.append(numpy.mean(Yt)*(clusterPerCent+clusterPerCent2)/2.0)
                silBs = numpy.array(silBs)
                # NOTE(review): silBs is empty when iSpeakers == 1, in which
                # case min() raises - confirm callers never pass numOfSpeakers=1
                silB.append(min(silBs))                            # ... and keep the minimum value (i.e. the distance from the "nearest" cluster)
        silA = numpy.array(silA);
        silB = numpy.array(silB);
        sil = []
        for c in range(iSpeakers):                                # for each cluster (speaker)
            sil.append( ( silB[c] - silA[c]) / (max(silB[c],  silA[c])+0.00001)  )        # compute silhouette (the small constant keeps the denominator non-zero)

        silAll.append(numpy.mean(sil))                                # keep the AVERAGE SILLOUETTE

    #silAll = silAll * (1.0/(numpy.power(numpy.array(sRange),0.5)))
    imax = numpy.argmax(silAll)                                    # position of the maximum sillouette value
    nSpeakersFinal = sRange[imax]                                    # optimal number of clusters

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows: this is achieved by giving them the value of their nearest non-outlier window)
    cls = numpy.zeros((numOfWindows,))
    for i in range(numOfWindows):
        j = numpy.argmin(numpy.abs(i-iNonOutLiers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):  # a single smoothing pass
        startprob, transmat, means, cov = trainHMM_computeStatistics(MidTermFeaturesNormOr, cls)
        hmm = sklearn.hmm.GaussianHMM(startprob.shape[0], "diag", startprob, transmat)            # hmm training
        hmm.means_ = means; hmm.covars_ = cov
        cls = hmm.predict(MidTermFeaturesNormOr.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = silAll[imax]                                        # final sillouette
    classNames = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)];

    # load ground-truth if available
    gtFile = fileName.replace('.wav', '.segments');                            # open for annotated file
    if os.path.isfile(gtFile):                                    # if ground truth exists
        [segStart, segEnd, segLabels] = readSegmentGT(gtFile)                    # read GT data
        flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels, mtStep)            # convert to flags

    if PLOT:
        fig = plt.figure()
        # a second subplot is reserved for the silhouette curve when the
        # number of speakers is unknown
        if numOfSpeakers>0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(numpy.array(range(len(classNames))))
        ax1.axis((0, Duration, -1, len(classNames)))
        ax1.set_yticklabels(classNames)
        # labels are drawn at the centre of each mid-term step
        ax1.plot(numpy.array(range(len(cls)))*mtStep+mtStep/2.0, cls)

    if os.path.isfile(gtFile):
        if PLOT:
            ax1.plot(numpy.array(range(len(flagsGT)))*mtStep+mtStep/2.0, flagsGT, 'r')
        purityClusterMean, puritySpeakerMean = evaluateSpeakerDiarization(cls, flagsGT)
        print "{0:.1f}\t{1:.1f}".format(100*purityClusterMean, 100*puritySpeakerMean)
        if PLOT:
            plt.title("Cluster purity: {0:.1f}% - Speaker purity: {1:.1f}%".format(100*purityClusterMean, 100*puritySpeakerMean) )
    if PLOT:
        plt.xlabel("time (seconds)")
        #print sRange, silAll
        if numOfSpeakers<=0:
            plt.subplot(212)
            plt.plot(sRange, silAll)
            plt.xlabel("number of clusters");
            plt.ylabel("average clustering's sillouette");
        plt.show()
def main(argv):
    """
    Command-line dispatcher: run the task selected by argv[1].

    ARGUMENTS:
     - argv:    full argument list (sys.argv style). argv[0] is the program
                name (only used in syntax messages), argv[1] is the command
                flag and the remaining items are that command's arguments.

    Supported commands: -dirMp3toWAV, -dirWAVChangeFs, -featureExtractionFile,
    -beatExtraction, -featureExtractionDir, -featureVisualizationDir,
    -fileSpectrogram, -fileChromagram, -trainClassifier, -trainRegression,
    -classifyFile, -regressionFile, -classifyFolder, -regressionFolder,
    -trainHMMsegmenter_fromfile, -trainHMMsegmenter_fromdir,
    -segmentClassifyFileHMM, -segmentClassifyFile, -segmentationEvaluation,
    -silenceRemoval, -speakerDiarization, -speakerDiarizationScriptEval and
    -thumbnail.

    A wrong number of arguments prints the syntax of the command and returns.
    Missing input paths / unsupported model types raise Exception, except for
    the HMM-training commands, which print an error and return.

    Every print uses a single parenthesised argument so the function parses
    under both Python 2 and Python 3.
    """
    if len(argv) < 2:
        # nothing to dispatch on (argv[1] would raise IndexError)
        print("Error: no command specified.")
        return

    if argv[1] == "-dirMp3toWAV":                 # convert mp3 to wav (batch)
        if len(argv) == 5:
            path = argv[2]
            if argv[3] not in ["8000", "16000", "32000", "44100"]:
                print("Error. Unsupported sampling rate (must be: 8000, 16000, 32000 or 44100).")
                return
            if argv[4] not in ["1", "2"]:
                print("Error. Number of output channels must be 1 or 2")
                return
            if not os.path.isdir(path):
                raise Exception("Input path not found!")
            useMp3TagsAsNames = True
            audioBasicIO.convertDirMP3ToWav(path, int(argv[3]), int(argv[4]), useMp3TagsAsNames)
        else:
            print("Error.\nSyntax: " + argv[0] + " -dirMp3toWAV <dirName> <sampling Freq> <numOfChannels>")

    elif argv[1] == "-dirWAVChangeFs":            # change sampling rate / channels of wav files (batch)
        if len(argv) == 5:
            path = argv[2]
            if argv[3] not in ["8000", "16000", "32000", "44100"]:
                print("Error. Unsupported sampling rate (must be: 8000, 16000, 32000 or 44100).")
                return
            if argv[4] not in ["1", "2"]:
                print("Error. Number of output channels must be 1 or 2")
                return
            if not os.path.isdir(path):
                raise Exception("Input path not found!")
            audioBasicIO.convertFsDirWavToWav(path, int(argv[3]), int(argv[4]))
        else:
            # report this command's own name (not -dirMp3toWAV)
            print("Error.\nSyntax: " + argv[0] + " -dirWAVChangeFs <dirName> <sampling Freq> <numOfChannels>")

    elif argv[1] == "-featureExtractionFile":     # short-term and mid-term feature extraction to files (csv and numpy)
        if len(argv) == 7:
            wavFileName = argv[2]
            if not os.path.isfile(wavFileName):
                raise Exception("Input audio file not found!")
            if not (uT.isNum(argv[3]) and uT.isNum(argv[4]) and uT.isNum(argv[5]) and uT.isNum(argv[6])):
                raise Exception("Mid-term and short-term window sizes and steps must be numbers!")
            mtWin = float(argv[3])
            mtStep = float(argv[4])
            stWin = float(argv[5])
            stStep = float(argv[6])
            outFile = wavFileName
            aF.mtFeatureExtractionToFile(wavFileName, mtWin, mtStep, stWin, stStep, outFile, True, True, True)
        else:
            print("Error.\nSyntax: " + argv[0] + " -featureExtractionFile <wavFileName> <mtWin> <mtStep> <stWin> <stStep>")

    elif argv[1] == "-beatExtraction":
        if len(argv) == 4:
            wavFileName = argv[2]
            if not os.path.isfile(wavFileName):
                raise Exception("Input audio file not found!")
            if not (uT.isNum(argv[3])):
                raise Exception("PLOT must be either 0 or 1")
            if not ((int(argv[3]) == 0) or (int(argv[3]) == 1)):
                raise Exception("PLOT must be either 0 or 1")
            [Fs, x] = audioBasicIO.readAudioFile(wavFileName)
            F = aF.stFeatureExtraction(x, Fs, 0.050 * Fs, 0.050 * Fs)
            BPM, ratio = aF.beatExtraction(F, 0.050, int(argv[3]) == 1)
            print("Beat: {0:d} bpm ".format(int(BPM)))
            print("Ratio: {0:.2f} ".format(ratio))
        else:
            print("Error.\nSyntax: " + argv[0] + " -beatExtraction <wavFileName> <PLOT (0 or 1)>")

    elif argv[1] == '-featureExtractionDir':      # same as -featureExtractionFile, for each WAV file of a folder
        if len(argv) == 7:
            path = argv[2]
            if not os.path.isdir(path):
                raise Exception("Input path not found!")
            if not (uT.isNum(argv[3]) and uT.isNum(argv[4]) and uT.isNum(argv[5]) and uT.isNum(argv[6])):
                raise Exception("Mid-term and short-term window sizes and steps must be numbers!")
            mtWin = float(argv[3])
            mtStep = float(argv[4])
            stWin = float(argv[5])
            stStep = float(argv[6])
            aF.mtFeatureExtractionToFileDir(path, mtWin, mtStep, stWin, stStep, True, True, True)
        else:
            print("Error.\nSyntax: " + argv[0] + " -featureExtractionDir <path> <mtWin> <mtStep> <stWin> <stStep>")

    elif argv[1] == '-featureVisualizationDir':   # visualize content relationships between recordings of a folder
        if len(argv) == 3:
            if not os.path.isdir(argv[2]):
                raise Exception("Input folder not found!")
            aV.visualizeFeaturesFolder(argv[2], "pca", "")
        else:
            # previously a wrong argument count was silently ignored
            print("Error.\nSyntax: " + argv[0] + " -featureVisualizationDir <dirName>")

    elif argv[1] == '-fileSpectrogram':           # show spectrogram of a sound stored in a file
        if len(argv) == 3:
            wavFileName = argv[2]
            if not os.path.isfile(wavFileName):
                raise Exception("Input audio file not found!")
            [Fs, x] = audioBasicIO.readAudioFile(wavFileName)
            x = audioBasicIO.stereo2mono(x)
            specgram, TimeAxis, FreqAxis = aF.stSpectogram(x, Fs, round(Fs * 0.040), round(Fs * 0.040), True)
        else:
            print("Error.\nSyntax: " + argv[0] + " -fileSpectrogram <fileName>")

    elif argv[1] == '-fileChromagram':            # show chromagram of a sound stored in a file
        if len(argv) == 3:
            wavFileName = argv[2]
            if not os.path.isfile(wavFileName):
                raise Exception("Input audio file not found!")
            [Fs, x] = audioBasicIO.readAudioFile(wavFileName)
            x = audioBasicIO.stereo2mono(x)
            specgram, TimeAxis, FreqAxis = aF.stChromagram(x, Fs, round(Fs * 0.040), round(Fs * 0.040), True)
        else:
            # report this command's own name (not -fileSpectrogram)
            print("Error.\nSyntax: " + argv[0] + " -fileChromagram <fileName>")

    elif argv[1] == "-trainClassifier":           # segment classifier training
        if len(argv) > 6:
            method = argv[2]
            beatFeatures = (int(argv[3]) == 1)
            listOfDirs = argv[4:len(argv) - 1]    # everything between the flags and the model name
            modelName = argv[-1]
            aT.featureAndTrain(listOfDirs, 1, 1, aT.shortTermWindow, aT.shortTermStep,
                               method.lower(), modelName, computeBEAT=beatFeatures)
        else:
            print("Error.\nSyntax: " + argv[0] + " -trainClassifier <method(svm or knn)> <beat features> <directory 1> <directory 2> ... <directory N> <modelName>")

    elif argv[1] == "-trainRegression":           # segment regression model
        if len(argv) == 6:
            method = argv[2]
            beatFeatures = (int(argv[3]) == 1)
            dirName = argv[4]
            modelName = argv[5]
            aT.featureAndTrainRegression(dirName, 1, 1, aT.shortTermWindow, aT.shortTermStep,
                                         method.lower(), modelName, computeBEAT=beatFeatures)
        else:
            print("Error.\nSyntax: " + argv[0] + " -trainRegression <method(svm or knn)> <beat features> <directory> <modelName>")

    elif argv[1] == "-classifyFile":              # single file classification
        if len(argv) == 5:
            modelType = argv[2]
            modelName = argv[3]
            inputFile = argv[4]
            if modelType not in ["svm", "knn"]:
                raise Exception("ModelType has to be either svm or knn!")
            if not os.path.isfile(modelName):
                raise Exception("Input modelName not found!")
            if not os.path.isfile(inputFile):
                raise Exception("Input audio file not found!")
            [Result, P, classNames] = aT.fileClassification(inputFile, modelName, modelType)
            print("{0:s}\t{1:s}".format("Class", "Probability"))
            for i, c in enumerate(classNames):
                print("{0:s}\t{1:.2f}".format(c, P[i]))
            print("Winner class: " + classNames[int(Result)])
        else:
            print("Error.\nSyntax: " + argv[0] + " -classifyFile <method(svm or knn)> <modelName> <fileName>")

    elif argv[1] == "-regressionFile":            # single file regression
        if len(argv) == 5:
            modelType = argv[2]
            modelName = argv[3]
            inputFile = argv[4]
            if modelType not in ["svm", "knn"]:
                raise Exception("ModelType has to be either svm or knn!")
            if not os.path.isfile(inputFile):
                raise Exception("Input audio file not found!")
            R, regressionNames = aT.fileRegression(inputFile, modelName, modelType)
            for i in range(len(R)):
                print("{0:s}\t{1:.3f}".format(regressionNames[i], R[i]))
        else:
            print("Error.\nSyntax: " + argv[0] + " -regressionFile <method(svm or knn)> <modelName> <fileName>")

    elif argv[1] == "-classifyFolder":            # directory classification
        if len(argv) == 6 or len(argv) == 5:
            modelType = argv[2]
            modelName = argv[3]
            inputFolder = argv[4]
            if len(argv) == 6:
                outputMode = argv[5]
            else:
                outputMode = "0"
            if modelType not in ["svm", "knn"]:
                raise Exception("ModelType has to be either svm or knn!")
            if outputMode not in ["0", "1"]:
                raise Exception("outputMode has to be 0 or 1")
            if not os.path.isfile(modelName):
                raise Exception("Input modelName not found!")
            files = '*.wav'
            if os.path.isdir(inputFolder):
                strFilePattern = os.path.join(inputFolder, files)
            else:
                # not a directory: treat the argument as a path prefix
                strFilePattern = inputFolder + files
            wavFilesList = sorted(glob.glob(strFilePattern))
            if len(wavFilesList) == 0:
                print("No WAV files found!")
                return
            Results = []
            for wavFile in wavFilesList:
                [Result, P, classNames] = aT.fileClassification(wavFile, modelName, modelType)
                Result = int(Result)
                Results.append(Result)
                if outputMode == "1":
                    print("{0:s}\t{1:s}".format(wavFile, classNames[Result]))
            Results = numpy.array(Results)
            # print distribution of classes:
            [Histogram, _] = numpy.histogram(Results, bins=numpy.arange(len(classNames) + 1))
            for i, h in enumerate(Histogram):
                print("{0:20s}\t\t{1:d}".format(classNames[i], h))
        else:
            print("Error.\nSyntax: " + argv[0] + " -classifyFolder <method(svm or knn)> <modelName> <folderName> <outputMode(0 or 1)>")

    elif argv[1] == "-regressionFolder":          # regression applied on the WAV files of a folder
        if len(argv) == 5:
            modelType = argv[2]
            modelName = argv[3]
            inputFolder = argv[4]
            if modelType not in ["svm", "knn"]:
                raise Exception("ModelType has to be either svm or knn!")
            files = '*.wav'
            if os.path.isdir(inputFolder):
                strFilePattern = os.path.join(inputFolder, files)
            else:
                strFilePattern = inputFolder + files
            wavFilesList = sorted(glob.glob(strFilePattern))
            if len(wavFilesList) == 0:
                print("No WAV files found!")
                return
            Results = []
            for wavFile in wavFilesList:
                R, regressionNames = aT.fileRegression(wavFile, modelName, modelType)
                Results.append(R)
            Results = numpy.array(Results)
            for i, r in enumerate(regressionNames):
                [Histogram, bins] = numpy.histogram(Results[:, i])
                centers = (bins[0:-1] + bins[1::]) / 2.0
                # subplot indices are 1-based, enumerate() is 0-based
                plt.subplot(len(regressionNames), 1, i + 1)
                plt.plot(centers, Histogram)
                plt.title(r)
            plt.show()
        else:
            print("Error.\nSyntax: " + argv[0] + " -regressionFolder <method(svm or knn)> <modelName> <folderName>")

    elif argv[1] == '-trainHMMsegmenter_fromfile':
        if len(argv) == 7:
            wavFile = argv[2]
            gtFile = argv[3]
            hmmModelName = argv[4]
            if not uT.isNum(argv[5]):
                print("Error: mid-term window size must be float!")
                return
            if not uT.isNum(argv[6]):
                print("Error: mid-term window step must be float!")
                return
            mtWin = float(argv[5])
            mtStep = float(argv[6])
            if not os.path.isfile(wavFile):
                print("Error: wavfile does not exist!")
                return
            if not os.path.isfile(gtFile):
                print("Error: groundtruth does not exist!")
                return
            aS.trainHMM_fromFile(wavFile, gtFile, hmmModelName, mtWin, mtStep)
        else:
            print("Error.\nSyntax: " + argv[0] + " -trainHMMsegmenter_fromfile <wavFilePath> <gtSegmentFilePath> <hmmModelFileName> <mtWin> <mtStep>")

    elif argv[1] == '-trainHMMsegmenter_fromdir':
        if len(argv) == 6:
            dirPath = argv[2]
            hmmModelName = argv[3]
            # stop after reporting a bad number instead of crashing in float()
            if not uT.isNum(argv[4]):
                print("Error: mid-term window size must be float!")
                return
            if not uT.isNum(argv[5]):
                print("Error: mid-term window step must be float!")
                return
            mtWin = float(argv[4])
            mtStep = float(argv[5])
            aS.trainHMM_fromDir(dirPath, hmmModelName, mtWin, mtStep)
        else:
            print("Error.\nSyntax: " + argv[0] + " -trainHMMsegmenter_fromdir <dirPath> <hmmModelFileName> <mtWin> <mtStep>")

    elif argv[1] == "-segmentClassifyFileHMM":    # HMM-based segmentation-classification
        if len(argv) == 4:
            hmmModelName = argv[2]
            wavFile = argv[3]
            gtFile = wavFile.replace('.wav', '.segments')
            aS.hmmSegmentation(wavFile, hmmModelName, PLOT=True, gtFileName=gtFile)
        else:
            # report the flag that is actually dispatched on
            print("Error.\nSyntax: " + argv[0] + " -segmentClassifyFileHMM <hmmModelName> <fileName>")

    elif argv[1] == '-segmentClassifyFile':       # segmentation-classification (fix-sized segments, knn or svm)
        if len(argv) == 5:
            modelType = argv[2]
            modelName = argv[3]
            inputWavFile = argv[4]
            if modelType not in ["svm", "knn"]:
                raise Exception("ModelType has to be either svm or knn!")
            if not os.path.isfile(modelName):
                raise Exception("Input modelName not found!")
            if not os.path.isfile(inputWavFile):
                raise Exception("Input audio file not found!")
            gtFile = inputWavFile.replace('.wav', '.segments')
            aS.mtFileClassification(inputWavFile, modelName, modelType, True, gtFile)
        else:
            print("Error.\nSyntax: " + argv[0] + " -segmentClassifyFile <method(svm or knn)> <modelName> <fileName>")

    elif argv[1] == "-segmentationEvaluation":
        if len(argv) == 5:
            methodName = argv[2]
            modelName = argv[3]
            dirName = argv[4]
            aS.evaluateSegmentationClassificationDir(dirName, modelName, methodName)
        else:
            print("Error.\nSyntax: " + argv[0] + " -segmentationEvaluation <method(svm or knn)> <modelName> <directoryName>")

    elif argv[1] == "-silenceRemoval":
        if len(argv) == 5:
            inputFile = argv[2]
            if not os.path.isfile(inputFile):
                raise Exception("Input audio file not found!")
            smoothingWindow = float(argv[3])
            weight = float(argv[4])
            [Fs, x] = audioBasicIO.readAudioFile(inputFile)             # read audio signal
            segmentLimits = aS.silenceRemoval(x, Fs, 0.05, 0.05, smoothingWindow, weight, False)
            for s in segmentLimits:
                # one WAV file per detected segment, named after its limits (in seconds)
                strOut = "{0:s}_{1:.3f}-{2:.3f}.wav".format(inputFile[0:-4], s[0], s[1])
                wavfile.write(strOut, Fs, x[int(Fs * s[0]):int(Fs * s[1])])
        else:
            print("Error.\nSyntax: " + argv[0] + " -silenceRemoval <inputFile> <smoothinWindow(secs)> <Threshold Weight>")

    elif argv[1] == '-speakerDiarization':        # speaker diarization (from file)
        if len(argv) >= 5:
            inputFile = argv[2]
            nSpeakers = int(argv[3])
            useLDA = (int(argv[4]) == 1)
            if useLDA:
                aS.speakerDiarization(inputFile, nSpeakers, PLOT=True)
            else:
                aS.speakerDiarization(inputFile, nSpeakers, LDAdim=0, PLOT=True)
        else:
            # previously missing arguments raised IndexError
            print("Error.\nSyntax: " + argv[0] + " -speakerDiarization <fileName> <numOfSpeakers> <useLDA(0 or 1)>")

    elif argv[1] == "-speakerDiarizationScriptEval":
        if len(argv) >= 3:
            dirName = argv[2]
            listOfLDAs = [int(l) for l in argv[3::]]
            aS.speakerDiarizationEvaluateScript(dirName, listOfLDAs)
        else:
            print("Error.\nSyntax: " + argv[0] + " -speakerDiarizationScriptEval <dirName> <LDAdim 1> ... <LDAdim N>")

    elif argv[1] == '-thumbnail':                 # music thumbnailing
        if len(argv) == 4:
            inputFile = argv[2]
            stWindow = 1.0
            stStep = 1.0
            if not os.path.isfile(inputFile):
                raise Exception("Input audio file not found!")
            [Fs, x] = audioBasicIO.readAudioFile(inputFile)             # read file
            if Fs == -1:                                                # could not read file
                return
            try:
                thumbnailSize = float(argv[3])
            except ValueError:
                print("Thumbnail size must be a float (in seconds)")
                return
            # find thumbnail endpoints
            [A1, A2, B1, B2, Smatrix] = aS.musicThumbnailing(x, Fs, stWindow, stStep, thumbnailSize)

            # write thumbnails to WAV files:
            thumbnailFileName1 = inputFile.replace(".wav", "_thumb1.wav")
            thumbnailFileName2 = inputFile.replace(".wav", "_thumb2.wav")
            wavfile.write(thumbnailFileName1, Fs, x[int(Fs * A1):int(Fs * A2)])
            wavfile.write(thumbnailFileName2, Fs, x[int(Fs * B1):int(Fs * B2)])
            print("1st thumbnail (stored in file {0:s}): {1:4.1f}sec -- {2:4.1f}sec".format(thumbnailFileName1, A1, A2))
            print("2nd thumbnail (stored in file {0:s}): {1:4.1f}sec -- {2:4.1f}sec".format(thumbnailFileName2, B1, B2))

            # Plot self-similarity matrix:
            fig = plt.figure()
            ax = fig.add_subplot(111, aspect='auto')
            plt.imshow(Smatrix)
            # Plot best-similarity diagonal:
            Xcenter = (A1 / stStep + A2 / stStep) / 2.0
            Ycenter = (B1 / stStep + B2 / stStep) / 2.0
            e1 = matplotlib.patches.Ellipse((Ycenter, Xcenter), thumbnailSize * 1.4, 3,
                                            angle=45, linewidth=3, fill=False)
            ax.add_patch(e1)
            plt.plot([B1, Smatrix.shape[0]], [A1, A1], color='k', linestyle='--', linewidth=2)
            plt.plot([B2, Smatrix.shape[0]], [A2, A2], color='k', linestyle='--', linewidth=2)
            plt.plot([B1, B1], [A1, Smatrix.shape[0]], color='k', linestyle='--', linewidth=2)
            plt.plot([B2, B2], [A2, Smatrix.shape[0]], color='k', linestyle='--', linewidth=2)
            plt.xlim([0, Smatrix.shape[0]])
            plt.ylim([Smatrix.shape[1], 0])
            ax.yaxis.set_label_position("right")
            ax.yaxis.tick_right()
            plt.xlabel('frame no')
            plt.ylabel('frame no')
            plt.title('Self-similarity matrix')
            plt.show()
        else:
            print("Error.\nSyntax: " + argv[0] + " -thumbnail <filename> <thumbnailsize(seconds)>")
def dirWavFeatureExtraction(dirName, mt_win, mt_step, st_win, st_step,
                            compute_beat=False):
    """
    This function extracts the mid-term features of the WAVE files of a
    particular folder.

    The resulting feature vector is extracted by long-term averaging the
    mid-term features. Therefore ONE FEATURE VECTOR is extracted for each
    WAV file.

    ARGUMENTS:
        - dirName:          the path of the WAVE directory
        - mt_win, mt_step:  mid-term window and step (in seconds)
        - st_win, st_step:  short-term window and step (in seconds)
        - compute_beat:     if True, the two values returned by
                            beatExtraction() are appended to each vector
    RETURNS:
        a tuple (all_mt_feats, wav_file_list2, mt_feature_names):
        - all_mt_feats:     the averaged feature vectors: an empty array
                            when no file was kept, a single 1-D vector for
                            one file, otherwise one stacked row per file
        - wav_file_list2:   paths of the files whose vector was kept, in the
                            same order as the rows of all_mt_feats
        - mt_feature_names: the names returned by mtFeatureExtraction() for
                            the last analysed file ([] if none)

    Empty files, files that cannot be read, files shorter than 1/5 of a
    second and files whose averaged features contain NaN/Inf are skipped.
    """
    # time.clock() no longer exists on Python >= 3.8; fall back to it only
    # on interpreters that lack perf_counter()
    timer = getattr(time, "perf_counter", None) or time.clock

    all_mt_feats = numpy.array([])
    process_times = []

    types = ('*.wav', '*.aif', '*.aiff', '*.mp3', '*.au', '*.ogg')
    wav_file_list = []
    for files in types:
        wav_file_list.extend(glob.glob(os.path.join(dirName, files)))
    wav_file_list = sorted(wav_file_list)

    wav_file_list2, mt_feature_names = [], []
    for i, wavFile in enumerate(wav_file_list):
        print("Analyzing file {0:d} of "
              "{1:d}: {2:s}".format(i + 1, len(wav_file_list), wavFile))
        if os.stat(wavFile).st_size == 0:
            print(" (EMPTY FILE -- SKIPPING)")
            continue
        [fs, x] = audioBasicIO.readAudioFile(wavFile)
        if isinstance(x, int):
            # an int instead of a signal array: the file could not be read
            continue

        t1 = timer()
        x = audioBasicIO.stereo2mono(x)
        if x.shape[0] < float(fs) / 5:
            print(" (AUDIO FILE TOO SMALL - SKIPPING)")
            continue

        [mt_term_feats, st_features, mt_feature_names] = \
            mtFeatureExtraction(x, fs, round(mt_win * fs),
                                round(mt_step * fs),
                                round(fs * st_win), round(fs * st_step))
        if compute_beat:
            [beat, beat_conf] = beatExtraction(st_features, st_step)

        mt_term_feats = numpy.transpose(mt_term_feats)
        # long term averaging of mid-term statistics
        mt_term_feats = mt_term_feats.mean(axis=0)
        if numpy.isnan(mt_term_feats).any() or \
                numpy.isinf(mt_term_feats).any():
            # unusable vector: keep neither the features nor the file name
            continue

        if compute_beat:
            mt_term_feats = numpy.append(mt_term_feats, beat)
            mt_term_feats = numpy.append(mt_term_feats, beat_conf)
        if len(all_mt_feats) == 0:
            # first feature vector
            all_mt_feats = mt_term_feats
        else:
            all_mt_feats = numpy.vstack((all_mt_feats, mt_term_feats))
        # record the name only now, so that names and feature rows stay
        # aligned one-to-one
        wav_file_list2.append(wavFile)

        t2 = timer()
        duration = float(len(x)) / fs
        process_times.append((t2 - t1) / duration)

    if len(process_times) > 0:
        print("Feature extraction complexity ratio: "
              "{0:.1f} x realtime".format(
                  (1.0 / numpy.mean(numpy.array(process_times)))))
    return (all_mt_feats, wav_file_list2, mt_feature_names)
def _print_syntax(argv, usage):
    """Print the standard syntax-error message for one command."""
    print("Error.\nSyntax: " + argv[0] + " " + usage)


def _valid_fs_and_channels(fs, channels):
    """Return True if the (string) sampling rate and channel count are
    supported; otherwise print the matching error and return False."""
    if fs not in ["8000", "16000", "32000", "44100"]:
        print("Error. Unsupported sampling rate (must be: 8000, 16000, 32000 or 44100).")
        return False
    if channels not in ["1", "2"]:
        print("Error. Number of output channels must be 1 or 2")
        return False
    return True


def _glob_wav_files(inputFolder):
    """Return the sorted '*.wav' matches for a folder (or, if inputFolder is
    not a directory, for inputFolder used as a path prefix)."""
    files = '*.wav'
    if os.path.isdir(inputFolder):
        strFilePattern = os.path.join(inputFolder, files)
    else:
        strFilePattern = inputFolder + files
    return sorted(glob.glob(strFilePattern))


def main(argv):
    """
    Command-line dispatcher: run the task selected by argv[1].

    ARGUMENTS:
     - argv:    full argument list (sys.argv style). argv[0] is the program
                name (only used in syntax messages), argv[1] is the command
                flag and the remaining items are that command's arguments.

    Each branch first checks the number of arguments (printing the command
    syntax and returning when it is wrong), then validates its inputs and
    finally calls into audioBasicIO / aF / aT / aS / aV. Missing input paths
    and unsupported model types raise Exception, except for the HMM-training
    commands, which print an error and return.

    Every print uses a single parenthesised argument so the function parses
    under both Python 2 and Python 3.
    """
    if len(argv) < 2:
        # nothing to dispatch on (argv[1] would raise IndexError)
        print("Error: no command specified.")
        return
    command = argv[1]

    if command == "-dirMp3toWAV":                   # convert mp3 to wav (batch)
        if len(argv) != 5:
            _print_syntax(argv, "-dirMp3toWAV <dirName> <sampling Freq> <numOfChannels>")
            return
        path = argv[2]
        if not _valid_fs_and_channels(argv[3], argv[4]):
            return
        if not os.path.isdir(path):
            raise Exception("Input path not found!")
        useMp3TagsAsNames = True
        audioBasicIO.convertDirMP3ToWav(path, int(argv[3]), int(argv[4]),
                                        useMp3TagsAsNames)

    elif command == "-dirWAVChangeFs":              # change Fs / channels of wav files (batch)
        if len(argv) != 5:
            # use this command's own name in the message (not -dirMp3toWAV)
            _print_syntax(argv, "-dirWAVChangeFs <dirName> <sampling Freq> <numOfChannels>")
            return
        path = argv[2]
        if not _valid_fs_and_channels(argv[3], argv[4]):
            return
        if not os.path.isdir(path):
            raise Exception("Input path not found!")
        audioBasicIO.convertFsDirWavToWav(path, int(argv[3]), int(argv[4]))

    elif command == "-featureExtractionFile":       # st and mt feature extraction to files (csv and numpy)
        if len(argv) != 7:
            _print_syntax(argv, "-featureExtractionFile <wavFileName> <mtWin> <mtStep> <stWin> <stStep>")
            return
        wavFileName = argv[2]
        if not os.path.isfile(wavFileName):
            raise Exception("Input audio file not found!")
        if not (uT.isNum(argv[3]) and uT.isNum(argv[4]) and
                uT.isNum(argv[5]) and uT.isNum(argv[6])):
            raise Exception(
                "Mid-term and short-term window sizes and steps must be numbers!")
        mtWin, mtStep = float(argv[3]), float(argv[4])
        stWin, stStep = float(argv[5]), float(argv[6])
        outFile = wavFileName
        aF.mtFeatureExtractionToFile(wavFileName, mtWin, mtStep, stWin,
                                     stStep, outFile, True, True, True)

    elif command == "-beatExtraction":
        if len(argv) != 4:
            _print_syntax(argv, "-beatExtraction <wavFileName> <PLOT (0 or 1)>")
            return
        wavFileName = argv[2]
        if not os.path.isfile(wavFileName):
            raise Exception("Input audio file not found!")
        if not (uT.isNum(argv[3])):
            raise Exception("PLOT must be either 0 or 1")
        if not ((int(argv[3]) == 0) or (int(argv[3]) == 1)):
            raise Exception("PLOT must be either 0 or 1")
        [Fs, x] = audioBasicIO.readAudioFile(wavFileName)
        F = aF.stFeatureExtraction(x, Fs, 0.050 * Fs, 0.050 * Fs)
        BPM, ratio = aF.beatExtraction(F, 0.050, int(argv[3]) == 1)
        print("Beat: {0:d} bpm ".format(int(BPM)))
        print("Ratio: {0:.2f} ".format(ratio))

    elif command == '-featureExtractionDir':        # -featureExtractionFile for each WAV file of a folder
        if len(argv) != 7:
            _print_syntax(argv, "-featureExtractionDir <path> <mtWin> <mtStep> <stWin> <stStep>")
            return
        path = argv[2]
        if not os.path.isdir(path):
            raise Exception("Input path not found!")
        if not (uT.isNum(argv[3]) and uT.isNum(argv[4]) and
                uT.isNum(argv[5]) and uT.isNum(argv[6])):
            raise Exception(
                "Mid-term and short-term window sizes and steps must be numbers!")
        mtWin, mtStep = float(argv[3]), float(argv[4])
        stWin, stStep = float(argv[5]), float(argv[6])
        aF.mtFeatureExtractionToFileDir(path, mtWin, mtStep, stWin, stStep,
                                        True, True, True)

    elif command == '-featureVisualizationDir':     # visualize relationships between recordings of a folder
        if len(argv) != 3:
            # previously a wrong argument count was silently ignored
            _print_syntax(argv, "-featureVisualizationDir <dirName>")
            return
        if not os.path.isdir(argv[2]):
            raise Exception("Input folder not found!")
        aV.visualizeFeaturesFolder(argv[2], "pca", "")

    elif command == '-fileSpectrogram':             # show spectrogram of a sound stored in a file
        if len(argv) != 3:
            _print_syntax(argv, "-fileSpectrogram <fileName>")
            return
        wavFileName = argv[2]
        if not os.path.isfile(wavFileName):
            raise Exception("Input audio file not found!")
        [Fs, x] = audioBasicIO.readAudioFile(wavFileName)
        x = audioBasicIO.stereo2mono(x)
        specgram, TimeAxis, FreqAxis = aF.stSpectogram(
            x, Fs, round(Fs * 0.040), round(Fs * 0.040), True)

    elif command == '-fileChromagram':              # show chromagram of a sound stored in a file
        if len(argv) != 3:
            # use this command's own name in the message (not -fileSpectrogram)
            _print_syntax(argv, "-fileChromagram <fileName>")
            return
        wavFileName = argv[2]
        if not os.path.isfile(wavFileName):
            raise Exception("Input audio file not found!")
        [Fs, x] = audioBasicIO.readAudioFile(wavFileName)
        x = audioBasicIO.stereo2mono(x)
        specgram, TimeAxis, FreqAxis = aF.stChromagram(
            x, Fs, round(Fs * 0.040), round(Fs * 0.040), True)

    elif command == "-trainClassifier":             # segment classifier training
        if not len(argv) > 6:
            _print_syntax(argv, "-trainClassifier <method(svm or knn)> <beat features> <directory 1> <directory 2> ... <directory N> <modelName>")
            return
        method = argv[2]
        beatFeatures = (int(argv[3]) == 1)
        listOfDirs = argv[4:len(argv) - 1]          # everything between the flags and the model name
        modelName = argv[-1]
        aT.featureAndTrain(listOfDirs, 1, 1, aT.shortTermWindow,
                           aT.shortTermStep, method.lower(), modelName,
                           computeBEAT=beatFeatures)

    elif command == "-trainRegression":             # segment regression model
        if len(argv) != 6:
            _print_syntax(argv, "-trainRegression <method(svm or knn)> <beat features> <directory> <modelName>")
            return
        method = argv[2]
        beatFeatures = (int(argv[3]) == 1)
        dirName = argv[4]
        modelName = argv[5]
        aT.featureAndTrainRegression(dirName, 1, 1, aT.shortTermWindow,
                                     aT.shortTermStep, method.lower(),
                                     modelName, computeBEAT=beatFeatures)

    elif command == "-classifyFile":                # single file classification
        if len(argv) != 5:
            _print_syntax(argv, "-classifyFile <method(svm or knn)> <modelName> <fileName>")
            return
        modelType, modelName, inputFile = argv[2], argv[3], argv[4]
        if modelType not in ["svm", "knn"]:
            raise Exception("ModelType has to be either svm or knn!")
        if not os.path.isfile(modelName):
            raise Exception("Input modelName not found!")
        if not os.path.isfile(inputFile):
            raise Exception("Input audio file not found!")
        [Result, P, classNames] = aT.fileClassification(inputFile, modelName,
                                                        modelType)
        print("{0:s}\t{1:s}".format("Class", "Probability"))
        for i, c in enumerate(classNames):
            print("{0:s}\t{1:.2f}".format(c, P[i]))
        print("Winner class: " + classNames[int(Result)])

    elif command == "-regressionFile":              # single file regression
        if len(argv) != 5:
            _print_syntax(argv, "-regressionFile <method(svm or knn)> <modelName> <fileName>")
            return
        modelType, modelName, inputFile = argv[2], argv[3], argv[4]
        if modelType not in ["svm", "knn"]:
            raise Exception("ModelType has to be either svm or knn!")
        if not os.path.isfile(inputFile):
            raise Exception("Input audio file not found!")
        R, regressionNames = aT.fileRegression(inputFile, modelName,
                                               modelType)
        for i in range(len(R)):
            print("{0:s}\t{1:.3f}".format(regressionNames[i], R[i]))

    elif command == "-classifyFolder":              # directory classification
        if len(argv) not in (5, 6):
            _print_syntax(argv, "-classifyFolder <method(svm or knn)> <modelName> <folderName> <outputMode(0 or 1)>")
            return
        modelType, modelName, inputFolder = argv[2], argv[3], argv[4]
        outputMode = argv[5] if len(argv) == 6 else "0"
        if modelType not in ["svm", "knn"]:
            raise Exception("ModelType has to be either svm or knn!")
        if outputMode not in ["0", "1"]:
            raise Exception("outputMode has to be 0 or 1")
        if not os.path.isfile(modelName):
            raise Exception("Input modelName not found!")
        wavFilesList = _glob_wav_files(inputFolder)
        if len(wavFilesList) == 0:
            print("No WAV files found!")
            return
        Results = []
        for wavFile in wavFilesList:
            [Result, P, classNames] = aT.fileClassification(wavFile,
                                                            modelName,
                                                            modelType)
            Result = int(Result)
            Results.append(Result)
            if outputMode == "1":
                print("{0:s}\t{1:s}".format(wavFile, classNames[Result]))
        Results = numpy.array(Results)
        # print distribution of classes:
        [Histogram, _] = numpy.histogram(
            Results, bins=numpy.arange(len(classNames) + 1))
        for i, h in enumerate(Histogram):
            print("{0:20s}\t\t{1:d}".format(classNames[i], h))

    elif command == "-regressionFolder":            # regression applied on the WAV files of a folder
        if len(argv) != 5:
            _print_syntax(argv, "-regressionFolder <method(svm or knn)> <modelName> <folderName>")
            return
        modelType, modelName, inputFolder = argv[2], argv[3], argv[4]
        if modelType not in ["svm", "knn"]:
            raise Exception("ModelType has to be either svm or knn!")
        wavFilesList = _glob_wav_files(inputFolder)
        if len(wavFilesList) == 0:
            print("No WAV files found!")
            return
        Results = []
        for wavFile in wavFilesList:
            R, regressionNames = aT.fileRegression(wavFile, modelName,
                                                   modelType)
            Results.append(R)
        Results = numpy.array(Results)
        for i, r in enumerate(regressionNames):
            [Histogram, bins] = numpy.histogram(Results[:, i])
            centers = (bins[0:-1] + bins[1::]) / 2.0
            # subplot indices are 1-based, enumerate() is 0-based
            plt.subplot(len(regressionNames), 1, i + 1)
            plt.plot(centers, Histogram)
            plt.title(r)
        plt.show()

    elif command == '-trainHMMsegmenter_fromfile':
        if len(argv) != 7:
            _print_syntax(argv, "-trainHMMsegmenter_fromfile <wavFilePath> <gtSegmentFilePath> <hmmModelFileName> <mtWin> <mtStep>")
            return
        wavFile, gtFile, hmmModelName = argv[2], argv[3], argv[4]
        if not uT.isNum(argv[5]):
            print("Error: mid-term window size must be float!")
            return
        if not uT.isNum(argv[6]):
            print("Error: mid-term window step must be float!")
            return
        mtWin = float(argv[5])
        mtStep = float(argv[6])
        if not os.path.isfile(wavFile):
            print("Error: wavfile does not exist!")
            return
        if not os.path.isfile(gtFile):
            print("Error: groundtruth does not exist!")
            return
        aS.trainHMM_fromFile(wavFile, gtFile, hmmModelName, mtWin, mtStep)

    elif command == '-trainHMMsegmenter_fromdir':
        if len(argv) != 6:
            _print_syntax(argv, "-trainHMMsegmenter_fromdir <dirPath> <hmmModelFileName> <mtWin> <mtStep>")
            return
        dirPath, hmmModelName = argv[2], argv[3]
        # stop after reporting a bad number instead of crashing in float()
        if not uT.isNum(argv[4]):
            print("Error: mid-term window size must be float!")
            return
        if not uT.isNum(argv[5]):
            print("Error: mid-term window step must be float!")
            return
        mtWin = float(argv[4])
        mtStep = float(argv[5])
        aS.trainHMM_fromDir(dirPath, hmmModelName, mtWin, mtStep)

    elif command == "-segmentClassifyFileHMM":      # HMM-based segmentation-classification
        if len(argv) != 4:
            # report the flag that is actually dispatched on
            _print_syntax(argv, "-segmentClassifyFileHMM <hmmModelName> <fileName>")
            return
        hmmModelName = argv[2]
        wavFile = argv[3]
        gtFile = wavFile.replace('.wav', '.segments')
        aS.hmmSegmentation(wavFile, hmmModelName, PLOT=True,
                           gtFileName=gtFile)

    elif command == '-segmentClassifyFile':         # segmentation-classification (fix-sized segments)
        if len(argv) != 5:
            _print_syntax(argv, "-segmentClassifyFile <method(svm or knn)> <modelName> <fileName>")
            return
        modelType, modelName, inputWavFile = argv[2], argv[3], argv[4]
        if modelType not in ["svm", "knn"]:
            raise Exception("ModelType has to be either svm or knn!")
        if not os.path.isfile(modelName):
            raise Exception("Input modelName not found!")
        if not os.path.isfile(inputWavFile):
            raise Exception("Input audio file not found!")
        gtFile = inputWavFile.replace('.wav', '.segments')
        aS.mtFileClassification(inputWavFile, modelName, modelType, True,
                                gtFile)

    elif command == "-segmentationEvaluation":
        if len(argv) != 5:
            _print_syntax(argv, "-segmentationEvaluation <method(svm or knn)> <modelName> <directoryName>")
            return
        methodName, modelName, dirName = argv[2], argv[3], argv[4]
        aS.evaluateSegmentationClassificationDir(dirName, modelName,
                                                 methodName)

    elif command == "-silenceRemoval":
        if len(argv) != 5:
            _print_syntax(argv, "-silenceRemoval <inputFile> <smoothinWindow(secs)> <Threshold Weight>")
            return
        inputFile = argv[2]
        if not os.path.isfile(inputFile):
            raise Exception("Input audio file not found!")
        smoothingWindow = float(argv[3])
        weight = float(argv[4])
        [Fs, x] = audioBasicIO.readAudioFile(inputFile)    # read audio signal
        segmentLimits = aS.silenceRemoval(x, Fs, 0.05, 0.05,
                                          smoothingWindow, weight, False)
        for s in segmentLimits:
            # one WAV file per detected segment, named after its limits (in seconds)
            strOut = "{0:s}_{1:.3f}-{2:.3f}.wav".format(
                inputFile[0:-4], s[0], s[1])
            wavfile.write(strOut, Fs, x[int(Fs * s[0]):int(Fs * s[1])])

    elif command == '-speakerDiarization':          # speaker diarization (from file)
        if len(argv) < 5:
            # previously missing arguments raised IndexError
            _print_syntax(argv, "-speakerDiarization <fileName> <numOfSpeakers> <useLDA(0 or 1)>")
            return
        inputFile = argv[2]
        nSpeakers = int(argv[3])
        useLDA = (int(argv[4]) == 1)
        if useLDA:
            aS.speakerDiarization(inputFile, nSpeakers, PLOT=True)
        else:
            aS.speakerDiarization(inputFile, nSpeakers, LDAdim=0, PLOT=True)

    elif command == "-speakerDiarizationScriptEval":
        if len(argv) < 3:
            _print_syntax(argv, "-speakerDiarizationScriptEval <dirName> <LDAdim 1> ... <LDAdim N>")
            return
        dirName = argv[2]
        listOfLDAs = [int(l) for l in argv[3::]]
        aS.speakerDiarizationEvaluateScript(dirName, listOfLDAs)

    elif command == '-thumbnail':                   # music thumbnailing
        if len(argv) != 4:
            _print_syntax(argv, "-thumbnail <filename> <thumbnailsize(seconds)>")
            return
        inputFile = argv[2]
        stWindow = 1.0
        stStep = 1.0
        if not os.path.isfile(inputFile):
            raise Exception("Input audio file not found!")
        [Fs, x] = audioBasicIO.readAudioFile(inputFile)    # read file
        if Fs == -1:                                       # could not read file
            return
        try:
            thumbnailSize = float(argv[3])
        except ValueError:
            print("Thumbnail size must be a float (in seconds)")
            return
        # find thumbnail endpoints
        [A1, A2, B1, B2, Smatrix] = aS.musicThumbnailing(
            x, Fs, stWindow, stStep, thumbnailSize)

        # write thumbnails to WAV files:
        thumbnailFileName1 = inputFile.replace(".wav", "_thumb1.wav")
        thumbnailFileName2 = inputFile.replace(".wav", "_thumb2.wav")
        wavfile.write(thumbnailFileName1, Fs, x[int(Fs * A1):int(Fs * A2)])
        wavfile.write(thumbnailFileName2, Fs, x[int(Fs * B1):int(Fs * B2)])
        print("1st thumbnail (stored in file {0:s}): {1:4.1f}sec -- {2:4.1f}sec".format(
            thumbnailFileName1, A1, A2))
        print("2nd thumbnail (stored in file {0:s}): {1:4.1f}sec -- {2:4.1f}sec".format(
            thumbnailFileName2, B1, B2))

        # Plot self-similarity matrix:
        fig = plt.figure()
        ax = fig.add_subplot(111, aspect='auto')
        plt.imshow(Smatrix)
        # Plot best-similarity diagonal:
        Xcenter = (A1 / stStep + A2 / stStep) / 2.0
        Ycenter = (B1 / stStep + B2 / stStep) / 2.0
        e1 = matplotlib.patches.Ellipse((Ycenter, Xcenter),
                                        thumbnailSize * 1.4, 3, angle=45,
                                        linewidth=3, fill=False)
        ax.add_patch(e1)
        edge = Smatrix.shape[0]
        dashed = dict(color='k', linestyle='--', linewidth=2)
        plt.plot([B1, edge], [A1, A1], **dashed)
        plt.plot([B2, edge], [A2, A2], **dashed)
        plt.plot([B1, B1], [A1, edge], **dashed)
        plt.plot([B2, B2], [A2, edge], **dashed)
        plt.xlim([0, Smatrix.shape[0]])
        plt.ylim([Smatrix.shape[1], 0])
        ax.yaxis.set_label_position("right")
        ax.yaxis.tick_right()
        plt.xlabel('frame no')
        plt.ylabel('frame no')
        plt.title('Self-similarity matrix')
        plt.show()
def _record(self):
    """Record microphone audio in fixed-length passes, segment and classify it.

    While ``self.isrecording`` is true, each pass:
      * opens a PyAudio input stream and appends the captured chunks to
        ``self.frames`` (which must already exist on the instance);
      * writes the frames captured since offset ``self.count`` to a
        time-stamped WAV file under ``Record/``;
      * if that slice holds more than 100 chunks, runs silence removal on
        the file, exports the detected segments and passes the file to
        ``self.model.cnn_predict``.

    Once recording has stopped and frames were captured, the complete
    recording is written to ``Record/FullRecord<timestamp>.wav`` and the
    frame buffers are cleared.

    Returns nothing; results are printed and written to disk.
    """
    print("please speak a word into the microphone")
    self.CHUNK = 2048
    self.FORMAT = pyaudio.paInt16
    self.CHANNELS = 2
    self.RATE = 44100
    self.RECORD_SECONDS = 6
    self.count = 0
    self.current = 0
    self.cutinterval = 105
    self.segment = []
    # Was ``self.fullFrame`` (typo): the attribute read below is fullFrames.
    self.fullFrames = []
    print("saving and segmenting")
    while self.isrecording:
        self.p = pyaudio.PyAudio()
        self.stream = self.p.open(format=self.FORMAT,
                                  channels=self.CHANNELS,
                                  rate=self.RATE,
                                  input=True,
                                  frames_per_buffer=self.CHUNK)
        for _ in range(0, int(self.RATE / self.CHUNK * self.RECORD_SECONDS)):
            data = self.stream.read(self.CHUNK)
            self.frames.append(data)
        # fullFrames aliases frames (same list object), as in the original.
        self.fullFrames = self.frames
        print(len(self.fullFrames))
        self.path = "Record/recordedSegment" + str(int(time.time())) + ".wav"
        self.stream.stop_stream()
        self.stream.close()
        self.p.terminate()

        # Everything captured from the current offset onwards; the offset
        # then advances by cutinterval chunks.
        self.segment = self.frames[self.count:len(self.frames)]
        self.count = self.count + self.cutinterval
        wf = wave.open(self.path, 'wb')
        wf.setnchannels(self.CHANNELS)
        wf.setsampwidth(self.p.get_sample_size(self.FORMAT))
        wf.setframerate(self.RATE)
        wf.writeframes(b''.join(self.segment))
        wf.close()
        print("segment Recording Finished")

        if len(self.segment) > 100:
            audiofile = AudioSegment.from_wav(self.path)
            [Fs, x] = aIO.readAudioFile(self.path)
            segments = aS.silenceRemoval(x, Fs, 0.020, 0.020,
                                         smoothWindow=1.0, Weight=0.3,
                                         plot=False)
            for limits in segments:
                # Segment limits are multiplied by 1000 before slicing
                # (presumably seconds -> milliseconds for pydub; confirm).
                sliceSound = audiofile[limits[0] * 1000:limits[1] * 1000]
                # NOTE(review): every segment is exported to the same path,
                # so only the last detected segment survives on disk.
                sliceSound.export(self.path, format="wav")
            print("--Segmenter finished--")
            result = self.model.cnn_predict(self.path)
            print("predicting......")
            print(result)
            print("done.....")

    # The original condition used bitwise ``&``, which binds tighter than the
    # comparisons and made the isrecording test a no-op; use boolean logic.
    if self.frames and not self.isrecording:
        print(len(self.frames))
        # NOTE(review): stream and PyAudio instance were already shut down at
        # the end of the last pass; these calls are kept from the original.
        self.stream.stop_stream()
        self.stream.close()
        self.p.terminate()
        self.paths = "Record/FullRecord" + str(int(time.time())) + ".wav"
        wf = wave.open(self.paths, 'wb')
        wf.setnchannels(self.CHANNELS)
        wf.setsampwidth(self.p.get_sample_size(self.FORMAT))
        wf.setframerate(self.RATE)
        wf.writeframes(b''.join(self.fullFrames))
        wf.close()
        self.frames = []
        self.segment = []
        print("--Finished Recording--")
def dirWavFeatureExtraction(dirName, mtWin, mtStep, stWin, stStep,
                            computeBEAT=False):
    """
    Extract one long-term-averaged mid-term feature vector per audio file
    found in a folder.

    ARGUMENTS:
        - dirName:        the path of the audio directory
        - mtWin, mtStep:  mid-term window and step (in seconds)
        - stWin, stStep:  short-term window and step (in seconds)
        - computeBEAT:    if True, the two values returned by
                          beatExtraction() are appended to each vector
    RETURNS:
        - allMtFeatures:  empty array when nothing was processed, the single
                          feature vector when one file was processed,
                          otherwise the vectors stacked row-wise
        - wavFilesList2:  paths of the files that contributed a vector, in
                          the same order as the rows of allMtFeatures

    Files that are empty, unreadable, shorter than a tenth of a second, or
    whose averaged features contain NaN/inf are skipped.
    """
    # time.clock() was removed in Python 3.8; fall back to it only when
    # perf_counter is unavailable (Python 2).
    timer = getattr(time, "perf_counter", None) or time.clock

    allMtFeatures = numpy.array([])
    processingTimes = []

    types = ('*.wav', '*.aif', '*.aiff', '*.mp3', '*.au')
    wavFilesList = []
    for files in types:
        wavFilesList.extend(glob.glob(os.path.join(dirName, files)))
    wavFilesList = sorted(wavFilesList)

    wavFilesList2 = []
    for i, wavFile in enumerate(wavFilesList):
        # Encode only non-str (Python 2 unicode) paths: formatting bytes with
        # "{:s}" raises TypeError on Python 3.
        if isinstance(wavFile, str):
            displayName = wavFile
        else:
            displayName = wavFile.encode('utf-8')
        print("Analyzing file {0:d} of {1:d}: {2:s}".format(
            i + 1, len(wavFilesList), displayName))

        if os.stat(wavFile).st_size == 0:
            print("   (EMPTY FILE -- SKIPPING)")
            continue
        [Fs, x] = audioBasicIO.readAudioFile(wavFile)  # read file
        if isinstance(x, int):
            # An int instead of a sample array is treated as a read failure.
            continue

        t1 = timer()  # timing starts after the file has been read
        x = audioBasicIO.stereo2mono(x)  # convert stereo to mono
        if x.shape[0] < float(Fs) / 10:
            print("  (AUDIO FILE TOO SMALL - SKIPPING)")
            continue

        # mid-term feature extraction for current file
        if computeBEAT:
            [MidTermFeatures, stFeatures] = mtFeatureExtraction(
                x, Fs, round(mtWin * Fs), round(mtStep * Fs),
                round(Fs * stWin), round(Fs * stStep))
            [beat, beatConf] = beatExtraction(stFeatures, stStep)
        else:
            [MidTermFeatures, _] = mtFeatureExtraction(
                x, Fs, round(mtWin * Fs), round(mtStep * Fs),
                round(Fs * stWin), round(Fs * stStep))

        # long term averaging of mid-term statistics
        MidTermFeatures = numpy.transpose(MidTermFeatures).mean(axis=0)
        if numpy.isnan(MidTermFeatures).any() or \
                numpy.isinf(MidTermFeatures).any():
            continue

        if computeBEAT:
            MidTermFeatures = numpy.append(MidTermFeatures, beat)
            MidTermFeatures = numpy.append(MidTermFeatures, beatConf)
        if len(allMtFeatures) == 0:  # first feature vector
            allMtFeatures = MidTermFeatures
        else:
            allMtFeatures = numpy.vstack((allMtFeatures, MidTermFeatures))
        # Record the file only once its vector is stored, so the returned
        # names stay aligned with the rows of allMtFeatures.
        wavFilesList2.append(wavFile)

        t2 = timer()
        duration = float(len(x)) / Fs
        processingTimes.append((t2 - t1) / duration)

    if len(processingTimes) > 0:
        print("Feature extraction complexity ratio: {0:.1f} x realtime".format(
            (1.0 / numpy.mean(numpy.array(processingTimes)))))
    return (allMtFeatures, wavFilesList2)
# for x in range(len(starts)): # CnumSum += 1 # for y in range(len(change_point)): # if starts[x]-10 <= change_point[y] and starts[x]+10 >= change_point[y]: # CnumTrue += 1 # for a in range(len(change_point)): # PnumSum += 1 # for b in range(len(starts)): # if starts[b]+20 <= change_point[a] and ends[b]-20 >= change_point[a]: # PnumTrue += 1 # print(audiofile) # Coverage = float(CnumTrue)/float(CnumSum) # Purity = 1 - float(PnumTrue)/float(PnumSum) # print("Coverage:{0} Purity:{1}".format(Coverage,Purity)) for i in range(nExp): [Fs1, x1] = audioBasicIO.readAudioFile(filename) tcls = aS.speakerDiarization(filename, 2, LDAdim=0, PLOT=False) audio = AudioSegment.from_wav(filename) audio_length = len(audio) change_point = [[] for i in range(2)] for j in range(len(tcls) - 1): if tcls[j] != tcls[j + 1]: change_point[0].append(j) change_point[1].append(int(tcls[j])) if j == len(tcls) - 2: change_point[1].append(int(tcls[j])) for num in range(len(change_point[1])): if num == 0: seg_audio = audio[:audio_length * change_point[0][0] / len(tcls)] seg_audio.export(os.path.join(
def dirWavFeatureExtraction(dirName, mtWin, mtStep, stWin, stStep,
                            computeBEAT=False):
    """
    Extract one long-term-averaged mid-term feature vector per WAVE/AIFF
    file found in a folder.

    ARGUMENTS:
        - dirName:        the path of the WAVE directory
        - mtWin, mtStep:  mid-term window and step (in seconds)
        - stWin, stStep:  short-term window and step (in seconds)
        - computeBEAT:    if True, the two values returned by
                          beatExtraction() are appended to each vector
    RETURNS:
        - allMtFeatures:  empty array when nothing was processed, the single
                          feature vector when one file was processed,
                          otherwise the vectors stacked row-wise
        - list of the file paths that contributed a vector, aligned with the
          rows of allMtFeatures (identical to the sorted directory listing
          when every file could be read)
    """
    # time.clock() was removed in Python 3.8; fall back to it only when
    # perf_counter is unavailable (Python 2).
    timer = getattr(time, "perf_counter", None) or time.clock

    allMtFeatures = numpy.array([])
    processingTimes = []
    processedFiles = []

    types = ('*.wav', '*.aif', '*.aiff')
    wavFilesList = []
    for files in types:
        wavFilesList.extend(glob.glob(os.path.join(dirName, files)))
    wavFilesList = sorted(wavFilesList)

    for wavFile in wavFilesList:
        [Fs, x] = audioBasicIO.readAudioFile(wavFile)  # read file
        if isinstance(x, int):
            # An int instead of a sample array is treated as a read failure
            # (same check as the sibling variant of this function); skip the
            # file instead of crashing in stereo2mono.
            continue

        t1 = timer()  # timing starts after the file has been read
        x = audioBasicIO.stereo2mono(x)  # convert stereo to mono

        # mid-term feature extraction for current file
        if computeBEAT:
            [MidTermFeatures, stFeatures] = mtFeatureExtraction(
                x, Fs, round(mtWin * Fs), round(mtStep * Fs),
                round(Fs * stWin), round(Fs * stStep))
            [beat, beatConf] = beatExtraction(stFeatures, stStep)
        else:
            [MidTermFeatures, _] = mtFeatureExtraction(
                x, Fs, round(mtWin * Fs), round(mtStep * Fs),
                round(Fs * stWin), round(Fs * stStep))

        # long term averaging of mid-term statistics
        MidTermFeatures = numpy.transpose(MidTermFeatures).mean(axis=0)
        if computeBEAT:
            MidTermFeatures = numpy.append(MidTermFeatures, beat)
            MidTermFeatures = numpy.append(MidTermFeatures, beatConf)

        if len(allMtFeatures) == 0:  # first feature vector
            allMtFeatures = MidTermFeatures
        else:
            allMtFeatures = numpy.vstack((allMtFeatures, MidTermFeatures))
        processedFiles.append(wavFile)

        t2 = timer()
        duration = float(len(x)) / Fs
        processingTimes.append((t2 - t1) / duration)

    if len(processingTimes) > 0:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Feature extraction complexity ratio: {0:.1f} x realtime".format(
            (1.0 / numpy.mean(numpy.array(processingTimes)))))
    return (allMtFeatures, processedFiles)
import csv import random as rn import math import operator import numpy as np import audioBasicIO import audioFeatureExtraction dire = r"G:\5th sem\ee320 DSP\project\csv\newdata\dataset\testdata\angry" c = 1 for filename in os.walk(dire): for x in filename[2]: label1 = [] label = [] file = filename[0] + "\\" + str(x) [Fs, x] = audioBasicIO.readAudioFile(file) F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.50 * Fs, 0.25 * Fs) for i in range(len(F[0])): label1.append(3) label.append(label1) G = np.append(F, label, axis=0) loc = r"G:\5th sem\ee320 DSP\project\csv\newdata\dataset\testdata\angrycsv\an" + str( c) + ".csv" c = c + 1 np.savetxt(loc, G, delimiter=",") dire = r"G:\5th sem\ee320 DSP\project\csv\newdata\dataset\testdata\sad" c = 1 for filename in os.walk(dire):
def mtFileClassification(inputFile, modelName, modelType, plotResults=False, gtFile=""):
    '''
    Classify an audio stream window by window with a pre-trained mid-term
    classifier and, when a ground-truth file is given, evaluate the result.

    ARGUMENTS:
        - inputFile:    path of the input WAV file
        - modelName:    path of the stored classification model
        - modelType:    'svm' or 'knn' depending on the classifier type
        - plotResults:  its negation is forwarded as the last argument of
                        plotSegmentationResults()
        - gtFile:       optional path of a ground-truth segment file
    RETURNS:
        - flagsInd:     numpy array with one class index per mid-term window
                        (after 1-window smoothing)
        - classNames:   the class names stored in the model
        - acc:          the value returned by plotSegmentationResults()
                        (printed as overall accuracy when >= 0)
      or (-1, -1, -1) when the model file is missing, modelType is unknown,
      the model was trained with beat features, or the audio cannot be read.
    '''
    if not os.path.isfile(modelName):
        # The check is on the model *file*; the original message wrongly
        # blamed modelType.
        print("mtFileClassificationError: input modelName not found!")
        return (-1, -1, -1)

    # Load classifier:
    if modelType == 'svm':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadSVModel(modelName)
    elif modelType == 'knn':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadKNNModel(modelName)
    else:
        # Previously fell through to a NameError on computeBEAT.
        print("mtFileClassificationError: unknown modelType (expected 'svm' or 'knn')")
        return (-1, -1, -1)

    if computeBEAT:
        print("Model " + modelName + " contains long-term music features (beat etc) and cannot be used in segmentation")
        return (-1, -1, -1)

    [Fs, x] = audioBasicIO.readAudioFile(inputFile)  # load input file
    if Fs == -1:  # could not read file
        return (-1, -1, -1)
    x = audioBasicIO.stereo2mono(x)  # convert stereo (if) to mono

    # mid-term feature extraction:
    [MidTermFeatures, _] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stStep))

    flags = []
    Ps = []
    flagsInd = []
    for i in range(MidTermFeatures.shape[1]):  # for each fix-sized segment
        curFV = (MidTermFeatures[:, i] - MEAN) / STD  # normalize current feature vector
        [Result, P] = aT.classifierWrapper(Classifier, modelType, curFV)  # classify vector
        flagsInd.append(Result)
        flags.append(classNames[int(Result)])  # class label per window
        Ps.append(numpy.max(P))  # winning probability per window
    flagsInd = numpy.array(flagsInd)

    # 1-window smoothing: a window whose two neighbours agree takes their
    # label. Done in place, so earlier corrections affect later windows.
    for i in range(1, len(flagsInd) - 1):
        if flagsInd[i - 1] == flagsInd[i + 1]:
            flagsInd[i] = flagsInd[i + 1]

    # NOTE(review): segs/classes are derived from the unsmoothed labels and
    # are not returned; kept from the original.
    (segs, classes) = flags2segs(flags, mtStep)  # fix-sized flags -> segments and classes
    segs[-1] = len(x) / float(Fs)

    # Load ground-truth:
    if os.path.isfile(gtFile):
        [segStartGT, segEndGT, segLabelsGT] = readSegmentGT(gtFile)
        flagsGT, classNamesGT = segs2flags(segStartGT, segEndGT, segLabelsGT, mtStep)
        flagsIndGT = []
        for fl in flagsGT:  # "align" labels with GT
            if classNamesGT[fl] in classNames:
                flagsIndGT.append(classNames.index(classNamesGT[fl]))
            else:
                flagsIndGT.append(-1)  # GT class unknown to the model
        flagsIndGT = numpy.array(flagsIndGT)
    else:
        flagsIndGT = numpy.array([])

    acc = plotSegmentationResults(flagsInd, flagsIndGT, classNames, mtStep, not plotResults)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc))
    return (flagsInd, classNames, acc)
#!/usr/bin/python
# Print one CSV line per segment returned by silenceRemoval() for the audio
# file given as the first command-line argument:
#   <zero-padded 4-digit id>,<first limit>,<second limit>
import sys
import audioBasicIO as aIO
import audioSegmentation as aS

[Fs, x] = aIO.readAudioFile(sys.argv[1])
segments = aS.silenceRemoval(x, Fs, 0.020, 0.020,
                             smoothWindow=1.0, Weight=0.3, plot=False)

# Segment ids start at 1.
for sid, limits in enumerate(segments, 1):
    print('%s,%.4f,%.4f' % (str(sid).zfill(4), limits[0], limits[1]))
def trainHMM_fromDir(dirPath, hmm_model_name, mt_win, mt_step):
    '''
    Train an HMM for joint segmentation-classification from a directory in
    which WAV files and their .segments ground-truth files are stored.

    ARGUMENTS:
     - dirPath:         the path of the data directory
     - hmm_model_name:  the name of the HMM model file to be stored
     - mt_win:          mid-term window size
     - mt_step:         mid-term window step
    RETURNS:
     - hmm:             the resulting HMM object
     - class_names:     the list of class names found in the annotations
    RAISES:
     - ValueError:      if no WAV file in dirPath has a .segments file

    After training, hmm, the class names, mt_win and mt_step are pickled (in
    that order) into the hmm_model_name file.
    '''
    flags_all = numpy.array([])
    classes_all = []
    # Feature matrices are collected and concatenated once at the end. The
    # original initialised its accumulator only for glob index 0, which
    # broke whenever the first WAV file had no annotation.
    feature_blocks = []

    for wav_file in glob.glob(dirPath + os.sep + '*.wav'):  # for each WAV file
        # Swap only the extension; str.replace would also rewrite any
        # '.wav' occurring earlier in the path.
        gt_file = os.path.splitext(wav_file)[0] + '.segments'
        if not os.path.isfile(gt_file):  # no annotation -> skip
            continue
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags, class_names = segs2flags(seg_start, seg_end, seg_labs, mt_step)
        for c in class_names:  # update class names
            if c not in classes_all:
                classes_all.append(c)

        [fs, x] = audioBasicIO.readAudioFile(wav_file)
        [F, _, _] = aF.mtFeatureExtraction(x, fs, mt_win * fs, mt_step * fs,
                                           round(fs * 0.050), round(fs * 0.050))

        # Truncate features and labels to their common length.
        min_sm = min(F.shape[1], len(flags))
        F = F[:, 0:min_sm]
        flags = flags[0:min_sm]

        # Map per-file class indices onto the global class list.
        flagsNew = [classes_all.index(class_names[fl]) for fl in flags]
        flags_all = numpy.append(flags_all, numpy.array(flagsNew))
        feature_blocks.append(F)

    if not feature_blocks:
        raise ValueError("trainHMM_fromDir: no annotated WAV files found in "
                         + str(dirPath))
    f_all = numpy.concatenate(feature_blocks, axis=1)

    start_prob, transmat, means, cov = trainHMM_computeStatistics(
        f_all, flags_all)  # compute HMM statistics
    hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")  # build HMM
    hmm.startprob_ = start_prob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov

    # 'with' closes the model file even if pickling fails.
    with open(hmm_model_name, "wb") as fo:  # save HMM model
        cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(classes_all, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mt_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mt_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)

    return hmm, classes_all
def speakerDiarization(fileName, numOfSpeakers, mtSize=2.0, mtStep=0.2, stWin=0.05, LDAdim=35, PLOT=False):
    '''
    Cluster the mid-term windows of a recording with k-means and return the
    number of clusters (speakers) with the best average silhouette-style score.

    ARGUMENTS:
        - fileName:        the name of the WAV file to be analyzed
        - numOfSpeakers    the number of speakers (clusters) in the recording (<=0 for unknown,
                           in which case 2..9 clusters are tried)
        - mtSize (opt)     mid-term window size
        - mtStep (opt)     mid-term window step
        - stWin  (opt)     short-term window size (the short-term step is half of it)
        - LDAdim (opt)     LDA dimension (0 for no LDA)
        - PLOT     (opt)   plotting flag; not read anywhere in this function body
    RETURNS:
        - nSpeakersFinal:  the entry of the candidate range with the highest mean score
                           (equal to numOfSpeakers when numOfSpeakers > 0)

    The two kNN models are loaded from the hard-coded relative paths
    "data/knnSpeakerAll" and "data/knnSpeakerFemaleMale".
    '''
    [Fs, x] = audioBasicIO.readAudioFile(fileName)
    x = audioBasicIO.stereo2mono(x)
    Duration = len(x) / Fs  # computed but not used below

    [Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.loadKNNModel(
        "data/knnSpeakerAll")
    [Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.loadKNNModel(
        "data/knnSpeakerFemaleMale")

    [MidTermFeatures, ShortTermFeatures] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stWin * 0.5))

    # Extended feature matrix: the original rows followed by one row per class
    # of each of the two kNN models.
    MidTermFeatures2 = numpy.zeros(
        (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2), MidTermFeatures.shape[1]))

    for i in range(MidTermFeatures.shape[1]):
        # Normalize the window with each model's own statistics and append the
        # second value returned by each classifier (plus a small constant).
        curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
        curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2
        [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
        [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
        MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
        MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0] + len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1)::, i] = P2 + 0.0001

    MidTermFeatures = MidTermFeatures2  # TODO
    # SELECT FEATURES (SET 1A is the active one; the others are kept for reference):
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20];                                                                                         # SET 0A
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100];                                                                                 # SET 0B
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,
    #    97,98, 99,100];     # SET 0C

    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]  # SET 1A
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];                                          # SET 1B
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 1C

    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53];            # SET 2A
    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];    # SET 2B
    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 2C

    # iFeaturesSelect = range(100);                                                                                                    # SET 3
    # MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010

    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]

    (MidTermFeaturesNorm, MEAN, STD) = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]

    # remove outliers: keep the windows whose summed distance to all other
    # windows is below 1.2 times the mean of those sums
    DistancesAll = numpy.sum(distance.squareform(distance.pdist(MidTermFeaturesNorm.T)), axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]

    # TODO: Combine energy threshold for outlier removal:
    # EnergyMin = numpy.min(MidTermFeatures[1,:])
    # EnergyMean = numpy.mean(MidTermFeatures[1,:])
    # Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    # iNonOutLiers = numpy.nonzero(MidTermFeatures[1,:] > Thres)[0]
    # print iNonOutLiers

    perOutLier = (100.0 * (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows  # percentage of removed windows; not used below
    MidTermFeaturesNormOr = MidTermFeaturesNorm  # pre-removal copy of the reference; not used below
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

    # LDA dimensionality reduction:
    if LDAdim > 0:
        # [mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, stWin * Fs, round(Fs*stWin), round(Fs*stWin));
        # extract mid-term features with minimum step:
        mtWinRatio = int(round(mtSize / stWin))
        mtStepRatio = int(round(stWin / stWin))  # always 1: advance one short-term frame at a time
        mtFeaturesToReduce = []
        numOfFeatures = len(ShortTermFeatures)
        numOfStatistics = 2  # mean and standard deviation
        # for i in range(numOfStatistics * numOfFeatures + 1):
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append([])

        for i in range(numOfFeatures):  # for each of the short-term features:
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mtWinRatio
                if N2 > N:
                    N2 = N  # clip the last windows to the sequence end
                curStFeatures = ShortTermFeatures[i][N1:N2]
                # row i holds the means, row i + numOfFeatures the deviations
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i + numOfFeatures].append(numpy.std(curStFeatures))
                curPos += mtStepRatio
        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
        # Same classifier-output augmentation as applied to MidTermFeatures above.
        mtFeaturesToReduce2 = numpy.zeros(
            (mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2), mtFeaturesToReduce.shape[1]))
        for i in range(mtFeaturesToReduce.shape[1]):
            curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
            curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
            [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
            [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
            mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i]
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0] + len(classNames1), i] = P1 + 0.0001
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] + len(classNames1)::, i] = P2 + 0.0001
        mtFeaturesToReduce = mtFeaturesToReduce2
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
        # mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
        (mtFeaturesToReduce, MEAN, STD) = aT.normalizeFeatures([mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T
        # DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
        # MDistancesAll = numpy.mean(DistancesAll)
        # iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0*MDistancesAll)[0]
        # mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]
        # Pseudo-labels for LDA are derived from the window index alone.
        Labels = numpy.zeros((mtFeaturesToReduce.shape[1],))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin
        # print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * stWin / LDAstepRatio);
        clf = LDA(n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels, tol=0.000001)
        # Project the outlier-free clustering features onto the LDA space.
        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

    if numOfSpeakers <= 0:
        sRange = range(2, 10)  # unknown speaker count: try 2..9 clusters
    else:
        sRange = [numOfSpeakers]
    clsAll = []  # cluster labels per candidate count (collected, not read below)
    silAll = []  # mean score per candidate count
    centersAll = []  # cluster centers per candidate count (collected, not read below)

    for iSpeakers in sRange:
        cls, means, steps = mlpy.kmeans(MidTermFeaturesNorm.T, k=iSpeakers, plus=True)  # perform k-means clustering

        # YDist = distance.pdist(MidTermFeaturesNorm.T, metric='euclidean')
        # print distance.squareform(YDist).shape
        # hc = mlpy.HCluster()
        # hc.linkage(YDist)
        # cls = hc.cut(14.5)
        # print cls

        # Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
        clsAll.append(cls)
        centersAll.append(means)
        silA = []; silB = []
        for c in range(iSpeakers):  # for each speaker (i.e. for each extracted cluster)
            clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float(len(cls))
            if clusterPerCent < 0.020:
                # clusters holding under 2% of the windows score zero
                silA.append(0.0)
                silB.append(0.0)
            else:
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]  # get subset of feature vectors
                Yt = distance.pdist(
                    MidTermFeaturesNormTemp.T)  # compute average distance between samples that belong to the cluster (a values)
                silA.append(numpy.mean(Yt) * clusterPerCent)
                silBs = []
                for c2 in range(iSpeakers):  # compute distances from samples of other clusters
                    if c2 != c:
                        clusterPerCent2 = numpy.nonzero(cls == c2)[0].shape[0] / float(len(cls))
                        MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:, cls == c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T)
                        silBs.append(numpy.mean(Yt) * (clusterPerCent + clusterPerCent2) / 2.0)
                silBs = numpy.array(silBs)
                silB.append(min(silBs))  # ... and keep the minimum value (i.e. the distance from the "nearest" cluster)
        silA = numpy.array(silA)
        silB = numpy.array(silB)
        sil = []
        for c in range(iSpeakers):  # for each cluster (speaker)
            sil.append((silB[c] - silA[c]) / (max(silB[c], silA[c]) + 0.00001))  # compute silhouette
        silAll.append(numpy.mean(sil))  # keep the AVERAGE SILHOUETTE

    # silAll = silAll * (1.0/(numpy.power(numpy.array(sRange),0.5)))
    imax = numpy.argmax(silAll)  # position of the maximum silhouette value
    nSpeakersFinal = sRange[imax]  # optimal number of clusters
    return nSpeakersFinal
def trainHMM_fromDir(dirPath, hmmModelName, mtWin, mtStep):
    '''
    Train an HMM for joint segmentation-classification from a directory in
    which WAV files and their .segments ground-truth files are stored.

    ARGUMENTS:
     - dirPath:        the path of the data directory
     - hmmModelName:   the name of the HMM model file to be stored
     - mtWin:          mid-term window size
     - mtStep:         mid-term window step
    RETURNS:
     - hmm:            the resulting HMM object
     - classNames:     the list of class names found in the annotations
    RAISES:
     - ValueError:     if no WAV file in dirPath has a .segments file

    After training, hmm, the class names, mtWin and mtStep are pickled (in
    that order) into the hmmModelName file.
    '''
    flagsAll = numpy.array([])
    classesAll = []
    # Feature matrices are collected and concatenated once at the end. The
    # original initialised its accumulator only for glob index 0, which
    # broke whenever the first WAV file had no annotation.
    featureBlocks = []

    for wavFile in glob.glob(dirPath + os.sep + '*.wav'):  # for each WAV file
        # Swap only the extension; str.replace would also rewrite any
        # '.wav' occurring earlier in the path.
        gtFile = os.path.splitext(wavFile)[0] + '.segments'
        if not os.path.isfile(gtFile):  # no annotation -> skip
            continue
        [segStart, segEnd, segLabels] = readSegmentGT(gtFile)  # read GT data
        flags, classNames = segs2flags(segStart, segEnd, segLabels, mtStep)  # convert to flags
        for c in classNames:  # update class names
            if c not in classesAll:
                classesAll.append(c)

        [Fs, x] = audioBasicIO.readAudioFile(wavFile)  # read audio data
        [F, _] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs,
                                        round(Fs * 0.050), round(Fs * 0.050))  # feature extraction

        # Truncate features and labels to their common length.
        MIN = min(F.shape[1], len(flags))
        F = F[:, 0:MIN]
        flags = flags[0:MIN]

        # Map per-file class indices onto the global class list.
        flagsNew = [classesAll.index(classNames[fl]) for fl in flags]
        flagsAll = numpy.append(flagsAll, numpy.array(flagsNew))
        featureBlocks.append(F)

    if not featureBlocks:
        raise ValueError("trainHMM_fromDir: no annotated WAV files found in "
                         + str(dirPath))
    Fall = numpy.concatenate(featureBlocks, axis=1)

    startprob, transmat, means, cov = trainHMM_computeStatistics(
        Fall, flagsAll)  # compute HMM statistics
    hmm = sklearn.hmm.GaussianHMM(startprob.shape[0], "diag", startprob, transmat)  # build HMM
    hmm.means_ = means
    hmm.covars_ = cov

    # 'with' closes the model file even if pickling fails.
    with open(hmmModelName, "wb") as fo:  # save HMM model
        cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(classesAll, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mtWin, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mtStep, fo, protocol=cPickle.HIGHEST_PROTOCOL)

    return hmm, classesAll