def fileChromagramWrapper(wavFileName):
    if not os.path.isfile(wavFileName):
        raise Exception("Input audio file not found!")
    [Fs, x] = audioBasicIO.readAudioFile(wavFileName)
    x = audioBasicIO.stereo2mono(x)
    specgram, TimeAxis, FreqAxis = aF.stChromagram(x, Fs, round(Fs * 0.040),
                                                   round(Fs * 0.040), True)
def fileClassification(inputFile, modelName, modelType):
    # Load classifier:

    if not os.path.isfile(modelName):
        print "fileClassification: input modelName not found!"
        return (-1, -1, -1)

    if not os.path.isfile(inputFile):
        print "fileClassification: wav file not found!"
        return (-1, -1, -1)

    if modelType == 'svm':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadSVModel(modelName)
    elif modelType == 'knn':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadKNNModel(modelName)

    [Fs, x] = audioBasicIO.readAudioFile(inputFile)  # read audio file and convert to mono
    x = audioBasicIO.stereo2mono(x)
    # feature extraction:
    [MidTermFeatures, s] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stStep))
    MidTermFeatures = MidTermFeatures.mean(axis=1)  # long term averaging of mid-term statistics
    if computeBEAT:
        [beat, beatConf] = aF.beatExtraction(s, stStep)
        MidTermFeatures = numpy.append(MidTermFeatures, beat)
        MidTermFeatures = numpy.append(MidTermFeatures, beatConf)
    curFV = (MidTermFeatures - MEAN) / STD  # normalization

    [Result, P] = classifierWrapper(Classifier, modelType, curFV)  # classification
    return Result, P, classNames
def mtFeatureExtractionToFile(fileName, midTermSize, midTermStep, shortTermSize, shortTermStep, outPutFile,
                              storeStFeatures=False, storeToCSV=False, PLOT=False):
    """
    This function is used as a wrapper to:
    a) read the content of a WAV file
    b) perform mid-term feature extraction on that signal
    c) write the mid-term feature sequences to a numpy file
    """
    [Fs, x] = audioBasicIO.readAudioFile(fileName)            # read the wav file
    x = audioBasicIO.stereo2mono(x)                           # convert to MONO if required
    if storeStFeatures:
        [mtF, stF] = mtFeatureExtraction(x, Fs, round(Fs * midTermSize), round(Fs * midTermStep), round(Fs * shortTermSize), round(Fs * shortTermStep))
    else:
        [mtF, _] = mtFeatureExtraction(x, Fs, round(Fs*midTermSize), round(Fs * midTermStep), round(Fs * shortTermSize), round(Fs * shortTermStep))

    numpy.save(outPutFile, mtF)                              # save mt features to numpy file
    if PLOT:
        print "Mid-term numpy file: " + outPutFile + ".npy saved"
    if storeToCSV:
        numpy.savetxt(outPutFile+".csv", mtF.T, delimiter=",")
        if PLOT:
            print "Mid-term CSV file: " + outPutFile + ".csv saved"

    if storeStFeatures:
        numpy.save(outPutFile+"_st", stF)                    # save st features to numpy file
        if PLOT:
            print "Short-term numpy file: " + outPutFile + "_st.npy saved"
        if storeToCSV:
            numpy.savetxt(outPutFile+"_st.csv", stF.T, delimiter=",")    # store st features to CSV file
            if PLOT:
                print "Short-term CSV file: " + outPutFile + "_st.csv saved"
def dirWavFeatureExtraction(dirName, mtWin, mtStep, stWin, stStep, computeBEAT=False):
    """
    This function extracts the mid-term features of the WAVE files of a particular folder.

    The resulting feature vector is extracted by long-term averaging the mid-term features.
    Therefore ONE FEATURE VECTOR is extracted for each WAV file.

    ARGUMENTS:
        - dirName:        the path of the WAVE directory
        - mtWin, mtStep:    mid-term window and step (in seconds)
        - stWin, stStep:    short-term window and step (in seconds)
    """

    allMtFeatures = numpy.array([])
    processingTimes = []

    types = ('*.wav', '*.aif',  '*.aiff', '*.mp3','*.au')
    wavFilesList = []
    for files in types:
        wavFilesList.extend(glob.glob(os.path.join(dirName, files)))

    wavFilesList = sorted(wavFilesList)    
    wavFilesList2 = []
    for i, wavFile in enumerate(wavFilesList):        
        print "Analyzing file {0:d} of {1:d}: {2:s}".format(i+1, len(wavFilesList), wavFile.encode('utf-8'))
        if os.stat(wavFile).st_size == 0:
            print "   (EMPTY FILE -- SKIPPING)"
            continue        
        [Fs, x] = audioBasicIO.readAudioFile(wavFile)            # read file    
        if isinstance(x, int):
            continue        

        t1 = time.clock()        
        x = audioBasicIO.stereo2mono(x)                          # convert stereo to mono                
        if x.shape[0]<float(Fs)/10:
            print "  (AUDIO FILE TOO SMALL - SKIPPING)"
            continue
        wavFilesList2.append(wavFile)
        if computeBEAT:                                          # mid-term feature extraction for current file
            [MidTermFeatures, stFeatures] = mtFeatureExtraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs), round(Fs * stWin), round(Fs * stStep))
            [beat, beatConf] = beatExtraction(stFeatures, stStep)
        else:
            [MidTermFeatures, _] = mtFeatureExtraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs), round(Fs * stWin), round(Fs * stStep))

        MidTermFeatures = numpy.transpose(MidTermFeatures)
        MidTermFeatures = MidTermFeatures.mean(axis=0)         # long term averaging of mid-term statistics
        if (not numpy.isnan(MidTermFeatures).any()) and (not numpy.isinf(MidTermFeatures).any()):            
            if computeBEAT:
                MidTermFeatures = numpy.append(MidTermFeatures, beat)
                MidTermFeatures = numpy.append(MidTermFeatures, beatConf)
            if len(allMtFeatures) == 0:                              # append feature vector
                allMtFeatures = MidTermFeatures
            else:
                allMtFeatures = numpy.vstack((allMtFeatures, MidTermFeatures))
            t2 = time.clock()
            duration = float(len(x)) / Fs
            processingTimes.append((t2 - t1) / duration)
    if len(processingTimes) > 0:
        print "Feature extraction complexity ratio: {0:.1f} x realtime".format((1.0 / numpy.mean(numpy.array(processingTimes))))
    return (allMtFeatures, wavFilesList2)
def thumbnailWrapper(inputFile, thumbnailWrapperSize):
    stWindow = 1.0
    stStep = 1.0
    if not os.path.isfile(inputFile):
        raise Exception("Input audio file not found!")

    [Fs, x] = audioBasicIO.readAudioFile(inputFile)
    if Fs == -1:    # could not read file
        return

    [A1, A2, B1, B2, Smatrix] = aS.musicThumbnailing(x, Fs, stWindow, stStep,
                                                     thumbnailWrapperSize)

    # write thumbnailWrappers to WAV files:
    if inputFile.endswith(".wav"):
        thumbnailWrapperFileName1 = inputFile.replace(".wav", "_thumb1.wav")
        thumbnailWrapperFileName2 = inputFile.replace(".wav", "_thumb2.wav")
    if inputFile.endswith(".mp3"):
        thumbnailWrapperFileName1 = inputFile.replace(".mp3", "_thumb1.mp3")
        thumbnailWrapperFileName2 = inputFile.replace(".mp3", "_thumb2.mp3")        
    wavfile.write(thumbnailWrapperFileName1, Fs, x[int(Fs * A1):int(Fs * A2)])
    wavfile.write(thumbnailWrapperFileName2, Fs, x[int(Fs * B1):int(Fs * B2)])
    print "1st thumbnailWrapper (stored in file {0:s}): {1:4.1f}sec" \
          " -- {2:4.1f}sec".format(thumbnailWrapperFileName1, A1, A2)
    print "2nd thumbnailWrapper (stored in file {0:s}): {1:4.1f}sec" \
          " -- {2:4.1f}sec".format(thumbnailWrapperFileName2, B1, B2)

    # Plot self-similarity matrix:
    fig = plt.figure()
    ax = fig.add_subplot(111, aspect="auto")
    plt.imshow(Smatrix)
    # Plot best-similarity diagonal:
    Xcenter = (A1 / stStep + A2 / stStep) / 2.0
    Ycenter = (B1 / stStep + B2 / stStep) / 2.0

    e1 = matplotlib.patches.Ellipse((Ycenter, Xcenter),
                                    thumbnailWrapperSize * 1.4, 3, angle=45,
                                    linewidth=3, fill=False)
    ax.add_patch(e1)

    plt.plot([B1, Smatrix.shape[0]], [A1, A1], color="k",
             linestyle="--", linewidth=2)
    plt.plot([B2, Smatrix.shape[0]], [A2, A2], color="k",
             linestyle="--", linewidth=2)
    plt.plot([B1, B1], [A1, Smatrix.shape[0]], color="k",
             linestyle="--", linewidth=2)
    plt.plot([B2, B2], [A2, Smatrix.shape[0]], color="k",
             linestyle="--", linewidth=2)

    plt.xlim([0, Smatrix.shape[0]])
    plt.ylim([Smatrix.shape[1], 0])

    ax.yaxis.set_label_position("right")
    ax.yaxis.tick_right()

    plt.xlabel("frame no")
    plt.ylabel("frame no")
    plt.title("Self-similarity matrix")

    plt.show()
def trainHMM_fromDir(dirPath, hmmModelName, mtWin, mtStep):
    '''
    This function trains a HMM model for segmentation-classification using a where WAV files and .segment (ground-truth files) are stored
    ARGUMENTS:
     - dirPath:        the path of the data diretory
     - hmmModelName:    the name of the HMM model to be stored
     - mtWin:        mid-term window size
     - mtStep:        mid-term window step
    RETURNS:
     - hmm:            an object to the resulting HMM
     - classNames:        a list of classNames

    After training, hmm, classNames, along with the mtWin and mtStep values are stored in the hmmModelName file
    '''

    flagsAll = numpy.array([])
    classesAll = []
    for i, f in enumerate(glob.glob(dirPath + os.sep + '*.wav')):               # for each WAV file
        wavFile = f
        gtFile = f.replace('.wav', '.segments')                                 # open for annotated file
        if not os.path.isfile(gtFile):                                          # if current WAV file does not have annotation -> skip
            continue
        [segStart, segEnd, segLabels] = readSegmentGT(gtFile)                   # read GT data
        flags, classNames = segs2flags(segStart, segEnd, segLabels, mtStep)     # convert to flags
        for c in classNames:                                                    # update classnames:
            if c not in classesAll:
                classesAll.append(c)
        [Fs, x] = audioBasicIO.readAudioFile(wavFile)                           # read audio data
        [F, _] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * 0.050), round(Fs * 0.050))     # feature extraction

        lenF = F.shape[1]
        lenL = len(flags)
        MIN = min(lenF, lenL)
        F = F[:, 0:MIN]
        flags = flags[0:MIN]

        flagsNew = []
        for j, fl in enumerate(flags):      # append features and labels
            flagsNew.append(classesAll.index(classNames[flags[j]]))

        flagsAll = numpy.append(flagsAll, numpy.array(flagsNew))

        if i == 0:
            Fall = F
        else:
            Fall = numpy.concatenate((Fall, F), axis=1)
    startprob, transmat, means, cov = trainHMM_computeStatistics(Fall, flagsAll)        # compute HMM statistics
    hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], "diag", startprob, transmat)      # train HMM
    hmm.means_ = means
    hmm.covars_ = cov

    fo = open(hmmModelName, "wb")   # save HMM model
    cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(classesAll, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtWin, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtStep, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()

    return hmm, classesAll
Example #7
0
def beatExtractionWrapper(wavFileName, plot):
    if not os.path.isfile(wavFileName):
        raise Exception("Input audio file not found!")
    [Fs, x] = audioBasicIO.readAudioFile(wavFileName)
    F = aF.stFeatureExtraction(x, Fs, 0.050 * Fs, 0.050 * Fs)
    BPM, ratio = aF.beatExtraction(F, 0.050, plot)
    print "Beat: {0:d} bpm ".format(int(BPM))
    print "Ratio: {0:.2f} ".format(ratio)
Example #8
0
def silenceRemovalWrapper(inputFile, smoothingWindow, weight):
	if not os.path.isfile(inputFile):
		raise Exception("Input audio file not found!")
	
	[Fs, x] = audioBasicIO.readAudioFile(inputFile)						# read audio signal
	segmentLimits = aS.silenceRemoval(x, Fs, 0.05, 0.05, smoothingWindow, weight, True)	# get onsets
	for i, s in enumerate(segmentLimits):
		strOut = "{0:s}_{1:.3f}-{2:.3f}.wav".format(inputFile[0:-4], s[0], s[1])
		wavfile.write( strOut, Fs, x[int(Fs*s[0]):int(Fs*s[1])])
def classifyFolderWrapper(inputFolder, modelType, modelName, outputMode=False):
	if not os.path.isfile(modelName):
		raise Exception("Input modelName not found!")

	if modelType=='svm':
		[Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadSVModel(modelName)
	elif modelType=='knn':
		[Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadKNNModel(modelName)

	PsAll = numpy.zeros((len(classNames), ))	
		
	files = "*.wav"
	if os.path.isdir(inputFolder):
		strFilePattern = os.path.join(inputFolder, files)
	else:
		strFilePattern = inputFolder + files

	wavFilesList = []
	wavFilesList.extend(glob.glob(strFilePattern))
	wavFilesList = sorted(wavFilesList)
	if len(wavFilesList)==0:
		print "No WAV files found!"
		return 
	
	Results = []
	for wavFile in wavFilesList:	
		[Fs, x] = audioBasicIO.readAudioFile(wavFile)	
		signalLength = x.shape[0] / float(Fs)
		[Result, P, classNames] = aT.fileClassification(wavFile, modelName, modelType)					
		PsAll += (numpy.array(P) * signalLength)		
		Result = int(Result)
		Results.append(Result)
		if outputMode:
			print "{0:s}\t{1:s}".format(wavFile,classNames[Result])
	Results = numpy.array(Results)
	
	# print distribution of classes:
	[Histogram, _] = numpy.histogram(Results, bins=numpy.arange(len(classNames)+1))
	if outputMode:	
		for i,h in enumerate(Histogram):
			print "{0:20s}\t\t{1:d}".format(classNames[i], h)
	PsAll = PsAll / numpy.sum(PsAll)


	if outputMode:	
		fig = plt.figure()
		ax = fig.add_subplot(111)
		plt.title("Classes percentage " + inputFolder.replace('Segments',''))
		ax.axis((0, len(classNames)+1, 0, 1))
		ax.set_xticks(numpy.array(range(len(classNames)+1)))
		ax.set_xticklabels([" "] + classNames)
		ax.bar(numpy.array(range(len(classNames)))+0.5, PsAll)
		plt.show()
	return classNames, PsAll
def dirWavFeatureExtraction(dirName, mtWin, mtStep, stWin, stStep, computeBEAT=False):
    """
    This function extracts the mid-term features of the WAVE files of a particular folder.

    The resulting feature vector is extracted by long-term averaging the mid-term features.
    Therefore ONE FEATURE VECTOR is extracted for each WAV file.

    ARGUMENTS:
        - dirName:        the path of the WAVE directory
        - mtWin, mtStep:    mid-term window and step (in seconds)
        - stWin, stStep:    short-term window and step (in seconds)
    """

    allMtFeatures = numpy.array([])
    processingTimes = []

    types = ('*.wav', '*.aif', '*.aiff')
    wavFilesList = []
    for files in types:
        wavFilesList.extend(glob.glob(os.path.join(dirName, files)))

    wavFilesList = sorted(wavFilesList)

    for wavFile in wavFilesList:
        [Fs, x] = audioBasicIO.readAudioFile(wavFile)  # read file
        t1 = time.clock()
        x = audioBasicIO.stereo2mono(x)  # convert stereo to mono
        if computeBEAT:  # mid-term feature extraction for current file
            [MidTermFeatures, stFeatures] = mtFeatureExtraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs),
                                                                round(Fs * stWin), round(Fs * stStep))
            [beat, beatConf] = beatExtraction(stFeatures, stStep)
        else:
            [MidTermFeatures, _] = mtFeatureExtraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs), round(Fs * stWin),
                                                       round(Fs * stStep))

        MidTermFeatures = numpy.transpose(MidTermFeatures)
        MidTermFeatures = MidTermFeatures.mean(axis=0)  # long term averaging of mid-term statistics
        if computeBEAT:
            MidTermFeatures = numpy.append(MidTermFeatures, beat)
            MidTermFeatures = numpy.append(MidTermFeatures, beatConf)
        if len(allMtFeatures) == 0:  # append feature vector
            allMtFeatures = MidTermFeatures
        else:
            allMtFeatures = numpy.vstack((allMtFeatures, MidTermFeatures))
        t2 = time.clock()
        duration = float(len(x)) / Fs
        processingTimes.append((t2 - t1) / duration)
    if len(processingTimes) > 0:
        print "Feature extraction complexity ratio: {0:.1f} x realtime".format(
            (1.0 / numpy.mean(numpy.array(processingTimes))))
    return (allMtFeatures, wavFilesList)
def fileRegression(inputFile, modelName, modelType):
    # Load classifier:

    if not os.path.isfile(inputFile):
        print "fileClassification: wav file not found!"
        return (-1, -1, -1)

    regressionModels = glob.glob(modelName + "_*")
    regressionModels2 = []
    for r in regressionModels:
        if r[-5::] != "MEANS":
            regressionModels2.append(r)
    regressionModels = regressionModels2
    regressionNames = []
    for r in regressionModels:
        regressionNames.append(r[r.rfind("_") + 1::])

    # FEATURE EXTRACTION
    # LOAD ONLY THE FIRST MODEL (for mtWin, etc)
    if modelType == 'svm':
        [_, _, _, mtWin, mtStep, stWin, stStep, computeBEAT] = loadSVModel(regressionModels[0], True)
    elif modelType == 'knn':
        [_, _, _, mtWin, mtStep, stWin, stStep, computeBEAT] = loadKNNModel(regressionModels[0], True)

    [Fs, x] = audioBasicIO.readAudioFile(inputFile)  # read audio file and convert to mono
    x = audioBasicIO.stereo2mono(x)
    # feature extraction:
    [MidTermFeatures, s] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stStep))
    MidTermFeatures = MidTermFeatures.mean(axis=1)  # long term averaging of mid-term statistics
    if computeBEAT:
        [beat, beatConf] = aF.beatExtraction(s, stStep)
        MidTermFeatures = numpy.append(MidTermFeatures, beat)
        MidTermFeatures = numpy.append(MidTermFeatures, beatConf)

    # REGRESSION
    R = []
    for ir, r in enumerate(regressionModels):
        if not os.path.isfile(r):
            print "fileClassification: input modelName not found!"
            return (-1, -1, -1)
        if modelType == 'svm':
            [Model, MEAN, STD, mtWin, mtStep, stWin, stStep, computeBEAT] = loadSVModel(r, True)
        elif modelType == 'knn':
            [Model, MEAN, STD, mtWin, mtStep, stWin, stStep, computeBEAT] = loadKNNModel(r, True)
        curFV = (MidTermFeatures - MEAN) / STD  # normalization
        R.append(regressionWrapper(Model, modelType, curFV))  # classification
    return R, regressionNames
def hmmSegmentation(wavFileName, hmmModelName, PLOT=False, gtFileName=""):
    [Fs, x] = audioBasicIO.readAudioFile(wavFileName)          # read audio data

    try:
        fo = open(hmmModelName, "rb")
    except IOError:
        print "didn't find file"
        return

    try:
        hmm = cPickle.load(fo)
        classesAll = cPickle.load(fo)
        mtWin = cPickle.load(fo)
        mtStep = cPickle.load(fo)
    except:
        fo.close()
    fo.close()

    #Features = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs, 0.050*Fs);    # feature extraction
    [Features, _] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * 0.050), round(Fs * 0.050))
    flagsInd = hmm.predict(Features.T)                            # apply model
    #for i in range(len(flagsInd)):
    #    if classesAll[flagsInd[i]]=="silence":
    #        flagsInd[i]=classesAll.index("speech")
                   
                                                                             # plot results
    if os.path.isfile(gtFileName):
        [segStart, segEnd, segLabels] = readSegmentGT(gtFileName)
        flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels, mtStep)
        flagsGTNew = []
        for j, fl in enumerate(flagsGT):                        # "align" labels with GT
            if classNamesGT[flagsGT[j]] in classesAll:
                flagsGTNew.append(classesAll.index(classNamesGT[flagsGT[j]]))
            else:
                flagsGTNew.append(-1)
        CM = numpy.zeros((len(classNamesGT), len(classNamesGT)))
        flagsIndGT = numpy.array(flagsGTNew)
        for i in range(min(flagsInd.shape[0], flagsIndGT.shape[0])):
            CM[int(flagsIndGT[i]),int(flagsInd[i])] += 1                
    else:
        flagsIndGT = numpy.array([])    
    acc = plotSegmentationResults(flagsInd, flagsIndGT, classesAll, mtStep, not PLOT)
    if acc >= 0:
        print "Overall Accuracy: {0:.2f}".format(acc)
        return (flagsInd, classNamesGT, acc, CM)
    else:
        return (flagsInd, classesAll, -1, -1)
def dirWavFeatureExtractionNoAveraging(dirName, mtWin, mtStep, stWin, stStep):
    """
    This function extracts the mid-term features of the WAVE files of a particular folder without averaging each file.

    ARGUMENTS:
        - dirName:          the path of the WAVE directory
        - mtWin, mtStep:    mid-term window and step (in seconds)
        - stWin, stStep:    short-term window and step (in seconds)
    RETURNS:
        - X:                A feature matrix
        - Y:                A matrix of file labels
        - filenames:
    """

    allMtFeatures = numpy.array([])
    signalIndices = numpy.array([])
    processingTimes = []

    types = ('*.wav', '*.aif',  '*.aiff')
    wavFilesList = []
    for files in types:
        wavFilesList.extend(glob.glob(os.path.join(dirName, files)))

    wavFilesList = sorted(wavFilesList)

    for i, wavFile in enumerate(wavFilesList):
        [Fs, x] = audioBasicIO.readAudioFile(wavFile)            # read file
        if isinstance(x, int):
            continue        
        
        x = audioBasicIO.stereo2mono(x)                          # convert stereo to mono
        [MidTermFeatures, _] = mtFeatureExtraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs), round(Fs * stWin), round(Fs * stStep))  # mid-term feature

        MidTermFeatures = numpy.transpose(MidTermFeatures)
#        MidTermFeatures = MidTermFeatures.mean(axis=0)        # long term averaging of mid-term statistics
        if len(allMtFeatures) == 0:                # append feature vector
            allMtFeatures = MidTermFeatures
            signalIndices = numpy.zeros((MidTermFeatures.shape[0], ))
        else:
            allMtFeatures = numpy.vstack((allMtFeatures, MidTermFeatures))
            signalIndices = numpy.append(signalIndices, i * numpy.ones((MidTermFeatures.shape[0], )))

    return (allMtFeatures, signalIndices, wavFilesList)
Example #14
0
def fileClassification(inputFile, modelName, modelType):
    # Load classifier:

    if not os.path.isfile(modelName):
        print "fileClassification: input modelName not found!"
        return (-1, -1, -1)

    if not os.path.isfile(inputFile):
        print "fileClassification: wav file not found!"
        return (-1, -1, -1)

    if modelType == 'svm':
        [
            Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            computeBEAT
        ] = loadSVModel(modelName)
    elif modelType == 'knn':
        [
            Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            computeBEAT
        ] = loadKNNModel(modelName)

    [Fs, x] = audioBasicIO.readAudioFile(
        inputFile)  # read audio file and convert to mono
    x = audioBasicIO.stereo2mono(x)
    # feature extraction:
    [MidTermFeatures, s] = aF.mtFeatureExtraction(x, Fs,
                                                  mtWin * Fs, mtStep * Fs,
                                                  round(Fs * stWin),
                                                  round(Fs * stStep))
    MidTermFeatures = MidTermFeatures.mean(
        axis=1)  # long term averaging of mid-term statistics
    if computeBEAT:
        [beat, beatConf] = aF.beatExtraction(s, stStep)
        MidTermFeatures = numpy.append(MidTermFeatures, beat)
        MidTermFeatures = numpy.append(MidTermFeatures, beatConf)
    curFV = (MidTermFeatures - MEAN) / STD
    # normalization

    [Result, P] = classifierWrapper(Classifier, modelType,
                                    curFV)  # classification
    return Result, P, classNames
Example #15
0
def hmmSegmentation(wavFileName, hmmModelName, PLOT=False, gtFileName=""):
    [Fs, x] = audioBasicIO.readAudioFile(wavFileName)          # read audio data

    try:
        fo = open(hmmModelName, "rb")
    except IOError:
        print "didn't find file"
        return

    try:
        hmm = cPickle.load(fo)
        classesAll = cPickle.load(fo)
        mtWin = cPickle.load(fo)
        mtStep = cPickle.load(fo)
    except:
        fo.close()
    fo.close()
    #Features = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs, 0.050*Fs);    # feature extraction
    [Features, _] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * 0.050), round(Fs * 0.050))
    flagsInd = hmm.predict(Features.T)                            # apply model
    #for i in range(len(flagsInd)):
    #    if classesAll[flagsInd[i]]=="silence":
    #        flagsInd[i]=classesAll.index("speech")
                                                # plot results
    if os.path.isfile(gtFileName):
        [segStart, segEnd, segLabels] = readSegmentGT(gtFileName)
        flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels, mtStep)
        flagsGTNew = []
        for j, fl in enumerate(flagsGT):                        # "align" labels with GT
            if classNamesGT[flagsGT[j]] in classesAll:
                flagsGTNew.append(classesAll.index(classNamesGT[flagsGT[j]]))
            else:
                flagsGTNew.append(-1)
        flagsIndGT = numpy.array(flagsGTNew)
    else:
        flagsIndGT = numpy.array([])
    acc = plotSegmentationResults(flagsInd, flagsIndGT, classesAll, mtStep, not PLOT)
    if acc >= 0:
        print "Overall Accuracy: {0:.2f}".format(acc)

    return flagsInd, classesAll, acc
Example #16
0
def dirWavFeatureExtractionNoAveraging(dirName, mtWin, mtStep, stWin, stStep):
	"""
	This function extracts the mid-term features of the WAVE files of a particular folder without averaging each file.

	
	ARGUMENTS:
		- dirName:		the path of the WAVE directory
		- mtWin, mtStep:	mid-term window and step (in seconds)
		- stWin, stStep:	short-term window and step (in seconds)
	RETURNS:
		- X:			A feature matrix
		- Y:			A matrix of file labels 
		- filenames:
	"""

	allMtFeatures = numpy.array([])
	signalIndices = numpy.array([])
 	processingTimes = []

	types = ('*.wav', '*.aif',  '*.aiff')
	wavFilesList = []
	for files in types:
		wavFilesList.extend(glob.glob(os.path.join(dirName, files)))
	
	wavFilesList = sorted(wavFilesList)
	
	for i, wavFile in enumerate(wavFilesList):
		[Fs, x] = audioBasicIO.readAudioFile(wavFile)			# read file	
		x = audioBasicIO.stereo2mono(x);				# convert stereo to mono
		[MidTermFeatures, _] = 	mtFeatureExtraction(x, Fs, round(mtWin*Fs), round(mtStep*Fs), round(Fs*stWin), round(Fs*stStep)) # mid-term feature

		MidTermFeatures = numpy.transpose(MidTermFeatures)
#		MidTermFeatures = MidTermFeatures.mean(axis=0)		# long term averaging of mid-term statistics
		if len(allMtFeatures)==0:				# append feature vector
			allMtFeatures = MidTermFeatures
			signalIndices = numpy.zeros( (MidTermFeatures.shape[0], ) )
		else:
			allMtFeatures = numpy.vstack((allMtFeatures, MidTermFeatures))			
			signalIndices = numpy.append( signalIndices, i*numpy.ones( (MidTermFeatures.shape[0], ) ))

	return (allMtFeatures, signalIndices, wavFilesList)
Example #17
0
def fileClassification(inputFile, modelName, modelType):
    # Load classifier:

    if not os.path.isfile(modelName):
        print ("fileClassification: input modelName not found!")
        return (-1, -1, -1)

    if not os.path.isfile(inputFile):
        print ("fileClassification: wav file not found!")
        return (-1, -1, -1)

    if (modelType) == 'svm' or (modelType == 'svm_rbf'):
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadSVModel(modelName)
    elif modelType == 'knn':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadKNNModel(modelName)
    elif modelType == 'randomforest':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadRandomForestModel(modelName)
    elif modelType == 'gradientboosting':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadGradientBoostingModel(modelName)
    elif modelType == 'extratrees':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadExtraTreesModel(modelName)

    [Fs, x] = audioBasicIO.readAudioFile(inputFile)        # read audio file and convert to mono
    x = audioBasicIO.stereo2mono(x)

    if isinstance(x, int):                                 # audio file IO problem
        return (-1, -1, -1)
    if x.shape[0] / float(Fs) <= mtWin:
        return (-1, -1, -1)

    # feature extraction:
    [MidTermFeatures, s] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stStep))
    MidTermFeatures = MidTermFeatures.mean(axis=1)        # long term averaging of mid-term statistics
    if computeBEAT:
        [beat, beatConf] = aF.beatExtraction(s, stStep)
        MidTermFeatures = numpy.append(MidTermFeatures, beat)
        MidTermFeatures = numpy.append(MidTermFeatures, beatConf)
    curFV = (MidTermFeatures - MEAN) / STD                # normalization

    [Result, P] = classifierWrapper(Classifier, modelType, curFV)    # classification        
    return Result, P, classNames
def fileClassification(inputFile, modelName, modelType):
    # Load classifier:

    if not os.path.isfile(modelName):
        print "fileClassification: input modelName not found!"
        return (-1, -1, -1)

    if not os.path.isfile(inputFile):
        print "fileClassification: wav file not found!"
        return (-1, -1, -1)

    if (modelType) == 'svm' or (modelType == 'svm_rbf'):
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadSVModel(modelName)
    elif modelType == 'knn':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadKNNModel(modelName)
    elif modelType == 'randomforest':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadRandomForestModel(modelName)
    elif modelType == 'gradientboosting':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadGradientBoostingModel(modelName)
    elif modelType == 'extratrees':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadExtraTreesModel(modelName)

    [Fs, x] = audioBasicIO.readAudioFile(inputFile)        # read audio file and convert to mono
    x = audioBasicIO.stereo2mono(x)

    if isinstance(x, int):                                 # audio file IO problem
        return (-1, -1, -1)
    if x.shape[0] / float(Fs) <= mtWin:
        return (-1, -1, -1)

    # feature extraction:
    [MidTermFeatures, s] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stStep))
    MidTermFeatures = MidTermFeatures.mean(axis=1)        # long term averaging of mid-term statistics
    if computeBEAT:
        [beat, beatConf] = aF.beatExtraction(s, stStep)
        MidTermFeatures = numpy.append(MidTermFeatures, beat)
        MidTermFeatures = numpy.append(MidTermFeatures, beatConf)
    curFV = (MidTermFeatures - MEAN) / STD                # normalization

    [Result, P] = classifierWrapper(Classifier, modelType, curFV)    # classification        
    return Result, P, classNames
def trainHMM_fromFile(wavFile, gtFile, hmmModelName, mtWin, mtStep):
    """
    This function trains a HMM model for segmentation-classification using a single annotated audio file
    ARGUMENTS:
     - wavFile:        the path of the audio filename
     - gtFile:         the path of the ground truth filename
                       (a csv file of the form <segment start in seconds>,<segment end in seconds>,<segment label> in each row
     - hmmModelName:   the name of the HMM model to be stored
     - mtWin:          mid-term window size
     - mtStep:         mid-term window step
    RETURNS:
     - hmm:            an object to the resulting HMM
     - classNames:     a list of classNames

    After training, hmm, classNames, along with the mtWin and mtStep values are stored in the hmmModelName file
    """

    [segStart, segEnd, segLabels] = readSegmentGT(gtFile)  # read ground truth data
    flags, classNames = segs2flags(segStart, segEnd, segLabels, mtStep)  # convert to fix-sized sequence of flags

    [Fs, x] = audioBasicIO.readAudioFile(wavFile)  # read audio data
    # F = aF.stFeatureExtraction(x, Fs, 0.050*Fs, 0.050*Fs);
    [F, _] = aF.mtFeatureExtraction(
        x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * 0.050), round(Fs * 0.050)
    )  # feature extraction
    startprob, transmat, means, cov = trainHMM_computeStatistics(
        F, flags
    )  # compute HMM statistics (priors, transition matrix, etc)

    hmm = sklearn.hmm.GaussianHMM(startprob.shape[0], "diag", startprob, transmat)  # hmm training
    hmm.means_ = means
    hmm.covars_ = cov

    fo = open(hmmModelName, "wb")  # output to file
    cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtWin, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtStep, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()

    return hmm, classNames
def mtFeatureExtractionToFile(fileName, midTermSize, midTermStep, shortTermSize, shortTermStep, outPutFile,
                              storeStFeatures=False, storeToCSV=False, PLOT=False):
    """
    This function is used as a wrapper to:
    a) read the content of a WAV file
    b) perform mid-term feature extraction on that signal
    c) write the mid-term feature sequences to a numpy file
    """
    [Fs, x] = audioBasicIO.readAudioFile(fileName)            # read the wav file
 #   print("fs is ........",Fs)
    x = audioBasicIO.stereo2mono(x)                           # convert to MONO if required
    if storeStFeatures:
        [mtF, stF] = mtFeatureExtraction(x, Fs, round(Fs * midTermSize), round(Fs * midTermStep), round(Fs * shortTermSize), round(Fs * shortTermStep))
    else:
        [mtF, _] = mtFeatureExtraction(x, Fs, round(Fs*midTermSize), round(Fs * midTermStep), round(Fs * shortTermSize), round(Fs * shortTermStep))

    numpy.save(outPutFile, mtF)                              # save mt features to numpy file
    if PLOT:
        print "Mid-term numpy file: " + outPutFile + ".npy saved"
    if storeToCSV:
        path="/Users/somyagoel/Flask/basic_app/music/mt/"
        newpath = outPutFile.split("/")[-1]
        numpy.savetxt(path+newpath+".csv", mtF.T, delimiter=",")
        if PLOT:
            print "Mid-term CSV file: " + outPutFile + ".csv saved"

    if storeStFeatures:
        numpy.save(outPutFile+"_st", stF)                    # save st features to numpy file
        if PLOT:
            print "Short-term numpy file: " + outPutFile + "_st.npy saved"
        if storeToCSV:
            #  os.getcwd()
            path="/Users/somyagoel/Flask/basic_app/music/st/"
            #  os.chdir(path)
            newpath = outPutFile.split("/")[-1]
            numpy.savetxt(path+newpath+"_st.csv", stF.T, delimiter=",")    # store st features to CSV file
            #      path="/Users/somyagoel/dir_fe"
            #os.chdir(path)
            if PLOT:
                print "Short-term CSV file: " + outPutFile + "_st.csv saved"
def trainHMM_fromFile(wavFile, gtFile, hmmModelName, mtWin, mtStep):
    '''
    This function trains a HMM model for segmentation-classification using a single annotated audio file
    ARGUMENTS:
     - wavFile:        the path of the audio filename
     - gtFile:         the path of the ground truth filename
                       (a csv file of the form <segment start in seconds>,<segment end in seconds>,<segment label> in each row
     - hmmModelName:   the name of the HMM model to be stored
     - mtWin:          mid-term window size
     - mtStep:         mid-term window step
    RETURNS:
     - hmm:            an object to the resulting HMM
     - classNames:     a list of classNames

    After training, hmm, classNames, along with the mtWin and mtStep values are stored in the hmmModelName file
    '''

    [segStart, segEnd, segLabels] = readSegmentGT(gtFile)                        # read ground truth data
    flags, classNames = segs2flags(segStart, segEnd, segLabels, mtStep)          # convert to fix-sized sequence of flags

    [Fs, x] = audioBasicIO.readAudioFile(wavFile)                                # read audio data
    #F = aF.stFeatureExtraction(x, Fs, 0.050*Fs, 0.050*Fs);
    [F, _] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * 0.050), round(Fs * 0.050))    # feature extraction
    startprob, transmat, means, cov = trainHMM_computeStatistics(F, flags)                    # compute HMM statistics (priors, transition matrix, etc)
    
    hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], "diag")            # hmm training

    hmm.startprob_ = startprob
    hmm.transmat_ = transmat    
    hmm.means_ = means
    hmm.covars_ = cov
    
    fo = open(hmmModelName, "wb")                                                             # output to file
    cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtWin, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtStep, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()

    return hmm, classNames
def annotation2files(wavFile, csvFile):
    '''
        Break an audio stream to segments of interest, 
        defined by a csv file
        
        - wavFile:    path to input wavfile
        - csvFile:    path to csvFile of segment limits
        
        Input CSV file must be of the format <T1>\t<T2>\t<Label>
    '''    
    
    [Fs, x] = audioBasicIO.readAudioFile(wavFile)
    with open(csvFile, 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='|')
        for j, row in enumerate(reader):
            T1 = float(row[0].replace(",","."))
            T2 = float(row[1].replace(",","."))            
            label = "%s_%s_%.2f_%.2f.wav" % (wavFile, row[2], T1, T2)
            label = label.replace(" ", "_")
            xtemp = x[int(round(T1*Fs)):int(round(T2*Fs))]            
            print T1, T2, label, xtemp.shape
            wavfile.write(label, Fs, xtemp)  
def annotation2files(wavFile, csvFile):
    '''
        Break an audio stream to segments of interest, 
        defined by a csv file

        - wavFile:    path to input wavfile
        - csvFile:    path to csvFile of segment limits

        Input CSV file must be of the format <T1>\t<T2>\t<Label>
    '''

    [Fs, x] = audioBasicIO.readAudioFile(wavFile)
    with open(csvFile, 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='|')
        for j, row in enumerate(reader):
            T1 = float(row[0].replace(",", "."))
            T2 = float(row[1].replace(",", "."))
            label = "%s_%s_%.2f_%.2f.wav" % (wavFile, row[2], T1, T2)
            label = label.replace(" ", "_")
            xtemp = x[int(round(T1 * Fs)):int(round(T2 * Fs))]
            print(T1, T2, label, xtemp.shape)
            wavfile.write(label, Fs, xtemp)
def trainHMM_fromFile(wav_file, gt_file, hmm_model_name, mt_win, mt_step):
    '''
	This function trains a HMM model for segmentation-classification using a single annotated audio file
	ARGUMENTS:
	 - wav_file:        the path of the audio filename
	 - gt_file:         the path of the ground truth filename
					   (a csv file of the form <segment start in seconds>,<segment end in seconds>,<segment label> in each row
	 - hmm_model_name:   the name of the HMM model to be stored
	 - mt_win:          mid-term window size
	 - mt_step:         mid-term window step
	RETURNS:
	 - hmm:            an object to the resulting HMM
	 - class_names:     a list of class_names

	After training, hmm, class_names, along with the mt_win and mt_step values are stored in the hmm_model_name file
	'''

    [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
    flags, class_names = segs2flags(seg_start, seg_end, seg_labs, mt_step)
    [fs, x] = audioBasicIO.readAudioFile(wav_file)
    [F, _, _] = aF.mtFeatureExtraction(x, fs, mt_win * fs, mt_step * fs,
                                       round(fs * 0.050), round(fs * 0.050))
    start_prob, transmat, means, cov = trainHMM_computeStatistics(F, flags)
    hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")

    hmm.startprob_ = start_prob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov

    fo = open(hmm_model_name, "wb")
    cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(class_names, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mt_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mt_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()

    return hmm, class_names
Example #25
0
def main(argv):
    #filename = "diarizationExample.wav"
    filename = argv[1]
    for i in range(nExp):
        [Fs1, x1] = audioBasicIO.readAudioFile(filename)
        tcls = aS.speakerDiarization(filename, 2, LDAdim=0, PLOT=False)
        audio = AudioSegment.from_wav(filename)
        audio_length = len(audio)
        change_point = [[] for i in range(2)]
        for j in range(len(tcls) - 1):
            if tcls[j] != tcls[j + 1]:
                change_point[0].append(j)
                change_point[1].append(int(tcls[j]))
            if j == len(tcls) - 2:
                change_point[1].append(int(tcls[j]))
        for num in range(len(change_point[1])):
            if num == 0:
                seg_audio = audio[:audio_length * change_point[0][0] /
                                  len(tcls)]
                seg_audio.export(os.path.join(
                    argv[2],
                    "seg0_speaker{0}.wav".format(change_point[1][num])),
                                 format="wav")
            elif num == len(change_point[1]) - 1:
                seg_audio = audio[audio_length * change_point[0][num - 1] /
                                  len(tcls):]
                seg_audio.export(os.path.join(
                    argv[2],
                    "seg{0}_speaker{1}.wav".format(num, change_point[1][num])),
                                 format="wav")
            else:
                seg_audio = audio[audio_length * change_point[0][num - 1] /
                                  len(tcls):audio_length *
                                  change_point[0][num] / len(tcls)]
                seg_audio.export(os.path.join(
                    argv[2],
                    "seg{0}_speaker{1}.wav".format(num, change_point[1][num])),
                                 format="wav")
Example #26
0
def mtFeatureExtraction(fileName, midTermSize, midTermStep, shortTermSize,
                        shortTermStep):
    allMtFeatures = numpy.array([])
    numpy.set_printoptions(suppress=True)
    """
    This function is used as a wrapper to:
    a) read the content of a WAV file
    b) perform mid-term feature extraction on that signal
    c) write the mid-term feature sequences to a numpy file
    """
    [Fs, x] = audioBasicIO.readAudioFile(fileName)  # read the wav file
    x = audioBasicIO.stereo2mono(x)  # convert to MONO if required
    mtWinRatio = int(round(midTermSize / shortTermStep))
    mtStepRatio = int(round(midTermStep / shortTermStep))
    mtFeatures = []
    stFeatures = stFeatureExtraction(x, Fs, shortTermSize * Fs,
                                     shortTermStep * Fs)
    numOfFeatures = len(stFeatures)
    numOfStatistics = 2
    mtFeatures = []
    #for i in range(numOfStatistics * numOfFeatures + 1):
    for i in range(numOfStatistics * numOfFeatures):
        mtFeatures.append([])
    for i in range(numOfFeatures):  # for each of the short-term features:
        curPos = 0
        N = len(stFeatures[i])
        while (curPos < N):
            N1 = curPos
            N2 = curPos + mtWinRatio
            if N2 > N:
                N2 = N
            curStFeatures = stFeatures[i][N1:N2]
            mtFeatures[i].append(numpy.mean(curStFeatures))
            mtFeatures[i + numOfFeatures].append(numpy.std(curStFeatures))
            #mtFeatures[i+2*numOfFeatures].append(numpy.std(curStFeatures) / (numpy.mean(curStFeatures)+0.00000010))
            curPos += mtStepRatio
    return (numpy.array(mtFeatures), stFeatures, Fs, x)
Example #27
0
def getMusicSegmentsFromFile(inputFile):
    modelType = "svm"
    modelName = "data/svmMovies8classes"

    dirOutput = inputFile[0:-4] + "_musicSegments"

    if os.path.exists(dirOutput) and dirOutput != ".":
        shutil.rmtree(dirOutput)
    os.makedirs(dirOutput)

    [Fs, x] = audioBasicIO.readAudioFile(inputFile)

    if modelType == 'svm':
        [
            Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            compute_beat
        ] = aT.load_model(modelName)
    elif modelType == 'knn':
        [
            Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            compute_beat
        ] = aT.load_model_knn(modelName)

    flagsInd, classNames, acc, CM = aS.mtFileClassification(inputFile,
                                                            modelName,
                                                            modelType,
                                                            plotResults=False,
                                                            gtFile="")
    segs, classes = aS.flags2segs(flagsInd, mtStep)

    for i, s in enumerate(segs):
        if (classNames[int(classes[i])]
                == "Music") and (s[1] - s[0] >= minDuration):
            strOut = "{0:s}{1:.3f}-{2:.3f}.wav".format(dirOutput + os.sep,
                                                       s[0], s[1])
            wavfile.write(strOut, Fs, x[int(Fs * s[0]):int(Fs * s[1])])
def getMusicSegmentsFromFile(inputFile):	
	modelType = "svm"
	modelName = "data/svmMovies8classes"
	
	dirOutput = inputFile[0:-4] + "_musicSegments"
	
	if os.path.exists(dirOutput) and dirOutput!=".":
		shutil.rmtree(dirOutput)	
	os.makedirs(dirOutput)	
	
	[Fs, x] = audioBasicIO.readAudioFile(inputFile)	

	if modelType=='svm':
		[Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadSVModel(modelName)
	elif modelType=='knn':
		[Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadKNNModel(modelName)

	flagsInd, classNames, acc = aS.mtFileClassification(inputFile, modelName, modelType, plotResults = False, gtFile = "")
	segs, classes = aS.flags2segs(flagsInd, mtStep)

	for i, s in enumerate(segs):
		if (classNames[int(classes[i])] == "Music") and (s[1] - s[0] >= minDuration):
			strOut = "{0:s}{1:.3f}-{2:.3f}.wav".format(dirOutput+os.sep, s[0], s[1])	
			wavfile.write( strOut, Fs, x[int(Fs*s[0]):int(Fs*s[1])])
def thumbnailWrapper(inputFile, thumbnailWrapperSize):
    stWindow = 1.0
    stStep = 1.0
    if not os.path.isfile(inputFile):
        raise Exception("Input audio file not found!")

    [Fs, x] = audioBasicIO.readAudioFile(inputFile)  # read file
    if Fs == -1:  # could not read file
        return

    [A1, A2, B1, B2, Smatrix] = aS.musicThumbnailing(
        x, Fs, stWindow, stStep,
        thumbnailWrapperSize)  # find thumbnailWrapper endpoints

    # write thumbnailWrappers to WAV files:
    thumbnailWrapperFileName1 = inputFile.replace(".wav", "_thumb1.wav")
    thumbnailWrapperFileName2 = inputFile.replace(".wav", "_thumb2.wav")
    wavfile.write(thumbnailWrapperFileName1, Fs, x[int(Fs * A1):int(Fs * A2)])
    wavfile.write(thumbnailWrapperFileName2, Fs, x[int(Fs * B1):int(Fs * B2)])
    print "1st thumbnailWrapper (stored in file {0:s}): {1:4.1f}sec -- {2:4.1f}sec".format(
        thumbnailWrapperFileName1, A1, A2)
    print "2nd thumbnailWrapper (stored in file {0:s}): {1:4.1f}sec -- {2:4.1f}sec".format(
        thumbnailWrapperFileName2, B1, B2)

    # Plot self-similarity matrix:
    fig = plt.figure()
    ax = fig.add_subplot(111, aspect="auto")
    plt.imshow(Smatrix)
    # Plot best-similarity diagonal:
    Xcenter = (A1 / stStep + A2 / stStep) / 2.0
    Ycenter = (B1 / stStep + B2 / stStep) / 2.0

    e1 = matplotlib.patches.Ellipse((Ycenter, Xcenter),
                                    thumbnailWrapperSize * 1.4,
                                    3,
                                    angle=45,
                                    linewidth=3,
                                    fill=False)
    ax.add_patch(e1)

    plt.plot([B1, Smatrix.shape[0]], [A1, A1],
             color="k",
             linestyle="--",
             linewidth=2)
    plt.plot([B2, Smatrix.shape[0]], [A2, A2],
             color="k",
             linestyle="--",
             linewidth=2)
    plt.plot([B1, B1], [A1, Smatrix.shape[0]],
             color="k",
             linestyle="--",
             linewidth=2)
    plt.plot([B2, B2], [A2, Smatrix.shape[0]],
             color="k",
             linestyle="--",
             linewidth=2)

    plt.xlim([0, Smatrix.shape[0]])
    plt.ylim([Smatrix.shape[1], 0])

    ax.yaxis.set_label_position("right")
    ax.yaxis.tick_right()

    plt.xlabel("frame no")
    plt.ylabel("frame no")
    plt.title("Self-similarity matrix")

    plt.show()
Example #30
0
import audioBasicIO
import audioFeatureExtraction
import matplotlib.pyplot as plt
import numpy
print("COUNT 1\n")
[Fs1, x1] = audioBasicIO.readAudioFile("data/practice.wav")
F1 = audioFeatureExtraction.stFeatureExtraction(x1, Fs1, 0.050 * Fs1,
                                                0.025 * Fs1)
# F1[12*420] MATRIX
print(len(F1[9:21]), len(F1[9:21][0]))

print("\n\nCOUNT 2\n")
[Fs2, x2] = audioBasicIO.readAudioFile("data/practice2.wav")
F2 = audioFeatureExtraction.stFeatureExtraction(x2, Fs2, 0.050 * Fs2,
                                                0.025 * Fs2)

print(len(F2[9:21]), len(F2[9:21][0]))

size = min(len(F2[9:21][0]), len(F1[9:21][0]))
print(size, "\n")

print("\n\nCORRCOEF\n")
print(numpy.corrcoef(F1[9:21, 0:size], F2[9:21, 0:size]) * 0.5 + 0.5)

print("\n\nE Distance\n")
print(numpy.linalg.norm(F1[9:21, 0:size] - F2[9:21, 0:size]))
def speakerDiarization(filename,
                       n_speakers,
                       mt_size=2.0,
                       mt_step=0.2,
                       st_win=0.05,
                       lda_dim=35,
                       plot_res=False):
    '''
	ARGUMENTS:
		- filename:        the name of the WAV file to be analyzed
		- n_speakers    the number of speakers (clusters) in the recording (<=0 for unknown)
		- mt_size (opt)     mid-term window size
		- mt_step (opt)     mid-term window step
		- st_win  (opt)     short-term window size
		- lda_dim (opt)     LDA dimension (0 for no LDA)
		- plot_res     (opt)   0 for not plotting the results 1 for plottingy
	'''
    [fs, x] = audioBasicIO.readAudioFile(filename)
    x = audioBasicIO.stereo2mono(x)
    duration = len(x) / fs

    # [classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.load_model_knn(os.path.join(os.path.dirname(os.path.realpath(__file__)), "data", "knnSpeakerAll"))
    # [classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.load_model_knn(os.path.join(os.path.dirname(os.path.realpath(__file__)), "data", "knnSpeakerFemaleMale"))
    [
        classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1,
        stStep1, computeBEAT1
    ] = aT.load_model_knn("data/knnSpeakerAll")
    [
        classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2,
        stStep2, computeBEAT2
    ] = aT.load_model_knn("data/knnSpeakerFemaleMale")

    [mt_feats, st_feats, _] = aF.mtFeatureExtraction(x, fs, mt_size * fs,
                                                     mt_step * fs,
                                                     round(fs * st_win),
                                                     round(fs * st_win * 0.5))

    MidTermFeatures2 = numpy.zeros(
        (mt_feats.shape[0] + len(classNames1) + len(classNames2),
         mt_feats.shape[1]))

    for i in range(mt_feats.shape[1]):
        cur_f1 = (mt_feats[:, i] - MEAN1) / STD1
        cur_f2 = (mt_feats[:, i] - MEAN2) / STD2
        [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
        [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
        MidTermFeatures2[0:mt_feats.shape[0], i] = mt_feats[:, i]
        MidTermFeatures2[mt_feats.shape[0]:mt_feats.shape[0] +
                         len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[mt_feats.shape[0] + len(classNames1)::,
                         i] = P2 + 0.0001

    mt_feats = MidTermFeatures2  # TODO
    iFeaturesSelect = [
        8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45,
        46, 47, 48, 49, 50, 51, 52, 53
    ]

    mt_feats = mt_feats[iFeaturesSelect, :]

    (mt_feats_norm, MEAN, STD) = aT.normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    n_wins = mt_feats.shape[1]

    # remove outliers:
    dist_all = numpy.sum(distance.squareform(distance.pdist(mt_feats_norm.T)),
                         axis=0)
    m_dist_all = numpy.mean(dist_all)
    i_non_outliers = numpy.nonzero(dist_all < 1.2 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = numpy.min(mt_feats[1,:])
    #EnergyMean = numpy.mean(mt_feats[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #i_non_outliers = numpy.nonzero(mt_feats[1,:] > Thres)[0]
    #print i_non_outliers

    perOutLier = (100.0 * (n_wins - i_non_outliers.shape[0])) / n_wins
    mt_feats_norm_or = mt_feats_norm
    mt_feats_norm = mt_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        #[mt_feats_to_red, _, _] = aF.mtFeatureExtraction(x, fs, mt_size * fs, st_win * fs, round(fs*st_win), round(fs*st_win));
        # extract mid-term features with minimum step:
        mt_win_ratio = int(round(mt_size / st_win))
        mt_step_ratio = int(round(st_win / st_win))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        #for i in range(num_of_stats * num_of_features + 1):
        for i in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        for i in range(
                num_of_features):  # for each of the short-term features:
            curPos = 0
            N = len(st_feats[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mt_win_ratio
                if N2 > N:
                    N2 = N
                curStFeatures = st_feats[i][N1:N2]
                mt_feats_to_red[i].append(numpy.mean(curStFeatures))
                mt_feats_to_red[i + num_of_features].append(
                    numpy.std(curStFeatures))
                curPos += mt_step_ratio
        mt_feats_to_red = numpy.array(mt_feats_to_red)
        mt_feats_to_red_2 = numpy.zeros(
            (mt_feats_to_red.shape[0] + len(classNames1) + len(classNames2),
             mt_feats_to_red.shape[1]))
        for i in range(mt_feats_to_red.shape[1]):
            cur_f1 = (mt_feats_to_red[:, i] - MEAN1) / STD1
            cur_f2 = (mt_feats_to_red[:, i] - MEAN2) / STD2
            [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
            [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0],
                              i] = mt_feats_to_red[:, i]
            mt_feats_to_red_2[
                mt_feats_to_red.shape[0]:mt_feats_to_red.shape[0] +
                len(classNames1), i] = P1 + 0.0001
            mt_feats_to_red_2[mt_feats_to_red.shape[0] + len(classNames1)::,
                              i] = P2 + 0.0001
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[iFeaturesSelect, :]
        #mt_feats_to_red += numpy.random.rand(mt_feats_to_red.shape[0], mt_feats_to_red.shape[1]) * 0.0000010
        (mt_feats_to_red, MEAN,
         STD) = aT.normalizeFeatures([mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        #dist_all = numpy.sum(distance.squareform(distance.pdist(mt_feats_to_red.T)), axis=0)
        #m_dist_all = numpy.mean(dist_all)
        #iNonOutLiers2 = numpy.nonzero(dist_all < 3.0*m_dist_all)[0]
        #mt_feats_to_red = mt_feats_to_red[:, iNonOutLiers2]
        Labels = numpy.zeros((mt_feats_to_red.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / st_win
        #print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * st_win / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, Labels)
        mt_feats_norm = (clf.transform(mt_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    clsAll = []
    sil_all = []
    centersAll = []

    for iSpeakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(mt_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        # Y = distance.squareform(distance.pdist(mt_feats_norm.T))
        clsAll.append(cls)
        centersAll.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(iSpeakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = numpy.nonzero(cls == c)[0].shape[0] / \
                 float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mt_feats_norm[:, cls == c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                Yt = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(numpy.mean(Yt) * clust_per_cent)
                silBs = []
                for c2 in range(iSpeakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = numpy.nonzero(cls == c2)[0].shape[0] /\
                               float(len(cls))
                        MidTermFeaturesNormTemp2 = mt_feats_norm[:, cls == c2]
                        Yt = distance.cdist(mt_feats_norm_temp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(
                            numpy.mean(Yt) *
                            (clust_per_cent + clust_per_cent_2) / 2.0)
                silBs = numpy.array(silBs)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(silBs))
        sil_1 = numpy.array(sil_1)
        sil_2 = numpy.array(sil_2)
        sil = []
        for c in range(iSpeakers):
            # for each cluster (speaker) compute silhouette
            sil.append(
                (sil_2[c] - sil_1[c]) / (max(sil_2[c], sil_1[c]) + 0.00001))
        # keep the AVERAGE SILLOUETTE
        sil_all.append(numpy.mean(sil))

    imax = numpy.argmax(sil_all)
    # optimal number of clusters
    nSpeakersFinal = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = numpy.zeros((n_wins, ))
    for i in range(n_wins):
        j = numpy.argmin(numpy.abs(i - i_non_outliers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):
        # hmm training
        start_prob, transmat, means, cov = \
         trainHMM_computeStatistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = sil_all[imax]
    class_names = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]

    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if groundturh exists
    if os.path.isfile(gt_file):
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start, seg_end, seg_labs,
                                              mt_step)

    if plot_res:
        fig = plt.figure()
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(numpy.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(numpy.array(range(len(cls))) * mt_step + mt_step / 2.0, cls)

    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(
                numpy.array(range(len(flags_gt))) * mt_step + mt_step / 2.0,
                flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
         evaluateSpeakerDiarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(
                          100 * purity_cluster_m, 100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        #print s_range, sil_all
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's sillouette")
        #plt.show()
        plt.savefig('output/outImg.jpg')
    return cls
def mtFileClassification(input_file,
                         model_name,
                         model_type,
                         plot_results=False,
                         gt_file=""):
    '''
	This function performs mid-term classification of an audio stream.
	Towards this end, supervised knowledge is used, i.e. a pre-trained classifier.
	ARGUMENTS:
		- input_file:        path of the input WAV file
		- model_name:        name of the classification model
		- model_type:        svm or knn depending on the classifier type
		- plot_results:      True if results are to be plotted using
							 matplotlib along with a set of statistics

	RETURNS:
		  - segs:           a sequence of segment's endpoints: segs[i] is the
							endpoint of the i-th segment (in seconds)
		  - classes:        a sequence of class flags: class[i] is the
							class ID of the i-th segment
	'''

    if not os.path.isfile(model_name):
        print("mtFileClassificationError: input model_type not found!")
        return (-1, -1, -1, -1)
    # Load classifier:
    if model_type == "knn":
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win, st_step, compute_beat] = \
         aT.load_model_knn(model_name)
    else:
        [
            classifier, MEAN, STD, class_names, mt_win, mt_step, st_win,
            st_step, compute_beat
        ] = aT.load_model(model_name)

    if compute_beat:
        print("Model " + model_name + " contains long-term music features "
              "(beat etc) and cannot be used in "
              "segmentation")
        return (-1, -1, -1, -1)
    [fs, x] = audioBasicIO.readAudioFile(input_file)  # load input file
    if fs == -1:  # could not read file
        return (-1, -1, -1, -1)
    x = audioBasicIO.stereo2mono(x)  # convert stereo (if) to mono
    duration = len(x) / fs
    # mid-term feature extraction:
    [mt_feats, _, _] = aF.mtFeatureExtraction(x, fs, mt_win * fs, mt_step * fs,
                                              round(fs * st_win),
                                              round(fs * st_step))
    flags = []
    Ps = []
    flags_ind = []
    for i in range(
            mt_feats.shape[1]
    ):  # for each feature vector (i.e. for each fix-sized segment):
        cur_fv = (mt_feats[:, i] -
                  MEAN) / STD  # normalize current feature vector
        [res, P] = aT.classifierWrapper(classifier, model_type,
                                        cur_fv)  # classify vector
        flags_ind.append(res)
        flags.append(class_names[int(res)])  # update class label matrix
        Ps.append(numpy.max(P))  # update probability matrix
    flags_ind = numpy.array(flags_ind)

    # 1-window smoothing
    for i in range(1, len(flags_ind) - 1):
        if flags_ind[i - 1] == flags_ind[i + 1]:
            flags_ind[i] = flags_ind[i + 1]
    # convert fix-sized flags to segments and classes
    (segs, classes) = flags2segs(flags, mt_step)
    segs[-1] = len(x) / float(fs)

    # Load grount-truth:
    if os.path.isfile(gt_file):
        [seg_start_gt, seg_end_gt, seg_l_gt] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start_gt, seg_end_gt,
                                              seg_l_gt, mt_step)
        flags_ind_gt = []
        for j, fl in enumerate(flags_gt):
            # "align" labels with GT
            if class_names_gt[flags_gt[j]] in class_names:
                flags_ind_gt.append(
                    class_names.index(class_names_gt[flags_gt[j]]))
            else:
                flags_ind_gt.append(-1)
        flags_ind_gt = numpy.array(flags_ind_gt)
        cm = numpy.zeros((len(class_names_gt), len(class_names_gt)))
        for i in range(min(flags_ind.shape[0], flags_ind_gt.shape[0])):
            cm[int(flags_ind_gt[i]), int(flags_ind[i])] += 1
    else:
        cm = []
        flags_ind_gt = numpy.array([])
    acc = plotSegmentationResults(flags_ind, flags_ind_gt, class_names,
                                  mt_step, not plot_results)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc))
        return (flags_ind, class_names_gt, acc, cm)
    else:
        return (flags_ind, class_names, acc, cm)
Example #33
0
import audioBasicIO
import audioFeatureExtraction
import matplotlib.pyplot as plt
import audioTrainTest as aT

plot = False
[Fs, x] = audioBasicIO.readAudioFile("data/Heavy.wav")
F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050 * Fs, 0.025 * Fs)
# ZCR: The rate of sign-changes of the signal during the duration of a particular frame.
if plot:
    plt.subplot(2, 2, 1)
    plt.plot(F[0, :])
    plt.xlabel('Frame no')
    plt.ylabel('ZCR')
    plt.subplot(2, 2, 2)
    plt.plot(F[1, :])
    plt.xlabel('Frame no')
    plt.ylabel('Energy')
    plt.subplot(2, 2, 3)
    plt.plot(F[2, :])
    plt.xlabel('Frame no')
    plt.ylabel('Entropy of Energy')
    plt.subplot(2, 2, 4)
    plt.plot(F[3, :])
    plt.xlabel('Frame no')
    plt.ylabel('Spectral Centroid')
    plt.show()

Result, P, classNames = aT.fileClassification("data/Heavy.wav",
                                              "data/svmMusicGenre3", "svm")
print Result
Example #34
0
def classifyFolderWrapper(inputFolder, modelType, modelName, outputMode=False):
    if not os.path.isfile(modelName):
        raise Exception("Input modelName not found!")

    if modelType == 'svm':
        [
            Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            compute_beat
        ] = aT.load_model(modelName)
    elif modelType == 'knn':
        [
            Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            compute_beat
        ] = aT.load_model_knn(modelName)

    PsAll = numpy.zeros((len(classNames), ))

    files = "*.wav"
    if os.path.isdir(inputFolder):
        strFilePattern = os.path.join(inputFolder, files)
    else:
        strFilePattern = inputFolder + files

    wavFilesList = []
    wavFilesList.extend(glob.glob(strFilePattern))
    wavFilesList = sorted(wavFilesList)
    if len(wavFilesList) == 0:
        print "No WAV files found!"
        return

    Results = []
    for wavFile in wavFilesList:
        [Fs, x] = audioBasicIO.readAudioFile(wavFile)
        signalLength = x.shape[0] / float(Fs)
        [Result, P,
         classNames] = aT.file_classification(wavFile, modelName, modelType)
        PsAll += (numpy.array(P) * signalLength)
        Result = int(Result)
        Results.append(Result)
        if outputMode:
            print "{0:s}\t{1:s}".format(wavFile, classNames[Result])
    Results = numpy.array(Results)

    # print distribution of classes:
    [Histogram, _] = numpy.histogram(Results,
                                     bins=numpy.arange(len(classNames) + 1))
    if outputMode:
        for i, h in enumerate(Histogram):
            print "{0:20s}\t\t{1:d}".format(classNames[i], h)
    PsAll = PsAll / numpy.sum(PsAll)

    if outputMode:
        fig = plt.figure()
        ax = fig.add_subplot(111)
        plt.title("Classes percentage " + inputFolder.replace('Segments', ''))
        ax.axis((0, len(classNames) + 1, 0, 1))
        ax.set_xticks(numpy.array(range(len(classNames) + 1)))
        ax.set_xticklabels([" "] + classNames)
        ax.bar(numpy.array(range(len(classNames))) + 0.5, PsAll)
        plt.show()
    return classNames, PsAll
Example #35
0
def speakerDiarization(fileName, numOfSpeakers, mtSize = 2.0, mtStep=0.2, stWin=0.05, LDAdim = 35, PLOT = False):
	'''
	ARGUMENTS:
		- fileName:		the name of the WAV file to be analyzed
		- numOfSpeakers	the number of speakers (clusters) in the recording (<=0 for unknown)
		- mtSize (opt)	mid-term window size
		- mtStep (opt)	mid-term window step
		- stWin  (opt)	short-term window size
		- LDAdim (opt)	LDA dimension (0 for no LDA)
		- PLOT	 (opt)	0 for not plotting the results 1 for plottingy
	'''
	[Fs, x] = audioBasicIO.readAudioFile(fileName)
	x = audioBasicIO.stereo2mono(x);
	Duration = len(x) / Fs

	[Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.loadKNNModel("data/knnSpeakerAll")
	[Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.loadKNNModel("data/knnSpeakerFemaleMale")

	[MidTermFeatures, ShortTermFeatures] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, mtStep * Fs, round(Fs*stWin), round(Fs*stWin*0.5));

	MidTermFeatures2 = numpy.zeros( (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2), MidTermFeatures.shape[1] ) )

	for i in range(MidTermFeatures.shape[1]):
		curF1 = (MidTermFeatures[:,i] - MEAN1)  / STD1
		curF2 = (MidTermFeatures[:,i] - MEAN2)  / STD2
		[Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
		[Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
		MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
		MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0]+len(classNames1), i] = P1 + 0.0001;
		MidTermFeatures2[MidTermFeatures.shape[0]+len(classNames1)::, i] = P2 + 0.0001;
	
	MidTermFeatures = MidTermFeatures2	# TODO	
	# SELECT FEATURES:
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20]; 																											# SET 0A
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100]; 																									# SET 0B
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]; 	# SET 0C
	
	iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53]; 																	# SET 1A
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100]; 															# SET 1B
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]; 	# SET 1C
	
	#iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53]; 			# SET 2A		
	#iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100]; 	# SET 2B
	#iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]; 	# SET 2C
	
	#iFeaturesSelect = range(100);																									# SET 3	
	#MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010  
	
	MidTermFeatures = MidTermFeatures[iFeaturesSelect,:]		
	
	(MidTermFeaturesNorm, MEAN, STD) = aT.normalizeFeatures([MidTermFeatures.T])
	MidTermFeaturesNorm = MidTermFeaturesNorm[0].T	
	numOfWindows = MidTermFeatures.shape[1]

	# remove outliers:
	DistancesAll = numpy.sum(distance.squareform(distance.pdist(MidTermFeaturesNorm.T)), axis=0)
	MDistancesAll = numpy.mean(DistancesAll)
	iNonOutLiers = numpy.nonzero(DistancesAll < 1.2*MDistancesAll)[0]
	
	# TODO: Combine energy threshold for outlier removal:
	#EnergyMin = numpy.min(MidTermFeatures[1,:])
	#EnergyMean = numpy.mean(MidTermFeatures[1,:])
	#Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
	#iNonOutLiers = numpy.nonzero(MidTermFeatures[1,:] > Thres)[0]
	#print iNonOutLiers

	perOutLier = (100.0*(numOfWindows-iNonOutLiers.shape[0])) / numOfWindows	
	MidTermFeaturesNormOr = MidTermFeaturesNorm
	MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]
	
	# LDA dimensionality reduction:
	if LDAdim > 0:
		#[mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, stWin * Fs, round(Fs*stWin), round(Fs*stWin));		
		# extract mid-term features with minimum step:
		mtWinRatio  = int(round(mtSize  / stWin));
		mtStepRatio = int(round(stWin / stWin));
		mtFeaturesToReduce = []			
		numOfFeatures = len(ShortTermFeatures)
		numOfStatistics = 2;			
		#for i in range(numOfStatistics * numOfFeatures + 1):
		for i in range(numOfStatistics * numOfFeatures):
			mtFeaturesToReduce.append([])

		for i in range(numOfFeatures):		# for each of the short-term features:
			curPos = 0
			N = len(ShortTermFeatures[i])
			while (curPos<N):
				N1 = curPos
				N2 = curPos + mtWinRatio
				if N2 > N:
					N2 = N
				curStFeatures = ShortTermFeatures[i][N1:N2]
				mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
				mtFeaturesToReduce[i+numOfFeatures].append(numpy.std(curStFeatures))				
				curPos += mtStepRatio		
		mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
				
		mtFeaturesToReduce2 = numpy.zeros( (mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2), mtFeaturesToReduce.shape[1] ) )
		for i in range(mtFeaturesToReduce.shape[1]):
			curF1 = (mtFeaturesToReduce[:,i] - MEAN1)  / STD1
			curF2 = (mtFeaturesToReduce[:,i] - MEAN2)  / STD2
			[Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
			[Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
			mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i]
			mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0]+len(classNames1), i] = P1 + 0.0001;
			mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]+len(classNames1)::, i] = P2 + 0.0001;
		mtFeaturesToReduce = mtFeaturesToReduce2		
		mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect,:]		
		#mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
		(mtFeaturesToReduce, MEAN, STD) = aT.normalizeFeatures([mtFeaturesToReduce.T])	
		mtFeaturesToReduce = mtFeaturesToReduce[0].T
		#DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
		#MDistancesAll = numpy.mean(DistancesAll)
		#iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0*MDistancesAll)[0]
		#mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]
		Labels = numpy.zeros((mtFeaturesToReduce.shape[1],));
		LDAstep = 1.0
		LDAstepRatio = LDAstep / stWin
		#print LDAstep, LDAstepRatio
		for i in range(Labels.shape[0]):
			Labels[i] = int(i*stWin/LDAstepRatio);		
		clf = LDA(n_components=LDAdim)
		clf.fit(mtFeaturesToReduce.T, Labels, tol=0.000001)
		MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

	if numOfSpeakers<=0:
		sRange = range(2,10)
	else:
		sRange = [numOfSpeakers]
	clsAll = []; silAll = []; centersAll = []
	
	for iSpeakers in sRange:
		cls, means, steps = mlpy.kmeans(MidTermFeaturesNorm.T, k=iSpeakers, plus=True)		# perform k-means clustering
		
		#YDist =   distance.pdist(MidTermFeaturesNorm.T, metric='euclidean')
		#print distance.squareform(YDist).shape
		#hc = mlpy.HCluster()
		#hc.linkage(YDist)
		#cls = hc.cut(14.5)
		#print cls

		# Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
		clsAll.append(cls)
		centersAll.append(means)
		silA = []; silB = []
		for c in range(iSpeakers):								# for each speaker (i.e. for each extracted cluster)
			clusterPerCent = numpy.nonzero(cls==c)[0].shape[0] / float(len(cls))
			if clusterPerCent < 0.020:
				silA.append(0.0)
				silB.append(0.0)
			else:
				MidTermFeaturesNormTemp = MidTermFeaturesNorm[:,cls==c]			# get subset of feature vectors
				Yt = distance.pdist(MidTermFeaturesNormTemp.T)				# compute average distance between samples that belong to the cluster (a values)
				silA.append(numpy.mean(Yt)*clusterPerCent)
				silBs = []
				for c2 in range(iSpeakers):						# compute distances from samples of other clusters
					if c2!=c:
						clusterPerCent2 = numpy.nonzero(cls==c2)[0].shape[0] / float(len(cls))
						MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:,cls==c2]
						Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T)
						silBs.append(numpy.mean(Yt)*(clusterPerCent+clusterPerCent2)/2.0)
				silBs = numpy.array(silBs)							
				silB.append(min(silBs))							# ... and keep the minimum value (i.e. the distance from the "nearest" cluster)
		silA = numpy.array(silA); 
		silB = numpy.array(silB); 
		sil = []
		for c in range(iSpeakers):								# for each cluster (speaker)
			sil.append( ( silB[c] - silA[c]) / (max(silB[c],  silA[c])+0.00001)  )		# compute silhouette

		silAll.append(numpy.mean(sil))								# keep the AVERAGE SILLOUETTE

	#silAll = silAll * (1.0/(numpy.power(numpy.array(sRange),0.5)))
	imax = numpy.argmax(silAll)									# position of the maximum sillouette value
	nSpeakersFinal = sRange[imax]									# optimal number of clusters

	# generate the final set of cluster labels
	# (important: need to retrieve the outlier windows: this is achieved by giving them the value of their nearest non-outlier window)
	cls = numpy.zeros((numOfWindows,))
	for i in range(numOfWindows):
		j = numpy.argmin(numpy.abs(i-iNonOutLiers))		
		cls[i] = clsAll[imax][j]
		
	# Post-process method 1: hmm smoothing
	for i in range(1):
		startprob, transmat, means, cov = trainHMM_computeStatistics(MidTermFeaturesNormOr, cls)
		hmm = sklearn.hmm.GaussianHMM(startprob.shape[0], "diag", startprob, transmat)			# hmm training
		hmm.means_ = means; hmm.covars_ = cov
		cls = hmm.predict(MidTermFeaturesNormOr.T)					
	
	# Post-process method 2: median filtering:
	cls = scipy.signal.medfilt(cls, 13)
	cls = scipy.signal.medfilt(cls, 11)

	sil = silAll[imax]										# final sillouette
	classNames = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)];


	# load ground-truth if available
	gtFile = fileName.replace('.wav', '.segments');							# open for annotated file
	if os.path.isfile(gtFile):									# if groundturh exists
		[segStart, segEnd, segLabels] = readSegmentGT(gtFile)					# read GT data
		flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels, mtStep)			# convert to flags

	if PLOT:
		fig = plt.figure()	
		if numOfSpeakers>0:
			ax1 = fig.add_subplot(111)
		else:
			ax1 = fig.add_subplot(211)
		ax1.set_yticks(numpy.array(range(len(classNames))))
		ax1.axis((0, Duration, -1, len(classNames)))
		ax1.set_yticklabels(classNames)
		ax1.plot(numpy.array(range(len(cls)))*mtStep+mtStep/2.0, cls)

	if os.path.isfile(gtFile):
		if PLOT:
			ax1.plot(numpy.array(range(len(flagsGT)))*mtStep+mtStep/2.0, flagsGT, 'r')
		purityClusterMean, puritySpeakerMean = evaluateSpeakerDiarization(cls, flagsGT)
		print "{0:.1f}\t{1:.1f}".format(100*purityClusterMean, 100*puritySpeakerMean)
		if PLOT:
			plt.title("Cluster purity: {0:.1f}% - Speaker purity: {1:.1f}%".format(100*purityClusterMean, 100*puritySpeakerMean) )
	if PLOT:
		plt.xlabel("time (seconds)")
		#print sRange, silAll	
		if numOfSpeakers<=0:
			plt.subplot(212)
			plt.plot(sRange, silAll)
			plt.xlabel("number of clusters");
			plt.ylabel("average clustering's sillouette");
		plt.show()
def main(argv):
	if argv[1] == "-dirMp3toWAV":				# convert mp3 to wav (batch)
		if len(argv)==5:			
			path = argv[2]
			if argv[3] not in ["8000", "16000", "32000", "44100"]:
				print "Error. Unsupported sampling rate (must be: 8000, 16000, 32000 or 44100)."; return
			if argv[4] not in ["1","2"]:
				print "Error. Number of output channels must be 1 or 2"; return
			if not os.path.isdir(path):
				raise Exception("Input path not found!")
			useMp3TagsAsNames = True
			audioBasicIO.convertDirMP3ToWav(path, int(argv[3]), int(argv[4]), useMp3TagsAsNames)
		else:
			print "Error.\nSyntax: " + argv[0] + " -dirMp3toWAV <dirName> <sampling Freq> <numOfChannels>"

	if argv[1] == "-dirWAVChangeFs":				# convert mp3 to wav (batch)
		if len(argv)==5:			
			path = argv[2]
			if argv[3] not in ["8000", "16000", "32000", "44100"]:
				print "Error. Unsupported sampling rate (must be: 8000, 16000, 32000 or 44100)."; return
			if argv[4] not in ["1","2"]:
				print "Error. Number of output channels must be 1 or 2"; return
			if not os.path.isdir(path):
				raise Exception("Input path not found!")
			audioBasicIO.convertFsDirWavToWav(path, int(argv[3]), int(argv[4]))
		else:
			print "Error.\nSyntax: " + argv[0] + " -dirMp3toWAV <dirName> <sampling Freq> <numOfChannels>"

	elif argv[1] == "-featureExtractionFile":		# short-term and mid-term feature extraction to files (csv and numpy)
		if len(argv)==7:
			wavFileName = argv[2]
			if not os.path.isfile(wavFileName):
				raise Exception("Input audio file not found!")
			if not (uT.isNum(argv[3]) and uT.isNum(argv[4]) and uT.isNum(argv[5]) and uT.isNum(argv[6])):
				raise Exception("Mid-term and short-term window sizes and steps must be numbers!")
			mtWin = float(argv[3])
			mtStep = float(argv[4])
			stWin = float(argv[5])
			stStep = float(argv[6])
			outFile = wavFileName
			aF.mtFeatureExtractionToFile(wavFileName, mtWin, mtStep, stWin, stStep, outFile, True, True, True)
		else:
			print "Error.\nSyntax: " + argv[0] + " -featureExtractionFile <wavFileName> <mtWin> <mtStep> <stWin> <stStep>"

	elif argv[1] == "-beatExtraction":
		if len(argv)==4:
			wavFileName = argv[2]
			if not os.path.isfile(wavFileName):
				raise Exception("Input audio file not found!")
			if not (uT.isNum(argv[3])):
				raise Exception("PLOT must be either 0 or 1")
			if not ( (int(argv[3]) == 0) or (int(argv[3]) == 1) ):
				raise Exception("PLOT must be either 0 or 1")

			[Fs, x] = audioBasicIO.readAudioFile(wavFileName);
			F = aF.stFeatureExtraction(x, Fs, 0.050*Fs, 0.050*Fs);
			BPM, ratio = aF.beatExtraction(F, 0.050, int(argv[3])==1)
			print "Beat: {0:d} bpm ".format(int(BPM))
			print "Ratio: {0:.2f} ".format(ratio)
		else:
			print "Error.\nSyntax: " + argv[0] + " -beatExtraction <wavFileName> <PLOT (0 or 1)>"


	elif argv[1] == '-featureExtractionDir':	# same as -featureExtractionFile, in a batch mode (i.e. for each WAV file in the provided path)
		if len(argv)==7:
			path = argv[2]
			if not os.path.isdir(path):
				raise Exception("Input path not found!")
			if not (uT.isNum(argv[3]) and uT.isNum(argv[4]) and uT.isNum(argv[5]) and uT.isNum(argv[6])):
				raise Exception("Mid-term and short-term window sizes and steps must be numbers!")
			mtWin = float(argv[3])
			mtStep = float(argv[4])
			stWin = float(argv[5])
			stStep = float(argv[6])
			aF.mtFeatureExtractionToFileDir(path, mtWin, mtStep, stWin, stStep, True, True, True)
		else:
			print "Error.\nSyntax: " + argv[0] + " -featureExtractionDir <path> <mtWin> <mtStep> <stWin> <stStep>"

	elif argv[1] == '-featureVisualizationDir':	# visualize the content relationships between recordings stored in a folder
		if len(argv)==3:
			if not os.path.isdir(argv[2]):
				raise Exception("Input folder not found!")
			aV.visualizeFeaturesFolder(argv[2], "pca", "")

	elif argv[1] == '-fileSpectrogram':		# show spectogram of a sound stored in a file
			if len(argv)==3:
				wavFileName = argv[2]		
				if not os.path.isfile(wavFileName):
					raise Exception("Input audio file not found!")
				[Fs, x] = audioBasicIO.readAudioFile(wavFileName)
				x = audioBasicIO.stereo2mono(x)
				specgram, TimeAxis, FreqAxis = aF.stSpectogram(x, Fs, round(Fs*0.040), round(Fs*0.040), True)
			else:
				print "Error.\nSyntax: " + argv[0] + " -fileSpectrogram <fileName>"

	elif argv[1] == '-fileChromagram':		# show spectogram of a sound stored in a file
			if len(argv)==3:
				wavFileName = argv[2]		
				if not os.path.isfile(wavFileName):
					raise Exception("Input audio file not found!")
				[Fs, x] = audioBasicIO.readAudioFile(wavFileName)
				x = audioBasicIO.stereo2mono(x)
				specgram, TimeAxis, FreqAxis = aF.stChromagram(x, Fs, round(Fs*0.040), round(Fs*0.040), True)
			else:
				print "Error.\nSyntax: " + argv[0] + " -fileSpectrogram <fileName>"


	elif argv[1] == "-trainClassifier": 		# Segment classifier training (OK)
			if len(argv)>6: 
				method = argv[2]
				beatFeatures = (int(argv[3])==1)
				listOfDirs = argv[4:len(argv)-1]
				modelName = argv[-1]			
				aT.featureAndTrain(listOfDirs, 1, 1, aT.shortTermWindow, aT.shortTermStep, method.lower(), modelName, computeBEAT = beatFeatures)
			else:
				print "Error.\nSyntax: " + argv[0] + " -trainClassifier <method(svm or knn)> <beat features> <directory 1> <directory 2> ... <directory N> <modelName>"

	elif argv[1] == "-trainRegression": 		# Segment regression model
			if len(argv)==6: 
				method = argv[2]
				beatFeatures = (int(argv[3])==1)
				dirName = argv[4]
				modelName = argv[5]			
				aT.featureAndTrainRegression(dirName, 1, 1, aT.shortTermWindow, aT.shortTermStep, method.lower(), modelName, computeBEAT = beatFeatures)
			else:
				print "Error.\nSyntax: " + argv[0] + " -trainRegression <method(svm or knn)> <beat features> <directory> <modelName>"

	elif argv[1] == "-classifyFile":		# Single File Classification (OK)
			if len(argv)==5: 
				modelType = argv[2]
				modelName = argv[3]
				inputFile = argv[4]

				if modelType not in ["svm", "knn"]:
					raise Exception("ModelType has to be either svm or knn!")
				if not os.path.isfile(modelName):
					raise Exception("Input modelName not found!")
				if not os.path.isfile(inputFile):
					raise Exception("Input audio file not found!")

				[Result, P, classNames] = aT.fileClassification(inputFile, modelName, modelType)
				print "{0:s}\t{1:s}".format("Class","Probability")
				for i,c in enumerate(classNames):
					print "{0:s}\t{1:.2f}".format(c,P[i])
				print "Winner class: " + classNames[int(Result)]
			else:
				print "Error.\nSyntax: " + argv[0] + " -classifyFile <method(svm or knn)> <modelName> <fileName>"

	elif argv[1] == "-regressionFile":		# Single File Classification (OK)
			if len(argv)==5: 
				modelType = argv[2]
				modelName = argv[3]
				inputFile = argv[4]

				if modelType not in ["svm", "knn"]:
					raise Exception("ModelType has to be either svm or knn!")
				if not os.path.isfile(inputFile):
					raise Exception("Input audio file not found!")

				R, regressionNames = aT.fileRegression(inputFile, modelName, modelType)
				for i in range(len(R)):
					print "{0:s}\t{1:.3f}".format(regressionNames[i], R[i])
				
				#print "{0:s}\t{1:.2f}".format(c,P[i])

			else:
				print "Error.\nSyntax: " + argv[0] + " -regressionFile <method(svm or knn)> <modelName> <fileName>"

	elif argv[1] == "-classifyFolder": 			# Directory classification (Ok)
			if len(argv)==6 or len(argv)==5: 
				modelType = argv[2]
				modelName = argv[3]
				inputFolder = argv[4]
				if len(argv)==6:
					outputMode = argv[5]
				else:
					outputMode = "0"

				if modelType not in ["svm", "knn"]:
					raise Exception("ModelType has to be either svm or knn!")
				if outputMode not in ["0","1"]:
					raise Exception("outputMode has to be 0 or 1")
				if not os.path.isfile(modelName):
					raise Exception("Input modelName not found!")
				files = '*.wav'
				if os.path.isdir(inputFolder):
					strFilePattern = os.path.join(inputFolder, files)
				else:
					strFilePattern = inputFolder + files

				wavFilesList = []
				wavFilesList.extend(glob.glob(strFilePattern))
				wavFilesList = sorted(wavFilesList)
				if len(wavFilesList)==0:
					print "No WAV files found!"
					return 
				Results = []
				for wavFile in wavFilesList:	
					[Result, P, classNames] = aT.fileClassification(wavFile, modelName, modelType)	
					Result = int(Result)
					Results.append(Result)
					if outputMode=="1":
						print "{0:s}\t{1:s}".format(wavFile,classNames[Result])
				Results = numpy.array(Results)
				# print distribution of classes:
				[Histogram, _] = numpy.histogram(Results, bins=numpy.arange(len(classNames)+1))
				for i,h in enumerate(Histogram):
					print "{0:20s}\t\t{1:d}".format(classNames[i], h)
			else:
				print "Error.\nSyntax: " + argv[0] + " -classifyFolder <method(svm or knn)> <modelName> <folderName> <outputMode(0 or 1)"

	elif argv[1] == "-regressionFolder": 			# Regression applied on the WAV files of a folder
			if len(argv)==5: 
				modelType = argv[2]
				modelName = argv[3]
				inputFolder = argv[4]

				if modelType not in ["svm", "knn"]:
					raise Exception("ModelType has to be either svm or knn!")

				files = '*.wav'
				if os.path.isdir(inputFolder):
					strFilePattern = os.path.join(inputFolder, files)
				else:
					strFilePattern = inputFolder + files

				wavFilesList = []
				wavFilesList.extend(glob.glob(strFilePattern))
				wavFilesList = sorted(wavFilesList)	
				if len(wavFilesList)==0:
					print "No WAV files found!"
					return 
				Results = []
				for wavFile in wavFilesList:	
					R, regressionNames = aT.fileRegression(wavFile, modelName, modelType)
					Results.append(R)
				Results = numpy.array(Results)
				for i, r in enumerate(regressionNames):
					[Histogram, bins] = numpy.histogram(Results[:, i])
					centers = (bins[0:-1] + bins[1::]) / 2.0
					plt.subplot(len(regressionNames), 1, i);
					plt.plot(centers, Histogram)
					plt.title(r)
				plt.show()
#					for h in Histogram:
#						print "{0:20d}".format(h),
#				if outputMode=="1":
#					for i,h in enumerate(Histogram):
#						print "{0:20s}\t\t{1:d}".format(classNames[i], h)
			else:
				print "Error.\nSyntax: " + argv[0] + " -regressionFolder <method(svm or knn)> <modelName> <folderName>"

	elif argv[1] == '-trainHMMsegmenter_fromfile':
		if len(argv)==7:
			wavFile = argv[2]
			gtFile = argv[3]
			hmmModelName = argv[4]
			if not uT.isNum(argv[5]):
				print "Error: mid-term window size must be float!"; return
			if not uT.isNum(argv[6]):
				print "Error: mid-term window step must be float!"; return
			mtWin = float(argv[5])
			mtStep = float(argv[6])
			if not os.path.isfile(wavFile):
				print "Error: wavfile does not exist!"; return
			if not os.path.isfile(gtFile):
				print "Error: groundtruth does not exist!"; return
			aS.trainHMM_fromFile(wavFile, gtFile, hmmModelName, mtWin, mtStep)
		else:
			print "Error.\nSyntax: " + argv[0] + " -trainHMMsegmenter_fromfile <wavFilePath> <gtSegmentFilePath> <hmmModelFileName> <mtWin> <mtStep>"

	elif argv[1] == '-trainHMMsegmenter_fromdir':
		if len(argv)==6:
			dirPath = argv[2]
			hmmModelName = argv[3]
			if not uT.isNum(argv[4]):
				print "Error: mid-term window size must be float!"
			if not uT.isNum(argv[5]):
				print "Error: mid-term window step must be float!"
			mtWin = float(argv[4])
			mtStep = float(argv[5])
			aS.trainHMM_fromDir(dirPath, hmmModelName, mtWin, mtStep)
		else:
			print "Error.\nSyntax: " + argv[0] + " -trainHMMsegmenter_fromdir <dirPath> <hmmModelFileName> <mtWin> <mtStep>"

	elif argv[1] == "-segmentClassifyFileHMM":	# HMM-based segmentation-classification
		if len(argv)==4:
			hmmModelName = argv[2]
			wavFile = argv[3]
			gtFile = wavFile.replace('.wav', '.segments');			
			aS.hmmSegmentation(wavFile, hmmModelName, PLOT = True, gtFileName = gtFile)
		else:
			print "Error.\nSyntax: " + argv[0] + " -segmentClassifyHMM <hmmModelName> <fileName>"

	elif argv[1] == '-segmentClassifyFile':		# Segmentation-classification (fix-sized segment using knn or svm)
		if (len(argv)==5):
			modelType = argv[2]
			modelName = argv[3]
			inputWavFile = argv[4]

			if modelType not in ["svm", "knn"]:
				raise Exception("ModelType has to be either svm or knn!")
			if not os.path.isfile(modelName):
				raise Exception("Input modelName not found!")
			if not os.path.isfile(inputWavFile):
				raise Exception("Input audio file not found!")
			gtFile = inputWavFile.replace('.wav', '.segments');
			aS.mtFileClassification(inputWavFile, modelName, modelType, True, gtFile)
		else:
			print "Error.\nSyntax: " + argv[0] + " -segmentClassifyFile <method(svm or knn)> <modelName> <fileName>"

	elif argv[1] == "-segmentationEvaluation":
		if len(argv)==5:
			methodName = argv[2]
			modelName = argv[3]
			dirName = argv[4]
			aS.evaluateSegmentationClassificationDir(dirName, modelName, methodName)
		else:
			print "Error.\nSyntax: " + argv[0] + " -segmentationEvaluation <method(svm or knn)> <modelName> <directoryName>"

	elif argv[1] == "-silenceRemoval":
		if len(argv)==5:
			inputFile = argv[2]
			if not os.path.isfile(inputFile):
				raise Exception("Input audio file not found!")

			smoothingWindow = float(argv[3])
			weight = float(argv[4])
			[Fs, x] = audioBasicIO.readAudioFile(inputFile)						# read audio signal
			segmentLimits = aS.silenceRemoval(x, Fs, 0.05, 0.05, smoothingWindow, weight, False)	# get onsets
			for i, s in enumerate(segmentLimits):
				strOut = "{0:s}_{1:.3f}-{2:.3f}.wav".format(inputFile[0:-4], s[0], s[1])
				wavfile.write( strOut, Fs, x[int(Fs*s[0]):int(Fs*s[1])])
		else:
			print "Error.\nSyntax: " + argv[0] + " -silenceRemoval <inputFile> <smoothinWindow(secs)> <Threshold Weight>"

	elif argv[1] == '-speakerDiarization':		# speaker diarization (from file): TODO				
			inputFile = argv[2]
			nSpeakers = int(argv[3])
			useLDA = (int(argv[4])==1)			
			if useLDA:
				aS.speakerDiarization(inputFile, nSpeakers, PLOT = True);
			else:
				aS.speakerDiarization(inputFile, nSpeakers, LDAdim = 0, PLOT = True);
			#print speechLimits

	elif argv[1] == "-speakerDiarizationScriptEval":
			dir = argv[2]
			listOfLDAs = [int(l) for l in argv[3::]]
			aS.speakerDiarizationEvaluateScript(dir, listOfLDAs)

	elif argv[1] == '-thumbnail':			# music thumbnailing (OK)
			if len(argv)==4:	
				inputFile = argv[2]
				stWindow = 1.0
				stStep = 1.0
				if not os.path.isfile(inputFile):
					raise Exception("Input audio file not found!")

				[Fs, x] = audioBasicIO.readAudioFile(inputFile)						# read file
				if Fs == -1:	# could not read file
					return
				try:
					thumbnailSize = float(argv[3])
				except ValueError:
					print "Thumbnail size must be a float (in seconds)"
					return 
				[A1, A2, B1, B2, Smatrix] = aS.musicThumbnailing(x, Fs, stWindow, stStep, thumbnailSize)	# find thumbnail endpoints			

				# write thumbnails to WAV files:
				thumbnailFileName1 = inputFile.replace(".wav","_thumb1.wav")
				thumbnailFileName2 = inputFile.replace(".wav","_thumb2.wav")
				wavfile.write(thumbnailFileName1, Fs, x[int(Fs*A1):int(Fs*A2)])
				wavfile.write(thumbnailFileName2, Fs, x[int(Fs*B1):int(Fs*B2)])
				print "1st thumbnail (stored in file {0:s}): {1:4.1f}sec -- {2:4.1f}sec".format(thumbnailFileName1, A1, A2)
				print "2nd thumbnail (stored in file {0:s}): {1:4.1f}sec -- {2:4.1f}sec".format(thumbnailFileName2, B1, B2)

				# Plot self-similarity matrix:
				fig = plt.figure()
				ax = fig.add_subplot(111, aspect='auto')
				plt.imshow(Smatrix)
				# Plot best-similarity diagonal:
				Xcenter = (A1/stStep + A2/stStep) / 2.0
				Ycenter = (B1/stStep + B2/stStep) / 2.0


				e1 = matplotlib.patches.Ellipse((Ycenter, Xcenter), thumbnailSize * 1.4, 3,
			             angle=45, linewidth=3, fill=False)
				ax.add_patch(e1)

				plt.plot([B1, Smatrix.shape[0]], [A1, A1], color='k', linestyle='--', linewidth=2)
				plt.plot([B2, Smatrix.shape[0]], [A2, A2], color='k', linestyle='--', linewidth=2)
				plt.plot([B1, B1], [A1, Smatrix.shape[0]], color='k', linestyle='--', linewidth=2)
				plt.plot([B2, B2], [A2, Smatrix.shape[0]], color='k', linestyle='--', linewidth=2)

				plt.xlim([0, Smatrix.shape[0]])
				plt.ylim([Smatrix.shape[1], 0])



				ax.yaxis.set_label_position("right")
				ax.yaxis.tick_right()


				plt.xlabel('frame no')
				plt.ylabel('frame no')
				plt.title('Self-similarity matrix')

				plt.show()

			else: 
				print "Error.\nSyntax: " + argv[0] + " -thumbnail <filename> <thumbnailsize(seconds)>"
Example #37
0
def dirWavFeatureExtraction(dirName,
                            mt_win,
                            mt_step,
                            st_win,
                            st_step,
                            compute_beat=False):
    """
    This function extracts the mid-term features of the WAVE files of a particular folder.

    The resulting feature vector is extracted by long-term averaging the mid-term features.
    Therefore ONE FEATURE VECTOR is extracted for each WAV file.

    ARGUMENTS:
        - dirName:        the path of the WAVE directory
        - mt_win, mt_step:    mid-term window and step (in seconds)
        - st_win, st_step:    short-term window and step (in seconds)
    """

    all_mt_feats = numpy.array([])
    process_times = []

    types = ('*.wav', '*.aif', '*.aiff', '*.mp3', '*.au', '*.ogg')
    wav_file_list = []
    for files in types:
        wav_file_list.extend(glob.glob(os.path.join(dirName, files)))

    wav_file_list = sorted(wav_file_list)
    wav_file_list2, mt_feature_names = [], []
    for i, wavFile in enumerate(wav_file_list):
        print("Analyzing file {0:d} of "
              "{1:d}: {2:s}".format(i + 1, len(wav_file_list), wavFile))
        if os.stat(wavFile).st_size == 0:
            print("   (EMPTY FILE -- SKIPPING)")
            continue
        [fs, x] = audioBasicIO.readAudioFile(wavFile)
        if isinstance(x, int):
            continue

        t1 = time.clock()
        x = audioBasicIO.stereo2mono(x)
        if x.shape[0] < float(fs) / 5:
            print("  (AUDIO FILE TOO SMALL - SKIPPING)")
            continue
        wav_file_list2.append(wavFile)
        if compute_beat:
            [mt_term_feats, st_features, mt_feature_names] = \
                mtFeatureExtraction(x, fs, round(mt_win * fs),
                                    round(mt_step * fs),
                                    round(fs * st_win), round(fs * st_step))
            [beat, beat_conf] = beatExtraction(st_features, st_step)
        else:
            [mt_term_feats, _, mt_feature_names] = \
                mtFeatureExtraction(x, fs, round(mt_win * fs),
                                    round(mt_step * fs),
                                    round(fs * st_win), round(fs * st_step))

        mt_term_feats = numpy.transpose(mt_term_feats)
        mt_term_feats = mt_term_feats.mean(axis=0)
        # long term averaging of mid-term statistics
        if (not numpy.isnan(mt_term_feats).any()) and \
                (not numpy.isinf(mt_term_feats).any()):
            if compute_beat:
                mt_term_feats = numpy.append(mt_term_feats, beat)
                mt_term_feats = numpy.append(mt_term_feats, beat_conf)
            if len(all_mt_feats) == 0:
                # append feature vector
                all_mt_feats = mt_term_feats
            else:
                all_mt_feats = numpy.vstack((all_mt_feats, mt_term_feats))
            t2 = time.clock()
            duration = float(len(x)) / fs
            process_times.append((t2 - t1) / duration)
    if len(process_times) > 0:
        print("Feature extraction complexity ratio: "
              "{0:.1f} x realtime".format(
                  (1.0 / numpy.mean(numpy.array(process_times)))))
    return (all_mt_feats, wav_file_list2, mt_feature_names)
Example #38
0
def main(argv):
    if argv[1] == "-dirMp3toWAV":  # convert mp3 to wav (batch)
        if len(argv) == 5:
            path = argv[2]
            if argv[3] not in ["8000", "16000", "32000", "44100"]:
                print "Error. Unsupported sampling rate (must be: 8000, 16000, 32000 or 44100)."
                return
            if argv[4] not in ["1", "2"]:
                print "Error. Number of output channels must be 1 or 2"
                return
            if not os.path.isdir(path):
                raise Exception("Input path not found!")
            useMp3TagsAsNames = True
            audioBasicIO.convertDirMP3ToWav(path, int(argv[3]), int(argv[4]),
                                            useMp3TagsAsNames)
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -dirMp3toWAV <dirName> <sampling Freq> <numOfChannels>"

    if argv[1] == "-dirWAVChangeFs":  # convert mp3 to wav (batch)
        if len(argv) == 5:
            path = argv[2]
            if argv[3] not in ["8000", "16000", "32000", "44100"]:
                print "Error. Unsupported sampling rate (must be: 8000, 16000, 32000 or 44100)."
                return
            if argv[4] not in ["1", "2"]:
                print "Error. Number of output channels must be 1 or 2"
                return
            if not os.path.isdir(path):
                raise Exception("Input path not found!")
            audioBasicIO.convertFsDirWavToWav(path, int(argv[3]), int(argv[4]))
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -dirMp3toWAV <dirName> <sampling Freq> <numOfChannels>"

    elif argv[
            1] == "-featureExtractionFile":  # short-term and mid-term feature extraction to files (csv and numpy)
        if len(argv) == 7:
            wavFileName = argv[2]
            if not os.path.isfile(wavFileName):
                raise Exception("Input audio file not found!")
            if not (uT.isNum(argv[3]) and uT.isNum(argv[4])
                    and uT.isNum(argv[5]) and uT.isNum(argv[6])):
                raise Exception(
                    "Mid-term and short-term window sizes and steps must be numbers!"
                )
            mtWin = float(argv[3])
            mtStep = float(argv[4])
            stWin = float(argv[5])
            stStep = float(argv[6])
            outFile = wavFileName
            aF.mtFeatureExtractionToFile(wavFileName, mtWin, mtStep, stWin,
                                         stStep, outFile, True, True, True)
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -featureExtractionFile <wavFileName> <mtWin> <mtStep> <stWin> <stStep>"

    elif argv[1] == "-beatExtraction":
        if len(argv) == 4:
            wavFileName = argv[2]
            if not os.path.isfile(wavFileName):
                raise Exception("Input audio file not found!")
            if not (uT.isNum(argv[3])):
                raise Exception("PLOT must be either 0 or 1")
            if not ((int(argv[3]) == 0) or (int(argv[3]) == 1)):
                raise Exception("PLOT must be either 0 or 1")

            [Fs, x] = audioBasicIO.readAudioFile(wavFileName)
            F = aF.stFeatureExtraction(x, Fs, 0.050 * Fs, 0.050 * Fs)
            BPM, ratio = aF.beatExtraction(F, 0.050, int(argv[3]) == 1)
            print "Beat: {0:d} bpm ".format(int(BPM))
            print "Ratio: {0:.2f} ".format(ratio)
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -beatExtraction <wavFileName> <PLOT (0 or 1)>"

    elif argv[
            1] == '-featureExtractionDir':  # same as -featureExtractionFile, in a batch mode (i.e. for each WAV file in the provided path)
        if len(argv) == 7:
            path = argv[2]
            if not os.path.isdir(path):
                raise Exception("Input path not found!")
            if not (uT.isNum(argv[3]) and uT.isNum(argv[4])
                    and uT.isNum(argv[5]) and uT.isNum(argv[6])):
                raise Exception(
                    "Mid-term and short-term window sizes and steps must be numbers!"
                )
            mtWin = float(argv[3])
            mtStep = float(argv[4])
            stWin = float(argv[5])
            stStep = float(argv[6])
            aF.mtFeatureExtractionToFileDir(path, mtWin, mtStep, stWin, stStep,
                                            True, True, True)
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -featureExtractionDir <path> <mtWin> <mtStep> <stWin> <stStep>"

    elif argv[
            1] == '-featureVisualizationDir':  # visualize the content relationships between recordings stored in a folder
        if len(argv) == 3:
            if not os.path.isdir(argv[2]):
                raise Exception("Input folder not found!")
            aV.visualizeFeaturesFolder(argv[2], "pca", "")

    elif argv[
            1] == '-fileSpectrogram':  # show spectogram of a sound stored in a file
        if len(argv) == 3:
            wavFileName = argv[2]
            if not os.path.isfile(wavFileName):
                raise Exception("Input audio file not found!")
            [Fs, x] = audioBasicIO.readAudioFile(wavFileName)
            x = audioBasicIO.stereo2mono(x)
            specgram, TimeAxis, FreqAxis = aF.stSpectogram(
                x, Fs, round(Fs * 0.040), round(Fs * 0.040), True)
        else:
            print "Error.\nSyntax: " + argv[0] + " -fileSpectrogram <fileName>"

    elif argv[
            1] == '-fileChromagram':  # show spectogram of a sound stored in a file
        if len(argv) == 3:
            wavFileName = argv[2]
            if not os.path.isfile(wavFileName):
                raise Exception("Input audio file not found!")
            [Fs, x] = audioBasicIO.readAudioFile(wavFileName)
            x = audioBasicIO.stereo2mono(x)
            specgram, TimeAxis, FreqAxis = aF.stChromagram(
                x, Fs, round(Fs * 0.040), round(Fs * 0.040), True)
        else:
            print "Error.\nSyntax: " + argv[0] + " -fileSpectrogram <fileName>"

    elif argv[1] == "-trainClassifier":  # Segment classifier training (OK)
        if len(argv) > 6:
            method = argv[2]
            beatFeatures = (int(argv[3]) == 1)
            listOfDirs = argv[4:len(argv) - 1]
            modelName = argv[-1]
            aT.featureAndTrain(listOfDirs,
                               1,
                               1,
                               aT.shortTermWindow,
                               aT.shortTermStep,
                               method.lower(),
                               modelName,
                               computeBEAT=beatFeatures)
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -trainClassifier <method(svm or knn)> <beat features> <directory 1> <directory 2> ... <directory N> <modelName>"

    elif argv[1] == "-trainRegression":  # Segment regression model
        if len(argv) == 6:
            method = argv[2]
            beatFeatures = (int(argv[3]) == 1)
            dirName = argv[4]
            modelName = argv[5]
            aT.featureAndTrainRegression(dirName,
                                         1,
                                         1,
                                         aT.shortTermWindow,
                                         aT.shortTermStep,
                                         method.lower(),
                                         modelName,
                                         computeBEAT=beatFeatures)
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -trainRegression <method(svm or knn)> <beat features> <directory> <modelName>"

    elif argv[1] == "-classifyFile":  # Single File Classification (OK)
        if len(argv) == 5:
            modelType = argv[2]
            modelName = argv[3]
            inputFile = argv[4]

            if modelType not in ["svm", "knn"]:
                raise Exception("ModelType has to be either svm or knn!")
            if not os.path.isfile(modelName):
                raise Exception("Input modelName not found!")
            if not os.path.isfile(inputFile):
                raise Exception("Input audio file not found!")

            [Result, P,
             classNames] = aT.fileClassification(inputFile, modelName,
                                                 modelType)
            print "{0:s}\t{1:s}".format("Class", "Probability")
            for i, c in enumerate(classNames):
                print "{0:s}\t{1:.2f}".format(c, P[i])
            print "Winner class: " + classNames[int(Result)]
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -classifyFile <method(svm or knn)> <modelName> <fileName>"

    elif argv[1] == "-regressionFile":  # Single File Classification (OK)
        if len(argv) == 5:
            modelType = argv[2]
            modelName = argv[3]
            inputFile = argv[4]

            if modelType not in ["svm", "knn"]:
                raise Exception("ModelType has to be either svm or knn!")
            if not os.path.isfile(inputFile):
                raise Exception("Input audio file not found!")

            R, regressionNames = aT.fileRegression(inputFile, modelName,
                                                   modelType)
            for i in range(len(R)):
                print "{0:s}\t{1:.3f}".format(regressionNames[i], R[i])

            #print "{0:s}\t{1:.2f}".format(c,P[i])

        else:
            print "Error.\nSyntax: " + argv[
                0] + " -regressionFile <method(svm or knn)> <modelName> <fileName>"

    elif argv[1] == "-classifyFolder":  # Directory classification (Ok)
        if len(argv) == 6 or len(argv) == 5:
            modelType = argv[2]
            modelName = argv[3]
            inputFolder = argv[4]
            if len(argv) == 6:
                outputMode = argv[5]
            else:
                outputMode = "0"

            if modelType not in ["svm", "knn"]:
                raise Exception("ModelType has to be either svm or knn!")
            if outputMode not in ["0", "1"]:
                raise Exception("outputMode has to be 0 or 1")
            if not os.path.isfile(modelName):
                raise Exception("Input modelName not found!")
            files = '*.wav'
            if os.path.isdir(inputFolder):
                strFilePattern = os.path.join(inputFolder, files)
            else:
                strFilePattern = inputFolder + files

            wavFilesList = []
            wavFilesList.extend(glob.glob(strFilePattern))
            wavFilesList = sorted(wavFilesList)
            if len(wavFilesList) == 0:
                print "No WAV files found!"
                return
            Results = []
            for wavFile in wavFilesList:
                [Result, P,
                 classNames] = aT.fileClassification(wavFile, modelName,
                                                     modelType)
                Result = int(Result)
                Results.append(Result)
                if outputMode == "1":
                    print "{0:s}\t{1:s}".format(wavFile, classNames[Result])
            Results = numpy.array(Results)
            # print distribution of classes:
            [Histogram,
             _] = numpy.histogram(Results,
                                  bins=numpy.arange(len(classNames) + 1))
            for i, h in enumerate(Histogram):
                print "{0:20s}\t\t{1:d}".format(classNames[i], h)
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -classifyFolder <method(svm or knn)> <modelName> <folderName> <outputMode(0 or 1)"

    elif argv[
            1] == "-regressionFolder":  # Regression applied on the WAV files of a folder
        if len(argv) == 5:
            modelType = argv[2]
            modelName = argv[3]
            inputFolder = argv[4]

            if modelType not in ["svm", "knn"]:
                raise Exception("ModelType has to be either svm or knn!")

            files = '*.wav'
            if os.path.isdir(inputFolder):
                strFilePattern = os.path.join(inputFolder, files)
            else:
                strFilePattern = inputFolder + files

            wavFilesList = []
            wavFilesList.extend(glob.glob(strFilePattern))
            wavFilesList = sorted(wavFilesList)
            if len(wavFilesList) == 0:
                print "No WAV files found!"
                return
            Results = []
            for wavFile in wavFilesList:
                R, regressionNames = aT.fileRegression(wavFile, modelName,
                                                       modelType)
                Results.append(R)
            Results = numpy.array(Results)
            for i, r in enumerate(regressionNames):
                [Histogram, bins] = numpy.histogram(Results[:, i])
                centers = (bins[0:-1] + bins[1::]) / 2.0
                plt.subplot(len(regressionNames), 1, i)
                plt.plot(centers, Histogram)
                plt.title(r)
            plt.show()


#					for h in Histogram:
#						print "{0:20d}".format(h),
#				if outputMode=="1":
#					for i,h in enumerate(Histogram):
#						print "{0:20s}\t\t{1:d}".format(classNames[i], h)
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -regressionFolder <method(svm or knn)> <modelName> <folderName>"

    elif argv[1] == '-trainHMMsegmenter_fromfile':
        if len(argv) == 7:
            wavFile = argv[2]
            gtFile = argv[3]
            hmmModelName = argv[4]
            if not uT.isNum(argv[5]):
                print "Error: mid-term window size must be float!"
                return
            if not uT.isNum(argv[6]):
                print "Error: mid-term window step must be float!"
                return
            mtWin = float(argv[5])
            mtStep = float(argv[6])
            if not os.path.isfile(wavFile):
                print "Error: wavfile does not exist!"
                return
            if not os.path.isfile(gtFile):
                print "Error: groundtruth does not exist!"
                return
            aS.trainHMM_fromFile(wavFile, gtFile, hmmModelName, mtWin, mtStep)
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -trainHMMsegmenter_fromfile <wavFilePath> <gtSegmentFilePath> <hmmModelFileName> <mtWin> <mtStep>"

    elif argv[1] == '-trainHMMsegmenter_fromdir':
        if len(argv) == 6:
            dirPath = argv[2]
            hmmModelName = argv[3]
            if not uT.isNum(argv[4]):
                print "Error: mid-term window size must be float!"
            if not uT.isNum(argv[5]):
                print "Error: mid-term window step must be float!"
            mtWin = float(argv[4])
            mtStep = float(argv[5])
            aS.trainHMM_fromDir(dirPath, hmmModelName, mtWin, mtStep)
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -trainHMMsegmenter_fromdir <dirPath> <hmmModelFileName> <mtWin> <mtStep>"

    elif argv[
            1] == "-segmentClassifyFileHMM":  # HMM-based segmentation-classification
        if len(argv) == 4:
            hmmModelName = argv[2]
            wavFile = argv[3]
            gtFile = wavFile.replace('.wav', '.segments')
            aS.hmmSegmentation(wavFile,
                               hmmModelName,
                               PLOT=True,
                               gtFileName=gtFile)
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -segmentClassifyHMM <hmmModelName> <fileName>"

    elif argv[
            1] == '-segmentClassifyFile':  # Segmentation-classification (fix-sized segment using knn or svm)
        if (len(argv) == 5):
            modelType = argv[2]
            modelName = argv[3]
            inputWavFile = argv[4]

            if modelType not in ["svm", "knn"]:
                raise Exception("ModelType has to be either svm or knn!")
            if not os.path.isfile(modelName):
                raise Exception("Input modelName not found!")
            if not os.path.isfile(inputWavFile):
                raise Exception("Input audio file not found!")
            gtFile = inputWavFile.replace('.wav', '.segments')
            aS.mtFileClassification(inputWavFile, modelName, modelType, True,
                                    gtFile)
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -segmentClassifyFile <method(svm or knn)> <modelName> <fileName>"

    elif argv[1] == "-segmentationEvaluation":
        if len(argv) == 5:
            methodName = argv[2]
            modelName = argv[3]
            dirName = argv[4]
            aS.evaluateSegmentationClassificationDir(dirName, modelName,
                                                     methodName)
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -segmentationEvaluation <method(svm or knn)> <modelName> <directoryName>"

    elif argv[1] == "-silenceRemoval":
        if len(argv) == 5:
            inputFile = argv[2]
            if not os.path.isfile(inputFile):
                raise Exception("Input audio file not found!")

            smoothingWindow = float(argv[3])
            weight = float(argv[4])
            [Fs,
             x] = audioBasicIO.readAudioFile(inputFile)  # read audio signal
            segmentLimits = aS.silenceRemoval(x, Fs, 0.05, 0.05,
                                              smoothingWindow, weight,
                                              False)  # get onsets
            for i, s in enumerate(segmentLimits):
                strOut = "{0:s}_{1:.3f}-{2:.3f}.wav".format(
                    inputFile[0:-4], s[0], s[1])
                wavfile.write(strOut, Fs, x[int(Fs * s[0]):int(Fs * s[1])])
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -silenceRemoval <inputFile> <smoothinWindow(secs)> <Threshold Weight>"

    elif argv[
            1] == '-speakerDiarization':  # speaker diarization (from file): TODO
        inputFile = argv[2]
        nSpeakers = int(argv[3])
        useLDA = (int(argv[4]) == 1)
        if useLDA:
            aS.speakerDiarization(inputFile, nSpeakers, PLOT=True)
        else:
            aS.speakerDiarization(inputFile, nSpeakers, LDAdim=0, PLOT=True)
        #print speechLimits

    elif argv[1] == "-speakerDiarizationScriptEval":
        dir = argv[2]
        listOfLDAs = [int(l) for l in argv[3::]]
        aS.speakerDiarizationEvaluateScript(dir, listOfLDAs)

    elif argv[1] == '-thumbnail':  # music thumbnailing (OK)
        if len(argv) == 4:
            inputFile = argv[2]
            stWindow = 1.0
            stStep = 1.0
            if not os.path.isfile(inputFile):
                raise Exception("Input audio file not found!")

            [Fs, x] = audioBasicIO.readAudioFile(inputFile)  # read file
            if Fs == -1:  # could not read file
                return
            try:
                thumbnailSize = float(argv[3])
            except ValueError:
                print "Thumbnail size must be a float (in seconds)"
                return
            [A1, A2, B1, B2, Smatrix] = aS.musicThumbnailing(
                x, Fs, stWindow, stStep,
                thumbnailSize)  # find thumbnail endpoints

            # write thumbnails to WAV files:
            thumbnailFileName1 = inputFile.replace(".wav", "_thumb1.wav")
            thumbnailFileName2 = inputFile.replace(".wav", "_thumb2.wav")
            wavfile.write(thumbnailFileName1, Fs, x[int(Fs * A1):int(Fs * A2)])
            wavfile.write(thumbnailFileName2, Fs, x[int(Fs * B1):int(Fs * B2)])
            print "1st thumbnail (stored in file {0:s}): {1:4.1f}sec -- {2:4.1f}sec".format(
                thumbnailFileName1, A1, A2)
            print "2nd thumbnail (stored in file {0:s}): {1:4.1f}sec -- {2:4.1f}sec".format(
                thumbnailFileName2, B1, B2)

            # Plot self-similarity matrix:
            fig = plt.figure()
            ax = fig.add_subplot(111, aspect='auto')
            plt.imshow(Smatrix)
            # Plot best-similarity diagonal:
            Xcenter = (A1 / stStep + A2 / stStep) / 2.0
            Ycenter = (B1 / stStep + B2 / stStep) / 2.0

            e1 = matplotlib.patches.Ellipse((Ycenter, Xcenter),
                                            thumbnailSize * 1.4,
                                            3,
                                            angle=45,
                                            linewidth=3,
                                            fill=False)
            ax.add_patch(e1)

            plt.plot([B1, Smatrix.shape[0]], [A1, A1],
                     color='k',
                     linestyle='--',
                     linewidth=2)
            plt.plot([B2, Smatrix.shape[0]], [A2, A2],
                     color='k',
                     linestyle='--',
                     linewidth=2)
            plt.plot([B1, B1], [A1, Smatrix.shape[0]],
                     color='k',
                     linestyle='--',
                     linewidth=2)
            plt.plot([B2, B2], [A2, Smatrix.shape[0]],
                     color='k',
                     linestyle='--',
                     linewidth=2)

            plt.xlim([0, Smatrix.shape[0]])
            plt.ylim([Smatrix.shape[1], 0])

            ax.yaxis.set_label_position("right")
            ax.yaxis.tick_right()

            plt.xlabel('frame no')
            plt.ylabel('frame no')
            plt.title('Self-similarity matrix')

            plt.show()

        else:
            print "Error.\nSyntax: " + argv[
                0] + " -thumbnail <filename> <thumbnailsize(seconds)>"
Example #39
0
    def _record(self):
        print("please speak a word into the microphone")
        self.CHUNK = 2048
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 2
        self.RATE = 44100
        self.RECORD_SECONDS = 6
        self.count = 0
        self.current = 0
        self.cutinterval = 105
        self.segment = []
        self.fullFrame = []
        print("saving and segmenting")

        while (self.isrecording):
            self.p = pyaudio.PyAudio()
            self.stream = self.p.open(format=self.FORMAT,
                                      channels=self.CHANNELS,
                                      rate=self.RATE,
                                      input=True,
                                      frames_per_buffer=self.CHUNK)
            for i in range(0,
                           int(self.RATE / self.CHUNK * self.RECORD_SECONDS)):
                data = self.stream.read(self.CHUNK)
                self.frames.append(data)
            self.fullFrames = self.frames
            print(len(self.fullFrames))
            self.path = "Record/recordedSegment" + str(int(
                time.time())) + ".wav"
            self.stream.stop_stream()
            self.stream.close()
            self.p.terminate()
            self.segment = self.frames[self.count:len(self.frames)]
            self.count = self.count + self.cutinterval
            wf = wave.open(self.path, 'wb')
            wf.setnchannels(self.CHANNELS)
            wf.setsampwidth(self.p.get_sample_size(self.FORMAT))
            wf.setframerate(self.RATE)
            wf.writeframes(b''.join(self.segment))
            wf.close()
            print("segment Recording Finished")
            if (len(self.segment) > 100):
                audiofile = AudioSegment.from_wav(self.path)
                [Fs, x] = aIO.readAudioFile(self.path)
                segments = aS.silenceRemoval(x,
                                             Fs,
                                             0.020,
                                             0.020,
                                             smoothWindow=1.0,
                                             Weight=0.3,
                                             plot=False)
                newname = time.time()
                for i in range(0, len(segments)):
                    z = segments[i]
                    startpoint = z[0]
                    endpoint = z[1]
                    sliceSound = audiofile[startpoint * 1000:endpoint * 1000]
                    sliceSound.export(self.path, format="wav")
                    newname = newname + 1
                    print("--Segmenter finished--")
                result = self.model.cnn_predict(self.path)
                print("predicting......")
                print(result)
                print("done.....")

        if (len(self.frames) != 0 & self.isrecording == False):
            print(len(self.frames))
            self.stream.stop_stream()
            self.stream.close()
            self.p.terminate()
            self.paths = "Record/FullRecord" + str(int(time.time())) + ".wav"
            wf = wave.open(self.paths, 'wb')
            wf.setnchannels(self.CHANNELS)
            wf.setsampwidth(self.p.get_sample_size(self.FORMAT))
            wf.setframerate(self.RATE)
            wf.writeframes(b''.join(self.fullFrames))
            wf.close()
            self.frames = []
            self.segment = []
            print("--Finished Recording--")
Example #40
0
def dirWavFeatureExtraction(dirName,
                            mtWin,
                            mtStep,
                            stWin,
                            stStep,
                            computeBEAT=False):
    """
    This function extracts the mid-term features of the WAVE files of a particular folder.

    The resulting feature vector is extracted by long-term averaging the mid-term features.
    Therefore ONE FEATURE VECTOR is extracted for each WAV file.

    ARGUMENTS:
        - dirName:        the path of the WAVE directory
        - mtWin, mtStep:    mid-term window and step (in seconds)
        - stWin, stStep:    short-term window and step (in seconds)
    """

    allMtFeatures = numpy.array([])
    processingTimes = []

    types = ('*.wav', '*.aif', '*.aiff', '*.mp3', '*.au')
    wavFilesList = []
    for files in types:
        wavFilesList.extend(glob.glob(os.path.join(dirName, files)))

    wavFilesList = sorted(wavFilesList)
    wavFilesList2 = []
    for i, wavFile in enumerate(wavFilesList):
        print("Analyzing file {0:d} of {1:d}: {2:s}".format(
            i + 1, len(wavFilesList), wavFile.encode('utf-8')))
        if os.stat(wavFile).st_size == 0:
            print("   (EMPTY FILE -- SKIPPING)")
            continue
        [Fs, x] = audioBasicIO.readAudioFile(wavFile)  # read file
        if isinstance(x, int):
            continue

        t1 = time.clock()
        # convert stereo to mono
        x = audioBasicIO.stereo2mono(x)
        if x.shape[0] < float(Fs) / 10:
            print("  (AUDIO FILE TOO SMALL - SKIPPING)")
            continue
        wavFilesList2.append(wavFile)
        if computeBEAT:  # mid-term feature extraction for current file
            [MidTermFeatures,
             stFeatures] = mtFeatureExtraction(x, Fs, round(mtWin * Fs),
                                               round(mtStep * Fs),
                                               round(Fs * stWin),
                                               round(Fs * stStep))
            [beat, beatConf] = beatExtraction(stFeatures, stStep)
        else:
            [MidTermFeatures, _] = mtFeatureExtraction(x, Fs,
                                                       round(mtWin * Fs),
                                                       round(mtStep * Fs),
                                                       round(Fs * stWin),
                                                       round(Fs * stStep))

        MidTermFeatures = numpy.transpose(MidTermFeatures)
        # long term averaging of mid-term statistics
        MidTermFeatures = MidTermFeatures.mean(axis=0)
        if (not numpy.isnan(MidTermFeatures).any()) and (
                not numpy.isinf(MidTermFeatures).any()):
            if computeBEAT:
                MidTermFeatures = numpy.append(MidTermFeatures, beat)
                MidTermFeatures = numpy.append(MidTermFeatures, beatConf)
            if len(allMtFeatures) == 0:  # append feature vector
                allMtFeatures = MidTermFeatures
            else:
                allMtFeatures = numpy.vstack((allMtFeatures, MidTermFeatures))
            t2 = time.clock()
            duration = float(len(x)) / Fs
            processingTimes.append((t2 - t1) / duration)
    if len(processingTimes) > 0:
        print("Feature extraction complexity ratio: {0:.1f} x realtime".format(
            (1.0 / numpy.mean(numpy.array(processingTimes)))))
    return (allMtFeatures, wavFilesList2)
Example #41
0
 #             for x in range(len(starts)):
 #                 CnumSum += 1
 #                 for y in range(len(change_point)):
 #                     if starts[x]-10 <= change_point[y] and starts[x]+10 >= change_point[y]:
 #                         CnumTrue += 1
 #             for a in range(len(change_point)):
 #                 PnumSum += 1
 #                 for b in range(len(starts)):
 #                     if starts[b]+20 <= change_point[a] and ends[b]-20 >= change_point[a]:
 #                         PnumTrue += 1
 #             print(audiofile)
 # Coverage = float(CnumTrue)/float(CnumSum)
 # Purity = 1 - float(PnumTrue)/float(PnumSum)
 # print("Coverage:{0} Purity:{1}".format(Coverage,Purity))
 for i in range(nExp):
     [Fs1, x1] = audioBasicIO.readAudioFile(filename)
     tcls = aS.speakerDiarization(filename, 2, LDAdim=0, PLOT=False)
     audio = AudioSegment.from_wav(filename)
     audio_length = len(audio)
     change_point = [[] for i in range(2)]
     for j in range(len(tcls) - 1):
         if tcls[j] != tcls[j + 1]:
             change_point[0].append(j)
             change_point[1].append(int(tcls[j]))
         if j == len(tcls) - 2:
             change_point[1].append(int(tcls[j]))
     for num in range(len(change_point[1])):
         if num == 0:
             seg_audio = audio[:audio_length * change_point[0][0] /
                               len(tcls)]
             seg_audio.export(os.path.join(
Example #42
0
def dirWavFeatureExtraction(dirName,
                            mtWin,
                            mtStep,
                            stWin,
                            stStep,
                            computeBEAT=False):
    """
    This function extracts the mid-term features of the WAVE files of a particular folder.

    The resulting feature vector is extracted by long-term averaging the mid-term features.
    Therefore ONE FEATURE VECTOR is extracted for each WAV file.

    ARGUMENTS:
        - dirName:        the path of the WAVE directory
        - mtWin, mtStep:    mid-term window and step (in seconds)
        - stWin, stStep:    short-term window and step (in seconds)
    """

    allMtFeatures = numpy.array([])
    processingTimes = []

    types = ('*.wav', '*.aif', '*.aiff')
    wavFilesList = []
    for files in types:
        wavFilesList.extend(glob.glob(os.path.join(dirName, files)))

    wavFilesList = sorted(wavFilesList)

    for wavFile in wavFilesList:
        [Fs, x] = audioBasicIO.readAudioFile(wavFile)  # read file
        t1 = time.clock()
        x = audioBasicIO.stereo2mono(x)  # convert stereo to mono
        if computeBEAT:  # mid-term feature extraction for current file
            [MidTermFeatures,
             stFeatures] = mtFeatureExtraction(x, Fs, round(mtWin * Fs),
                                               round(mtStep * Fs),
                                               round(Fs * stWin),
                                               round(Fs * stStep))
            [beat, beatConf] = beatExtraction(stFeatures, stStep)
        else:
            [MidTermFeatures, _] = mtFeatureExtraction(x, Fs,
                                                       round(mtWin * Fs),
                                                       round(mtStep * Fs),
                                                       round(Fs * stWin),
                                                       round(Fs * stStep))

        MidTermFeatures = numpy.transpose(MidTermFeatures)
        MidTermFeatures = MidTermFeatures.mean(
            axis=0)  # long term averaging of mid-term statistics
        if computeBEAT:
            MidTermFeatures = numpy.append(MidTermFeatures, beat)
            MidTermFeatures = numpy.append(MidTermFeatures, beatConf)
        if len(allMtFeatures) == 0:  # append feature vector
            allMtFeatures = MidTermFeatures
        else:
            allMtFeatures = numpy.vstack((allMtFeatures, MidTermFeatures))
        t2 = time.clock()
        duration = float(len(x)) / Fs
        processingTimes.append((t2 - t1) / duration)
    if len(processingTimes) > 0:
        print "Feature extraction complexity ratio: {0:.1f} x realtime".format(
            (1.0 / numpy.mean(numpy.array(processingTimes))))
    return (allMtFeatures, wavFilesList)
import csv
import random as rn
import math
import operator
import numpy as np
import audioBasicIO
import audioFeatureExtraction

dire = r"G:\5th sem\ee320 DSP\project\csv\newdata\dataset\testdata\angry"
c = 1
for filename in os.walk(dire):
    for x in filename[2]:
        label1 = []
        label = []
        file = filename[0] + "\\" + str(x)
        [Fs, x] = audioBasicIO.readAudioFile(file)
        F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.50 * Fs,
                                                       0.25 * Fs)
        for i in range(len(F[0])):
            label1.append(3)
        label.append(label1)
        G = np.append(F, label, axis=0)
        loc = r"G:\5th sem\ee320 DSP\project\csv\newdata\dataset\testdata\angrycsv\an" + str(
            c) + ".csv"
        c = c + 1
        np.savetxt(loc, G, delimiter=",")

dire = r"G:\5th sem\ee320 DSP\project\csv\newdata\dataset\testdata\sad"
c = 1

for filename in os.walk(dire):
Example #44
0
def mtFileClassification(inputFile, modelName, modelType, plotResults = False, gtFile = ""):
	'''
	This function performs mid-term classification of an audio stream.
	Towards this end, supervised knowledge is used, i.e. a pre-trained classifier.
	ARGUMENTS:
		- inputFile:		path of the input WAV file
		- modelName:		name of the classification model
		- modelType:		svm or knn depending on the classifier type
		- plotResults:		True if results are to be plotted using matplotlib along with a set of statistics
	
	RETURNS:
	  	- segs:			a sequence of segment's endpoints: segs[i] is the endpoint of the i-th segment (in seconds)
		- classes:		a sequence of class flags: class[i] is the class ID of the i-th segment
	'''

	if not os.path.isfile(modelName):
		print "mtFileClassificationError: input modelType not found!"
		return (-1,-1,-1)
	# Load classifier:
	if modelType=='svm':
		[Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadSVModel(modelName)
	elif modelType=='knn':
		[Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadKNNModel(modelName)
	if computeBEAT:
		print "Model " + modelName + " contains long-term music features (beat etc) and cannot be used in segmentation"	
		return (-1,-1,-1)
	[Fs, x] = audioBasicIO.readAudioFile(inputFile)					# load input file
	if Fs == -1:									# could not read file
		return  (-1,-1,-1)
	x = audioBasicIO.stereo2mono(x);						# convert stereo (if) to mono
	Duration = len(x) / Fs					
											# mid-term feature extraction:
	[MidTermFeatures, _] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs, round(Fs*stWin), round(Fs*stStep));
	flags = []; Ps = []; flagsInd = []
	for i in range(MidTermFeatures.shape[1]): 					# for each feature vector (i.e. for each fix-sized segment):
		curFV = (MidTermFeatures[:, i] - MEAN) / STD;				# normalize current feature vector					
		[Result, P] = aT.classifierWrapper(Classifier, modelType, curFV)	# classify vector
		flagsInd.append(Result)
		flags.append(classNames[int(Result)])					# update class label matrix
		Ps.append(numpy.max(P))							# update probability matrix
	flagsInd = numpy.array(flagsInd)

	# 1-window smoothing
	for i in range(1, len(flagsInd)-1):
		if flagsInd[i-1]==flagsInd[i+1]:
			flagsInd[i] = flagsInd[i+1]
	(segs, classes) = flags2segs(flags, mtStep)					# convert fix-sized flags to segments and classes
	segs[-1] = len(x) / float(Fs)

	# Load grount-truth:
	if os.path.isfile(gtFile):
		[segStartGT, segEndGT, segLabelsGT] = readSegmentGT(gtFile)		
		flagsGT, classNamesGT = segs2flags(segStartGT, segEndGT, segLabelsGT, mtStep)
		flagsIndGT = []
		for j, fl in enumerate(flagsGT):					# "align" labels with GT
			if classNamesGT[flagsGT[j]] in classNames:
				flagsIndGT.append( classNames.index( classNamesGT[flagsGT[j]] ) )
			else:
				flagsIndGT.append( -1 )
		flagsIndGT = numpy.array(flagsIndGT)
	else:
		flagsIndGT = numpy.array([])
	acc = plotSegmentationResults(flagsInd, flagsIndGT, classNames, mtStep, not plotResults)
	if acc>=0:
		print "Overall Accuracy: {0:.3f}".format(acc)
	return (flagsInd, classNames, acc)
Example #45
0
def fileChromagramWrapper(wavFileName):
    if not os.path.isfile(wavFileName):
        raise Exception("Input audio file not found!")
    [Fs, x] = audioBasicIO.readAudioFile(wavFileName)
    x = audioBasicIO.stereo2mono(x)
    specgram, TimeAxis, FreqAxis = aF.stChromagram(x, Fs, round(Fs * 0.040), round(Fs * 0.040), True)
Example #46
0
#!/usr/bin/python

import sys
import audioBasicIO as aIO
import audioSegmentation as aS

[Fs, x] = aIO.readAudioFile(sys.argv[1])
segments = aS.silenceRemoval(x, Fs, 0.020, 0.020, smoothWindow = 1.0, Weight = 0.3, plot = False)

sid = 1
for i in segments:
    print('%s,%.4f,%.4f' % (str(sid).zfill(4),i[0],i[1]))
    sid += 1
def trainHMM_fromDir(dirPath, hmm_model_name, mt_win, mt_step):
    '''
	This function trains a HMM model for segmentation-classification using
	a where WAV files and .segment (ground-truth files) are stored
	ARGUMENTS:
	 - dirPath:        the path of the data diretory
	 - hmm_model_name:    the name of the HMM model to be stored
	 - mt_win:        mid-term window size
	 - mt_step:        mid-term window step
	RETURNS:
	 - hmm:            an object to the resulting HMM
	 - class_names:        a list of class_names

	After training, hmm, class_names, along with the mt_win
	and mt_step values are stored in the hmm_model_name file
	'''

    flags_all = numpy.array([])
    classes_all = []
    for i, f in enumerate(glob.glob(dirPath + os.sep + '*.wav')):
        # for each WAV file
        wav_file = f
        gt_file = f.replace('.wav', '.segments')
        if not os.path.isfile(gt_file):
            continue
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags, class_names = segs2flags(seg_start, seg_end, seg_labs, mt_step)
        for c in class_names:
            # update class names:
            if c not in classes_all:
                classes_all.append(c)
        [fs, x] = audioBasicIO.readAudioFile(wav_file)
        [F, _, _] = aF.mtFeatureExtraction(x, fs, mt_win * fs, mt_step * fs,
                                           round(fs * 0.050),
                                           round(fs * 0.050))

        lenF = F.shape[1]
        lenL = len(flags)
        min_sm = min(lenF, lenL)
        F = F[:, 0:min_sm]
        flags = flags[0:min_sm]

        flagsNew = []
        for j, fl in enumerate(flags):  # append features and labels
            flagsNew.append(classes_all.index(class_names[flags[j]]))

        flags_all = numpy.append(flags_all, numpy.array(flagsNew))

        if i == 0:
            f_all = F
        else:
            f_all = numpy.concatenate((f_all, F), axis=1)
    start_prob, transmat, means, cov = trainHMM_computeStatistics(
        f_all, flags_all)  # compute HMM statistics
    hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")  # train HMM
    hmm.startprob_ = start_prob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov

    fo = open(hmm_model_name, "wb")  # save HMM model
    cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(classes_all, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mt_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mt_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()

    return hmm, classes_all
def speakerDiarization(fileName, numOfSpeakers, mtSize=2.0, mtStep=0.2, stWin=0.05, LDAdim=35, PLOT=False):
    '''
    ARGUMENTS:
        - fileName:        the name of the WAV file to be analyzed
        - numOfSpeakers    the number of speakers (clusters) in the recording (<=0 for unknown)
        - mtSize (opt)     mid-term window size
        - mtStep (opt)     mid-term window step
        - stWin  (opt)     short-term window size
        - LDAdim (opt)     LDA dimension (0 for no LDA)
        - PLOT     (opt)   0 for not plotting the results 1 for plottingy
    '''
    [Fs, x] = audioBasicIO.readAudioFile(fileName)
    x = audioBasicIO.stereo2mono(x)
    Duration = len(x) / Fs

    [Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.loadKNNModel(
        "data/knnSpeakerAll")
    [Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.loadKNNModel(
        "data/knnSpeakerFemaleMale")

    [MidTermFeatures, ShortTermFeatures] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin),
                                                                  round(Fs * stWin * 0.5))

    MidTermFeatures2 = numpy.zeros(
        (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2), MidTermFeatures.shape[1]))

    for i in range(MidTermFeatures.shape[1]):
        curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
        curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2
        [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
        [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
        MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
        MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0] + len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1)::, i] = P2 + 0.0001

    MidTermFeatures = MidTermFeatures2  # TODO
    # SELECT FEATURES:
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20];                                                                                         # SET 0A
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100];                                                                                 # SET 0B
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,
    #   97,98, 99,100];     # SET 0C

    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
                       53]  # SET 1A
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];                                          # SET 1B
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 1C

    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53];             # SET 2A
    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];     # SET 2B
    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 2C

    # iFeaturesSelect = range(100);                                                                                                    # SET 3
    # MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010

    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]

    (MidTermFeaturesNorm, MEAN, STD) = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]

    # remove outliers:
    DistancesAll = numpy.sum(distance.squareform(distance.pdist(MidTermFeaturesNorm.T)), axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]

    # TODO: Combine energy threshold for outlier removal:
    # EnergyMin = numpy.min(MidTermFeatures[1,:])
    # EnergyMean = numpy.mean(MidTermFeatures[1,:])
    # Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    # iNonOutLiers = numpy.nonzero(MidTermFeatures[1,:] > Thres)[0]
    # print iNonOutLiers

    perOutLier = (100.0 * (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows
    MidTermFeaturesNormOr = MidTermFeaturesNorm
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

    # LDA dimensionality reduction:
    if LDAdim > 0:
        # [mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, stWin * Fs, round(Fs*stWin), round(Fs*stWin));
        # extract mid-term features with minimum step:
        mtWinRatio = int(round(mtSize / stWin))
        mtStepRatio = int(round(stWin / stWin))
        mtFeaturesToReduce = []
        numOfFeatures = len(ShortTermFeatures)
        numOfStatistics = 2
        # for i in range(numOfStatistics * numOfFeatures + 1):
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append([])

        for i in range(numOfFeatures):  # for each of the short-term features:
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mtWinRatio
                if N2 > N:
                    N2 = N
                curStFeatures = ShortTermFeatures[i][N1:N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i + numOfFeatures].append(numpy.std(curStFeatures))
                curPos += mtStepRatio
        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
        mtFeaturesToReduce2 = numpy.zeros(
            (mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2), mtFeaturesToReduce.shape[1]))
        for i in range(mtFeaturesToReduce.shape[1]):
            curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
            curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
            [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
            [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
            mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i]
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0] + len(classNames1),
            i] = P1 + 0.0001
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] + len(classNames1)::, i] = P2 + 0.0001
        mtFeaturesToReduce = mtFeaturesToReduce2
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
        # mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
        (mtFeaturesToReduce, MEAN, STD) = aT.normalizeFeatures([mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T
        # DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
        # MDistancesAll = numpy.mean(DistancesAll)
        # iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0*MDistancesAll)[0]
        # mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]
        Labels = numpy.zeros((mtFeaturesToReduce.shape[1],))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin
        # print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * stWin / LDAstepRatio);
        clf = LDA(n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels, tol=0.000001)
        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

    if numOfSpeakers <= 0:
        sRange = range(2, 10)
    else:
        sRange = [numOfSpeakers]
    clsAll = []
    silAll = []
    centersAll = []

    for iSpeakers in sRange:
        cls, means, steps = mlpy.kmeans(MidTermFeaturesNorm.T, k=iSpeakers, plus=True)  # perform k-means clustering

        # YDist =   distance.pdist(MidTermFeaturesNorm.T, metric='euclidean')
        # print distance.squareform(YDist).shape
        # hc = mlpy.HCluster()
        # hc.linkage(YDist)
        # cls = hc.cut(14.5)
        # print cls

        # Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
        clsAll.append(cls)
        centersAll.append(means)
        silA = [];
        silB = []
        for c in range(iSpeakers):  # for each speaker (i.e. for each extracted cluster)
            clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float(len(cls))
            if clusterPerCent < 0.020:
                silA.append(0.0)
                silB.append(0.0)
            else:
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]  # get subset of feature vectors
                Yt = distance.pdist(
                    MidTermFeaturesNormTemp.T)  # compute average distance between samples that belong to the cluster (a values)
                silA.append(numpy.mean(Yt) * clusterPerCent)
                silBs = []
                for c2 in range(iSpeakers):  # compute distances from samples of other clusters
                    if c2 != c:
                        clusterPerCent2 = numpy.nonzero(cls == c2)[0].shape[0] / float(len(cls))
                        MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:, cls == c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T)
                        silBs.append(numpy.mean(Yt) * (clusterPerCent + clusterPerCent2) / 2.0)
                silBs = numpy.array(silBs)
                silB.append(min(silBs))  # ... and keep the minimum value (i.e. the distance from the "nearest" cluster)
        silA = numpy.array(silA)
        silB = numpy.array(silB)
        sil = []
        for c in range(iSpeakers):  # for each cluster (speaker)
            sil.append((silB[c] - silA[c]) / (max(silB[c], silA[c]) + 0.00001))  # compute silhouette

        silAll.append(numpy.mean(sil))  # keep the AVERAGE SILLOUETTE

    # silAll = silAll * (1.0/(numpy.power(numpy.array(sRange),0.5)))
    imax = numpy.argmax(silAll)  # position of the maximum sillouette value
    nSpeakersFinal = sRange[imax]  # optimal number of clusters

    return nSpeakersFinal
Example #49
0
def trainHMM_fromDir(dirPath, hmmModelName, mtWin, mtStep):
    '''
    This function trains a HMM model for segmentation-classification using a where WAV files and .segment (ground-truth files) are stored
    ARGUMENTS:
     - dirPath:        the path of the data diretory
     - hmmModelName:    the name of the HMM model to be stored
     - mtWin:        mid-term window size
     - mtStep:        mid-term window step
    RETURNS:
     - hmm:            an object to the resulting HMM
     - classNames:        a list of classNames

    After training, hmm, classNames, along with the mtWin and mtStep values are stored in the hmmModelName file
    '''

    flagsAll = numpy.array([])
    classesAll = []
    for i, f in enumerate(glob.glob(dirPath + os.sep +
                                    '*.wav')):  # for each WAV file
        wavFile = f
        gtFile = f.replace('.wav', '.segments')  # open for annotated file
        if not os.path.isfile(
                gtFile
        ):  # if current WAV file does not have annotation -> skip
            continue
        [segStart, segEnd, segLabels] = readSegmentGT(gtFile)  # read GT data
        flags, classNames = segs2flags(segStart, segEnd, segLabels,
                                       mtStep)  # convert to flags
        for c in classNames:  # update classnames:
            if c not in classesAll:
                classesAll.append(c)
        [Fs, x] = audioBasicIO.readAudioFile(wavFile)  # read audio data
        [F,
         _] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs,
                                     round(Fs * 0.050),
                                     round(Fs * 0.050))  # feature extraction

        lenF = F.shape[1]
        lenL = len(flags)
        MIN = min(lenF, lenL)
        F = F[:, 0:MIN]
        flags = flags[0:MIN]

        flagsNew = []
        for j, fl in enumerate(flags):  # append features and labels
            flagsNew.append(classesAll.index(classNames[flags[j]]))

        flagsAll = numpy.append(flagsAll, numpy.array(flagsNew))

        if i == 0:
            Fall = F
        else:
            Fall = numpy.concatenate((Fall, F), axis=1)
    startprob, transmat, means, cov = trainHMM_computeStatistics(
        Fall, flagsAll)  # compute HMM statistics
    hmm = sklearn.hmm.GaussianHMM(startprob.shape[0], "diag", startprob,
                                  transmat)  # train HMM
    hmm.means_ = means
    hmm.covars_ = cov

    fo = open(hmmModelName, "wb")  # save HMM model
    cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(classesAll, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtWin, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtStep, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()

    return hmm, classesAll