Example #1
def fileClassification(inputFile, modelName, modelType):
    # Load classifier:

    if not os.path.isfile(modelName):
        print "fileClassification: input modelName not found!"
        return (-1, -1, -1)

    if not os.path.isfile(inputFile):
        print "fileClassification: wav file not found!"
        return (-1, -1, -1)

    if modelType == 'svm':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadSVModel(modelName)
    elif modelType == 'knn':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadKNNModel(modelName)

    [Fs, x] = audioBasicIO.readAudioFile(inputFile)        # read audio file and convert to mono
    x = audioBasicIO.stereo2mono(x)
    # feature extraction:
    [MidTermFeatures, s] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stStep))
    MidTermFeatures = MidTermFeatures.mean(axis=1)        # long term averaging of mid-term statistics
    if computeBEAT:
        [beat, beatConf] = aF.beatExtraction(s, stStep)
        MidTermFeatures = numpy.append(MidTermFeatures, beat)
        MidTermFeatures = numpy.append(MidTermFeatures, beatConf)
    curFV = (MidTermFeatures - MEAN) / STD                # normalization

    [Result, P] = classifierWrapper(Classifier, modelType, curFV)    # classification        
    return Result, P, classNames
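
A minimal usage sketch for the fileClassification above; the model path "svmSM" and the file "sample.wav" are hypothetical, and a classifier trained with pyAudioAnalysis is assumed to exist on disk:

# Hypothetical inputs: a WAV file plus a trained SVM model named "svmSM".
Result, P, classNames = fileClassification("sample.wav", "svmSM", "svm")
if classNames != -1:                     # (-1, -1, -1) signals a missing file
    for name, p in zip(classNames, P):
        print("{0:s}: {1:.2f}".format(name, p))
    print("Winner class: " + classNames[int(Result)])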
def dirWavFeatureExtraction(dirName, mtWin, mtStep, stWin, stStep, computeBEAT=False):
    """
    This function extracts the mid-term features of the WAVE files of a particular folder.

    The resulting feature vector is extracted by long-term averaging the mid-term features.
    Therefore ONE FEATURE VECTOR is extracted for each WAV file.

    ARGUMENTS:
        - dirName:        the path of the WAVE directory
        - mtWin, mtStep:    mid-term window and step (in seconds)
        - stWin, stStep:    short-term window and step (in seconds)
    """

    allMtFeatures = numpy.array([])
    processingTimes = []

    types = ('*.wav', '*.aif',  '*.aiff', '*.mp3','*.au')
    wavFilesList = []
    for files in types:
        wavFilesList.extend(glob.glob(os.path.join(dirName, files)))

    wavFilesList = sorted(wavFilesList)    
    wavFilesList2 = []
    for i, wavFile in enumerate(wavFilesList):        
        print "Analyzing file {0:d} of {1:d}: {2:s}".format(i+1, len(wavFilesList), wavFile.encode('utf-8'))
        if os.stat(wavFile).st_size == 0:
            print "   (EMPTY FILE -- SKIPPING)"
            continue        
        [Fs, x] = audioBasicIO.readAudioFile(wavFile)            # read file    
        if isinstance(x, int):
            continue        

        t1 = time.clock()        
        x = audioBasicIO.stereo2mono(x)                          # convert stereo to mono                
        if x.shape[0]<float(Fs)/10:
            print "  (AUDIO FILE TOO SMALL - SKIPPING)"
            continue
        wavFilesList2.append(wavFile)
        if computeBEAT:                                          # mid-term feature extraction for current file
            [MidTermFeatures, stFeatures] = mtFeatureExtraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs), round(Fs * stWin), round(Fs * stStep))
            [beat, beatConf] = beatExtraction(stFeatures, stStep)
        else:
            [MidTermFeatures, _] = mtFeatureExtraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs), round(Fs * stWin), round(Fs * stStep))

        MidTermFeatures = numpy.transpose(MidTermFeatures)
        MidTermFeatures = MidTermFeatures.mean(axis=0)         # long term averaging of mid-term statistics
        if (not numpy.isnan(MidTermFeatures).any()) and (not numpy.isinf(MidTermFeatures).any()):            
            if computeBEAT:
                MidTermFeatures = numpy.append(MidTermFeatures, beat)
                MidTermFeatures = numpy.append(MidTermFeatures, beatConf)
            if len(allMtFeatures) == 0:                              # append feature vector
                allMtFeatures = MidTermFeatures
            else:
                allMtFeatures = numpy.vstack((allMtFeatures, MidTermFeatures))
            t2 = time.clock()
            duration = float(len(x)) / Fs
            processingTimes.append((t2 - t1) / duration)
    if len(processingTimes) > 0:
        print "Feature extraction complexity ratio: {0:.1f} x realtime".format((1.0 / numpy.mean(numpy.array(processingTimes))))
    return (allMtFeatures, wavFilesList2)
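
A usage sketch for dirWavFeatureExtraction; the folder "./audio" is hypothetical. One long-term averaged vector is returned per file, here with a 1 s mid-term and 50 ms short-term window:

features, fileNames = dirWavFeatureExtraction("./audio", 1.0, 1.0, 0.050, 0.050)
print("Feature matrix shape: " + str(features.shape))  # files x features (1-D if a single file)
for f in fileNames:
    print(f)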
def fileChromagramWrapper(wavFileName):
    if not os.path.isfile(wavFileName):
        raise Exception("Input audio file not found!")
    [Fs, x] = audioBasicIO.readAudioFile(wavFileName)
    x = audioBasicIO.stereo2mono(x)
    specgram, TimeAxis, FreqAxis = aF.stChromagram(x, Fs, round(Fs * 0.040),
                                                   round(Fs * 0.040), True)
def mtFeatureExtractionToFile(fileName, midTermSize, midTermStep, shortTermSize, shortTermStep, outPutFile,
                              storeStFeatures=False, storeToCSV=False, PLOT=False):
    """
    This function is used as a wrapper to:
    a) read the content of a WAV file
    b) perform mid-term feature extraction on that signal
    c) write the mid-term feature sequences to a numpy file
    """
    [Fs, x] = audioBasicIO.readAudioFile(fileName)            # read the wav file
    x = audioBasicIO.stereo2mono(x)                           # convert to MONO if required
    if storeStFeatures:
        [mtF, stF] = mtFeatureExtraction(x, Fs, round(Fs * midTermSize), round(Fs * midTermStep), round(Fs * shortTermSize), round(Fs * shortTermStep))
    else:
        [mtF, _] = mtFeatureExtraction(x, Fs, round(Fs*midTermSize), round(Fs * midTermStep), round(Fs * shortTermSize), round(Fs * shortTermStep))

    numpy.save(outPutFile, mtF)                              # save mt features to numpy file
    if PLOT:
        print "Mid-term numpy file: " + outPutFile + ".npy saved"
    if storeToCSV:
        numpy.savetxt(outPutFile+".csv", mtF.T, delimiter=",")
        if PLOT:
            print "Mid-term CSV file: " + outPutFile + ".csv saved"

    if storeStFeatures:
        numpy.save(outPutFile+"_st", stF)                    # save st features to numpy file
        if PLOT:
            print "Short-term numpy file: " + outPutFile + "_st.npy saved"
        if storeToCSV:
            numpy.savetxt(outPutFile+"_st.csv", stF.T, delimiter=",")    # store st features to CSV file
            if PLOT:
                print "Short-term CSV file: " + outPutFile + "_st.csv saved"
Example #6
def fileSpectrogramWrapper(wav_file):
    if not os.path.isfile(wav_file):
        raise Exception("Input audio file not found!")
    [fs, x] = audioBasicIO.readAudioFile(wav_file)
    x = audioBasicIO.stereo2mono(x)
    specgram, TimeAxis, FreqAxis = aF.stSpectogram(x, fs, round(fs * 0.040),
                                                   round(fs * 0.040), True)
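
fileSpectrogramWrapper and the fileChromagramWrapper above share one pattern: read the file, down-mix to mono, and plot with a 40 ms window and step. A sketch with a hypothetical file name:

fileSpectrogramWrapper("sample.wav")  # plots, since the last (PLOT) argument is True
fileChromagramWrapper("sample.wav")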
Example #7
def fileChromagramWrapper(wavFileName):
    if not os.path.isfile(wavFileName):
        raise Exception("Input audio file not found!")
    [Fs, x] = audioBasicIO.readAudioFile(wavFileName)
    x = audioBasicIO.stereo2mono(x)
    specgram, TimeAxis, FreqAxis = aF.stChromagram(x, Fs, round(Fs * 0.040),
                                                   round(Fs * 0.040), True)
def dirWavFeatureExtraction(dirName, mtWin, mtStep, stWin, stStep, computeBEAT=False):
    """
    This function extracts the mid-term features of the WAVE files of a particular folder.

    The resulting feature vector is extracted by long-term averaging the mid-term features.
    Therefore ONE FEATURE VECTOR is extracted for each WAV file.

    ARGUMENTS:
        - dirName:        the path of the WAVE directory
        - mtWin, mtStep:    mid-term window and step (in seconds)
        - stWin, stStep:    short-term window and step (in seconds)
    """

    allMtFeatures = numpy.array([])
    processingTimes = []

    types = ('*.wav', '*.aif',  '*.aiff', '*.mp3','*.au')
    wavFilesList = []
    for files in types:
        wavFilesList.extend(glob.glob(os.path.join(dirName, files)))

    wavFilesList = sorted(wavFilesList)    

    for i, wavFile in enumerate(wavFilesList):        
        print "Analyzing file {0:d} of {1:d}: {2:s}".format(i+1, len(wavFilesList), wavFile.encode('utf-8'))
        if os.stat(wavFile).st_size == 0:
            print "   (EMPTY FILE -- SKIPPING)"
            continue        
        [Fs, x] = audioBasicIO.readAudioFile(wavFile)            # read file    
        if isinstance(x, int):
            continue        


        t1 = time.clock()        
        x = audioBasicIO.stereo2mono(x)                          # convert stereo to mono                
        if x.shape[0]<float(Fs)/10:
            print "  (AUDIO FILE TOO SMALL - SKIPPING)"
            continue
        if computeBEAT:                                          # mid-term feature extraction for current file
            [MidTermFeatures, stFeatures] = mtFeatureExtraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs), round(Fs * stWin), round(Fs * stStep))
            [beat, beatConf] = beatExtraction(stFeatures, stStep)
        else:
            [MidTermFeatures, _] = mtFeatureExtraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs), round(Fs * stWin), round(Fs * stStep))

        MidTermFeatures = numpy.transpose(MidTermFeatures)
        MidTermFeatures = MidTermFeatures.mean(axis=0)         # long term averaging of mid-term statistics
        if (not numpy.isnan(MidTermFeatures).any()) and (not numpy.isinf(MidTermFeatures).any()):            
            if computeBEAT:
                MidTermFeatures = numpy.append(MidTermFeatures, beat)
                MidTermFeatures = numpy.append(MidTermFeatures, beatConf)
            if len(allMtFeatures) == 0:                              # append feature vector
                allMtFeatures = MidTermFeatures
            else:
                allMtFeatures = numpy.vstack((allMtFeatures, MidTermFeatures))
            t2 = time.clock()
            duration = float(len(x)) / Fs
            processingTimes.append((t2 - t1) / duration)
    if len(processingTimes) > 0:
        print "Feature extraction complexity ratio: {0:.1f} x realtime".format((1.0 / numpy.mean(numpy.array(processingTimes))))
    return (allMtFeatures, wavFilesList)
def mtFeatureExtractionToFile(fileName, midTermSize, midTermStep, shortTermSize, shortTermStep, outPutFile,
                              storeStFeatures=False, storeToCSV=False, PLOT=False):
    """
    This function is used as a wrapper to:
    a) read the content of a WAV file
    b) perform mid-term feature extraction on that signal
    c) write the mid-term feature sequences to a numpy file
    """
    [Fs, x] = audioBasicIO.readAudioFile(fileName)            # read the wav file
    x = audioBasicIO.stereo2mono(x)                           # convert to MONO if required
    if storeStFeatures:
        [mtF, stF] = mtFeatureExtraction(x, Fs, round(Fs * midTermSize), round(Fs * midTermStep), round(Fs * shortTermSize), round(Fs * shortTermStep))
    else:
        [mtF, _] = mtFeatureExtraction(x, Fs, round(Fs*midTermSize), round(Fs * midTermStep), round(Fs * shortTermSize), round(Fs * shortTermStep))

    numpy.save(outPutFile, mtF)                              # save mt features to numpy file
    if PLOT:
        print "Mid-term numpy file: " + outPutFile + ".npy saved"
    if storeToCSV:
        numpy.savetxt(outPutFile+".csv", mtF.T, delimiter=",")
        if PLOT:
            print "Mid-term CSV file: " + outPutFile + ".csv saved"

    if storeStFeatures:
        numpy.save(outPutFile+"_st", stF)                    # save st features to numpy file
        if PLOT:
            print "Short-term numpy file: " + outPutFile + "_st.npy saved"
        if storeToCSV:
            numpy.savetxt(outPutFile+"_st.csv", stF.T, delimiter=",")    # store st features to CSV file
            if PLOT:
                print "Short-term CSV file: " + outPutFile + "_st.csv saved"
Example #10
def fileRegression(inputFile, modelName, modelType):
    # Load classifier:

    if not os.path.isfile(inputFile):
        print "fileClassification: wav file not found!"
        return (-1, -1, -1)

    regressionModels = glob.glob(modelName + "_*")
    regressionModels2 = []
    for r in regressionModels:
        if r[-5::] != "MEANS":
            regressionModels2.append(r)
    regressionModels = regressionModels2
    regressionNames = []
    for r in regressionModels:
        regressionNames.append(r[r.rfind("_") + 1::])

    # FEATURE EXTRACTION
    # LOAD ONLY THE FIRST MODEL (for mtWin, etc)
    if modelType == 'svm':
        [_, _, _, mtWin, mtStep, stWin, stStep,
         computeBEAT] = loadSVModel(regressionModels[0], True)
    elif modelType == 'knn':
        [_, _, _, mtWin, mtStep, stWin, stStep,
         computeBEAT] = loadKNNModel(regressionModels[0], True)

    [Fs, x] = audioBasicIO.readAudioFile(
        inputFile)  # read audio file and convert to mono
    x = audioBasicIO.stereo2mono(x)
    # feature extraction:
    [MidTermFeatures, s] = aF.mtFeatureExtraction(x, Fs,
                                                  mtWin * Fs, mtStep * Fs,
                                                  round(Fs * stWin),
                                                  round(Fs * stStep))
    MidTermFeatures = MidTermFeatures.mean(
        axis=1)  # long term averaging of mid-term statistics
    if computeBEAT:
        [beat, beatConf] = aF.beatExtraction(s, stStep)
        MidTermFeatures = numpy.append(MidTermFeatures, beat)
        MidTermFeatures = numpy.append(MidTermFeatures, beatConf)

    # REGRESSION
    R = []
    for ir, r in enumerate(regressionModels):
        if not os.path.isfile(r):
            print "fileClassification: input modelName not found!"
            return (-1, -1, -1)
        if modelType == 'svm':
            [Model, MEAN, STD, mtWin, mtStep, stWin, stStep,
             computeBEAT] = loadSVModel(r, True)
        elif modelType == 'knn':
            [Model, MEAN, STD, mtWin, mtStep, stWin, stStep,
             computeBEAT] = loadKNNModel(r, True)
        curFV = (MidTermFeatures - MEAN) / STD
        # normalization
        R.append(regressionWrapper(Model, modelType, curFV))  # regression
    return R, regressionNames
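
A usage sketch for fileRegression; the base name "svmSpeechEmotion" is hypothetical, and one trained model per regression task (e.g. svmSpeechEmotion_valence) is expected on disk:

R, regressionNames = fileRegression("sample.wav", "svmSpeechEmotion", "svm")
for name, value in zip(regressionNames, R):  # one prediction per model found
    print("{0:s}: {1:.3f}".format(name, value))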
def hmmSegmentation(wavFileName, hmmModelName, PLOT=False, gtFileName=""):
    if isinstance(wavFileName, str):
        [Fs, x] = audioBasicIO.readAudioFile(wavFileName)  # load input file
    else:
        Fs = 44100
        x = wavFileName
    x = audioBasicIO.stereo2mono(x)

    try:
        fo = open(hmmModelName, "rb")
    except IOError:
        print "didn't find file"
        return

    try:
        hmm = cPickle.load(fo)
        classesAll = cPickle.load(fo)
        mtWin = cPickle.load(fo)
        mtStep = cPickle.load(fo)
    finally:
        fo.close()

    #Features = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs, 0.050*Fs);    # feature extraction
    [Features, _] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs,
                                           round(Fs * 0.050),
                                           round(Fs * 0.050))
    flagsInd = hmm.predict(Features.T)  # apply model
    #for i in range(len(flagsInd)):
    #    if classesAll[flagsInd[i]]=="silence":
    #        flagsInd[i]=classesAll.index("speech")

    # plot results
    if os.path.isfile(gtFileName):
        [segStart, segEnd, segLabels] = readSegmentGT(gtFileName)
        flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels, mtStep)
        flagsGTNew = []
        for j, fl in enumerate(flagsGT):  # "align" labels with GT
            if classNamesGT[flagsGT[j]] in classesAll:
                flagsGTNew.append(classesAll.index(classNamesGT[flagsGT[j]]))
            else:
                flagsGTNew.append(-1)
        CM = numpy.zeros((len(classNamesGT), len(classNamesGT)))
        flagsIndGT = numpy.array(flagsGTNew)
        for i in range(min(flagsInd.shape[0], flagsIndGT.shape[0])):
            CM[int(flagsIndGT[i]), int(flagsInd[i])] += 1
    else:
        flagsIndGT = numpy.array([])
    acc = plotSegmentationResults(flagsInd, flagsIndGT, classesAll, mtStep,
                                  not PLOT)
    if acc >= 0:
        print "Overall Accuracy: {0:.2f}".format(acc)
        return (flagsInd, classNamesGT, acc, CM)
    else:
        return (flagsInd, classesAll, -1, -1)
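
A sketch of calling hmmSegmentation; "hmmRadio" is a hypothetical model file created with aS.trainHMM_fromFile, and without a ground-truth file the accuracy and confusion-matrix slots are -1:

flagsInd, classNames, acc, CM = hmmSegmentation("sample.wav", "hmmRadio",
                                                PLOT=False, gtFileName="")
print("Predicted {0:d} mid-term windows".format(len(flagsInd)))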
Example #12
def dirWavFeatureExtractionNoAveraging(dirName):
    """
    This function extracts the mid-term features of the WAVE files of a particular folder without averaging each file.

    ARGUMENTS:
        - dirName:          the path of the WAVE directory
          (the mid-term window is derived per file as 1/30 of its duration;
           the short-term window and step are fixed at 1 second)
    RETURNS:
        - X:                A feature matrix
        - Y:                A matrix of file labels
        - filenames:        a list of the processed audio file paths
    """

    allMtFeatures = numpy.array([])
    signalIndices = numpy.array([])
    processingTimes = []

    types = ('*.wav', '*.aif', '*.aiff')
    wavFilesList = []
    for files in types:
        wavFilesList.extend(glob.glob(os.path.join(dirName, files)))

    wavFilesList = sorted(wavFilesList)

    for i, wavFile in enumerate(wavFilesList):
        [Fs, x] = audioBasicIO.readAudioFile(wavFile)  # read file

        if isinstance(x, int):
            continue

        mtWin = float(len(x)) / (Fs * 30)
        mtStep = mtWin
        stWin = 1.0
        stStep = 1.0

        x = audioBasicIO.stereo2mono(x)  # convert stereo to mono
        [MidTermFeatures,
         _] = mtFeatureExtraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs),
                                  round(Fs * stWin),
                                  round(Fs * stStep))  # mid-term feature

        MidTermFeatures = numpy.transpose(MidTermFeatures)[0:30]
        #        MidTermFeatures = MidTermFeatures.mean(axis=0)        # long term averaging of mid-term statistics
        if len(allMtFeatures) == 0:  # append feature vector
            allMtFeatures = MidTermFeatures
            signalIndices = numpy.zeros((MidTermFeatures.shape[0], ))
        else:
            allMtFeatures = numpy.vstack((allMtFeatures, MidTermFeatures))
            signalIndices = numpy.append(
                signalIndices, i * numpy.ones((MidTermFeatures.shape[0], )))

    return (allMtFeatures, signalIndices, wavFilesList)
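
A sketch for this no-averaging variant ("./audio" is hypothetical); each file contributes a block of rows to X, with signalIndices recording which file each row came from:

X, signalIndices, fileNames = dirWavFeatureExtractionNoAveraging("./audio")
print("Total mid-term vectors: " + str(X.shape[0]))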
Example #13
def fileRegression(inputFile, model_name, model_type):
    # Load classifier:

    if not os.path.isfile(inputFile):
        print("fileClassification: wav file not found!")
        return (-1, -1, -1)

    regression_models = glob.glob(model_name + "_*")
    regression_models2 = []
    for r in regression_models:
        if r[-5::] != "MEANS":
            regression_models2.append(r)
    regression_models = regression_models2
    regression_names = []
    for r in regression_models:
        regression_names.append(r[r.rfind("_") + 1::])

    # FEATURE EXTRACTION
    # LOAD ONLY THE FIRST MODEL (for mt_win, etc)
    if model_type == 'svm' or model_type == "svm_rbf" or model_type == 'randomforest':
        [_, _, _, mt_win, mt_step, st_win, st_step,
         compute_beat] = load_model(regression_models[0], True)

    [Fs, x] = audioBasicIO.readAudioFile(
        inputFile)  # read audio file and convert to mono
    x = audioBasicIO.stereo2mono(x)
    # feature extraction:
    [mt_features, s, _] = aF.mtFeatureExtraction(x, Fs, mt_win * Fs,
                                                 mt_step * Fs,
                                                 round(Fs * st_win),
                                                 round(Fs * st_step))
    mt_features = mt_features.mean(
        axis=1)  # long term averaging of mid-term statistics
    if compute_beat:
        [beat, beatConf] = aF.beatExtraction(s, st_step)
        mt_features = numpy.append(mt_features, beat)
        mt_features = numpy.append(mt_features, beatConf)

    # REGRESSION
    R = []
    for ir, r in enumerate(regression_models):
        if not os.path.isfile(r):
            print("fileClassification: input model_name not found!")
            return (-1, -1, -1)
        if model_type == 'svm' or model_type == "svm_rbf" \
                or model_type == 'randomforest':
            [model, MEAN, STD, mt_win, mt_step, st_win, st_step, compute_beat] = \
                load_model(r, True)
        curFV = (mt_features - MEAN) / STD  # normalization
        R.append(regressionWrapper(model, model_type, curFV))  # regression
    return R, regression_names
def dirWavFeatureExtraction(dirName, mtWin, mtStep, stWin, stStep, computeBEAT=False):
    """
    This function extracts the mid-term features of the WAVE files of a particular folder.

    The resulting feature vector is extracted by long-term averaging the mid-term features.
    Therefore ONE FEATURE VECTOR is extracted for each WAV file.

    ARGUMENTS:
        - dirName:        the path of the WAVE directory
        - mtWin, mtStep:    mid-term window and step (in seconds)
        - stWin, stStep:    short-term window and step (in seconds)
    """

    allMtFeatures = numpy.array([])
    processingTimes = []

    types = ('*.wav', '*.aif', '*.aiff')
    wavFilesList = []
    for files in types:
        wavFilesList.extend(glob.glob(os.path.join(dirName, files)))

    wavFilesList = sorted(wavFilesList)

    for wavFile in wavFilesList:
        [Fs, x] = audioBasicIO.readAudioFile(wavFile)  # read file
        t1 = time.clock()
        x = audioBasicIO.stereo2mono(x)  # convert stereo to mono
        if computeBEAT:  # mid-term feature extraction for current file
            [MidTermFeatures, stFeatures] = mtFeatureExtraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs),
                                                                round(Fs * stWin), round(Fs * stStep))
            [beat, beatConf] = beatExtraction(stFeatures, stStep)
        else:
            [MidTermFeatures, _] = mtFeatureExtraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs), round(Fs * stWin),
                                                       round(Fs * stStep))

        MidTermFeatures = numpy.transpose(MidTermFeatures)
        MidTermFeatures = MidTermFeatures.mean(axis=0)  # long term averaging of mid-term statistics
        if computeBEAT:
            MidTermFeatures = numpy.append(MidTermFeatures, beat)
            MidTermFeatures = numpy.append(MidTermFeatures, beatConf)
        if len(allMtFeatures) == 0:  # append feature vector
            allMtFeatures = MidTermFeatures
        else:
            allMtFeatures = numpy.vstack((allMtFeatures, MidTermFeatures))
        t2 = time.clock()
        duration = float(len(x)) / Fs
        processingTimes.append((t2 - t1) / duration)
    if len(processingTimes) > 0:
        print "Feature extraction complexity ratio: {0:.1f} x realtime".format(
            (1.0 / numpy.mean(numpy.array(processingTimes))))
    return (allMtFeatures, wavFilesList)
Example #15
def dirWavFeatureExtraction(dirName, mtWin, mtStep, stWin, stStep, computeBEAT=False):
    """
    This function extracts the mid-term features of the WAVE files of a particular folder.

    The resulting feature vector is extracted by long-term averaging the mid-term features.
    Therefore ONE FEATURE VECTOR is extracted for each WAV file.

    ARGUMENTS:
        - dirName:        the path of the WAVE directory
        - mtWin, mtStep:    mid-term window and step (in seconds)
        - stWin, stStep:    short-term window and step (in seconds)
    """

    allMtFeatures = numpy.array([])
    processingTimes = []

    types = ('*.wav', '*.aif', '*.aiff')
    wavFilesList = []
    for files in types:
        wavFilesList.extend(glob.glob(os.path.join(dirName, files)))

    wavFilesList = sorted(wavFilesList)

    for wavFile in wavFilesList:
        [Fs, x] = audioBasicIO.readAudioFile(wavFile)            # read file
        t1 = time.clock()
        x = audioBasicIO.stereo2mono(x)                          # convert stereo to mono
        if computeBEAT:                                          # mid-term feature extraction for current file
            [MidTermFeatures, stFeatures] = mtFeatureExtraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs), round(Fs * stWin), round(Fs * stStep))
            [beat, beatConf] = beatExtraction(stFeatures, stStep)
        else:
            [MidTermFeatures, _] = mtFeatureExtraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs), round(Fs * stWin), round(Fs * stStep))

        MidTermFeatures = numpy.transpose(MidTermFeatures)
        MidTermFeatures = MidTermFeatures.mean(axis=0)           # long term averaging of mid-term statistics
        if computeBEAT:
            MidTermFeatures = numpy.append(MidTermFeatures, beat)
            MidTermFeatures = numpy.append(MidTermFeatures, beatConf)
        if len(allMtFeatures) == 0:                              # append feature vector
            allMtFeatures = MidTermFeatures
        else:
            allMtFeatures = numpy.vstack((allMtFeatures, MidTermFeatures))
        t2 = time.clock()
        duration = float(len(x)) / Fs
        processingTimes.append((t2 - t1) / duration)
    if len(processingTimes) > 0:
        print "Feature extraction complexity ratio: {0:.1f} x realtime".format((1.0 / numpy.mean(numpy.array(processingTimes))))
    return (allMtFeatures, wavFilesList)
Example #16
def dirWavFeatureExtractionNoAveraging(dirName, mt_win, mt_step, st_win,
                                       st_step):
    """
    This function extracts the mid-term features of the WAVE
    files of a particular folder without averaging each file.

    ARGUMENTS:
        - dirName:          the path of the WAVE directory
        - mt_win, mt_step:    mid-term window and step (in seconds)
        - st_win, st_step:    short-term window and step (in seconds)
    RETURNS:
        - X:                A feature matrix
        - Y:                A matrix of file labels
        - filenames:        a list of the processed audio file paths
    """

    all_mt_feats = numpy.array([])
    signal_idx = numpy.array([])
    process_times = []

    types = ('*.wav', '*.aif', '*.aiff', '*.ogg')
    wav_file_list = []
    for files in types:
        wav_file_list.extend(glob.glob(os.path.join(dirName, files)))

    wav_file_list = sorted(wav_file_list)

    for i, wavFile in enumerate(wav_file_list):
        [fs, x] = audioBasicIO.readAudioFile(wavFile)
        if isinstance(x, int):
            continue

        x = audioBasicIO.stereo2mono(x)
        [mt_term_feats, _, _] = mtFeatureExtraction(x, fs, round(mt_win * fs),
                                                    round(mt_step * fs),
                                                    round(fs * st_win),
                                                    round(fs * st_step))

        mt_term_feats = numpy.transpose(mt_term_feats)
        if len(all_mt_feats) == 0:  # append feature vector
            all_mt_feats = mt_term_feats
            signal_idx = numpy.zeros((mt_term_feats.shape[0], ))
        else:
            all_mt_feats = numpy.vstack((all_mt_feats, mt_term_feats))
            signal_idx = numpy.append(
                signal_idx, i * numpy.ones((mt_term_feats.shape[0], )))

    return (all_mt_feats, signal_idx, wav_file_list)
def fileRegression(inputFile, modelName, modelType):
    # Load classifier:

    if not os.path.isfile(inputFile):
        print "fileClassification: wav file not found!"
        return (-1, -1, -1)

    regressionModels = glob.glob(modelName + "_*")
    regressionModels2 = []
    for r in regressionModels:
        if r[-5::] != "MEANS":
            regressionModels2.append(r)
    regressionModels = regressionModels2
    regressionNames = []
    for r in regressionModels:
        regressionNames.append(r[r.rfind("_") + 1::])

    # FEATURE EXTRACTION
    # LOAD ONLY THE FIRST MODEL (for mtWin, etc)
    if modelType == 'svm':
        [_, _, _, mtWin, mtStep, stWin, stStep, computeBEAT] = loadSVModel(regressionModels[0], True)
    elif modelType == 'knn':
        [_, _, _, mtWin, mtStep, stWin, stStep, computeBEAT] = loadKNNModel(regressionModels[0], True)

    [Fs, x] = audioBasicIO.readAudioFile(inputFile)  # read audio file and convert to mono
    x = audioBasicIO.stereo2mono(x)
    # feature extraction:
    [MidTermFeatures, s] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stStep))
    MidTermFeatures = MidTermFeatures.mean(axis=1)  # long term averaging of mid-term statistics
    if computeBEAT:
        [beat, beatConf] = aF.beatExtraction(s, stStep)
        MidTermFeatures = numpy.append(MidTermFeatures, beat)
        MidTermFeatures = numpy.append(MidTermFeatures, beatConf)

    # REGRESSION
    R = []
    for ir, r in enumerate(regressionModels):
        if not os.path.isfile(r):
            print "fileClassification: input modelName not found!"
            return (-1, -1, -1)
        if modelType == 'svm':
            [Model, MEAN, STD, mtWin, mtStep, stWin, stStep, computeBEAT] = loadSVModel(r, True)
        elif modelType == 'knn':
            [Model, MEAN, STD, mtWin, mtStep, stWin, stStep, computeBEAT] = loadKNNModel(r, True)
        curFV = (MidTermFeatures - MEAN) / STD  # normalization
        R.append(regressionWrapper(Model, modelType, curFV))  # regression
    return R, regressionNames
Example #18
def fileClassification(inputFile, model_name, model_type):
    # Load classifier:

    if not os.path.isfile(model_name):
        print("fileClassification: input model_name not found!")
        return (-1, -1, -1)

    if not os.path.isfile(inputFile):
        print("fileClassification: wav file not found!")
        return (-1, -1, -1)

    if model_type == 'knn':
        [
            classifier, MEAN, STD, classNames, mt_win, mt_step, st_win,
            st_step, compute_beat
        ] = load_model_knn(model_name)
    else:
        [
            classifier, MEAN, STD, classNames, mt_win, mt_step, st_win,
            st_step, compute_beat
        ] = load_model(model_name)

    [Fs, x] = audioBasicIO.readAudioFile(
        inputFile)  # read audio file and convert to mono
    x = audioBasicIO.stereo2mono(x)

    if isinstance(x, int):  # audio file IO problem
        return (-1, -1, -1)
    if x.shape[0] / float(Fs) <= mt_win:
        return (-1, -1, -1)

    # feature extraction:
    [mt_features, s, _] = aF.mtFeatureExtraction(x, Fs, mt_win * Fs,
                                                 mt_step * Fs,
                                                 round(Fs * st_win),
                                                 round(Fs * st_step))
    mt_features = mt_features.mean(
        axis=1)  # long term averaging of mid-term statistics
    if compute_beat:
        [beat, beatConf] = aF.beatExtraction(s, st_step)
        mt_features = numpy.append(mt_features, beat)
        mt_features = numpy.append(mt_features, beatConf)
    curFV = (mt_features - MEAN) / STD  # normalization

    [Result, P] = classifierWrapper(classifier, model_type,
                                    curFV)  # classification
    return Result, P, classNames
def dirWavFeatureExtractionNoAveraging(dirName, mtWin, mtStep, stWin, stStep):
    """
    This function extracts the mid-term features of the WAVE files of a particular folder without averaging each file.

    ARGUMENTS:
        - dirName:          the path of the WAVE directory
        - mtWin, mtStep:    mid-term window and step (in seconds)
        - stWin, stStep:    short-term window and step (in seconds)
    RETURNS:
        - X:                A feature matrix
        - Y:                A matrix of file labels
        - filenames:        a list of the processed audio file paths
    """

    allMtFeatures = numpy.array([])
    signalIndices = numpy.array([])
    processingTimes = []

    types = ('*.wav', '*.aif',  '*.aiff')
    wavFilesList = []
    for files in types:
        wavFilesList.extend(glob.glob(os.path.join(dirName, files)))

    wavFilesList = sorted(wavFilesList)

    for i, wavFile in enumerate(wavFilesList):
        [Fs, x] = audioBasicIO.readAudioFile(wavFile)            # read file
        if isinstance(x, int):
            continue        
        
        x = audioBasicIO.stereo2mono(x)                          # convert stereo to mono
        [MidTermFeatures, _] = mtFeatureExtraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs), round(Fs * stWin), round(Fs * stStep))  # mid-term feature

        MidTermFeatures = numpy.transpose(MidTermFeatures)
#        MidTermFeatures = MidTermFeatures.mean(axis=0)        # long term averaging of mid-term statistics
        if len(allMtFeatures) == 0:                # append feature vector
            allMtFeatures = MidTermFeatures
            signalIndices = numpy.zeros((MidTermFeatures.shape[0], ))
        else:
            allMtFeatures = numpy.vstack((allMtFeatures, MidTermFeatures))
            signalIndices = numpy.append(signalIndices, i * numpy.ones((MidTermFeatures.shape[0], )))

    return (allMtFeatures, signalIndices, wavFilesList)
def fileClassification(inputFile, modelName, modelType):
    # Load classifier:

    if not os.path.isfile(modelName):
        print "fileClassification: input modelName not found!"
        return (-1, -1, -1)

    if not os.path.isfile(inputFile):
        print "fileClassification: wav file not found!"
        return (-1, -1, -1)

    if modelType == 'svm' or modelType == 'svm_rbf':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadSVModel(modelName)
    elif modelType == 'knn':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadKNNModel(modelName)
    elif modelType == 'randomforest':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadRandomForestModel(modelName)
    elif modelType == 'gradientboosting':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadGradientBoostingModel(modelName)
    elif modelType == 'extratrees':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadExtraTreesModel(modelName)

    [Fs, x] = audioBasicIO.readAudioFile(inputFile)        # read audio file and convert to mono
    x = audioBasicIO.stereo2mono(x)

    if isinstance(x, int):                                 # audio file IO problem
        return (-1, -1, -1)
    if x.shape[0] / float(Fs) <= mtWin:
        return (-1, -1, -1)

    # feature extraction:
    [MidTermFeatures, s] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stStep))
    MidTermFeatures = MidTermFeatures.mean(axis=1)        # long term averaging of mid-term statistics
    if computeBEAT:
        [beat, beatConf] = aF.beatExtraction(s, stStep)
        MidTermFeatures = numpy.append(MidTermFeatures, beat)
        MidTermFeatures = numpy.append(MidTermFeatures, beatConf)
    curFV = (MidTermFeatures - MEAN) / STD                # normalization

    [Result, P] = classifierWrapper(Classifier, modelType, curFV)    # classification        
    return Result, P, classNames
Example #21
def fileClassification(inputFile, modelName, modelType):
    # Load classifier:

    if not os.path.isfile(modelName):
        print ("fileClassification: input modelName not found!")
        return (-1, -1, -1)

    if not os.path.isfile(inputFile):
        print ("fileClassification: wav file not found!")
        return (-1, -1, -1)

    if modelType == 'svm' or modelType == 'svm_rbf':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadSVModel(modelName)
    elif modelType == 'knn':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadKNNModel(modelName)
    elif modelType == 'randomforest':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadRandomForestModel(modelName)
    elif modelType == 'gradientboosting':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadGradientBoostingModel(modelName)
    elif modelType == 'extratrees':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = loadExtraTreesModel(modelName)

    [Fs, x] = audioBasicIO.readAudioFile(inputFile)        # read audio file and convert to mono
    x = audioBasicIO.stereo2mono(x)

    if isinstance(x, int):                                 # audio file IO problem
        return (-1, -1, -1)
    if x.shape[0] / float(Fs) <= mtWin:
        return (-1, -1, -1)

    # feature extraction:
    [MidTermFeatures, s] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stStep))
    MidTermFeatures = MidTermFeatures.mean(axis=1)        # long term averaging of mid-term statistics
    if computeBEAT:
        [beat, beatConf] = aF.beatExtraction(s, stStep)
        MidTermFeatures = numpy.append(MidTermFeatures, beat)
        MidTermFeatures = numpy.append(MidTermFeatures, beatConf)
    curFV = (MidTermFeatures - MEAN) / STD                # normalization

    [Result, P] = classifierWrapper(Classifier, modelType, curFV)    # classification        
    return Result, P, classNames
def mtFeatureExtractionToFile(fileName, midTermSize, midTermStep, shortTermSize, shortTermStep, outPutFile,
                              storeStFeatures=False, storeToCSV=False, PLOT=False):
    """
    This function is used as a wrapper to:
    a) read the content of a WAV file
    b) perform mid-term feature extraction on that signal
    c) write the mid-term feature sequences to a numpy file
    """
    [Fs, x] = audioBasicIO.readAudioFile(fileName)            # read the wav file
 #   print("fs is ........",Fs)
    x = audioBasicIO.stereo2mono(x)                           # convert to MONO if required
    if storeStFeatures:
        [mtF, stF] = mtFeatureExtraction(x, Fs, round(Fs * midTermSize), round(Fs * midTermStep), round(Fs * shortTermSize), round(Fs * shortTermStep))
    else:
        [mtF, _] = mtFeatureExtraction(x, Fs, round(Fs*midTermSize), round(Fs * midTermStep), round(Fs * shortTermSize), round(Fs * shortTermStep))

    numpy.save(outPutFile, mtF)                              # save mt features to numpy file
    if PLOT:
        print "Mid-term numpy file: " + outPutFile + ".npy saved"
    if storeToCSV:
        path="/Users/somyagoel/Flask/basic_app/music/mt/"
        newpath = outPutFile.split("/")[-1]
        numpy.savetxt(path+newpath+".csv", mtF.T, delimiter=",")
        if PLOT:
            print "Mid-term CSV file: " + outPutFile + ".csv saved"

    if storeStFeatures:
        numpy.save(outPutFile+"_st", stF)                    # save st features to numpy file
        if PLOT:
            print "Short-term numpy file: " + outPutFile + "_st.npy saved"
        if storeToCSV:
            #  os.getcwd()
            path="/Users/somyagoel/Flask/basic_app/music/st/"
            #  os.chdir(path)
            newpath = outPutFile.split("/")[-1]
            numpy.savetxt(path+newpath+"_st.csv", stF.T, delimiter=",")    # store st features to CSV file
            #      path="/Users/somyagoel/dir_fe"
            #os.chdir(path)
            if PLOT:
                print "Short-term CSV file: " + outPutFile + "_st.csv saved"
Example #23
def mtFeatureExtraction(fileName, midTermSize, midTermStep, shortTermSize,
                        shortTermStep):
    """
    This function is used as a wrapper to:
    a) read the content of a WAV file
    b) perform mid-term feature extraction on that signal
    c) return the mid-term feature matrix (per-window mean and std of each
       short-term feature) together with the short-term features, Fs and x
    """
    allMtFeatures = numpy.array([])
    numpy.set_printoptions(suppress=True)
    [Fs, x] = audioBasicIO.readAudioFile(fileName)  # read the wav file
    x = audioBasicIO.stereo2mono(x)  # convert to MONO if required
    mtWinRatio = int(round(midTermSize / shortTermStep))
    mtStepRatio = int(round(midTermStep / shortTermStep))
    mtFeatures = []
    stFeatures = stFeatureExtraction(x, Fs, shortTermSize * Fs,
                                     shortTermStep * Fs)
    numOfFeatures = len(stFeatures)
    numOfStatistics = 2
    mtFeatures = []
    #for i in range(numOfStatistics * numOfFeatures + 1):
    for i in range(numOfStatistics * numOfFeatures):
        mtFeatures.append([])
    for i in range(numOfFeatures):  # for each of the short-term features:
        curPos = 0
        N = len(stFeatures[i])
        while (curPos < N):
            N1 = curPos
            N2 = curPos + mtWinRatio
            if N2 > N:
                N2 = N
            curStFeatures = stFeatures[i][N1:N2]
            mtFeatures[i].append(numpy.mean(curStFeatures))
            mtFeatures[i + numOfFeatures].append(numpy.std(curStFeatures))
            #mtFeatures[i+2*numOfFeatures].append(numpy.std(curStFeatures) / (numpy.mean(curStFeatures)+0.00000010))
            curPos += mtStepRatio
    return (numpy.array(mtFeatures), stFeatures, Fs, x)
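
A sketch of calling this standalone mtFeatureExtraction wrapper ("sample.wav" is hypothetical); the returned matrix stacks the per-window mean and standard deviation of every short-term feature:

mtF, stF, Fs, x = mtFeatureExtraction("sample.wav", 1.0, 1.0, 0.050, 0.050)
print("Mid-term matrix: " + str(mtF.shape))  # (2 * numStFeatures, numWindows)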
Example #24
def main(argv):
    if argv[1] == "-dirMp3toWAV":  # convert mp3 to wav (batch)
        if len(argv) == 5:
            path = argv[2]
            if argv[3] not in ["8000", "16000", "32000", "44100"]:
                print "Error. Unsupported sampling rate (must be: 8000, 16000, 32000 or 44100)."
                return
            if argv[4] not in ["1", "2"]:
                print "Error. Number of output channels must be 1 or 2"
                return
            if not os.path.isdir(path):
                raise Exception("Input path not found!")
            useMp3TagsAsNames = True
            audioBasicIO.convertDirMP3ToWav(path, int(argv[3]), int(argv[4]),
                                            useMp3TagsAsNames)
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -dirMp3toWAV <dirName> <sampling Freq> <numOfChannels>"

    if argv[1] == "-dirWAVChangeFs":  # convert mp3 to wav (batch)
        if len(argv) == 5:
            path = argv[2]
            if argv[3] not in ["8000", "16000", "32000", "44100"]:
                print "Error. Unsupported sampling rate (must be: 8000, 16000, 32000 or 44100)."
                return
            if argv[4] not in ["1", "2"]:
                print "Error. Number of output channels must be 1 or 2"
                return
            if not os.path.isdir(path):
                raise Exception("Input path not found!")
            audioBasicIO.convertFsDirWavToWav(path, int(argv[3]), int(argv[4]))
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -dirMp3toWAV <dirName> <sampling Freq> <numOfChannels>"

    elif argv[
            1] == "-featureExtractionFile":  # short-term and mid-term feature extraction to files (csv and numpy)
        if len(argv) == 7:
            wavFileName = argv[2]
            if not os.path.isfile(wavFileName):
                raise Exception("Input audio file not found!")
            if not (uT.isNum(argv[3]) and uT.isNum(argv[4])
                    and uT.isNum(argv[5]) and uT.isNum(argv[6])):
                raise Exception(
                    "Mid-term and short-term window sizes and steps must be numbers!"
                )
            mtWin = float(argv[3])
            mtStep = float(argv[4])
            stWin = float(argv[5])
            stStep = float(argv[6])
            outFile = wavFileName
            aF.mtFeatureExtractionToFile(wavFileName, mtWin, mtStep, stWin,
                                         stStep, outFile, True, True, True)
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -featureExtractionFile <wavFileName> <mtWin> <mtStep> <stWin> <stStep>"

    elif argv[1] == "-beatExtraction":
        if len(argv) == 4:
            wavFileName = argv[2]
            if not os.path.isfile(wavFileName):
                raise Exception("Input audio file not found!")
            if not (uT.isNum(argv[3])):
                raise Exception("PLOT must be either 0 or 1")
            if not ((int(argv[3]) == 0) or (int(argv[3]) == 1)):
                raise Exception("PLOT must be either 0 or 1")

            [Fs, x] = audioBasicIO.readAudioFile(wavFileName)
            F = aF.stFeatureExtraction(x, Fs, 0.050 * Fs, 0.050 * Fs)
            BPM, ratio = aF.beatExtraction(F, 0.050, int(argv[3]) == 1)
            print "Beat: {0:d} bpm ".format(int(BPM))
            print "Ratio: {0:.2f} ".format(ratio)
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -beatExtraction <wavFileName> <PLOT (0 or 1)>"

    elif argv[
            1] == '-featureExtractionDir':  # same as -featureExtractionFile, in a batch mode (i.e. for each WAV file in the provided path)
        if len(argv) == 7:
            path = argv[2]
            if not os.path.isdir(path):
                raise Exception("Input path not found!")
            if not (uT.isNum(argv[3]) and uT.isNum(argv[4])
                    and uT.isNum(argv[5]) and uT.isNum(argv[6])):
                raise Exception(
                    "Mid-term and short-term window sizes and steps must be numbers!"
                )
            mtWin = float(argv[3])
            mtStep = float(argv[4])
            stWin = float(argv[5])
            stStep = float(argv[6])
            aF.mtFeatureExtractionToFileDir(path, mtWin, mtStep, stWin, stStep,
                                            True, True, True)
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -featureExtractionDir <path> <mtWin> <mtStep> <stWin> <stStep>"

    elif argv[
            1] == '-featureVisualizationDir':  # visualize the content relationships between recordings stored in a folder
        if len(argv) == 3:
            if not os.path.isdir(argv[2]):
                raise Exception("Input folder not found!")
            aV.visualizeFeaturesFolder(argv[2], "pca", "")

    elif argv[
            1] == '-fileSpectrogram':  # show the spectrogram of a sound stored in a file
        if len(argv) == 3:
            wavFileName = argv[2]
            if not os.path.isfile(wavFileName):
                raise Exception("Input audio file not found!")
            [Fs, x] = audioBasicIO.readAudioFile(wavFileName)
            x = audioBasicIO.stereo2mono(x)
            specgram, TimeAxis, FreqAxis = aF.stSpectogram(
                x, Fs, round(Fs * 0.040), round(Fs * 0.040), True)
        else:
            print "Error.\nSyntax: " + argv[0] + " -fileSpectrogram <fileName>"

    elif argv[
            1] == '-fileChromagram':  # show the chromagram of a sound stored in a file
        if len(argv) == 3:
            wavFileName = argv[2]
            if not os.path.isfile(wavFileName):
                raise Exception("Input audio file not found!")
            [Fs, x] = audioBasicIO.readAudioFile(wavFileName)
            x = audioBasicIO.stereo2mono(x)
            specgram, TimeAxis, FreqAxis = aF.stChromagram(
                x, Fs, round(Fs * 0.040), round(Fs * 0.040), True)
        else:
            print "Error.\nSyntax: " + argv[0] + " -fileSpectrogram <fileName>"

    elif argv[1] == "-trainClassifier":  # Segment classifier training (OK)
        if len(argv) > 6:
            method = argv[2]
            beatFeatures = (int(argv[3]) == 1)
            listOfDirs = argv[4:len(argv) - 1]
            modelName = argv[-1]
            aT.featureAndTrain(listOfDirs,
                               1,
                               1,
                               aT.shortTermWindow,
                               aT.shortTermStep,
                               method.lower(),
                               modelName,
                               computeBEAT=beatFeatures)
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -trainClassifier <method(svm or knn)> <beat features> <directory 1> <directory 2> ... <directory N> <modelName>"

    elif argv[1] == "-trainRegression":  # Segment regression model
        if len(argv) == 6:
            method = argv[2]
            beatFeatures = (int(argv[3]) == 1)
            dirName = argv[4]
            modelName = argv[5]
            aT.featureAndTrainRegression(dirName,
                                         1,
                                         1,
                                         aT.shortTermWindow,
                                         aT.shortTermStep,
                                         method.lower(),
                                         modelName,
                                         computeBEAT=beatFeatures)
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -trainRegression <method(svm or knn)> <beat features> <directory> <modelName>"

    elif argv[1] == "-classifyFile":  # Single File Classification (OK)
        if len(argv) == 5:
            modelType = argv[2]
            modelName = argv[3]
            inputFile = argv[4]

            if modelType not in ["svm", "knn"]:
                raise Exception("ModelType has to be either svm or knn!")
            if not os.path.isfile(modelName):
                raise Exception("Input modelName not found!")
            if not os.path.isfile(inputFile):
                raise Exception("Input audio file not found!")

            [Result, P,
             classNames] = aT.fileClassification(inputFile, modelName,
                                                 modelType)
            print "{0:s}\t{1:s}".format("Class", "Probability")
            for i, c in enumerate(classNames):
                print "{0:s}\t{1:.2f}".format(c, P[i])
            print "Winner class: " + classNames[int(Result)]
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -classifyFile <method(svm or knn)> <modelName> <fileName>"

    elif argv[1] == "-regressionFile":  # Single File Classification (OK)
        if len(argv) == 5:
            modelType = argv[2]
            modelName = argv[3]
            inputFile = argv[4]

            if modelType not in ["svm", "knn"]:
                raise Exception("ModelType has to be either svm or knn!")
            if not os.path.isfile(inputFile):
                raise Exception("Input audio file not found!")

            R, regressionNames = aT.fileRegression(inputFile, modelName,
                                                   modelType)
            for i in range(len(R)):
                print "{0:s}\t{1:.3f}".format(regressionNames[i], R[i])

            #print "{0:s}\t{1:.2f}".format(c,P[i])

        else:
            print "Error.\nSyntax: " + argv[
                0] + " -regressionFile <method(svm or knn)> <modelName> <fileName>"

    elif argv[1] == "-classifyFolder":  # Directory classification (Ok)
        if len(argv) == 6 or len(argv) == 5:
            modelType = argv[2]
            modelName = argv[3]
            inputFolder = argv[4]
            if len(argv) == 6:
                outputMode = argv[5]
            else:
                outputMode = "0"

            if modelType not in ["svm", "knn"]:
                raise Exception("ModelType has to be either svm or knn!")
            if outputMode not in ["0", "1"]:
                raise Exception("outputMode has to be 0 or 1")
            if not os.path.isfile(modelName):
                raise Exception("Input modelName not found!")
            files = '*.wav'
            if os.path.isdir(inputFolder):
                strFilePattern = os.path.join(inputFolder, files)
            else:
                strFilePattern = inputFolder + files

            wavFilesList = []
            wavFilesList.extend(glob.glob(strFilePattern))
            wavFilesList = sorted(wavFilesList)
            if len(wavFilesList) == 0:
                print "No WAV files found!"
                return
            Results = []
            for wavFile in wavFilesList:
                [Result, P,
                 classNames] = aT.fileClassification(wavFile, modelName,
                                                     modelType)
                Result = int(Result)
                Results.append(Result)
                if outputMode == "1":
                    print "{0:s}\t{1:s}".format(wavFile, classNames[Result])
            Results = numpy.array(Results)
            # print distribution of classes:
            [Histogram,
             _] = numpy.histogram(Results,
                                  bins=numpy.arange(len(classNames) + 1))
            for i, h in enumerate(Histogram):
                print "{0:20s}\t\t{1:d}".format(classNames[i], h)
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -classifyFolder <method(svm or knn)> <modelName> <folderName> <outputMode(0 or 1)"

    elif argv[
            1] == "-regressionFolder":  # Regression applied on the WAV files of a folder
        if len(argv) == 5:
            modelType = argv[2]
            modelName = argv[3]
            inputFolder = argv[4]

            if modelType not in ["svm", "knn"]:
                raise Exception("ModelType has to be either svm or knn!")

            files = '*.wav'
            if os.path.isdir(inputFolder):
                strFilePattern = os.path.join(inputFolder, files)
            else:
                strFilePattern = inputFolder + files

            wavFilesList = []
            wavFilesList.extend(glob.glob(strFilePattern))
            wavFilesList = sorted(wavFilesList)
            if len(wavFilesList) == 0:
                print "No WAV files found!"
                return
            Results = []
            for wavFile in wavFilesList:
                R, regressionNames = aT.fileRegression(wavFile, modelName,
                                                       modelType)
                Results.append(R)
            Results = numpy.array(Results)
            for i, r in enumerate(regressionNames):
                [Histogram, bins] = numpy.histogram(Results[:, i])
                centers = (bins[0:-1] + bins[1::]) / 2.0
                plt.subplot(len(regressionNames), 1, i + 1)
                plt.plot(centers, Histogram)
                plt.title(r)
            plt.show()


#					for h in Histogram:
#						print "{0:20d}".format(h),
#				if outputMode=="1":
#					for i,h in enumerate(Histogram):
#						print "{0:20s}\t\t{1:d}".format(classNames[i], h)
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -regressionFolder <method(svm or knn)> <modelName> <folderName>"

    elif argv[1] == '-trainHMMsegmenter_fromfile':
        if len(argv) == 7:
            wavFile = argv[2]
            gtFile = argv[3]
            hmmModelName = argv[4]
            if not uT.isNum(argv[5]):
                print "Error: mid-term window size must be float!"
                return
            if not uT.isNum(argv[6]):
                print "Error: mid-term window step must be float!"
                return
            mtWin = float(argv[5])
            mtStep = float(argv[6])
            if not os.path.isfile(wavFile):
                print "Error: wavfile does not exist!"
                return
            if not os.path.isfile(gtFile):
                print "Error: groundtruth does not exist!"
                return
            aS.trainHMM_fromFile(wavFile, gtFile, hmmModelName, mtWin, mtStep)
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -trainHMMsegmenter_fromfile <wavFilePath> <gtSegmentFilePath> <hmmModelFileName> <mtWin> <mtStep>"

    elif argv[1] == '-trainHMMsegmenter_fromdir':
        if len(argv) == 6:
            dirPath = argv[2]
            hmmModelName = argv[3]
            if not uT.isNum(argv[4]):
                print "Error: mid-term window size must be float!"
                return
            if not uT.isNum(argv[5]):
                print "Error: mid-term window step must be float!"
                return
            mtWin = float(argv[4])
            mtStep = float(argv[5])
            aS.trainHMM_fromDir(dirPath, hmmModelName, mtWin, mtStep)
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -trainHMMsegmenter_fromdir <dirPath> <hmmModelFileName> <mtWin> <mtStep>"

    elif argv[
            1] == "-segmentClassifyFileHMM":  # HMM-based segmentation-classification
        if len(argv) == 4:
            hmmModelName = argv[2]
            wavFile = argv[3]
            gtFile = wavFile.replace('.wav', '.segments')
            aS.hmmSegmentation(wavFile,
                               hmmModelName,
                               PLOT=True,
                               gtFileName=gtFile)
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -segmentClassifyHMM <hmmModelName> <fileName>"

    elif argv[
            1] == '-segmentClassifyFile':  # Segmentation-classification (fix-sized segment using knn or svm)
        if (len(argv) == 5):
            modelType = argv[2]
            modelName = argv[3]
            inputWavFile = argv[4]

            if modelType not in ["svm", "knn"]:
                raise Exception("ModelType has to be either svm or knn!")
            if not os.path.isfile(modelName):
                raise Exception("Input modelName not found!")
            if not os.path.isfile(inputWavFile):
                raise Exception("Input audio file not found!")
            gtFile = inputWavFile.replace('.wav', '.segments')
            aS.mtFileClassification(inputWavFile, modelName, modelType, True,
                                    gtFile)
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -segmentClassifyFile <method(svm or knn)> <modelName> <fileName>"

    elif argv[1] == "-segmentationEvaluation":
        if len(argv) == 5:
            methodName = argv[2]
            modelName = argv[3]
            dirName = argv[4]
            aS.evaluateSegmentationClassificationDir(dirName, modelName,
                                                     methodName)
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -segmentationEvaluation <method(svm or knn)> <modelName> <directoryName>"

    elif argv[1] == "-silenceRemoval":
        if len(argv) == 5:
            inputFile = argv[2]
            if not os.path.isfile(inputFile):
                raise Exception("Input audio file not found!")

            smoothingWindow = float(argv[3])
            weight = float(argv[4])
            [Fs,
             x] = audioBasicIO.readAudioFile(inputFile)  # read audio signal
            segmentLimits = aS.silenceRemoval(x, Fs, 0.05, 0.05,
                                              smoothingWindow, weight,
                                              False)  # get onsets
            for i, s in enumerate(segmentLimits):
                strOut = "{0:s}_{1:.3f}-{2:.3f}.wav".format(
                    inputFile[0:-4], s[0], s[1])
                wavfile.write(strOut, Fs, x[int(Fs * s[0]):int(Fs * s[1])])
        else:
            print "Error.\nSyntax: " + argv[
                0] + " -silenceRemoval <inputFile> <smoothinWindow(secs)> <Threshold Weight>"

    elif argv[
            1] == '-speakerDiarization':  # speaker diarization (from file): TODO
        inputFile = argv[2]
        nSpeakers = int(argv[3])
        useLDA = (int(argv[4]) == 1)
        if useLDA:
            aS.speakerDiarization(inputFile, nSpeakers, PLOT=True)
        else:
            aS.speakerDiarization(inputFile, nSpeakers, LDAdim=0, PLOT=True)
        #print speechLimits

    elif argv[1] == "-speakerDiarizationScriptEval":
        dir = argv[2]
        listOfLDAs = [int(l) for l in argv[3::]]
        aS.speakerDiarizationEvaluateScript(dir, listOfLDAs)

    elif argv[1] == '-thumbnail':  # music thumbnailing (OK)
        if len(argv) == 4:
            inputFile = argv[2]
            stWindow = 1.0
            stStep = 1.0
            if not os.path.isfile(inputFile):
                raise Exception("Input audio file not found!")

            [Fs, x] = audioBasicIO.readAudioFile(inputFile)  # read file
            if Fs == -1:  # could not read file
                return
            try:
                thumbnailSize = float(argv[3])
            except ValueError:
                print "Thumbnail size must be a float (in seconds)"
                return
            [A1, A2, B1, B2, Smatrix] = aS.musicThumbnailing(
                x, Fs, stWindow, stStep,
                thumbnailSize)  # find thumbnail endpoints

            # write thumbnails to WAV files:
            thumbnailFileName1 = inputFile.replace(".wav", "_thumb1.wav")
            thumbnailFileName2 = inputFile.replace(".wav", "_thumb2.wav")
            wavfile.write(thumbnailFileName1, Fs, x[int(Fs * A1):int(Fs * A2)])
            wavfile.write(thumbnailFileName2, Fs, x[int(Fs * B1):int(Fs * B2)])
            print "1st thumbnail (stored in file {0:s}): {1:4.1f}sec -- {2:4.1f}sec".format(
                thumbnailFileName1, A1, A2)
            print "2nd thumbnail (stored in file {0:s}): {1:4.1f}sec -- {2:4.1f}sec".format(
                thumbnailFileName2, B1, B2)

            # Plot self-similarity matrix:
            fig = plt.figure()
            ax = fig.add_subplot(111, aspect='auto')
            plt.imshow(Smatrix)
            # Plot best-similarity diagonal:
            Xcenter = (A1 / stStep + A2 / stStep) / 2.0
            Ycenter = (B1 / stStep + B2 / stStep) / 2.0

            e1 = matplotlib.patches.Ellipse((Ycenter, Xcenter),
                                            thumbnailSize * 1.4,
                                            3,
                                            angle=45,
                                            linewidth=3,
                                            fill=False)
            ax.add_patch(e1)

            plt.plot([B1, Smatrix.shape[0]], [A1, A1],
                     color='k',
                     linestyle='--',
                     linewidth=2)
            plt.plot([B2, Smatrix.shape[0]], [A2, A2],
                     color='k',
                     linestyle='--',
                     linewidth=2)
            plt.plot([B1, B1], [A1, Smatrix.shape[0]],
                     color='k',
                     linestyle='--',
                     linewidth=2)
            plt.plot([B2, B2], [A2, Smatrix.shape[0]],
                     color='k',
                     linestyle='--',
                     linewidth=2)

            plt.xlim([0, Smatrix.shape[0]])
            plt.ylim([Smatrix.shape[1], 0])

            ax.yaxis.set_label_position("right")
            ax.yaxis.tick_right()

            plt.xlabel('frame no')
            plt.ylabel('frame no')
            plt.title('Self-similarity matrix')

            plt.show()

        else:
            print "Error.\nSyntax: " + argv[
                0] + " -thumbnail <filename> <thumbnailsize(seconds)>"
Example #25
def musicThumbnailing(x, Fs, shortTermSize=1.0, shortTermStep=0.5, thumbnailSize=10.0):
	'''
	This function detects instances of the most representative part of a music recording, also called "music thumbnails".
	A technique similar to the one proposed in [1], however a wider set of audio features is used instead of chroma features.
	In particular the following steps are followed:
	 - Extract short-term audio features. Typical short-term window size: 1 second
	 - Compute the self-similarity matrix, i.e. all pairwise similarities between feature vectors
	 - Apply a diagonal mask as a moving average filter on the values of the self-similarity matrix.
	   The size of the mask is equal to the desirable thumbnail length.
	 - Find the position of the maximum value of the new (filtered) self-similarity matrix.
	   The audio segments that correspond to the diagonal around that position are the selected thumbnails.
	

	ARGUMENTS:
	 - x:			input signal
	 - Fs:			sampling frequency
	 - shortTermSize: 	window size (in seconds)
	 - shortTermStep:	window step (in seconds)
	 - thumbnailSize:	desired thumbnail size (in seconds)
	
	RETURNS:
	 - A1:			beginning of 1st thumbnail (in seconds)
	 - A2:			ending of 1st thumbnail (in seconds)
	 - B1:			beginning of 2nd thumbnail (in seconds)
	 - B2:			ending of 2nd thumbnail (in seconds)

	USAGE EXAMPLE:
	 import audioBasicIO
	 [Fs, x] = audioBasicIO.readAudioFile(inputFile)
	 [A1, A2, B1, B2, S] = musicThumbnailing(x, Fs)

	[1] Bartsch, M. A., & Wakefield, G. H. (2005). Audio thumbnailing of popular music using chroma-based representations. 
	Multimedia, IEEE Transactions on, 7(1), 96-104.
	'''
	x = audioBasicIO.stereo2mono(x);
	# feature extraction:
	stFeatures = aF.stFeatureExtraction(x, Fs, Fs*shortTermSize, Fs*shortTermStep)

	# self-similarity matrix
	S = selfSimilarityMatrix(stFeatures)

	# moving filter:
	M = int(round(thumbnailSize / shortTermStep))
	B = numpy.eye(M,M)
	S = scipy.signal.convolve2d(S, B, 'valid')


	# post-processing (remove main diagonal elements)
	MIN = numpy.min(S)
	for i in range(S.shape[0]):
		for j in range(S.shape[1]):
			if abs(i-j) < 5.0 / shortTermStep or i > j:
				S[i,j] = MIN;

	# find max position:
	[I, J] = numpy.unravel_index(numpy.argmax(S), S.shape)

	# expand:
	i1 = I; i2 = I
	j1 = J; j2 = J

	while i2-i1<M:
		if S[i1-1, j1-1] > S[i2+1,j2+1]:
			i1 -= 1
			j1 -= 1
		else:
			i2 += 1
			j2 += 1


	return (shortTermStep*i1, shortTermStep*i2, shortTermStep*j1, shortTermStep*j2, S)
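
# A minimal sketch (not part of the original example) of the diagonal
# moving-average step above: convolving the self-similarity matrix with an
# MxM identity kernel sums similarities along length-M diagonals, so the
# maximum of the filtered matrix marks the two segments that stay most
# similar for M consecutive frames. Toy data only.
import numpy
import scipy.signal

S_toy = numpy.random.rand(50, 50)                 # toy self-similarity matrix
S_toy = (S_toy + S_toy.T) / 2.0                   # symmetric, like a real one
M_toy = 10                                        # thumbnail length in frames
S_filt = scipy.signal.convolve2d(S_toy, numpy.eye(M_toy, M_toy), 'valid')
i0, j0 = numpy.unravel_index(numpy.argmax(S_filt), S_filt.shape)
print("start frames of the two thumbnails: %d, %d" % (i0, j0))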
Example #26
def speakerDiarization(fileName, numOfSpeakers, mtSize = 2.0, mtStep=0.2, stWin=0.05, LDAdim = 35, PLOT = False):
	'''
	ARGUMENTS:
		- fileName:		the name of the WAV file to be analyzed
		- numOfSpeakers	the number of speakers (clusters) in the recording (<=0 for unknown)
		- mtSize (opt)	mid-term window size
		- mtStep (opt)	mid-term window step
		- stWin  (opt)	short-term window size
		- LDAdim (opt)	LDA dimension (0 for no LDA)
		- PLOT	 (opt)	0 for not plotting the results, 1 for plotting
	'''
	[Fs, x] = audioBasicIO.readAudioFile(fileName)
	x = audioBasicIO.stereo2mono(x);
	Duration = len(x) / Fs

	[Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.loadKNNModel("data/knnSpeakerAll")
	[Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.loadKNNModel("data/knnSpeakerFemaleMale")

	[MidTermFeatures, ShortTermFeatures] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, mtStep * Fs, round(Fs*stWin), round(Fs*stWin*0.5));

	MidTermFeatures2 = numpy.zeros( (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2), MidTermFeatures.shape[1] ) )

	for i in range(MidTermFeatures.shape[1]):
		curF1 = (MidTermFeatures[:,i] - MEAN1)  / STD1
		curF2 = (MidTermFeatures[:,i] - MEAN2)  / STD2
		[Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
		[Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
		MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
		MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0]+len(classNames1), i] = P1 + 0.0001;
		MidTermFeatures2[MidTermFeatures.shape[0]+len(classNames1)::, i] = P2 + 0.0001;
	
	MidTermFeatures = MidTermFeatures2	# TODO	
	# SELECT FEATURES:
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20]; 																											# SET 0A
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100]; 																									# SET 0B
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]; 	# SET 0C
	
	iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53]; 																	# SET 1A
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100]; 															# SET 1B
	#iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]; 	# SET 1C
	
	#iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53]; 			# SET 2A		
	#iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100]; 	# SET 2B
	#iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]; 	# SET 2C
	
	#iFeaturesSelect = range(100);																									# SET 3	
	#MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010  
	
	MidTermFeatures = MidTermFeatures[iFeaturesSelect,:]		
	
	(MidTermFeaturesNorm, MEAN, STD) = aT.normalizeFeatures([MidTermFeatures.T])
	MidTermFeaturesNorm = MidTermFeaturesNorm[0].T	
	numOfWindows = MidTermFeatures.shape[1]

	# remove outliers:
	DistancesAll = numpy.sum(distance.squareform(distance.pdist(MidTermFeaturesNorm.T)), axis=0)
	MDistancesAll = numpy.mean(DistancesAll)
	iNonOutLiers = numpy.nonzero(DistancesAll < 1.2*MDistancesAll)[0]
	
	# TODO: Combine energy threshold for outlier removal:
	#EnergyMin = numpy.min(MidTermFeatures[1,:])
	#EnergyMean = numpy.mean(MidTermFeatures[1,:])
	#Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
	#iNonOutLiers = numpy.nonzero(MidTermFeatures[1,:] > Thres)[0]
	#print iNonOutLiers

	perOutLier = (100.0*(numOfWindows-iNonOutLiers.shape[0])) / numOfWindows	
	MidTermFeaturesNormOr = MidTermFeaturesNorm
	MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]
	
	# LDA dimensionality reduction:
	if LDAdim > 0:
		#[mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, stWin * Fs, round(Fs*stWin), round(Fs*stWin));		
		# extract mid-term features with minimum step:
		mtWinRatio  = int(round(mtSize  / stWin));
		mtStepRatio = int(round(stWin / stWin));
		mtFeaturesToReduce = []			
		numOfFeatures = len(ShortTermFeatures)
		numOfStatistics = 2;			
		#for i in range(numOfStatistics * numOfFeatures + 1):
		for i in range(numOfStatistics * numOfFeatures):
			mtFeaturesToReduce.append([])

		for i in range(numOfFeatures):		# for each of the short-term features:
			curPos = 0
			N = len(ShortTermFeatures[i])
			while (curPos<N):
				N1 = curPos
				N2 = curPos + mtWinRatio
				if N2 > N:
					N2 = N
				curStFeatures = ShortTermFeatures[i][N1:N2]
				mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
				mtFeaturesToReduce[i+numOfFeatures].append(numpy.std(curStFeatures))				
				curPos += mtStepRatio		
		mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
				
		mtFeaturesToReduce2 = numpy.zeros( (mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2), mtFeaturesToReduce.shape[1] ) )
		for i in range(mtFeaturesToReduce.shape[1]):
			curF1 = (mtFeaturesToReduce[:,i] - MEAN1)  / STD1
			curF2 = (mtFeaturesToReduce[:,i] - MEAN2)  / STD2
			[Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
			[Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
			mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i]
			mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0]+len(classNames1), i] = P1 + 0.0001;
			mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]+len(classNames1)::, i] = P2 + 0.0001;
		mtFeaturesToReduce = mtFeaturesToReduce2		
		mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect,:]		
		#mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
		(mtFeaturesToReduce, MEAN, STD) = aT.normalizeFeatures([mtFeaturesToReduce.T])	
		mtFeaturesToReduce = mtFeaturesToReduce[0].T
		#DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
		#MDistancesAll = numpy.mean(DistancesAll)
		#iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0*MDistancesAll)[0]
		#mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]
		Labels = numpy.zeros((mtFeaturesToReduce.shape[1],));
		LDAstep = 1.0
		LDAstepRatio = LDAstep / stWin
		#print LDAstep, LDAstepRatio
		for i in range(Labels.shape[0]):
			Labels[i] = int(i*stWin/LDAstepRatio);		
		clf = LDA(n_components=LDAdim)
		clf.fit(mtFeaturesToReduce.T, Labels, tol=0.000001)
		MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

	if numOfSpeakers<=0:
		sRange = range(2,10)
	else:
		sRange = [numOfSpeakers]
	clsAll = []; silAll = []; centersAll = []
	
	for iSpeakers in sRange:
		cls, means, steps = mlpy.kmeans(MidTermFeaturesNorm.T, k=iSpeakers, plus=True)		# perform k-means clustering
		
		#YDist =   distance.pdist(MidTermFeaturesNorm.T, metric='euclidean')
		#print distance.squareform(YDist).shape
		#hc = mlpy.HCluster()
		#hc.linkage(YDist)
		#cls = hc.cut(14.5)
		#print cls

		# Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
		clsAll.append(cls)
		centersAll.append(means)
		silA = []; silB = []
		for c in range(iSpeakers):								# for each speaker (i.e. for each extracted cluster)
			clusterPerCent = numpy.nonzero(cls==c)[0].shape[0] / float(len(cls))
			if clusterPerCent < 0.020:
				silA.append(0.0)
				silB.append(0.0)
			else:
				MidTermFeaturesNormTemp = MidTermFeaturesNorm[:,cls==c]			# get subset of feature vectors
				Yt = distance.pdist(MidTermFeaturesNormTemp.T)				# compute average distance between samples that belong to the cluster (a values)
				silA.append(numpy.mean(Yt)*clusterPerCent)
				silBs = []
				for c2 in range(iSpeakers):						# compute distances from samples of other clusters
					if c2!=c:
						clusterPerCent2 = numpy.nonzero(cls==c2)[0].shape[0] / float(len(cls))
						MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:,cls==c2]
						Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T)
						silBs.append(numpy.mean(Yt)*(clusterPerCent+clusterPerCent2)/2.0)
				silBs = numpy.array(silBs)							
				silB.append(min(silBs))							# ... and keep the minimum value (i.e. the distance from the "nearest" cluster)
		silA = numpy.array(silA); 
		silB = numpy.array(silB); 
		sil = []
		for c in range(iSpeakers):								# for each cluster (speaker)
			sil.append( ( silB[c] - silA[c]) / (max(silB[c],  silA[c])+0.00001)  )		# compute silhouette

		silAll.append(numpy.mean(sil))								# keep the AVERAGE SILHOUETTE

	#silAll = silAll * (1.0/(numpy.power(numpy.array(sRange),0.5)))
	imax = numpy.argmax(silAll)									# position of the maximum silhouette value
	nSpeakersFinal = sRange[imax]									# optimal number of clusters

	# generate the final set of cluster labels
	# (important: need to retrieve the outlier windows: this is achieved by giving them the value of their nearest non-outlier window)
	cls = numpy.zeros((numOfWindows,))
	for i in range(numOfWindows):
		j = numpy.argmin(numpy.abs(i-iNonOutLiers))		
		cls[i] = clsAll[imax][j]
		
	# Post-process method 1: hmm smoothing
	for i in range(1):
		startprob, transmat, means, cov = trainHMM_computeStatistics(MidTermFeaturesNormOr, cls)
		hmm = sklearn.hmm.GaussianHMM(startprob.shape[0], "diag", startprob, transmat)			# hmm training
		hmm.means_ = means; hmm.covars_ = cov
		cls = hmm.predict(MidTermFeaturesNormOr.T)					
	
	# Post-process method 2: median filtering:
	cls = scipy.signal.medfilt(cls, 13)
	cls = scipy.signal.medfilt(cls, 11)

	sil = silAll[imax]										# final silhouette
	classNames = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)];


	# load ground-truth if available
	gtFile = fileName.replace('.wav', '.segments');							# path of the annotated file
	if os.path.isfile(gtFile):									# if ground-truth exists
		[segStart, segEnd, segLabels] = readSegmentGT(gtFile)					# read GT data
		flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels, mtStep)			# convert to flags

	if PLOT:
		fig = plt.figure()	
		if numOfSpeakers>0:
			ax1 = fig.add_subplot(111)
		else:
			ax1 = fig.add_subplot(211)
		ax1.set_yticks(numpy.array(range(len(classNames))))
		ax1.axis((0, Duration, -1, len(classNames)))
		ax1.set_yticklabels(classNames)
		ax1.plot(numpy.array(range(len(cls)))*mtStep+mtStep/2.0, cls)

	if os.path.isfile(gtFile):
		if PLOT:
			ax1.plot(numpy.array(range(len(flagsGT)))*mtStep+mtStep/2.0, flagsGT, 'r')
		purityClusterMean, puritySpeakerMean = evaluateSpeakerDiarization(cls, flagsGT)
		print "{0:.1f}\t{1:.1f}".format(100*purityClusterMean, 100*puritySpeakerMean)
		if PLOT:
			plt.title("Cluster purity: {0:.1f}% - Speaker purity: {1:.1f}%".format(100*purityClusterMean, 100*puritySpeakerMean) )
	if PLOT:
		plt.xlabel("time (seconds)")
		#print sRange, silAll	
		if numOfSpeakers<=0:
			plt.subplot(212)
			plt.plot(sRange, silAll)
			plt.xlabel("number of clusters")
			plt.ylabel("average clustering silhouette")
		plt.show()
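
# A minimal sketch (not part of the original example) of the silhouette
# criterion used above to pick the number of speakers: for each cluster,
# the mean intra-cluster distance (a) is compared against the distance to
# the nearest other cluster (b), and (b - a) / max(a, b) is averaged over
# clusters. Toy data, simplified weighting.
import numpy
from scipy.spatial import distance

X_toy = numpy.random.rand(20, 5)               # toy feature vectors (one row per window)
labels_toy = numpy.array([0] * 10 + [1] * 10)  # toy cluster assignments
sils = []
for c in set(labels_toy):
    Xc = X_toy[labels_toy == c]
    a = numpy.mean(distance.pdist(Xc))         # mean intra-cluster distance
    b = min(numpy.mean(distance.cdist(Xc, X_toy[labels_toy == c2]))
            for c2 in set(labels_toy) if c2 != c)
    sils.append((b - a) / (max(a, b) + 0.00001))
print("average silhouette: %.3f" % numpy.mean(sils))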
Example #27
def silenceRemoval(x, Fs, stWin, stStep, smoothWindow = 0.5, Weight = 0.5, plot = False):
	'''
	Event Detection (silence removal)
	ARGUMENTS:
		 - x:			the input audio signal
		 - Fs:			sampling freq
		 - stWin, stStep:	window size and step in seconds
		 - smoothWindow:	(optional) smooth window (in seconds)
		 - Weight:		(optional) weight factor (0 < Weight < 1) the higher, the more strict
		 - plot:		(optional) True if results are to be plotted
	RETURNS:
		 - segmentLimits:	list of segment limits in seconds (e.g [[0.1, 0.9], [1.4, 3.0]] means that 
					the resulting segments are (0.1 - 0.9) seconds and (1.4, 3.0) seconds 
	'''

	if Weight>=1:
		Weight = 0.99;
	if Weight<=0:
		Weight = 0.01;

	# Step 1: feature extraction
	x = audioBasicIO.stereo2mono(x);						# convert to mono
	ShortTermFeatures = aF.stFeatureExtraction(x, Fs, stWin*Fs, stStep*Fs)		# extract short-term features	

	# Step 2: train binary SVM classifier of low vs high energy frames
	EnergySt = ShortTermFeatures[1, :]						# keep only the energy short-term sequence (2nd feature)
	E = numpy.sort(EnergySt)							# sort the energy feature values:
	L1 = int(len(E)/10)								# number of 10% of the total short-term windows
	T1 = numpy.mean(E[0:L1])							# compute "lower" 10% energy threshold 
	T2 = numpy.mean(E[-L1:-1])							# compute "higher" 10% energy threshold
	Class1 = ShortTermFeatures[:,numpy.where(EnergySt<T1)[0]]			# get all features that correspond to low energy
	Class2 = ShortTermFeatures[:,numpy.where(EnergySt>T2)[0]]			# get all features that correspond to high energy
	featuresSS = [Class1.T, Class2.T];						# form the binary classification task and ...
	[featuresNormSS, MEANSS, STDSS] = aT.normalizeFeatures(featuresSS)		# normalize and ...
	SVM = aT.trainSVM(featuresNormSS, 1.0)						# train the respective SVM probabilistic model (ONSET vs SILENCE)

	# Step 3: compute onset probability based on the trained SVM
	ProbOnset = []
	for i in range(ShortTermFeatures.shape[1]):					# for each frame
		curFV = (ShortTermFeatures[:,i] - MEANSS) / STDSS			# normalize feature vector
		ProbOnset.append(SVM.pred_probability(curFV)[1])			# get SVM probability (that it belongs to the ONSET class)
	ProbOnset = numpy.array(ProbOnset)
	ProbOnset = smoothMovingAvg(ProbOnset, smoothWindow / stStep)			# smooth probability

	# Step 4A: detect onset frame indices:
	ProbOnsetSorted = numpy.sort(ProbOnset)						# find probability Threshold as a weighted average of top 10% and lower 10% of the values
	Nt = ProbOnsetSorted.shape[0] / 10;	
	T = (numpy.mean( (1-Weight)*ProbOnsetSorted[0:Nt] ) + Weight*numpy.mean(ProbOnsetSorted[-Nt::]) )

	MaxIdx = numpy.where(ProbOnset>T)[0];						# get the indices of the frames that satisfy the thresholding
	i = 0;
	timeClusters = []
	segmentLimits = []

	# Step 4B: group frame indices to onset segments
	while i<len(MaxIdx):								# for each of the detected onset indices
		curCluster = [MaxIdx[i]]
		if i==len(MaxIdx)-1:
			break		
		while MaxIdx[i+1] - curCluster[-1] <= 2:
			curCluster.append(MaxIdx[i+1])
			i += 1
			if i==len(MaxIdx)-1:
				break
		i += 1
		timeClusters.append(curCluster)
		segmentLimits.append([curCluster[0]*stStep, curCluster[-1]*stStep])

	# Step 5: Post process: remove very small segments:
	minDuration = 0.2;
	segmentLimits2 = []
	for s in segmentLimits:
		if s[1] - s[0] > minDuration:
			segmentLimits2.append(s)
	segmentLimits = segmentLimits2;

	if plot:
		timeX = numpy.arange(0, x.shape[0] / float(Fs) , 1.0/Fs)

		plt.subplot(2,1,1); plt.plot(timeX, x)
		plt.title('Signal')
		for s in segmentLimits:
			plt.axvline(x=s[0])
			plt.axvline(x=s[1])
		plt.subplot(2,1,2); plt.plot(numpy.arange(0, ProbOnset.shape[0] * stStep, stStep), ProbOnset)
		plt.title('SVM Probability')
		for s in segmentLimits:
			plt.axvline(x=s[0])
			plt.axvline(x=s[1])
		plt.show()

	return segmentLimits
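
# A minimal sketch (not part of the original example) of the weak-labelling
# trick in Step 2 above: the lowest-energy 10% of frames are treated as
# SILENCE and the highest-energy 10% as ONSET, and those two sets train a
# probabilistic classifier that then scores every frame. A plain
# scikit-learn SVC stands in for the pyAudioAnalysis trainSVM wrapper.
import numpy
from sklearn.svm import SVC

feats_toy = numpy.random.rand(34, 500)        # toy feature matrix (features x frames)
energy_toy = feats_toy[1, :]                  # energy is the 2nd feature row
order = numpy.argsort(energy_toy)
L = len(energy_toy) // 10                     # 10% of the frames
low_i, high_i = order[:L], order[-L:]         # lowest / highest energy frames
X_train = numpy.vstack([feats_toy[:, low_i].T, feats_toy[:, high_i].T])
y_train = numpy.array([0] * L + [1] * L)      # 0 = silence, 1 = onset
svm_toy = SVC(C=1.0, probability=True).fit(X_train, y_train)
prob_onset = svm_toy.predict_proba(feats_toy.T)[:, 1]   # onset probability per frame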
Example #28
def mtFileClassification(inputFile, modelName, modelType, plotResults = False, gtFile = ""):
	'''
	This function performs mid-term classification of an audio stream.
	Towards this end, supervised knowledge is used, i.e. a pre-trained classifier.
	ARGUMENTS:
		- inputFile:		path of the input WAV file
		- modelName:		name of the classification model
		- modelType:		svm or knn depending on the classifier type
		- plotResults:		True if results are to be plotted using matplotlib along with a set of statistics
	
	RETURNS:
	  	- segs:			a sequence of segment's endpoints: segs[i] is the endpoint of the i-th segment (in seconds)
		- classes:		a sequence of class flags: class[i] is the class ID of the i-th segment
	'''

	if not os.path.isfile(modelName):
		print "mtFileClassificationError: input modelType not found!"
		return (-1,-1,-1)
	# Load classifier:
	if modelType=='svm':
		[Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadSVModel(modelName)
	elif modelType=='knn':
		[Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadKNNModel(modelName)
	else:
		print "mtFileClassification: unknown modelType!"
		return (-1,-1,-1)
	if computeBEAT:
		print "Model " + modelName + " contains long-term music features (beat etc) and cannot be used in segmentation"	
		return (-1,-1,-1)
	[Fs, x] = audioBasicIO.readAudioFile(inputFile)					# load input file
	if Fs == -1:									# could not read file
		return  (-1,-1,-1)
	x = audioBasicIO.stereo2mono(x);						# convert stereo (if) to mono
	Duration = len(x) / Fs
	# mid-term feature extraction:
	[MidTermFeatures, _] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs, round(Fs*stWin), round(Fs*stStep));
	flags = []; Ps = []; flagsInd = []
	for i in range(MidTermFeatures.shape[1]): 					# for each feature vector (i.e. for each fix-sized segment):
		curFV = (MidTermFeatures[:, i] - MEAN) / STD;				# normalize current feature vector					
		[Result, P] = aT.classifierWrapper(Classifier, modelType, curFV)	# classify vector
		flagsInd.append(Result)
		flags.append(classNames[int(Result)])					# update class label matrix
		Ps.append(numpy.max(P))							# update probability matrix
	flagsInd = numpy.array(flagsInd)

	# 1-window smoothing
	for i in range(1, len(flagsInd)-1):
		if flagsInd[i-1]==flagsInd[i+1]:
			flagsInd[i] = flagsInd[i+1]
	(segs, classes) = flags2segs(flags, mtStep)					# convert fix-sized flags to segments and classes
	segs[-1] = len(x) / float(Fs)

	# Load ground-truth:
	if os.path.isfile(gtFile):
		[segStartGT, segEndGT, segLabelsGT] = readSegmentGT(gtFile)		
		flagsGT, classNamesGT = segs2flags(segStartGT, segEndGT, segLabelsGT, mtStep)
		flagsIndGT = []
		for j, fl in enumerate(flagsGT):					# "align" labels with GT
			if classNamesGT[flagsGT[j]] in classNames:
				flagsIndGT.append( classNames.index( classNamesGT[flagsGT[j]] ) )
			else:
				flagsIndGT.append( -1 )
		flagsIndGT = numpy.array(flagsIndGT)
	else:
		flagsIndGT = numpy.array([])
	acc = plotSegmentationResults(flagsInd, flagsIndGT, classNames, mtStep, not plotResults)
	if acc>=0:
		print "Overall Accuracy: {0:.3f}".format(acc)
	return (flagsInd, classNames, acc)
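
# A minimal sketch (not part of the original example) of the one-window
# smoothing applied above: a segment label that differs from two identical
# neighbours is treated as a glitch and overwritten. Toy labels only.
import numpy

flags_toy = numpy.array([0, 0, 1, 0, 0, 2, 2, 0, 2, 2])
for k in range(1, len(flags_toy) - 1):
    if flags_toy[k - 1] == flags_toy[k + 1]:  # isolated label between equal neighbours
        flags_toy[k] = flags_toy[k + 1]
print(flags_toy)                              # -> [0 0 0 0 0 2 2 2 2 2]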
def speakerDiarization(filename,
                       n_speakers,
                       mt_size=2.0,
                       mt_step=0.2,
                       st_win=0.05,
                       lda_dim=35,
                       plot_res=False):
    '''
	ARGUMENTS:
		- filename:        the name of the WAV file to be analyzed
		- n_speakers    the number of speakers (clusters) in the recording (<=0 for unknown)
		- mt_size (opt)     mid-term window size
		- mt_step (opt)     mid-term window step
		- st_win  (opt)     short-term window size
		- lda_dim (opt)     LDA dimension (0 for no LDA)
		- plot_res     (opt)   0 for not plotting the results, 1 for plotting
	'''
    [fs, x] = audioBasicIO.readAudioFile(filename)
    x = audioBasicIO.stereo2mono(x)
    duration = len(x) / fs

    # [classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.load_model_knn(os.path.join(os.path.dirname(os.path.realpath(__file__)), "data", "knnSpeakerAll"))
    # [classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.load_model_knn(os.path.join(os.path.dirname(os.path.realpath(__file__)), "data", "knnSpeakerFemaleMale"))
    [
        classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1,
        stStep1, computeBEAT1
    ] = aT.load_model_knn("data/knnSpeakerAll")
    [
        classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2,
        stStep2, computeBEAT2
    ] = aT.load_model_knn("data/knnSpeakerFemaleMale")

    [mt_feats, st_feats, _] = aF.mtFeatureExtraction(x, fs, mt_size * fs,
                                                     mt_step * fs,
                                                     round(fs * st_win),
                                                     round(fs * st_win * 0.5))

    MidTermFeatures2 = numpy.zeros(
        (mt_feats.shape[0] + len(classNames1) + len(classNames2),
         mt_feats.shape[1]))

    for i in range(mt_feats.shape[1]):
        cur_f1 = (mt_feats[:, i] - MEAN1) / STD1
        cur_f2 = (mt_feats[:, i] - MEAN2) / STD2
        [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
        [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
        MidTermFeatures2[0:mt_feats.shape[0], i] = mt_feats[:, i]
        MidTermFeatures2[mt_feats.shape[0]:mt_feats.shape[0] +
                         len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[mt_feats.shape[0] + len(classNames1)::,
                         i] = P2 + 0.0001

    mt_feats = MidTermFeatures2  # TODO
    iFeaturesSelect = [
        8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45,
        46, 47, 48, 49, 50, 51, 52, 53
    ]

    mt_feats = mt_feats[iFeaturesSelect, :]

    (mt_feats_norm, MEAN, STD) = aT.normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    n_wins = mt_feats.shape[1]

    # remove outliers:
    dist_all = numpy.sum(distance.squareform(distance.pdist(mt_feats_norm.T)),
                         axis=0)
    m_dist_all = numpy.mean(dist_all)
    i_non_outliers = numpy.nonzero(dist_all < 1.2 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = numpy.min(mt_feats[1,:])
    #EnergyMean = numpy.mean(mt_feats[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #i_non_outliers = numpy.nonzero(mt_feats[1,:] > Thres)[0]
    #print i_non_outliers

    perOutLier = (100.0 * (n_wins - i_non_outliers.shape[0])) / n_wins
    mt_feats_norm_or = mt_feats_norm
    mt_feats_norm = mt_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        #[mt_feats_to_red, _, _] = aF.mtFeatureExtraction(x, fs, mt_size * fs, st_win * fs, round(fs*st_win), round(fs*st_win));
        # extract mid-term features with minimum step:
        mt_win_ratio = int(round(mt_size / st_win))
        mt_step_ratio = int(round(st_win / st_win))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        #for i in range(num_of_stats * num_of_features + 1):
        for i in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        for i in range(
                num_of_features):  # for each of the short-term features:
            curPos = 0
            N = len(st_feats[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mt_win_ratio
                if N2 > N:
                    N2 = N
                curStFeatures = st_feats[i][N1:N2]
                mt_feats_to_red[i].append(numpy.mean(curStFeatures))
                mt_feats_to_red[i + num_of_features].append(
                    numpy.std(curStFeatures))
                curPos += mt_step_ratio
        mt_feats_to_red = numpy.array(mt_feats_to_red)
        mt_feats_to_red_2 = numpy.zeros(
            (mt_feats_to_red.shape[0] + len(classNames1) + len(classNames2),
             mt_feats_to_red.shape[1]))
        for i in range(mt_feats_to_red.shape[1]):
            cur_f1 = (mt_feats_to_red[:, i] - MEAN1) / STD1
            cur_f2 = (mt_feats_to_red[:, i] - MEAN2) / STD2
            [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
            [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0],
                              i] = mt_feats_to_red[:, i]
            mt_feats_to_red_2[
                mt_feats_to_red.shape[0]:mt_feats_to_red.shape[0] +
                len(classNames1), i] = P1 + 0.0001
            mt_feats_to_red_2[mt_feats_to_red.shape[0] + len(classNames1)::,
                              i] = P2 + 0.0001
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[iFeaturesSelect, :]
        #mt_feats_to_red += numpy.random.rand(mt_feats_to_red.shape[0], mt_feats_to_red.shape[1]) * 0.0000010
        (mt_feats_to_red, MEAN,
         STD) = aT.normalizeFeatures([mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        #dist_all = numpy.sum(distance.squareform(distance.pdist(mt_feats_to_red.T)), axis=0)
        #m_dist_all = numpy.mean(dist_all)
        #iNonOutLiers2 = numpy.nonzero(dist_all < 3.0*m_dist_all)[0]
        #mt_feats_to_red = mt_feats_to_red[:, iNonOutLiers2]
        Labels = numpy.zeros((mt_feats_to_red.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / st_win
        #print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * st_win / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, Labels)
        mt_feats_norm = (clf.transform(mt_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    clsAll = []
    sil_all = []
    centersAll = []

    for iSpeakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(mt_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        # Y = distance.squareform(distance.pdist(mt_feats_norm.T))
        clsAll.append(cls)
        centersAll.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(iSpeakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = numpy.nonzero(cls == c)[0].shape[0] / \
                 float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mt_feats_norm[:, cls == c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                Yt = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(numpy.mean(Yt) * clust_per_cent)
                silBs = []
                for c2 in range(iSpeakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = numpy.nonzero(cls == c2)[0].shape[0] /\
                               float(len(cls))
                        MidTermFeaturesNormTemp2 = mt_feats_norm[:, cls == c2]
                        Yt = distance.cdist(mt_feats_norm_temp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(
                            numpy.mean(Yt) *
                            (clust_per_cent + clust_per_cent_2) / 2.0)
                silBs = numpy.array(silBs)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(silBs))
        sil_1 = numpy.array(sil_1)
        sil_2 = numpy.array(sil_2)
        sil = []
        for c in range(iSpeakers):
            # for each cluster (speaker) compute silhouette
            sil.append(
                (sil_2[c] - sil_1[c]) / (max(sil_2[c], sil_1[c]) + 0.00001))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(numpy.mean(sil))

    imax = numpy.argmax(sil_all)
    # optimal number of clusters
    nSpeakersFinal = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = numpy.zeros((n_wins, ))
    for i in range(n_wins):
        j = numpy.argmin(numpy.abs(i - i_non_outliers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):
        # hmm training
        start_prob, transmat, means, cov = \
         trainHMM_computeStatistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = sil_all[imax]
    class_names = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]

    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if ground-truth exists
    if os.path.isfile(gt_file):
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start, seg_end, seg_labs,
                                              mt_step)

    if plot_res:
        fig = plt.figure()
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(numpy.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(numpy.array(range(len(cls))) * mt_step + mt_step / 2.0, cls)

    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(
                numpy.array(range(len(flags_gt))) * mt_step + mt_step / 2.0,
                flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
         evaluateSpeakerDiarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(
                          100 * purity_cluster_m, 100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        #print s_range, sil_all
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering silhouette")
        #plt.show()
        plt.savefig('output/outImg.jpg')
    return cls
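
# A minimal sketch (not part of the original example) of the distance-based
# outlier removal used above: windows whose summed distance to all other
# windows exceeds 1.2x the mean are dropped before clustering. Toy data.
import numpy
from scipy.spatial import distance

W_toy = numpy.random.rand(100, 10)            # toy windows (one row per window)
d_sum = numpy.sum(distance.squareform(distance.pdist(W_toy)), axis=0)
keep = numpy.nonzero(d_sum < 1.2 * numpy.mean(d_sum))[0]
W_clean = W_toy[keep, :]
print("kept %d of %d windows" % (len(keep), W_toy.shape[0]))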
def silenceRemoval(x,
                   fs,
                   st_win,
                   st_step,
                   smoothWindow=0.5,
                   weight=0.5,
                   plot=False):
    '''
	Event Detection (silence removal)
	ARGUMENTS:
		 - x:                the input audio signal
		 - fs:               sampling freq
		 - st_win, st_step:    window size and step in seconds
		 - smoothWindow:     (optional) smooth window (in seconds)
		 - weight:           (optional) weight factor (0 < weight < 1) the higher, the more strict
		 - plot:             (optional) True if results are to be plotted
	RETURNS:
		 - seg_limits:    list of segment limits in seconds (e.g [[0.1, 0.9], [1.4, 3.0]] means that
					the resulting segments are (0.1 - 0.9) seconds and (1.4, 3.0) seconds
	'''

    if weight >= 1:
        weight = 0.99
    if weight <= 0:
        weight = 0.01

    # Step 1: feature extraction
    x = audioBasicIO.stereo2mono(x)
    st_feats, _ = aF.stFeatureExtraction(x, fs, st_win * fs, st_step * fs)

    # Step 2: train binary svm classifier of low vs high energy frames
    # keep only the energy short-term sequence (2nd feature)
    st_energy = st_feats[1, :]
    en = numpy.sort(st_energy)
    # number of 10% of the total short-term windows
    l1 = int(len(en) / 10)
    # compute "lower" 10% energy threshold
    t1 = numpy.mean(en[0:l1]) + 0.000000000000001
    # compute "higher" 10% energy threshold
    t2 = numpy.mean(en[-l1:-1]) + 0.000000000000001
    # get all features that correspond to low energy
    class1 = st_feats[:, numpy.where(st_energy <= t1)[0]]
    # get all features that correspond to high energy
    class2 = st_feats[:, numpy.where(st_energy >= t2)[0]]
    # form the binary classification task and ...
    faets_s = [class1.T, class2.T]
    # normalize and train the respective svm probabilistic model
    # (ONSET vs SILENCE)
    [faets_s_norm, means_s, stds_s] = aT.normalizeFeatures(faets_s)
    svm = aT.trainSVM(faets_s_norm, 1.0)

    # Step 3: compute onset probability based on the trained svm
    prob_on_set = []
    for i in range(st_feats.shape[1]):
        # for each frame
        cur_fv = (st_feats[:, i] - means_s) / stds_s
        # get svm probability (that it belongs to the ONSET class)
        prob_on_set.append(svm.predict_proba(cur_fv.reshape(1, -1))[0][1])
    prob_on_set = numpy.array(prob_on_set)
    # smooth probability:
    prob_on_set = smoothMovingAvg(prob_on_set, smoothWindow / st_step)

    # Step 4A: detect onset frame indices:
    prob_on_set_sort = numpy.sort(prob_on_set)
    # find probability Threshold as a weighted average
    # of top 10% and lower 10% of the values
    Nt = int(prob_on_set_sort.shape[0] / 10)
    T = (numpy.mean((1 - weight) * prob_on_set_sort[0:Nt]) +
         weight * numpy.mean(prob_on_set_sort[-Nt::]))

    max_idx = numpy.where(prob_on_set > T)[0]
    # get the indices of the frames that satisfy the thresholding
    i = 0
    time_clusters = []
    seg_limits = []

    # Step 4B: group frame indices to onset segments
    while i < len(max_idx):
        # for each of the detected onset indices
        cur_cluster = [max_idx[i]]
        if i == len(max_idx) - 1:
            break
        while max_idx[i + 1] - cur_cluster[-1] <= 2:
            cur_cluster.append(max_idx[i + 1])
            i += 1
            if i == len(max_idx) - 1:
                break
        i += 1
        time_clusters.append(cur_cluster)
        seg_limits.append(
            [cur_cluster[0] * st_step, cur_cluster[-1] * st_step])

    # Step 5: Post process: remove very small segments:
    min_dur = 0.2
    seg_limits_2 = []
    for s in seg_limits:
        if s[1] - s[0] > min_dur:
            seg_limits_2.append(s)
    seg_limits = seg_limits_2

    if plot:
        timeX = numpy.arange(0, x.shape[0] / float(fs), 1.0 / fs)

        plt.subplot(2, 1, 1)
        plt.plot(timeX, x)
        plt.title('Signal')
        for s in seg_limits:
            plt.axvline(x=s[0])
            plt.axvline(x=s[1])
        plt.subplot(2, 1, 2)
        plt.plot(numpy.arange(0, prob_on_set.shape[0] * st_step, st_step),
                 prob_on_set)
        plt.title('SVM Probability')
        for s in seg_limits:
            plt.axvline(x=s[0])
            plt.axvline(x=s[1])
        plt.show()

    return seg_limits
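
# A minimal sketch (not part of the original example) of the grouping logic
# in Step 4B above: consecutive above-threshold frame indices (gaps of at
# most 2 frames allowed) are merged into segments and converted to seconds
# via the step size. Toy indices only.
import numpy

step_toy = 0.05
idx_toy = numpy.array([3, 4, 5, 6, 20, 21, 23, 40])   # toy frame indices above threshold
segs_toy, cur = [], [idx_toy[0]]
for idx in idx_toy[1:]:
    if idx - cur[-1] <= 2:                            # still the same segment
        cur.append(idx)
    else:                                             # gap too large: close the segment
        segs_toy.append([cur[0] * step_toy, cur[-1] * step_toy])
        cur = [idx]
segs_toy.append([cur[0] * step_toy, cur[-1] * step_toy])
print(segs_toy)                                       # three segments, in seconds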
Example #31
def fileClassification(inputFile, modelName, modelType, chunk_seconds=None):
    # Load classifier:
    print "DEBUG: fileClassification - inputFile: " + inputFile
    '''
    if not os.path.isfile(modelName):
        print "fileClassification: input modelName not found!"
        return (-1, -1, -1)

    if not os.path.isfile(inputFile):
        print "fileClassification: wav file not found!"
        return (-1, -1, -1)
    '''
    #print "computeBeat"

    if modelType == 'svm' or modelType == 'svm_rbf':
        [
            Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            computeBEAT
        ] = loadSVModel(modelName)
    elif modelType == 'knn':
        [
            Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            computeBEAT
        ] = loadKNNModel(modelName)
    elif modelType == 'randomforest':
        [
            Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            computeBEAT
        ] = loadRandomForestModel(modelName)
    elif modelType == 'gradientboosting':
        [
            Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            computeBEAT
        ] = loadGradientBoostingModel(modelName)
    elif modelType == 'extratrees':
        [
            Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            computeBEAT
        ] = loadExtraTreesModel(modelName)
    else:
        print "fileClassification: unknown modelType!"
        return (-1, -1, -1)

    #print computeBEAT
    chunk_data = audioBasicIO.readAudioFile(
        inputFile, chunk_seconds)  # read audio file and convert to mono
    if chunk_seconds:
        classification_data = []
        features = []
        for i, chunk in enumerate(chunk_data):
            #print str(i) + " of " + str(len(chunk_data))
            [Fs, x] = chunk
            x = audioBasicIO.stereo2mono(x)
            if isinstance(x, int):  # audio file IO problem
                return (-1, -1, -1)
            if x.shape[0] / float(Fs) <= mtWin:
                return (-1, -1, -1)

            # feature extraction:
            [MidTermFeatures,
             stFeatures] = aF.mtFeatureExtraction(x, Fs,
                                                  mtWin * Fs, mtStep * Fs,
                                                  round(Fs * stWin),
                                                  round(Fs * stStep))
            # if i == (len(chunk_data)-1):
            #     print "stTermFeatures"
            #     print stFeatures[33]
            #     print len(stFeatures)
            #     print "---------------"
            #features.append(MidTermFeatures.tolist())
            MidTermFeatures = MidTermFeatures.mean(
                axis=1)  # long term averaging of mid-term statistics
            features.append(MidTermFeatures.tolist())
            if computeBEAT:
                [beat, beatConf] = aF.beatExtraction(stFeatures, stStep)
                MidTermFeatures = numpy.append(MidTermFeatures, beat)
                MidTermFeatures = numpy.append(MidTermFeatures, beatConf)
            curFV = (MidTermFeatures - MEAN) / STD  # normalization

            [Result, P] = classifierWrapper(Classifier, modelType,
                                            curFV)  # classification
            #print(type(classNames))
            #print(type(P.tolist()))
            classification_data.append([Result, P.tolist(), classNames])

        return [classification_data, features]

    [Fs, x] = chunk_data  #audioBasicIO.readAudioFile(inputFile)

    x = audioBasicIO.stereo2mono(x)

    if isinstance(x, int):  # audio file IO problem
        return (-1, -1, -1)
    if x.shape[0] / float(Fs) <= mtWin:
        return (-1, -1, -1)

    # feature extraction:
    [MidTermFeatures, s] = aF.mtFeatureExtraction(x, Fs,
                                                  mtWin * Fs, mtStep * Fs,
                                                  round(Fs * stWin),
                                                  round(Fs * stStep))

    MidTermFeatures = MidTermFeatures.mean(
        axis=1)  # long term averaging of mid-term statistics
    if computeBEAT:
        [beat, beatConf] = aF.beatExtraction(s, stStep)
        MidTermFeatures = numpy.append(MidTermFeatures, beat)
        MidTermFeatures = numpy.append(MidTermFeatures, beatConf)
    curFV = (MidTermFeatures - MEAN) / STD  # normalization

    [Result, P] = classifierWrapper(Classifier, modelType,
                                    curFV)  # classification
    return [Result, P, classNames, MidTermFeatures.tolist()]
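
# A minimal sketch (not part of the original example) of one way to combine
# the per-chunk results returned above into a single file-level decision:
# a simple majority vote over the chunk class indices. The helper name
# majority_vote is hypothetical.
from collections import Counter

def majority_vote(classification_data):
    # classification_data: list of [Result, P, classNames] per chunk,
    # as built in the chunked branch above
    votes = Counter(int(result) for result, _, _ in classification_data)
    return votes.most_common(1)[0][0]   # class index that wins most chunks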
def speakerDiarization(fileName,
                       numOfSpeakers,
                       mtSize=2.0,
                       mtStep=0.2,
                       stWin=0.05,
                       LDAdim=35,
                       PLOT=False):
    '''
    ARGUMENTS:
        - fileName:        the name of the WAV file to be analyzed
        - numOfSpeakers    the number of speakers (clusters) in the recording (<=0 for unknown)
        - mtSize (opt)     mid-term window size
        - mtStep (opt)     mid-term window step
        - stWin  (opt)     short-term window size
        - LDAdim (opt)     LDA dimension (0 for no LDA)
        - PLOT     (opt)   0 for not plotting the results, 1 for plotting
    '''
    [Fs, x] = audioBasicIO.readAudioFile(fileName)
    x = audioBasicIO.stereo2mono(x)
    Duration = len(x) / Fs

    [
        Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1,
        stStep1, computeBEAT1
    ] = aT.loadKNNModel(os.path.join("data", "knnSpeakerAll"))
    [
        Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2,
        stStep2, computeBEAT2
    ] = aT.loadKNNModel(os.path.join("data", "knnSpeakerFemaleMale"))

    [MidTermFeatures,
     ShortTermFeatures] = aF.mtFeatureExtraction(x, Fs,
                                                 mtSize * Fs, mtStep * Fs,
                                                 round(Fs * stWin),
                                                 round(Fs * stWin * 0.5))

    MidTermFeatures2 = numpy.zeros(
        (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2),
         MidTermFeatures.shape[1]))

    for i in range(MidTermFeatures.shape[1]):
        curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
        curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2
        [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
        [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
        MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
        MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0] +
                         len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1)::,
                         i] = P2 + 0.0001

    MidTermFeatures = MidTermFeatures2  # TODO
    # SELECT FEATURES:
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20];                                                                                         # SET 0A
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100];                                                                                 # SET 0B
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,
    #   97,98, 99,100];     # SET 0C

    iFeaturesSelect = [
        8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45,
        46, 47, 48, 49, 50, 51, 52, 53
    ]  # SET 1A
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];                                          # SET 1B
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 1C

    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53];             # SET 2A
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];     # SET 2B
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 2C

    #iFeaturesSelect = range(100);                                                                                                    # SET 3
    #MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010

    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]

    (MidTermFeaturesNorm, MEAN,
     STD) = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]

    # remove outliers:
    DistancesAll = numpy.sum(distance.squareform(
        distance.pdist(MidTermFeaturesNorm.T)),
                             axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = numpy.min(MidTermFeatures[1,:])
    #EnergyMean = numpy.mean(MidTermFeatures[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #iNonOutLiers = numpy.nonzero(MidTermFeatures[1,:] > Thres)[0]
    #print iNonOutLiers

    perOutLier = (100.0 *
                  (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows
    MidTermFeaturesNormOr = MidTermFeaturesNorm
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

    # LDA dimensionality reduction:
    if LDAdim > 0:
        #[mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, stWin * Fs, round(Fs*stWin), round(Fs*stWin));
        # extract mid-term features with minimum step:
        mtWinRatio = int(round(mtSize / stWin))
        mtStepRatio = int(round(stWin / stWin))
        mtFeaturesToReduce = []
        numOfFeatures = len(ShortTermFeatures)
        numOfStatistics = 2
        #for i in range(numOfStatistics * numOfFeatures + 1):
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append([])

        for i in range(numOfFeatures):  # for each of the short-term features:
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mtWinRatio
                if N2 > N:
                    N2 = N
                curStFeatures = ShortTermFeatures[i][N1:N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i + numOfFeatures].append(
                    numpy.std(curStFeatures))
                curPos += mtStepRatio
        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
        mtFeaturesToReduce2 = numpy.zeros(
            (mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2),
             mtFeaturesToReduce.shape[1]))
        for i in range(mtFeaturesToReduce.shape[1]):
            curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
            curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
            [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
            [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
            mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0],
                                i] = mtFeaturesToReduce[:, i]
            mtFeaturesToReduce2[
                mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0] +
                len(classNames1), i] = P1 + 0.0001
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] +
                                len(classNames1)::, i] = P2 + 0.0001
        mtFeaturesToReduce = mtFeaturesToReduce2
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
        #mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
        (mtFeaturesToReduce, MEAN,
         STD) = aT.normalizeFeatures([mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T
        #DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
        #MDistancesAll = numpy.mean(DistancesAll)
        #iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0*MDistancesAll)[0]
        #mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]
        Labels = numpy.zeros((mtFeaturesToReduce.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin
        #print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * stWin / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels)
        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T
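        # (the pseudo-labels above tag each minimum-step window with a coarse
        # time index, so the supervised LDA projection favors directions that
        # separate temporally distinct, and ideally speaker-distinct, regions)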

    if numOfSpeakers <= 0:
        sRange = range(2, 10)
    else:
        sRange = [numOfSpeakers]
    clsAll = []
    silAll = []
    centersAll = []

    for iSpeakers in sRange:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(MidTermFeaturesNorm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        # Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
        clsAll.append(cls)
        centersAll.append(means)
        silA = []
        silB = []
        for c in range(iSpeakers):  # for each speaker (i.e. for each extracted cluster)
            clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float(
                len(cls))
            if clusterPerCent < 0.020:
                silA.append(0.0)
                silB.append(0.0)
            else:
                # get subset of feature vectors:
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]
                # compute average distance between samples
                # that belong to the cluster (a values):
                Yt = distance.pdist(MidTermFeaturesNormTemp.T)
                silA.append(numpy.mean(Yt) * clusterPerCent)
                silBs = []
                # compute distances from samples of other clusters:
                for c2 in range(iSpeakers):
                    if c2 != c:
                        clusterPerCent2 = numpy.nonzero(
                            cls == c2)[0].shape[0] / float(len(cls))
                        MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:, cls == c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(numpy.mean(Yt) *
                                     (clusterPerCent + clusterPerCent2) / 2.0)
                silBs = numpy.array(silBs)
                # ... and keep the minimum value
                # (i.e. the distance from the "nearest" cluster):
                silB.append(min(silBs))
        silA = numpy.array(silA)
        silB = numpy.array(silB)
        sil = []
        for c in range(iSpeakers):  # for each cluster (speaker)
            sil.append((silB[c] - silA[c]) /
                       (max(silB[c], silA[c]) + 0.00001))  # compute silhouette

        silAll.append(numpy.mean(sil))  # keep the AVERAGE SILHOUETTE
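        # For reference, the per-cluster quantity above is a simplified silhouette:
        #     sil(c) = (b(c) - a(c)) / max(a(c), b(c))
        # with a(c) the weighted intra-cluster distance and b(c) the distance
        # to the "nearest" other cluster; higher values mean better separation.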

    #silAll = silAll * (1.0/(numpy.power(numpy.array(sRange),0.5)))
    imax = numpy.argmax(silAll)  # position of the maximum silhouette value
    nSpeakersFinal = sRange[imax]  # optimal number of clusters

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows: this is achieved by giving them the value of their nearest non-outlier window)
    cls = numpy.zeros((numOfWindows, ))
    for i in range(numOfWindows):
        j = numpy.argmin(numpy.abs(i - iNonOutLiers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):
        startprob, transmat, means, cov = trainHMM_computeStatistics(
            MidTermFeaturesNormOr, cls)
        hmm = hmmlearn.hmm.GaussianHMM(n_components=startprob.shape[0],
                                       covariance_type="diag")  # hmm training
        hmm.startprob_ = startprob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(MidTermFeaturesNormOr.T)
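        # (the HMM statistics are estimated from the k-means labels themselves;
        # decoding the full feature sequence then yields temporally smoother
        # labels, including for the previously removed outlier windows)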

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)
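    # (two median passes with odd window lengths suppress short spurious
    # speaker changes; e.g. with mtStep = 0.2 s a 13-sample window spans 2.6 s)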

    sil = silAll[imax]  # final silhouette
    classNames = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]

    # build per-speaker segment lists (debug prints included):
    segslist = [list() for x in range(nSpeakersFinal)]
    start = 0
    for i in range(0, len(cls) - 1):
        if cls[i] != cls[i + 1]:
            segTemp = dict()
            segTemp['start'] = start
            segTemp['end'] = i * mtStep + mtStep
            speakerID = int(cls[i])
            print speakerID, segTemp
            segslist[speakerID].append(segTemp)
            start = segTemp['end']
    segTemp = dict()
    segTemp['start'] = start
    segTemp['end'] = (len(cls) - 1) * mtStep + mtStep
    speakerID = int(cls[-1])
    print speakerID
    print segTemp
    segslist[speakerID].append(segTemp)
    print segslist
    conversation = list()
    sound = AudioSegment.from_file(fileName)
    for speakerID, speaker in enumerate(segslist):
        for segID, seg in enumerate(speaker):
            chunk = sound[seg['start'] * 1000:seg['end'] * 1000]
            output_name = 'speaker{}_{}.wav'.format(speakerID, segID)
            chunk.export(output_name, format="wav")
            r = sr.Recognizer()
            with sr.AudioFile(output_name) as source:
                audio = r.record(source)  # read the entire audio file
                # recognize speech using Sphinx
                try:
                    text = r.recognize_sphinx(audio)  # run recognition once
                    print("Sphinx thinks you said: " + text)
                    content = dict()
                    content['text'] = text
                    content['speakerID'] = speakerID
                    content['start'] = seg['start']
                    conversation.append(content)
                except sr.UnknownValueError:
                    print("Sphinx could not understand audio")
                except sr.RequestError as e:
                    print("Sphinx error; {0}".format(e))

    conversation.sort(key=operator.itemgetter('start'))
    with open('text.txt', 'w') as text_file:  # ensure the file is closed
        for c in conversation:
            line = 'Speaker{}: {}\n'.format(c['speakerID'], c['text'])
            text_file.write(line)

    print conversation
    return cls
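# A minimal usage sketch for the diarization + transcription example above
# (hypothetical: assumes the enclosing function is a speakerDiarization
# variant with the usual pyAudioAnalysis signature; "meeting.wav" is a
# placeholder file name):
#
#   cls = speakerDiarization("meeting.wav", numOfSpeakers=0, LDAdim=0)
#   # cls[i] is the speaker label of the i-th mid-term window; the function
#   # also exports speaker{i}_{j}.wav chunks and writes text.txt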
Beispiel #33
0
#!/usr/bin/env python2.7
import audioBasicIO
import audioFeatureExtraction
import matplotlib.pyplot as plt
[Fs_x, x] = audioBasicIO.readAudioFile("emer/1.wav")
x = audioBasicIO.stereo2mono(x)
F_x = audioFeatureExtraction.stFeatureExtraction(x, Fs_x, 0.050 * Fs_x,
                                                 0.025 * Fs_x)

[Fs_y, y] = audioBasicIO.readAudioFile("nonemer/9.wav")
y = audioBasicIO.stereo2mono(y)
F_y = audioFeatureExtraction.stFeatureExtraction(y, Fs_y, 0.050 * Fs_y,
                                                 0.025 * Fs_y)

plt.subplot(2, 1, 1)
plt.plot(F_x[0, :])
plt.xlabel('emer')
plt.ylabel('ZCR')
plt.subplot(2, 1, 2)
plt.plot(F_y[0, :])
plt.xlabel('nonemer')
plt.ylabel('ZCR')
plt.show()

plt.subplot(2, 1, 1)
plt.plot(F_x[1, :])
plt.xlabel('emer')
plt.ylabel('Energy')
plt.subplot(2, 1, 2)
plt.plot(F_y[1, :])
plt.xlabel('nonemer')
plt.ylabel('Energy')
plt.show()
Beispiel #34
0
def main(argv):
	if argv[1] == "-dirMp3toWAV":				# convert mp3 to wav (batch)
		if len(argv)==5:			
			path = argv[2]
			if argv[3] not in ["8000", "16000", "32000", "44100"]:
				print "Error. Unsupported sampling rate (must be: 8000, 16000, 32000 or 44100)."; return
			if argv[4] not in ["1","2"]:
				print "Error. Number of output channels must be 1 or 2"; return
			if not os.path.isdir(path):
				raise Exception("Input path not found!")
			useMp3TagsAsNames = True
			audioBasicIO.convertDirMP3ToWav(path, int(argv[3]), int(argv[4]), useMp3TagsAsNames)
		else:
			print "Error.\nSyntax: " + argv[0] + " -dirMp3toWAV <dirName> <sampling Freq> <numOfChannels>"

	if argv[1] == "-dirWAVChangeFs":				# convert mp3 to wav (batch)
		if len(argv)==5:			
			path = argv[2]
			if argv[3] not in ["8000", "16000", "32000", "44100"]:
				print "Error. Unsupported sampling rate (must be: 8000, 16000, 32000 or 44100)."; return
			if argv[4] not in ["1","2"]:
				print "Error. Number of output channels must be 1 or 2"; return
			if not os.path.isdir(path):
				raise Exception("Input path not found!")
			audioBasicIO.convertFsDirWavToWav(path, int(argv[3]), int(argv[4]))
		else:
			print "Error.\nSyntax: " + argv[0] + " -dirMp3toWAV <dirName> <sampling Freq> <numOfChannels>"

	elif argv[1] == "-featureExtractionFile":		# short-term and mid-term feature extraction to files (csv and numpy)
		if len(argv)==7:
			wavFileName = argv[2]
			if not os.path.isfile(wavFileName):
				raise Exception("Input audio file not found!")
			if not (uT.isNum(argv[3]) and uT.isNum(argv[4]) and uT.isNum(argv[5]) and uT.isNum(argv[6])):
				raise Exception("Mid-term and short-term window sizes and steps must be numbers!")
			mtWin = float(argv[3])
			mtStep = float(argv[4])
			stWin = float(argv[5])
			stStep = float(argv[6])
			outFile = wavFileName
			aF.mtFeatureExtractionToFile(wavFileName, mtWin, mtStep, stWin, stStep, outFile, True, True, True)
		else:
			print "Error.\nSyntax: " + argv[0] + " -featureExtractionFile <wavFileName> <mtWin> <mtStep> <stWin> <stStep>"

	elif argv[1] == "-beatExtraction":
		if len(argv)==4:
			wavFileName = argv[2]
			if not os.path.isfile(wavFileName):
				raise Exception("Input audio file not found!")
			if not (uT.isNum(argv[3])):
				raise Exception("PLOT must be either 0 or 1")
			if not ( (int(argv[3]) == 0) or (int(argv[3]) == 1) ):
				raise Exception("PLOT must be either 0 or 1")

			[Fs, x] = audioBasicIO.readAudioFile(wavFileName)
			F = aF.stFeatureExtraction(x, Fs, 0.050*Fs, 0.050*Fs)
			BPM, ratio = aF.beatExtraction(F, 0.050, int(argv[3])==1)
			print "Beat: {0:d} bpm ".format(int(BPM))
			print "Ratio: {0:.2f} ".format(ratio)
		else:
			print "Error.\nSyntax: " + argv[0] + " -beatExtraction <wavFileName> <PLOT (0 or 1)>"


	elif argv[1] == '-featureExtractionDir':	# same as -featureExtractionFile, in a batch mode (i.e. for each WAV file in the provided path)
		if len(argv)==7:
			path = argv[2]
			if not os.path.isdir(path):
				raise Exception("Input path not found!")
			if not (uT.isNum(argv[3]) and uT.isNum(argv[4]) and uT.isNum(argv[5]) and uT.isNum(argv[6])):
				raise Exception("Mid-term and short-term window sizes and steps must be numbers!")
			mtWin = float(argv[3])
			mtStep = float(argv[4])
			stWin = float(argv[5])
			stStep = float(argv[6])
			aF.mtFeatureExtractionToFileDir(path, mtWin, mtStep, stWin, stStep, True, True, True)
		else:
			print "Error.\nSyntax: " + argv[0] + " -featureExtractionDir <path> <mtWin> <mtStep> <stWin> <stStep>"

	elif argv[1] == '-featureVisualizationDir':	# visualize the content relationships between recordings stored in a folder
		if len(argv)==3:
			if not os.path.isdir(argv[2]):
				raise Exception("Input folder not found!")
			aV.visualizeFeaturesFolder(argv[2], "pca", "")

	elif argv[1] == '-fileSpectrogram':		# show spectrogram of a sound stored in a file
			if len(argv)==3:
				wavFileName = argv[2]		
				if not os.path.isfile(wavFileName):
					raise Exception("Input audio file not found!")
				[Fs, x] = audioBasicIO.readAudioFile(wavFileName)
				x = audioBasicIO.stereo2mono(x)
				specgram, TimeAxis, FreqAxis = aF.stSpectogram(x, Fs, round(Fs*0.040), round(Fs*0.040), True)
			else:
				print "Error.\nSyntax: " + argv[0] + " -fileSpectrogram <fileName>"

	elif argv[1] == '-fileChromagram':		# show chromagram of a sound stored in a file
			if len(argv)==3:
				wavFileName = argv[2]		
				if not os.path.isfile(wavFileName):
					raise Exception("Input audio file not found!")
				[Fs, x] = audioBasicIO.readAudioFile(wavFileName)
				x = audioBasicIO.stereo2mono(x)
				specgram, TimeAxis, FreqAxis = aF.stChromagram(x, Fs, round(Fs*0.040), round(Fs*0.040), True)
			else:
				print "Error.\nSyntax: " + argv[0] + " -fileSpectrogram <fileName>"


	elif argv[1] == "-trainClassifier": 		# Segment classifier training (OK)
			if len(argv)>6: 
				method = argv[2]
				beatFeatures = (int(argv[3])==1)
				listOfDirs = argv[4:len(argv)-1]
				modelName = argv[-1]			
				aT.featureAndTrain(listOfDirs, 1, 1, aT.shortTermWindow, aT.shortTermStep, method.lower(), modelName, computeBEAT = beatFeatures)
			else:
				print "Error.\nSyntax: " + argv[0] + " -trainClassifier <method(svm or knn)> <beat features> <directory 1> <directory 2> ... <directory N> <modelName>"

	elif argv[1] == "-trainRegression": 		# Segment regression model
			if len(argv)==6: 
				method = argv[2]
				beatFeatures = (int(argv[3])==1)
				dirName = argv[4]
				modelName = argv[5]			
				aT.featureAndTrainRegression(dirName, 1, 1, aT.shortTermWindow, aT.shortTermStep, method.lower(), modelName, computeBEAT = beatFeatures)
			else:
				print "Error.\nSyntax: " + argv[0] + " -trainRegression <method(svm or knn)> <beat features> <directory> <modelName>"

	elif argv[1] == "-classifyFile":		# Single File Classification (OK)
			if len(argv)==5: 
				modelType = argv[2]
				modelName = argv[3]
				inputFile = argv[4]

				if modelType not in ["svm", "knn"]:
					raise Exception("ModelType has to be either svm or knn!")
				if not os.path.isfile(modelName):
					raise Exception("Input modelName not found!")
				if not os.path.isfile(inputFile):
					raise Exception("Input audio file not found!")

				[Result, P, classNames] = aT.fileClassification(inputFile, modelName, modelType)
				print "{0:s}\t{1:s}".format("Class","Probability")
				for i,c in enumerate(classNames):
					print "{0:s}\t{1:.2f}".format(c,P[i])
				print "Winner class: " + classNames[int(Result)]
			else:
				print "Error.\nSyntax: " + argv[0] + " -classifyFile <method(svm or knn)> <modelName> <fileName>"

	elif argv[1] == "-regressionFile":		# Single File Classification (OK)
			if len(argv)==5: 
				modelType = argv[2]
				modelName = argv[3]
				inputFile = argv[4]

				if modelType not in ["svm", "knn"]:
					raise Exception("ModelType has to be either svm or knn!")
				if not os.path.isfile(inputFile):
					raise Exception("Input audio file not found!")

				R, regressionNames = aT.fileRegression(inputFile, modelName, modelType)
				for i in range(len(R)):
					print "{0:s}\t{1:.3f}".format(regressionNames[i], R[i])
				
				#print "{0:s}\t{1:.2f}".format(c,P[i])

			else:
				print "Error.\nSyntax: " + argv[0] + " -regressionFile <method(svm or knn)> <modelName> <fileName>"

	elif argv[1] == "-classifyFolder": 			# Directory classification (Ok)
			if len(argv)==6 or len(argv)==5: 
				modelType = argv[2]
				modelName = argv[3]
				inputFolder = argv[4]
				if len(argv)==6:
					outputMode = argv[5]
				else:
					outputMode = "0"

				if modelType not in ["svm", "knn"]:
					raise Exception("ModelType has to be either svm or knn!")
				if outputMode not in ["0","1"]:
					raise Exception("outputMode has to be 0 or 1")
				if not os.path.isfile(modelName):
					raise Exception("Input modelName not found!")
				files = '*.wav'
				if os.path.isdir(inputFolder):
					strFilePattern = os.path.join(inputFolder, files)
				else:
					strFilePattern = inputFolder + files

				wavFilesList = []
				wavFilesList.extend(glob.glob(strFilePattern))
				wavFilesList = sorted(wavFilesList)
				if len(wavFilesList)==0:
					print "No WAV files found!"
					return 
				Results = []
				for wavFile in wavFilesList:	
					[Result, P, classNames] = aT.fileClassification(wavFile, modelName, modelType)	
					Result = int(Result)
					Results.append(Result)
					if outputMode=="1":
						print "{0:s}\t{1:s}".format(wavFile,classNames[Result])
				Results = numpy.array(Results)
				# print distribution of classes:
				[Histogram, _] = numpy.histogram(Results, bins=numpy.arange(len(classNames)+1))
				for i,h in enumerate(Histogram):
					print "{0:20s}\t\t{1:d}".format(classNames[i], h)
			else:
				print "Error.\nSyntax: " + argv[0] + " -classifyFolder <method(svm or knn)> <modelName> <folderName> <outputMode(0 or 1)"

	elif argv[1] == "-regressionFolder": 			# Regression applied on the WAV files of a folder
			if len(argv)==5: 
				modelType = argv[2]
				modelName = argv[3]
				inputFolder = argv[4]

				if modelType not in ["svm", "knn"]:
					raise Exception("ModelType has to be either svm or knn!")

				files = '*.wav'
				if os.path.isdir(inputFolder):
					strFilePattern = os.path.join(inputFolder, files)
				else:
					strFilePattern = inputFolder + files

				wavFilesList = []
				wavFilesList.extend(glob.glob(strFilePattern))
				wavFilesList = sorted(wavFilesList)	
				if len(wavFilesList)==0:
					print "No WAV files found!"
					return 
				Results = []
				for wavFile in wavFilesList:	
					R, regressionNames = aT.fileRegression(wavFile, modelName, modelType)
					Results.append(R)
				Results = numpy.array(Results)
				for i, r in enumerate(regressionNames):
					[Histogram, bins] = numpy.histogram(Results[:, i])
					centers = (bins[0:-1] + bins[1::]) / 2.0
					plt.subplot(len(regressionNames), 1, i + 1)
					plt.plot(centers, Histogram)
					plt.title(r)
				plt.show()
#					for h in Histogram:
#						print "{0:20d}".format(h),
#				if outputMode=="1":
#					for i,h in enumerate(Histogram):
#						print "{0:20s}\t\t{1:d}".format(classNames[i], h)
			else:
				print "Error.\nSyntax: " + argv[0] + " -regressionFolder <method(svm or knn)> <modelName> <folderName>"

	elif argv[1] == '-trainHMMsegmenter_fromfile':
		if len(argv)==7:
			wavFile = argv[2]
			gtFile = argv[3]
			hmmModelName = argv[4]
			if not uT.isNum(argv[5]):
				print "Error: mid-term window size must be float!"; return
			if not uT.isNum(argv[6]):
				print "Error: mid-term window step must be float!"; return
			mtWin = float(argv[5])
			mtStep = float(argv[6])
			if not os.path.isfile(wavFile):
				print "Error: wavfile does not exist!"; return
			if not os.path.isfile(gtFile):
				print "Error: groundtruth does not exist!"; return
			aS.trainHMM_fromFile(wavFile, gtFile, hmmModelName, mtWin, mtStep)
		else:
			print "Error.\nSyntax: " + argv[0] + " -trainHMMsegmenter_fromfile <wavFilePath> <gtSegmentFilePath> <hmmModelFileName> <mtWin> <mtStep>"

	elif argv[1] == '-trainHMMsegmenter_fromdir':
		if len(argv)==6:
			dirPath = argv[2]
			hmmModelName = argv[3]
			if not uT.isNum(argv[4]):
				print "Error: mid-term window size must be float!"
			if not uT.isNum(argv[5]):
				print "Error: mid-term window step must be float!"
			mtWin = float(argv[4])
			mtStep = float(argv[5])
			aS.trainHMM_fromDir(dirPath, hmmModelName, mtWin, mtStep)
		else:
			print "Error.\nSyntax: " + argv[0] + " -trainHMMsegmenter_fromdir <dirPath> <hmmModelFileName> <mtWin> <mtStep>"

	elif argv[1] == "-segmentClassifyFileHMM":	# HMM-based segmentation-classification
		if len(argv)==4:
			hmmModelName = argv[2]
			wavFile = argv[3]
			gtFile = wavFile.replace('.wav', '.segments')
			aS.hmmSegmentation(wavFile, hmmModelName, PLOT = True, gtFileName = gtFile)
		else:
			print "Error.\nSyntax: " + argv[0] + " -segmentClassifyHMM <hmmModelName> <fileName>"

	elif argv[1] == '-segmentClassifyFile':		# Segmentation-classification (fix-sized segment using knn or svm)
		if (len(argv)==5):
			modelType = argv[2]
			modelName = argv[3]
			inputWavFile = argv[4]

			if modelType not in ["svm", "knn"]:
				raise Exception("ModelType has to be either svm or knn!")
			if not os.path.isfile(modelName):
				raise Exception("Input modelName not found!")
			if not os.path.isfile(inputWavFile):
				raise Exception("Input audio file not found!")
			gtFile = inputWavFile.replace('.wav', '.segments')
			aS.mtFileClassification(inputWavFile, modelName, modelType, True, gtFile)
		else:
			print "Error.\nSyntax: " + argv[0] + " -segmentClassifyFile <method(svm or knn)> <modelName> <fileName>"

	elif argv[1] == "-segmentationEvaluation":
		if len(argv)==5:
			methodName = argv[2]
			modelName = argv[3]
			dirName = argv[4]
			aS.evaluateSegmentationClassificationDir(dirName, modelName, methodName)
		else:
			print "Error.\nSyntax: " + argv[0] + " -segmentationEvaluation <method(svm or knn)> <modelName> <directoryName>"

	elif argv[1] == "-silenceRemoval":
		if len(argv)==5:
			inputFile = argv[2]
			if not os.path.isfile(inputFile):
				raise Exception("Input audio file not found!")

			smoothingWindow = float(argv[3])
			weight = float(argv[4])
			[Fs, x] = audioBasicIO.readAudioFile(inputFile)						# read audio signal
			segmentLimits = aS.silenceRemoval(x, Fs, 0.05, 0.05, smoothingWindow, weight, False)	# get onsets
			for i, s in enumerate(segmentLimits):
				strOut = "{0:s}_{1:.3f}-{2:.3f}.wav".format(inputFile[0:-4], s[0], s[1])
				wavfile.write( strOut, Fs, x[int(Fs*s[0]):int(Fs*s[1])])
		else:
			print "Error.\nSyntax: " + argv[0] + " -silenceRemoval <inputFile> <smoothinWindow(secs)> <Threshold Weight>"

	elif argv[1] == '-speakerDiarization':		# speaker diarization (from file): TODO				
			inputFile = argv[2]
			nSpeakers = int(argv[3])
			useLDA = (int(argv[4])==1)			
			if useLDA:
				aS.speakerDiarization(inputFile, nSpeakers, PLOT = True)
			else:
				aS.speakerDiarization(inputFile, nSpeakers, LDAdim = 0, PLOT = True)
			#print speechLimits

	elif argv[1] == "-speakerDiarizationScriptEval":
			dir = argv[2]
			listOfLDAs = [int(l) for l in argv[3::]]
			aS.speakerDiarizationEvaluateScript(dir, listOfLDAs)

	elif argv[1] == '-thumbnail':			# music thumbnailing (OK)
			if len(argv)==4:	
				inputFile = argv[2]
				stWindow = 1.0
				stStep = 1.0
				if not os.path.isfile(inputFile):
					raise Exception("Input audio file not found!")

				[Fs, x] = audioBasicIO.readAudioFile(inputFile)						# read file
				if Fs == -1:	# could not read file
					return
				try:
					thumbnailSize = float(argv[3])
				except ValueError:
					print "Thumbnail size must be a float (in seconds)"
					return 
				[A1, A2, B1, B2, Smatrix] = aS.musicThumbnailing(x, Fs, stWindow, stStep, thumbnailSize)	# find thumbnail endpoints			

				# write thumbnails to WAV files:
				thumbnailFileName1 = inputFile.replace(".wav","_thumb1.wav")
				thumbnailFileName2 = inputFile.replace(".wav","_thumb2.wav")
				wavfile.write(thumbnailFileName1, Fs, x[int(Fs*A1):int(Fs*A2)])
				wavfile.write(thumbnailFileName2, Fs, x[int(Fs*B1):int(Fs*B2)])
				print "1st thumbnail (stored in file {0:s}): {1:4.1f}sec -- {2:4.1f}sec".format(thumbnailFileName1, A1, A2)
				print "2nd thumbnail (stored in file {0:s}): {1:4.1f}sec -- {2:4.1f}sec".format(thumbnailFileName2, B1, B2)

				# Plot self-similarity matrix:
				fig = plt.figure()
				ax = fig.add_subplot(111, aspect='auto')
				plt.imshow(Smatrix)
				# Plot best-similarity diagonal:
				Xcenter = (A1/stStep + A2/stStep) / 2.0
				Ycenter = (B1/stStep + B2/stStep) / 2.0


				e1 = matplotlib.patches.Ellipse((Ycenter, Xcenter), thumbnailSize * 1.4, 3,
				                                angle=45, linewidth=3, fill=False)
				ax.add_patch(e1)

				plt.plot([B1, Smatrix.shape[0]], [A1, A1], color='k', linestyle='--', linewidth=2)
				plt.plot([B2, Smatrix.shape[0]], [A2, A2], color='k', linestyle='--', linewidth=2)
				plt.plot([B1, B1], [A1, Smatrix.shape[0]], color='k', linestyle='--', linewidth=2)
				plt.plot([B2, B2], [A2, Smatrix.shape[0]], color='k', linestyle='--', linewidth=2)

				plt.xlim([0, Smatrix.shape[0]])
				plt.ylim([Smatrix.shape[1], 0])



				ax.yaxis.set_label_position("right")
				ax.yaxis.tick_right()


				plt.xlabel('frame no')
				plt.ylabel('frame no')
				plt.title('Self-similarity matrix')

				plt.show()

			else: 
				print "Error.\nSyntax: " + argv[0] + " -thumbnail <filename> <thumbnailsize(seconds)>"
Beispiel #35
0
def dirWavFeatureExtraction(dirName,
                            mt_win,
                            mt_step,
                            st_win,
                            st_step,
                            compute_beat=False):
    """
    This function extracts the mid-term features of the WAVE files of a particular folder.

    The resulting feature vector is extracted by long-term averaging the mid-term features.
    Therefore ONE FEATURE VECTOR is extracted for each WAV file.

    ARGUMENTS:
        - dirName:        the path of the WAVE directory
        - mt_win, mt_step:    mid-term window and step (in seconds)
        - st_win, st_step:    short-term window and step (in seconds)
    """

    all_mt_feats = numpy.array([])
    process_times = []

    types = ('*.wav', '*.aif', '*.aiff', '*.mp3', '*.au', '*.ogg')
    wav_file_list = []
    for files in types:
        wav_file_list.extend(glob.glob(os.path.join(dirName, files)))

    wav_file_list = sorted(wav_file_list)
    wav_file_list2, mt_feature_names = [], []
    for i, wavFile in enumerate(wav_file_list):
        print("Analyzing file {0:d} of "
              "{1:d}: {2:s}".format(i + 1, len(wav_file_list), wavFile))
        if os.stat(wavFile).st_size == 0:
            print("   (EMPTY FILE -- SKIPPING)")
            continue
        [fs, x] = audioBasicIO.readAudioFile(wavFile)
        if isinstance(x, int):
            continue

        t1 = time.clock()
        x = audioBasicIO.stereo2mono(x)
        if x.shape[0] < float(fs) / 5:
            print("  (AUDIO FILE TOO SMALL - SKIPPING)")
            continue
        wav_file_list2.append(wavFile)
        if compute_beat:
            [mt_term_feats, st_features, mt_feature_names] = \
                mtFeatureExtraction(x, fs, round(mt_win * fs),
                                    round(mt_step * fs),
                                    round(fs * st_win), round(fs * st_step))
            [beat, beat_conf] = beatExtraction(st_features, st_step)
        else:
            [mt_term_feats, _, mt_feature_names] = \
                mtFeatureExtraction(x, fs, round(mt_win * fs),
                                    round(mt_step * fs),
                                    round(fs * st_win), round(fs * st_step))

        mt_term_feats = numpy.transpose(mt_term_feats)
        mt_term_feats = mt_term_feats.mean(axis=0)
        # long term averaging of mid-term statistics
        if (not numpy.isnan(mt_term_feats).any()) and \
                (not numpy.isinf(mt_term_feats).any()):
            if compute_beat:
                mt_term_feats = numpy.append(mt_term_feats, beat)
                mt_term_feats = numpy.append(mt_term_feats, beat_conf)
            if len(all_mt_feats) == 0:
                # append feature vector
                all_mt_feats = mt_term_feats
            else:
                all_mt_feats = numpy.vstack((all_mt_feats, mt_term_feats))
            t2 = time.clock()
            duration = float(len(x)) / fs
            process_times.append((t2 - t1) / duration)
    if len(process_times) > 0:
        print("Feature extraction complexity ratio: "
              "{0:.1f} x realtime".format(
                  (1.0 / numpy.mean(numpy.array(process_times)))))
    return (all_mt_feats, wav_file_list2, mt_feature_names)
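# A minimal usage sketch for dirWavFeatureExtraction above (the directory
# name is a placeholder):
#
#   feats, ok_files, feat_names = dirWavFeatureExtraction("speech/",
#                                                         mt_win=1.0,
#                                                         mt_step=1.0,
#                                                         st_win=0.050,
#                                                         st_step=0.050)
#   # feats holds one long-term averaged feature vector per readable file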
Beispiel #36
0
def speakerDiarization(fileName, numOfSpeakers, mtSize=2.0, mtStep=0.2, stWin=0.05, LDAdim=35, PLOT=False):
    '''
    ARGUMENTS:
        - fileName:        the name of the WAV file to be analyzed
        - numOfSpeakers:   the number of speakers (clusters) in the recording (<=0 for unknown)
        - mtSize (opt)     mid-term window size
        - mtStep (opt)     mid-term window step
        - stWin  (opt)     short-term window size
        - LDAdim (opt)     LDA dimension (0 for no LDA)
        - PLOT     (opt)   0 for not plotting the results, 1 for plotting
    '''
    [Fs, x] = audioBasicIO.readAudioFile(fileName)
    x = audioBasicIO.stereo2mono(x)
    Duration = len(x) / Fs

    [Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.loadKNNModel(
        "data/knnSpeakerAll")
    [Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.loadKNNModel(
        "data/knnSpeakerFemaleMale")

    [MidTermFeatures, ShortTermFeatures] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin),
                                                                  round(Fs * stWin * 0.5))

    MidTermFeatures2 = numpy.zeros(
        (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2), MidTermFeatures.shape[1]))

    for i in range(MidTermFeatures.shape[1]):
        curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
        curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2
        [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
        [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
        MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
        MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0] + len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1)::, i] = P2 + 0.0001

    MidTermFeatures = MidTermFeatures2  # TODO
    # SELECT FEATURES:
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20];                                                                                         # SET 0A
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100];                                                                                 # SET 0B
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,
    #   97,98, 99,100];     # SET 0C

    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
                       53]  # SET 1A
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];                                          # SET 1B
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 1C

    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53];             # SET 2A
    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];     # SET 2B
    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 2C

    # iFeaturesSelect = range(100);                                                                                                    # SET 3
    # MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010

    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]

    (MidTermFeaturesNorm, MEAN, STD) = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]

    # remove outliers:
    DistancesAll = numpy.sum(distance.squareform(distance.pdist(MidTermFeaturesNorm.T)), axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]

    # TODO: Combine energy threshold for outlier removal:
    # EnergyMin = numpy.min(MidTermFeatures[1,:])
    # EnergyMean = numpy.mean(MidTermFeatures[1,:])
    # Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    # iNonOutLiers = numpy.nonzero(MidTermFeatures[1,:] > Thres)[0]
    # print iNonOutLiers

    perOutLier = (100.0 * (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows
    MidTermFeaturesNormOr = MidTermFeaturesNorm
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

    # LDA dimensionality reduction:
    if LDAdim > 0:
        # [mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, stWin * Fs, round(Fs*stWin), round(Fs*stWin));
        # extract mid-term features with minimum step:
        mtWinRatio = int(round(mtSize / stWin))
        mtStepRatio = int(round(stWin / stWin))
        mtFeaturesToReduce = []
        numOfFeatures = len(ShortTermFeatures)
        numOfStatistics = 2
        # for i in range(numOfStatistics * numOfFeatures + 1):
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append([])

        for i in range(numOfFeatures):  # for each of the short-term features:
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mtWinRatio
                if N2 > N:
                    N2 = N
                curStFeatures = ShortTermFeatures[i][N1:N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i + numOfFeatures].append(numpy.std(curStFeatures))
                curPos += mtStepRatio
        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
        mtFeaturesToReduce2 = numpy.zeros(
            (mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2), mtFeaturesToReduce.shape[1]))
        for i in range(mtFeaturesToReduce.shape[1]):
            curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
            curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
            [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
            [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
            mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i]
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0] + len(classNames1),
            i] = P1 + 0.0001
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] + len(classNames1)::, i] = P2 + 0.0001
        mtFeaturesToReduce = mtFeaturesToReduce2
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
        # mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
        (mtFeaturesToReduce, MEAN, STD) = aT.normalizeFeatures([mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T
        # DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
        # MDistancesAll = numpy.mean(DistancesAll)
        # iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0*MDistancesAll)[0]
        # mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]
        Labels = numpy.zeros((mtFeaturesToReduce.shape[1],))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin
        # print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * stWin / LDAstepRatio)
        clf = LDA(n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels, tol=0.000001)
        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

    if numOfSpeakers <= 0:
        sRange = range(2, 10)
    else:
        sRange = [numOfSpeakers]
    clsAll = []
    silAll = []
    centersAll = []

    for iSpeakers in sRange:
        cls, means, steps = mlpy.kmeans(MidTermFeaturesNorm.T, k=iSpeakers, plus=True)  # perform k-means clustering

        # YDist =   distance.pdist(MidTermFeaturesNorm.T, metric='euclidean')
        # print distance.squareform(YDist).shape
        # hc = mlpy.HCluster()
        # hc.linkage(YDist)
        # cls = hc.cut(14.5)
        # print cls

        # Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
        clsAll.append(cls)
        centersAll.append(means)
        silA = []
        silB = []
        for c in range(iSpeakers):  # for each speaker (i.e. for each extracted cluster)
            clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float(len(cls))
            if clusterPerCent < 0.020:
                silA.append(0.0)
                silB.append(0.0)
            else:
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]  # get subset of feature vectors
                Yt = distance.pdist(
                    MidTermFeaturesNormTemp.T)  # compute average distance between samples that belong to the cluster (a values)
                silA.append(numpy.mean(Yt) * clusterPerCent)
                silBs = []
                for c2 in range(iSpeakers):  # compute distances from samples of other clusters
                    if c2 != c:
                        clusterPerCent2 = numpy.nonzero(cls == c2)[0].shape[0] / float(len(cls))
                        MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:, cls == c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T)
                        silBs.append(numpy.mean(Yt) * (clusterPerCent + clusterPerCent2) / 2.0)
                silBs = numpy.array(silBs)
                silB.append(min(silBs))  # ... and keep the minimum value (i.e. the distance from the "nearest" cluster)
        silA = numpy.array(silA)
        silB = numpy.array(silB)
        sil = []
        for c in range(iSpeakers):  # for each cluster (speaker)
            sil.append((silB[c] - silA[c]) / (max(silB[c], silA[c]) + 0.00001))  # compute silhouette

        silAll.append(numpy.mean(sil))  # keep the AVERAGE SILHOUETTE

    # silAll = silAll * (1.0/(numpy.power(numpy.array(sRange),0.5)))
    imax = numpy.argmax(silAll)  # position of the maximum silhouette value
    nSpeakersFinal = sRange[imax]  # optimal number of clusters

    return nSpeakersFinal
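# A minimal usage sketch for speakerDiarization above (file name is a
# placeholder; this variant requires mlpy for the k-means step):
#
#   nSpeakersFinal = speakerDiarization("interview.wav", numOfSpeakers=0)
#   # with numOfSpeakers <= 0 the cluster count is chosen from range(2, 10)
#   # by maximizing the average silhouette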
def mtFileClassification(input_file,
                         model_name,
                         model_type,
                         plot_results=False,
                         gt_file=""):
    '''
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used, i.e. a pre-trained classifier.
    ARGUMENTS:
        - input_file:        path of the input WAV file
        - model_name:        name of the classification model
        - model_type:        svm or knn depending on the classifier type
        - plot_results:      True if results are to be plotted using
                             matplotlib along with a set of statistics
    RETURNS:
        - segs:              a sequence of segment's endpoints: segs[i] is the
                             endpoint of the i-th segment (in seconds)
        - classes:           a sequence of class flags: class[i] is the
                             class ID of the i-th segment
    '''

    if not os.path.isfile(model_name):
        print("mtFileClassificationError: input model_type not found!")
        return (-1, -1, -1, -1)
    # Load classifier:
    if model_type == "knn":
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win, st_step, compute_beat] = \
         aT.load_model_knn(model_name)
    else:
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win,
         st_step, compute_beat] = aT.load_model(model_name)

    if compute_beat:
        print("Model " + model_name + " contains long-term music features "
              "(beat etc) and cannot be used in "
              "segmentation")
        return (-1, -1, -1, -1)
    [fs, x] = audioBasicIO.readAudioFile(input_file)  # load input file
    if fs == -1:  # could not read file
        return (-1, -1, -1, -1)
    x = audioBasicIO.stereo2mono(x)  # convert stereo (if) to mono
    duration = len(x) / fs
    # mid-term feature extraction:
    [mt_feats, _, _] = aF.mtFeatureExtraction(x, fs, mt_win * fs, mt_step * fs,
                                              round(fs * st_win),
                                              round(fs * st_step))
    flags = []
    Ps = []
    flags_ind = []
    # for each feature vector (i.e. for each fix-sized segment):
    for i in range(mt_feats.shape[1]):
        # normalize current feature vector:
        cur_fv = (mt_feats[:, i] - MEAN) / STD
        # classify vector:
        [res, P] = aT.classifierWrapper(classifier, model_type, cur_fv)
        flags_ind.append(res)
        flags.append(class_names[int(res)])  # update class label matrix
        Ps.append(numpy.max(P))  # update probability matrix
    flags_ind = numpy.array(flags_ind)

    # 1-window smoothing
    for i in range(1, len(flags_ind) - 1):
        if flags_ind[i - 1] == flags_ind[i + 1]:
            flags_ind[i] = flags_ind[i + 1]
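    # e.g. a local label pattern [speech, music, speech] becomes
    # [speech, speech, speech]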
    # convert fix-sized flags to segments and classes
    (segs, classes) = flags2segs(flags, mt_step)
    segs[-1] = len(x) / float(fs)

    # Load ground-truth:
    if os.path.isfile(gt_file):
        [seg_start_gt, seg_end_gt, seg_l_gt] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start_gt, seg_end_gt,
                                              seg_l_gt, mt_step)
        flags_ind_gt = []
        for j, fl in enumerate(flags_gt):
            # "align" labels with GT
            if class_names_gt[flags_gt[j]] in class_names:
                flags_ind_gt.append(
                    class_names.index(class_names_gt[flags_gt[j]]))
            else:
                flags_ind_gt.append(-1)
        flags_ind_gt = numpy.array(flags_ind_gt)
        cm = numpy.zeros((len(class_names_gt), len(class_names_gt)))
        for i in range(min(flags_ind.shape[0], flags_ind_gt.shape[0])):
            cm[int(flags_ind_gt[i]), int(flags_ind[i])] += 1
    else:
        cm = []
        flags_ind_gt = numpy.array([])
    acc = plotSegmentationResults(flags_ind, flags_ind_gt, class_names,
                                  mt_step, not plot_results)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc))
        return (flags_ind, class_names_gt, acc, cm)
    else:
        return (flags_ind, class_names, acc, cm)
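# A minimal usage sketch for mtFileClassification above (model and file names
# are placeholders; the model must have been trained without beat features):
#
#   flags, classes, acc, cm = mtFileClassification("radio.wav", "svmSM",
#                                                  "svm", plot_results=False,
#                                                  gt_file="radio.segments")
#   # flags[i] is the class index of the i-th mid-term segment; acc and cm
#   # are only meaningful when a ground-truth .segments file exists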