Example 1
def speaker_diarization():
    file = '/home/daiab/machine_disk/data/voice_identity/dianxin/1.wav'
    use_LDA = False
    plot = True
    num_speaker = 2
    if use_LDA:
        pos, cls = aS.speaker_diarization(file,
                                          num_speaker,
                                          mt_size=4.0,
                                          mt_step=0.1,
                                          st_win=0.05,
                                          st_step=0.01,
                                          plot=plot)
    else:
        pos, cls = aS.speaker_diarization(file, num_speaker, lda_dim=0, plot=plot)
    fr, x = audio_basic_io.read_audio_file(file)

    sep_voice = [[], []]
    pre_pos = 0
    cut_num = int(x.shape[0] * 0.0001)
    print('cut_num', cut_num)
    for i, c in enumerate(cls):
        c = int(c)
        v_from = pre_pos
        v_to = int(pos[i] * fr)
        # trim cut_num samples from each boundary to reduce bleed between speakers
        sep_voice[c] += x[v_from + cut_num: v_to - cut_num].tolist()
        pre_pos = v_to

    print(len(sep_voice[0]), len(sep_voice[1]))
    wavfile.write('./0.wav', fr, np.array(sep_voice[0], dtype=np.int16))
    wavfile.write('./1.wav', fr, np.array(sep_voice[1], dtype=np.int16))
Example 2
def fileChromagramWrapper(wavFileName):
    if not os.path.isfile(wavFileName):
        raise Exception("Input audio file not found!")
    [Fs, x] = audio_basic_io.read_audio_file(wavFileName)
    x = audio_basic_io.stereo2mono(x)
    specgram, TimeAxis, FreqAxis = aF.stChromagram(x, Fs, round(Fs * 0.040),
                                                   round(Fs * 0.040), True)
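Example 3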
def mtFeatureExtractionToFile(fileName, midTermSize, midTermStep, shortTermSize, shortTermStep, outPutFile,
                              storeStFeatures=False, storeToCSV=False, PLOT=False):
    """
    This function is used as a wrapper to:
    a) read the content of a WAV file
    b) perform mid-term feature extraction on that signal
    c) write the mid-term feature sequences to a numpy file
    """
    [Fs, x] = audio_basic_io.read_audio_file(fileName)  # read the wav file
    x = audio_basic_io.stereo2mono(x)  # convert to MONO if required
    if storeStFeatures:
        [mtF, stF] = mt_feature_extraction(x, Fs, round(Fs * midTermSize), round(Fs * midTermStep),
                                           round(Fs * shortTermSize), round(Fs * shortTermStep))
    else:
        [mtF, _] = mt_feature_extraction(x, Fs, round(Fs * midTermSize), round(Fs * midTermStep),
                                         round(Fs * shortTermSize), round(Fs * shortTermStep))

    numpy.save(outPutFile, mtF)  # save mt features to numpy file
    if PLOT:
        print("Mid-term numpy file: " + outPutFile + ".npy saved")
    if storeToCSV:
        numpy.savetxt(outPutFile + ".csv", mtF.T, delimiter=",")
        if PLOT:
            print("Mid-term CSV file: " + outPutFile + ".csv saved")

    if storeStFeatures:
        numpy.save(outPutFile + "_st", stF)  # save st features to numpy file
        if PLOT:
            print("Short-term numpy file: " + outPutFile + "_st.npy saved")
        if storeToCSV:
            numpy.savetxt(outPutFile + "_st.csv", stF.T, delimiter=",")  # store st features to CSV file
            if PLOT:
                print("Short-term CSV file: " + outPutFile + "_st.csv saved")
Example 4
def getMusicSegmentsFromFile(inputFile, minDuration=1.0):
    # minDuration (in seconds) was referenced but never defined in this snippet;
    # it is exposed here as a parameter with an assumed default.
    modelType = "svm"
    modelName = "data/svmMovies8classes"

    dirOutput = inputFile[0:-4] + "_musicSegments"

    if os.path.exists(dirOutput) and dirOutput != ".":
        shutil.rmtree(dirOutput)
    os.makedirs(dirOutput)

    [Fs, x] = audio_basic_io.read_audio_file(inputFile)

    if modelType == 'svm':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadSVModel(modelName)
    elif modelType == 'knn':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadKNNModel(modelName)

    flagsInd, classNames, acc, CM = aS.mtFileClassification(inputFile, modelName, modelType, plotResults=False,
                                                            gtFile="")
    segs, classes = aS.flags2segs(flagsInd, mtStep)

    for i, s in enumerate(segs):
        if (classNames[int(classes[i])] == "Music") and (s[1] - s[0] >= minDuration):
            strOut = "{0:s}{1:.3f}-{2:.3f}.wav".format(dirOutput + os.sep, s[0], s[1])
            wavfile.write(strOut, Fs, x[int(Fs * s[0]):int(Fs * s[1])])
Example 5
def plot_spectrogram():
    import audioFeatureExtraction as aF
    Fs, x = audio_basic_io.read_audio_file(example_file)
    x = audio_basic_io.stereo2mono(x)
    specgram, TimeAxis, FreqAxis = aF.stSpectogram(x, Fs,
                                                   round(Fs * 0.040),
                                                   round(Fs * 0.040),
                                                   True)
Example 6
def beatExtractionWrapper(wavFileName, plot):
    if not os.path.isfile(wavFileName):
        raise Exception("Input audio file not found!")
    [Fs, x] = audio_basic_io.read_audio_file(wavFileName)
    F = aF.st_feature_extraction(x, Fs, 0.050 * Fs, 0.050 * Fs)
    BPM, ratio = aF.beatExtraction(F, 0.050, plot)
    print("Beat: {0:d} bpm ".format(int(BPM)))
    print("Ratio: {0:.2f} ".format(ratio))
Example 7
def fileRegression(inputFile, modelName, modelType):
    # Load classifier:

    if not os.path.isfile(inputFile):
        print("fileClassification: wav file not found!")
        return (-1, -1, -1)

    regressionModels = glob.glob(modelName + "_*")
    regressionModels = [r for r in regressionModels if r[-5:] != "MEANS"]
    regressionNames = [r[r.rfind("_") + 1:] for r in regressionModels]

    # FEATURE EXTRACTION
    # LOAD ONLY THE FIRST MODEL (for mtWin, etc)
    if modelType == 'svm' or modelType == "svm_rbf":
        [_, _, _, mtWin, mtStep, stWin, stStep,
         computeBEAT] = loadSVModel(regressionModels[0], True)
    elif modelType == 'randomforest':
        [_, _, _, mtWin, mtStep, stWin, stStep,
         computeBEAT] = loadRandomForestModel(regressionModels[0], True)

    [Fs, x] = audio_basic_io.read_audio_file(
        inputFile)  # read audio file and convert to mono
    x = audio_basic_io.stereo2mono(x)
    # feature extraction:
    [MidTermFeatures, s] = aF.mt_feature_extraction(x, Fs, mtWin * Fs,
                                                    mtStep * Fs,
                                                    round(Fs * stWin),
                                                    round(Fs * stStep))
    MidTermFeatures = MidTermFeatures.mean(
        axis=1)  # long term averaging of mid-term statistics
    if computeBEAT:
        [beat, beatConf] = aF.beatExtraction(s, stStep)
        MidTermFeatures = numpy.append(MidTermFeatures, beat)
        MidTermFeatures = numpy.append(MidTermFeatures, beatConf)

    # REGRESSION
    R = []
    for ir, r in enumerate(regressionModels):
        if not os.path.isfile(r):
            print("fileClassification: input modelName not found!")
            return (-1, -1, -1)
        if modelType == 'svm' or modelType == "svm_rbf":
            [Model, MEAN, STD, mtWin, mtStep, stWin, stStep,
             computeBEAT] = loadSVModel(r, True)
        elif modelType == 'randomforest':
            [Model, MEAN, STD, mtWin, mtStep, stWin, stStep,
             computeBEAT] = loadRandomForestModel(r, True)
        curFV = (MidTermFeatures - MEAN) / STD  # normalization
        R.append(regressionWrapper(Model, modelType, curFV))  # regression
    return R, regressionNames
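A minimal usage sketch; the paths below are hypothetical. modelName is the common prefix of the per-target regression model files (e.g. mymodel_arousal and mymodel_valence), which the glob above enumerates:

R, names = fileRegression("speech.wav", "mymodel", "svm")  # hypothetical paths
for value, name in zip(R, names):
    print("{0:s}: {1:.3f}".format(name, value))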
Example 8
def silenceRemovalWrapper(inputFile, smoothingWindow, weight):
    if not os.path.isfile(inputFile):
        raise Exception("Input audio file not found!")

    [Fs, x] = audio_basic_io.read_audio_file(inputFile)
    segmentLimits = aS.silenceRemoval(x, Fs, 0.05, 0.05, smoothingWindow,
                                      weight, True)
    for i, s in enumerate(segmentLimits):
        strOut = "{0:s}_{1:.3f}-{2:.3f}.wav".format(inputFile[0:-4], s[0],
                                                    s[1])
        wavfile.write(strOut, Fs, x[int(Fs * s[0]):int(Fs * s[1])])
Example 9
def remove_silence():
    smoothing = 1.0
    weight = 0.5
    example_file = '/home/yulongwu/d/voice/wav/2nU95KARZwk.wav'
    fr, x = audio_basic_io.read_audio_file(example_file)
    print('x shape', x.shape)
    segment_limits = aS.silenceRemoval(x, fr, 0.05, 0.05,
                                       smoothing, weight, True)
    for i, s in enumerate(segment_limits):
        name = "{0:s}_{1:.3f}-{2:.3f}.wav".format(example_file[0:-4], s[0], s[1])
        wavfile.write(name, fr, x[int(fr * s[0]):int(fr * s[1])])
Example 10
def classifyFolderWrapper(inputFolder, modelType, modelName, outputMode=False):
    if not os.path.isfile(modelName):
        raise Exception("Input modelName not found!")

    if modelType == 'svm':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadSVModel(modelName)
    elif modelType == 'knn':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadKNNModel(modelName)

    PsAll = numpy.zeros((len(classNames),))

    files = "*.wav"
    if os.path.isdir(inputFolder):
        strFilePattern = os.path.join(inputFolder, files)
    else:
        strFilePattern = inputFolder + files

    wavFilesList = []
    wavFilesList.extend(glob.glob(strFilePattern))
    wavFilesList = sorted(wavFilesList)
    if len(wavFilesList) == 0:
        print("No WAV files found!")
        return

    Results = []
    for wavFile in wavFilesList:
        [Fs, x] = audio_basic_io.read_audio_file(wavFile)
        signalLength = x.shape[0] / float(Fs)
        [Result, P, classNames] = aT.fileClassification(wavFile, modelName, modelType)
        PsAll += (numpy.array(P) * signalLength)
        Result = int(Result)
        Results.append(Result)
        if outputMode:
            print("{0:s}\t{1:s}".format(wavFile, classNames[Result]))
    Results = numpy.array(Results)

    # print the distribution of classes:
    [Histogram, _] = numpy.histogram(Results, bins=numpy.arange(len(classNames) + 1))
    if outputMode:
        for i, h in enumerate(Histogram):
            print("{0:20s}\t\t{1:d}".format(classNames[i], h))
    PsAll = PsAll / numpy.sum(PsAll)

    if outputMode:
        fig = plt.figure()
        ax = fig.add_subplot(111)
        plt.title("Classes percentage " + inputFolder.replace('Segments', ''))
        ax.axis((0, len(classNames) + 1, 0, 1))
        ax.set_xticks(numpy.array(range(len(classNames) + 1)))
        ax.set_xticklabels([" "] + classNames)
        ax.bar(numpy.array(range(len(classNames))) + 0.5, PsAll)
        plt.show()
    return classNames, PsAll
Example 11
def extract_feat():
    import audio_basic_io
    import audioFeatureExtraction
    import matplotlib.pyplot as plt
    Fs, x = audio_basic_io.read_audio_file(example_file)
    F = audioFeatureExtraction.st_feature_extraction(x, Fs, 0.050 * Fs, 0.025 * Fs)
    plt.subplot(2, 1, 1)
    plt.plot(F[0, :])
    plt.xlabel('Frame no')
    plt.ylabel('ZCR')
    plt.subplot(2, 1, 2)
    plt.plot(F[1, :])
    plt.xlabel('Frame no')
    plt.ylabel('Energy')
    plt.show()
Example 12
def hmmSegmentation(wavFileName, hmmModelName, PLOT=False, gtFileName=""):
    [Fs, x] = audio_basic_io.read_audio_file(wavFileName)  # read audio data

    try:
        fo = open(hmmModelName, "rb")
    except IOError:
        print("didn't find file")
        return

    try:
        hmm = cPickle.load(fo)
        classesAll = cPickle.load(fo)
        mtWin = cPickle.load(fo)
        mtStep = cPickle.load(fo)
    finally:
        fo.close()  # close exactly once, whether or not unpickling succeeded

    # Features = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs, 0.050*Fs);    # feature extraction
    [Features, _] = aF.mt_feature_extraction(x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * 0.050), round(Fs * 0.050))
    flagsInd = hmm.predict(Features.T)  # apply model
    # for i in range(len(flagsInd)):
    #    if classesAll[flagsInd[i]]=="silence":
    #        flagsInd[i]=classesAll.index("speech")

    # plot results
    if os.path.isfile(gtFileName):
        [segStart, segEnd, segLabels] = readSegmentGT(gtFileName)
        flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels, mtStep)
        flagsGTNew = []
        for j, fl in enumerate(flagsGT):  # "align" labels with GT
            if classNamesGT[flagsGT[j]] in classesAll:
                flagsGTNew.append(classesAll.index(classNamesGT[flagsGT[j]]))
            else:
                flagsGTNew.append(-1)
        CM = np.zeros((len(classNamesGT), len(classNamesGT)))
        flagsIndGT = np.array(flagsGTNew)
        for i in range(min(flagsInd.shape[0], flagsIndGT.shape[0])):
            CM[int(flagsIndGT[i]), int(flagsInd[i])] += 1
    else:
        flagsIndGT = np.array([])
    acc = plotSegmentationResults(flagsInd, flagsIndGT, classesAll, mtStep, not PLOT)
    if acc >= 0:
        print("Overall Accuracy: {0:.2f}".format(acc))
        return (flagsInd, classNamesGT, acc, CM)
    else:
        return (flagsInd, classesAll, -1, -1)
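A minimal usage sketch; the paths below are hypothetical, and the model file is assumed to have been produced by trainHMM_fromFile or trainHMM_fromDir (Examples 14 and 16):

flagsInd, classes, acc, CM = hmmSegmentation("speech.wav", "hmmModel",
                                             PLOT=True, gtFileName="speech.segments")

Example 13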
def dirWavFeatureExtractionNoAveraging(dirName, mtWin, mtStep, stWin, stStep):
    """
    This function extracts the mid-term features of the WAVE files of a particular folder without averaging each file.

    ARGUMENTS:
        - dirName:          the path of the WAVE directory
        - mtWin, mtStep:    mid-term window and step (in seconds)
        - stWin, stStep:    short-term window and step (in seconds)
    RETURNS:
        - X:                a feature matrix (one row per mid-term window)
        - Y:                a vector of file indices (one entry per row of X)
        - filenames:        the list of analyzed file paths
    """

    allMtFeatures = numpy.array([])
    signalIndices = numpy.array([])

    types = ('*.wav', '*.aif', '*.aiff')
    wavFilesList = []
    for files in types:
        wavFilesList.extend(glob.glob(os.path.join(dirName, files)))

    wavFilesList = sorted(wavFilesList)

    for i, wavFile in enumerate(wavFilesList):
        [Fs, x] = audio_basic_io.read_audio_file(wavFile)  # read file
        if isinstance(x, int):
            continue

        x = audio_basic_io.stereo2mono(x)  # convert stereo to mono
        [MidTermFeatures, _] = mt_feature_extraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs), round(Fs * stWin),
                                                     round(Fs * stStep))  # mid-term feature

        MidTermFeatures = numpy.transpose(MidTermFeatures)
        #        MidTermFeatures = MidTermFeatures.mean(axis=0)        # long term averaging of mid-term statistics
        if len(allMtFeatures) == 0:  # first feature matrix
            allMtFeatures = MidTermFeatures
            signalIndices = i * numpy.ones((MidTermFeatures.shape[0],))
        else:
            allMtFeatures = numpy.vstack((allMtFeatures, MidTermFeatures))
            signalIndices = numpy.append(signalIndices, i * numpy.ones((MidTermFeatures.shape[0],)))

    return (allMtFeatures, signalIndices, wavFilesList)
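A minimal usage sketch with a hypothetical directory name; X stacks one row per mid-term window, and the returned index vector maps each row back to its source file:

X, Y, filenames = dirWavFeatureExtractionNoAveraging("wavs", mtWin=1.0, mtStep=1.0,
                                                     stWin=0.050, stStep=0.050)
print(X.shape, Y.shape, len(filenames))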
Example 14
def trainHMM_fromFile(wavFile, gtFile, hmmModelName, mtWin, mtStep):
    '''
    This function trains a HMM model for segmentation-classification using a single annotated audio file
    ARGUMENTS:
     - wavFile:        the path of the audio filename
     - gtFile:         the path of the ground truth filename
                       (a csv file of the form <segment start in seconds>,<segment end in seconds>,<segment label> in each row)
     - hmmModelName:   the name of the HMM model to be stored
     - mtWin:          mid-term window size
     - mtStep:         mid-term window step
    RETURNS:
     - hmm:            an object to the resulting HMM
     - classNames:     a list of classNames

    After training, hmm, classNames, along with the mtWin and mtStep values are stored in the hmmModelName file
    '''

    [segStart, segEnd, segLabels] = readSegmentGT(gtFile)  # read ground truth data
    flags, classNames = segs2flags(segStart, segEnd, segLabels, mtStep)  # convert to fix-sized sequence of flags

    [Fs, x] = audio_basic_io.read_audio_file(wavFile)  # read audio data
    # F = aF.stFeatureExtraction(x, Fs, 0.050*Fs, 0.050*Fs);
    [F, _] = aF.mt_feature_extraction(x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * 0.050),
                                      round(Fs * 0.050))  # feature extraction
    startprob, transmat, means, cov = trainHMM_computeStatistics(F,
                                                                 flags)  # compute HMM statistics (priors, transition matrix, etc)

    hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], "diag")  # hmm training

    hmm.startprob_ = startprob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov

    fo = open(hmmModelName, "wb")  # output to file
    cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtWin, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtStep, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()

    return hmm, classNames
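A minimal usage sketch; the paths below are hypothetical, and the annotation file follows the comma-separated format described in the docstring:

# speech.segments (hypothetical annotation), one row per segment:
#   0.0,2.5,speech
#   2.5,4.0,music
hmm, classNames = trainHMM_fromFile("speech.wav", "speech.segments",
                                    "hmmModel", mtWin=1.0, mtStep=0.1)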
Example 15
def annotation2files(wavFile, csvFile):
    '''
        Break an audio stream into segments of interest,
        defined by a csv file

        - wavFile:    path to input wavfile
        - csvFile:    path to csvFile of segment limits

        Input CSV file must be of the format <T1>\t<T2>\t<Label>
    '''

    [Fs, x] = audio_basic_io.read_audio_file(wavFile)
    with open(csvFile, 'r', newline='') as csvfile:  # text mode for csv.reader (Python 3)
        reader = csv.reader(csvfile, delimiter='\t', quotechar='|')
        for j, row in enumerate(reader):
            T1 = float(row[0].replace(",", "."))
            T2 = float(row[1].replace(",", "."))
            label = "%s_%s_%.2f_%.2f.wav" % (wavFile, row[2], T1, T2)
            label = label.replace(" ", "_")
            xtemp = x[int(round(T1 * Fs)):int(round(T2 * Fs))]
            print(T1, T2, label, xtemp.shape)
            wavfile.write(label, Fs, xtemp)
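A minimal usage sketch; the paths below are hypothetical, and the CSV is tab-separated as stated in the docstring:

# segments.csv (hypothetical), one tab-separated row per segment, e.g.:
#   0.0\t2.5\tspeech
#   2.5\t4.0\tmusic
annotation2files("input.wav", "segments.csv")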
Example 16
def trainHMM_fromDir(dirPath, hmmModelName, mtWin, mtStep):
    '''
    This function trains a HMM model for segmentation-classification using a directory where WAV files and .segments (ground-truth) files are stored
    ARGUMENTS:
     - dirPath:        the path of the data directory
     - hmmModelName:   the name of the HMM model to be stored
     - mtWin:          mid-term window size
     - mtStep:         mid-term window step
    RETURNS:
     - hmm:            an object to the resulting HMM
     - classNames:     a list of classNames

    After training, hmm, classNames, along with the mtWin and mtStep values are stored in the hmmModelName file
    '''

    flagsAll = np.array([])
    classesAll = []
    Fall = None
    for i, f in enumerate(glob.glob(dirPath + os.sep + '*.wav')):  # for each WAV file
        wavFile = f
        gtFile = f.replace('.wav', '.segments')  # corresponding annotation file
        if not os.path.isfile(gtFile):  # if current WAV file does not have annotation -> skip
            continue
        [segStart, segEnd, segLabels] = readSegmentGT(gtFile)  # read GT data
        flags, classNames = segs2flags(segStart, segEnd, segLabels, mtStep)  # convert to flags
        for c in classNames:  # update classnames:
            if c not in classesAll:
                classesAll.append(c)
        [Fs, x] = audio_basic_io.read_audio_file(wavFile)  # read audio data
        [F, _] = aF.mt_feature_extraction(x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * 0.050),
                                          round(Fs * 0.050))  # feature extraction

        lenF = F.shape[1]
        lenL = len(flags)
        MIN = min(lenF, lenL)
        F = F[:, 0:MIN]
        flags = flags[0:MIN]

        flagsNew = []
        for j, fl in enumerate(flags):  # append features and labels
            flagsNew.append(classesAll.index(classNames[flags[j]]))

        flagsAll = np.append(flagsAll, np.array(flagsNew))

        if Fall is None:  # first annotated file
            Fall = F
        else:
            Fall = np.concatenate((Fall, F), axis=1)
    startprob, transmat, means, cov = trainHMM_computeStatistics(Fall, flagsAll)  # compute HMM statistics
    hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], "diag")  # train HMM
    hmm.startprob_ = startprob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov

    fo = open(hmmModelName, "wb")  # save HMM model
    cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(classesAll, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtWin, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtStep, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()

    return hmm, classesAll
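A minimal usage sketch with a hypothetical directory name; every foo.wav that has a matching foo.segments annotation in the directory contributes to the training set:

hmm, classesAll = trainHMM_fromDir("annotated_wavs", "hmmModel", mtWin=1.0, mtStep=0.1)

Example 17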
def dirWavFeatureExtraction(dirName, mtWin, mtStep, stWin, stStep, computeBEAT=False):
    """
    This function extracts the mid-term features of the WAVE files of a particular folder.

    The resulting feature vector is extracted by long-term averaging the mid-term features.
    Therefore ONE FEATURE VECTOR is extracted for each WAV file.

    ARGUMENTS:
        - dirName:        the path of the WAVE directory
        - mtWin, mtStep:    mid-term window and step (in seconds)
        - stWin, stStep:    short-term window and step (in seconds)
    """

    allMtFeatures = numpy.array([])
    processingTimes = []

    types = ('*.wav', '*.aif', '*.aiff', '*.mp3', '*.au')
    wavFilesList = []
    for files in types:
        wavFilesList.extend(glob.glob(os.path.join(dirName, files)))

    wavFilesList = sorted(wavFilesList)
    wavFilesList2 = []
    for i, wavFile in enumerate(wavFilesList):
        print("Analyzing file {0:d} of {1:d}: {2:s}".format(i + 1, len(wavFilesList), wavFile.encode('utf-8')))
        if os.stat(wavFile).st_size == 0:
            print("   (EMPTY FILE -- SKIPPING)")
            continue
        [Fs, x] = audio_basic_io.read_audio_file(wavFile)  # read file
        if isinstance(x, int):
            continue

        t1 = time.time()
        x = audio_basic_io.stereo2mono(x)  # convert stereo to mono
        if x.shape[0] < float(Fs) / 10:
            print("  (AUDIO FILE TOO SMALL - SKIPPING)")
            continue
        wavFilesList2.append(wavFile)
        if computeBEAT:  # mid-term feature extraction for current file
            [MidTermFeatures, stFeatures] = mt_feature_extraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs),
                                                                  round(Fs * stWin), round(Fs * stStep))
            [beat, beatConf] = beatExtraction(stFeatures, stStep)
        else:
            [MidTermFeatures, _] = mt_feature_extraction(x, Fs, round(mtWin * Fs), round(mtStep * Fs),
                                                         round(Fs * stWin), round(Fs * stStep))

        MidTermFeatures = numpy.transpose(MidTermFeatures)
        MidTermFeatures = MidTermFeatures.mean(axis=0)  # long term averaging of mid-term statistics
        if (not numpy.isnan(MidTermFeatures).any()) and (not numpy.isinf(MidTermFeatures).any()):
            if computeBEAT:
                MidTermFeatures = numpy.append(MidTermFeatures, beat)
                MidTermFeatures = numpy.append(MidTermFeatures, beatConf)
            if len(allMtFeatures) == 0:  # append feature vector
                allMtFeatures = MidTermFeatures
            else:
                allMtFeatures = numpy.vstack((allMtFeatures, MidTermFeatures))
            t2 = time.time()
            duration = float(len(x)) / Fs
            processingTimes.append((t2 - t1) / duration)
    if len(processingTimes) > 0:
        print("Feature extraction complexity ratio: {0:.1f} x realtime".format(
            (1.0 / numpy.mean(numpy.array(processingTimes)))))
    return (allMtFeatures, wavFilesList2)
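A minimal usage sketch with a hypothetical directory name; unlike Example 13, one long-term-averaged feature vector is returned per audio file:

X, fileList = dirWavFeatureExtraction("wavs", mtWin=1.0, mtStep=1.0,
                                      stWin=0.050, stStep=0.050)
print(X.shape, len(fileList))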
Example 18
def thumbnailWrapper(inputFile, thumbnailWrapperSize):
    stWindow = 1.0
    stStep = 1.0
    if not os.path.isfile(inputFile):
        raise Exception("Input audio file not found!")

    [Fs, x] = audio_basic_io.read_audio_file(inputFile)
    if Fs == -1:  # could not read file
        return

    [A1, A2, B1, B2, Smatrix] = aS.musicThumbnailing(x, Fs, stWindow, stStep,
                                                     thumbnailWrapperSize)

    # write thumbnailWrappers to WAV files:
    if inputFile.endswith(".wav"):
        thumbnailWrapperFileName1 = inputFile.replace(".wav", "_thumb1.wav")
        thumbnailWrapperFileName2 = inputFile.replace(".wav", "_thumb2.wav")
    if inputFile.endswith(".mp3"):
        thumbnailWrapperFileName1 = inputFile.replace(".mp3", "_thumb1.mp3")
        thumbnailWrapperFileName2 = inputFile.replace(".mp3", "_thumb2.mp3")
    wavfile.write(thumbnailWrapperFileName1, Fs, x[int(Fs * A1):int(Fs * A2)])
    wavfile.write(thumbnailWrapperFileName2, Fs, x[int(Fs * B1):int(Fs * B2)])
    print("1st thumbnailWrapper (stored in file {0:s}): {1:4.1f}sec" \
          " -- {2:4.1f}sec".format(thumbnailWrapperFileName1, A1, A2))
    print("2nd thumbnailWrapper (stored in file {0:s}): {1:4.1f}sec" \
          " -- {2:4.1f}sec".format(thumbnailWrapperFileName2, B1, B2))

    # Plot self-similarity matrix:
    fig = plt.figure()
    ax = fig.add_subplot(111, aspect="auto")
    plt.imshow(Smatrix)
    # Plot best-similarity diagonal:
    Xcenter = (A1 / stStep + A2 / stStep) / 2.0
    Ycenter = (B1 / stStep + B2 / stStep) / 2.0

    e1 = matplotlib.patches.Ellipse((Ycenter, Xcenter),
                                    thumbnailWrapperSize * 1.4,
                                    3,
                                    angle=45,
                                    linewidth=3,
                                    fill=False)
    ax.add_patch(e1)

    plt.plot([B1, Smatrix.shape[0]], [A1, A1],
             color="k",
             linestyle="--",
             linewidth=2)
    plt.plot([B2, Smatrix.shape[0]], [A2, A2],
             color="k",
             linestyle="--",
             linewidth=2)
    plt.plot([B1, B1], [A1, Smatrix.shape[0]],
             color="k",
             linestyle="--",
             linewidth=2)
    plt.plot([B2, B2], [A2, Smatrix.shape[0]],
             color="k",
             linestyle="--",
             linewidth=2)

    plt.xlim([0, Smatrix.shape[0]])
    plt.ylim([Smatrix.shape[1], 0])

    ax.yaxis.set_label_position("right")
    ax.yaxis.tick_right()

    plt.xlabel("frame no")
    plt.ylabel("frame no")
    plt.title("Self-similarity matrix")

    plt.show()
Example 19
def mtFileClassification(inputFile, modelName, modelType, plotResults=False, gtFile=""):
    '''
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used, i.e. a pre-trained classifier.
    ARGUMENTS:
        - inputFile:        path of the input WAV file
        - modelName:        name of the classification model
        - modelType:        svm or knn depending on the classifier type
        - plotResults:      True if results are to be plotted using matplotlib along with a set of statistics

    RETURNS:
          - flagsInd:       a sequence of class flags: flagsInd[i] is the class ID of the i-th mid-term segment
          - classNames:     the list of class names (ground-truth names if a gtFile is provided)
          - acc:            overall accuracy (-1 if no ground truth is available)
          - CM:             the confusion matrix ([] if no ground truth is available)
    '''

    if not os.path.isfile(modelName):
        print("mtFileClassificationError: input modelType not found!")
        return (-1, -1, -1, -1)
    # Load classifier:
    if (modelType == 'svm') or (modelType == 'svm_rbf'):
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadSVModel(modelName)
    elif modelType == 'knn':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadKNNModel(modelName)
    elif modelType == 'randomforest':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadRandomForestModel(
            modelName)
    elif modelType == 'gradientboosting':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadGradientBoostingModel(
            modelName)
    elif modelType == 'extratrees':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = aT.loadExtraTreesModel(
            modelName)

    if computeBEAT:
        print("Model " + modelName + " contains long-term music features (beat etc) and cannot be used in segmentation")
        return (-1, -1, -1, -1)
    [Fs, x] = audio_basic_io.read_audio_file(inputFile)  # load input file
    if Fs == -1:  # could not read file
        return (-1, -1, -1, -1)
    x = audio_basic_io.stereo2mono(x)  # convert stereo (if) to mono
    Duration = len(x) / Fs
    # mid-term feature extraction:
    [MidTermFeatures, _] = aF.mt_feature_extraction(x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * stWin),
                                                    round(Fs * stStep))
    flags = []
    Ps = []
    flagsInd = []
    for i in range(MidTermFeatures.shape[1]):  # for each feature vector (i.e. for each fix-sized segment):
        curFV = (MidTermFeatures[:, i] - MEAN) / STD  # normalize current feature vector
        [Result, P] = aT.classifierWrapper(Classifier, modelType, curFV)  # classify vector
        flagsInd.append(Result)
        flags.append(classNames[int(Result)])  # update class label matrix
        Ps.append(np.max(P))  # update probability matrix
    flagsInd = np.array(flagsInd)

    # 1-window smoothing
    for i in range(1, len(flagsInd) - 1):
        if flagsInd[i - 1] == flagsInd[i + 1]:
            flagsInd[i] = flagsInd[i + 1]
    (segs, classes) = flags2segs(flags, mtStep)  # convert fix-sized flags to segments and classes
    segs[-1] = len(x) / float(Fs)

    # Load ground-truth:
    if os.path.isfile(gtFile):
        [segStartGT, segEndGT, segLabelsGT] = readSegmentGT(gtFile)
        flagsGT, classNamesGT = segs2flags(segStartGT, segEndGT, segLabelsGT, mtStep)
        flagsIndGT = []
        for j, fl in enumerate(flagsGT):  # "align" labels with GT
            if classNamesGT[flagsGT[j]] in classNames:
                flagsIndGT.append(classNames.index(classNamesGT[flagsGT[j]]))
            else:
                flagsIndGT.append(-1)
        flagsIndGT = np.array(flagsIndGT)
        CM = np.zeros((len(classNamesGT), len(classNamesGT)))
        for i in range(min(flagsInd.shape[0], flagsIndGT.shape[0])):
            CM[int(flagsIndGT[i]), int(flagsInd[i])] += 1
    else:
        CM = []
        flagsIndGT = np.array([])
    acc = plotSegmentationResults(flagsInd, flagsIndGT, classNames, mtStep, not plotResults)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc))
        return (flagsInd, classNamesGT, acc, CM)
    else:
        return (flagsInd, classNames, acc, CM)
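A minimal usage sketch; the WAV and model paths below are hypothetical, and the model is assumed to have been trained without beat features (see the computeBEAT check above):

flagsInd, classNames, acc, CM = mtFileClassification("speech.wav", "data/svmSM", "svm",
                                                     plotResults=True, gtFile="")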
Example 20
def speaker_diarization(file_name, num_speaker, mt_size=2.0,
                        mt_step=0.2, st_win=0.05, st_step=0.025,
                        lda_dim=35,
                        plot=False):
    '''
    ARGUMENTS:
        - file_name:       the name of the WAV file to be analyzed
        - num_speaker:     the number of speakers (clusters) in the recording (<=0 for unknown)
        - mt_size (opt):   mid-term window size
        - mt_step (opt):   mid-term window step
        - st_win (opt):    short-term window size
        - lda_dim (opt):   LDA dimension (0 for no LDA)
        - plot (opt):      False for not plotting the results, True for plotting
    '''
    fr, x = audio_basic_io.read_audio_file(file_name)
    x = audio_basic_io.stereo2mono(x)
    duration = len(x) / fr

    classifier1, mean1, std1, class_names1, mt_win1, mt_step1, st_win1, st_step1, compute_beat1 = aT.loadKNNModel(
        os.path.join("data", "knnSpeakerAll"))
    classifier2, mean2, std2, class_names2, mt_win2, mt_step2, st_win2, st_step2, compute_beat2 = aT.loadKNNModel(
        os.path.join("data", "knnSpeakerFemaleMale"))

    mid_term_features, short_term_features = aF.mt_feature_extraction(signal=x,
                                                                      fr=fr,
                                                                      mt_win=mt_size * fr,
                                                                      mt_step=mt_step * fr,
                                                                      st_win=round(fr * st_win),
                                                                      st_step=round(fr * st_step))

    # (68, 329) (34, 2630)
    print(mid_term_features.shape, short_term_features.shape)
    mid_term_features2 = np.zeros((mid_term_features.shape[0] + len(class_names1) + len(class_names2),
                                   mid_term_features.shape[1]))

    for i in range(mid_term_features.shape[1]):
        cur_f1 = (mid_term_features[:, i] - mean1) / std1
        cur_f2 = (mid_term_features[:, i] - mean2) / std2
        result, p1 = aT.classifierWrapper(classifier1, "knn", cur_f1)
        result, p2 = aT.classifierWrapper(classifier2, "knn", cur_f2)
        mid_term_features2[0:mid_term_features.shape[0], i] = mid_term_features[:, i]
        mid_term_features2[mid_term_features.shape[0]:mid_term_features.shape[0] + len(class_names1), i] = p1 + 0.0001
        mid_term_features2[mid_term_features.shape[0] + len(class_names1)::, i] = p2 + 0.0001

    mid_term_features = mid_term_features2  # TODO
    # SELECT FEATURES:
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20];     # SET 0A
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100];    # SET 0B
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,
    # 74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,
    # 97,98, 99,100];     # SET 0C

    i_features_select = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41,
                         42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]  # SET 1A
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100]; # SET 1B
    # iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,
    # 48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,
    # 87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 1C

    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,
    # 36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53];  # SET 2A
    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,
    # 36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];     # SET 2B
    # iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,
    # 36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,
    # 76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];  # SET 2C

    # iFeaturesSelect = range(100);   # SET 3
    # MidTermFeatures += np.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010

    mid_term_features = mid_term_features[i_features_select, :]

    mid_term_features_norm, mean, std = aT.normalizeFeatures([mid_term_features.T])
    mid_term_features_norm = mid_term_features_norm[0].T
    num_of_windows = mid_term_features.shape[1]

    # remove outliers:
    distances_all = np.sum(distance.squareform(distance.pdist(mid_term_features_norm.T)), axis=0)
    m_distances_all = np.mean(distances_all)
    i_non_out_liers = np.nonzero(distances_all < 1.2 * m_distances_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    # EnergyMin = np.min(MidTermFeatures[1,:])
    # EnergyMean = np.mean(MidTermFeatures[1,:])
    # Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    # iNonOutLiers = np.nonzero(MidTermFeatures[1,:] > Thres)[0]
    # print(i_non_out_liers)

    # per_out_lier = (100.0 * (num_of_windows - i_non_out_liers.shape[0])) / num_of_windows
    mid_term_features_norm_or = mid_term_features_norm
    mid_term_features_norm = mid_term_features_norm[:, i_non_out_liers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        mt_win_ratio = int(round(mt_size / st_win))
        mt_step_ratio = int(round(st_win / st_win))
        mt_features_to_reduce = []
        num_of_features = len(short_term_features)
        num_of_statistics = 2
        # for i in range(numOfStatistics * numOfFeatures + 1):
        for i in range(num_of_statistics * num_of_features):
            mt_features_to_reduce.append([])

        for i in range(num_of_features):  # for each of the short-term features:
            cur_pos = 0
            n = len(short_term_features[i])
            while cur_pos < n:
                n1 = cur_pos
                n2 = cur_pos + mt_win_ratio
                if n2 > n:
                    n2 = n
                cur_st_features = short_term_features[i][n1:n2]
                mt_features_to_reduce[i].append(np.mean(cur_st_features))
                mt_features_to_reduce[i + num_of_features].append(np.std(cur_st_features))
                cur_pos += mt_step_ratio
        mt_features_to_reduce = np.array(mt_features_to_reduce)
        mt_features_to_reduce2 = np.zeros((mt_features_to_reduce.shape[0] + len(class_names1) + len(class_names2),
                                           mt_features_to_reduce.shape[1]))
        for i in range(mt_features_to_reduce.shape[1]):
            cur_f1 = (mt_features_to_reduce[:, i] - mean1) / std1
            cur_f2 = (mt_features_to_reduce[:, i] - mean2) / std2
            result, p1 = aT.classifierWrapper(classifier1, "knn", cur_f1)
            result, p2 = aT.classifierWrapper(classifier2, "knn", cur_f2)
            mt_features_to_reduce2[0:mt_features_to_reduce.shape[0], i] = mt_features_to_reduce[:, i]
            mt_features_to_reduce2[mt_features_to_reduce.shape[0]:mt_features_to_reduce.shape[0] + len(class_names1),
            i] = p1 + 0.0001
            mt_features_to_reduce2[mt_features_to_reduce.shape[0] + len(class_names1)::, i] = p2 + 0.0001
        mt_features_to_reduce = mt_features_to_reduce2
        mt_features_to_reduce = mt_features_to_reduce[i_features_select, :]
        # mtFeaturesToReduce += np.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
        mt_features_to_reduce, mean, std = aT.normalizeFeatures([mt_features_to_reduce.T])
        mt_features_to_reduce = mt_features_to_reduce[0].T
        # DistancesAll = np.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
        # MDistancesAll = np.mean(DistancesAll)
        # iNonOutLiers2 = np.nonzero(DistancesAll < 3.0*MDistancesAll)[0]
        # mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]
        labels = np.zeros((mt_features_to_reduce.shape[1],))
        lda_step = 1.0
        lda_step_ratio = lda_step / st_win
        # print(lda_step, lda_step_ratio)
        for i in range(labels.shape[0]):
            labels[i] = int(i * st_win / lda_step_ratio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(n_components=lda_dim)
        clf.fit(mt_features_to_reduce.T, labels)
        mid_term_features_norm = (clf.transform(mid_term_features_norm.T)).T

    if num_speaker <= 0:
        s_range = range(2, 10)
    else:
        s_range = [num_speaker]
    cls_all = []
    sil_all = []
    centers_all = []
    # (26, 314)
    print('mid_term_features_norm', mid_term_features_norm.shape)
    for i_speakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=i_speakers)
        k_means.fit(mid_term_features_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        # Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
        cls_all.append(cls)
        centers_all.append(means)
        sil_a = []
        sil_b = []
        for c in range(i_speakers):  # for each speaker (i.e. for each extracted cluster)
            cluster_percent = np.nonzero(cls == c)[0].shape[0] / float(len(cls))
            if cluster_percent < 0.020:
                sil_a.append(0.0)
                sil_b.append(0.0)
            else:
                mid_term_features_norm_temp = mid_term_features_norm[:, cls == c]  # get subset of feature vectors
                # compute average distance between samples that belong to the cluster (a values)
                yt = distance.pdist(mid_term_features_norm_temp.T)
                sil_a.append(np.mean(yt) * cluster_percent)
                sil_bs = []
                for c2 in range(i_speakers):  # compute distances from samples of other clusters
                    if c2 != c:
                        cluster_percent2 = np.nonzero(cls == c2)[0].shape[0] / float(len(cls))
                        mid_term_features_norm_temp2 = mid_term_features_norm[:, cls == c2]
                        yt = distance.cdist(mid_term_features_norm_temp.T, mid_term_features_norm_temp2.T)
                        sil_bs.append(np.mean(yt) * (cluster_percent + cluster_percent2) / 2.0)
                sil_bs = np.array(sil_bs)
                # ... and keep the minimum value (i.e. the distance from the "nearest" cluster)
                sil_b.append(min(sil_bs))
        sil_a = np.array(sil_a)
        sil_b = np.array(sil_b)
        sil = []
        for c in range(i_speakers):  # for each cluster (speaker)
            sil.append((sil_b[c] - sil_a[c]) / (max(sil_b[c], sil_a[c]) + 0.00001))  # compute silhouette
        sil_all.append(np.mean(sil))  # keep the AVERAGE SILHOUETTE

    # sil_all = sil_all * (1.0 / (np.power(np.array(s_range), 0.5)))
    imax = np.argmax(sil_all)  # position of the maximum silhouette value
    n_speakers_final = s_range[imax]  # optimal number of clusters

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their nearest non-outlier window)
    cls = np.zeros((num_of_windows,))
    for i in range(num_of_windows):
        j = np.argmin(np.abs(i - i_non_out_liers))
        cls[i] = cls_all[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):
        startprob, transmat, means, cov = trainHMM_computeStatistics(mid_term_features_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], "diag")  # hmm training
        hmm.startprob_ = startprob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mid_term_features_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = sil_all[imax]  # final silhouette
    class_names = ["speaker{0:d}".format(c) for c in range(n_speakers_final)]

    # load ground-truth if available
    gt_file = file_name.replace('.wav', '.segments')  # corresponding annotation file
    if os.path.isfile(gt_file):  # if ground truth exists
        seg_start, seg_end, seg_labels = readSegmentGT(gt_file)  # read GT data
        flags_gt, class_names_gt = segs2flags(seg_start, seg_end, seg_labels, mt_step)  # convert to flags

    x = np.arange(len(cls)) * mt_step + mt_step / 2.0
    if plot:
        fig = plt.figure()
        if num_speaker > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(np.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(x, cls)

    if os.path.isfile(gt_file):
        if plot:
            ax1.plot(np.array(range(len(flags_gt))) * mt_step + mt_step / 2.0, flags_gt, 'r')
        purity_cluster_mean, purity_speaker_mean = evaluateSpeakerDiarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_mean, 100 * purity_speaker_mean))
        if plot:
            plt.title("Cluster purity: {0:.1f}% - Speaker purity: {1:.1f}%".format(100 * purity_cluster_mean,
                                                                                   100 * purity_speaker_mean))
    if plot:
        plt.xlabel("time (seconds)")
        # print(sRange, silAll)
        if num_speaker <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's sillouette")
        plt.show()
    return x, cls
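A minimal usage sketch with a hypothetical path; passing num_speaker <= 0 lets the silhouette criterion above select the number of clusters, and Example 1 shows how to split the signal using the returned mid-points and labels:

pos, cls = speaker_diarization("dialog.wav", num_speaker=2, lda_dim=0, plot=True)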
Example 21
def fileClassification(inputFile, modelName, modelType):
    # Load classifier:

    if not os.path.isfile(modelName):
        print("fileClassification: input modelName not found!")
        return (-1, -1, -1)

    if not os.path.isfile(inputFile):
        print("fileClassification: wav file not found!")
        return (-1, -1, -1)

    if modelType == 'svm' or modelType == 'svm_rbf':
        [
            Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            computeBEAT
        ] = loadSVModel(modelName)
    elif modelType == 'knn':
        [
            Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            computeBEAT
        ] = loadKNNModel(modelName)
    elif modelType == 'randomforest':
        [
            Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            computeBEAT
        ] = loadRandomForestModel(modelName)
    elif modelType == 'gradientboosting':
        [
            Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            computeBEAT
        ] = loadGradientBoostingModel(modelName)
    elif modelType == 'extratrees':
        [
            Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            computeBEAT
        ] = loadExtraTreesModel(modelName)

    [Fs, x] = audio_basic_io.read_audio_file(
        inputFile)  # read audio file and convert to mono
    x = audio_basic_io.stereo2mono(x)

    if isinstance(x, int):  # audio file IO problem
        return (-1, -1, -1)
    if x.shape[0] / float(Fs) <= mtWin:  # signal shorter than one mid-term window
        return (-1, -1, -1)

    # feature extraction:
    [MidTermFeatures, s] = aF.mt_feature_extraction(x, Fs, mtWin * Fs,
                                                    mtStep * Fs,
                                                    round(Fs * stWin),
                                                    round(Fs * stStep))
    MidTermFeatures = MidTermFeatures.mean(
        axis=1)  # long term averaging of mid-term statistics
    if computeBEAT:
        [beat, beatConf] = aF.beatExtraction(s, stStep)
        MidTermFeatures = numpy.append(MidTermFeatures, beat)
        MidTermFeatures = numpy.append(MidTermFeatures, beatConf)
    curFV = (MidTermFeatures - MEAN) / STD  # normalization

    [Result, P] = classifierWrapper(Classifier, modelType,
                                    curFV)  # classification
    return Result, P, classNames
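A minimal usage sketch; the paths below are hypothetical, and P holds the per-class probabilities in the order of classNames:

Result, P, classNames = fileClassification("speech.wav", "data/svmSM", "svm")
if not isinstance(classNames, int):  # (-1, -1, -1) is returned on error
    print(classNames[int(Result)], P)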