Example No. 1
def thumbnailWrapper(inputFile, thumbnailWrapperSize):
    st_window = 0.5
    st_step = 0.5
    if not os.path.isfile(inputFile):
        raise Exception("Input audio file not found!")

    [fs, x] = audioBasicIO.readAudioFile(inputFile)
    if fs == -1:    # could not read file
        return

    [A1, A2, B1, B2, Smatrix] = aS.musicThumbnailing(x, fs, st_window, st_step,
                                                     thumbnailWrapperSize)

    # write thumbnailWrappers to WAV files:
    if inputFile.endswith(".wav"):
        thumbnailWrapperFileName1 = inputFile.replace(".wav", "_thumb1.wav")
        thumbnailWrapperFileName2 = inputFile.replace(".wav", "_thumb2.wav")
    if inputFile.endswith(".mp3"):
        thumbnailWrapperFileName1 = inputFile.replace(".mp3", "_thumb1.mp3")
        thumbnailWrapperFileName2 = inputFile.replace(".mp3", "_thumb2.mp3")
    wavfile.write(thumbnailWrapperFileName1, fs, x[int(fs * A1):int(fs * A2)])
    wavfile.write(thumbnailWrapperFileName2, fs, x[int(fs * B1):int(fs * B2)])
    print("1st thumbnailWrapper (stored in file {0:s}): {1:4.1f}sec" \
          " -- {2:4.1f}sec".format(thumbnailWrapperFileName1, A1, A2))
    print("2nd thumbnailWrapper (stored in file {0:s}): {1:4.1f}sec" \
          " -- {2:4.1f}sec".format(thumbnailWrapperFileName2, B1, B2))

    # Plot self-similarity matrix:
    fig = plt.figure()
    ax = fig.add_subplot(111, aspect="auto")
    plt.imshow(Smatrix)
    # Plot best-similarity diagonal:
    Xcenter = (A1 / st_step + A2 / st_step) / 2.0
    Ycenter = (B1 / st_step + B2 / st_step) / 2.0

    e1 = matplotlib.patches.Ellipse((Ycenter, Xcenter),
                                    thumbnailWrapperSize * 1.4, 3, angle=45,
                                    linewidth=3, fill=False)
    ax.add_patch(e1)

    plt.plot([B1 / st_step, Smatrix.shape[0]], [A1 / st_step, A1 / st_step],
             color="k", linestyle="--", linewidth=2)
    plt.plot([B2 / st_step, Smatrix.shape[0]], [A2 / st_step, A2 / st_step],
             color="k", linestyle="--", linewidth=2)
    plt.plot([B1 / st_step, B1 / st_step], [A1 / st_step, Smatrix.shape[0]],
             color="k", linestyle="--", linewidth=2)
    plt.plot([B2 / st_step, B2 / st_step], [A2 / st_step, Smatrix.shape[0]],
             color="k", linestyle="--", linewidth=2)

    plt.xlim([0, Smatrix.shape[0]])
    plt.ylim([Smatrix.shape[1], 0])

    ax.yaxis.set_label_position("right")
    ax.yaxis.tick_right()

    plt.xlabel("frame no")
    plt.ylabel("frame no")
    plt.title("Self-similarity matrix")

    plt.show()
Example No. 2
def fileChromagramWrapper(wav_file):
    if not os.path.isfile(wav_file):
        raise Exception("Input audio file not found!")
    [fs, x] = audioBasicIO.readAudioFile(wav_file)
    x = audioBasicIO.stereo2mono(x)
    specgram, TimeAxis, FreqAxis = aF.stChromagram(x, fs, round(fs * 0.040),
                                                   round(fs * 0.040), True)
Example No. 3
def load_validation_set():
    """
    Output
        a tuple of features: (fft features, mfcc features, mean-std features)
    Description
        extracts three types of features from validation set.
    """
    ffts = dict()
    mfccs = dict()
    mean_stds = dict()

    for i in validation_ids:
        path = './validation/validation.{i}.wav'.format(i=i)

        _, X = read_wav(path)

        # FFT
        fft = np.array(abs(sp.fft(X)[:1000]))
        ffts.update({i: fft})

        # MFCC
        ceps, mspec, spec = mfcc(X)
        num_ceps = len(ceps)
        x = np.mean(ceps[int(num_ceps*1/10):int(num_ceps*9/10)], axis=0)
        mfccs.update({i: x})


        # Mean-Std
        [Fs, x] = audioBasicIO.readAudioFile(path);
        F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs, 0.025*Fs);
        mean_std = []
        for f in F:
            mean_std.extend([f.mean(), f.std()])
        mean_stds.update({i: np.array(mean_std)})
    return (ffts, mfccs, mean_stds)
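
# Usage sketch (added for illustration, not part of the original snippet):
# validation_ids, read_wav and mfcc are assumed to be defined by the
# surrounding project; the call returns three dicts keyed by validation id.
ffts, mfccs, mean_stds = load_validation_set()
print(len(ffts), len(mfccs), len(mean_stds))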
Example No. 4
 def POST(self):
     x = web.input(myfile={})
     filename = 'tmp/'+uuid.uuid4().hex+'.wav'
     file = open(filename, 'wb+')  # binary mode: the upload is raw WAV bytes
     file.seek(0)
     file.write(x['myfile'].value)
     file.close()
     [Fs, x] = audioBasicIO.readAudioFile(filename);
     #os.remove(filename)
     x = audioBasicIO.stereo2mono(x)
     [F, _] = audioFeatureExtraction.mtFeatureExtraction(x, Fs, round(Fs*1.0), round(Fs * 1.0), round(Fs * 0.050), round(Fs * 0.050))
     F = F.transpose()
     # initialise the vote counters once, before the loop, so the per-class
     # counts accumulate across all mid-term windows
     results = {}
     current_highest = ""
     current_highest_value = 0
     for vec in F:
         vec = numpy.around(vec.astype(numpy.float), 6)
         current = model.getNN(vec)
         result = current[0][1].partition("_")[0]
         if result in results:
             results[result] = results[result] + 1
         else:
             results[result] = 1
         if results[result] > current_highest_value:
             current_highest_value = results[result]
             current_highest = result
     print results
     print current_highest
     raise web.seeother('/')
Example No. 5
def beatExtractionWrapper(wav_file, plot):
    if not os.path.isfile(wav_file):
        raise Exception("Input audio file not found!")
    [fs, x] = audioBasicIO.readAudioFile(wav_file)
    F, _ = aF.stFeatureExtraction(x, fs, 0.050 * fs, 0.050 * fs)
    bpm, ratio = aF.beatExtraction(F, 0.050, plot)
    print("Beat: {0:d} bpm ".format(int(bpm)))
    print("Ratio: {0:.2f} ".format(ratio))
Example No. 6
def silenceRemovalWrapper(inputFile, smoothingWindow, weight):
    if not os.path.isfile(inputFile):
        raise Exception("Input audio file not found!")

    [fs, x] = audioBasicIO.readAudioFile(inputFile)
    segmentLimits = aS.silenceRemoval(x, fs, 0.05, 0.05,
                                      smoothingWindow, weight, True)
    for i, s in enumerate(segmentLimits):
        strOut = "{0:s}_{1:.3f}-{2:.3f}.wav".format(inputFile[0:-4], s[0], s[1])
        wavfile.write(strOut, fs, x[int(fs * s[0]):int(fs * s[1])])
Example No. 7
def process_mp3_files():
    files = read_input()
    os.system("touch test.wav")
    for mp3_file in files:
        mean_value = []
        sound = AudioSegment.from_mp3(mp3_file)
        sound.export("test.wav", format="wav")
        # print mp3_file
        [Fs, x] = audioBasicIO.readAudioFile("test.wav")
        F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050 * Fs, 0.025 * Fs)
        for i in range(len(F)):
            mean_value.append(numpy.mean(F[i]))
        compute_emotion(mean_value)
def dirWavFeatureExtractionNoAveraging(dirName, mt_win, mt_step, st_win, st_step):
    """
    This function extracts the mid-term features of the WAVE
    files of a particular folder without averaging each file.

    ARGUMENTS:
        - dirName:          the path of the WAVE directory
        - mt_win, mt_step:    mid-term window and step (in seconds)
        - st_win, st_step:    short-term window and step (in seconds)
    RETURNS:
        - X:                A feature matrix
        - Y:                A matrix of file labels
        - filenames:
    """

    all_mt_feats = numpy.array([])
    signal_idx = numpy.array([])
    process_times = []

    types = ('*.wav', '*.aif',  '*.aiff', '*.ogg')
    wav_file_list = []
    for files in types:
        wav_file_list.extend(glob.glob(os.path.join(dirName, files)))

    wav_file_list = sorted(wav_file_list)

    for i, wavFile in enumerate(wav_file_list):
        [fs, x] = audioBasicIO.readAudioFile(wavFile)
        if isinstance(x, int):
            continue        
        
        x = audioBasicIO.stereo2mono(x)
        [mt_term_feats, _, _] = mtFeatureExtraction(x, fs, round(mt_win * fs),
                                                    round(mt_step * fs),
                                                    round(fs * st_win),
                                                    round(fs * st_step))

        mt_term_feats = numpy.transpose(mt_term_feats)
        if len(all_mt_feats) == 0:                # append feature vector
            all_mt_feats = mt_term_feats
            signal_idx = numpy.zeros((mt_term_feats.shape[0], ))
        else:
            all_mt_feats = numpy.vstack((all_mt_feats, mt_term_feats))
            signal_idx = numpy.append(signal_idx, i * numpy.ones((mt_term_feats.shape[0], )))

    return (all_mt_feats, signal_idx, wav_file_list)
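
# Usage sketch (added for illustration): extract non-averaged mid-term features
# from every audio file in a folder. The "data/music" path is an assumption;
# 1.0 s mid-term and 0.050 s short-term windows mirror common defaults.
X, Y, filenames = dirWavFeatureExtractionNoAveraging("data/music", 1.0, 1.0, 0.050, 0.050)
print(X.shape)         # (total mid-term windows, number of features)
print(Y.shape)         # one source-file index per mid-term window
print(len(filenames))  # number of WAV/AIFF/OGG files found
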
def mtFeatureExtractionToFile(fileName, midTermSize, midTermStep, shortTermSize, shortTermStep, outPutFile,
                              storeStFeatures=False, storeToCSV=False, PLOT=False):
    """
    This function is used as a wrapper to:
    a) read the content of a WAV file
    b) perform mid-term feature extraction on that signal
    c) write the mid-term feature sequences to a numpy file
    """
    [fs, x] = audioBasicIO.readAudioFile(fileName)
    x = audioBasicIO.stereo2mono(x)
    if storeStFeatures:
        [mtF, stF, _] = mtFeatureExtraction(x, fs,
                                         round(fs * midTermSize),
                                         round(fs * midTermStep),
                                         round(fs * shortTermSize),
                                         round(fs * shortTermStep))
    else:
        [mtF, _, _] = mtFeatureExtraction(x, fs, round(fs*midTermSize),
                                       round(fs * midTermStep),
                                       round(fs * shortTermSize),
                                       round(fs * shortTermStep))
    # save mt features to numpy file
    numpy.save(outPutFile, mtF)
    if PLOT:
        print("Mid-term numpy file: " + outPutFile + ".npy saved")
    if storeToCSV:
        numpy.savetxt(outPutFile+".csv", mtF.T, delimiter=",")
        if PLOT:
            print("Mid-term CSV file: " + outPutFile + ".csv saved")

    if storeStFeatures:
        # save st features to numpy file
        numpy.save(outPutFile+"_st", stF)
        if PLOT:
            print("Short-term numpy file: " + outPutFile + "_st.npy saved")
        if storeToCSV:
            # store st features to CSV file
            numpy.savetxt(outPutFile+"_st.csv", stF.T, delimiter=",")
            if PLOT:
                print("Short-term CSV file: " + outPutFile + "_st.csv saved")
Example No. 10
def showFeatures(name):
    print("processing - " + name)
    [Fs, x] = audioBasicIO.readAudioFile(name)
    # print(x)
    F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.50 * Fs, 0.25 * Fs)
    # print(x.size, Fs, 0.50 * Fs, 0.25 * Fs)
    # a = F[0, :]
    # numpy.savetxt("foo.csv", a, delimiter=",")

    # plt.subplot(3, 1, 1)
    # plt.plot(F[0, :])
    # plt.xlabel('Frame no')
    # plt.ylabel('ZCR')
    #
    # plt.subplot(3, 1, 2)
    # plt.plot(F[1, :])
    # plt.xlabel('Frame no')
    # plt.ylabel('Energy')
    #
    # plt.subplot(3, 1, 3)
    # plt.plot(F[3, :])
    # plt.xlabel('Frame no')
    # plt.ylabel('SC')
    #
    # plt.show()
    # items = ' '.join(map(str, a))
    # print(items)
    # print("--", F[0, :])
    vec = [
        F[0, :].mean(), F[1, :].mean(), F[4, :].mean(), F[5, :].mean(), F[6, :].mean(), F[7, :].mean(),
        F[0, :].std(), F[1, :].std(), F[4, :].std(), F[5, :].std(), F[6, :].std(), F[7, :].std()
    ]

    vecstr = ' '.join(map(str, vec))
    print("vector in audio.py : ",vecstr);
    melfeat = melfeature(F)
    # chromafeat = chromafeature(F)
    return vecstr + " " + melfeat
Example No. 11
def trainHMM_fromDir(dirPath, hmmModelName, mtWin, mtStep):
    '''
    This function trains an HMM model for segmentation-classification using a directory where WAV files and matching .segments (ground-truth) files are stored.
    ARGUMENTS:
     - dirPath:         the path of the data directory
     - hmmModelName:    the name of the file in which the HMM model is stored
     - mtWin:           mid-term window size
     - mtStep:          mid-term window step
    RETURNS:
     - hmm:             the resulting HMM object
     - classNames:      a list of class names

    After training, hmm and classNames, along with the mtWin and mtStep values, are stored in the hmmModelName file.
    '''

    flagsAll = numpy.array([])
    initializedFall = False
    classesAll = []
    # for each WAV file
    for i, f in enumerate(glob.glob(dirPath + os.sep + '*.wav')):
        wavFile = f
        # open for annotated file
        gtFile = f.replace('.wav', '.segments')
        # if current WAV file does not have annotation -> skip
        if not os.path.isfile(gtFile):
            continue
        [segStart, segEnd, segLabels] = readSegmentGT(
            gtFile)                   # read GT data
        flags, classNames = segs2flags(
            segStart, segEnd, segLabels, mtStep)     # convert to flags
        # update classnames:
        for c in classNames:
            if c not in classesAll:
                classesAll.append(c)
        [Fs, x] = audioBasicIO.readAudioFile(
            wavFile)                           # read audio data
        [F, _] = aF.mtFeatureExtraction(
            x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * 0.050), round(Fs * 0.050))     # feature extraction

        lenF = F.shape[1]
        lenL = len(flags)
        MIN = min(lenF, lenL)
        F = F[:, 0:MIN]
        flags = flags[0:MIN]

        flagsNew = []
        for j, fl in enumerate(flags):      # append features and labels
            flagsNew.append(classesAll.index(classNames[flags[j]]))

        flagsAll = numpy.append(flagsAll, numpy.array(flagsNew))

        if not initializedFall:
            Fall = F
            initializedFall = True
        else:
            Fall = numpy.concatenate((Fall, F), axis=1)
    startprob, transmat, means, cov = trainHMM_computeStatistics(
        Fall, flagsAll)        # compute HMM statistics
    hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], "diag")      # train HMM
    hmm.startprob_ = startprob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov

    fo = open(hmmModelName, "wb")   # save HMM model
    cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(classesAll, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtWin, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtStep, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()

    return hmm, classesAll
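
# Usage sketch (added for illustration): "data/radio" is a placeholder folder
# that must contain WAV files plus matching .segments ground-truth files;
# the 1.0 s mid-term window and step are assumptions.
hmm, class_names = trainHMM_fromDir("data/radio", "hmmRadioSM", 1.0, 1.0)
print(class_names)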
Example No. 12
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
import matplotlib.pyplot as plt
[Fs, x] = audioBasicIO.readAudioFile("demo.wav")
F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050 * Fs, 0.025 * Fs)
plt.subplot(2, 1, 1)
plt.plot(F[0, :])
plt.xlabel('Frame no')
plt.ylabel('ZCR')
plt.subplot(2, 1, 2)
plt.plot(F[1, :])
plt.xlabel('Frame no')
plt.ylabel('Energy')
plt.show()
Example No. 13
# from pyAudioAnalysis import audioTrainTest as aT
# aT.featureAndTrain(["data/uniform_ah_18/1", "data/uniform_ah_18/2"], 1.0, 1.0,
#                    aT.shortTermWindow, aT.shortTermStep, "svm", "svmSMtemp", False)
# aT.fileClassification("data/doremi.wav", "svmSMtemp", "svm")

from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
import matplotlib.pyplot as plt
[Fs, x] = audioBasicIO.readAudioFile("data/english.wav")
F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050 * Fs, 0.025 * Fs)
# print len(F)
'''
Feature ID	Feature Name	    Description
1	        Zero Crossing Rate	The rate of sign-changes of the signal during the duration of a particular frame.
2	        Energy	            The sum of squares of the signal values, normalized by the respective frame length.
3	        Entropy of Energy	The entropy of sub-frames' normalized energies. It can be interpreted as a measure of abrupt changes.
4	        Spectral Centroid	The center of gravity of the spectrum.
5	        Spectral Spread	    The second central moment of the spectrum.
6	        Spectral Entropy	Entropy of the normalized spectral energies for a set of sub-frames.
7	        Spectral Flux	    The squared difference between the normalized magnitudes of the spectra of the two successive frames.
8	        Spectral Rolloff	The frequency below which 90% of the magnitude distribution of the spectrum is concentrated.
9-21	    MFCCs	            Mel Frequency Cepstral Coefficients form a cepstral representation where the frequency bands are not linear but distributed according to the mel-scale.
22-33	    Chroma Vector	    A 12-element representation of the spectral energy where the bins represent the 12 equal-tempered pitch classes of western-type music (semitone spacing).
34	        Chroma Deviation	The standard deviation of the 12 chroma coefficients.
'''
fig, ax = plt.subplots(figsize=(12, 15))
fig.suptitle('pyAudioAnalysis', fontsize=14, fontweight='bold')

plt.subplot(13, 1, 1)
plt.plot(F[8, :])
plt.xlabel('Frame no')
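
# Added sketch: the table above uses 1-based feature IDs, so with zero-based
# numpy indexing on the F matrix computed above, the 13 MFCCs occupy rows 8-20
# and the 12 chroma coefficients rows 21-32 (assuming the standard 34-feature layout).
mfccs = F[8:21, :]
chroma = F[21:33, :]
print(mfccs.shape, chroma.shape)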
Example No. 14
def main(argv):
    if argv[1] == "-shortTerm":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            F = audioFeatureExtraction.stFeatureExtraction(
                x, Fs, 0.050 * Fs, 0.050 * Fs)
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print "short-term feature extraction: {0:.1f} x realtime".format(
                perTime1)
    elif argv[1] == "-classifyFile":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            aT.fileClassification("diarizationExample.wav", "svmSM", "svm")
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print "Mid-term feature extraction + classification \t {0:.1f} x realtime".format(
                perTime1)
    elif argv[1] == "-mtClassify":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            [flagsInd, classesAll,
             acc] = aS.mtFileClassification("diarizationExample.wav", "svmSM",
                                            "svm", False, '')
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print "Fix-sized classification - segmentation \t {0:.1f} x realtime".format(
                perTime1)
    elif argv[1] == "-hmmSegmentation":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            aS.hmmSegmentation('diarizationExample.wav', 'hmmRadioSM', False,
                               '')
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print "HMM-based classification - segmentation \t {0:.1f} x realtime".format(
                perTime1)
    elif argv[1] == "-silenceRemoval":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.clock()
            [Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav")
            segments = aS.silenceRemoval(x,
                                         Fs,
                                         0.050,
                                         0.050,
                                         smoothWindow=1.0,
                                         Weight=0.3,
                                         plot=False)
            t2 = time.clock()
            perTime1 = duration / (t2 - t1)
            print "Silence removal \t {0:.1f} x realtime".format(perTime1)
    elif argv[1] == "-thumbnailing":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.readAudioFile("scottish.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.clock()
            [A1, A2, B1, B2,
             Smatrix] = aS.musicThumbnailing(x1, Fs1, 1.0, 1.0,
                                             15.0)  # find thumbnail endpoints
            t2 = time.clock()
            perTime1 = duration1 / (t2 - t1)
            print "Thumbnail \t {0:.1f} x realtime".format(perTime1)
    elif argv[1] == "-diarization-noLDA":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.readAudioFile("diarizationExample.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.clock()
            aS.speakerDiarization("diarizationExample.wav",
                                  4,
                                  LDAdim=0,
                                  PLOT=False)
            t2 = time.clock()
            perTime1 = duration1 / (t2 - t1)
            print "Diarization \t {0:.1f} x realtime".format(perTime1)
    elif argv[1] == "-diarization-LDA":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.readAudioFile("diarizationExample.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.clock()
            aS.speakerDiarization("diarizationExample.wav", 4, PLOT=False)
            t2 = time.clock()
            perTime1 = duration1 / (t2 - t1)
            print "Diarization \t {0:.1f} x realtime".format(perTime1)
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

audio_path = "/home/brainlab/Desktop/Rudas/Data/Propofol/Taken-[AudioTrimmer.com].wav"

[Fs, x] = audioBasicIO.readAudioFile(audio_path)
x = audioBasicIO.stereo2mono(x)

tr = 2

F, f_names = audioFeatureExtraction.stFeatureExtraction(
    x, Fs, tr * Fs, tr * Fs)

np.savetxt('audio_predictors.txt',
           np.transpose(F[:21]),
           fmt='%10.6f',
           delimiter=',')

from nilearn.signal import clean
#F = clean(signals=F,
#          detrend=False,
#          standardize=True,
#          ensure_finite=False)

#for feature in range(2):
#    plt.subplot(2,1,feature+1);
#    plt.plot(F[feature,:]);
Example No. 16
    def Audio_Feature_Extraction_Extract_Directory(self):

        pathToSaveFiles = QFileDialog.getExistingDirectory(
            self.somethingToPass, "Select Directory to Save Files")

        i = 0

        path = self.Audio_Feature_Extraction_DirectoryPath.text()
        extractionList = []
        nameList = []
        tempHold = []

        for root, directs, files in os.walk(path):
            for x in files:
                extractionList.append(root + "/" + x)
                nameList.append(x)

        for name in nameList:
            x = name.split('.')
            tempHold.append(x[0])

        nameList = tempHold

        if self.Audio_Feature_Extraction_DirectoryWindowTerm.currentText(
        ) == "ShortTerm":

            for audio in extractionList:
                [Fs, x] = audioBasicIO.readAudioFile(audio)
                print(audio)
                stFeatures = audioFeatureExtraction.stFeatureExtraction(
                    x, Fs,
                    float(self.Audio_Feature_Extraction_DirectoryWindowSize.
                          text()) * Fs,
                    float(
                        self.Audio_Feature_Extraction_DirectoryStepSize.text())
                    * Fs)
                #I think the files are overwriting each other.  I am getting
                #299 files but it should be 5134.  I changed the namelist[i] to
                #just i
                numpy.savetxt(pathToSaveFiles + "/" + str(i) + ".csv",
                              stFeatures,
                              delimiter=',')

                i += 1

            QMessageBox.about(self.somethingToPass, "Files Created",
                              "Files have been saved as CSV files.")

        else:
            for x in extractionList:

                audioFeatureExtraction.mtFeatureExtractionToFile(
                    x,
                    float(self.
                          Audio_Feature_Extraction_DirectorymidTermWindowSize.
                          text()),
                    float(
                        self.
                        Audio_Feature_Extraction_DirectorymidTermWindowStepSize
                        .text()),
                    float(self.Audio_Feature_Extraction_DirectoryWindowSize.
                          text()),
                    float(self.Audio_Feature_Extraction_DirectoryStepSize.text(
                    )),
                    pathToSaveFiles + "/" + nameList[i],
                    storeStFeatures=True,
                    storeToCSV=True,
                    PLOT=False)
                i += 1

            QMessageBox.about(
                self.somethingToPass, "Files Created",
                "Files have been saved as CSV files and .npy files. There ")
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
import matplotlib.pyplot as plt


[Fs1, x1] = audioBasicIO.readAudioFile("happy.wav");
[Fs2, x2] = audioBasicIO.readAudioFile("sad.wav");
# Fs is frequency
# x is real data

th = 100 # fixed feature length
k12 = (len(x1)-800)/th/float(Fs1)
k22 = (len(x2)-800)/th/float(Fs2)


F1, f_names1 = audioFeatureExtraction.stFeatureExtraction(x1, Fs1, 0.05*Fs1, k12*Fs1);
F2, f_names2 = audioFeatureExtraction.stFeatureExtraction(x2, Fs2, 0.05*Fs2, k22*Fs2);
# stFeatureExtraction(signal, fs, win, step):
# signal:       the input signal samples
# fs:           the sampling freq (in Hz)
# win:          the short-term window size (in samples)
# step:         the short-term window step (in samples)
'''
here,
window size = 0.05*Fs = 0.05*16000 = 800
step size = 0.025*Fs = 0.025*16000 = 400
we can get n frames from a signal of length 23776:

n = floor((23776 - 800) / 400) + 1 = 58

as below, F.shape = (34, 58)
'''
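
# Added sketch of the frame-count arithmetic above, assuming a 16 kHz signal
# of 23776 samples with a 50 ms window and 25 ms step:
fs = 16000
n_samples = 23776
win = int(0.050 * fs)                      # 800 samples
step = int(0.025 * fs)                     # 400 samples
n_frames = (n_samples - win) // step + 1   # -> 58 frames, i.e. F.shape == (34, 58)
print(n_frames)
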
def main(argv):
    dirName = argv[1]
    types = ('*.wav', )
    filesList = []
    for files in types:
        filesList.extend(glob.glob(os.path.join(dirName, files)))
    filesList = sorted(filesList)
    WIDTH_SEC = 2.4
    stWin = 0.020
    stStep = 0.015
    WIDTH = WIDTH_SEC / stStep

    for f in filesList:
        [Fs, x] = audioBasicIO.readAudioFile(f)
        print(Fs)
        x = audioBasicIO.stereo2mono(x)
        specgramOr, TimeAxis, FreqAxis = aF.stSpectogram(
            x, Fs, round(Fs * stWin), round(Fs * stStep), False)
        if specgramOr.shape[0] > WIDTH:
            specgram = specgramOr[int(specgramOr.shape[0] / 2) -
                                  WIDTH / 2:int(specgramOr.shape[0] / 2) +
                                  WIDTH / 2, :]
            specgram = scipy.misc.imresize(specgram,
                                           float(227.0) /
                                           float(specgram.shape[0]),
                                           interp='bilinear')
            print specgram.shape
            im = Image.fromarray(numpy.uint8(
                matplotlib.cm.jet(specgram) * 255))
            #plt.imshow(im)
            scipy.misc.imsave(f.replace(".wav", ".jpg"), im)

            if int(specgramOr.shape[0] / 2) - WIDTH / 2 - int(
                (0.2) / stStep) > 0:
                specgram = specgramOr[
                    int(specgramOr.shape[0] / 2) - WIDTH / 2 -
                    int((0.2) / stStep):int(specgramOr.shape[0] / 2) +
                    WIDTH / 2 - int((0.2) / stStep), :]
                specgram = scipy.misc.imresize(specgram,
                                               float(227.0) /
                                               float(specgram.shape[0]),
                                               interp='bilinear')
                im = Image.fromarray(
                    numpy.uint8(matplotlib.cm.jet(specgram) * 255))
                print specgram.shape
                scipy.misc.imsave(f.replace(".wav", "_02A.jpg"), im)

                specgram = specgramOr[
                    int(specgramOr.shape[0] / 2) - WIDTH / 2 +
                    int((0.2) / stStep):int(specgramOr.shape[0] / 2) +
                    WIDTH / 2 + int((0.2) / stStep), :]
                specgram = scipy.misc.imresize(specgram,
                                               float(227.0) /
                                               float(specgram.shape[0]),
                                               interp='bilinear')
                print specgram.shape
                im = Image.fromarray(
                    numpy.uint8(matplotlib.cm.jet(specgram) * 255))
                scipy.misc.imsave(f.replace(".wav", "_02B.jpg"), im)

                # ONLY FOR SPEECH (fewer samples). Must comment for music
                """specgram = specgramOr[int(specgramOr.shape[0]/2) - WIDTH/2 - int((0.1) / stStep):int(specgramOr.shape[0]/2) + WIDTH/2 - int((0.1) / stStep), :]                
def remove_silence(filename, out_dir, smoothing=1.0, weight=0.3, plot=False):
    """
    A function that implements pyAudioAnalysis' silence extraction module
    and creates wav files of the participant specific portions of audio. The
    smoothing and weight parameters were tuned for the AVEC 2016 dataset.

    Parameters
    ----------
    filename : filepath
        path to the input wav file
    out_dir : filepath
        path to the desired directory (where a participant folder will
        be created containing a 'PXXX_no_silence.wav' file)
    smoothing : float
        tunable parameter to compensate for sparseness of recordings
    weight : float
        probability threshold for silence removal used in SVM
    plot : bool
        plots SVM probabilities of silence (used in tuning)

    Returns
    -------
    A folder for each participant containing a single wav file
    (named 'PXXX_no_silence.wav') with the vast majority of silence
    and virtual interviewer speech removed. Feature extraction is
    performed on these segmented wav files.
    """
    # print(filename.split('/')[-1].split('_')[0], 'filename')

    partic_id = 'P' + filename.split('/')[-1].split('_')[0].split('\\')[
        1]  # PXXX

    print(partic_id, 'partic_id')
    if is_segmentable(partic_id):
        # create participant directory for segmented wav files
        participant_dir = os.path.join(out_dir, partic_id)
        if not os.path.exists(participant_dir):
            os.makedirs(participant_dir)

        os.chdir(participant_dir)
        # print(participant_dir, 'participant_dir')

        [Fs, x] = aIO.readAudioFile(filename)
        segments = aS.silenceRemoval(x,
                                     Fs,
                                     0.020,
                                     0.020,
                                     smoothWindow=smoothing,
                                     weight=weight,
                                     plot=plot)
        # print(segments)

        for s in segments:
            # filename = partic_id + s[0] + s[1]
            # seg_name = "%.s_%.2f-%.2f.wav".format(partic_id, s[0], s[1])

            # print(s[0])
            # print(s[1])
            # seg_name = '/' + str(partic_id) + '_' + str(s[0]).replace('.', 'b') + '_' + str(s[1]).replace('.', 'b') + '.wav'
            seg_name = '/' + '_' + str(s[0]).replace('.', 'b') + '_' + str(
                s[1]).replace('.', 'b') + '.wav'
            # print(seg_name, 'seg_name')
            wavfile.write(participant_dir + seg_name, Fs,
                          x[int(Fs * s[0]):int(Fs * s[1])])

        # concatenate segmented wave files within participant directory
        concatenate_segments(participant_dir, partic_id)
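
# Usage sketch (added for illustration): the Windows-style input path and the
# output directory are placeholders chosen to match the participant-id parsing
# above; smoothing and weight follow the AVEC-tuned defaults in the docstring.
remove_silence('data\\301_AUDIO.wav', 'segmented_audio',
               smoothing=1.0, weight=0.3, plot=False)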
Example No. 20
with open(speaker_file, 'r') as data:
    speaker_features = ujson.load(data)
for i, dirname in enumerate(os.listdir(datadir)):
    if dirname == ".DS_Store":
        continue
    speaker = dirname
    for filename in os.listdir(datadir + dirname + '/audio_trimmed/pedal/'):
        if filename == ".DS_Store":
            continue
        if "lie" in filename:
            #labels.append((filename, 1))
            labels[filename] = 1
        else:
            labels[filename] = 0
        [Fs,
         x] = audioBasicIO.readAudioFile(datadir + dirname +
                                         '/audio_trimmed/pedal/' + filename)
        #we might want to play with the timeframe here - as it is this is giving us up to ~1.5k frames for our sequences
        speaker_feat = speaker_features[dirname]
        st_features = audioFeatureExtraction.stFeatureExtraction(
            x, Fs, frame_size * Fs, frame_stepsize * Fs)
        num_features, num_windows = st_features.shape
        new_features = np.zeros((num_features, num_windows))
        for i in range(num_features):
            new_features[i] = (st_features[i] -
                               speaker_feat[i]) / speaker_feat[i]
        st_features = np.concatenate((st_features, new_features))
        features[filename] = st_features.tolist()
        total += 1
    print(i)
print(total)
with open('labels_{}_{}.json'.format(frame_size, frame_stepsize),
Example No. 21
    return x[15:17]

def getThird(val):
    return val[2]

os.chdir('C:/Users/konst_000/Desktop/Σχολή/6ο Εξάμηνο/ΨΕΣ/Speech Emotion Recognition/Audio Database/Complete')
fileList = os.listdir('C:/Users/konst_000/Desktop/Σχολή/6ο Εξάμηνο/ΨΕΣ/Speech Emotion Recognition/Audio Database/Complete')
featureList = [] #list of lists used to store the extracted features of each training sample
labelList = []   #list of strings used to store the labels(emotions) for each training sample
speakerList = [] #list of strings used to store the speaker identity

for f in fileList:
    label = getEmotionLabel(f)
    if (label != '02' and label != '03' and label != '04' and label != '05' and label != '06'):
        continue
    [Fs, sample] = audioBasicIO.readAudioFile(f)
    sample = audioBasicIO.stereo2mono(sample) #feature extraction can be performed only on mono signals
    speaker = getSpeakerLabel(f)
    features = emoFeatExtract(sample, Fs, 0.050*Fs, 0.025*Fs)
    featureList.append(features)
    labelList.append(label)
    speakerList.append(speaker)

final = []

for i in range(len(featureList)):
    l = [featureList[i]]
    l.append(labelList[i])
    l.append(speakerList[i])
    final.append(l)
Example No. 22
# this list of wav files is consistent with labels
# checked with == operator (data_id == files_id)
#files = [os.path.basename(x) for x in glob.glob(os.path.join(data_path + './session?/*/?/', '*.wav'))]
files = glob.glob(os.path.join(data_path + './session?/*/?/', '*.wav'))
files.sort(key=lambda x: x[-30:])

feat_train = []
feat_test = []
hfs_train = []
hfs_test = []

for f in files:
    if int(ntpath.basename(f)[18]) in range(1, 6):
        print("Process..., ", f)
        [Fs, x] = audioBasicIO.readAudioFile(f)
        F, f_names = audioFeatureExtraction.stFeatureExtraction(
            x, Fs, 0.025 * Fs, 0.010 * Fs)
        mean_train = np.mean(F, axis=1)
        std_train = np.std(F, axis=1)
        feat_hfs_train = np.hstack([mean_train, std_train])
        hfs_train.append(feat_hfs_train)
        feat_train.append(F.transpose())

    elif int(ntpath.basename(f)[18]) == 6:
        print("Process..., ", f)
        [Fs, x] = audioBasicIO.readAudioFile(f)
        F, f_names = audioFeatureExtraction.stFeatureExtraction(
            x, Fs, 0.025 * Fs, 0.010 * Fs)
        mean_test = np.mean(F, axis=1)
        std_test = np.std(F, axis=1)
Example No. 23
    # Sanity cleaning to remove empty strings
    files = [f for f in files if f]
    return files




data_set = []
for file in os.listdir("training_dataset/unhappy"):
    temp = []
    mean_value = []
    if file.endswith(".mp3"):
        #print "training_dataset/unhappy/"+file
        sound=AudioSegment.from_mp3("training_dataset/unhappy/"+file)
        sound.export("test.wav",format="wav")
        [Fs, x] = audioBasicIO.readAudioFile("test.wav");
        F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05*Fs, 0.025*Fs);
        for i in range(len(F)):
            temp.append(numpy.mean(F[i]))
        mean_value.append(temp)
        mean_value.append(1)
        data_set.append(mean_value)
for file in os.listdir("training_dataset/happy"):
    temp = []
    mean_value = []
    if file.endswith(".mp3"):
        #print "training_dataset/happy/"+file
        sound=AudioSegment.from_mp3("training_dataset/happy/"+file)
        sound.export("test.wav",format="wav")
        [Fs, x] = audioBasicIO.readAudioFile("test.wav");
        F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05*Fs, 0.025*Fs);
Example No. 24
def speakerDiarization(fileName, sRange = xrange(2, 10), mtSize = 2.0, mtStep = 0.2, stWin = 0.05, LDAdim = 35):
	Fs, x = audioBasicIO.readAudioFile(fileName)
	x = audioBasicIO.stereo2mono(x)
	duration = len(x) / Fs

	Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1 = aT.loadKNNModel(os.path.join('/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data', 'knnSpeakerAll'))
	Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2 = aT.loadKNNModel(os.path.join('/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data', 'knnSpeakerFemaleMale'))

	MidTermFeatures, ShortTermFeatures = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stWin * 0.5))
	MidTermFeatures2 = numpy.zeros((MidTermFeatures.shape[0] + len(classNames1) + len(classNames2), MidTermFeatures.shape[1]))

	for i in range(MidTermFeatures.shape[1]):
		curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
		curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2

		Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1)
		Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2)

		MidTermFeatures2[0: MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
		MidTermFeatures2[MidTermFeatures.shape[0]: MidTermFeatures.shape[0] + len(classNames1), i] = P1 + 0.0001
		MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1):, i] = P2 + 0.0001

	MidTermFeatures = MidTermFeatures2
	iFeaturesSelect = range(8, 21) + range(41, 54)
	MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]

	MidTermFeaturesNorm, MEAN, STD = aT.normalizeFeatures([MidTermFeatures.T])
	MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
	numOfWindows = MidTermFeatures.shape[1]

	DistancesAll = numpy.sum(distance.squareform(distance.pdist(MidTermFeaturesNorm.T)), axis = 0)
	MDistancesAll = numpy.mean(DistancesAll)
	iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]

	perOutLier = (100.0 * (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows
	MidTermFeaturesNormOr = MidTermFeaturesNorm
	MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

	if LDAdim > 0:
		mtWinRatio, mtStepRatio, mtFeaturesToReduce, numOfFeatures, numOfStatistics = int(round(mtSize / stWin)), int(round(stWin / stWin)), list(), len(ShortTermFeatures), 2
		for i in range(numOfStatistics * numOfFeatures): mtFeaturesToReduce.append(list())

		for i in range(numOfFeatures):
			curPos = 0
			N = len(ShortTermFeatures[i])
			while (curPos < N):
				N1, N2 = curPos, curPos + mtWinRatio
				if N2 > N: N2 = N
				curStFeatures = ShortTermFeatures[i][N1: N2]
				mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
				mtFeaturesToReduce[i + numOfFeatures].append(numpy.std(curStFeatures))
				curPos += mtStepRatio

		mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
		mtFeaturesToReduce2 = numpy.zeros((mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2), mtFeaturesToReduce.shape[1]))
		for i in range(mtFeaturesToReduce.shape[1]):
			curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
			curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
			Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1)
			Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2)
			mtFeaturesToReduce2[0: mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i]
			mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]: mtFeaturesToReduce.shape[0] + len(classNames1), i] = P1 + 0.0001
			mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] + len(classNames1):, i] = P2 + 0.0001

		mtFeaturesToReduce = mtFeaturesToReduce2
		mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
		mtFeaturesToReduce, MEAN, STD = aT.normalizeFeatures([mtFeaturesToReduce.T])
		mtFeaturesToReduce = mtFeaturesToReduce[0].T
	
		Labels = numpy.zeros((mtFeaturesToReduce.shape[1], ))
		LDAstep = 1.0
		LDAstepRatio = LDAstep / stWin

		for i in range(Labels.shape[0]): Labels[i] = int(i * stWin / LDAstepRatio)
		clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(n_components = LDAdim)
		clf.fit(mtFeaturesToReduce.T, Labels)

		MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

	clsAll, silAll, centersAll = list(), list(), list()

	for iSpeakers in sRange:
		k_means = sklearn.cluster.KMeans(n_clusters = iSpeakers)
		k_means.fit(MidTermFeaturesNorm.T)
		cls = k_means.labels_
		means = k_means.cluster_centers_

		clsAll.append(cls)
		centersAll.append(means)
		silA, silB = list(), list()
		for c in range(iSpeakers):
			clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float(len(cls))
			if clusterPerCent < 0.02:
				silA.append(0.0)
				silB.append(0.0)
			else:
				MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]
				Yt = distance.pdist(MidTermFeaturesNormTemp.T)
				silA.append(numpy.mean(Yt) * clusterPerCent)
				silBs = list()
				for c2 in range(iSpeakers):
					if c2 != c:
						clusterPerCent2 = numpy.nonzero(cls == c2)[0].shape[0] / float(len(cls))
						MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:, cls == c2]
						Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T)
						silBs.append(numpy.mean(Yt) * (clusterPerCent+clusterPerCent2) / 2.0)
				silBs = numpy.array(silBs)
				silB.append(min(silBs))
		silA, silB, sil = numpy.array(silA), numpy.array(silB), list()
		for c in range(iSpeakers): sil.append((silB[c] - silA[c]) / (max(silB[c],  silA[c]) + 0.00001))
		silAll.append(numpy.mean(sil))

	imax = numpy.argmax(silAll)
	nSpeakersFinal = sRange[imax]

	cls = numpy.zeros((numOfWindows, ))
	for i in range(numOfWindows):
		j = numpy.argmin(numpy.abs(i - iNonOutLiers))
		cls[i] = clsAll[imax][j]

	startprob, transmat, means, cov = trainHMM(MidTermFeaturesNormOr, cls)
	hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], 'diag')
	hmm.startprob_ = startprob
	hmm.transmat_ = transmat
	hmm.means_ = means
	hmm.covars_ = cov
	cls = hmm.predict(MidTermFeaturesNormOr.T)
	cls = scipy.signal.medfilt(cls, 13)
	cls = scipy.signal.medfilt(cls, 11)

	sil = silAll[imax]
	classNames = ['SPEAKER{0:d}'.format(c) for c in range(nSpeakersFinal)]

	return cls, classNames, duration, mtStep, silAll
import operator
import wave
import numpy as np
from pyAudioAnalysis import audioFeatureExtraction as aF
from pyAudioAnalysis import audioTrainTest as aT
from pyAudioAnalysis import audioSegmentation as aS
from pyAudioAnalysis import audioVisualization as aV
from pyAudioAnalysis import audioBasicIO


if __name__ == '__main__':
	#csv and wav file as argument
	csvFileName = sys.argv[1]
	wavFileName = sys.argv[2]

	Fs, x = audioBasicIO.readAudioFile(wavFileName)
	annotations = []
	silence = []
	folderName = None
	fileCounter = 0
	start, end = 0, 0

	#duration of wavFile
	spf = wave.open(wavFileName,'r')
	#Get wavFile duration
	frames = spf.getnframes()
	rate = spf.getframerate()
	duration = frames / float(rate)
	#duration = int((duration))

Example No. 26
def speakerDiarization(fileName,
                       sRange=xrange(2, 10),
                       mtSize=2.0,
                       mtStep=0.2,
                       stWin=0.05,
                       LDAdim=35):
    Fs, x = audioBasicIO.readAudioFile(fileName)
    x = audioBasicIO.stereo2mono(x)
    duration = len(x) / Fs

    Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1 = aT.loadKNNModel(
        os.path.join(
            '/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data',
            'knnSpeakerAll'))
    Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2 = aT.loadKNNModel(
        os.path.join(
            '/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data',
            'knnSpeakerFemaleMale'))

    MidTermFeatures, ShortTermFeatures = aF.mtFeatureExtraction(
        x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin),
        round(Fs * stWin * 0.5))
    MidTermFeatures2 = numpy.zeros(
        (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2),
         MidTermFeatures.shape[1]))

    for i in range(MidTermFeatures.shape[1]):
        curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
        curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2

        Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1)
        Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2)

        MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
        MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0] +
                         len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1):,
                         i] = P2 + 0.0001

    MidTermFeatures = MidTermFeatures2
    iFeaturesSelect = range(8, 21) + range(41, 54)
    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]

    MidTermFeaturesNorm, MEAN, STD = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]

    DistancesAll = numpy.sum(distance.squareform(
        distance.pdist(MidTermFeaturesNorm.T)),
                             axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]

    perOutLier = (100.0 *
                  (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows
    MidTermFeaturesNormOr = MidTermFeaturesNorm
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

    if LDAdim > 0:
        mtWinRatio, mtStepRatio, mtFeaturesToReduce, numOfFeatures, numOfStatistics = int(
            round(mtSize / stWin)), int(round(
                stWin / stWin)), list(), len(ShortTermFeatures), 2
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append(list())

        for i in range(numOfFeatures):
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos < N):
                N1, N2 = curPos, curPos + mtWinRatio
                if N2 > N: N2 = N
                curStFeatures = ShortTermFeatures[i][N1:N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i + numOfFeatures].append(
                    numpy.std(curStFeatures))
                curPos += mtStepRatio

        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
        mtFeaturesToReduce2 = numpy.zeros(
            (mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2),
             mtFeaturesToReduce.shape[1]))
        for i in range(mtFeaturesToReduce.shape[1]):
            curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
            curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
            Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1)
            Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2)
            mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0],
                                i] = mtFeaturesToReduce[:, i]
            mtFeaturesToReduce2[
                mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0] +
                len(classNames1), i] = P1 + 0.0001
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] +
                                len(classNames1):, i] = P2 + 0.0001

        mtFeaturesToReduce = mtFeaturesToReduce2
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
        mtFeaturesToReduce, MEAN, STD = aT.normalizeFeatures(
            [mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T

        Labels = numpy.zeros((mtFeaturesToReduce.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin

        for i in range(Labels.shape[0]):
            Labels[i] = int(i * stWin / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels)

        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

    clsAll, silAll, centersAll = list(), list(), list()

    for iSpeakers in sRange:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(MidTermFeaturesNorm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        clsAll.append(cls)
        centersAll.append(means)
        silA, silB = list(), list()
        for c in range(iSpeakers):
            clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float(
                len(cls))
            if clusterPerCent < 0.02:
                silA.append(0.0)
                silB.append(0.0)
            else:
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]
                Yt = distance.pdist(MidTermFeaturesNormTemp.T)
                silA.append(numpy.mean(Yt) * clusterPerCent)
                silBs = list()
                for c2 in range(iSpeakers):
                    if c2 != c:
                        clusterPerCent2 = numpy.nonzero(
                            cls == c2)[0].shape[0] / float(len(cls))
                        MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:,
                                                                       cls ==
                                                                       c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(
                            numpy.mean(Yt) *
                            (clusterPerCent + clusterPerCent2) / 2.0)
                silBs = numpy.array(silBs)
                silB.append(min(silBs))
        silA, silB, sil = numpy.array(silA), numpy.array(silB), list()
        for c in range(iSpeakers):
            sil.append((silB[c] - silA[c]) / (max(silB[c], silA[c]) + 0.00001))
        silAll.append(numpy.mean(sil))

    imax = numpy.argmax(silAll)
    nSpeakersFinal = sRange[imax]

    cls = numpy.zeros((numOfWindows, ))
    for i in range(numOfWindows):
        j = numpy.argmin(numpy.abs(i - iNonOutLiers))
        cls[i] = clsAll[imax][j]

    startprob, transmat, means, cov = trainHMM(MidTermFeaturesNormOr, cls)
    hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], 'diag')
    hmm.startprob_ = startprob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov
    cls = hmm.predict(MidTermFeaturesNormOr.T)
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = silAll[imax]
    classNames = ['SPEAKER{0:d}'.format(c) for c in range(nSpeakersFinal)]

    return cls, classNames, duration, mtStep, silAll
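
# Usage sketch (added for illustration): "meeting.wav" is a placeholder; the
# function above searches 2-9 speakers by default and returns per-window
# cluster labels plus the silhouette scores used for model selection.
cls, classNames, duration, mtStep, silAll = speakerDiarization("meeting.wav")
print(classNames)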
Example No. 27
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
import matplotlib.pyplot as plt
[Fs, x] = audioBasicIO.readAudioFile("../audio_data/doremi.wav")
print Fs
print len(x)
#using a frame size of 50 msecs and a frame step of 25 msecs (50% overlap)
F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs, 0.025*Fs)
"""
    stFeatureExtraction
    This function implements the short-term windowing process. For each short-term window a set of features is extracted.
    This results in a sequence of feature vectors, stored in a numpy matrix.
    ARGUMENTS
        signal:       the input signal samples
        Fs:           the sampling freq (in Hz)
        Win:          the short-term window size (in samples)
        Step:         the short-term window step (in samples)
    RETURNS
        stFeatures:   a numpy array (numOfFeatures x numOfShortTermWindows)
"""
print len(F)
plt.subplot(2,1,1); plt.plot(F[0,:]); plt.xlabel('Frame no'); plt.ylabel('ZCR')
plt.subplot(2,1,2); plt.plot(F[1,:]); plt.xlabel('Frame no'); plt.ylabel('Energy'); plt.show()
Example No. 28
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
import matplotlib.pyplot as plt
[Fs, x] = audioBasicIO.readAudioFile("data/20170621_16sec.wav")
F = audioFeatureExtraction.stFeatureExtraction(x, float(Fs), float(0.1 * Fs),
                                               float(0.1 * Fs))
print(Fs)
print(x)
# plt.subplot(2,1,1); plt.plot(F[0,:]); plt.xlabel('Frame no'); plt.ylabel('ZCR');
plt.subplot(2, 1, 1)
plt.plot(F[1, :])
plt.xlabel('Frame no')
plt.ylabel('Energy')
plt.show()
Example No. 29
def speakerDiarization(fileName,
                       numOfSpeakers,
                       mtSize=2.0,
                       mtStep=0.2,
                       stWin=0.05,
                       LDAdim=35,
                       PLOT=False):
    '''
    ARGUMENTS:
        - fileName:        the name of the WAV file to be analyzed
        - numOfSpeakers    the number of speakers (clusters) in the recording (<=0 for unknown)
        - mtSize (opt)     mid-term window size
        - mtStep (opt)     mid-term window step
        - stWin  (opt)     short-term window size
        - LDAdim (opt)     LDA dimension (0 for no LDA)
        - PLOT     (opt)   0 for not plotting the results, 1 for plotting
    '''
    [Fs, x] = audioBasicIO.readAudioFile(fileName)
    x = audioBasicIO.stereo2mono(x)
    Duration = len(x) / Fs

    [
        Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1,
        stStep1, computeBEAT1
    ] = aT.loadKNNModel(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "data",
                     "knnSpeakerAll"))
    [
        Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2,
        stStep2, computeBEAT2
    ] = aT.loadKNNModel(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "data",
                     "knnSpeakerFemaleMale"))

    [MidTermFeatures,
     ShortTermFeatures] = aF.mtFeatureExtraction(x, Fs,
                                                 mtSize * Fs, mtStep * Fs,
                                                 round(Fs * stWin),
                                                 round(Fs * stWin * 0.5))

    MidTermFeatures2 = numpy.zeros(
        (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2),
         MidTermFeatures.shape[1]))

    for i in range(MidTermFeatures.shape[1]):
        curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
        curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2
        [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
        [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
        MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
        MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0] +
                         len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1)::,
                         i] = P2 + 0.0001

    MidTermFeatures = MidTermFeatures2  # TODO
    # SELECT FEATURES:
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20];                                                                                         # SET 0A
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100];                                                                                 # SET 0B
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,
    #   97,98, 99,100];     # SET 0C

    iFeaturesSelect = [
        8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45,
        46, 47, 48, 49, 50, 51, 52, 53
    ]  # SET 1A
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];                                          # SET 1B
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 1C

    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53];             # SET 2A
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];     # SET 2B
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 2C

    #iFeaturesSelect = range(100);                                                                                                    # SET 3
    #MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010

    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]

    (MidTermFeaturesNorm, MEAN,
     STD) = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]

    # remove outliers:
    DistancesAll = numpy.sum(distance.squareform(
        distance.pdist(MidTermFeaturesNorm.T)),
                             axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = numpy.min(MidTermFeatures[1,:])
    #EnergyMean = numpy.mean(MidTermFeatures[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #iNonOutLiers = numpy.nonzero(MidTermFeatures[1,:] > Thres)[0]
    #print iNonOutLiers

    perOutLier = (100.0 *
                  (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows
    MidTermFeaturesNormOr = MidTermFeaturesNorm
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

    # LDA dimensionality reduction:
    if LDAdim > 0:
        #[mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, stWin * Fs, round(Fs*stWin), round(Fs*stWin));
        # extract mid-term features with minimum step:
        mtWinRatio = int(round(mtSize / stWin))
        mtStepRatio = int(round(stWin / stWin))
        mtFeaturesToReduce = []
        numOfFeatures = len(ShortTermFeatures)
        numOfStatistics = 2
        #for i in range(numOfStatistics * numOfFeatures + 1):
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append([])

        for i in range(numOfFeatures):  # for each of the short-term features:
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mtWinRatio
                if N2 > N:
                    N2 = N
                curStFeatures = ShortTermFeatures[i][N1:N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i + numOfFeatures].append(
                    numpy.std(curStFeatures))
                curPos += mtStepRatio
        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
        mtFeaturesToReduce2 = numpy.zeros(
            (mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2),
             mtFeaturesToReduce.shape[1]))
        for i in range(mtFeaturesToReduce.shape[1]):
            curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
            curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
            [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
            [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
            mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0],
                                i] = mtFeaturesToReduce[:, i]
            mtFeaturesToReduce2[
                mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0] +
                len(classNames1), i] = P1 + 0.0001
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] +
                                len(classNames1)::, i] = P2 + 0.0001
        mtFeaturesToReduce = mtFeaturesToReduce2
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
        #mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
        (mtFeaturesToReduce, MEAN,
         STD) = aT.normalizeFeatures([mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T
        #DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
        #MDistancesAll = numpy.mean(DistancesAll)
        #iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0*MDistancesAll)[0]
        #mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]
        Labels = numpy.zeros((mtFeaturesToReduce.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin
        #print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * stWin / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels)
        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

    if numOfSpeakers <= 0:
        sRange = range(2, 10)
    else:
        sRange = [numOfSpeakers]
    clsAll = []
    silAll = []
    centersAll = []

    for iSpeakers in sRange:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(MidTermFeaturesNorm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        # Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
        clsAll.append(cls)
        centersAll.append(means)
        silA = []
        silB = []
        for c in range(iSpeakers):  # for each speaker (i.e. for each extracted cluster)
            clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float(
                len(cls))
            if clusterPerCent < 0.020:
                silA.append(0.0)
                silB.append(0.0)
            else:
                # get subset of feature vectors
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]
                # compute average distance between samples that belong to the cluster (a values)
                Yt = distance.pdist(MidTermFeaturesNormTemp.T)
                silA.append(numpy.mean(Yt) * clusterPerCent)
                silBs = []
                # compute distances from samples of other clusters
                for c2 in range(iSpeakers):
                    if c2 != c:
                        clusterPerCent2 = numpy.nonzero(
                            cls == c2)[0].shape[0] / float(len(cls))
                        MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:, cls == c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(
                            numpy.mean(Yt) *
                            (clusterPerCent + clusterPerCent2) / 2.0)
                silBs = numpy.array(silBs)
                # ... and keep the minimum value (i.e. the distance from the "nearest" cluster)
                silB.append(min(silBs))
        silA = numpy.array(silA)
        silB = numpy.array(silB)
        sil = []
        for c in range(iSpeakers):  # for each cluster (speaker)
            sil.append((silB[c] - silA[c]) /
                       (max(silB[c], silA[c]) + 0.00001))  # compute silhouette

        silAll.append(numpy.mean(sil))  # keep the AVERAGE SILHOUETTE

    #silAll = silAll * (1.0/(numpy.power(numpy.array(sRange),0.5)))
    imax = numpy.argmax(silAll)  # position of the maximum silhouette value
    nSpeakersFinal = sRange[imax]  # optimal number of clusters

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows: this is achieved by giving them the value of their nearest non-outlier window)
    cls = numpy.zeros((numOfWindows, ))
    for i in range(numOfWindows):
        j = numpy.argmin(numpy.abs(i - iNonOutLiers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):
        startprob, transmat, means, cov = trainHMM_computeStatistics(
            MidTermFeaturesNormOr, cls)
        hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0],
                                       "diag")  # hmm training
        hmm.startprob_ = startprob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(MidTermFeaturesNormOr.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = silAll[imax]  # final silhouette
    classNames = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]

    # load ground-truth if available
    gtFile = fileName.replace('.wav', '.segments')
    # check for an annotated (ground-truth) file
    if os.path.isfile(gtFile):  # if ground truth exists
        [segStart, segEnd, segLabels] = readSegmentGT(gtFile)  # read GT data
        flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels,
                                           mtStep)  # convert to flags

    if PLOT:
        fig = plt.figure()
        if numOfSpeakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(numpy.array(range(len(classNames))))
        ax1.axis((0, Duration, -1, len(classNames)))
        ax1.set_yticklabels(classNames)
        ax1.plot(numpy.array(range(len(cls))) * mtStep + mtStep / 2.0, cls)

    if os.path.isfile(gtFile):
        if PLOT:
            ax1.plot(
                numpy.array(range(len(flagsGT))) * mtStep + mtStep / 2.0,
                flagsGT, 'r')
        purityClusterMean, puritySpeakerMean = evaluateSpeakerDiarization(
            cls, flagsGT)
        print("{0:.1f}\t{1:.1f}".format(100 * purityClusterMean,
                                        100 * puritySpeakerMean))
        if PLOT:
            plt.title(
                "Cluster purity: {0:.1f}% - Speaker purity: {1:.1f}%".format(
                    100 * purityClusterMean, 100 * puritySpeakerMean))
    if PLOT:
        plt.xlabel("time (seconds)")
        #print sRange, silAll
        if numOfSpeakers <= 0:
            plt.subplot(212)
            plt.plot(sRange, silAll)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's sillouette")
        plt.show()
    return cls
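A minimal usage sketch for the diarization function above (an assumption-level example: it presumes the function is exposed as aS.speakerDiarization, as in the benchmark example further down this page, and that "diarizationExample.wav" is available locally):

from pyAudioAnalysis import audioSegmentation as aS

# 4 known speakers, no plotting; cls[i] is the speaker label assigned
# to the i-th mid-term window
cls = aS.speakerDiarization("diarizationExample.wav", 4, PLOT=False)
print(cls)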
Ejemplo n.º 30
0
def trainHMM_fromDir(dirPath, hmm_model_name, mt_win, mt_step):
    """
    This function trains an HMM model for segmentation-classification using
    a directory where WAV files and .segments (ground-truth) files are stored.
    ARGUMENTS:
     - dirPath:        the path of the data directory
     - hmm_model_name:    the name of the HMM model to be stored
     - mt_win:        mid-term window size
     - mt_step:        mid-term window step
    RETURNS:
     - hmm:            an object to the resulting HMM
     - class_names:        a list of class_names

    After training, hmm, class_names, along with the mt_win
    and mt_step values are stored in the hmm_model_name file
    """

    flags_all = np.array([])
    classes_all = []
    for i, f in enumerate(glob.glob(dirPath + os.sep + '*.wav')):
        # for each WAV file
        wav_file = f
        gt_file = f.replace('.wav', '.segments')
        if not os.path.isfile(gt_file):
            continue
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags, class_names = segs2flags(seg_start, seg_end, seg_labs, mt_step)
        for c in class_names:
            # update class names:
            if c not in classes_all:
                classes_all.append(c)
        [fs, x] = audioBasicIO.readAudioFile(wav_file)
        [F, _, _] = aF.mtFeatureExtraction(x, fs, mt_win * fs,
                                           mt_step * fs, round(fs * 0.050),
                                           round(fs * 0.050))

        lenF = F.shape[1]
        lenL = len(flags)
        min_sm = min(lenF, lenL)
        F = F[:, 0:min_sm]
        flags = flags[0:min_sm]

        flagsNew = []
        for j, fl in enumerate(flags):      # append features and labels
            flagsNew.append(classes_all.index(class_names[flags[j]]))

        flags_all = np.append(flags_all, np.array(flagsNew))

        if i == 0:
            f_all = F
        else:
            f_all = np.concatenate((f_all, F), axis=1)

    # compute HMM statistics
    start_prob, transmat, means, cov = trainHMM_computeStatistics(f_all,
                                                                  flags_all)
    # train the HMM
    hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
    hmm.startprob_ = start_prob
    hmm.transmat_ = transmat        
    hmm.means_ = means
    hmm.covars_ = cov

    fo = open(hmm_model_name, "wb")   # save HMM model
    cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(classes_all, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mt_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mt_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()

    return hmm, classes_all
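A brief usage sketch for trainHMM_fromDir (hedged: "radioSegments" is a hypothetical folder that must contain WAV files paired with .segments ground-truth files; the model name "hmmRadioSM" reuses a name that appears in a benchmark example later on this page):

# train an HMM segmenter from a folder of annotated WAV files
hmm, class_names = trainHMM_fromDir("radioSegments", "hmmRadioSM",
                                    mt_win=1.0, mt_step=1.0)
print(class_names)
# hmm, class_names, mt_win and mt_step are pickled into "hmmRadioSM"
# so an HMM-based segmenter can reload them later.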
Ejemplo n.º 31
0
        for emotion in sorted(glob.glob('train_wavdata/*')):

            #print (spct/float(total_sp))*100.0,'% completed'

            emotion_name = emotion.replace('train_wavdata/', '')

            #print emotion_name

            emotions.update({emotion_name: spct})

            all_emotion_Fs, all_emotion_data = 0, []

            for sample_file in glob.glob(emotion + '/*.wav'):

                [Fs, x] = audioBasicIO.readAudioFile(sample_file)

                if all_emotion_Fs == 0:
                    all_emotion_Fs = Fs

                if Fs == all_emotion_Fs:
                    features = extract_MFCCs(x, Fs, window * Fs,
                                             window_overlap * Fs,
                                             voiced_threshold_mul,
                                             voiced_threshold_range,
                                             calc_deltas)
                    all_emotion_data.append(features)
                else:
                    print(sample_file + " skipped due to mismatch in frame rate")

            all_emotion_data = np.concatenate(all_emotion_data, 0)
Ejemplo n.º 32
0
def mtFileClassification(input_file, model_name, model_type,
                         plot_results=False, gt_file=""):
    """
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used,
    i.e. a pre-trained classifier.
    ARGUMENTS:
        - input_file:        path of the input WAV file
        - model_name:        name of the classification model
        - model_type:        svm or knn depending on the classifier type
        - plot_results:      True if results are to be plotted using
                             matplotlib along with a set of statistics

    RETURNS:
          - segs:           a sequence of segment endpoints: segs[i] is the
                            endpoint of the i-th segment (in seconds)
          - classes:        a sequence of class flags: classes[i] is the
                            class ID of the i-th segment
    """

    if not os.path.isfile(model_name):
        print("mtFileClassificationError: input model_type not found!")
        return (-1, -1, -1, -1)
    # Load classifier:
    if model_type == "knn":
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win, st_step, compute_beat] = \
            aT.load_model_knn(model_name)
    else:
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win, st_step,
         compute_beat] = aT.load_model(model_name)

    if compute_beat:
        print("Model " + model_name + " contains long-term music features "
                                     "(beat etc) and cannot be used in "
                                     "segmentation")
        return (-1, -1, -1, -1)
    [fs, x] = audioBasicIO.readAudioFile(input_file) # load input file
    if fs == -1:  # could not read file
        return (-1, -1, -1, -1)
    x = audioBasicIO.stereo2mono(x)  # convert stereo (if) to mono
    # mid-term feature extraction:
    [mt_feats, _, _] = aF.mtFeatureExtraction(x, fs, mt_win * fs,
                                                     mt_step * fs,
                                                     round(fs * st_win),
                                                     round(fs * st_step))
    flags = []
    Ps = []
    flags_ind = []
    # for each feature vector (i.e. for each fix-sized segment):
    for i in range(mt_feats.shape[1]):
        cur_fv = (mt_feats[:, i] - MEAN) / STD  # normalize current feature v
        # classify vector:
        [res, P] = aT.classifierWrapper(classifier, model_type, cur_fv)
        flags_ind.append(res)
        flags.append(class_names[int(res)])  # update class label matrix
        Ps.append(np.max(P))   # update probability matrix
    flags_ind = np.array(flags_ind)

    # 1-window smoothing
    for i in range(1, len(flags_ind) - 1):
        if flags_ind[i-1] == flags_ind[i + 1]:
            flags_ind[i] = flags_ind[i + 1]
    # convert fix-sized flags to segments and classes
    (segs, classes) = flags2segs(flags, mt_step)
    segs[-1] = len(x) / float(fs)

    # Load ground-truth:
    if os.path.isfile(gt_file):
        [seg_start_gt, seg_end_gt, seg_l_gt] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start_gt, seg_end_gt,
                                              seg_l_gt, mt_step)
        flags_ind_gt = []
        for j, fl in enumerate(flags_gt):
            # "align" labels with GT
            if class_names_gt[flags_gt[j]] in class_names:
                flags_ind_gt.append(class_names.index(class_names_gt[
                                                          flags_gt[j]]))
            else:
                flags_ind_gt.append(-1)
        flags_ind_gt = np.array(flags_ind_gt)        
        cm = np.zeros((len(class_names_gt), len(class_names_gt)))
        for i in range(min(flags_ind.shape[0], flags_ind_gt.shape[0])):
            cm[int(flags_ind_gt[i]),int(flags_ind[i])] += 1        
    else:
        cm = []
        flags_ind_gt = np.array([])
    acc = plotSegmentationResults(flags_ind, flags_ind_gt,
                                  class_names, mt_step, not plot_results)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc)  )
        return (flags_ind, class_names_gt, acc, cm)
    else:
        return (flags_ind, class_names, acc, cm)
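A short usage sketch for mtFileClassification above (hedged: "svmSM" is assumed to be a pre-trained SVM model file and "diarizationExample.wav" an input recording; both names are borrowed from a benchmark example later on this page):

flags_ind, class_names, acc, cm = mtFileClassification(
    "diarizationExample.wav", "svmSM", "svm", plot_results=False, gt_file="")
# flags_ind[i] is the predicted class index of the i-th mid-term segment;
# acc and cm are only meaningful when a .segments ground-truth file is given.
print(class_names)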
def dirWavFeatureExtraction(dirName, mt_win, mt_step, st_win, st_step,
                            compute_beat=False):
    """
    This function extracts the mid-term features of the WAVE files of a particular folder.

    The resulting feature vector is extracted by long-term averaging the mid-term features.
    Therefore ONE FEATURE VECTOR is extracted for each WAV file.

    ARGUMENTS:
        - dirName:        the path of the WAVE directory
        - mt_win, mt_step:    mid-term window and step (in seconds)
        - st_win, st_step:    short-term window and step (in seconds)
    """

    all_mt_feats = numpy.array([])
    process_times = []

    types = ('*.wav', '*.aif',  '*.aiff', '*.mp3', '*.au', '*.ogg')
    wav_file_list = []
    for files in types:
        wav_file_list.extend(glob.glob(os.path.join(dirName, files)))

    wav_file_list = sorted(wav_file_list)    
    wav_file_list2, mt_feature_names = [], []
    for i, wavFile in enumerate(wav_file_list):        
        print("Analyzing file {0:d} of "
              "{1:d}: {2:s}".format(i+1,
                                    len(wav_file_list),
                                    wavFile))
        if os.stat(wavFile).st_size == 0:
            print("   (EMPTY FILE -- SKIPPING)")
            continue        
        [fs, x] = audioBasicIO.readAudioFile(wavFile)
        if isinstance(x, int):
            continue        

        t1 = time.clock()        
        x = audioBasicIO.stereo2mono(x)
        if x.shape[0]<float(fs)/5:
            print("  (AUDIO FILE TOO SMALL - SKIPPING)")
            continue
        wav_file_list2.append(wavFile)
        if compute_beat:
            [mt_term_feats, st_features, mt_feature_names] = \
                mtFeatureExtraction(x, fs, round(mt_win * fs),
                                    round(mt_step * fs),
                                    round(fs * st_win), round(fs * st_step))
            [beat, beat_conf] = beatExtraction(st_features, st_step)
        else:
            [mt_term_feats, _, mt_feature_names] = \
                mtFeatureExtraction(x, fs, round(mt_win * fs),
                                    round(mt_step * fs),
                                    round(fs * st_win), round(fs * st_step))

        mt_term_feats = numpy.transpose(mt_term_feats)
        mt_term_feats = mt_term_feats.mean(axis=0)
        # long term averaging of mid-term statistics
        if (not numpy.isnan(mt_term_feats).any()) and \
                (not numpy.isinf(mt_term_feats).any()):
            if compute_beat:
                mt_term_feats = numpy.append(mt_term_feats, beat)
                mt_term_feats = numpy.append(mt_term_feats, beat_conf)
            if len(all_mt_feats) == 0:
                # append feature vector
                all_mt_feats = mt_term_feats
            else:
                all_mt_feats = numpy.vstack((all_mt_feats, mt_term_feats))
            t2 = time.clock()
            duration = float(len(x)) / fs
            process_times.append((t2 - t1) / duration)
    if len(process_times) > 0:
        print("Feature extraction complexity ratio: "
              "{0:.1f} x realtime".format((1.0 / numpy.mean(numpy.array(process_times)))))
    return (all_mt_feats, wav_file_list2, mt_feature_names)
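A quick usage sketch for dirWavFeatureExtraction (hedged: "music_wavs" is a hypothetical folder of audio files; one long-term averaged feature vector is produced per file):

feats, file_names, feat_names = dirWavFeatureExtraction("music_wavs",
                                                        mt_win=1.0,
                                                        mt_step=1.0,
                                                        st_win=0.050,
                                                        st_step=0.050)
# feats has one row per successfully processed file
print(feats.shape)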
Ejemplo n.º 34
0
def speakerDiarization(filename, n_speakers, mt_size=2.0, mt_step=0.2, 
                       st_win=0.05, lda_dim=35, plot_res=False):
    """
    ARGUMENTS:
        - filename:        the name of the WAV file to be analyzed
        - n_speakers       the number of speakers (clusters) in
                           the recording (<=0 for unknown)
        - mt_size (opt)    mid-term window size
        - mt_step (opt)    mid-term window step
        - st_win  (opt)    short-term window size
        - lda_dim (opt)    LDA dimension (0 for no LDA)
        - plot_res (opt)   0 for not plotting the results, 1 for plotting
    """
    [fs, x] = audioBasicIO.readAudioFile(filename)
    x = audioBasicIO.stereo2mono(x)
    duration = len(x) / fs

    [classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1,
     computeBEAT1] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     "data", "knnSpeakerAll"))
    [classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2,
     computeBEAT2] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     "data", "knnSpeakerFemaleMale"))

    [mt_feats, st_feats, _] = aF.mtFeatureExtraction(x, fs, mt_size * fs,
                                                     mt_step * fs,
                                                     round(fs * st_win),
                                                     round(fs*st_win * 0.5))

    MidTermFeatures2 = np.zeros((mt_feats.shape[0] + len(classNames1) +
                                    len(classNames2), mt_feats.shape[1]))

    for i in range(mt_feats.shape[1]):
        cur_f1 = (mt_feats[:, i] - MEAN1) / STD1
        cur_f2 = (mt_feats[:, i] - MEAN2) / STD2
        [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
        [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
        MidTermFeatures2[0:mt_feats.shape[0], i] = mt_feats[:, i]
        MidTermFeatures2[mt_feats.shape[0]:mt_feats.shape[0]+len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[mt_feats.shape[0] + len(classNames1)::, i] = P2 + 0.0001

    mt_feats = MidTermFeatures2    # TODO
    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41,
                       42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]

    mt_feats = mt_feats[iFeaturesSelect, :]

    (mt_feats_norm, MEAN, STD) = aT.normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    n_wins = mt_feats.shape[1]

    # remove outliers:
    dist_all = np.sum(distance.squareform(distance.pdist(mt_feats_norm.T)),
                         axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.2 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = np.min(mt_feats[1,:])
    #EnergyMean = np.mean(mt_feats[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #i_non_outliers = np.nonzero(mt_feats[1,:] > Thres)[0]
    #print i_non_outliers

    perOutLier = (100.0 * (n_wins - i_non_outliers.shape[0])) / n_wins
    mt_feats_norm_or = mt_feats_norm
    mt_feats_norm = mt_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        #[mt_feats_to_red, _, _] = aF.mtFeatureExtraction(x, fs, mt_size * fs,
        # st_win * fs, round(fs*st_win), round(fs*st_win));
        # extract mid-term features with minimum step:
        mt_win_ratio = int(round(mt_size / st_win))
        mt_step_ratio = int(round(st_win / st_win))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        #for i in range(num_of_stats * num_of_features + 1):
        for i in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        for i in range(num_of_features):  # for each of the short-term features:
            curPos = 0
            N = len(st_feats[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mt_win_ratio
                if N2 > N:
                    N2 = N
                curStFeatures = st_feats[i][N1:N2]
                mt_feats_to_red[i].append(np.mean(curStFeatures))
                mt_feats_to_red[i+num_of_features].append(np.std(curStFeatures))
                curPos += mt_step_ratio
        mt_feats_to_red = np.array(mt_feats_to_red)
        mt_feats_to_red_2 = np.zeros((mt_feats_to_red.shape[0] +
                                        len(classNames1) + len(classNames2),
                                         mt_feats_to_red.shape[1]))
        for i in range(mt_feats_to_red.shape[1]):
            cur_f1 = (mt_feats_to_red[:, i] - MEAN1) / STD1
            cur_f2 = (mt_feats_to_red[:, i] - MEAN2) / STD2
            [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
            [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0], i] = mt_feats_to_red[:, i]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:mt_feats_to_red.shape[0] + len(classNames1), i] = P1 + 0.0001
            mt_feats_to_red_2[mt_feats_to_red.shape[0]+len(classNames1)::, i] = P2 + 0.0001
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[iFeaturesSelect, :]
        #mt_feats_to_red += np.random.rand(mt_feats_to_red.shape[0], mt_feats_to_red.shape[1]) * 0.0000010
        (mt_feats_to_red, MEAN, STD) = aT.normalizeFeatures([mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        #dist_all = np.sum(distance.squareform(distance.pdist(mt_feats_to_red.T)), axis=0)
        #m_dist_all = np.mean(dist_all)
        #iNonOutLiers2 = np.nonzero(dist_all < 3.0*m_dist_all)[0]
        #mt_feats_to_red = mt_feats_to_red[:, iNonOutLiers2]
        Labels = np.zeros((mt_feats_to_red.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / st_win
        #print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * st_win / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, Labels)
        mt_feats_norm = (clf.transform(mt_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    clsAll = []
    sil_all = []
    centersAll = []
    
    for iSpeakers in s_range:        
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(mt_feats_norm.T)
        cls = k_means.labels_        
        means = k_means.cluster_centers_

        # Y = distance.squareform(distance.pdist(mt_feats_norm.T))
        clsAll.append(cls)
        centersAll.append(means)
        sil_1 = []; sil_2 = []
        for c in range(iSpeakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / \
                             float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mt_feats_norm[:, cls==c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                Yt = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(np.mean(Yt)*clust_per_cent)
                silBs = []
                for c2 in range(iSpeakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = np.nonzero(cls == c2)[0].shape[0] /\
                                           float(len(cls))
                        MidTermFeaturesNormTemp2 = mt_feats_norm[:, cls == c2]
                        Yt = distance.cdist(mt_feats_norm_temp.T, 
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(np.mean(Yt)*(clust_per_cent
                                                     + clust_per_cent_2)/2.0)
                silBs = np.array(silBs)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(silBs))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        sil = []
        for c in range(iSpeakers):
            # for each cluster (speaker) compute silhouette
            sil.append( ( sil_2[c] - sil_1[c]) / (max(sil_2[c],
                                                      sil_1[c]) + 0.00001))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(np.mean(sil))

    imax = np.argmax(sil_all)
    # optimal number of clusters
    nSpeakersFinal = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = np.zeros((n_wins,))
    for i in range(n_wins):
        j = np.argmin(np.abs(i-i_non_outliers))        
        cls[i] = clsAll[imax][j]
        
    # Post-process method 1: hmm smoothing
    for i in range(1):
        # hmm training
        start_prob, transmat, means, cov = \
            trainHMM_computeStatistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat            
        hmm.means_ = means; hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)                    
    
    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = sil_all[imax]
    class_names = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)];


    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if ground truth exists
    if os.path.isfile(gt_file):
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start, seg_end, seg_labs, mt_step)

    if plot_res:
        fig = plt.figure()    
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(np.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(np.array(range(len(cls)))*mt_step+mt_step/2.0, cls)

    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(np.array(range(len(flags_gt))) *
                     mt_step + mt_step / 2.0, flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
            evaluateSpeakerDiarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(100 * purity_cluster_m,
                                                        100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        #print s_range, sil_all    
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's silhouette")
        plt.show()
    return cls
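The same function can also estimate the number of speakers itself; a minimal sketch (hedged: "meeting.wav" is a hypothetical recording, and n_speakers=0 triggers the silhouette-based search over 2 to 9 clusters implemented above):

# unknown speaker count: the cluster number with the best average
# silhouette is selected automatically
cls = speakerDiarization("meeting.wav", 0, lda_dim=0, plot_res=False)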
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
import matplotlib.pyplot as plt

[Fs, x] = audioBasicIO.readAudioFile("in_Data/rattle.wav")

F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs, 0.025*Fs);
plt.subplot(2,1,1); plt.plot(F[0,:]); plt.xlabel('Frame no'); plt.ylabel('ZCR');
plt.subplot(2,1,2); plt.plot(F[1,:]); plt.xlabel('Frame no'); plt.ylabel('Energy'); plt.show()
Ejemplo n.º 36
0
model_file_path = 'Models/neural_net_model.model'
model_weigths_path = 'Models/neural_net_model.weights'

model = keras.models.load_model(model_file_path)

out_audio_for_test_path = 'output_audio_for_testing'

dir_to_test = os.path.join(os.path.dirname(os.getcwd()), out_audio_for_test_path)

file_list = os.listdir(dir_to_test)

feat_list = []

for file in file_list:
    file_path = os.path.join(dir_to_test, file)
    [Fs, x] = audioBasicIO.readAudioFile(file_path)
    F, f_names = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.200 * Fs, 0.150 * Fs)
    feat_list.append(F)

# Make the input shape (646,1) for Dense input neural networks

audio_feature_set = []
for item in feat_list:
    flat_features = []   # avoid shadowing the built-in name "list"
    for feature in item:
        for frame in feature:
            flat_features.append(frame)
    audio_feature_set.append(flat_features)

feat_list = np.array(feat_list)
audio_feature_set = np.array(audio_feature_set)
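A possible next step for the snippet above (hedged: it assumes every test file yields the same number of frames, so the flattened vectors all have the length the loaded Dense model expects):

# feed the flattened per-file feature vectors to the loaded model
X_test = np.array(audio_feature_set)   # shape: (n_files, n_flattened_features)
predictions = model.predict(X_test)
predicted_classes = np.argmax(predictions, axis=1)
for file_name, c in zip(file_list, predicted_classes):
    print("{0}: class {1}".format(file_name, c))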
	emotion_names = { all_emotions[k].replace('../Ravdess_Dataset/test_wavdata/',''):k for k in range(len(all_emotions)) }

	total_emotions=len(num_test_cases)


	confusion_matrix = np.zeros((total_emotions,total_emotions))


	for emotion in all_emotions:

		emotion_name=emotion.replace('../Ravdess_Dataset/test_wavdata/','')
		# speaker_name=speaker.replace(emotion+'/','')
		for testcasefile in glob.glob(emotion+'/*.wav'):

			[Fs, x] = audioBasicIO.readAudioFile(testcasefile)
			mfcc_features = extract_MFCCs(x,Fs,window*Fs,window_overlap*Fs,calc_deltas)
			actual_file_name = testcasefile.replace(emotion+"/",'')
			# print actual_file_name

			prosody_features = extract_prosody(actual_file_name,emotion_name)
			lpcc_features = extract_lpcc(actual_file_name,emotion_name)
			#print mfcc_features.shape,prosody_features.shape,lpcc_features.shape
			
			if mfcc_features.shape[0]==prosody_features.shape[0] and prosody_features.shape[0]==lpcc_features.shape[0]:
				pass
			else:
				min_shape=min([ mfcc_features.shape[0],prosody_features.shape[0],lpcc_features.shape[0] ])
				if mfcc_features.shape[0]!=min_shape:
					mfcc_features=mfcc_features[0:min_shape]
				if prosody_features.shape[0]!=min_shape:
Ejemplo n.º 38
0
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Tue May 22 11:13:09 2018

@author: bara
"""

from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
import matplotlib.pyplot as plt
import numpy as np

[fs, x_good] = audioBasicIO.readAudioFile("samples/good/5.wav")
x_good = x_good / (2.**15)
times = np.arange(len(x_good)) / float(fs)

plt.subplot(2, 1, 1)
plt.plot(times, x_good)
plt.xlabel('Tempo (s)')
plt.ylabel('Amplitude')
plt.show()
def getfileintoframe(file,itt,file1,file2,file3):
    waveFile = opens(file, 'rb')
    [Fs, x] = audioBasicIO.readAudioFile(file)
     
    length = waveFile.getnframes()
    # Read them into the frames array
    samples=[]
    sample1=[]
    start=0
    prev=""
    for i in range(start,itt):
        waveData = waveFile.readframes(1)
        data=struct.unpack("%ih"%1,waveData)
        sample1.append(int(data[0]))
        start=start+1
    samples = np.array(sample1)
    signal = numpy.double(samples)
    signal = signal / (2.0 ** 15)
    DC = signal.mean()
    MAX = (numpy.abs(signal)).max()
    signal = (signal - DC) / MAX
    N = len(signal)                                # total number of samples
    curPos = 0
    countFrames = 0
    nFFT = int(1500 / 2)
    X = abs(fft(signal))                                  # get fft magnitude
    X = X[0:nFFT]                                    # normalize fft
    X = X / len(X)
    prev=X
    itt=itt+itt
    count=0
    flag_check=True
    while True:
        for i in range(start,itt):
            waveData = waveFile.readframes(1)
            try:
                data=struct.unpack("%ih"%1,waveData)
            except:
                pass
            sample1.append(int(data[0]))
            start=start+1
        samples = np.array(sample1)
        signal = numpy.double(samples)
        signal = signal / (2.0 ** 15)
        DC = signal.mean()
        MAX = (numpy.abs(signal)).max()
        signal = (signal - DC) / MAX
        N = len(signal)                                # total number of samples
        curPos = 0
        countFrames = 0
        nFFT = int(1500 / 2)
        X = abs(fft(signal))                                  # get fft magnitude
        X = X[0:nFFT]                                    # normalize fft
        X = X / len(X)
        itt=itt+itt
        file1.write("Energy ")
        file1.write(str(stEnergy(X)))
        file1.write("\n")
        #print("Energy",stEnergy(X))
        file1.write("entropy ")
        file1.write(str(stEnergyEntropy(X)))
        file1.write("\n")
        #print("entropy",stEnergyEntropy(X))
        file1.write("flux ")
        file1.write(str(stSpectralFlux(X,prev)))
        file1.write("\n")
        #print("flux",stSpectralFlux(X,prev))
        file1.write("spectral_roll_off ")
        file1.write(str(stSpectralRollOff(X,stEnergy(X),Fs)))
        file1.write("\n")

        #print("spectral_roll_off",stSpectralRollOff(X,stEnergy(X),Fs))
        [nChroma, nFreqsPerChroma]=stChromaFeaturesInit(1500,Fs)
        [nChroma1, nFreqsPerChroma1]=stChromaFeatures(X, Fs, nChroma, nFreqsPerChroma)
        if(flag_check):
            for each in nChroma1:
                file2.write(each+" ")
            flag_check=False
        file2.write("\n")
        for each in nFreqsPerChroma1:
            for each1 in each:
                #try:
                str1=str(each1).replace('[',' ')
                str1=str(str1).replace(']',' ')
                str1=str1.split(' ')
                for each2 in str1:
                    if(each2!=" "):
                        try:
                           value=float(each2)
                           file2.write(str(value)+" ")
                        except:
                            pass
                #except:
                #    print("error")
        file2.write("\n")
        #print("Chroma Feature",stChromaFeatures(X, Fs, nChroma, nFreqsPerChroma))
        #print("Chromagram",stChromagram(samples, Fs, itt, count, True))
        #print("Zero Crossing",stHarmonic(X, Fs))
        #[fbank, freqs]=mfccInitFilterBanks(Fs,itt)
        #print("MFCC",stMFCC(X,fbank,freqs))
        
        count=count+1
        if  count>=10:
            break
        prev=X
    for each in mfccInitFilterBanks(Fs,1500):
        for each1 in each:
            
            try:
                for each2 in each1:
                    if(each2!='[' or each2!=']'):
                        if(float(each2!=0)):
                            file3.write(str(each2)+" ")
                            file3.write("\n")
            except:
                if(float(each1!=0)):
                            file3.write(str(each1)+" ")
            file3.write("\n")
Ejemplo n.º 40
0
def main(argv):
    if argv[2] == 'full':
        dirName = argv[1]
        types = ('*.wav', )
        filesList = []
        for files in types:
            filesList.extend(glob.glob(os.path.join(dirName, files)))
        filesList = sorted(filesList)

        filesListIrr = []

        filesListIrr = sorted(filesListIrr)

        stWin = 0.020
        stStep = 0.015
        for f in filesList:
            [Fs, x] = audioBasicIO.readAudioFile(f)
            x = audioBasicIO.stereo2mono(x)
            createSpectrogramFile(x, Fs, f.replace(".wav", ".png"), stWin,
                                  stStep)

    else:
        dirName = argv[1]
        dirNameIrrelevant = argv[2]
        types = ('*.wav', )
        filesList = []
        for files in types:
            filesList.extend(glob.glob(os.path.join(dirName, files)))
        filesList = sorted(filesList)

        filesListIrr = []
        for files in types:
            filesListIrr.extend(
                glob.glob(os.path.join(dirNameIrrelevant, files)))
        filesListIrr = sorted(filesListIrr)
        print(filesListIrr)

        WIDTH_SEC = 1.5
        stWin = 0.040
        stStep = 0.005
        WIDTH = WIDTH_SEC / stStep

        for f in filesList:
            print(f)
            [Fs, x] = audioBasicIO.readAudioFile(f)
            x = audioBasicIO.stereo2mono(x)
            x = x.astype(float) / x.max()
            for i in range(3):
                if x.shape[0] > WIDTH_SEC * Fs + 200:
                    randStartSignal = random.randrange(
                        0, int(x.shape[0] - WIDTH_SEC * Fs - 200))
                    x2 = x[randStartSignal:randStartSignal +
                           int((WIDTH_SEC + stStep) * Fs)]
                    createSpectrogramFile(x2, Fs, f.replace(".wav", ".png"),
                                          stWin, stStep)  # ORIGINAL

                    if len(dirNameIrrelevant) > 0:
                        # AUGMENTED
                        randIrrelevant = random.randrange(0, len(filesListIrr))
                        [Fs, xnoise] = audioBasicIO.readAudioFile(
                            filesListIrr[randIrrelevant])
                        xnoise = xnoise.astype(float) / xnoise.max()

                        # mix the original segment with a randomly chosen noise
                        # excerpt at several signal-to-noise ratios R; the file
                        # suffix 1..6 follows the original order R = 5,4,3,6,2,1
                        for iNoise, R in enumerate([5, 4, 3, 6, 2, 1], 1):
                            randStartNoise = random.randrange(
                                0, xnoise.shape[0] - WIDTH_SEC * Fs - 200)
                            xN = (R * x2.astype(float) +
                                  xnoise[randStartNoise:randStartNoise +
                                         x2.shape[0]].astype(float)) / float(R + 1)
                            wavfile.write(
                                f.replace(".wav",
                                          "_rnoise{0:d}{1:d}.wav".format(i, iNoise)),
                                Fs, (16000 * xN).astype('int16'))
                            #createSpectrogramFile(xN, Fs, f.replace(".wav", "_rnoise{0:d}{1:d}.png".format(i, iNoise)), stWin, stStep)

                        #specgramOr, TimeAxis, FreqAxis = aF.stSpectogram(x2, Fs, round(Fs * stWin), round(Fs * stStep), False)
                        #im2 = Image.fromarray(numpy.uint8(matplotlib.cm.jet(specgram)*255))
                        #plt.subplot(2,1,1)
                        #plt.imshow(im1)
                        #plt.subplot(2,1,2)
                        #plt.imshow(im2)
                        #plt.show()
                        '''
def main(argv):
	if argv[1] == "-shortTerm":
		for i in range(nExp):
			[Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav");
			duration = x.shape[0] / float(Fs)
			t1 = time.clock()
			F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs, 0.050*Fs);
			t2 = time.clock()
			perTime1 =  duration / (t2-t1); print "short-term feature extraction: {0:.1f} x realtime".format(perTime1)
	elif argv[1] == "-classifyFile":
		for i in range(nExp):
			[Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav");
			duration = x.shape[0] / float(Fs)		
			t1 = time.clock()
			aT.fileClassification("diarizationExample.wav", "svmSM","svm")
			t2 = time.clock()
			perTime1 =  duration / (t2-t1); print "Mid-term feature extraction + classification \t {0:.1f} x realtime".format(perTime1)
	elif argv[1] == "-mtClassify":
		for i in range(nExp):
			[Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav");
			duration = x.shape[0] / float(Fs)		
			t1 = time.clock()
			[flagsInd, classesAll, acc] = aS.mtFileClassification("diarizationExample.wav", "svmSM", "svm", False, '')
			t2 = time.clock()
			perTime1 =  duration / (t2-t1); print "Fix-sized classification - segmentation \t {0:.1f} x realtime".format(perTime1)
	elif argv[1] == "-hmmSegmentation":
		for i in range(nExp):
			[Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav");
			duration = x.shape[0] / float(Fs)		
			t1 = time.clock()
			aS.hmmSegmentation('diarizationExample.wav', 'hmmRadioSM', False, '')             
			t2 = time.clock()
			perTime1 =  duration / (t2-t1); print "HMM-based classification - segmentation \t {0:.1f} x realtime".format(perTime1)
	elif argv[1] == "-silenceRemoval":
		for i in range(nExp):
			[Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav");
			duration = x.shape[0] / float(Fs)				
			t1 = time.clock()
			[Fs, x] = audioBasicIO.readAudioFile("diarizationExample.wav");
			segments = aS.silenceRemoval(x, Fs, 0.050, 0.050, smoothWindow = 1.0, Weight = 0.3, plot = False)
			t2 = time.clock()
			perTime1 =  duration / (t2-t1); print "Silence removal \t {0:.1f} x realtime".format(perTime1)
	elif argv[1] == "-thumbnailing":
		for i in range(nExp):
			[Fs1, x1] = audioBasicIO.readAudioFile("scottish.wav")
			duration1 = x1.shape[0] / float(Fs1)		
			t1 = time.clock()
			[A1, A2, B1, B2, Smatrix] = aS.musicThumbnailing(x1, Fs1, 1.0, 1.0, 15.0)	# find thumbnail endpoints			
			t2 = time.clock()
			perTime1 =  duration1 / (t2-t1); print "Thumbnail \t {0:.1f} x realtime".format(perTime1)
	elif argv[1] == "-diarization-noLDA":
		for i in range(nExp):
			[Fs1, x1] = audioBasicIO.readAudioFile("diarizationExample.wav")
			duration1 = x1.shape[0] / float(Fs1)		
			t1 = time.clock()		
			aS.speakerDiarization("diarizationExample.wav", 4, LDAdim = 0, PLOT = False)
			t2 = time.clock()
			perTime1 =  duration1 / (t2-t1); print "Diarization \t {0:.1f} x realtime".format(perTime1)
	elif argv[1] == "-diarization-LDA":
		for i in range(nExp):
			[Fs1, x1] = audioBasicIO.readAudioFile("diarizationExample.wav")
			duration1 = x1.shape[0] / float(Fs1)		
			t1 = time.clock()		
			aS.speakerDiarization("diarizationExample.wav", 4, PLOT = False)
			t2 = time.clock()
			perTime1 =  duration1 / (t2-t1); print "Diarization \t {0:.1f} x realtime".format(perTime1)
Ejemplo n.º 43
0
def train_classifier():
    data_set = []
    for file in os.listdir("training_dataset/unhappy"):
        temp = []
        mean_value = []
        if file.endswith(".mp3"):
            # print "training_dataset/unhappy/"+file
            sound = AudioSegment.from_mp3("training_dataset/unhappy/" + file)
            sound.export("test.wav", format="wav")
            [Fs, x] = audioBasicIO.readAudioFile("test.wav")
            F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05 * Fs, 0.025 * Fs)
            for i in range(len(F)):
                temp.append(numpy.mean(F[i]))
            mean_value.append(temp)
            mean_value.append(1)
            data_set.append(mean_value)
    for file in os.listdir("training_dataset/happy"):
        temp = []
        mean_value = []
        if file.endswith(".mp3"):
            # print "training_dataset/happy/"+file
            sound = AudioSegment.from_mp3("training_dataset/happy/" + file)
            sound.export("test.wav", format="wav")
            [Fs, x] = audioBasicIO.readAudioFile("test.wav")
            F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05 * Fs, 0.025 * Fs)
            for i in range(len(F)):
                temp.append(numpy.mean(F[i]))
            mean_value.append(temp)
            mean_value.append(2)
            data_set.append(mean_value)
    for file in os.listdir("training_dataset/angry"):
        temp = []
        mean_value = []
        if file.endswith(".mp3"):
            # print "training_dataset/angry/"+file
            sound = AudioSegment.from_mp3("training_dataset/angry/" + file)
            sound.export("test.wav", format="wav")
            [Fs, x] = audioBasicIO.readAudioFile("test.wav")
            F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05 * Fs, 0.025 * Fs)
            for i in range(len(F)):
                temp.append(numpy.mean(F[i]))
            mean_value.append(temp)
            mean_value.append(3)
            data_set.append(mean_value)
    for file in os.listdir("training_dataset/neutral"):
        temp = []
        mean_value = []
        if file.endswith(".mp3"):
            # print "training_dataset/neutral/"+file
            sound = AudioSegment.from_mp3("training_dataset/neutral/" + file)
            sound.export("test.wav", format="wav")
            [Fs, x] = audioBasicIO.readAudioFile("test.wav")
            F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05 * Fs, 0.025 * Fs)
            for i in range(len(F)):
                temp.append(numpy.mean(F[i]))
            mean_value.append(temp)
            mean_value.append(4)
            data_set.append(mean_value)

    x = []
    y = []
    for i in range(len(data_set)):
        x.append(data_set[i][0])
        y.append(data_set[i][1])

    # note: recent scikit-learn versions require min_samples_split >= 2
    clf = RandomForestClassifier(n_estimators=30, max_features=6,
                                 max_depth=None, min_samples_split=2,
                                 bootstrap=True)
    clf = clf.fit(x, y)
    f2 = open("classifier.pickle", "wb")
    pickle.dump(clf, f2)
    f2.close()
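A short sketch of how the pickled classifier could be applied to a new recording (hedged: "sample.wav" is a hypothetical test file; the feature averaging mirrors the per-file averaging done in the training loop above):

import pickle
import numpy
from pyAudioAnalysis import audioBasicIO, audioFeatureExtraction

with open("classifier.pickle", "rb") as f:
    clf = pickle.load(f)
[Fs, x] = audioBasicIO.readAudioFile("sample.wav")
F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05 * Fs, 0.025 * Fs)
mean_features = [numpy.mean(F[i]) for i in range(len(F))]
# labels used during training: 1=unhappy, 2=happy, 3=angry, 4=neutral
print(clf.predict([mean_features]))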
Ejemplo n.º 44
0
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
import matplotlib.pyplot as plt


[Fs, x] = audioBasicIO.readAudioFile("happy.wav")
# Fs: the sampling frequency (in Hz)
# x:  the raw audio samples


F, f_names = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs, 0.025*Fs)
# stFeatureExtraction(signal, fs, win, step):
# signal:       the input signal samples
# fs:           the sampling freq (in Hz)
# win:          the short-term window size (in samples)
# step:         the short-term window step (in samples)
'''
here,
window size = 0.050*Fs = 0.050*16000 = 800 samples
step size   = 0.025*Fs = 0.025*16000 = 400 samples

for a signal of 23776 samples the number of short-term frames is
int((23776 - 800) / 400) + 1 = 57 + 1 = 58

as below, F.shape = (34, 58)
'''
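The frame count reasoned about in the comment can be checked in a couple of lines; a minimal sketch using the quoted numbers and one common framing convention that reproduces the count:

signal_length = 23776   # samples
win = 800               # 0.050 s * 16000 Hz
step = 400              # 0.025 s * 16000 Hz

n_frames = int((signal_length - win) / step) + 1
print(n_frames)   # 58, consistent with F.shape == (34, 58)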




Ejemplo n.º 45
0
def main(path):
    ds = Dataset(path)
    loader = Loader(path + "/train/", 32, 16)
    X = []
    y = []
    Z = []
    ii = 0
    for p in ds.trainTracks():
        f = p.split("/")
        name = f[len(f) - 1]
        labelTeller = loader.loadLabelsForSoundfile(name)
        [Fs, x] = audioBasicIO.readAudioFile(p)
        F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.032 * Fs, 0.016 * Fs)
        G = list(zip(*F))
        N = 0
        if len(G) > labelTeller.tellNoOfAllBlocks():
            N = labelTeller.tellNoOfAllBlocks()
        else:
            N = len(G)

        for i in range(N):
            Z.append([G[i], labelTeller.tell(i)])

        # i = 0
        # for w in ds.windows(x,44100, 1410, 705):
        #    mf = mfcc(w)
        # row = [i]
        #    Z.append([mf[0],labelTeller.tell(i)])
        #    i = i+1

        print(p + " " + str(ii) + "/61")
        ii = ii + 1

    print("shuffle")
    random.shuffle(Z)
    Z = list(zip(*Z))

    NN = 20000
    L = NN
    R = NN
    FINAL = [[], []]
    for i in range(len(Z[0])):
        if Z[1][i] == "sing" and L > 0:
            L = L - 1
            FINAL[0].append(Z[0][i])
            FINAL[1].append(Z[1][i])

        if Z[1][i] == "nosing" and R > 0:
            R = R - 1
            FINAL[0].append(Z[0][i])
            FINAL[1].append(Z[1][i])

    clf = svm.SVC(cache_size=2000)
    print "######### " + str(len(Z[0]))
    clf.fit(FINAL[0], FINAL[1])
    loader = Loader(path + "/test/", 32, 16)

    print "Loading test"
    for p in ds.validationTracks():
        X = []
        y = []
        f = p.split("/")
        name = f[len(f) - 1]
        labelTeller = loader.loadLabelsForSoundfile(name)
        i = 0

        [Fs, x] = audioBasicIO.readAudioFile(p)
        F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.032 * Fs, 0.016 * Fs)
        G = list(zip(*F))
        N = 0
        if len(G) > labelTeller.tellNoOfAllBlocks():
            N = labelTeller.tellNoOfAllBlocks()
        else:
            N = len(G)

        for i in range(N):
            X.append(G[i])
            y.append(labelTeller.tell(i))

        print "Starting prediction " + p

        Y = clf.predict(X)
        ok = 0
        al = 0
        for i in range(len(y)):
            if y[i] == Y[i]:
                ok = ok + 1
            al = al + 1

        print(ok / float(al))
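For reference, main() above expects its argument to point at a dataset directory containing train/ and test/ subfolders (those are the two Loader calls); a hypothetical invocation could be:

# Hypothetical invocation; the dataset path is a placeholder, and the
# train/ and test/ layout is implied by the Loader calls inside main().
if __name__ == "__main__":
    main("/path/to/singing_dataset")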
Ejemplo n.º 46
0
def mtFileClassification(inputFile,
                         modelName,
                         modelType,
                         plotResults=False,
                         gtFile=""):
    '''
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used, i.e. a pre-trained classifier.
    ARGUMENTS:
        - inputFile:        path of the input WAV file
        - modelName:        name of the classification model
        - modelType:        svm or knn depending on the classifier type
        - plotResults:      True if results are to be plotted using matplotlib along with a set of statistics
        - gtFile:           optional path of a ground-truth segment file, used to compute accuracy and a confusion matrix

    RETURNS:
          - flagsInd:       numpy array of class indices, one per mid-term segment
          - classNames:     list of class names (the ground-truth names if a gtFile is provided)
          - acc:            overall accuracy against the ground truth (negative if no ground truth is available)
          - CM:             confusion matrix (empty if no ground truth is given)
    '''

    if not os.path.isfile(modelName):
        print("mtFileClassificationError: input modelType not found!")
        return (-1, -1, -1, -1)
    # Load classifier:
    if modelType == "knn":
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep, computeBEAT] = \
            aT.load_model_knn(modelName)
    else:
        [
            Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            computeBEAT
        ] = aT.load_model(modelName)

    if computeBEAT:
        print("Model " + modelName + " contains long-term music features "
              "(beat etc) and cannot be used in "
              "segmentation")
        return (-1, -1, -1, -1)
    [Fs, x] = audioBasicIO.readAudioFile(inputFile)  # load input file
    if Fs == -1:  # could not read file
        return (-1, -1, -1, -1)
    x = audioBasicIO.stereo2mono(x)  # convert stereo (if) to mono
    Duration = len(x) / Fs
    # mid-term feature extraction:
    [MidTermFeatures, _] = aF.mtFeatureExtraction(x, Fs,
                                                  mtWin * Fs, mtStep * Fs,
                                                  round(Fs * stWin),
                                                  round(Fs * stStep))
    flags = []
    Ps = []
    flagsInd = []
    for i in range(
            MidTermFeatures.shape[1]
    ):  # for each feature vector (i.e. for each fix-sized segment):
        curFV = (MidTermFeatures[:, i] -
                 MEAN) / STD  # normalize current feature vector
        [Result, P] = aT.classifierWrapper(Classifier, modelType,
                                           curFV)  # classify vector
        flagsInd.append(Result)
        flags.append(classNames[int(Result)])  # update class label matrix
        Ps.append(numpy.max(P))  # update probability matrix
    flagsInd = numpy.array(flagsInd)

    # 1-window smoothing
    for i in range(1, len(flagsInd) - 1):
        if flagsInd[i - 1] == flagsInd[i + 1]:
            flagsInd[i] = flagsInd[i + 1]
    (segs, classes) = flags2segs(
        flags, mtStep)  # convert fix-sized flags to segments and classes
    segs[-1] = len(x) / float(Fs)

    # Load ground-truth:
    if os.path.isfile(gtFile):
        [segStartGT, segEndGT, segLabelsGT] = readSegmentGT(gtFile)
        flagsGT, classNamesGT = segs2flags(segStartGT, segEndGT, segLabelsGT,
                                           mtStep)
        flagsIndGT = []
        for j, fl in enumerate(flagsGT):  # "align" labels with GT
            if classNamesGT[flagsGT[j]] in classNames:
                flagsIndGT.append(classNames.index(classNamesGT[flagsGT[j]]))
            else:
                flagsIndGT.append(-1)
        flagsIndGT = numpy.array(flagsIndGT)
        CM = numpy.zeros((len(classNamesGT), len(classNamesGT)))
        for i in range(min(flagsInd.shape[0], flagsIndGT.shape[0])):
            CM[int(flagsIndGT[i]), int(flagsInd[i])] += 1
    else:
        CM = []
        flagsIndGT = numpy.array([])
    acc = plotSegmentationResults(flagsInd, flagsIndGT, classNames, mtStep,
                                  not plotResults)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc))
        return (flagsInd, classNamesGT, acc, CM)
    else:
        return (flagsInd, classNames, acc, CM)
Ejemplo n.º 47
0
def fileClassification(inputFile, modelName, modelType):
    # Load classifier:

    if not os.path.isfile(modelName):
        print("fileClassification: input modelName not found!")
        return (-1, -1, -1)

    if not os.path.isfile(inputFile):
        print("fileClassification: wav file not found!")
        return (-1, -1, -1)

    if (modelType) == 'svm' or (modelType == 'svm_rbf'):
        [
            Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            computeBEAT
        ] = loadSVModel(modelName)
    elif modelType == 'knn':
        [
            Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            computeBEAT
        ] = loadKNNModel(modelName)
    elif modelType == 'randomforest':
        [
            Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            computeBEAT
        ] = loadRandomForestModel(modelName)
    elif modelType == 'gradientboosting':
        [
            Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            computeBEAT
        ] = loadGradientBoostingModel(modelName)
    elif modelType == 'extratrees':
        [
            Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            computeBEAT
        ] = loadExtraTreesModel(modelName)

    # read audio file and convert to mono
    [Fs, x] = audioBasicIO.readAudioFile(inputFile)
    x = audioBasicIO.stereo2mono(x)

    if isinstance(x, int):  # audio file IO problem
        return (-1, -1, -1)
    if x.shape[0] / float(Fs) <= mtWin:
        return (-1, -1, -1)

    # feature extraction:
    [MidTermFeatures, s] = aF.mtFeatureExtraction(x, Fs,
                                                  mtWin * Fs, mtStep * Fs,
                                                  round(Fs * stWin),
                                                  round(Fs * stStep))
    # long term averaging of mid-term statistics
    MidTermFeatures = MidTermFeatures.mean(axis=1)
    if computeBEAT:
        [beat, beatConf] = aF.beatExtraction(s, stStep)
        MidTermFeatures = numpy.append(MidTermFeatures, beat)
        MidTermFeatures = numpy.append(MidTermFeatures, beatConf)
    curFV = (MidTermFeatures - MEAN) / STD  # normalization

    [Result, P] = classifierWrapper(Classifier, modelType,
                                    curFV)  # classification
    return Result, P, classNames
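A hedged usage sketch of fileClassification(): "svmMusicGenre3" is assumed to be a model previously trained with featureAndTrain(), and the WAV path is a placeholder.

Result, P, classNames = fileClassification("song.wav", "svmMusicGenre3", "svm")
print("predicted class:", classNames[int(Result)], "with probability", max(P))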
Ejemplo n.º 48
0
    def Audio_Feature_Extraction_Extract(self):
        if self.Audio_Feature_Extraction_SingleFileWindowTerm.currentText(
        ) == "ShortTerm":
            [Fs, x] = audioBasicIO.readAudioFile(
                self.Audio_Feature_Extraction_SingleFilePath.text())
            stFeatures = audioFeatureExtraction.stFeatureExtraction(
                x, Fs,
                float(
                    self.Audio_Feature_Extraction_SingleFileWindowSize.text())
                * Fs,
                float(self.Audio_Feature_Extraction_SingleFileStepSize.text())
                * Fs)

            options = QFileDialog.Options()
            options |= QFileDialog.DontUseNativeDialog
            fileName, _ = QFileDialog.getSaveFileName(self.somethingToPass,
                                                      "Where to save?",
                                                      "",
                                                      "CSV files (*.csv)",
                                                      options=options)

            numpy.savetxt(fileName + ".csv", stFeatures, delimiter=',')
            QMessageBox.about(self.somethingToPass, "Files Created",
                              "File has been saved as CSV file")
            ## Let the User specify how many features and what features to see eventually.
#==============================================================================
#             if self.Audio_Feature_Extraction_SingleFileDataVisualisation.isChecked():
#                 stFeatures = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs, 0.025*Fs)
#
#                 labels = ["Zero Crossing Rate", "Energy", "Entropy of Energy", "Spectral Centroid",
#                           "Spectral Spread", "Spectral Entropy", "Spectral Flux", "Spectral Rolloff",
#                           "MFCC 1", "MFCC 2", "MFCC 3", "MFCC 4",
#                           "MFCC 5", "MFCC 6", "MFCC 7", "MFCC 8",
#                           "MFCC 9", "MFCC 10", "MFCC 11", "MFCC 12", "MFCC 13",
#                           "Chroma Vector 1", "Chroma Vector 2", "Chroma Vector 3", "Chroma Vector 4",
#                           "Chroma Vector 5", "Chroma Vector 6", "Chroma Vector 7", "Chroma Vector 8",
#                           "Chroma Vector 9", "Chroma Vector 10", "Chroma Vector 11","Chroma Vector 12", "Chroma Deviation"]
#
#
#                 for x in range(0, len(labels)-1):
#                     plt.subplot(34,1,x+1); plt.plot(stFeatures[x,:]); plt.xlabel('Frame no'); plt.ylabel(labels[x])
#
#                 plt.show()
#
#==============================================================================

        else:
            options = QFileDialog.Options()
            options |= QFileDialog.DontUseNativeDialog
            fileName, _ = QFileDialog.getSaveFileName(self.somethingToPass,
                                                      "Where to save?",
                                                      "",
                                                      "CSV files (*.csv)",
                                                      options=options)

            audioFeatureExtraction.mtFeatureExtractionToFile(
                self.Audio_Feature_Extraction_SingleFilePath.text(),
                float(
                    self.Audio_Feature_Extraction_SingleFilemidTermWindowSize.
                    text()),
                float(self.
                      Audio_Feature_Extraction_SingleFilemidTermWindowStepSize.
                      text()),
                float(
                    self.Audio_Feature_Extraction_SingleFileWindowSize.text()),
                float(self.Audio_Feature_Extraction_SingleFileStepSize.text()),
                fileName,
                storeStFeatures=True,
                storeToCSV=True,
                PLOT=False)

            QMessageBox.about(
                self.somethingToPass, "Files Created",
                "Files have been saved as CSV files and .npy files")
Ejemplo n.º 49
0
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
import numpy as np
import math
import matplotlib.pyplot as plt

# main process
[Fs, x] = audioBasicIO.readAudioFile("data/diarizationExample.wav")

TIME_OF_WINDOW = 0.050	# one analysis window = 0.050 s
TIME_OF_STEP = 0.025	# step between windows = 0.025 s
SIZE_OF_WINDOW = int(TIME_OF_WINDOW * Fs)	# number of samples in one window
SIZE_OF_STEP = int(TIME_OF_STEP * Fs)	# number of samples in one step
BLOCK_SIZE = 4		# a block covers (BLOCK_SIZE * SIZE_OF_STEP) samples
BLOCK_STEP = 2

# variables 
END_OF_FILE = 0
FIRST_PAIR = 1
INDEX_BOUCLE = 1

def getMFCCs(block_start, block_end):
	return attribute[8:20,block_start:block_end+1]

def getMFCCsFromTime(moment_start, moment_end):
	block_start = int(moment_start / BLOCK_STEP / TIME_OF_STEP - 1)
	block_end = int(moment_end / BLOCK_STEP / TIME_OF_STEP - 1)
	return getMFCCs(block_start, block_end)

def gauss(x, mean, cov):
	[n, d] = x.shape
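The helpers above rely on a global attribute matrix whose initialisation is not shown in this excerpt; a hedged sketch of one way it might be populated, assuming the single-matrix return of stFeatureExtraction used elsewhere in these examples. Per the feature order listed in Ejemplo n.º 48, rows 8..20 of the short-term feature matrix are MFCC 1-13, so the slice 8:20 in getMFCCs() picks up the first 12 of them.

# Hedged continuation of the snippet above; x, Fs, SIZE_OF_WINDOW and
# SIZE_OF_STEP are the values defined earlier in this example.
attribute = audioFeatureExtraction.stFeatureExtraction(
	x, Fs, SIZE_OF_WINDOW, SIZE_OF_STEP)

print(getMFCCs(0, 9).shape)   # (12, 10): 12 MFCC rows for the first 10 frames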
Ejemplo n.º 50
0
def train_classifier():
    data_set = []
    for file in os.listdir("training_dataset/unhappy"):
        temp = []
        mean_value = []
        if file.endswith(".mp3"):
            #print "training_dataset/unhappy/"+file
            sound=AudioSegment.from_mp3("training_dataset/unhappy/"+file)
            sound.export("test.wav",format="wav")
            [Fs, x] = audioBasicIO.readAudioFile("test.wav");
            F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05*Fs, 0.025*Fs);
            for i in range(len(F)):
                temp.append(numpy.mean(F[i]))
            mean_value.append(temp)
            mean_value.append(1)
            data_set.append(mean_value)
    for file in os.listdir("training_dataset/happy"):
        temp = []
        mean_value = []
        if file.endswith(".mp3"):
            #print "training_dataset/happy/"+file
            sound=AudioSegment.from_mp3("training_dataset/happy/"+file)
            sound.export("test.wav",format="wav")
            [Fs, x] = audioBasicIO.readAudioFile("test.wav");
            F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05*Fs, 0.025*Fs);
            for i in range(len(F)):
                temp.append(numpy.mean(F[i]))
            mean_value.append(temp)
            mean_value.append(2)
            data_set.append(mean_value)
    for file in os.listdir("training_dataset/angry"):
        temp = []
        mean_value = []
        if file.endswith(".mp3"):
            #print "training_dataset/angry/"+file
            sound=AudioSegment.from_mp3("training_dataset/angry/"+file)
            sound.export("test.wav",format="wav")
            [Fs, x] = audioBasicIO.readAudioFile("test.wav");
            F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05*Fs, 0.025*Fs);
            for i in range(len(F)):
                temp.append(numpy.mean(F[i]))
            mean_value.append(temp)
            mean_value.append(3)
            data_set.append(mean_value)
    for file in os.listdir("training_dataset/neutral"):
        temp = []
        mean_value = []
        if file.endswith(".mp3"):
            #print "training_dataset/neutral/"+file
            sound=AudioSegment.from_mp3("training_dataset/neutral/"+file)
            sound.export("test.wav",format="wav")
            [Fs, x] = audioBasicIO.readAudioFile("test.wav");
            F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05*Fs, 0.025*Fs);
            for i in range(len(F)):
                temp.append(numpy.mean(F[i]))
            mean_value.append(temp)
            mean_value.append(4)
            data_set.append(mean_value)

    x = []
    y = []
    for i in range(len(data_set)):
        x.append(data_set[i][0])
        y.append(data_set[i][1])

    # note: min_samples_split must be >= 2 in current scikit-learn releases
    clf = RandomForestClassifier(n_estimators=30, max_features=6, max_depth=None,
                                 min_samples_split=2, bootstrap=True)
    clf = clf.fit(x, y)
    f2 = open('classifier.pickle', 'wb')
    pickle.dump(clf, f2)
    f2.close()
Ejemplo n.º 51
0
def save(csv_, wav):
    #csv and wav file as argument
    print(csv_, wav)
    csvFileName = csv_
    wavFileName = wav

    Fs, x = audioBasicIO.readAudioFile(wavFileName)
    annotations = []
    silence = []
    folderName = None
    fileCounter = 0
    start, end = 0, 0

    #duration of wavFile
    spf = wave.open(wavFileName, 'r')
    #Get wavFile duration
    frames = spf.getnframes()
    rate = spf.getframerate()
    duration = frames / float(rate)
    #duration = int((duration))

    csvFile = open(csvFileName, 'r', newline='')  # text mode for csv.reader (Python 3)
    # >> Empty csv file is 1 Byte -> check for empty
    if os.path.getsize(csvFileName) > 1:
        read = csv.reader(csvFile)
        #startTimeToPlay, endTimeToPlay -> str2float
        for row in read:
            row[0] = round(float(row[0]) / 1000, 2)
            row[1] = round(float(row[1]) / 1000, 2)
            #print row[0], row[1]
            if row[2][:8] == "Speech::":
                folderName = row[2][:6]
            else:
                folderName = row[2]

            annotations.append([row[0], row[1], folderName])
            #check if the directory exists and create it if necessary
            if not os.path.exists(folderName):
                os.makedirs(folderName)

        #sort annotations alphabetically based on class name
        annotations = sorted(annotations,
                             key=operator.itemgetter(2),
                             reverse=False)

        # >> Save audio segments in folders, based on annotation class
        for i, an in enumerate(annotations):
            #find file ID for existing files in directory to continue writing..
            directory = os.listdir(an[2])
            #check for empty directory
            if directory:
                index = directory[0].index('_')
                fileCounter = 0
                for i in range(len(directory)):
                    # compare the numeric suffix as an integer, not as a string
                    if int(directory[i][index + 1]) > fileCounter:
                        fileCounter = int(directory[i][index + 1])
                fileCounter = fileCounter + 1
            else:
                fileCounter = 0

            strOut = an[2] + "/{1:s}_{2:d}.wav".format(
                wavFileName.replace(".wav", ""), an[2], fileCounter)
            fileCounter = fileCounter + 1
            #print strOut, int(Fs * an[0]), int(Fs * an[1])
            folderName = an[2]
            wavfile.write(strOut, Fs, x[int(Fs * an[0]):int(Fs * an[1])])

        # >> Find silence in audio file
        #sort annotations by start time
        annotations = sorted(annotations,
                             key=operator.itemgetter(0),
                             reverse=False)
        time = np.arange(0, duration, 0.01)

        #Get silence before-between-after annotations
        for i in range(len(annotations)):
            tS = np.searchsorted(time, annotations[i][0])
            tE = np.searchsorted(time, annotations[i][1])
            end = round(time[tS], 2)
            silence.append([start, end])
            start = round(time[tE], 2)
        silence.append([start, duration])

        #remove overlapping
        for i, s in enumerate(silence):
            if s[0] > s[1]:
                silence.remove(s)

        folderName = 'Silence'
        if not os.path.exists(folderName):
            os.makedirs(folderName)

        #find file ID for Silence
        directory = os.listdir(folderName)
        if directory:
            index = directory[0].index('_')
            fileCounter = 0
            for i in range(len(directory)):
                # compare the numeric suffix as an integer, not as a string
                if int(directory[i][index + 1]) > fileCounter:
                    fileCounter = int(directory[i][index + 1])
            fileCounter = fileCounter + 1
        else:
            fileCounter = 0

        #save silence segment
        for i, s in enumerate(silence):
            strOut = folderName + "/Silence_{1:d}.wav".format(
                wavFileName.replace(".wav", ""), fileCounter)
            fileCounter = fileCounter + 1
            wavfile.write(strOut, Fs, x[int(Fs * s[0]):int(Fs * s[1])])
        print('Finished saving audio segments...')
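A hedged usage note for save(): the first argument is an annotation CSV whose rows have the form start_ms,end_ms,label (the code divides the first two fields by 1000 and uses the third as an output folder name). A hypothetical call:

# Hypothetical call; file names are placeholders.
save("interview_annotations.csv", "interview.wav")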
Ejemplo n.º 52
0
#import pyAudioAnalysis
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
import matplotlib.pyplot as plt


from pydub import AudioSegment
sound = AudioSegment.from_mp3("../1.mp3")
sound = sound.set_channels(1)
sound.export("../1.mp3", format="mp3")

[Fs, x] = audioBasicIO.readAudioFile("../1.mp3")
F = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050 * Fs, 0.025 * Fs)
plt.subplot(2, 1, 1)
plt.plot(F[0, :])
plt.xlabel('Frame no')
plt.ylabel('ZCR')
plt.subplot(2, 1, 2)
plt.plot(F[1, :])
plt.xlabel('Frame no')
plt.ylabel('Energy')
plt.show()
Ejemplo n.º 53
0
def save(csv_, wav):
	#csv and wav file as argument
	print(csv_, wav)
	csvFileName = csv_
	wavFileName = wav

	Fs, x = audioBasicIO.readAudioFile(wavFileName)
	annotations = []
	silence = []
	folderName = None
	fileCounter = 0
	start, end = 0, 0

	#duration of wavFile
	spf = wave.open(wavFileName,'r')
	#Get wavFile duration
	frames = spf.getnframes()
	rate = spf.getframerate()
	duration = frames / float(rate)
	#duration = int((duration))


	csvFile = open(csvFileName, 'r', newline='')  # text mode for csv.reader (Python 3)
	# >> Empty csv file is 1 Byte -> check for empty
	if os.path.getsize(csvFileName) > 1:
		read = csv.reader(csvFile)
		#startTimeToPlay, endTimeToPlay -> str2float
		for row in read:
			row[0] = round(float(row[0])/1000,2)
			row[1] = round(float(row[1])/1000,2)
			#print row[0], row[1]
			if row[2][:8] == "Speech::":
				folderName = row[2][:6]
			else:
				folderName = row[2]

			annotations.append([row[0], row[1], folderName])
			#check if the directory exists and create it if necessary
			if not os.path.exists(folderName):
				os.makedirs(folderName)

		#sort annotations alphabetically based on class name
		annotations = sorted(annotations, key=operator.itemgetter(2), reverse=False)


		# >> Save audio segments in folders, based on annotation class
		for i,an in enumerate(annotations):
			#find file ID for existing files in directory to continue writing..
			directory = os.listdir(an[2])
			#check for empty directory
			if directory:
				index = directory[0].index('_')
				fileCounter = 0
				for i in range(len(directory)):
					# compare the numeric suffix as an integer, not as a string
					if int(directory[i][index+1]) > fileCounter:
						fileCounter = int(directory[i][index+1])
				fileCounter = fileCounter + 1
			else:
				fileCounter = 0

			strOut = an[2] + "/{1:s}_{2:d}.wav".format(wavFileName.replace(".wav",""), an[2], fileCounter)
			fileCounter = fileCounter + 1
			#print strOut, int(Fs * an[0]), int(Fs * an[1])
			folderName = an[2]
			wavfile.write(strOut, Fs, x[int(Fs * an[0]):int(Fs * an[1])])

		# >> Find silence in audio file
		#sort annotations by start time
		annotations = sorted(annotations, key=operator.itemgetter(0), reverse=False)
		time = np.arange(0,duration,0.01)

		#Get silence before-between-after annotations
		for i in range(len(annotations)):
			tS = np.searchsorted(time, annotations[i][0])
			tE = np.searchsorted(time, annotations[i][1])
			end = round(time[tS],2)
			silence.append([start, end])
			start = round(time[tE],2)
		silence.append([start, duration])

		#remove overlapping 
		for i, s in enumerate(silence):
			if s[0]>s[1]:
				silence.remove(s)

		folderName = 'Silence'
		if not os.path.exists(folderName):
				os.makedirs(folderName)

		#find file ID for Silence
		directory = os.listdir(folderName)
		if directory:
			index = directory[0].index('_')
			fileCounter = 0
			for i in range(len(directory)):
				# compare the numeric suffix as an integer, not as a string
				if int(directory[i][index+1]) > fileCounter:
					fileCounter = int(directory[i][index+1])
			fileCounter = fileCounter + 1
		else:
			fileCounter = 0

		#save silence segment
		for i, s in enumerate(silence):
			strOut = folderName + "/Silence_{1:d}.wav".format(wavFileName.replace(".wav",""), fileCounter)
			fileCounter = fileCounter + 1
			wavfile.write(strOut, Fs, x[int(Fs * s[0]):int(Fs * s[1])])
		print('Finished saving audio segments...')

		# remove "silence" clips that contain only a WAV header (~44 bytes) and no audio
		for root, dirs, files in os.walk("/mydir"):
			for silenceFile in files:
				if os.path.getsize(os.path.join(root, silenceFile)) <= 44:
					os.remove(os.path.join(root, silenceFile))
Ejemplo n.º 54
0
from __future__ import print_function
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
from pyAudioAnalysis import audioTrainTest as aT
from pyAudioAnalysis import audioSegmentation as aS
import matplotlib.pyplot as plt

root_data_path = "/Users/tyiannak/ResearchData/Audio Dataset/pyAudioAnalysisData/"

print("\n\n\n * * * TEST 1 * * * \n\n\n")
[Fs, x] = audioBasicIO.readAudioFile(root_data_path + "pyAudioAnalysis/data/count.wav");
F, f_names = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs, 0.025*Fs);
plt.subplot(2,1,1); plt.plot(F[0,:]); plt.xlabel('Frame no'); plt.ylabel(f_names[0]);
plt.subplot(2,1,2); plt.plot(F[1,:]); plt.xlabel('Frame no'); plt.ylabel(f_names[1]); plt.show()

print("\n\n\n * * * TEST 2 * * * \n\n\n")
[Fs, x] = audioBasicIO.readAudioFile(root_data_path + "pyAudioAnalysis/data/doremi.wav")
x = audioBasicIO.stereo2mono(x)
specgram, TimeAxis, FreqAxis = audioFeatureExtraction.stSpectogram(x, Fs, round(Fs * 0.040), round(Fs * 0.040), True)

print("\n\n\n * * * TEST 3 * * * \n\n\n")
[Fs, x] = audioBasicIO.readAudioFile(root_data_path + "pyAudioAnalysis/data/doremi.wav")
x = audioBasicIO.stereo2mono(x)
specgram, TimeAxis, FreqAxis = audioFeatureExtraction.stChromagram(x, Fs, round(Fs * 0.040), round(Fs * 0.040), True)

print("\n\n\n * * * TEST 4 * * * \n\n\n")
aT.featureAndTrain([root_data_path +"SM/speech",root_data_path + "SM/music"], 1.0, 1.0, 0.2, 0.2, "svm", "temp", True)

print("\n\n\n * * * TEST 5 * * * \n\n\n")
[flagsInd, classesAll, acc, CM] = aS.mtFileClassification(root_data_path + "pyAudioAnalysis/data/scottish.wav", root_data_path + "pyAudioAnalysis/data/svmSM", "svm", True, root_data_path + 'pyAudioAnalysis/data/scottish.segments')