def fileChromagramWrapper(wav_file):
    if not os.path.isfile(wav_file):
        raise Exception("Input audio file not found!")
    [fs, x] = audioBasicIO.readAudioFile(wav_file)
    x = audioBasicIO.stereo2mono(x)
    specgram, TimeAxis, FreqAxis = aF.stChromagram(x, fs, round(fs * 0.040),
                                                   round(fs * 0.040), True)
def POST(self):
    x = web.input(myfile={})
    filename = 'tmp/' + uuid.uuid4().hex + '.wav'
    # write the uploaded bytes to a temporary wav file (binary mode)
    with open(filename, 'wb') as f:
        f.write(x['myfile'].value)
    [Fs, x] = audioBasicIO.readAudioFile(filename)
    #os.remove(filename)
    x = audioBasicIO.stereo2mono(x)
    [F, _] = audioFeatureExtraction.mtFeatureExtraction(x, Fs, round(Fs * 1.0), round(Fs * 1.0), round(Fs * 0.050), round(Fs * 0.050))
    F = F.transpose()
    # accumulate votes over all mid-term windows (the original re-initialized
    # the counters inside the loop, so only the last window was ever counted)
    results = {}
    current_highest = ""
    current_highest_value = 0
    for vec in F:
        vec = numpy.around(vec.astype(float), 6)
        current = model.getNN(vec)
        result = current[0][1].partition("_")[0]
        if result in results:
            results[result] = results[result] + 1
        else:
            results[result] = 1
        if results[result] > current_highest_value:
            current_highest_value = results[result]
            current_highest = result
    print(results)
    print(current_highest)
    raise web.seeother('/')
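A minimal wiring sketch for the handler above; web.py dispatches POST to a method of a class mapped in urls, and the class name Upload here is hypothetical, standing in for the owning class that is not shown in the snippet:

import web

urls = ('/', 'Upload')  # 'Upload' stands in for the (unshown) class owning POST above
app = web.application(urls, globals())

if __name__ == '__main__':
    app.run()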
def dirWavFeatureExtractionNoAveraging(dirName, mt_win, mt_step, st_win, st_step):
    """
    This function extracts the mid-term features of the WAVE
    files of a particular folder without averaging each file.

    ARGUMENTS:
        - dirName:          the path of the WAVE directory
        - mt_win, mt_step:    mid-term window and step (in seconds)
        - st_win, st_step:    short-term window and step (in seconds)
    RETURNS:
        - X:                A feature matrix
        - Y:                A matrix of file labels
        - filenames:        a list of the paths of the processed audio files
    """

    all_mt_feats = numpy.array([])
    signal_idx = numpy.array([])
    process_times = []

    types = ('*.wav', '*.aif',  '*.aiff', '*.ogg')
    wav_file_list = []
    for files in types:
        wav_file_list.extend(glob.glob(os.path.join(dirName, files)))

    wav_file_list = sorted(wav_file_list)

    for i, wavFile in enumerate(wav_file_list):
        [fs, x] = audioBasicIO.readAudioFile(wavFile)
        if isinstance(x, int):
            continue        
        
        x = audioBasicIO.stereo2mono(x)
        [mt_term_feats, _, _] = mtFeatureExtraction(x, fs, round(mt_win * fs),
                                                    round(mt_step * fs),
                                                    round(fs * st_win),
                                                    round(fs * st_step))

        mt_term_feats = numpy.transpose(mt_term_feats)
        if len(all_mt_feats) == 0:                # append feature vector
            all_mt_feats = mt_term_feats
            signal_idx = numpy.zeros((mt_term_feats.shape[0], ))
        else:
            all_mt_feats = numpy.vstack((all_mt_feats, mt_term_feats))
            signal_idx = numpy.append(signal_idx, i * numpy.ones((mt_term_feats.shape[0], )))

    return (all_mt_feats, signal_idx, wav_file_list)
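A minimal usage sketch for the function above, assuming it is the one shipped in pyAudioAnalysis.audioFeatureExtraction; the directory path and window/step sizes are illustrative:

from pyAudioAnalysis import audioFeatureExtraction as aF

# one row of X per mid-term window; Y[i] holds the index of the file row i came from
X, Y, filenames = aF.dirWavFeatureExtractionNoAveraging("wavs/", 1.0, 1.0, 0.050, 0.050)
print(X.shape, Y.shape, len(filenames))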
def mtFeatureExtractionToFile(fileName, midTermSize, midTermStep, shortTermSize, shortTermStep, outPutFile,
                              storeStFeatures=False, storeToCSV=False, PLOT=False):
    """
    This function is used as a wrapper to:
    a) read the content of a WAV file
    b) perform mid-term feature extraction on that signal
    c) write the mid-term feature sequences to a numpy file
    """
    [fs, x] = audioBasicIO.readAudioFile(fileName)
    x = audioBasicIO.stereo2mono(x)
    if storeStFeatures:
        [mtF, stF, _] = mtFeatureExtraction(x, fs,
                                         round(fs * midTermSize),
                                         round(fs * midTermStep),
                                         round(fs * shortTermSize),
                                         round(fs * shortTermStep))
    else:
        [mtF, _, _] = mtFeatureExtraction(x, fs, round(fs*midTermSize),
                                       round(fs * midTermStep),
                                       round(fs * shortTermSize),
                                       round(fs * shortTermStep))
    # save mt features to numpy file
    numpy.save(outPutFile, mtF)
    if PLOT:
        print("Mid-term numpy file: " + outPutFile + ".npy saved")
    if storeToCSV:
        numpy.savetxt(outPutFile+".csv", mtF.T, delimiter=",")
        if PLOT:
            print("Mid-term CSV file: " + outPutFile + ".csv saved")

    if storeStFeatures:
        # save st features to numpy file
        numpy.save(outPutFile+"_st", stF)
        if PLOT:
            print("Short-term numpy file: " + outPutFile + "_st.npy saved")
        if storeToCSV:
            # store st features to CSV file
            numpy.savetxt(outPutFile+"_st.csv", stF.T, delimiter=",")
            if PLOT:
                print("Short-term CSV file: " + outPutFile + "_st.csv saved")
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
from pyAudioAnalysis import audioTrainTest as aT
from pyAudioAnalysis import audioSegmentation as aS
import matplotlib.pyplot as plt

root_data_path = "/Users/tyiannak/ResearchData/Audio Dataset/pyAudioAnalysisData/"

print("\n\n\n * * * TEST 1 * * * \n\n\n")
[Fs, x] = audioBasicIO.readAudioFile(root_data_path + "pyAudioAnalysis/data/count.wav")
F, f_names = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs, 0.025*Fs)
plt.subplot(2,1,1); plt.plot(F[0,:]); plt.xlabel('Frame no'); plt.ylabel(f_names[0])
plt.subplot(2,1,2); plt.plot(F[1,:]); plt.xlabel('Frame no'); plt.ylabel(f_names[1]); plt.show()

print("\n\n\n * * * TEST 2 * * * \n\n\n")
[Fs, x] = audioBasicIO.readAudioFile(root_data_path + "pyAudioAnalysis/data/doremi.wav")
x = audioBasicIO.stereo2mono(x)
specgram, TimeAxis, FreqAxis = audioFeatureExtraction.stSpectogram(x, Fs, round(Fs * 0.040), round(Fs * 0.040), True)

print("\n\n\n * * * TEST 3 * * * \n\n\n")
[Fs, x] = audioBasicIO.readAudioFile(root_data_path + "pyAudioAnalysis/data/doremi.wav")
x = audioBasicIO.stereo2mono(x)
specgram, TimeAxis, FreqAxis = audioFeatureExtraction.stChromagram(x, Fs, round(Fs * 0.040), round(Fs * 0.040), True)

print("\n\n\n * * * TEST 4 * * * \n\n\n")
aT.featureAndTrain([root_data_path +"SM/speech",root_data_path + "SM/music"], 1.0, 1.0, 0.2, 0.2, "svm", "temp", True)

print("\n\n\n * * * TEST 5 * * * \n\n\n")
[flagsInd, classesAll, acc, CM] = aS.mtFileClassification(root_data_path + "pyAudioAnalysis/data/scottish.wav", root_data_path + "pyAudioAnalysis/data/svmSM", "svm", True, root_data_path + 'pyAudioAnalysis/data/scottish.segments')

print("\n\n\n * * * TEST 6 * * * \n\n\n")
aS.trainHMM_fromFile(root_data_path + 'radioFinal/train/bbc4A.wav', root_data_path + 'radioFinal/train/bbc4A.segments', 'hmmTemp1', 1.0, 1.0)
Example #6
def fileClassification(inputFile, modelName, modelType):
    # Load classifier:

    if not os.path.isfile(modelName):
        print("fileClassification: input modelName not found!")
        return (-1, -1, -1)

    if not os.path.isfile(inputFile):
        print("fileClassification: wav file not found!")
        return (-1, -1, -1)

    if modelType == 'svm' or modelType == 'svm_rbf':
        [
            Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            computeBEAT
        ] = loadSVModel(modelName)
    elif modelType == 'knn':
        [
            Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            computeBEAT
        ] = loadKNNModel(modelName)
    elif modelType == 'randomforest':
        [
            Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            computeBEAT
        ] = loadRandomForestModel(modelName)
    elif modelType == 'gradientboosting':
        [
            Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            computeBEAT
        ] = loadGradientBoostingModel(modelName)
    elif modelType == 'extratrees':
        [
            Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            computeBEAT
        ] = loadExtraTreesModel(modelName)

    # read audio file and convert to mono
    [Fs, x] = audioBasicIO.readAudioFile(inputFile)
    x = audioBasicIO.stereo2mono(x)

    if isinstance(x, int):  # audio file IO problem
        return (-1, -1, -1)
    if x.shape[0] / float(Fs) <= mtWin:
        return (-1, -1, -1)

    # feature extraction:
    [MidTermFeatures, s] = aF.mtFeatureExtraction(x, Fs,
                                                  mtWin * Fs, mtStep * Fs,
                                                  round(Fs * stWin),
                                                  round(Fs * stStep))
    # long term averaging of mid-term statistics
    MidTermFeatures = MidTermFeatures.mean(axis=1)
    if computeBEAT:
        [beat, beatConf] = aF.beatExtraction(s, stStep)
        MidTermFeatures = numpy.append(MidTermFeatures, beat)
        MidTermFeatures = numpy.append(MidTermFeatures, beatConf)
    curFV = (MidTermFeatures - MEAN) / STD  # normalization

    [Result, P] = classifierWrapper(Classifier, modelType,
                                    curFV)  # classification
    return Result, P, classNames
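A usage sketch for the classifier wrapper above; the wav file name is illustrative, and "temp" is the SVM model trained in TEST 4 of the script earlier in this collection:

Result, P, classNames = fileClassification("test.wav", "temp", "svm")
# Result is the index of the winning class, P the per-class probabilities
print(classNames[int(Result)], P)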
Example #7
def speakerDiarization(fileName,
                       numOfSpeakers,
                       mtSize=2.0,
                       mtStep=0.2,
                       stWin=0.05,
                       LDAdim=35,
                       PLOT=False):
    '''
    ARGUMENTS:
        - fileName:        the name of the WAV file to be analyzed
        - numOfSpeakers    the number of speakers (clusters) in the recording (<=0 for unknown)
        - mtSize (opt)     mid-term window size
        - mtStep (opt)     mid-term window step
        - stWin  (opt)     short-term window size
        - LDAdim (opt)     LDA dimension (0 for no LDA)
        - PLOT     (opt)   0 for not plotting the results, 1 for plotting
    '''
    [Fs, x] = audioBasicIO.readAudioFile(fileName)
    x = audioBasicIO.stereo2mono(x)
    Duration = len(x) / Fs

    [
        Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1,
        stStep1, computeBEAT1
    ] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "data",
                     "knnSpeakerAll"))
    [
        Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2,
        stStep2, computeBEAT2
    ] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "data",
                     "knnSpeakerFemaleMale"))

    [MidTermFeatures,
     ShortTermFeatures] = aF.mtFeatureExtraction(x, Fs,
                                                 mtSize * Fs, mtStep * Fs,
                                                 round(Fs * stWin),
                                                 round(Fs * stWin * 0.5))

    MidTermFeatures2 = numpy.zeros(
        (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2),
         MidTermFeatures.shape[1]))

    for i in range(MidTermFeatures.shape[1]):
        curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
        curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2
        [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
        [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
        MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
        MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0] +
                         len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1)::,
                         i] = P2 + 0.0001

    MidTermFeatures = MidTermFeatures2  # TODO
    # SELECT FEATURES:
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20];                                                                                         # SET 0A
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100];                                                                                 # SET 0B
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,
    #   97,98, 99,100];     # SET 0C

    iFeaturesSelect = [
        8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45,
        46, 47, 48, 49, 50, 51, 52, 53
    ]  # SET 1A
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];                                          # SET 1B
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 1C

    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53];             # SET 2A
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];     # SET 2B
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 2C

    #iFeaturesSelect = range(100);                                                                                                    # SET 3
    #MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010

    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]

    (MidTermFeaturesNorm, MEAN,
     STD) = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]

    # remove outliers:
    DistancesAll = numpy.sum(distance.squareform(
        distance.pdist(MidTermFeaturesNorm.T)),
                             axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = numpy.min(MidTermFeatures[1,:])
    #EnergyMean = numpy.mean(MidTermFeatures[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #iNonOutLiers = numpy.nonzero(MidTermFeatures[1,:] > Thres)[0]
    #print iNonOutLiers

    perOutLier = (100.0 *
                  (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows
    MidTermFeaturesNormOr = MidTermFeaturesNorm
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

    # LDA dimensionality reduction:
    if LDAdim > 0:
        #[mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, stWin * Fs, round(Fs*stWin), round(Fs*stWin));
        # extract mid-term features with minimum step:
        mtWinRatio = int(round(mtSize / stWin))
        mtStepRatio = int(round(stWin / stWin))  # = 1, i.e. the minimum possible step
        mtFeaturesToReduce = []
        numOfFeatures = len(ShortTermFeatures)
        numOfStatistics = 2
        #for i in range(numOfStatistics * numOfFeatures + 1):
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append([])

        for i in range(numOfFeatures):  # for each of the short-term features:
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mtWinRatio
                if N2 > N:
                    N2 = N
                curStFeatures = ShortTermFeatures[i][N1:N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i + numOfFeatures].append(
                    numpy.std(curStFeatures))
                curPos += mtStepRatio
        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
        mtFeaturesToReduce2 = numpy.zeros(
            (mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2),
             mtFeaturesToReduce.shape[1]))
        for i in range(mtFeaturesToReduce.shape[1]):
            curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
            curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
            [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
            [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
            mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0],
                                i] = mtFeaturesToReduce[:, i]
            mtFeaturesToReduce2[
                mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0] +
                len(classNames1), i] = P1 + 0.0001
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] +
                                len(classNames1)::, i] = P2 + 0.0001
        mtFeaturesToReduce = mtFeaturesToReduce2
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
        #mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
        (mtFeaturesToReduce, MEAN,
         STD) = aT.normalizeFeatures([mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T
        #DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
        #MDistancesAll = numpy.mean(DistancesAll)
        #iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0*MDistancesAll)[0]
        #mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]
        Labels = numpy.zeros((mtFeaturesToReduce.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin
        #print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * stWin / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels)
        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

    if numOfSpeakers <= 0:
        sRange = range(2, 10)
    else:
        sRange = [numOfSpeakers]
    clsAll = []
    silAll = []
    centersAll = []

    for iSpeakers in sRange:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(MidTermFeaturesNorm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        # Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
        clsAll.append(cls)
        centersAll.append(means)
        silA = []
        silB = []
        for c in range(iSpeakers):
            # for each speaker (i.e. for each extracted cluster)
            clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float(
                len(cls))
            if clusterPerCent < 0.020:
                silA.append(0.0)
                silB.append(0.0)
            else:
                # get the subset of feature vectors that belong to this cluster
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]
                # compute the average distance between samples of the cluster
                # (the "a" values of the silhouette)
                Yt = distance.pdist(MidTermFeaturesNormTemp.T)
                silA.append(numpy.mean(Yt) * clusterPerCent)
                silBs = []
                for c2 in range(iSpeakers):
                    # compute distances from the samples of the other clusters
                    if c2 != c:
                        clusterPerCent2 = numpy.nonzero(cls == c2)[0].shape[0] / float(len(cls))
                        MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:, cls == c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(
                            numpy.mean(Yt) *
                            (clusterPerCent + clusterPerCent2) / 2.0)
                silBs = numpy.array(silBs)
                silB.append(min(silBs))
                # ... and keep the minimum value
                # (i.e. the distance from the "nearest" cluster)
        silA = numpy.array(silA)
        silB = numpy.array(silB)
        sil = []
        for c in range(iSpeakers):  # for each cluster (speaker)
            sil.append((silB[c] - silA[c]) /
                       (max(silB[c], silA[c]) + 0.00001))  # compute silhouette

        silAll.append(numpy.mean(sil))  # keep the AVERAGE SILHOUETTE

    #silAll = silAll * (1.0/(numpy.power(numpy.array(sRange),0.5)))
    imax = numpy.argmax(silAll)  # position of the maximum silhouette value
    nSpeakersFinal = sRange[imax]  # optimal number of clusters

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows: this is achieved by giving them the value of their nearest non-outlier window)
    cls = numpy.zeros((numOfWindows, ))
    for i in range(numOfWindows):
        j = numpy.argmin(numpy.abs(i - iNonOutLiers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):
        startprob, transmat, means, cov = trainHMM_computeStatistics(
            MidTermFeaturesNormOr, cls)
        hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0],
                                       "diag")  # hmm training
        hmm.startprob_ = startprob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(MidTermFeaturesNormOr.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = silAll[imax]  # final silhouette value
    classNames = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]

    # load ground-truth if available
    gtFile = fileName.replace('.wav', '.segments')
    if os.path.isfile(gtFile):  # if ground truth exists
        [segStart, segEnd, segLabels] = readSegmentGT(gtFile)  # read GT data
        flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels,
                                           mtStep)  # convert to flags

    if PLOT:
        fig = plt.figure()
        if numOfSpeakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(numpy.array(range(len(classNames))))
        ax1.axis((0, Duration, -1, len(classNames)))
        ax1.set_yticklabels(classNames)
        ax1.plot(numpy.array(range(len(cls))) * mtStep + mtStep / 2.0, cls)

    if os.path.isfile(gtFile):
        if PLOT:
            ax1.plot(
                numpy.array(range(len(flagsGT))) * mtStep + mtStep / 2.0,
                flagsGT, 'r')
        purityClusterMean, puritySpeakerMean = evaluateSpeakerDiarization(
            cls, flagsGT)
        print("{0:.1f}\t{1:.1f}".format(100 * purityClusterMean,
                                        100 * puritySpeakerMean))
        if PLOT:
            plt.title(
                "Cluster purity: {0:.1f}% - Speaker purity: {1:.1f}%".format(
                    100 * purityClusterMean, 100 * puritySpeakerMean))
    if PLOT:
        plt.xlabel("time (seconds)")
        #print sRange, silAll
        if numOfSpeakers <= 0:
            plt.subplot(212)
            plt.plot(sRange, silAll)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's sillouette")
        plt.show()
    return cls
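A usage sketch for the diarization function above; the file name and the speaker count are illustrative:

# cls[i] is the cluster (speaker) id of the i-th mid-term window, mtStep seconds apart
cls = speakerDiarization("meeting.wav", 2, PLOT=False)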
Example #8
    return x[15:17]

def getFourth(val):
    return val[3]

db_path = ('C:/Users/konst_000/Desktop/Σχολή/6ο Εξάμηνο/ΨΕΣ/'
           'Speech Emotion Recognition/Audio Database/Complete')
os.chdir(db_path)
fileList = os.listdir(db_path)
featureList = []  # list of lists used to store the extracted features of each training sample
labelListAct = []  # list of strings used to store the activation labels (emotions) for each training sample
labelListVal = []  # list of strings used to store the valence labels
speakerList = []  # list of strings used to store the speaker identity

for f in fileList:
    label = getEmotionLabel(f)
    [Fs, sample] = audioBasicIO.readAudioFile(f)
    sample = audioBasicIO.stereo2mono(sample) #feature extraction can be performed only on mono signals
    speaker = getSpeakerLabel(f)
    features = emoFeatExtract(sample, Fs, 0.050*Fs, 0.025*Fs)
    featureList.append(features)
    # binary activation labels
    if label in ('01', '02', '04', '07'):
        labelListAct.append('Low')
    else:
        labelListAct.append('High')
    # binary valence labels
    if label in ('04', '05', '06', '07'):
        labelListVal.append('Negative')
    else:
        labelListVal.append('Positive')
    speakerList.append(speaker)

final = []
Example #9
import os
import matplotlib.pyplot as plt
import numpy
import numpy as np
import scipy.signal
import cPickle
import audioTrainTest
from scipy.io import loadmat
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioSegmentation
# [features, classNames, fileNames] = audioFeatureExtraction.dirsWavFeatureExtraction(['train/bumps', 'train/door',
#                                                                                      'train/steps', 'train/speech',
#                                                                                      'train/specificDoor',
#                                                                                      'train/background'],
#                                                                                     0.25, 0.25, 0.02, 0.02)

[Fs1, x1] = audioBasicIO.readAudioFile("backgr5.wav")
x1 = audioBasicIO.stereo2mono(x1)
[Fs2, x2] = audioBasicIO.readAudioFile("boots10.wav")
x2 = audioBasicIO.stereo2mono(x2)
[Fs3, x3] = audioBasicIO.readAudioFile("door12.wav")
x3 = audioBasicIO.stereo2mono(x3)
[Fs4, x4] = audioBasicIO.readAudioFile("drop_key1.wav")
x4 = audioBasicIO.stereo2mono(x4)
[Fs5, x5] = audioBasicIO.readAudioFile("eng6.wav")
x5 = audioBasicIO.stereo2mono(x5)
[Fs6, x6] = audioBasicIO.readAudioFile("man_scream1.wav")
x6 = audioBasicIO.stereo2mono(x6)
[Fs7, x7] = audioBasicIO.readAudioFile("door19.wav")
x7 = audioBasicIO.stereo2mono(x7)
print(Fs1)
print(x1)

# print(Fs1*0.040)
# print(len(x1))
# print(Fs1,Fs2,Fs3,Fs4,Fs5,Fs6,Fs7)
def evaluateSpeechMusic(fileName,
                        modelName,
                        method="svm",
                        postProcess=0,
                        postProcessModelName="",
                        PLOT=False):
    # load ground truth file (matlab annotation)

    matFile = fileName.replace(".wav", "_true.mat")
    if os.path.isfile(matFile):
        matfile = loadmat(matFile)
        segs_gt = matfile["segs_r"]
        classes_gt1 = matfile["classes_r"]
        classes_gt = []
        for c in classes_gt1[0]:
            if c == "M":
                classes_gt.append("music")
            if c == "S" or c == "E":
                classes_gt.append("speech")
        flagsIndGT, classesAllGT = audioSegmentation.segs2flags(
            [s[0] for s in segs_gt], [s[1] for s in segs_gt], classes_gt, 1.0)
    if method == "svm" or method == "randomforest" or method == "gradientboosting" or method == "extratrees":
        # speech-music segmentation:
        [flagsInd, classesAll, acc,
         CM] = audioSegmentation.mtFileClassification(fileName, modelName,
                                                      method, False, '')
    elif method == "hmm":
        [flagsInd, classesAll, _,
         _] = audioSegmentation.hmmSegmentation(fileName,
                                                modelName,
                                                PLOT=False,
                                                gtFileName="")
    elif method == "cnn":
        WIDTH_SEC = 2.4
        [Fs, x] = io.readAudioFile(fileName)
        x = io.stereo2mono(x)
        [flagsInd, classesAll,
         CNNprobs] = mtCNN_classification(x, Fs, WIDTH_SEC, 1.0,
                                          RGB_singleFrame_net, SOUND_mean_RGB,
                                          transformer_RGB, classNamesCNN)

    for i in range(flagsIndGT.shape[0]):
        flagsIndGT[i] = classesAll.index(classesAllGT[flagsIndGT[i]])

    #plt.plot(flagsIndGT, 'r')
    #plt.plot(flagsInd)
    #plt.show()

    #print classesAllGT, classesAll
    if postProcess >= 1:
        # medfilt here!
        flagsInd = scipy.signal.medfilt(flagsInd, 11)
    if postProcess >= 2:  # load HMM
        try:
            fo = open(postProcessModelName, "rb")
        except IOError:
            print("didn't find file")
            return
        try:
            hmm = cPickle.load(fo)
            classesAll = cPickle.load(fo)
        except Exception:
            # unpickling failed: close the file and bail out
            # instead of continuing with hmm undefined
            fo.close()
            return
        #Features = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs, 0.050*Fs);    # feature extraction
        #[Features, _] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * 0.050), round(Fs * 0.050))
        flagsInd = hmm.predict(CNNprobs)
        flagsInd = scipy.signal.medfilt(flagsInd, 3)

    if PLOT:
        plt.plot(flagsInd + 0.01)
        plt.plot(flagsIndGT, 'r')
        plt.show()
    CM = np.zeros((2, 2))
    for i in range(min(flagsInd.shape[0], flagsIndGT.shape[0])):
        CM[int(flagsIndGT[i]), int(flagsInd[i])] += 1
    print(CM)
    return CM, classesAll
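A usage sketch for the evaluation helper above, using the self-contained "svm" path (the "cnn" branch additionally needs the Caffe objects referenced in the snippet); the file and model names are illustrative, and a matching song1_true.mat annotation must sit next to the wav file:

CM, classesAll = evaluateSpeechMusic("song1.wav", "svmSM", method="svm", PLOT=False)
print(CM)  # 2x2 confusion matrix over the ground-truth segments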
def musicThumbnailing(x,
                      fs,
                      short_term_size=1.0,
                      short_term_step=0.5,
                      thumb_size=10.0,
                      limit_1=0,
                      limit_2=1):
    '''
    This function detects instances of the most representative part of a
    music recording, also called "music thumbnails".
    A technique similar to the one proposed in [1] is used, with a wider set
    of audio features instead of chroma features.
    In particular the following steps are followed:
     - Extract short-term audio features. Typical short-term window size: 1 second
     - Compute the self-similarity matrix, i.e. all pairwise similarities between feature vectors
     - Apply a diagonal mask as a moving average filter on the values of the self-similarity matrix.
       The size of the mask is equal to the desirable thumbnail length.
     - Find the position of the maximum value of the new (filtered) self-similarity matrix.
       The audio segments that correspond to the diagonal around that position are the selected thumbnails
    

    ARGUMENTS:
     - x:            input signal
     - fs:            sampling frequency
     - short_term_size:     window size (in seconds)
     - short_term_step:    window step (in seconds)
     - thumb_size:    desired thumbnail size (in seconds)
    
    RETURNS:
     - A1:            beginning of 1st thumbnail (in seconds)
     - A2:            ending of 1st thumbnail (in seconds)
     - B1:            beginning of 2nd thumbnail (in seconds)
     - B2:            ending of 2nd thumbnail (in seconds)
     - S:             the (filtered) self-similarity matrix

    USAGE EXAMPLE:
     import audioFeatureExtraction as aF
     [fs, x] = basicIO.readAudioFile(input_file)
     [A1, A2, B1, B2, S] = musicThumbnailing(x, fs)

    [1] Bartsch, M. A., & Wakefield, G. H. (2005). Audio thumbnailing
    of popular music using chroma-based representations.
    Multimedia, IEEE Transactions on, 7(1), 96-104.
    '''
    x = audioBasicIO.stereo2mono(x)
    # feature extraction:
    st_feats, _ = aF.stFeatureExtraction(x, fs, fs * short_term_size,
                                         fs * short_term_step)

    # self-similarity matrix
    S = selfSimilarityMatrix(st_feats)

    # moving filter:
    M = int(round(thumb_size / short_term_step))
    B = numpy.eye(M, M)
    S = scipy.signal.convolve2d(S, B, 'valid')

    # post-processing (remove main diagonal elements)
    min_sm = numpy.min(S)
    for i in range(S.shape[0]):
        for j in range(S.shape[1]):
            if abs(i - j) < 5.0 / short_term_step or i > j:
                S[i, j] = min_sm

    # find max position:
    S[0:int(limit_1 * S.shape[0]), :] = min_sm
    S[:, 0:int(limit_1 * S.shape[0])] = min_sm
    S[int(limit_2 * S.shape[0])::, :] = min_sm
    S[:, int(limit_2 * S.shape[0])::] = min_sm

    maxVal = numpy.max(S)
    [I, J] = numpy.unravel_index(S.argmax(), S.shape)
    #plt.imshow(S)
    #plt.show()
    # expand:
    i1 = I
    i2 = I
    j1 = J
    j2 = J

    while i2 - i1 < M:
        if i1 <= 0 or j1 <= 0 or i2 >= S.shape[0] - 2 or j2 >= S.shape[1] - 2:
            break
        if S[i1 - 1, j1 - 1] > S[i2 + 1, j2 + 1]:
            i1 -= 1
            j1 -= 1
        else:
            i2 += 1
            j2 += 1

    return short_term_step * i1, short_term_step * i2, \
           short_term_step * j1, short_term_step * j2, S
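A usage sketch matching the docstring above; the input file and thumbnail size are illustrative:

from pyAudioAnalysis import audioBasicIO

[fs, x] = audioBasicIO.readAudioFile("song.wav")
[A1, A2, B1, B2, S] = musicThumbnailing(x, fs, 1.0, 0.5, 10.0)
print("thumbnail 1: {0:.1f}-{1:.1f} s, thumbnail 2: {2:.1f}-{3:.1f} s".format(A1, A2, B1, B2))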
def silenceRemoval(x,
                   fs,
                   st_win,
                   st_step,
                   minDuration=0.2,
                   maxDuration=0,
                   smoothWindow=1.0,
                   weight=0.5,
                   plot=False):
    '''
    Event Detection (silence removal)
    ARGUMENTS:
         - x:                   the input audio signal
         - fs:                  sampling freq
         - st_win, st_step:     window size and step in seconds
         - minDuration:         (optional) minimum duration in seconds of segments to keep
         - maxDuration:         (optional) maximum duration in seconds of segments to keep (0 is no limit)
         - smoothWindow:        (optional) smooth window (in seconds)
         - weight:              (optional) weight factor (0 < weight < 1) the higher, the more strict
         - plot:                (optional) True if results are to be plotted
    RETURNS:
         - seg_limits:    list of segment limits in seconds (e.g. [[0.1, 0.9], [1.4, 3.0]] means that
                    the resulting segments are (0.1 - 0.9) and (1.4 - 3.0) seconds)
    '''

    if weight >= 1:
        weight = 0.99
    if weight <= 0:
        weight = 0.01

    # Step 1: feature extraction
    x = audioBasicIO.stereo2mono(x)
    st_feats, _ = aF.stFeatureExtraction(x, fs, st_win * fs, st_step * fs)

    # Step 2: train binary svm classifier of low vs high energy frames
    # keep only the energy short-term sequence (2nd feature)
    st_energy = st_feats[1, :]
    en = numpy.sort(st_energy)
    # number of 10% of the total short-term windows
    l1 = int(len(en) / 10)
    # compute "lower" 10% energy threshold
    t1 = numpy.mean(en[0:l1]) + 0.000000000000001
    # compute "higher" 10% energy threshold
    t2 = numpy.mean(en[-l1:-1]) + 0.000000000000001
    # get all features that correspond to low energy
    class1 = st_feats[:, numpy.where(st_energy <= t1)[0]]
    # get all features that correspond to high energy
    class2 = st_feats[:, numpy.where(st_energy >= t2)[0]]
    # form the binary classification task and ...
    faets_s = [class1.T, class2.T]
    # normalize and train the respective svm probabilistic model
    # (ONSET vs SILENCE)
    [faets_s_norm, means_s, stds_s] = aT.normalizeFeatures(faets_s)
    svm = aT.trainSVM(faets_s_norm, 1.0)

    # Step 3: compute onset probability based on the trained svm
    prob_on_set = []
    for i in range(st_feats.shape[1]):
        # for each frame
        cur_fv = (st_feats[:, i] - means_s) / stds_s
        # get svm probability (that it belongs to the ONSET class)
        prob_on_set.append(svm.predict_proba(cur_fv.reshape(1, -1))[0][1])
    prob_on_set = numpy.array(prob_on_set)
    # smooth probability:
    prob_on_set = smoothMovingAvg(prob_on_set, smoothWindow / st_step)

    # Step 4A: detect onset frame indices:
    prog_on_set_sort = numpy.sort(prob_on_set)
    # find probability Threshold as a weighted average
    # of top 10% and lower 10% of the values
    Nt = int(prog_on_set_sort.shape[0] / 10)
    T = (numpy.mean((1 - weight) * prog_on_set_sort[0:Nt]) +
         weight * numpy.mean(prog_on_set_sort[-Nt::]))

    max_idx = numpy.where(prob_on_set > T)[0]
    # get the indices of the frames that satisfy the thresholding
    i = 0
    time_clusters = []
    seg_limits = []

    # Step 4B: group frame indices to onset segments
    while i < len(max_idx):
        # for each of the detected onset indices
        cur_cluster = [max_idx[i]]
        if i == len(max_idx) - 1:
            break
        while max_idx[i + 1] - cur_cluster[-1] <= 2:
            cur_cluster.append(max_idx[i + 1])
            i += 1
            if i == len(max_idx) - 1:
                break
        i += 1
        time_clusters.append(cur_cluster)
        seg_limits.append(
            [cur_cluster[0] * st_step, cur_cluster[-1] * st_step])

    # Step 5: Post process: remove segments that are outside the min and max segment bounds:
    seg_limits_2 = []
    for s in seg_limits:
        if s[1] - s[0] > minDuration and (
                maxDuration is None or maxDuration == 0 or
                s[1] - s[0] < maxDuration):
            seg_limits_2.append(s)
    seg_limits = seg_limits_2

    if plot:
        timeX = numpy.arange(0, x.shape[0] / float(fs), 1.0 / fs)

        plt.subplot(2, 1, 1)
        plt.plot(timeX, x)
        plt.title('Signal')
        for s in seg_limits:
            plt.axvline(x=s[0])
            plt.axvline(x=s[1])
        plt.subplot(2, 1, 2)
        plt.plot(
            numpy.arange(0, prob_on_set.shape[0] * st_step,
                         st_step)[:len(prob_on_set)], prob_on_set)
        for s in seg_limits:
            plt.axvline(x=s[0])
            plt.axvline(x=s[1])
        plt.title('svm Probability')
        plt.show()

    return seg_limits
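A usage sketch for the event detector above; the file name is illustrative and the window sizes follow the values used elsewhere in this collection:

from pyAudioAnalysis import audioBasicIO

[fs, x] = audioBasicIO.readAudioFile("recording.wav")
segments = silenceRemoval(x, fs, 0.020, 0.020, smoothWindow=1.0, weight=0.3, plot=False)
print(segments)  # e.g. [[0.1, 0.9], [1.4, 3.0]]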
Example #13
"""! 
@brief Example 29
@details: Music segmentation example
@author Theodoros Giannakopoulos {[email protected]}
"""
import os, readchar, sklearn.cluster
from pyAudioAnalysis.audioFeatureExtraction import mtFeatureExtraction as mT
from pyAudioAnalysis.audioBasicIO import readAudioFile, stereo2mono
from pyAudioAnalysis.audioSegmentation import flags2segs
from pyAudioAnalysis.audioTrainTest import normalizeFeatures

if __name__ == '__main__':
    # read signal and get normalized segment features:
    input_file = "../data/song1.mp3"
    fs, x = readAudioFile(input_file)
    x = stereo2mono(x)
    mt_size, mt_step, st_win = 5, 0.5, 0.05
    [mt_feats, st_feats, _] = mT(x, fs, mt_size * fs, mt_step * fs,
                                 round(fs * st_win), round(fs * st_win * 0.5))
    (mt_feats_norm, MEAN, STD) = normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    # perform clustering (k = 4)
    n_clusters = 4
    k_means = sklearn.cluster.KMeans(n_clusters=n_clusters)
    k_means.fit(mt_feats_norm.T)
    cls = k_means.labels_
    segs, c = flags2segs(cls, mt_step)  # convert flags to segment limits
    for sp in range(n_clusters):  # play each cluster's segment
        for i in range(len(c)):
            if c[i] == sp and segs[i, 1] - segs[i, 0] > 5:
                # play long segments of current cluster (only win_to_play seconds)
                # the playback body is truncated in the source; as a minimal
                # placeholder, print the segment and wait for a keypress
                # (readchar is imported above)
                print("cluster {0:d}: {1:.1f} -> {2:.1f} sec".format(sp, segs[i, 0], segs[i, 1]))
                readchar.readchar()
Example #14
# imports assumed by this fragment (the original snippet starts mid-file):
import os
import pydub
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction

print('Converting...')
sound = pydub.AudioSegment.from_mp3(
    os.path.join('C:\\', 'Users', 'akauf', 'Desktop', 'song.mp3'))
sound.export(os.path.join('E:\\', 'Python_Projects', 'Audio_engine',
                          'temp.wav'),
             format="wav")
print('Converted file to wav!')
print('Hooray!')
print('I am coolguy')

# load the audio samples from the wav file
print('Extracting Data...')
[Fs, x] = audioBasicIO.readAudioFile(
    os.path.join('E:\\', 'Python_Projects', 'Audio_engine', 'temp.wav'))
x = audioBasicIO.stereo2mono(x)  # Collapses to mono signal
F = audioFeatureExtraction.stFeatureExtraction(
    x, Fs, 0.050 * Fs, 0.025 * Fs)  #Creates an array of features per frame
print('Extracted!')

N = []  #Array of features that we are going to use and modify
harmonic_p = []  #Item for array N, power of higher frequencies
percussive_p = []  #Item for array N, power of lower frequencies
arc_p = []  #Item for array N, total track power
spec_center_p = []
chaos_p = []

#PHASE THIS OUT IN FUTURE VERSIONS
perc_colors = [0, 10879, 5461]  #List of hue values for percparse to use

def silenceCounter(x,
                   fs,
                   st_win,
                   st_step,
                   smoothWindow=0.5,
                   weight=0.5,
                   plot=False):
    if weight >= 1:
        weight = 0.99
    if weight <= 0:
        weight = 0.01

    # Step 1: feature extraction
    x = audioBasicIO.stereo2mono(x)
    st_feats, _ = aF.stFeatureExtraction(x, fs, st_win * fs, st_step * fs)

    # Step 2: train binary svm classifier of low vs high energy frames
    # keep only the energy short-term sequence (2nd feature)
    st_energy = st_feats[1, :]
    en = numpy.sort(st_energy)
    # number of 10% of the total short-term windows
    l1 = int(len(en) / 10)
    # compute "lower" 10% energy threshold
    t1 = numpy.mean(en[0:l1]) + 0.000000000000001
    # compute "higher" 10% energy threshold
    t2 = numpy.mean(en[-l1:-1]) + 0.000000000000001
    # get all features that correspond to low energy
    class1 = st_feats[:, numpy.where(st_energy <= t1)[0]]
    # get all features that correspond to high energy
    class2 = st_feats[:, numpy.where(st_energy >= t2)[0]]
    # form the binary classification task and ...

    # change the order of the array
    # faets_s = [class1.T, class2.T]

    # changing order gives the segmens with silence
    faets_s = [class2.T, class1.T]

    # normalize and train the respective svm probabilistic model
    # (SILENCE vs ONSET)
    [faets_s_norm, means_s, stds_s] = aT.normalizeFeatures(faets_s)
    svm = aT.trainSVM(faets_s_norm, 1.0)

    # Step 3: compute onset probability based on the trained svm
    prob_on_set = []
    for i in range(st_feats.shape[1]):
        # for each frame
        cur_fv = (st_feats[:, i] - means_s) / stds_s
        # get svm probability (that it belongs to the ONSET class)
        prob_on_set.append(svm.predict_proba(cur_fv.reshape(1, -1))[0][1])
    prob_on_set = numpy.array(prob_on_set)
    # smooth probability:
    prob_on_set = smoothMovingAvg(prob_on_set, smoothWindow / st_step)

    # Step 4A: detect onset frame indices:
    prog_on_set_sort = numpy.sort(prob_on_set)
    # find probability Threshold as a weighted average
    # of top 10% and lower 10% of the values
    Nt = int(prog_on_set_sort.shape[0] / 10)
    T = (numpy.mean((1 - weight) * prog_on_set_sort[0:Nt]) +
         weight * numpy.mean(prog_on_set_sort[-Nt::]))

    max_idx = numpy.where(prob_on_set > T)[0]
    # get the indices of the frames that satisfy the thresholding
    i = 0
    time_clusters = []
    seg_limits = []

    # Step 4B: group frame indices to onset segments
    while i < len(max_idx):
        # for each of the detected onset indices
        cur_cluster = [max_idx[i]]
        if i == len(max_idx) - 1:
            break
        while max_idx[i + 1] - cur_cluster[-1] <= 2:
            cur_cluster.append(max_idx[i + 1])
            i += 1
            if i == len(max_idx) - 1:
                break
        i += 1
        time_clusters.append(cur_cluster)
        seg_limits.append(
            [cur_cluster[0] * st_step, cur_cluster[-1] * st_step])

    # Step 5: Post process: remove very small segments:
    min_dur = 0.2
    seg_limits_2 = []
    for s in seg_limits:
        if s[1] - s[0] > min_dur:
            seg_limits_2.append(s)
    print(f"SEGMENTS 0.2: {seg_limits_2}")
    print(F"SEGMENTS: {seg_limits}")
Example #16
def dirWavFeatureExtraction(dirName,
                            mtWin,
                            mtStep,
                            stWin,
                            stStep,
                            computeBEAT=False):
    """
    This function extracts the mid-term features of the WAVE files of a particular folder.

    The resulting feature vector is extracted by long-term averaging the mid-term features.
    Therefore ONE FEATURE VECTOR is extracted for each WAV file.

    ARGUMENTS:
        - dirName:        the path of the WAVE directory
        - mtWin, mtStep:    mid-term window and step (in seconds)
        - stWin, stStep:    short-term window and step (in seconds)
    """

    allMtFeatures = numpy.array([])
    processingTimes = []

    types = ('*.wav', '*.aif', '*.aiff', '*.mp3', '*.au')
    wavFilesList = []
    for files in types:
        wavFilesList.extend(glob.glob(os.path.join(dirName, files)))

    wavFilesList = sorted(wavFilesList)
    wavFilesList2 = []
    for i, wavFile in enumerate(wavFilesList):
        print("Analyzing file {0:d} of "
              "{1:d}: {2:s}".format(i + 1, len(wavFilesList), wavFile))
        if os.stat(wavFile).st_size == 0:
            print("   (EMPTY FILE -- SKIPPING)")
            continue
        [Fs, x] = audioBasicIO.readAudioFile(wavFile)  # read file
        if isinstance(x, int):
            continue

        t1 = time.time()
        x = audioBasicIO.stereo2mono(x)  # convert stereo to mono
        if x.shape[0] < float(Fs) / 5:
            print("  (AUDIO FILE TOO SMALL - SKIPPING)")
            continue
        wavFilesList2.append(wavFile)
        if computeBEAT:  # mid-term feature extraction for current file
            [MidTermFeatures,
             stFeatures] = mtFeatureExtraction(x, Fs, round(mtWin * Fs),
                                               round(mtStep * Fs),
                                               round(Fs * stWin),
                                               round(Fs * stStep))
            [beat, beatConf] = beatExtraction(stFeatures, stStep)
        else:
            [MidTermFeatures, _] = mtFeatureExtraction(x, Fs,
                                                       round(mtWin * Fs),
                                                       round(mtStep * Fs),
                                                       round(Fs * stWin),
                                                       round(Fs * stStep))

        MidTermFeatures = numpy.transpose(MidTermFeatures)
        MidTermFeatures = MidTermFeatures.mean(
            axis=0)  # long term averaging of mid-term statistics
        if (not numpy.isnan(MidTermFeatures).any()) and (
                not numpy.isinf(MidTermFeatures).any()):
            if computeBEAT:
                MidTermFeatures = numpy.append(MidTermFeatures, beat)
                MidTermFeatures = numpy.append(MidTermFeatures, beatConf)
            if len(allMtFeatures) == 0:  # append feature vector
                allMtFeatures = MidTermFeatures
            else:
                allMtFeatures = numpy.vstack((allMtFeatures, MidTermFeatures))
            t2 = time.time()
            duration = float(len(x)) / Fs
            processingTimes.append((t2 - t1) / duration)
    if len(processingTimes) > 0:
        print("Feature extraction complexity ratio: "
              "{0:.1f} x realtime".format(
                  (1.0 / numpy.mean(numpy.array(processingTimes)))))
    return (allMtFeatures, wavFilesList2)
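A usage sketch for the folder-level extractor above, assuming it is the one shipped in pyAudioAnalysis.audioFeatureExtraction; the directory path and window sizes are illustrative:

from pyAudioAnalysis import audioFeatureExtraction as aF

# one long-term-averaged feature vector per audio file in the folder
features, files = aF.dirWavFeatureExtraction("wavs/", 1.0, 1.0, 0.050, 0.050)
print(features.shape, len(files))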
Example #17
def speakerDiarization(fileName,
                       sRange=xrange(2, 10),
                       mtSize=2.0,
                       mtStep=0.2,
                       stWin=0.05,
                       LDAdim=35):
    Fs, x = audioBasicIO.readAudioFile(fileName)
    x = audioBasicIO.stereo2mono(x)
    duration = len(x) / Fs

    Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1 = aT.loadKNNModel(
        os.path.join(
            '/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data',
            'knnSpeakerAll'))
    Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2 = aT.loadKNNModel(
        os.path.join(
            '/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data',
            'knnSpeakerFemaleMale'))

    MidTermFeatures, ShortTermFeatures = aF.mtFeatureExtraction(
        x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin),
        round(Fs * stWin * 0.5))
    MidTermFeatures2 = numpy.zeros(
        (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2),
         MidTermFeatures.shape[1]))

    for i in range(MidTermFeatures.shape[1]):
        curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
        curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2

        Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1)
        Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2)

        MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
        MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0] +
                         len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1):,
                         i] = P2 + 0.0001

    MidTermFeatures = MidTermFeatures2
    iFeaturesSelect = range(8, 21) + range(41, 54)
    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]

    MidTermFeaturesNorm, MEAN, STD = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]

    DistancesAll = numpy.sum(distance.squareform(
        distance.pdist(MidTermFeaturesNorm.T)),
                             axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]

    perOutLier = (100.0 *
                  (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows
    MidTermFeaturesNormOr = MidTermFeaturesNorm
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

    if LDAdim > 0:
        mtWinRatio = int(round(mtSize / stWin))
        mtStepRatio = int(round(stWin / stWin))  # = 1, i.e. the minimum possible step
        mtFeaturesToReduce = list()
        numOfFeatures = len(ShortTermFeatures)
        numOfStatistics = 2
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append(list())

        for i in range(numOfFeatures):
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos < N):
                N1, N2 = curPos, curPos + mtWinRatio
                if N2 > N: N2 = N
                curStFeatures = ShortTermFeatures[i][N1:N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i + numOfFeatures].append(
                    numpy.std(curStFeatures))
                curPos += mtStepRatio

        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
        mtFeaturesToReduce2 = numpy.zeros(
            (mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2),
             mtFeaturesToReduce.shape[1]))
        for i in range(mtFeaturesToReduce.shape[1]):
            curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
            curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
            Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1)
            Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2)
            mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0],
                                i] = mtFeaturesToReduce[:, i]
            mtFeaturesToReduce2[
                mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0] +
                len(classNames1), i] = P1 + 0.0001
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] +
                                len(classNames1):, i] = P2 + 0.0001

        mtFeaturesToReduce = mtFeaturesToReduce2
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
        mtFeaturesToReduce, MEAN, STD = aT.normalizeFeatures(
            [mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T

        Labels = numpy.zeros((mtFeaturesToReduce.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin

        for i in range(Labels.shape[0]):
            Labels[i] = int(i * stWin / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels)

        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

    clsAll, silAll, centersAll = list(), list(), list()

    for iSpeakers in sRange:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(MidTermFeaturesNorm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        clsAll.append(cls)
        centersAll.append(means)
        silA, silB = list(), list()
        for c in range(iSpeakers):
            clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float(
                len(cls))
            if clusterPerCent < 0.02:
                silA.append(0.0)
                silB.append(0.0)
            else:
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]
                Yt = distance.pdist(MidTermFeaturesNormTemp.T)
                silA.append(numpy.mean(Yt) * clusterPerCent)
                silBs = list()
                for c2 in range(iSpeakers):
                    if c2 != c:
                        clusterPerCent2 = numpy.nonzero(
                            cls == c2)[0].shape[0] / float(len(cls))
                        MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:, cls == c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(
                            numpy.mean(Yt) *
                            (clusterPerCent + clusterPerCent2) / 2.0)
                silBs = numpy.array(silBs)
                silB.append(min(silBs))
        silA, silB, sil = numpy.array(silA), numpy.array(silB), list()
        for c in range(iSpeakers):
            sil.append((silB[c] - silA[c]) / (max(silB[c], silA[c]) + 0.00001))
        silAll.append(numpy.mean(sil))

    imax = numpy.argmax(silAll)
    nSpeakersFinal = sRange[imax]

    cls = numpy.zeros((numOfWindows, ))
    for i in range(numOfWindows):
        j = numpy.argmin(numpy.abs(i - iNonOutLiers))
        cls[i] = clsAll[imax][j]

    startprob, transmat, means, cov = trainHMM(MidTermFeaturesNormOr, cls)
    hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], 'diag')
    hmm.startprob_ = startprob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov
    cls = hmm.predict(MidTermFeaturesNormOr.T)
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = silAll[imax]
    classNames = ['SPEAKER{0:d}'.format(c) for c in range(nSpeakersFinal)]

    return cls, classNames, duration, mtStep, silAll
def main(argv):
    dirName = argv[1]
    types = ('*.wav', )
    filesList = []
    for files in types:
        filesList.extend(glob.glob(os.path.join(dirName, files)))
    filesList = sorted(filesList)
    WIDTH_SEC = 2.4
    stWin = 0.020
    stStep = 0.015
    WIDTH = int(WIDTH_SEC / stStep)  # window width in short-term frames (int, so the slicing below works)

    for f in filesList:
        [Fs, x] = audioBasicIO.readAudioFile(f)
        print(Fs)
        x = audioBasicIO.stereo2mono(x)
        specgramOr, TimeAxis, FreqAxis = aF.stSpectogram(
            x, Fs, round(Fs * stWin), round(Fs * stStep), False)
        if specgramOr.shape[0] > WIDTH:
            specgram = specgramOr[int(specgramOr.shape[0] / 2) -
                                  WIDTH / 2:int(specgramOr.shape[0] / 2) +
                                  WIDTH / 2, :]
            specgram = scipy.misc.imresize(specgram,
                                           float(227.0) /
                                           float(specgram.shape[0]),
                                           interp='bilinear')
            print(specgram.shape)
            im = Image.fromarray(numpy.uint8(
                matplotlib.cm.jet(specgram) * 255))
            #plt.imshow(im)
            scipy.misc.imsave(f.replace(".wav", ".jpg"), im)

            if int(specgramOr.shape[0] / 2) - WIDTH / 2 - int(
                (0.2) / stStep) > 0:
                specgram = specgramOr[
                    int(specgramOr.shape[0] / 2) - WIDTH / 2 -
                    int((0.2) / stStep):int(specgramOr.shape[0] / 2) +
                    WIDTH / 2 - int((0.2) / stStep), :]
                specgram = scipy.misc.imresize(specgram,
                                               float(227.0) /
                                               float(specgram.shape[0]),
                                               interp='bilinear')
                im = Image.fromarray(
                    numpy.uint8(matplotlib.cm.jet(specgram) * 255))
                print(specgram.shape)
                scipy.misc.imsave(f.replace(".wav", "_02A.jpg"), im)

                specgram = specgramOr[
                    int(specgramOr.shape[0] / 2) - WIDTH / 2 +
                    int((0.2) / stStep):int(specgramOr.shape[0] / 2) +
                    WIDTH / 2 + int((0.2) / stStep), :]
                specgram = scipy.misc.imresize(specgram,
                                               float(227.0) /
                                               float(specgram.shape[0]),
                                               interp='bilinear')
                print specgram.shape
                im = Image.fromarray(
                    numpy.uint8(matplotlib.cm.jet(specgram) * 255))
                scipy.misc.imsave(f.replace(".wav", "_02B.jpg"), im)

                # ONLY FOR SPEECH (fewer samples). Must be commented out for music:
                """specgram = specgramOr[int(specgramOr.shape[0]/2) - WIDTH/2 - int((0.1) / stStep):int(specgramOr.shape[0]/2) + WIDTH/2 - int((0.1) / stStep), :]"""
Example #19
def speakerDiarization(fileName, sRange = xrange(2, 10), mtSize = 2.0, mtStep = 0.2, stWin = 0.05, LDAdim = 35):
	Fs, x = audioBasicIO.readAudioFile(fileName)
	x = audioBasicIO.stereo2mono(x)
	duration = len(x) / Fs

	Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1 = aT.loadKNNModel(os.path.join('/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data', 'knnSpeakerAll'))
	Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2 = aT.loadKNNModel(os.path.join('/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data', 'knnSpeakerFemaleMale'))

	MidTermFeatures, ShortTermFeatures = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stWin * 0.5))
	MidTermFeatures2 = numpy.zeros((MidTermFeatures.shape[0] + len(classNames1) + len(classNames2), MidTermFeatures.shape[1]))

	for i in range(MidTermFeatures.shape[1]):
		curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
		curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2

		Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1)
		Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2)

		MidTermFeatures2[0: MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
		MidTermFeatures2[MidTermFeatures.shape[0]: MidTermFeatures.shape[0] + len(classNames1), i] = P1 + 0.0001
		MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1):, i] = P2 + 0.0001

	MidTermFeatures = MidTermFeatures2
	iFeaturesSelect = range(8, 21) + range(41, 54)
	MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]

	MidTermFeaturesNorm, MEAN, STD = aT.normalizeFeatures([MidTermFeatures.T])
	MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
	numOfWindows = MidTermFeatures.shape[1]

	DistancesAll = numpy.sum(distance.squareform(distance.pdist(MidTermFeaturesNorm.T)), axis = 0)
	MDistancesAll = numpy.mean(DistancesAll)
	iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]

	perOutLier = (100.0 * (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows
	MidTermFeaturesNormOr = MidTermFeaturesNorm
	MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

	if LDAdim > 0:
		mtWinRatio, mtStepRatio, mtFeaturesToReduce, numOfFeatures, numOfStatistics = int(round(mtSize / stWin)), int(round(stWin / stWin)), list(), len(ShortTermFeatures), 2
		for i in range(numOfStatistics * numOfFeatures): mtFeaturesToReduce.append(list())

		for i in range(numOfFeatures):
			curPos = 0
			N = len(ShortTermFeatures[i])
			while (curPos < N):
				N1, N2 = curPos, curPos + mtWinRatio
				if N2 > N: N2 = N
				curStFeatures = ShortTermFeatures[i][N1: N2]
				mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
				mtFeaturesToReduce[i + numOfFeatures].append(numpy.std(curStFeatures))
				curPos += mtStepRatio

		mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
		mtFeaturesToReduce2 = numpy.zeros((mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2), mtFeaturesToReduce.shape[1]))
		for i in range(mtFeaturesToReduce.shape[1]):
			curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
			curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
			Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1)
			Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2)
			mtFeaturesToReduce2[0: mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i]
			mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]: mtFeaturesToReduce.shape[0] + len(classNames1), i] = P1 + 0.0001
			mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] + len(classNames1):, i] = P2 + 0.0001

		mtFeaturesToReduce = mtFeaturesToReduce2
		mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
		mtFeaturesToReduce, MEAN, STD = aT.normalizeFeatures([mtFeaturesToReduce.T])
		mtFeaturesToReduce = mtFeaturesToReduce[0].T
	
		Labels = numpy.zeros((mtFeaturesToReduce.shape[1], ))
		LDAstep = 1.0
		LDAstepRatio = LDAstep / stWin

		for i in range(Labels.shape[0]): Labels[i] = int(i * stWin / LDAstepRatio)
		clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(n_components = LDAdim)
		clf.fit(mtFeaturesToReduce.T, Labels)

		MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

	clsAll, silAll, centersAll = list(), list(), list()

	for iSpeakers in sRange:
		k_means = sklearn.cluster.KMeans(n_clusters = iSpeakers)
		k_means.fit(MidTermFeaturesNorm.T)
		cls = k_means.labels_
		means = k_means.cluster_centers_

		clsAll.append(cls)
		centersAll.append(means)
		silA, silB = list(), list()
		for c in range(iSpeakers):
			clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float(len(cls))
			if clusterPerCent < 0.02:
				silA.append(0.0)
				silB.append(0.0)
			else:
				MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]
				Yt = distance.pdist(MidTermFeaturesNormTemp.T)
				silA.append(numpy.mean(Yt) * clusterPerCent)
				silBs = list()
				for c2 in range(iSpeakers):
					if c2 != c:
						clusterPerCent2 = numpy.nonzero(cls == c2)[0].shape[0] / float(len(cls))
						MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:, cls == c2]
						Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T)
						silBs.append(numpy.mean(Yt) * (clusterPerCent+clusterPerCent2) / 2.0)
				silBs = numpy.array(silBs)
				silB.append(min(silBs))
		silA, silB, sil = numpy.array(silA), numpy.array(silB), list()
		for c in range(iSpeakers): sil.append((silB[c] - silA[c]) / (max(silB[c],  silA[c]) + 0.00001))
		silAll.append(numpy.mean(sil))

	imax = numpy.argmax(silAll)
	nSpeakersFinal = sRange[imax]

	cls = numpy.zeros((numOfWindows, ))
	for i in range(numOfWindows):
		j = numpy.argmin(numpy.abs(i - iNonOutLiers))
		cls[i] = clsAll[imax][j]

	startprob, transmat, means, cov = trainHMM(MidTermFeaturesNormOr, cls)
	hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], 'diag')
	hmm.startprob_ = startprob
	hmm.transmat_ = transmat
	hmm.means_ = means
	hmm.covars_ = cov
	cls = hmm.predict(MidTermFeaturesNormOr.T)
	cls = scipy.signal.medfilt(cls, 13)
	cls = scipy.signal.medfilt(cls, 11)

	sil = silAll[imax]
	classNames = ['SPEAKER{0:d}'.format(c) for c in range(nSpeakersFinal)]

	return cls, classNames, duration, mtStep, silAll
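A minimal, hypothetical driver for the variant above; the file path is a placeholder and 0.2 s is the default mtStep returned as step:

if __name__ == '__main__':
    cls, names, duration, step, sils = speakerDiarization('meeting.wav')
    for i, c in enumerate(cls):
        print('%6.2f s  %s' % (i * step, names[int(c)]))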
Example #20
def dirWavFeatureExtraction(dirName,
                            mt_win,
                            mt_step,
                            st_win,
                            st_step,
                            feats,
                            compute_beat=False):
    """
    This function extracts the mid-term features of the WAVE files of a particular folder.

    The resulting feature vector is extracted by long-term averaging the mid-term features.
    Therefore ONE FEATURE VECTOR is extracted for each WAV file.

    ARGUMENTS:
        - dirName:        the path of the WAVE directory
        - mt_win, mt_step:    mid-term window and step (in seconds)
        - st_win, st_step:    short-term window and step (in seconds)
    """

    all_mt_feats = numpy.array([])
    process_times = []

    types = ('*.wav', '*.aif', '*.aiff', '*.mp3', '*.au', '*.ogg')
    wav_file_list = []
    for files in types:
        wav_file_list.extend(glob.glob(os.path.join(dirName, files)))

    wav_file_list = sorted(wav_file_list)
    wav_file_list2, mt_feature_names = [], []
    for i, wavFile in enumerate(wav_file_list):
        print("Analyzing file {0:d} of "
              "{1:d}: {2:s}".format(i + 1, len(wav_file_list), wavFile))
        if os.stat(wavFile).st_size == 0:
            print("   (EMPTY FILE -- SKIPPING)")
            continue
        [fs, x] = audioBasicIO.readAudioFile(wavFile)
        if isinstance(x, int):
            continue

        t1 = time.clock()
        x = audioBasicIO.stereo2mono(x)
        if x.shape[0] < float(fs) / 5:
            print("  (AUDIO FILE TOO SMALL - SKIPPING)")
            continue
        wav_file_list2.append(wavFile)
        if compute_beat:
            [mt_term_feats, st_features, mt_feature_names] = \
                mtFeatureExtraction(x, fs, round(mt_win * fs),
                                    round(mt_step * fs),
                                    round(fs * st_win), round(fs * st_step), feats)
            [beat, beat_conf] = beatExtraction(st_features, st_step)
        else:
            [mt_term_feats, _, mt_feature_names] = \
                mtFeatureExtraction(x, fs, round(mt_win * fs),
                                    round(mt_step * fs),
                                    round(fs * st_win), round(fs * st_step), feats)

        mt_term_feats = numpy.transpose(mt_term_feats)
        mt_term_feats = mt_term_feats.mean(axis=0)
        # long term averaging of mid-term statistics
        if (not numpy.isnan(mt_term_feats).any()) and \
                (not numpy.isinf(mt_term_feats).any()):
            if compute_beat:
                mt_term_feats = numpy.append(mt_term_feats, beat)
                mt_term_feats = numpy.append(mt_term_feats, beat_conf)
            if len(all_mt_feats) == 0:
                # append feature vector
                all_mt_feats = mt_term_feats
            else:
                all_mt_feats = numpy.vstack((all_mt_feats, mt_term_feats))
            t2 = time.clock()
            duration = float(len(x)) / fs
            process_times.append((t2 - t1) / duration)
    if len(process_times) > 0:
        print("Feature extraction complexity ratio: "
              "{0:.1f} x realtime".format(
                  (1.0 / numpy.mean(numpy.array(process_times)))))
    return (all_mt_feats, wav_file_list2, mt_feature_names)
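Hypothetical usage, producing one long-term averaged vector per file. The directory, window sizes, and the format of the feats argument (assumed here to be a list of feature-group names) are placeholders, since that parameter varies across library versions:

X, files, names = dirWavFeatureExtraction('data/music', 1.0, 1.0,
                                          0.050, 0.050,
                                          feats=["mfcc", "gfcc"])
print(X.shape)   # (n_files, n_mid_term_statistics), one row per WAV file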
Example #21
def mtFileClassification(inputFile, modelName, modelType, plotResults=False, gtFile=""):
    '''
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used, i.e. a pre-trained classifier.
    ARGUMENTS:
        - inputFile:        path of the input WAV file
        - modelName:        name of the classification model
        - modelType:        svm or knn depending on the classifier type
        - plotResults:      True if results are to be plotted using matplotlib along with a set of statistics

    RETURNS:
          - segs:           a sequence of segment endpoints: segs[i] is the endpoint of the i-th segment (in seconds)
          - classes:        a sequence of class flags: classes[i] is the class ID of the i-th segment
    '''

    if not os.path.isfile(modelName):
        print("mtFileClassificationError: input modelType not found!")
        return (-1, -1, -1, -1)
    # Load classifier:
    if (modelType == 'svm') or (modelType == 'svm_rbf'):
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin,
            stStep, computeBEAT] = aT.loadSVModel(modelName)
    elif modelType == 'knn':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin,
            stStep, computeBEAT] = aT.loadKNNModel(modelName)
    elif modelType == 'randomforest':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin,
            stStep, computeBEAT] = aT.loadRandomForestModel(modelName)
    elif modelType == 'gradientboosting':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            computeBEAT] = aT.loadGradientBoostingModel(modelName)
    elif modelType == 'extratrees':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin,
            stStep, computeBEAT] = aT.loadExtraTreesModel(modelName)

    if computeBEAT:
        print("Model " + modelName +
              " contains long-term music features (beat etc) and cannot be used in segmentation")
        return (-1, -1, -1, -1)
    [Fs, x] = audioBasicIO.readAudioFile(inputFile)        # load input file
    if Fs == -1:                                           # could not read file
        return (-1, -1, -1, -1)
    # convert stereo (if) to mono
    x = audioBasicIO.stereo2mono(x)
    Duration = len(x) / Fs
    # mid-term feature extraction:
    [MidTermFeatures, _] = aF.mtFeatureExtraction(
        x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stStep))
    flags = []
    Ps = []
    flagsInd = []
    # for each feature vector (i.e. for each fixed-size segment):
    for i in range(MidTermFeatures.shape[1]):
        # normalize current feature vector
        curFV = (MidTermFeatures[:, i] - MEAN) / STD
        [Result, P] = aT.classifierWrapper(
            Classifier, modelType, curFV)    # classify vector
        flagsInd.append(Result)
        # update class label matrix
        flags.append(classNames[int(Result)])
        # update probability matrix
        Ps.append(numpy.max(P))
    flagsInd = numpy.array(flagsInd)

    # 1-window smoothing
    for i in range(1, len(flagsInd) - 1):
        if flagsInd[i - 1] == flagsInd[i + 1]:
            flagsInd[i] = flagsInd[i + 1]
    # convert fixed-size flags to segments and classes
    (segs, classes) = flags2segs(flags, mtStep)
    segs[-1] = len(x) / float(Fs)

    # Load ground-truth:
    if os.path.isfile(gtFile):
        [segStartGT, segEndGT, segLabelsGT] = readSegmentGT(gtFile)
        flagsGT, classNamesGT = segs2flags(
            segStartGT, segEndGT, segLabelsGT, mtStep)
        flagsIndGT = []
        for j, fl in enumerate(flagsGT):                    # "align" labels with GT
            if classNamesGT[flagsGT[j]] in classNames:
                flagsIndGT.append(classNames.index(classNamesGT[flagsGT[j]]))
            else:
                flagsIndGT.append(-1)
        flagsIndGT = numpy.array(flagsIndGT)
        CM = numpy.zeros((len(classNamesGT), len(classNamesGT)))
        for i in range(min(flagsInd.shape[0], flagsIndGT.shape[0])):
            CM[int(flagsIndGT[i]), int(flagsInd[i])] += 1
    else:
        CM = []
        flagsIndGT = numpy.array([])
    acc = plotSegmentationResults(
        flagsInd, flagsIndGT, classNames, mtStep, not plotResults)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc))
        return (flagsInd, classNamesGT, acc, CM)
    else:
        return (flagsInd, classNames, acc, CM)
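flags2segs (called above) merges runs of identical per-window flags into (start, end) segments. A minimal stand-alone sketch of that conversion, assuming a list of labels and a window step in seconds:

import numpy

def flags_to_segments(flags, step):
    # flags: one label per fixed-size window; step: window step in seconds
    segs, classes = [], []
    start = 0
    for i in range(1, len(flags) + 1):
        if i == len(flags) or flags[i] != flags[start]:
            segs.append([start * step, i * step])
            classes.append(flags[start])
            start = i
    return numpy.array(segs), classes

# e.g. flags_to_segments(['sp', 'sp', 'mu', 'mu', 'mu'], 1.0)
# -> ([[0., 2.], [2., 5.]], ['sp', 'mu'])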
Example #22
         feeling_list.append('male_surprised')
     elif (item[6:8]=='08' and int(item[18:20])%2==0) or (item[:3]=='sor' and item[4]=='f'):
         feeling_list.append('female_surprised')
     elif item[:1] == 'd' or (item[:3]=='dis' and item[4]=='m') or (item[6:8]=='07' and int(item[18:20])%2==1):
         feeling_list.append('male_disgust')
     elif (item[:3]=='dis' and item[4]=='f') or (item[6:8]=='07' and int(item[18:20])%2==0):
         feeling_list.append('female_disgust')
     stdout.write('\033[F')
     
 labels = np.array(feeling_list)
 np.save(EMOTION_LABEL_PICKLE, labels)
 bookmark=0
 for file_name in fileList:
     print('[INFO] Extracting features - {} / {} - {}'.format(bookmark + 1, len(fileList), file_name))
     [Fs, x] = audioBasicIO.readAudioFile(args['dataset'] + '/' + file_name)
     x = audioBasicIO.stereo2mono(x)
     features, feature_names = audioFeatureExtraction.stFeatureExtraction(x, Fs, FRAME_SIZE * Fs, FRAME_SIZE / 2 * Fs);
     # npArray = np.array([np.array(feature, copy=False) for feature in features], copy=False, ndmin=3)
     npArray = np.array(features, ndmin=3)
     if (bookmark == 0):
         # soundData = pd.DataFrame(data=npArray, columns=feature_names)
         soundData = npArray
         columns = np.array(feature_names)
     else:
         if soundData.shape[2] > npArray.shape[2]:
             npArray = np.pad(npArray, ( (0, 0), (0, 0), (0, soundData.shape[2] - npArray.shape[2]) ), 'constant')
         else:
             soundData = np.pad(soundData, ( (0, 0), (0, 0), (0, npArray.shape[2] - soundData.shape[2]) ), 'constant')
         soundData = np.concatenate((soundData, npArray))
     bookmark=bookmark+1
     stdout.write('\033[F')
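The np.pad calls above zero-pad the shorter of two (1 x n_features x n_frames) matrices along the frame axis before stacking; the same idea isolated as a sketch:

import numpy as np

def stack_padded(a, b):
    # zero-pad the shorter array along the last (frame) axis, then stack
    diff = a.shape[-1] - b.shape[-1]
    if diff > 0:
        b = np.pad(b, ((0, 0), (0, 0), (0, diff)), 'constant')
    elif diff < 0:
        a = np.pad(a, ((0, 0), (0, 0), (0, -diff)), 'constant')
    return np.concatenate((a, b))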
Example #23
import time

#set chunk size in seconds to work with
chunk_size = 3
start_time = time.time()
Fs = wf.getframerate()
CHUNK = Fs * chunk_size

#read next chunk from audio file
data = wf.readframes(CHUNK)

#iterate over track
while data != '':
    #stream.write(data)
    array = _wav2array(wf.getnchannels(), wf.getsampwidth(), data)
    array = audioBasicIO.stereo2mono(array)
    #extract features
    MidTermFeatures = aF.mtFeatureExtraction(array, Fs, mtWin * Fs,
                                             mtStep * Fs, round(Fs * stWin),
                                             round(Fs * stStep))
    MidTermFeatures = MidTermFeatures[0]

    #classify chunks to speech/music
    flags = []
    Ps = []
    flagsInd = []
    # for each feature vector (i.e. for each fixed-size segment):
    for i in range(MidTermFeatures.shape[1]):
        curFV = (MidTermFeatures[:, i] -
                 MEAN) / STD  # normalize current feature vector
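The chunked loop above is cut off mid-iteration. Under the same variable names (Classifier, classNames, MEAN and STD assumed loaded earlier, e.g. via aT.loadSVModel), it would typically finish roughly as sketched here; this is a guess at the missing tail, not the original code:

    for i in range(MidTermFeatures.shape[1]):
        curFV = (MidTermFeatures[:, i] - MEAN) / STD
        res, P = aT.classifierWrapper(Classifier, 'svm', curFV)  # assumed model
        flagsInd.append(res)
        flags.append(classNames[int(res)])
        Ps.append(numpy.max(P))
    data = wf.readframes(CHUNK)   # advance to the next chunk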
def mtFileClassification(input_file,
                         model_name,
                         model_type,
                         plot_results=False,
                         gt_file=""):
    '''
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used, i.e. a pre-trained classifier.
    ARGUMENTS:
        - input_file:        path of the input WAV file
        - model_name:        name of the classification model
        - model_type:        svm or knn depending on the classifier type
        - plot_results:      True if results are to be plotted using
                             matplotlib along with a set of statistics

    RETURNS:
          - segs:           a sequence of segment endpoints: segs[i] is the
                            endpoint of the i-th segment (in seconds)
          - classes:        a sequence of class flags: classes[i] is the
                            class ID of the i-th segment
    '''

    if not os.path.isfile(model_name):
        print("mtFileClassificationError: input model_type not found!")
        return (-1, -1, -1, -1)
    # Load classifier:
    if model_type == "knn":
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win, st_step, compute_beat] = \
            aT.load_model_knn(model_name)
    else:
        [
            classifier, MEAN, STD, class_names, mt_win, mt_step, st_win,
            st_step, compute_beat
        ] = aT.load_model(model_name)

    if compute_beat:
        print("Model " + model_name + " contains long-term music features "
              "(beat etc) and cannot be used in "
              "segmentation")
        return (-1, -1, -1, -1)
    [fs, x] = audioBasicIO.readAudioFile(input_file)  # load input file
    if fs == -1:  # could not read file
        return (-1, -1, -1, -1)
    x = audioBasicIO.stereo2mono(x)  # convert stereo (if) to mono
    duration = len(x) / fs
    # mid-term feature extraction:
    [mt_feats, _, _] = aF.mtFeatureExtraction(x, fs, mt_win * fs, mt_step * fs,
                                              round(fs * st_win),
                                              round(fs * st_step))
    flags = []
    Ps = []
    flags_ind = []
    # for each feature vector (i.e. for each fixed-size segment):
    for i in range(mt_feats.shape[1]):
        cur_fv = (mt_feats[:, i] -
                  MEAN) / STD  # normalize current feature vector
        [res, P] = aT.classifierWrapper(classifier, model_type,
                                        cur_fv)  # classify vector
        flags_ind.append(res)
        flags.append(class_names[int(res)])  # update class label matrix
        Ps.append(numpy.max(P))  # update probability matrix
    flags_ind = numpy.array(flags_ind)

    # 1-window smoothing
    for i in range(1, len(flags_ind) - 1):
        if flags_ind[i - 1] == flags_ind[i + 1]:
            flags_ind[i] = flags_ind[i + 1]
    # convert fixed-size flags to segments and classes
    (segs, classes) = flags2segs(flags, mt_step)
    segs[-1] = len(x) / float(fs)

    # Load ground-truth:
    if os.path.isfile(gt_file):
        [seg_start_gt, seg_end_gt, seg_l_gt] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start_gt, seg_end_gt,
                                              seg_l_gt, mt_step)
        flags_ind_gt = []
        for j, fl in enumerate(flags_gt):
            # "align" labels with GT
            if class_names_gt[flags_gt[j]] in class_names:
                flags_ind_gt.append(
                    class_names.index(class_names_gt[flags_gt[j]]))
            else:
                flags_ind_gt.append(-1)
        flags_ind_gt = numpy.array(flags_ind_gt)
        cm = numpy.zeros((len(class_names_gt), len(class_names_gt)))
        for i in range(min(flags_ind.shape[0], flags_ind_gt.shape[0])):
            cm[int(flags_ind_gt[i]), int(flags_ind[i])] += 1
    else:
        cm = []
        flags_ind_gt = numpy.array([])
    acc = plotSegmentationResults(flags_ind, flags_ind_gt, class_names,
                                  mt_step, not plot_results)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc))
        return (flags_ind, class_names_gt, acc, cm)
    else:
        return (flags_ind, class_names, acc, cm)
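A hypothetical invocation; the model path, model type, and ground-truth file are placeholders:

flags, names, acc, cm = mtFileClassification('radio.wav', 'data/svmSM',
                                             'svm', plot_results=False,
                                             gt_file='radio.segments')
print(acc)   # negative if no ground truth was available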
Example #25
def main(argv):
    if argv[2] == 'full':
        dirName = argv[1]
        types = ('*.wav', )
        filesList = []
        for files in types:
            filesList.extend(glob.glob(os.path.join(dirName, files)))
        filesList = sorted(filesList)

        filesListIrr = []

        filesListIrr = sorted(filesListIrr)

        stWin = 0.020
        stStep = 0.015
        for f in filesList:
            [Fs, x] = audioBasicIO.readAudioFile(f)
            x = audioBasicIO.stereo2mono(x)
            createSpectrogramFile(x, Fs, f.replace(".wav", ".png"), stWin,
                                  stStep)

    else:
        dirName = argv[1]
        dirNameIrrelevant = argv[2]
        types = ('*.wav', )
        filesList = []
        for files in types:
            filesList.extend(glob.glob(os.path.join(dirName, files)))
        filesList = sorted(filesList)

        filesListIrr = []
        for files in types:
            filesListIrr.extend(
                glob.glob(os.path.join(dirNameIrrelevant, files)))
        filesListIrr = sorted(filesListIrr)
        print filesListIrr

        WIDTH_SEC = 1.5
        stWin = 0.040
        stStep = 0.005
        WIDTH = WIDTH_SEC / stStep

        for f in filesList:
            print f
            [Fs, x] = audioBasicIO.readAudioFile(f)
            x = audioBasicIO.stereo2mono(x)
            x = x.astype(float) / x.max()
            for i in range(3):
                if x.shape[0] > WIDTH_SEC * Fs + 200:
                    randStartSignal = random.randrange(
                        0, int(x.shape[0] - WIDTH_SEC * Fs - 200))
                    x2 = x[randStartSignal:randStartSignal +
                           int((WIDTH_SEC + stStep) * Fs)]
                    createSpectrogramFile(x2, Fs, f.replace(".wav", ".png"),
                                          stWin, stStep)  # ORIGINAL

                    if len(dirNameIrrelevant) > 0:
                        # AUGMENTED
                        randIrrelevant = random.randrange(0, len(filesListIrr))
                        [Fs, xnoise] = audioBasicIO.readAudioFile(
                            filesListIrr[randIrrelevant])
                        xnoise = xnoise.astype(float) / xnoise.max()

                        randStartNoise = random.randrange(
                            0, xnoise.shape[0] - WIDTH_SEC * Fs - 200)
                        R = 5
                        xN = (R * x2.astype(float) +
                              xnoise[randStartNoise:randStartNoise +
                                     x2.shape[0]].astype(float)) / float(R + 1)
                        wavfile.write(
                            f.replace(".wav", "_rnoise{0:d}1.wav".format(i)),
                            Fs, (16000 * xN).astype('int16'))
                        createSpectrogramFile(
                            xN, Fs,
                            f.replace(".wav", "_rnoise{0:d}1.png".format(i)),
                            stWin, stStep)

                        randStartNoise = random.randrange(
                            0, xnoise.shape[0] - WIDTH_SEC * Fs - 200)
                        R = 4
                        xN = (R * x2.astype(float) +
                              xnoise[randStartNoise:randStartNoise +
                                     x2.shape[0]].astype(float)) / float(R + 1)
                        wavfile.write(
                            f.replace(".wav", "_rnoise{0:d}2.wav".format(i)),
                            Fs, (16000 * xN).astype('int16'))
                        createSpectrogramFile(
                            xN, Fs,
                            f.replace(".wav", "_rnoise{0:d}2.png".format(i)),
                            stWin, stStep)

                        randStartNoise = random.randrange(
                            0, xnoise.shape[0] - WIDTH_SEC * Fs - 200)
                        R = 3
                        xN = (R * x2.astype(float) +
                              xnoise[randStartNoise:randStartNoise +
                                     x2.shape[0]].astype(float)) / float(R + 1)
                        wavfile.write(
                            f.replace(".wav", "_rnoise{0:d}3.wav".format(i)),
                            Fs, (16000 * xN).astype('int16'))
                        createSpectrogramFile(
                            xN, Fs,
                            f.replace(".wav", "_rnoise{0:d}3.png".format(i)),
                            stWin, stStep)

                        #specgramOr, TimeAxis, FreqAxis = aF.stSpectogram(x2, Fs, round(Fs * stWin), round(Fs * stStep), False)
                        #im2 = Image.fromarray(numpy.uint8(matplotlib.cm.jet(specgram)*255))
                        #plt.subplot(2,1,1)
                        #plt.imshow(im1)
                        #plt.subplot(2,1,2)
                        #plt.imshow(im2)
                        #plt.show()
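The augmentation above mixes each random excerpt with irrelevant noise as a weighted average, R parts signal to one part noise; that core operation isolated as a sketch:

import numpy

def mix_with_noise(x2, xnoise, start, R):
    # weighted average of the signal and a noise excerpt of the same length
    n = xnoise[start:start + x2.shape[0]].astype(float)
    return (R * x2.astype(float) + n) / float(R + 1)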
def dirWavFeatureExtraction(dirName, mt_win, mt_step, st_win, st_step,
                            compute_beat=False):
    """
    This function extracts the mid-term features of the WAVE files of a particular folder.

    The resulting feature vector is extracted by long-term averaging the mid-term features.
    Therefore ONE FEATURE VECTOR is extracted for each WAV file.

    ARGUMENTS:
        - dirName:        the path of the WAVE directory
        - mt_win, mt_step:    mid-term window and step (in seconds)
        - st_win, st_step:    short-term window and step (in seconds)
    """

    all_mt_feats = numpy.array([])
    process_times = []

    types = ('*.wav', '*.aif',  '*.aiff', '*.mp3', '*.au', '*.ogg')
    wav_file_list = []
    for files in types:
        wav_file_list.extend(glob.glob(os.path.join(dirName, files)))

    wav_file_list = sorted(wav_file_list)    
    wav_file_list2, mt_feature_names = [], []
    for i, wavFile in enumerate(wav_file_list):        
        print("Analyzing file {0:d} of "
              "{1:d}: {2:s}".format(i+1,
                                    len(wav_file_list),
                                    wavFile))
        if os.stat(wavFile).st_size == 0:
            print("   (EMPTY FILE -- SKIPPING)")
            continue        
        [fs, x] = audioBasicIO.readAudioFile(wavFile)
        if isinstance(x, int):
            continue        

        t1 = time.clock()        
        x = audioBasicIO.stereo2mono(x)
        if x.shape[0]<float(fs)/5:
            print("  (AUDIO FILE TOO SMALL - SKIPPING)")
            continue
        wav_file_list2.append(wavFile)
        if compute_beat:
            [mt_term_feats, st_features, mt_feature_names] = \
                mtFeatureExtraction(x, fs, round(mt_win * fs),
                                    round(mt_step * fs),
                                    round(fs * st_win), round(fs * st_step))
            [beat, beat_conf] = beatExtraction(st_features, st_step)
        else:
            [mt_term_feats, _, mt_feature_names] = \
                mtFeatureExtraction(x, fs, round(mt_win * fs),
                                    round(mt_step * fs),
                                    round(fs * st_win), round(fs * st_step))

        mt_term_feats = numpy.transpose(mt_term_feats)
        mt_term_feats = mt_term_feats.mean(axis=0)
        # long term averaging of mid-term statistics
        if (not numpy.isnan(mt_term_feats).any()) and \
                (not numpy.isinf(mt_term_feats).any()):
            if compute_beat:
                mt_term_feats = numpy.append(mt_term_feats, beat)
                mt_term_feats = numpy.append(mt_term_feats, beat_conf)
            if len(all_mt_feats) == 0:
                # append feature vector
                all_mt_feats = mt_term_feats
            else:
                all_mt_feats = numpy.vstack((all_mt_feats, mt_term_feats))
            t2 = time.clock()
            duration = float(len(x)) / fs
            process_times.append((t2 - t1) / duration)
    if len(process_times) > 0:
        print("Feature extraction complexity ratio: "
              "{0:.1f} x realtime".format((1.0 / numpy.mean(numpy.array(process_times)))))
    return (all_mt_feats, wav_file_list2, mt_feature_names)
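For the complexity ratio printed above: each entry of process_times is processing time per second of audio, so if a 10-second file takes 0.5 s to analyze, the stored value is 0.05 and the reported ratio is 1 / 0.05 = 20.0 x realtime.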
Example #27
def speakerDiarization(filename,
                       n_speakers,
                       mt_size=2.0,
                       mt_step=0.2,
                       st_win=0.05,
                       lda_dim=35,
                       plot_res=False):
    '''
    ARGUMENTS:
        - filename:        the name of the WAV file to be analyzed
        - n_speakers:      the number of speakers (clusters) in the recording (<=0 for unknown)
        - mt_size (opt)     mid-term window size
        - mt_step (opt)     mid-term window step
        - st_win  (opt)     short-term window size
        - lda_dim (opt)     LDA dimension (0 for no LDA)
        - plot_res     (opt)   0 for not plotting the results, 1 for plotting
    '''
    [fs, x] = audioBasicIO.readAudioFile(filename)
    x = audioBasicIO.stereo2mono(x)
    duration = len(x) / fs

    [
        classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1,
        stStep1, computeBEAT1
    ] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "data",
                     "knnSpeakerAll"))
    [
        classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2,
        stStep2, computeBEAT2
    ] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "data",
                     "knnSpeakerFemaleMale"))

    [mt_feats, st_feats, _] = aF.mtFeatureExtraction(x, fs, mt_size * fs,
                                                     mt_step * fs,
                                                     round(fs * st_win),
                                                     round(fs * st_win * 0.5))

    MidTermFeatures2 = numpy.zeros(
        (mt_feats.shape[0] + len(classNames1) + len(classNames2),
         mt_feats.shape[1]))

    for i in range(mt_feats.shape[1]):
        cur_f1 = (mt_feats[:, i] - MEAN1) / STD1
        cur_f2 = (mt_feats[:, i] - MEAN2) / STD2
        [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
        [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
        MidTermFeatures2[0:mt_feats.shape[0], i] = mt_feats[:, i]
        MidTermFeatures2[mt_feats.shape[0]:mt_feats.shape[0] +
                         len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[mt_feats.shape[0] + len(classNames1)::,
                         i] = P2 + 0.0001

    mt_feats = MidTermFeatures2  # TODO
    iFeaturesSelect = [
        8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45,
        46, 47, 48, 49, 50, 51, 52, 53
    ]

    mt_feats = mt_feats[iFeaturesSelect, :]

    (mt_feats_norm, MEAN, STD) = aT.normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    n_wins = mt_feats.shape[1]

    # remove outliers:
    dist_all = numpy.sum(distance.squareform(distance.pdist(mt_feats_norm.T)),
                         axis=0)
    m_dist_all = numpy.mean(dist_all)
    i_non_outliers = numpy.nonzero(dist_all < 1.2 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = numpy.min(mt_feats[1,:])
    #EnergyMean = numpy.mean(mt_feats[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #i_non_outliers = numpy.nonzero(mt_feats[1,:] > Thres)[0]
    #print i_non_outliers

    perOutLier = (100.0 * (n_wins - i_non_outliers.shape[0])) / n_wins
    mt_feats_norm_or = mt_feats_norm
    mt_feats_norm = mt_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        #[mt_feats_to_red, _, _] = aF.mtFeatureExtraction(x, fs, mt_size * fs, st_win * fs, round(fs*st_win), round(fs*st_win));
        # extract mid-term features with minimum step:
        mt_win_ratio = int(round(mt_size / st_win))
        mt_step_ratio = int(round(st_win / st_win))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        #for i in range(num_of_stats * num_of_features + 1):
        for i in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        for i in range(
                num_of_features):  # for each of the short-term features:
            curPos = 0
            N = len(st_feats[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mt_win_ratio
                if N2 > N:
                    N2 = N
                curStFeatures = st_feats[i][N1:N2]
                mt_feats_to_red[i].append(numpy.mean(curStFeatures))
                mt_feats_to_red[i + num_of_features].append(
                    numpy.std(curStFeatures))
                curPos += mt_step_ratio
        mt_feats_to_red = numpy.array(mt_feats_to_red)
        mt_feats_to_red_2 = numpy.zeros(
            (mt_feats_to_red.shape[0] + len(classNames1) + len(classNames2),
             mt_feats_to_red.shape[1]))
        for i in range(mt_feats_to_red.shape[1]):
            cur_f1 = (mt_feats_to_red[:, i] - MEAN1) / STD1
            cur_f2 = (mt_feats_to_red[:, i] - MEAN2) / STD2
            [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
            [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0],
                              i] = mt_feats_to_red[:, i]
            mt_feats_to_red_2[
                mt_feats_to_red.shape[0]:mt_feats_to_red.shape[0] +
                len(classNames1), i] = P1 + 0.0001
            mt_feats_to_red_2[mt_feats_to_red.shape[0] + len(classNames1)::,
                              i] = P2 + 0.0001
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[iFeaturesSelect, :]
        #mt_feats_to_red += numpy.random.rand(mt_feats_to_red.shape[0], mt_feats_to_red.shape[1]) * 0.0000010
        (mt_feats_to_red, MEAN,
         STD) = aT.normalizeFeatures([mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        #dist_all = numpy.sum(distance.squareform(distance.pdist(mt_feats_to_red.T)), axis=0)
        #m_dist_all = numpy.mean(dist_all)
        #iNonOutLiers2 = numpy.nonzero(dist_all < 3.0*m_dist_all)[0]
        #mt_feats_to_red = mt_feats_to_red[:, iNonOutLiers2]
        Labels = numpy.zeros((mt_feats_to_red.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / st_win
        #print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * st_win / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, Labels)
        mt_feats_norm = (clf.transform(mt_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    clsAll = []
    sil_all = []
    centersAll = []

    for iSpeakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(mt_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        # Y = distance.squareform(distance.pdist(mt_feats_norm.T))
        clsAll.append(cls)
        centersAll.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(iSpeakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = numpy.nonzero(cls == c)[0].shape[0] / \
                             float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mt_feats_norm[:, cls == c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                Yt = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(numpy.mean(Yt) * clust_per_cent)
                silBs = []
                for c2 in range(iSpeakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = numpy.nonzero(cls == c2)[0].shape[0] /\
                                           float(len(cls))
                        MidTermFeaturesNormTemp2 = mt_feats_norm[:, cls == c2]
                        Yt = distance.cdist(mt_feats_norm_temp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(
                            numpy.mean(Yt) *
                            (clust_per_cent + clust_per_cent_2) / 2.0)
                silBs = numpy.array(silBs)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(silBs))
        sil_1 = numpy.array(sil_1)
        sil_2 = numpy.array(sil_2)
        sil = []
        for c in range(iSpeakers):
            # for each cluster (speaker) compute silhouette
            sil.append(
                (sil_2[c] - sil_1[c]) / (max(sil_2[c], sil_1[c]) + 0.00001))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(numpy.mean(sil))

    imax = numpy.argmax(sil_all)
    # optimal number of clusters
    nSpeakersFinal = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = numpy.zeros((n_wins, ))
    for i in range(n_wins):
        j = numpy.argmin(numpy.abs(i - i_non_outliers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):
        # hmm training
        start_prob, transmat, means, cov = \
            trainHMM_computeStatistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = sil_all[imax]
    class_names = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]

    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if ground-truth exists
    if os.path.isfile(gt_file):
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start, seg_end, seg_labs,
                                              mt_step)

    if plot_res:
        fig = plt.figure()
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(numpy.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(numpy.array(range(len(cls))) * mt_step + mt_step / 2.0, cls)

    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(
                numpy.array(range(len(flags_gt))) * mt_step + mt_step / 2.0,
                flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
            evaluateSpeakerDiarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(
                          100 * purity_cluster_m, 100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        #print s_range, sil_all
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's sillouette")
        plt.show()
    return cls
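A hypothetical call; the path is a placeholder, and n_speakers=0 triggers the silhouette-based search over 2 to 9 clusters shown above:

cls = speakerDiarization('interview.wav', 0, plot_res=False)
# cls[i] is the speaker label of window i; window i starts at
# i * mt_step seconds (0.2 s by default)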
Example #28
def silenceRemoval(x,
                   Fs,
                   stWin,
                   stStep,
                   smoothWindow=0.5,
                   Weight=0.5,
                   plot=False):
    '''
    Event Detection (silence removal)
    ARGUMENTS:
         - x:                the input audio signal
         - Fs:               sampling freq
         - stWin, stStep:    window size and step in seconds
         - smoothWindow:     (optional) smooth window (in seconds)
         - Weight:           (optional) weight factor (0 < Weight < 1); the higher, the stricter
         - plot:             (optional) True if results are to be plotted
    RETURNS:
         - segmentLimits:    list of segment limits in seconds, e.g. [[0.1, 0.9], [1.4, 3.0]] means that
                    the resulting segments run from 0.1 to 0.9 s and from 1.4 to 3.0 s
    '''

    if Weight >= 1:
        Weight = 0.99
    if Weight <= 0:
        Weight = 0.01

    # Step 1: feature extraction
    x = audioBasicIO.stereo2mono(x)  # convert to mono
    ShortTermFeatures = aF.stFeatureExtraction(
        x, Fs, stWin * Fs, stStep * Fs)  # extract short-term features

    # Step 2: train binary SVM classifier of low vs high energy frames
    EnergySt = ShortTermFeatures[
        1, :]  # keep only the energy short-term sequence (2nd feature)
    E = numpy.sort(EnergySt)  # sort the energy feature values:
    L1 = int(len(E) / 10)  # 10% of the total number of short-term windows
    T1 = numpy.mean(
        E[0:L1]) + 0.000000000000001  # compute "lower" 10% energy threshold
    T2 = numpy.mean(
        E[-L1:-1]) + 0.000000000000001  # compute "higher" 10% energy threshold
    Class1 = ShortTermFeatures[:, numpy.where(
        EnergySt <= T1)[0]]  # get all features that correspond to low energy
    Class2 = ShortTermFeatures[:, numpy.where(
        EnergySt >= T2)[0]]  # get all features that correspond to high energy
    featuresSS = [Class1.T,
                  Class2.T]  # form the binary classification task and ...

    [featuresNormSS, MEANSS,
     STDSS] = aT.normalizeFeatures(featuresSS)  # normalize and ...
    SVM = aT.trainSVM(
        featuresNormSS,
        1.0)  # train the respective SVM probabilistic model (ONSET vs SILENCE)

    # Step 3: compute onset probability based on the trained SVM
    ProbOnset = []
    for i in range(ShortTermFeatures.shape[1]):  # for each frame
        curFV = (ShortTermFeatures[:, i] -
                 MEANSS) / STDSS  # normalize feature vector
        ProbOnset.append(
            SVM.predict_proba(curFV.reshape(1, -1))[0]
            [1])  # get SVM probability (that it belongs to the ONSET class)
    ProbOnset = numpy.array(ProbOnset)
    ProbOnset = smoothMovingAvg(ProbOnset,
                                smoothWindow / stStep)  # smooth probability

    # Step 4A: detect onset frame indices:
    ProbOnsetSorted = numpy.sort(
        ProbOnset
    )  # find probability Threshold as a weighted average of top 10% and lower 10% of the values
    Nt = int(ProbOnsetSorted.shape[0] / 10)
    T = (numpy.mean((1 - Weight) * ProbOnsetSorted[0:Nt]) +
         Weight * numpy.mean(ProbOnsetSorted[-Nt::]))

    MaxIdx = numpy.where(ProbOnset > T)[
        0]  # get the indices of the frames that satisfy the thresholding
    i = 0
    timeClusters = []
    segmentLimits = []

    # Step 4B: group frame indices to onset segments
    while i < len(MaxIdx):  # for each of the detected onset indices
        curCluster = [MaxIdx[i]]
        if i == len(MaxIdx) - 1:
            break
        while MaxIdx[i + 1] - curCluster[-1] <= 2:
            curCluster.append(MaxIdx[i + 1])
            i += 1
            if i == len(MaxIdx) - 1:
                break
        i += 1
        timeClusters.append(curCluster)
        segmentLimits.append([curCluster[0] * stStep, curCluster[-1] * stStep])

    # Step 5: Post process: remove very small segments:
    minDuration = 0.2
    segmentLimits2 = []
    for s in segmentLimits:
        if s[1] - s[0] > minDuration:
            segmentLimits2.append(s)
    segmentLimits = segmentLimits2

    if plot:
        timeX = numpy.arange(0, x.shape[0] / float(Fs), 1.0 / Fs)

        plt.subplot(2, 1, 1)
        plt.plot(timeX, x)
        for s in segmentLimits:
            plt.axvline(x=s[0])
            plt.axvline(x=s[1])
        plt.subplot(2, 1, 2)
        plt.plot(numpy.arange(0, ProbOnset.shape[0] * stStep, stStep),
                 ProbOnset)
        plt.title('Signal')
        for s in segmentLimits:
            plt.axvline(x=s[0])
            plt.axvline(x=s[1])
        plt.title('SVM Probability')
        plt.show()

    return segmentLimits
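Hypothetical usage, with imports as in the surrounding examples and a placeholder path: detect the non-silent intervals of a recording and print them:

Fs, x = audioBasicIO.readAudioFile('recording.wav')
segments = silenceRemoval(x, Fs, 0.020, 0.020, smoothWindow=1.0, Weight=0.3)
for s0, s1 in segments:
    print('%.2f -> %.2f s' % (s0, s1))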
def trainMetaClassifier(dirName,
                        outputmodelName,
                        modelName,
                        method="svm",
                        postProcess=0,
                        PLOT=False):
    types = ('*.wav', )
    wavFilesList = []
    for files in types:
        wavFilesList.extend(glob.glob(os.path.join(dirName, files)))
    wavFilesList = sorted(wavFilesList)
    flagsAll = np.array([])

    for ifile, wavFile in enumerate(
            wavFilesList):  # for each wav file in folder
        print "{0:s}, {1:d} file of {2:d}".format(wavFile, ifile + 1,
                                                  len(wavFilesList))
        matFile = wavFile.replace(".wav",
                                  "_true.mat")  # load current ground truth
        if os.path.isfile(matFile):
            matfile = loadmat(matFile)
            segs_gt = matfile["segs_r"]
            classes_gt1 = matfile["classes_r"]
            classes_gt = []
            for c in classes_gt1[0]:
                if c == "M":
                    classes_gt.append("music")
                if c == "S" or c == "E":
                    classes_gt.append("speech")
            flagsIndGT, classesAllGT = audioSegmentation.segs2flags(
                [s[0] for s in segs_gt], [s[1] for s in segs_gt], classes_gt,
                1.0)
        #if method == "svm":
        # speech-music segmentation:
        #    [flagsInd, classesAll, acc] = audioSegmentation.mtFileClassification(fileName, modelName, "svm", False, '')
        if method == "cnn":  # apply the CNN on the current WAV
            WIDTH_SEC = 2.4
            [Fs, x] = io.readAudioFile(wavFile)  # read the WAV
            x = io.stereo2mono(x)
            [flagsInd, classesAll, P] = mtCNN_classification(
                x, Fs, WIDTH_SEC, 1.0, RGB_singleFrame_net, SOUND_mean_RGB,
                transformer_RGB,
                classNamesCNN)  #  apply the CNN mid-term classifier
            # append the current ground-truth labels AND estimated probabilities
            # (either from the CNN or the SVM) to the global arrays:
            print len(flagsIndGT), P.shape

            lenF = P.shape[0]
            lenL = len(flagsIndGT)
            MIN = min(lenF, lenL)
            P = P[0:MIN, :]
            flagsIndGT = flagsIndGT[0:MIN]

            flagsNew = []
            for j, fl in enumerate(flagsIndGT):  # append features and labels
                flagsNew.append(classesAll.index(classesAllGT[flagsIndGT[j]]))

            flagsAll = np.append(flagsAll, np.array(flagsNew))

            if ifile == 0:
                Fall = P
            else:
                Fall = np.concatenate((Fall, P), axis=0)

            print Fall.shape
            print flagsAll.shape

    startprob, transmat, means, cov = audioSegmentation.trainHMM_computeStatistics(
        Fall.T, flagsAll)  # compute HMM statistics
    hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], "diag")  # train HMM
    hmm.startprob_ = startprob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov

    fo = open(outputmodelName, "wb")  # save HMM model
    cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(classesAll, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()

    return hmm, classesAll
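Reading the pickled model back mirrors the dump order above (a sketch; the file name is a placeholder, and cPickle matches the Python 2 style of this example):

import cPickle
with open('hmmMetaModel', 'rb') as fi:
    hmm = cPickle.load(fi)         # first object dumped: the HMM
    classesAll = cPickle.load(fi)  # second object dumped: the class names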