Example #1
def getStVectorPerWav(wavFile, stWin,
                      stStep):  # given a wav file, extract all short-term features
    [Fs, x] = getTotalAudio([wavFile])
    ShortTermFeatures = aF.stFeatureExtraction(x, Fs, stWin * Fs, stStep * Fs)
    [featuresNormSS, MEANSS, STDSS
     ] = aT.normalizeFeatures([ShortTermFeatures])  # normalize to 0-mean 1-std
    [X, y] = featureListToVectors([featuresNormSS])
    return X, y, Fs
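
Every example on this page funnels feature matrices through aT.normalizeFeatures, which takes a list of per-class matrices (one row per sample) and z-score normalizes them with statistics pooled over all classes. As rough orientation only, a minimal numpy sketch of that computation (a hypothetical re-implementation, not the library code):

import numpy as np

def normalize_features_sketch(features):
    # features: list of (n_samples_i x n_dims) matrices, one per class
    X = np.vstack(features)          # pool all samples from all classes
    mean = X.mean(axis=0)            # per-dimension mean
    std = X.std(axis=0) + 1e-14      # per-dimension std (guard against /0)
    return [(f - mean) / std for f in features], mean, std

The returned MEAN/STD pairs are what the examples below pickle next to each model, so the exact same scaling can be re-applied at prediction time.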
Example #2
def ExtractFeatures(newPath):
    [fs, x] = audioBasicIO.readAudioFile(newPath)
    mt_size, mt_step, st_win = 1, 1, 0.5
    [mt_feats, st_feats, _] = mT(x, fs, mt_size * fs, mt_step * fs,
                                 round(fs * st_win), round(fs * st_win * 0.5))
    (mt_feats_norm, MEAN, STD) = normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0]
    mt_feats_normal = mt_feats_norm[:55]
    return mt_feats_normal
Example #3
def evaluateClassifier(argv):
    dirName = argv[2]    
    useAccelerometer = ((argv[3]=="1") or (argv[3]=="2") or (argv[3]=="3")  or (argv[3]=="4"))
    useAccelerometerOnlyX = (argv[3]=="1")
    useAccelerometerOnlyY = (argv[3]=="2")
    useAccelerometerOnlyZ = (argv[3]=="3")

    useImage = (argv[4]=="1")    
    fileList  = sorted(glob.glob(os.path.join(dirName, "*.csv")))    
    GTs = []
    eX = []        
    eY = []    
    eZ = [] 

    featuresAll = []
    classNames = []
    

    for i, m in enumerate(fileList):                
        gt = int(ntpath.basename(m).split("_")[-1].replace(".csv",""))

        className = ntpath.basename(m).split("_")[1]        
        if className not in classNames:
            classNames.append(className)
            featuresAll.append([])         
        #if gt>0:
        if True:
            GTs.append(gt)
            FeatureVectorFusion = featureExtraction(m, useAccelerometer, useAccelerometerOnlyX, useAccelerometerOnlyY, useAccelerometerOnlyZ, useImage)
            print(FeatureVectorFusion.shape)
            if len(featuresAll[classNames.index(className)])==0:
                featuresAll[classNames.index(className)] = FeatureVectorFusion
            else:
                featuresAll[classNames.index(className)] = numpy.vstack((featuresAll[classNames.index(className)], FeatureVectorFusion))

    #featuresAll = featuresY
    (featuresAll, MEAN, STD) = aT.normalizeFeatures(featuresAll)
    #bestParam = aT.evaluateClassifier(featuresAll, classNames, 1000, "svm", [0.05, 0.1, 0.5, 1, 2,3, 5, 10, 15, 20, 25, 50, 100, 200], 0, perTrain=0.80)
    bestParam = aT.evaluateClassifier(featuresAll, classNames, 1000, "svm", [0.05, 0.1, 0.5], 0, perTrain=0.80)    

    MEAN = MEAN.tolist()
    STD = STD.tolist()    

    # STEP C: Save the classifier to file    
    Classifier = aT.trainSVM(featuresAll, bestParam)
    modelName = argv[5]
    with open(modelName, 'wb') as fid:                                            # save to file
        cPickle.dump(Classifier, fid)            
    fo = open(modelName + "MEANS", "wb")
    cPickle.dump(MEAN, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(STD, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()
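
For reference, the loop above assumes CSV names of the form <subject>_<class>_<groundtruth>.csv: the ground-truth value is the last underscore-separated token and the class name is the second. A quick illustration with a hypothetical file name:

import ntpath
m = "subject01_walking_3.csv"                                      # hypothetical
gt = int(ntpath.basename(m).split("_")[-1].replace(".csv", ""))    # -> 3
className = ntpath.basename(m).split("_")[1]                       # -> "walking"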



Example #4
def ExtractFeatures(newPath):
    [fs, x] = audioBasicIO.readAudioFile(newPath)
    mt_size, mt_step, st_win = 1, 1, 0.5
    [mt_feats, st_feats, _] = mT(x, fs, mt_size * fs, mt_step * fs,
                                 round(fs * st_win), round(fs * st_win * 0.5))
    (mt_feats_norm, MEAN, STD) = normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    #F, name = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs, 0.025*Fs)
    #print np.shape(F)
    return mt_feats_norm
Example #5
def train(files):
    #extract feature
    features, classes, filenames = aF.dirsWavFeatureExtraction(
        files, 1.0, 1.0, aT.shortTermWindow, aT.shortTermStep)
    #normalize
    [featuresNorm, MEAN, STD] = aT.normalizeFeatures(features)
    [X, Y] = aT.listOfFeatures2Matrix(featuresNorm)
    #train using SVM
    clf = sklearn.svm.SVC(kernel='linear', probability=True)
    clf.fit(X, Y)
    return clf, MEAN, STD
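
A sketch of how the returned triple might be used at prediction time: MEAN and STD must be re-applied to any new feature vector before calling the SVM. Here feat stands for a hypothetical mid-term feature vector extracted the same way as during training:

feat_norm = (feat - MEAN) / STD                 # reuse the training statistics
probs = clf.predict_proba(feat_norm.reshape(1, -1))[0]
winner = clf.classes_[probs.argmax()]           # most probable class label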
Example #6
def trainNN(listOfDirs, mtWin, mtStep, stWin, stStep, computeBEAT=False):
    #Feature Extraction
    [features, classNames,
     _] = aF.dirsWavFeatureExtraction(listOfDirs,
                                      mtWin,
                                      mtStep,
                                      stWin,
                                      stStep,
                                      computeBEAT=computeBEAT)

    if len(features) == 0:
        print("feature ERROR")
        return

    numOfFeatures = features[0].shape[1]
    featureNames = ["features" + str(d + 1) for d in range(numOfFeatures)]
    aT.writeTrainDataToARFF(modelName, features, classNames, featureNames)
    for i, f in enumerate(features):
        if len(f) == 0:
            print("feature ERROR")
            return

    C = len(classNames)
    [featuresNorm, MEAN,
     STD] = aT.normalizeFeatures(features)  # normalize features
    MEAN = MEAN.tolist()
    STD = STD.tolist()
    featuresNew = featuresNorm

    bestParam = evaluate(featuresNew,
                         classNames,
                         100,
                         numpy.array([1, 2, 3, 4, 5, 6]),
                         0,
                         perTrain=0.80)
    clf = train(featuresNew, bestParam)

    with open(modelName, 'wb') as fid:
        cPickle.dump(clf, fid)
    fo = open(modelName + "MEANS", "wb")
    cPickle.dump(MEAN, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(STD, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtWin, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtStep, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(stWin, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(stStep, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(computeBEAT, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()
Example #7
def selfSimilarityMatrix(featureVectors):
    '''
    This function computes the self-similarity matrix for a sequence of feature vectors.
    ARGUMENTS:
     - featureVectors:     a numpy matrix (nDims x nVectors) whose i-th column corresponds to the i-th feature vector

    RETURNS:
     - S:             the self-similarity matrix (nVectors x nVectors)
    '''

    [nDims, nVectors] = featureVectors.shape
    [featureVectors2, MEAN, STD] = aT.normalizeFeatures([featureVectors.T])
    featureVectors2 = featureVectors2[0].T
    S = 1.0 - distance.squareform(distance.pdist(featureVectors2.T, 'cosine'))
    return S
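
A quick sanity check of the function above on synthetic data (assumes numpy plus the same aT and scipy.spatial.distance imports as the snippet):

import numpy as np
feats = np.random.rand(20, 100)       # 20 feature dimensions, 100 vectors
S = selfSimilarityMatrix(feats)
print(S.shape)                        # (100, 100)
print(np.allclose(np.diag(S), 1.0))   # each vector is maximally similar to itself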
Example #8
def train_SVM(st_feats):
    st_energy = st_feats[1, :]
    en = np.sort(st_energy)
    l1 = int(len(en) / 10)
    t1 = np.mean(en[0:l1]) + 0.000000000000001  # mean of the lowest 10% energies, used as the low threshold
    t2 = np.mean(en[-l1:-1]) + 0.000000000000001  # mean of the highest 10% energies, used as the high threshold
    class1 = st_feats[:, np.where(st_energy <= t1)[0]]  # frames with energy below the low threshold form class1
    class2 = st_feats[:, np.where(st_energy >= t2)[0]]  # frames with energy above the high threshold form class2
    feats_s = [class1.T, class2.T]  # class1.T:(58,68)|class2.T:(38,68)

    [feats_s_norm, means_s,
     stds_s] = aT.normalizeFeatures(feats_s)  # standardize: subtract mean, divide by std
    svm = aT.trainSVM(feats_s_norm, 1.0)

    return svm, means_s, stds_s
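
The point of this unsupervised trick is that the two energy extremes give free "labels" for an SVM that can then score every frame. A usage sketch, mirroring the thresholding step in the silenceCounter example further down (st_feats as extracted by aF.stFeatureExtraction):

svm, means_s, stds_s = train_SVM(st_feats)
prob_high = []
for i in range(st_feats.shape[1]):                       # for each frame
    cur_fv = (st_feats[:, i] - means_s) / stds_s         # normalize the frame
    prob_high.append(svm.predict_proba(cur_fv.reshape(1, -1))[0][1])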
Example #9
    def trainDirs(self, dir_root):
        """
        Train all wav files within the list of directories within dir
        The class name is derived as last entry after splitting
        /path/to/dir
        """
        dir_list = glob.glob(dir_root+'/*')
        features=[] #is a list of feature matrices, one for each class
        self.classNames=[]
        for d in dir_list:
            log.logv('featurize %s\n' % (d))
            self.classNames.append(d.split('/')[-1])
            first = True
            class_features = np.array([])
            for w in os.listdir(d) :
                if w.endswith('.wav') :
                    _f = self.featurize(os.path.join(d, w)) # returns a matrix of numBlocks x numFeatures
                    if first :
                        first = False
                        class_features = _f
                    else:
                        class_features = np.vstack((class_features, _f))
                    
            if class_features.shape[0] > 0 :
                #class features is a matrix M*Features
                features.append(class_features)

        classifierParams = np.array([0.001, 0.01, 0.5, 1.0, 5.0, 10.0])

        # parameter mode 0 for best accuracy, 1 for best f1 score
        [featuresNew, self.MEAN, self.STD] = aT.normalizeFeatures(features) # normalize features

        bestParam = aT.evaluateClassifier(features, self.classNames, 100, "svm",
                                          classifierParams, 0, perTrain=0.90)

        print("Selected params: {0:.5f}".format(bestParam))
        # TODO
        # 1. normalize before evaluating?
        # 2. try gaussian kernel?
        self.Classifier = aT.trainSVM(featuresNew, bestParam)
Example #10
def getTotalEnergyVector(
    folder_to_wavs
):  # given a single list of wav paths, return their aggregate 10% vector
    [Fs, x] = getTotalAudio(folder_to_wavs)
    ShortTermFeatures = aF.stFeatureExtraction(x, Fs, stWin * Fs, stStep * Fs)
    EnergySt = ShortTermFeatures[1, :]
    E = np.sort(EnergySt)
    L1 = int(len(E) / 10)
    T1 = np.mean(E[0:L1]) + 0.000000000000001
    T2 = np.mean(
        E[-L1:-1]) + 0.000000000000001  # compute "higher" 10% energy threshold
    Class1 = ShortTermFeatures[:, np.where(
        EnergySt <= T1)[0]]  # get all features that correspond to low energy
    # Class1 = ShortTermFeatures[1,:][np.where(EnergySt <= T1)[0]]         # purely energy
    Class2 = ShortTermFeatures[:, np.where(
        EnergySt >= T2)[0]]  # get all features that correspond to high energy
    # Class2 = ShortTermFeatures[1,:][np.where(EnergySt >= T2)[0]]         # purely energy
    featuresSS = [Class1.T, Class2.T]  # form the binary classification task
    [featuresNormSS, MEANSS,
     STDSS] = aT.normalizeFeatures(featuresSS)  # normalize to 0-mean 1-std
    [X, y] = featureListToVectors(featuresNormSS)
    return X, y, Fs
Example #11
    [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
    [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
    MidTermFeatures2[0:mt_feats.shape[0], i] = mt_feats[:, i]
    MidTermFeatures2[mt_feats.shape[0]:mt_feats.shape[0] + len(classNames1),
                     i] = P1 + 0.0001
    MidTermFeatures2[mt_feats.shape[0] + len(classNames1)::, i] = P2 + 0.0001

mt_feats = MidTermFeatures2  # TODO
iFeaturesSelect = [
    8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45, 46,
    47, 48, 49, 50, 51, 52, 53
]

mt_feats = mt_feats[iFeaturesSelect, :]

(mt_feats_norm, MEAN, STD) = aT.normalizeFeatures([mt_feats.T])
mt_feats_norm = mt_feats_norm[0].T
n_wins = mt_feats.shape[1]

# remove outliers:
dist_all = np.sum(distance.squareform(distance.pdist(mt_feats_norm.T)), axis=0)
m_dist_all = np.mean(dist_all)
i_non_outliers = np.nonzero(dist_all < 1.2 * m_dist_all)[0]

# TODO: Combine energy threshold for outlier removal:
#EnergyMin = np.min(mt_feats[1,:])
#EnergyMean = np.mean(mt_feats[1,:])
#Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
#i_non_outliers = np.nonzero(mt_feats[1,:] > Thres)[0]
#print i_non_outliers
Example #12
 for a in classNames:
     temp = numpy.load(
         os.path.dirname(os.path.realpath(sys.argv[0])) +
         '/classifier_data/' + a + '.npy')
     features.append(temp)
 classifierParams = numpy.array([0.001, 0.01, 0.5, 1.0, 5.0])
 nExp = 50
 bestParam = audioTrainTest.evaluateClassifier(features,
                                               classNames,
                                               nExp,
                                               "svm",
                                               classifierParams,
                                               0,
                                               perTrain=0.01)
 [featuresNorm, MEAN,
  STD] = audioTrainTest.normalizeFeatures(features)  # normalize features
 MEAN = MEAN.tolist()
 STD = STD.tolist()
 featuresNew = featuresNorm
 Classifier = audioTrainTest.trainSVM(featuresNew, bestParam)
 Classifier.save_model(
     os.path.dirname(os.path.realpath(sys.argv[0])) + '/classifier_data/' +
     modelName)
 fo = open(
     os.path.dirname(os.path.realpath(sys.argv[0])) + '/classifier_data/' +
     modelName + "MEANS", "wb")
 cPickle.dump(MEAN, fo, protocol=cPickle.HIGHEST_PROTOCOL)
 cPickle.dump(STD, fo, protocol=cPickle.HIGHEST_PROTOCOL)
 cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)
 cPickle.dump(0, fo, protocol=cPickle.HIGHEST_PROTOCOL)
 cPickle.dump(0, fo, protocol=cPickle.HIGHEST_PROTOCOL)
Example #13
def main(rootName, modelType, classifierParam, signal_type):
    CMall = numpy.zeros((2, 2))
    if modelType != "svm" and modelType != "svm_rbf":
        C = [int(classifierParam)]
    else:
        C = [(classifierParam)]
    F1s = []
    Accs = []
    for ifold in range(0, 10):  # for each fold
        dirName = rootName + os.sep + "fold_{0:d}".format(
            ifold)  # get fold path name
        classNamesTrain, featuresTrain = dirFeatureExtraction([
            os.path.join(dirName, "train", "fail"),
            os.path.join(dirName, "train", "success")
        ], signal_type)  # TRAINING data feature extraction
        bestParam = aT.evaluateClassifier(
            featuresTrain, classNamesTrain, 2, modelType, C, 0,
            0.90)  # internal cross-validation (for param selection)
        classNamesTest, featuresTest = dirFeatureExtraction([
            os.path.join(dirName, "test", "fail"),
            os.path.join(dirName, "test", "success")
        ], signal_type)  # TESTING data feature extraction
        [featuresTrainNew, MEAN, STD] = aT.normalizeFeatures(
            featuresTrain)  # training features NORMALIZATION
        if modelType == "svm":  # classifier training
            Classifier = aT.trainSVM(featuresTrainNew, bestParam)
        elif modelType == "svm_rbf":
            Classifier = aT.trainSVM_RBF(featuresTrainNew, bestParam)
        elif modelType == "randomforest":
            Classifier = aT.trainRandomForest(featuresTrainNew, bestParam)
        elif modelType == "gradientboosting":
            Classifier = aT.trainGradientBoosting(featuresTrainNew, bestParam)
        elif modelType == "extratrees":
            Classifier = aT.trainExtraTrees(featuresTrainNew, bestParam)

        CM = numpy.zeros((2, 2))  # evaluation on testing data
        for iC, f in enumerate(featuresTest):  # for each class
            for i in range(
                    f.shape[0]):  # for each testing sample (feature vector)
                curF = f[i, :]  # get feature vector
                curF = (curF - MEAN) / STD  # normalize test feature vector
                winnerClass = classNamesTrain[int(
                    aT.classifierWrapper(
                        Classifier, modelType,
                        curF)[0])]  # classify and get winner class
                trueClass = classNamesTest[iC]  # get groundtruth class
                CM[classNamesTrain.index(trueClass)][classNamesTrain.index(
                    winnerClass)] += 1  # update confusion matrix
        CMall += CM  # update overall confusion matrix
        Recall, Precision, F1 = computePreRec(
            CM, classNamesTrain)  # get recall, precision and F1 (per class)
        Acc = numpy.diagonal(CM).sum() / CM.sum()  # get overall accuracy
        F1s.append(numpy.mean(F1))  # append average F1
        Accs.append(Acc)  # append classification accuracy
    print()
    print("FINAL RESULTS")
    print()
    print("----------------------------------")
    print("fold\tacc\tf1")
    print("----------------------------------")
    for i in range(len(F1s)):
        print("{0:d}\t{1:.1f}\t{2:.1f}".format(i, 100 * Accs[i], 100 * F1s[i]))
    Acc = numpy.diagonal(CMall).sum() / CMall.sum()
    Recall, Precision, F1 = computePreRec(CMall, classNamesTrain)
    print("----------------------------------")
    print("{0:s}\t{1:.1f}\t{2:.1f}".format("Avg", 100 * numpy.mean(Accs),
                                           100 * numpy.mean(F1s)))
    print("{0:s}\t{1:.1f}\t{2:.1f}".format("Av CM", 100 * Acc,
                                           100 * numpy.mean(F1)))
    print("----------------------------------")
    print()
    print("Overall Confusion matrix:")
    aT.printConfusionMatrix(CMall, classNamesTrain)
    print()
    print("FAIL Recall = {0:.1f}".format(100 *
                                         Recall[classNamesTrain.index("fail")]))
    print("FAIL Precision = {0:.1f}".format(
        100 * Precision[classNamesTrain.index("fail")]))
    print("SUCCESS Recall = {0:.1f}".format(
        100 * Recall[classNamesTrain.index("success")]))
    print("SUCCESS Precision = {0:.1f}".format(
        100 * Precision[classNamesTrain.index("success")]))

    return CMall, Acc, Recall, Precision, F1
Example #14
def speakerDiarization(fileName, numOfSpeakers, mtSize=2.0, mtStep=0.2, stWin=0.05, LDAdim=0, PLOT=False):
    '''
    ARGUMENTS:
        - fileName:        the name of the WAV file to be analyzed
        - numOfSpeakers:   the number of speakers (clusters) in the recording (<=0 for unknown)
        - mtSize (opt)     mid-term window size
        - mtStep (opt)     mid-term window step
        - stWin  (opt)     short-term window size
        - LDAdim (opt)     LDA dimension (0 for no LDA)
        - PLOT   (opt)     0 for not plotting the results, 1 for plotting
    '''
    [Fs, x] = pyAudioAnalysis.audioBasicIO.readAudioFile(fileName)
    x = pyAudioAnalysis.audioBasicIO.stereo2mono(x)
    Duration = len(x) / Fs

    #[Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.loadKNNModel(os.path.join("data","knnSpeakerAll"))
    #[Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.loadKNNModel(os.path.join("data","knnSpeakerFemaleMale"))
    [Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.loadKNNModel("pyAudioAnalysis/data/knnSpeakerAll")
    [Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.loadKNNModel("pyAudioAnalysis/data/knnSpeakerFemaleMale")

    [MidTermFeatures, ShortTermFeatures] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin), round(Fs*stWin * 0.5))

    MidTermFeatures2 = numpy.zeros((MidTermFeatures.shape[0] + len(classNames1) + len(classNames2), MidTermFeatures.shape[1]))

    for i in range(MidTermFeatures.shape[1]):
        curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
        curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2
        [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
        [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
        MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
        MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0]+len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1)::, i] = P2 + 0.0001

    MidTermFeatures = MidTermFeatures2    # TODO
    # SELECT FEATURES:
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20];                                                                                         # SET 0A
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100];                                                                                 # SET 0B
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,
    #   97,98, 99,100];     # SET 0C

    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]                           # SET 1A
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];                                          # SET 1B
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 1C

    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53];             # SET 2A
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];     # SET 2B
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 2C

    #iFeaturesSelect = range(100);                                                                                                    # SET 3
    #MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010

    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]

    (MidTermFeaturesNorm, MEAN, STD) = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]

    # remove outliers:
    DistancesAll = numpy.sum(distance.squareform(distance.pdist(MidTermFeaturesNorm.T)), axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = numpy.min(MidTermFeatures[1,:])
    #EnergyMean = numpy.mean(MidTermFeatures[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #iNonOutLiers = numpy.nonzero(MidTermFeatures[1,:] > Thres)[0]
    #print iNonOutLiers

    perOutLier = (100.0 * (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows
    MidTermFeaturesNormOr = MidTermFeaturesNorm
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

    # LDA dimensionality reduction:
    if LDAdim > 0:
        #[mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, stWin * Fs, round(Fs*stWin), round(Fs*stWin));
        # extract mid-term features with minimum step:
        mtWinRatio = int(round(mtSize / stWin))
        mtStepRatio = int(round(stWin / stWin))
        mtFeaturesToReduce = []
        numOfFeatures = len(ShortTermFeatures)
        numOfStatistics = 2
        #for i in range(numOfStatistics * numOfFeatures + 1):
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append([])

        for i in range(numOfFeatures):        # for each of the short-term features:
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mtWinRatio
                if N2 > N:
                    N2 = N
                curStFeatures = ShortTermFeatures[i][N1:N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i+numOfFeatures].append(numpy.std(curStFeatures))
                curPos += mtStepRatio
        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
        mtFeaturesToReduce2 = numpy.zeros((mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2), mtFeaturesToReduce.shape[1]))
        for i in range(mtFeaturesToReduce.shape[1]):
            curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
            curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
            [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
            [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
            mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i]
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0] + len(classNames1), i] = P1 + 0.0001
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]+len(classNames1)::, i] = P2 + 0.0001
        mtFeaturesToReduce = mtFeaturesToReduce2
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
        #mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
        (mtFeaturesToReduce, MEAN, STD) = aT.normalizeFeatures([mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T
        #DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
        #MDistancesAll = numpy.mean(DistancesAll)
        #iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0*MDistancesAll)[0]
        #mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]
        Labels = numpy.zeros((mtFeaturesToReduce.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin
        #print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i*stWin/LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels)
        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

    if numOfSpeakers <= 0:
        sRange = range(2, 10)
    else:
        sRange = [numOfSpeakers]
    clsAll = []
    silAll = []
    centersAll = []
    
    for iSpeakers in sRange:        
        k_means = sklearn.cluster.KMeans(n_clusters = iSpeakers)
        k_means.fit(MidTermFeaturesNorm.T)
        cls = k_means.labels_        
        means = k_means.cluster_centers_

        # Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
        clsAll.append(cls)
        centersAll.append(means)
        silA = []; silB = []
        for c in range(iSpeakers):                                # for each speaker (i.e. for each extracted cluster)
            clusterPerCent = numpy.nonzero(cls==c)[0].shape[0] / float(len(cls))
            if clusterPerCent < 0.020:
                silA.append(0.0)
                silB.append(0.0)
            else:
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:,cls==c]            # get subset of feature vectors
                Yt = distance.pdist(MidTermFeaturesNormTemp.T)                # compute average distance between samples that belong to the cluster (a values)
                silA.append(numpy.mean(Yt)*clusterPerCent)
                silBs = []
                for c2 in range(iSpeakers):                        # compute distances from samples of other clusters
                    if c2!=c:
                        clusterPerCent2 = numpy.nonzero(cls==c2)[0].shape[0] / float(len(cls))
                        MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:,cls==c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T)
                        silBs.append(numpy.mean(Yt)*(clusterPerCent+clusterPerCent2)/2.0)
                silBs = numpy.array(silBs)                            
                silB.append(min(silBs))                            # ... and keep the minimum value (i.e. the distance from the "nearest" cluster)
        silA = numpy.array(silA)
        silB = numpy.array(silB)
        sil = []
        for c in range(iSpeakers):                                # for each cluster (speaker)
            sil.append( ( silB[c] - silA[c]) / (max(silB[c],  silA[c])+0.00001)  )        # compute silhouette

        silAll.append(numpy.mean(sil))                                # keep the AVERAGE SILHOUETTE
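    # The loop above computes a size-weighted variant of the standard
    # silhouette, s(c) = (b_c - a_c) / max(a_c, b_c), where a_c is the
    # average intra-cluster distance and b_c the average distance to the
    # nearest other cluster; values near 1 indicate compact, well-separated
    # speaker clusters, so the best cluster count maximizes the mean s(c).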

    #silAll = silAll * (1.0/(numpy.power(numpy.array(sRange),0.5)))
    imax = numpy.argmax(silAll)                                    # position of the maximum silhouette value
    nSpeakersFinal = sRange[imax]                                    # optimal number of clusters

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows: this is achieved by giving them the value of their nearest non-outlier window)
    cls = numpy.zeros((numOfWindows,))
    for i in range(numOfWindows):
        j = numpy.argmin(numpy.abs(i-iNonOutLiers))        
        cls[i] = clsAll[imax][j]
        
    # Post-process method 1: hmm smoothing
    for i in range(1):
        startprob, transmat, means, cov = trainHMM_computeStatistics(MidTermFeaturesNormOr, cls)
        hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], "diag")            # hmm training        
        hmm.startprob_ = startprob
        hmm.transmat_ = transmat            
        hmm.means_ = means; hmm.covars_ = cov
        cls = hmm.predict(MidTermFeaturesNormOr.T)                    
    
    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = silAll[imax]                                        # final silhouette
    classNames = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]


    # load ground-truth if available
    gtFile = fileName.replace('.wav', '.segments')                            # annotation file
    if os.path.isfile(gtFile):                                    # if ground truth exists
        [segStart, segEnd, segLabels] = readSegmentGT(gtFile)                    # read GT data
        flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels, mtStep)            # convert to flags

    if PLOT:
        fig = plt.figure()    
        if numOfSpeakers>0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(numpy.array(range(len(classNames))))
        ax1.axis((0, Duration, -1, len(classNames)))
        ax1.set_yticklabels(classNames)
        ax1.plot(numpy.array(range(len(cls)))*mtStep+mtStep/2.0, cls)

    if os.path.isfile(gtFile):
        if PLOT:
            ax1.plot(numpy.array(range(len(flagsGT)))*mtStep+mtStep/2.0, flagsGT, 'r')
        purityClusterMean, puritySpeakerMean = evaluateSpeakerDiarization(cls, flagsGT)
        print("{0:.1f}\t{1:.1f}".format(100*purityClusterMean, 100*puritySpeakerMean))
        if PLOT:
            plt.title("Cluster purity: {0:.1f}% - Speaker purity: {1:.1f}%".format(100*purityClusterMean, 100*puritySpeakerMean) )
    if PLOT:
        plt.xlabel("time (seconds)")
        #print sRange, silAll    
        if numOfSpeakers<=0:
            plt.subplot(212)
            plt.plot(sRange, silAll)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering silhouette")
        plt.show()
    return cls
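
A minimal invocation sketch (the KNN speaker models referenced above must be present; "dialogue.wav" is a hypothetical input file):

cls = speakerDiarization("dialogue.wav", numOfSpeakers=2, PLOT=False)
# cls[i] is the speaker id assigned to the i-th mid-term window,
# i.e. the segment starting at i * mtStep seconds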
Example #15
def evaluate(features, ClassNames, nExp, Params, parameterMode, perTrain=0.80):
    (featuresNorm, MEAN, STD) = aT.normalizeFeatures(features)
    nClasses = len(features)
    CAll = []
    acAll = []
    F1All = []
    PrecisionClassesAll = []
    RecallClassesAll = []
    ClassesAll = []
    F1ClassesAll = []
    CMsAll = []

    # compute total number of samples:
    nSamplesTotal = 0
    for f in features:
        nSamplesTotal += f.shape[0]
    if nSamplesTotal > 1000 and nExp > 50:
        nExp = 50
        print("Number of training experiments changed to 50 due to high number of samples")
    if nSamplesTotal > 2000 and nExp > 10:
        nExp = 10
        print("Number of training experiments changed to 10 due to high number of samples")

    for Ci, C in enumerate(Params):  # for each param value
        CM = numpy.zeros((nClasses, nClasses))
        for e in range(nExp):  # for each cross-validation iteration:
            print("Param = {0:.5f} - Classifier Evaluation Experiment {1:d} of {2:d}".format(
                C, e + 1, nExp))
            featuresTrain, featuresTest = aT.randSplitFeatures(
                featuresNorm, perTrain)
            Classifier = train(featuresTrain, C)

            CMt = numpy.zeros((nClasses, nClasses))
            for c1 in range(nClasses):
                nTestSamples = len(featuresTest[c1])
                Results = numpy.zeros((nTestSamples, 1))
                for ss in range(nTestSamples):
                    [Results[ss], _] = classify(Classifier,
                                                featuresTest[c1][ss])
                for c2 in range(nClasses):
                    CMt[c1][c2] = float(len(numpy.nonzero(Results == c2)[0]))
            CM = CM + CMt
        CM = CM + 0.0000000010
        Rec = numpy.zeros((CM.shape[0], ))
        Pre = numpy.zeros((CM.shape[0], ))

        for ci in range(CM.shape[0]):
            Rec[ci] = CM[ci, ci] / numpy.sum(CM[ci, :])
            Pre[ci] = CM[ci, ci] / numpy.sum(CM[:, ci])
        PrecisionClassesAll.append(Pre)
        RecallClassesAll.append(Rec)
        F1 = 2 * Rec * Pre / (Rec + Pre)
        F1ClassesAll.append(F1)
        acAll.append(numpy.sum(numpy.diagonal(CM)) / numpy.sum(CM))

        CMsAll.append(CM)
        F1All.append(numpy.mean(F1))

    print("\t\t", end="")
    for i, c in enumerate(ClassNames):
        if i == len(ClassNames) - 1:
            print("{0:s}\t\t".format(c), end="")
        else:
            print("{0:s}\t\t\t".format(c), end="")
    print("OVERALL")
    print("\tC", end="")
    for c in ClassNames:
        print("\tPRE\tREC\tF1", end="")
    print("\t{0:s}\t{1:s}".format("ACC", "F1"))
    bestAcInd = numpy.argmax(acAll)
    bestF1Ind = numpy.argmax(F1All)
    for i in range(len(PrecisionClassesAll)):
        print("\t{0:.3f}".format(Params[i]), end="")
        for c in range(len(PrecisionClassesAll[i])):
            print("\t{0:.1f}\t{1:.1f}\t{2:.1f}".format(
                100.0 * PrecisionClassesAll[i][c],
                100.0 * RecallClassesAll[i][c], 100.0 * F1ClassesAll[i][c]),
                  end="")
        print("\t{0:.1f}\t{1:.1f}".format(100.0 * acAll[i], 100.0 * F1All[i]),
              end="")
        if i == bestF1Ind:
            print("\t best F1", end="")
        if i == bestAcInd:
            print("\t best Acc", end="")
        print()
    return Params[bestF1Ind]
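
The per-class metrics above come straight from the aggregated confusion matrix (rows = true class, columns = predicted class). A worked numeric sketch of the same formulas:

import numpy as np
CM = np.array([[40., 10.],
               [ 5., 45.]])
rec = np.diag(CM) / CM.sum(axis=1)     # recall:    [0.800, 0.900]
pre = np.diag(CM) / CM.sum(axis=0)     # precision: [0.889, 0.818]
f1 = 2 * rec * pre / (rec + pre)       # per-class F1
acc = np.diag(CM).sum() / CM.sum()     # overall accuracy: 0.85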
Example #16
def featureAndTrainRegression(dir_name, mt_win, mt_step, st_win, st_step,
                              model_type, model_name, compute_beat=False,
                              feats=["gfcc", "mfcc"]):
    '''
    This function is used as a wrapper to segment-based audio feature extraction and regression model training.
    ARGUMENTS:
        dir_name:        path of directory containing the WAV files and regression CSVs
        mt_win, mt_step:        mid-term window length and step
        st_win, st_step:        short-term window and step
        model_type:        "svm", "svm_rbf" or "randomforest"
        model_name:        name of the model to be saved
    RETURNS:
        None. The resulting regression models, along with the respective model parameters, are saved to files.
    '''
    # STEP A: Feature Extraction:
    [features, _, filenames] = aF.dirsWavFeatureExtraction([dir_name],
                                                           mt_win,
                                                           mt_step,
                                                           st_win,
                                                           st_step,
                                                           compute_beat=compute_beat,
                                                           feats=feats)
    features = features[0]
    filenames = [ntpath.basename(f) for f in filenames[0]]
    f_final = []

    # Read CSVs:
    CSVs = glob.glob(dir_name + os.sep + "*.csv")
    regression_labels = []
    regression_names = []
    f_final = []
    for c in CSVs:                                                            # for each CSV
        cur_regression_labels = []
        f_temp = []
        with open(c, 'rt') as csvfile:                                        # open the csv file that contains the current target value's annotations
            CSVreader = csv.reader(csvfile, delimiter=',', quotechar='|')
            for row in CSVreader:
                if len(row) == 2:                                             # if the current row contains two fields (filename, target value)
                    if row[0] in filenames:                                   # ... and if the current filename exists in the list of filenames
                        index = filenames.index(row[0])
                        cur_regression_labels.append(float(row[1]))
                        f_temp.append(features[index,:])
                    else:
                        print("Warning: {} not found in list of files.".format(row[0]))
                else:
                    print("Warning: Row with unknown format in regression file")

        f_final.append(numpy.array(f_temp))
        regression_labels.append(numpy.array(cur_regression_labels))                          # cur_regression_labels is the list of values for the current regression problem
        regression_names.append(ntpath.basename(c).replace(".csv", ""))        # regression task name
        if len(features) == 0:
            print("ERROR: No data found in any input folder!")
            return

    n_feats = f_final[0].shape[1]

    # TODO: ARRF WRITE????
    # STEP B: classifier Evaluation and Parameter Selection:
    if model_type == "svm" or model_type == "svm_rbf":
        model_params = numpy.array([0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 5.0, 10.0])
    elif model_type == "randomforest":
        model_params = numpy.array([5, 10, 25, 50, 100])

#    elif model_type == "knn":
#        model_params = numpy.array([1, 3, 5, 7, 9, 11, 13, 15]);
    errors = []
    errors_base = []
    best_params = []

    for iRegression, r in enumerate(regression_names):
        # get optimal classifier parameter:
        print("Regression task " + r)
        bestParam, error, berror = evaluateRegression(f_final[iRegression],
                                                      regression_labels[iRegression],
                                                      100, model_type,
                                                      model_params)
        errors.append(error)
        errors_base.append(berror)
        best_params.append(bestParam)
        print("Selected params: {0:.5f}".format(bestParam))

        [features_norm, MEAN, STD] = normalizeFeatures([f_final[iRegression]])        # normalize features

        # STEP C: Save the model to file
        if model_type == "svm":
            classifier, _ = trainSVMregression(features_norm[0],
                                               regression_labels[iRegression],
                                               bestParam)
        if model_type == "svm_rbf":
            classifier, _ = trainSVMregression_rbf(features_norm[0],
                                                   regression_labels[iRegression],
                                                   bestParam)
        if model_type == "randomforest":
            classifier, _ = trainRandomForestRegression(features_norm[0],
                                                        regression_labels[iRegression],
                                                        bestParam)

        if model_type == "svm" or model_type == "svm_rbf" or model_type == "randomforest":
            with open(model_name + "_" + r, 'wb') as fid:
                cPickle.dump(classifier, fid)
            fo = open(model_name + "_" + r + "MEANS", "wb")
            cPickle.dump(MEAN, fo, protocol=cPickle.HIGHEST_PROTOCOL)
            cPickle.dump(STD,  fo, protocol=cPickle.HIGHEST_PROTOCOL)
            cPickle.dump(mt_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
            cPickle.dump(mt_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
            cPickle.dump(st_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
            cPickle.dump(st_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
            cPickle.dump(compute_beat, fo, protocol=cPickle.HIGHEST_PROTOCOL)
            fo.close()
    return errors, errors_base, best_params
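
An invocation sketch (the directory path and CSV name here are hypothetical): each CSV in dir_name defines one regression task, with rows of the form "<wav filename>,<target value>", and one model is saved per task.

errors, errors_base, best_params = featureAndTrainRegression(
    "data/songs", 1.0, 1.0, 0.05, 0.05, "svm", "svmSongRegression")
# expects e.g. data/songs/arousal.csv with rows like "song1.wav,0.64";
# the trained model is saved as svmSongRegression_arousal (+ "MEANS" file)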
Example #17
def trainTextClassifiers(directoryPath, classifierType, classifierName):
    subdirectories = get_immediate_subdirectories(directoryPath)
    #tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features = 10000, stop_words='english')
    dicts = loadDictionaries("myDicts/")
    classNames = []
    Features = []
    # extract features from corpus
    for si, s in enumerate(
            subdirectories):  # for each directory in training data
        print("Training folder {0:d} of {1:d} ({2:s})".format(
            si + 1, len(subdirectories), s), end="")
        files = getListOfFilesInDir(directoryPath + os.sep + s,
                                    "*")  # get list of files in directory
        if MAX_FILES_PER_CLASS > 0 and MAX_FILES_PER_CLASS < len(files):
            files = random.sample(files, MAX_FILES_PER_CLASS)
        print(" - {0:d} files".format(len(files)))
        classNames.append(s)
        for ifile, fi in enumerate(files):  # for each file in current class:
            with open(fi) as f:
                content = f.read()
                curF = getFeaturesFromText(content,
                                           dicts)  # get feature vector
            if ifile == 0:  # update feature matrix
                Features.append(curF.T)
            else:
                Features[-1] = numpy.concatenate((Features[-1], curF.T),
                                                 axis=0)

    # define classifier parameters
    if classifierType == "svm":
        classifierParams = numpy.array([0.001, 0.01, 0.5, 1.0, 5.0, 10.0])
    elif classifierType == "randomforest":
        classifierParams = numpy.array([10, 25, 50, 100, 200, 500])
    elif classifierType == "knn":
        classifierParams = numpy.array([1, 3, 5, 7, 9, 11, 13, 15])
    elif classifierType == "gradientboosting":
        classifierParams = numpy.array([10, 25, 50, 100, 200, 500])
    elif classifierType == "extratrees":
        classifierParams = numpy.array([10, 25, 50, 100, 200, 500])

    # evaluate classifier and select best param
    nExp = 10
    bestParam = audioTrainTest.evaluateClassifier(Features, subdirectories,
                                                  nExp, classifierType,
                                                  classifierParams, 0, 0.9)

    # normalize features
    C = len(classNames)
    [featuresNorm, MEAN, STD] = audioTrainTest.normalizeFeatures(Features)
    MEAN = MEAN.tolist()
    STD = STD.tolist()
    featuresNew = featuresNorm

    # save the classifier to file
    if classifierType == "svm":
        Classifier = audioTrainTest.trainSVM(featuresNew, bestParam)
    elif classifierType == "randomforest":
        Classifier = audioTrainTest.trainRandomForest(featuresNew, bestParam)
    elif classifierType == "gradientboosting":
        Classifier = audioTrainTest.trainGradientBoosting(
            featuresNew, bestParam)
    elif classifierType == "extratrees":
        Classifier = audioTrainTest.trainExtraTrees(featuresNew, bestParam)

    if 'Classifier' in locals():
        with open(classifierName, 'wb') as fid:  # save to file
            cPickle.dump(Classifier, fid)
        fo = open(classifierName + "MEANS", "wb")
        cPickle.dump(MEAN, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(STD, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        fo.close()
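
An invocation sketch (hypothetical corpus layout: one subdirectory per class under corpus/, each holding plain-text files):

trainTextClassifiers("corpus", "svm", "textClassifierSVM")
# saves the classifier to "textClassifierSVM" and the normalization
# statistics plus class names to "textClassifierSVMMEANS"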
Example #18
def silenceCounter(x,
                   fs,
                   st_win,
                   st_step,
                   smoothWindow=0.5,
                   weight=0.5,
                   plot=False):
    if weight >= 1:
        weight = 0.99
    if weight <= 0:
        weight = 0.01

    # Step 1: feature extraction
    x = audioBasicIO.stereo2mono(x)
    st_feats, _ = aF.stFeatureExtraction(x, fs, st_win * fs, st_step * fs)

    # Step 2: train binary svm classifier of low vs high energy frames
    # keep only the energy short-term sequence (2nd feature)
    st_energy = st_feats[1, :]
    en = numpy.sort(st_energy)
    # number of 10% of the total short-term windows
    l1 = int(len(en) / 10)
    # compute "lower" 10% energy threshold
    t1 = numpy.mean(en[0:l1]) + 0.000000000000001
    # compute "higher" 10% energy threshold
    t2 = numpy.mean(en[-l1:-1]) + 0.000000000000001
    # get all features that correspond to low energy
    class1 = st_feats[:, numpy.where(st_energy <= t1)[0]]
    # get all features that correspond to high energy
    class2 = st_feats[:, numpy.where(st_energy >= t2)[0]]
    # form the binary classification task and ...

    # change the order of the array
    # faets_s = [class1.T, class2.T]

    # changing the order gives the segments with silence
    faets_s = [class2.T, class1.T]

    # normalize and train the respective svm probabilistic model
    # (SILENCE vs ONSET)
    [faets_s_norm, means_s, stds_s] = aT.normalizeFeatures(faets_s)
    svm = aT.trainSVM(faets_s_norm, 1.0)

    # Step 3: compute onset probability based on the trained svm
    prob_on_set = []
    for i in range(st_feats.shape[1]):
        # for each frame
        cur_fv = (st_feats[:, i] - means_s) / stds_s
        # get svm probability (that it belongs to the ONSET class)
        prob_on_set.append(svm.predict_proba(cur_fv.reshape(1, -1))[0][1])
    prob_on_set = numpy.array(prob_on_set)
    # smooth probability:
    prob_on_set = smoothMovingAvg(prob_on_set, smoothWindow / st_step)

    # Step 4A: detect onset frame indices:
    prog_on_set_sort = numpy.sort(prob_on_set)
    # find probability Threshold as a weighted average
    # of top 10% and lower 10% of the values
    Nt = int(prog_on_set_sort.shape[0] / 10)
    T = (numpy.mean((1 - weight) * prog_on_set_sort[0:Nt]) +
         weight * numpy.mean(prog_on_set_sort[-Nt::]))

    max_idx = numpy.where(prob_on_set > T)[0]
    # get the indices of the frames that satisfy the thresholding
    i = 0
    time_clusters = []
    seg_limits = []

    # Step 4B: group frame indices to onset segments
    while i < len(max_idx):
        # for each of the detected onset indices
        cur_cluster = [max_idx[i]]
        if i == len(max_idx) - 1:
            break
        while max_idx[i + 1] - cur_cluster[-1] <= 2:
            cur_cluster.append(max_idx[i + 1])
            i += 1
            if i == len(max_idx) - 1:
                break
        i += 1
        time_clusters.append(cur_cluster)
        seg_limits.append(
            [cur_cluster[0] * st_step, cur_cluster[-1] * st_step])

    # Step 5: Post process: remove very small segments:
    min_dur = 0.2
    seg_limits_2 = []
    for s in seg_limits:
        if s[1] - s[0] > min_dur:
            seg_limits_2.append(s)
    print(f"SEGMENTS 0.2: {seg_limits_2}")
    print(f"SEGMENTS: {seg_limits}")
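
Putting the steps together on a file (assumes the same pyAudioAnalysis audioBasicIO import used elsewhere on this page; "rec.wav" is a hypothetical input):

fs, x = audioBasicIO.readAudioFile("rec.wav")
silenceCounter(x, fs, 0.05, 0.05, smoothWindow=0.5, weight=0.5)
# prints the detected onset segments before and after the 0.2 s
# minimum-duration post-processing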
Example #19
def speakerDiarization(fileName,
                       sRange=range(2, 10),
                       mtSize=2.0,
                       mtStep=0.2,
                       stWin=0.05,
                       LDAdim=35):
    Fs, x = audioBasicIO.readAudioFile(fileName)
    x = audioBasicIO.stereo2mono(x)
    duration = len(x) / Fs

    Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1 = aT.loadKNNModel(
        os.path.join(
            '/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data',
            'knnSpeakerAll'))
    Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2 = aT.loadKNNModel(
        os.path.join(
            '/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data',
            'knnSpeakerFemaleMale'))

    MidTermFeatures, ShortTermFeatures = aF.mtFeatureExtraction(
        x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin),
        round(Fs * stWin * 0.5))
    MidTermFeatures2 = numpy.zeros(
        (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2),
         MidTermFeatures.shape[1]))

    for i in range(MidTermFeatures.shape[1]):
        curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
        curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2

        Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1)
        Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2)

        MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
        MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0] +
                         len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1):,
                         i] = P2 + 0.0001

    MidTermFeatures = MidTermFeatures2
    iFeaturesSelect = list(range(8, 21)) + list(range(41, 54))
    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]

    MidTermFeaturesNorm, MEAN, STD = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]

    DistancesAll = numpy.sum(distance.squareform(
        distance.pdist(MidTermFeaturesNorm.T)),
                             axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]

    perOutLier = (100.0 *
                  (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows
    MidTermFeaturesNormOr = MidTermFeaturesNorm
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

    if LDAdim > 0:
        mtWinRatio, mtStepRatio, mtFeaturesToReduce, numOfFeatures, numOfStatistics = int(
            round(mtSize / stWin)), int(round(
                stWin / stWin)), list(), len(ShortTermFeatures), 2
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append(list())

        for i in range(numOfFeatures):
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos < N):
                N1, N2 = curPos, curPos + mtWinRatio
                if N2 > N: N2 = N
                curStFeatures = ShortTermFeatures[i][N1:N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i + numOfFeatures].append(
                    numpy.std(curStFeatures))
                curPos += mtStepRatio

        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
        mtFeaturesToReduce2 = numpy.zeros(
            (mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2),
             mtFeaturesToReduce.shape[1]))
        for i in range(mtFeaturesToReduce.shape[1]):
            curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
            curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
            Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1)
            Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2)
            mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0],
                                i] = mtFeaturesToReduce[:, i]
            mtFeaturesToReduce2[
                mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0] +
                len(classNames1), i] = P1 + 0.0001
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] +
                                len(classNames1):, i] = P2 + 0.0001

        mtFeaturesToReduce = mtFeaturesToReduce2
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
        mtFeaturesToReduce, MEAN, STD = aT.normalizeFeatures(
            [mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T

        Labels = numpy.zeros((mtFeaturesToReduce.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin

        for i in range(Labels.shape[0]):
            Labels[i] = int(i * stWin / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels)

        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

    clsAll, silAll, centersAll = list(), list(), list()

    for iSpeakers in sRange:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(MidTermFeaturesNorm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        clsAll.append(cls)
        centersAll.append(means)
        silA, silB = list(), list()
        for c in range(iSpeakers):
            clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float(
                len(cls))
            if clusterPerCent < 0.02:
                silA.append(0.0)
                silB.append(0.0)
            else:
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]
                Yt = distance.pdist(MidTermFeaturesNormTemp.T)
                silA.append(numpy.mean(Yt) * clusterPerCent)
                silBs = list()
                for c2 in range(iSpeakers):
                    if c2 != c:
                        clusterPerCent2 = numpy.nonzero(
                            cls == c2)[0].shape[0] / float(len(cls))
                        MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:,
                                                                       cls ==
                                                                       c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(
                            numpy.mean(Yt) *
                            (clusterPerCent + clusterPerCent2) / 2.0)
                silBs = numpy.array(silBs)
                silB.append(min(silBs))
        silA, silB, sil = numpy.array(silA), numpy.array(silB), list()
        for c in range(iSpeakers):
            sil.append((silB[c] - silA[c]) / (max(silB[c], silA[c]) + 0.00001))
        silAll.append(numpy.mean(sil))

    imax = numpy.argmax(silAll)
    nSpeakersFinal = sRange[imax]

    cls = numpy.zeros((numOfWindows, ))
    for i in range(numOfWindows):
        j = numpy.argmin(numpy.abs(i - iNonOutLiers))
        cls[i] = clsAll[imax][j]

    startprob, transmat, means, cov = trainHMM(MidTermFeaturesNormOr, cls)
    hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], 'diag')
    hmm.startprob_ = startprob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov
    cls = hmm.predict(MidTermFeaturesNormOr.T)
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = silAll[imax]
    classNames = ['SPEAKER{0:d}'.format(c) for c in range(nSpeakersFinal)]

    return cls, classNames, duration, mtStep, silAll
Example #20
 classNames = classNames.split()
 for a in classNames:
     temp = numpy.load(
         os.path.dirname(os.path.realpath(sys.argv[0])) +
         '/classifier_data/' + a + '.npy')
     features.append(temp)
 classifierParams = numpy.array([0.001, 0.01, 0.5, 1.0, 5.0])
 nExp = 50
 bestParam = audioTrainTest.evaluateClassifier(features,
                                               classNames,
                                               nExp,
                                               "svm",
                                               classifierParams,
                                               0,
                                               perTrain=0.01)
 [featuresNorm, MEAN, STD] = audioTrainTest.normalizeFeatures(features)
 MEAN = MEAN.tolist()
 STD = STD.tolist()
 Classifier = audioTrainTest.trainSVM(featuresNorm, bestParam)
 #todo
 #featureAndTrain("/home/fnaser/Music", )
 #Classifier.save_model(os.path.dirname(os.path.realpath(sys.argv[0]))+'/classifier_data/'+modelName)
 with open(
         os.path.dirname(os.path.realpath(sys.argv[0])) +
         '/classifier_data/' + modelName, 'wb') as fid:
     cPickle.dump(Classifier, fid)
 fo = open(
     os.path.dirname(os.path.realpath(sys.argv[0])) + '/classifier_data/' +
     modelName + "MEANS", "wb")
 cPickle.dump(MEAN, fo, protocol=cPickle.HIGHEST_PROTOCOL)
 cPickle.dump(STD, fo, protocol=cPickle.HIGHEST_PROTOCOL)
Example #21
def fileGreenwaySpeakerDiarization(filename, output_folder, speech_key="52fe944f29784ae288482e5eb3092e2a", service_region="eastus2",
                                   n_speakers=2, mt_size=2.0, mt_step=0.2,
                                   st_win=0.05, lda_dim=35):
    """
    ARGUMENTS:
        - filename:        the name of the WAV file to be analyzed
                            the filename should have a suffix of the form: ..._min_3
                            this informs the service that audio file corresponds to the 3rd minute of the dialogue
        - output_folder    the folder location for saving the audio snippets generated from diarization
        - speech_key       Azure Speech service subscription key
        - service_region   Azure Speech service region (e.g. "eastus2")
        - n_speakers       the number of speakers (clusters) in
                           the recording (<=0 for unknown)
        - mt_size (opt)    mid-term window size
        - mt_step (opt)    mid-term window step
        - st_win  (opt)    short-term window size
        - lda_dim (opt)    LDA dimension (0 for no LDA)
    """
    '''
    OUTPUTS:
        - cls:             this is a vector with speaker ids in chronological sequence of speaker dialogue.
        - output:          a list of python dictionaries containing dialogue sequence information.
                            - dialogue_id
                            - sequence_id
                            - start_time
                            - end_time
                            - text
    '''

    filename_only = filename if "/" not in filename else filename.split("/")[-1]
    nameoffile = filename_only.split("_min_")[0]
    timeoffile = filename_only.split("_min_")[1]

    [fs, x] = audioBasicIO.read_audio_file(filename)
    x = audioBasicIO.stereo_to_mono(x)
    duration = len(x) / fs

    [classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "pyAudioAnalysis/data/models", "knn_speaker_10"))
    [classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "pyAudioAnalysis/data/models", "knn_speaker_male_female"))

    [mt_feats, st_feats, _] = aF.mid_feature_extraction(x, fs, mt_size * fs,
                                                        mt_step * fs,
                                                        round(fs * st_win),
                                                        round(fs*st_win * 0.5))

    MidTermFeatures2 = np.zeros((mt_feats.shape[0] + len(classNames1) +
                                 len(classNames2), mt_feats.shape[1]))

    # append the posteriors of the two supervised (knn) models as extra features
    for i in range(mt_feats.shape[1]):
        cur_f1 = (mt_feats[:, i] - MEAN1) / STD1
        cur_f2 = (mt_feats[:, i] - MEAN2) / STD2
        [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
        [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
        MidTermFeatures2[0:mt_feats.shape[0], i] = mt_feats[:, i]
        MidTermFeatures2[mt_feats.shape[0]:mt_feats.shape[0] +
                         len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[mt_feats.shape[0] +
                         len(classNames1)::, i] = P2 + 0.0001

    mt_feats = MidTermFeatures2    # TODO
    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41,
                       42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]

    mt_feats = mt_feats[iFeaturesSelect, :]

    (mt_feats_norm, MEAN, STD) = aT.normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    n_wins = mt_feats.shape[1]

    # remove outliers:
    dist_all = np.sum(distance.squareform(distance.pdist(mt_feats_norm.T)),
                      axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.2 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = np.min(mt_feats[1,:])
    #EnergyMean = np.mean(mt_feats[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #i_non_outliers = np.nonzero(mt_feats[1,:] > Thres)[0]
    # print i_non_outliers

    perOutLier = (100.0 * (n_wins - i_non_outliers.shape[0])) / n_wins
    mt_feats_norm_or = mt_feats_norm
    mt_feats_norm = mt_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        # [mt_feats_to_red, _, _] = aF.mtFeatureExtraction(x, fs, mt_size * fs,
        # st_win * fs, round(fs*st_win), round(fs*st_win));
        # extract mid-term features with minimum step:
        mt_win_ratio = int(round(mt_size / st_win))
        mt_step_ratio = int(round(st_win / st_win))  # = 1, i.e. minimum step
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        # for i in range(num_of_stats * num_of_features + 1):
        for i in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        # for each of the short-term features:
        for i in range(num_of_features):
            curPos = 0
            N = len(st_feats[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mt_win_ratio
                if N2 > N:
                    N2 = N
                curStFeatures = st_feats[i][N1:N2]
                mt_feats_to_red[i].append(np.mean(curStFeatures))
                mt_feats_to_red[i +
                                num_of_features].append(np.std(curStFeatures))
                curPos += mt_step_ratio
        mt_feats_to_red = np.array(mt_feats_to_red)
        mt_feats_to_red_2 = np.zeros((mt_feats_to_red.shape[0] +
                                      len(classNames1) + len(classNames2),
                                      mt_feats_to_red.shape[1]))
        for i in range(mt_feats_to_red.shape[1]):
            cur_f1 = (mt_feats_to_red[:, i] - MEAN1) / STD1
            cur_f2 = (mt_feats_to_red[:, i] - MEAN2) / STD2
            [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
            [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0],
                              i] = mt_feats_to_red[:, i]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:
                              mt_feats_to_red.shape[0] + len(classNames1),
                              i] = P1 + 0.0001
            mt_feats_to_red_2[mt_feats_to_red.shape[0] +
                              len(classNames1)::, i] = P2 + 0.0001
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[iFeaturesSelect, :]
        #mt_feats_to_red += np.random.rand(mt_feats_to_red.shape[0], mt_feats_to_red.shape[1]) * 0.0000010
        (mt_feats_to_red, MEAN, STD) = aT.normalizeFeatures(
            [mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        #dist_all = np.sum(distance.squareform(distance.pdist(mt_feats_to_red.T)), axis=0)
        #m_dist_all = np.mean(dist_all)
        #iNonOutLiers2 = np.nonzero(dist_all < 3.0*m_dist_all)[0]
        #mt_feats_to_red = mt_feats_to_red[:, iNonOutLiers2]
        Labels = np.zeros((mt_feats_to_red.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / st_win
        # print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i*st_win/LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, Labels)
        mt_feats_norm = (clf.transform(mt_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    clsAll = []
    sil_all = []
    centersAll = []

    for iSpeakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(mt_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        # Y = distance.squareform(distance.pdist(mt_feats_norm.T))
        clsAll.append(cls)
        centersAll.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(iSpeakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / \
                float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mt_feats_norm[:, cls == c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                Yt = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(np.mean(Yt)*clust_per_cent)
                silBs = []
                for c2 in range(iSpeakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = np.nonzero(cls == c2)[0].shape[0] /\
                            float(len(cls))
                        MidTermFeaturesNormTemp2 = mt_feats_norm[:, cls == c2]
                        Yt = distance.cdist(mt_feats_norm_temp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(np.mean(Yt)*(clust_per_cent
                                                  + clust_per_cent_2)/2.0)
                silBs = np.array(silBs)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(silBs))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        sil = []
        for c in range(iSpeakers):
            # for each cluster (speaker) compute silhouette
            sil.append((sil_2[c] - sil_1[c]) / (max(sil_2[c],
                                                    sil_1[c]) + 0.00001))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(np.mean(sil))

    imax = np.argmax(sil_all)
    # optimal number of clusters
    nSpeakersFinal = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = np.zeros((n_wins,))
    for i in range(n_wins):
        j = np.argmin(np.abs(i-i_non_outliers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):
        # hmm training
        start_prob, transmat, means, cov = \
            trainHMM_computeStatistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = sil_all[imax]
    class_names = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]

    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if ground-truth exists
    if os.path.isfile(gt_file):
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(
            seg_start, seg_end, seg_labs, mt_step)

    # if plot_res:
    #     fig = plt.figure()
    #     if n_speakers > 0:
    #         ax1 = fig.add_subplot(111)
    #     else:
    #         ax1 = fig.add_subplot(211)
    #     ax1.set_yticks(np.array(range(len(class_names))))
    #     ax1.axis((0, duration, -1, len(class_names)))
    #     ax1.set_yticklabels(class_names)
    #     ax1.plot(np.array(range(len(cls)))*mt_step+mt_step/2.0, cls)

    # if os.path.isfile(gt_file):
    #     if plot_res:
    #         ax1.plot(np.array(range(len(flags_gt))) *
    #                  mt_step + mt_step / 2.0, flags_gt, 'r')
    #     purity_cluster_m, purity_speaker_m = \
    #         evaluateSpeakerDiarization(cls, flags_gt)
    #     print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
    #                                     100 * purity_speaker_m))
        # if plot_res:
        #     plt.title("Cluster purity: {0:.1f}% - "
        #               "Speaker purity: {1:.1f}%".format(100 * purity_cluster_m,
        #                                                 100 * purity_speaker_m))
    # if plot_res:
    #     plt.xlabel("time (seconds)")
    #     # print s_range, sil_all
    #     if n_speakers <= 0:
    #         plt.subplot(212)
    #         plt.plot(s_range, sil_all)
    #         plt.xlabel("number of clusters")
    #         plt.ylabel("average clustering's sillouette")
    #     if save_plot:
    #         plt.savefig(
    #             f"{output_folder}{filename_only}".replace(".wav", ".png"))
    #     else:
    #         pass
    #     plt.show()

    # Create Time Vector
    time_vec = np.array(range(len(cls)))*mt_step+mt_step/2.0

    # Find change points (indices where the label differs from the previous one)
    speaker_change_index = np.where(np.roll(cls, 1) != cls)[0]

    # Create List of dialogue convos
    output_list = []
    temp = {}
    for ind, sc in enumerate(speaker_change_index):
        temp['dialogue_id'] = str(datetime.now()).strip()
        temp['sequence_id'] = str(ind)
        temp['speaker'] = list(cls)[sc]
        temp['start_time'] = time_vec[sc]
        temp['end_time'] = time_vec[speaker_change_index[ind+1] -
                                    1] if ind+1 < len(speaker_change_index) else time_vec[-1]
        temp["text"] = ""
        output_list.append(temp)
        temp = {}

    def snip_transcribe(output_list, filename, output_folder=output_folder,
                        speech_key=speech_key, service_region=service_region):
        speech_config = speechsdk.SpeechConfig(
            subscription=speech_key, region=service_region)
        speech_config.enable_dictation()  # method call: enables dictation mode

        def recognized_cb(evt):
            if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
                # Do something with the recognized text
                output_list[ind]['text'] = output_list[ind]['text'] + \
                    str(evt.result.text)
                print(evt.result.text)

        for ind, diag in enumerate(output_list):
            t1 = diag['start_time']
            t2 = diag['end_time']
            newAudio = AudioSegment.from_wav(filename)
            chunk = newAudio[t1*1000:t2*1000]
            filename_out = output_folder + f"snippet_{diag['sequence_id']}.wav"
            # export the snippet to a wav file in the output folder
            chunk.export(filename_out, format="wav")
            done = False

            def stop_cb(evt):
                """callback that signals to stop continuous recognition upon receiving an event `evt`"""
                print('CLOSING on {}'.format(evt))
                nonlocal done
                done = True

            audio_input = speechsdk.AudioConfig(filename=filename_out)
            speech_recognizer = speechsdk.SpeechRecognizer(
                speech_config=speech_config, audio_config=audio_input)
            output_list[ind]['snippet_path'] = filename_out

            speech_recognizer.recognized.connect(recognized_cb)

            speech_recognizer.session_stopped.connect(stop_cb)
            speech_recognizer.canceled.connect(stop_cb)

            # Start continuous speech recognition
            speech_recognizer.start_continuous_recognition()
            while not done:
                time.sleep(.5)

            speech_recognizer.stop_continuous_recognition()

        return output_list

    output = snip_transcribe(output_list, filename,
                             output_folder=output_folder)
    output_json = {filename_only: output}

    with open(f"{output_folder}{nameoffile}_{timeoffile}.txt", "w") as outfile:
        json.dump(output_json, outfile)

    return cls, output_json
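
A hedged usage sketch for the function above (paths and credentials are placeholders; the "_min_N" suffix is required by the file-name parsing at the top, and output_folder must end with a separator because it is concatenated with file names directly):

cls, output_json = fileGreenwaySpeakerDiarization(
    "recordings/visit42_min_3.wav",   # hypothetical input file
    "snippets/",                      # trailing slash required
    speech_key="<azure-speech-key>",  # placeholder credential
    service_region="eastus2",
    n_speakers=2)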
Exemple #22
0
"""
import os, readchar, sklearn.cluster
from pyAudioAnalysis.audioFeatureExtraction import mtFeatureExtraction as mT
from pyAudioAnalysis.audioBasicIO import readAudioFile, stereo2mono
from pyAudioAnalysis.audioSegmentation import flags2segs
from pyAudioAnalysis.audioTrainTest import normalizeFeatures

if __name__ == '__main__':
    # read signal and get normalized segment features:
    input_file = "../data/song1.mp3"
    fs, x = readAudioFile(input_file)
    x = stereo2mono(x)
    mt_size, mt_step, st_win = 5, 0.5, 0.05
    [mt_feats, st_feats, _] = mT(x, fs, mt_size * fs, mt_step * fs,
                                 round(fs * st_win), round(fs * st_win * 0.5))
    (mt_feats_norm, MEAN, STD) = normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    # perform clustering (k = 4)
    n_clusters = 4
    k_means = sklearn.cluster.KMeans(n_clusters=n_clusters)
    k_means.fit(mt_feats_norm.T)
    cls = k_means.labels_
    segs, c = flags2segs(cls, mt_step)  # convert flags to segment limits
    for sp in range(n_clusters):  # play each cluster's segment
        for i in range(len(c)):
            if c[i] == sp and segs[i, 1] - segs[i, 0] > 5:
                # play long segments of current cluster (only win_to_play seconds)
                d = segs[i, 1] - segs[i, 0]
                win_to_play = 10
                if win_to_play > d:
                    win_to_play = d
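
The playback part of this snippet is cut off at the source; as an alternative way to inspect the clusters, the long segments could be written to disk instead. A rough sketch reusing the variables above (segs, c, x, fs; file names are illustrative):

from scipy.io import wavfile

for i in range(len(c)):
    if segs[i, 1] - segs[i, 0] > 5:  # long segments only, as above
        s0, s1 = int(segs[i, 0] * fs), int(segs[i, 1] * fs)
        wavfile.write("cluster{0}_seg{1}.wav".format(int(c[i]), i), fs, x[s0:s1])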
Exemple #23
0
            [bestParam, result_matrix, precision_classes_all,
             recall_classes_all, f1_classes_all, f1_all, ac_all] = \
                AudioClassifierManager.getResultMatrixAndBestParam(
                    features, classNames, model,
                    AudioClassifierManager.BEST_ACCURACY, perTrain=pT)

            print("Selected params: {0:.5f}".format(bestParam))

            AudioClassifierManager.saveConfusionMatrix(result_matrix,
                                                       classNames, model_name)
            AudioClassifierManager.saveParamsFromClassification(
                classNames,
                AudioClassifierManager.getListParamsForClassifierType(model),
                model_name, precision_classes_all, recall_classes_all,
                f1_classes_all, ac_all, f1_all)

            # Feature normalization:
            (features_norm, MEAN, STD) = aT.normalizeFeatures(features)

            MEAN = MEAN.tolist()
            STD = STD.tolist()
            featuresNew = features_norm

            # Re-apply classification with normalized features and best param
            finalClassifier = AudioClassifierManager.getTrainClassifier(
                featuresNew, model, bestParam)
            # Save final model
            AudioClassifierManager.saveClassifierModel(featuresNew, model_name,
                                                       model, finalClassifier,
                                                       MEAN, STD, classNames,
                                                       bestParam)
def featureAndTrain(list_of_dirs, mt_win, mt_step, st_win, st_step,
                    classifier_type, model_name,
                    compute_beat=False, perTrain=0.90, feats=["gfcc", "mfcc"]):
    '''
    This function is used as a wrapper to segment-based audio feature extraction and classifier training.
    ARGUMENTS:
        list_of_dirs:        list of paths of directories. Each directory contains a single audio class whose samples are stored in separate WAV files.
        mt_win, mt_step:        mid-term window length and step
        st_win, st_step:        short-term window and step
        classifier_type:        "svm" or "knn" or "randomforest" or "gradientboosting" or "extratrees"
        model_name:        name of the model to be saved
    RETURNS:
        None. Resulting classifier along with the respective model parameters are saved on files.
    '''

    # STEP A: Feature Extraction:
    [features, classNames, _] = aF.dirsWavFeatureExtraction(list_of_dirs,
                                                            mt_win,
                                                            mt_step,
                                                            st_win,
                                                            st_step,
                                                            compute_beat=compute_beat,
                                                            feats=feats)

    if len(features) == 0:
        print("trainSVM_feature ERROR: No data found in any input folder!")
        return

    n_feats = features[0].shape[1]
    feature_names = ["features" + str(d + 1) for d in range(n_feats)]

    writeTrainDataToARFF(model_name, features, classNames, feature_names)

    for i, f in enumerate(features):
        if len(f) == 0:
            print("trainSVM_feature ERROR: " + list_of_dirs[i] + " folder is empty or non-existing!")
            return

    # STEP B: classifier Evaluation and Parameter Selection:
    if classifier_type == "svm" or classifier_type == "svm_rbf":
        classifier_par = numpy.array([0.001, 0.01,  0.5, 1.0, 5.0, 10.0, 20.0])
    elif classifier_type == "randomforest":
        classifier_par = numpy.array([10, 25, 50, 100,200,500])
    elif classifier_type == "knn":
        classifier_par = numpy.array([1, 3, 5, 7, 9, 11, 13, 15])
    elif classifier_type == "gradientboosting":
        classifier_par = numpy.array([10, 25, 50, 100,200,500])
    elif classifier_type == "extratrees":
        classifier_par = numpy.array([10, 25, 50, 100,200,500])
    elif classifier_type == "logisticregression":
        classifier_par = numpy.array([0.01, 0.1, 1, 5])

    # get optimal classifier parameter:
    features2 = []
    for f in features:
        fTemp = []
        for i in range(f.shape[0]):
            temp = f[i,:]
            if (not numpy.isnan(temp).any()) and (not numpy.isinf(temp).any()) :
                fTemp.append(temp.tolist())
            else:
                print("NaN Found! Feature vector not used for training")
        features2.append(numpy.array(fTemp))
    features = features2

    bestParam = evaluateclassifier(features, classNames, 300, classifier_type, classifier_par, 0, perTrain)

    print("Selected params: {0:.5f}".format(bestParam))

    C = len(classNames)
    [features_norm, MEAN, STD] = normalizeFeatures(features)        # normalize features
    MEAN = MEAN.tolist()
    STD = STD.tolist()
    featuresNew = features_norm

    # STEP C: Save the classifier to file
    if classifier_type == "svm":
        classifier = trainSVM(featuresNew, bestParam)
    elif classifier_type == "svm_rbf":
        classifier = trainSVM_RBF(featuresNew, bestParam)
    elif classifier_type == "randomforest":
        classifier = trainRandomForest(featuresNew, bestParam)
    elif classifier_type == "gradientboosting":
        classifier = trainGradientBoosting(featuresNew, bestParam)
    elif classifier_type == "extratrees":
        classifier = trainExtraTrees(featuresNew, bestParam)
    elif classifier_type == "logisticregression":
        classifier = trainLogisticRegression(featuresNew, bestParam)


    if classifier_type == "knn":
        [X, Y] = listOfFeatures2Matrix(featuresNew)
        X = X.tolist()
        Y = Y.tolist()
        fo = open(model_name, "wb")
        cPickle.dump(X, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(Y,  fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(MEAN, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(STD,  fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(classNames,  fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(bestParam,  fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mt_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mt_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(st_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(st_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(compute_beat, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        fo.close()
    elif classifier_type == "svm" or classifier_type == "svm_rbf" or \
                    classifier_type == "randomforest" or \
                    classifier_type == "gradientboosting" or \
                    classifier_type == "extratrees" or \
                    classifier_type == "logisticregression":
        with open(model_name, 'wb') as fid:
            cPickle.dump(classifier, fid)
        fo = open(model_name + "MEANS", "wb")
        cPickle.dump(MEAN, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(STD, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mt_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mt_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(st_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(st_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(compute_beat, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        fo.close()
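
A hedged usage sketch for the wrapper above (directory names are hypothetical; each directory holds the WAV files of a single class):

featureAndTrain(["data/music", "data/speech"],  # one folder per class
                1.0, 1.0, 0.05, 0.05,           # mt/st window and step (seconds)
                "svm", "svm_music_speech",
                compute_beat=False, perTrain=0.90)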
Exemple #25
0
def silenceRemoval(x,
                   fs,
                   st_win,
                   st_step,
                   smoothWindow=0.5,
                   weight=0.5,
                   plot=False):
    '''
    Event Detection (silence removal)
    ARGUMENTS:
         - x:                the input audio signal
         - fs:               sampling freq
         - st_win, st_step:    window size and step in seconds
         - smoothWindow:     (optional) smooth window (in seconds)
         - weight:           (optional) weight factor (0 < weight < 1); the higher, the more strict
         - plot:             (optional) True if results are to be plotted
    RETURNS:
         - seg_limits:    list of segment limits in seconds (e.g. [[0.1, 0.9], [1.4, 3.0]] means that
                    the resulting segments are (0.1 - 0.9) seconds and (1.4 - 3.0) seconds)
    '''

    if weight >= 1:
        weight = 0.99
    if weight <= 0:
        weight = 0.01

    # Step 1: feature extraction
    x = audioBasicIO.stereo2mono(x)
    st_feats, _ = aF.stFeatureExtraction(x, fs, st_win * fs, st_step * fs)

    # Step 2: train binary svm classifier of low vs high energy frames
    # keep only the energy short-term sequence (2nd feature)
    st_energy = st_feats[1, :]
    en = numpy.sort(st_energy)
    # number of 10% of the total short-term windows
    l1 = int(len(en) / 10)
    # compute "lower" 10% energy threshold
    t1 = numpy.mean(en[0:l1]) + 0.000000000000001
    # compute "higher" 10% energy threshold
    t2 = numpy.mean(en[-l1:-1]) + 0.000000000000001
    # get all features that correspond to low energy
    class1 = st_feats[:, numpy.where(st_energy <= t1)[0]]
    # get all features that correspond to high energy
    class2 = st_feats[:, numpy.where(st_energy >= t2)[0]]
    # form the binary classification task and ...
    faets_s = [class1.T, class2.T]
    # normalize and train the respective svm probabilistic model
    # (ONSET vs SILENCE)
    [faets_s_norm, means_s, stds_s] = aT.normalizeFeatures(faets_s)
    svm = aT.trainSVM(faets_s_norm, 1.0)

    # Step 3: compute onset probability based on the trained svm
    prob_on_set = []
    for i in range(st_feats.shape[1]):
        # for each frame
        cur_fv = (st_feats[:, i] - means_s) / stds_s
        # get svm probability (that it belongs to the ONSET class)
        prob_on_set.append(svm.predict_proba(cur_fv.reshape(1, -1))[0][1])
    prob_on_set = numpy.array(prob_on_set)
    # smooth probability:
    prob_on_set = smoothMovingAvg(prob_on_set, smoothWindow / st_step)

    # Step 4A: detect onset frame indices:
    prog_on_set_sort = numpy.sort(prob_on_set)
    # find probability Threshold as a weighted average
    # of top 10% and lower 10% of the values
    Nt = int(prog_on_set_sort.shape[0] / 10)
    T = (numpy.mean((1 - weight) * prog_on_set_sort[0:Nt]) +
         weight * numpy.mean(prog_on_set_sort[-Nt::]))

    max_idx = numpy.where(prob_on_set > T)[0]
    # get the indices of the frames that satisfy the thresholding
    i = 0
    time_clusters = []
    seg_limits = []

    # Step 4B: group frame indices to onset segments
    while i < len(max_idx):
        # for each of the detected onset indices
        cur_cluster = [max_idx[i]]
        if i == len(max_idx) - 1:
            break
        while max_idx[i + 1] - cur_cluster[-1] <= 2:
            cur_cluster.append(max_idx[i + 1])
            i += 1
            if i == len(max_idx) - 1:
                break
        i += 1
        time_clusters.append(cur_cluster)
        seg_limits.append(
            [cur_cluster[0] * st_step, cur_cluster[-1] * st_step])

    # Step 5: Post process: remove very small segments:
    min_dur = 0.2
    seg_limits_2 = []
    for s in seg_limits:
        if s[1] - s[0] > min_dur:
            seg_limits_2.append(s)
    seg_limits = seg_limits_2

    if plot:
        timeX = numpy.arange(0, x.shape[0] / float(fs), 1.0 / fs)

        plt.subplot(2, 1, 1)
        plt.plot(timeX, x)
        plt.title('Signal')
        for s in seg_limits:
            plt.axvline(x=s[0])
            plt.axvline(x=s[1])
        plt.subplot(2, 1, 2)
        plt.plot(numpy.arange(0, prob_on_set.shape[0] * st_step, st_step),
                 prob_on_set)
        plt.title('svm Probability')
        for s in seg_limits:
            plt.axvline(x=s[0])
            plt.axvline(x=s[1])
        plt.show()

    return seg_limits
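
A usage sketch for silenceRemoval (input file hypothetical; readAudioFile as in the other examples here):

from pyAudioAnalysis import audioBasicIO

[fs, x] = audioBasicIO.readAudioFile("sample.wav")  # hypothetical file
segments = silenceRemoval(x, fs, 0.020, 0.020,
                          smoothWindow=0.5, weight=0.3, plot=False)
for s in segments:
    print("event: {0:.2f}s - {1:.2f}s".format(s[0], s[1]))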
def evaluateclassifier(features, class_names, n_exp, classifier_name, Params, parameterMode, perTrain=0.90):
    '''
    ARGUMENTS:
        features:     a list ([numOfClasses x 1]) whose elements contain numpy matrices of features;
                each matrix features[i] of class i is [n_samples x numOfDimensions]
        class_names:    list of class names (strings)
        n_exp:        number of cross-validation experiments
        classifier_name: svm or knn or randomforest
        Params:        list of classifier parameters (for parameter tuning during cross-validation)
        parameterMode:    0: choose parameters that lead to maximum overall classification ACCURACY
                1: choose parameters that lead to maximum overall f1 MEASURE
    RETURNS:
         bestParam:    the value of the input parameter that optimizes the selected performance measure
    '''

    # feature normalization:
    (features_norm, MEAN, STD) = normalizeFeatures(features)
    #features_norm = features;
    n_classes = len(features)
    ac_all = []
    f1_all = []
    precision_classes_all = []
    recall_classes_all = []
    f1_classes_all = []
    cms_all = []

    # compute total number of samples:
    n_samples_total = 0
    for f in features:
        n_samples_total += f.shape[0]
    if n_samples_total > 1000 and n_exp > 50:
        n_exp = 50
        print("Number of training experiments changed to 50 due to high number of samples")
    if n_samples_total > 2000 and n_exp > 10:
        n_exp = 10
        print("Number of training experiments changed to 10 due to high number of samples")

    for Ci, C in enumerate(Params):
        # for each param value
        cm = numpy.zeros((n_classes, n_classes))
        for e in range(n_exp):
            # for each cross-validation iteration:
            print("Param = {0:.5f} - classifier Evaluation "
                  "Experiment {1:d} of {2:d}".format(C, e+1, n_exp))
            # split features:
            f_train, f_test = randSplitFeatures(features_norm, perTrain)
            # train multi-class svms:
            if classifier_name == "svm":
                classifier = trainSVM(f_train, C)
            elif classifier_name == "svm_rbf":
                classifier = trainSVM_RBF(f_train, C)
            elif classifier_name == "knn":
                classifier = trainKNN(f_train, C)
            elif classifier_name == "randomforest":
                classifier = trainRandomForest(f_train, C)
            elif classifier_name == "gradientboosting":
                classifier = trainGradientBoosting(f_train, C)
            elif classifier_name == "extratrees":
                classifier = trainExtraTrees(f_train, C)
            elif classifier_name == "logisticregression":
                classifier = trainLogisticRegression(f_train, C)

            cmt = numpy.zeros((n_classes, n_classes))
            for c1 in range(n_classes):
                n_test_samples = len(f_test[c1])
                res = numpy.zeros((n_test_samples, 1))
                for ss in range(n_test_samples):
                    [res[ss], _] = classifierWrapperHead(classifier,
                                                     classifier_name,
                                                     f_test[c1][ss])
                for c2 in range(n_classes):
                    cmt[c1][c2] = float(len(numpy.nonzero(res == c2)[0]))
            cm = cm + cmt
        cm = cm + 0.0000000010
        rec = numpy.zeros((cm.shape[0], ))
        pre = numpy.zeros((cm.shape[0], ))

        for ci in range(cm.shape[0]):
            rec[ci] = cm[ci, ci] / numpy.sum(cm[ci, :])
            pre[ci] = cm[ci, ci] / numpy.sum(cm[:, ci])
        precision_classes_all.append(pre)
        recall_classes_all.append(rec)
        f1 = 2 * rec * pre / (rec + pre)
        f1_classes_all.append(f1)
        ac_all.append(numpy.sum(numpy.diagonal(cm)) / numpy.sum(cm))

        cms_all.append(cm)
        f1_all.append(numpy.mean(f1))

    print("\t\t", end="")
    for i, c in enumerate(class_names):
        if i == len(class_names)-1:
            print("{0:s}\t\t".format(c), end="")
        else:
            print("{0:s}\t\t\t".format(c), end="")
    print("OVERALL")
    print("\tC", end="")
    for c in class_names:
        print("\tPRE\tREC\tf1", end="")
    print("\t{0:s}\t{1:s}".format("ACC", "f1"))
    best_ac_ind = numpy.argmax(ac_all)
    best_f1_ind = numpy.argmax(f1_all)
    for i in range(len(precision_classes_all)):
        print("\t{0:.3f}".format(Params[i]), end="")
        for c in range(len(precision_classes_all[i])):
            print("\t{0:.1f}\t{1:.1f}\t{2:.1f}".format(100.0 * precision_classes_all[i][c],
                                                       100.0 * recall_classes_all[i][c],
                                                       100.0 * f1_classes_all[i][c]), end="")
        print("\t{0:.1f}\t{1:.1f}".format(100.0 * ac_all[i], 100.0 * f1_all[i]), end="")
        if i == best_f1_ind:
            print("\t best f1", end="")
        if i == best_ac_ind:
            print("\t best Acc", end="")
        print("")

    if parameterMode == 0:    # keep parameters that maximize overall classification accuracy:
        print("Confusion Matrix:")
        printConfusionMatrix(cms_all[best_ac_ind], class_names)
        return Params[best_ac_ind]
    elif parameterMode == 1:  # keep parameters that maximize overall f1 measure:
        print("Confusion Matrix:")
        printConfusionMatrix(cms_all[best_f1_ind], class_names)
        return Params[best_f1_ind]
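
A minimal smoke test of the evaluation loop above on synthetic two-class data (purely illustrative; the constant offset makes the classes separable):

import numpy

feats = [numpy.random.randn(50, 4),        # class A
         numpy.random.randn(50, 4) + 2.0]  # class B, shifted
best_c = evaluateclassifier(feats, ["classA", "classB"], 10,
                            "svm", numpy.array([0.1, 1.0, 5.0]), 0,
                            perTrain=0.80)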
Exemple #27
0
import os
import sys
import numpy
import cPickle
import rospy
from pyAudioAnalysis import audioTrainTest

if __name__ == '__main__':
    rospy.init_node("classifier_train_node")
    modelName = rospy.get_param('~classifier_name', 'modelSVM')
    features = []
    classNames = rospy.get_param('~classes', 'silence speech')  # space-separated string (a set default would break .split())
    classNames = classNames.split()
    for a in classNames:
        temp = numpy.load(os.path.dirname(os.path.realpath(sys.argv[0]))+'/classifier_data/'+a+'.npy')        
        features.append(temp)                                
    classifierParams = numpy.array([0.001, 0.01,  0.5, 1.0, 5.0])
    nExp = 50
    bestParam = audioTrainTest.evaluateClassifier(features, classNames, nExp, "svm", classifierParams, 0, perTrain = 0.01)
    [featuresNorm, MEAN, STD] = audioTrainTest.normalizeFeatures(features)        # normalize features
    MEAN = MEAN.tolist()
    STD = STD.tolist()
    featuresNew = featuresNorm
    Classifier = audioTrainTest.trainSVM(featuresNew, bestParam)
    Classifier.save_model(os.path.dirname(os.path.realpath(sys.argv[0]))+'/classifier_data/'+modelName)  # note: assumes the legacy SVM object exposing save_model; sklearn classifiers must be pickled instead (cf. Exemple #20)
    fo = open(os.path.dirname(os.path.realpath(sys.argv[0]))+'/classifier_data/'+modelName + "MEANS", "wb")
    cPickle.dump(MEAN, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(STD, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(0, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(0, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(0, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(0, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(0, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()
def visualizeFeaturesFolder(folder, dimReductionMethod, priorKnowledge = "none"):
    '''
    This function generates a chordial visualization for the recordings of the provided path.
    ARGUMENTS:
        - folder:        path of the folder that contains the WAV files to be processed
        - dimReductionMethod:    method used to reduce the dimension of the initial feature space before computing the similarity ("pca", otherwise the LDA branch is used)
        - priorKnowledge:    if set to "artist", the LDA labels are grouped per artist, using the file-name prefix before " --- " as the artist name
    '''
    if dimReductionMethod=="pca":
        allMtFeatures, wavFilesList, _ = aF.dirWavFeatureExtraction(folder, 30.0, 30.0, 0.050, 0.050, compute_beat = True)
        if allMtFeatures.shape[0]==0:
            print("Error: No data found! Check input folder")
            return
        
        namesCategoryToVisualize = [ntpath.basename(w).replace('.wav', '').split(" --- ")[0] for w in wavFilesList]
        namesToVisualize = [ntpath.basename(w).replace('.wav', '') for w in wavFilesList]

        (F, MEAN, STD) = aT.normalizeFeatures([allMtFeatures])
        F = np.concatenate(F)
        
        # check that the new PCA dimension is at most equal to the number of samples
        K1 = 2
        K2 = 10
        if K1 > F.shape[0]:
            K1 = F.shape[0]
        if K2 > F.shape[0]:
            K2 = F.shape[0]
        pca1 = sklearn.decomposition.PCA(n_components = K1)
        pca1.fit(F)        
        pca2 = sklearn.decomposition.PCA(n_components = K2)
        pca2.fit(F)        

        finalDims = pca1.transform(F)
        finalDims2 = pca2.transform(F)
    else:    
        allMtFeatures, Ys, wavFilesList = aF.dirWavFeatureExtractionNoAveraging(folder, 20.0, 5.0, 0.040, 0.040) # long-term statistics cannot be applied in this context (LDA needs mid-term features)
        if allMtFeatures.shape[0]==0:
            print("Error: No data found! Check input folder")
            return
        
        namesCategoryToVisualize = [ntpath.basename(w).replace('.wav', '').split(" --- ")[0] for w in wavFilesList]
        namesToVisualize = [ntpath.basename(w).replace('.wav', '') for w in wavFilesList]

        ldaLabels = Ys
        if priorKnowledge=="artist":
            uNamesCategoryToVisualize = list(set(namesCategoryToVisualize))
            YsNew = np.zeros( Ys.shape )
            for i, uname in enumerate(uNamesCategoryToVisualize):        # for each unique artist name:
                indicesUCategories = [j for j, x in enumerate(namesCategoryToVisualize) if x == uname]
                for j in indicesUCategories:
                    indices = np.nonzero(Ys==j)
                    YsNew[indices] = i
            ldaLabels = YsNew

        (F, MEAN, STD) = aT.normalizeFeatures([allMtFeatures])
        F = np.array(F[0])

        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(n_components=10)
        clf.fit(F, ldaLabels)    
        reducedDims =  clf.transform(F)

        pca = sklearn.decomposition.PCA(n_components = 2)
        pca.fit(reducedDims)
        reducedDims = pca.transform(reducedDims)

        # TODO: CHECK THIS ... SHOULD LDA USED IN SEMI-SUPERVISED ONLY????

        uLabels = np.sort(np.unique((Ys)))        # uLabels must have as many labels as the number of wavFilesList elements
        reducedDimsAvg = np.zeros( (uLabels.shape[0], reducedDims.shape[1] ) )
        finalDims = np.zeros( (uLabels.shape[0], 2) ) 
        for i, u in enumerate(uLabels):
            indices = [j for j, x in enumerate(Ys) if x == u]
            f = reducedDims[indices, :]
            finalDims[i, :] = f.mean(axis=0)
        finalDims2 = reducedDims

    for i in range(finalDims.shape[0]):            
        plt.text(finalDims[i,0], finalDims[i,1], ntpath.basename(wavFilesList[i].replace('.wav','')), horizontalalignment='center', verticalalignment='center', fontsize=10)
        plt.plot(finalDims[i,0], finalDims[i,1], '*r')
    plt.xlim([1.2*finalDims[:,0].min(), 1.2*finalDims[:,0].max()])
    plt.ylim([1.2*finalDims[:,1].min(), 1.2*finalDims[:,1].max()])            
    plt.show()

    SM = 1.0 - distance.squareform(distance.pdist(finalDims2, 'cosine'))
    for i in range(SM.shape[0]):
        SM[i, i] = 0.0


    chordialDiagram("visualization", SM, 0.50, namesToVisualize, namesCategoryToVisualize)

    SM = 1.0 - distance.squareform(distance.pdist(F, 'cosine'))
    for i in range(SM.shape[0]):
        SM[i, i] = 0.0
    chordialDiagram("visualizationInitial", SM, 0.50, namesToVisualize, namesCategoryToVisualize)

    # plot super-categories (i.e. artist names)
    uNamesCategoryToVisualize = sorted(list(set(namesCategoryToVisualize)))
    finalDimsGroup = np.zeros( (len(uNamesCategoryToVisualize), finalDims2.shape[1] ) )
    for i, uname in enumerate(uNamesCategoryToVisualize):
        indices = [j for j, x in enumerate(namesCategoryToVisualize) if x == uname]
        f = finalDims2[indices, :]
        finalDimsGroup[i, :] = f.mean(axis=0)

    SMgroup = 1.0 - distance.squareform(distance.pdist(finalDimsGroup, 'cosine'))
    for i in range(SMgroup.shape[0]):
        SMgroup[i, i] = 0.0
    chordialDiagram("visualizationGroup", SMgroup, 0.50, uNamesCategoryToVisualize, uNamesCategoryToVisualize)
def evaluateClassifier(argv):
    save = argv[5]

    dirName = argv[2]  # path to csv files
    fileList = sorted(glob.glob(os.path.join(dirName, "*.csv")))

    #data = {}
    #data['user'] = {}
    user = []
    exercise = []
    repetition = []
    time = []
    emg_raw = []
    gt_labels = []
    feature_vectors_nofatigue = []
    feature_vectors_fatigue = []

    for file in fileList:

        with open(file, 'r') as f:
            x = f.readlines()
            if not x:
                continue
            time.append([float(label.split(',')[0]) for label in x])
            emg_raw.append([float(label.split(',')[1]) for label in x])
            gt_labels.append(
                [int(label.split(',')[2].rstrip()) for label in x])

        # split the sample into the positive and negative classes
        feature_vectors, gtWindowLabels = featureExtraction(
            emg_raw[-1], time[-1], gt_labels[-1], 2, 1, 0.25, 0.25)

        for i, w in enumerate(gtWindowLabels):
            if w == 0:
                feature_vectors_nofatigue.append(feature_vectors[:, i])
            else:
                feature_vectors_fatigue.append(feature_vectors[:, i])

        user.append(file.split('/')[-1].split('E')[0][1:])
        exercise.append(file.split('/')[-1].split('R')[0][-1])
        repetition.append(file.split('/')[-1].split('.')[0][-1])

        if argv[-1] == '-s':
            showEMGData(emg_raw[-1], time[-1][-1] - time[-1][0], gt_labels[-1])

    #Collect all features
    featuresAll = []
    featuresAll.append(np.array(feature_vectors_nofatigue))
    featuresAll.append(np.array(feature_vectors_fatigue))
    labelsAll = ['0:NoFatigue', '1:Fatigue']

    # Normalize features
    (featuresAll, MEAN, STD) = aT.normalizeFeatures(featuresAll)

    clf = argv[3][1:]
    params = argv[4]
    bestParam = aT.evaluateclassifier(featuresAll,
                                      labelsAll,
                                      1000,
                                      clf,
                                      params,
                                      0,
                                      perTrain=0.80)

    MEAN = MEAN.tolist()
    STD = STD.tolist()

    model = Classify(clf, featuresAll, bestParam)

    if save:
        saveClassifier(clf, bestParam, model, MEAN, STD, labelsAll)

    print 'Training of', clf, 'completed'

    return clf, model, labelsAll, MEAN, STD, bestParam
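
The function above is driven by a sys.argv-style list; a hedged sketch of the expected layout, inferred from the indexing in the code (argv[3] has its first character stripped, so the classifier name carries a leading dash):

import numpy

argv = ["prog.py", "train",            # argv[0], argv[1] (unused here)
        "data/emg_csvs",               # argv[2]: folder containing the csv files
        "-svm",                        # argv[3]: classifier name ('-' is stripped)
        numpy.array([0.1, 1.0, 5.0]),  # argv[4]: parameter grid
        True]                          # argv[5]: save flag
clf, model, labels, MEAN, STD, bestParam = evaluateClassifier(argv)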
Exemple #30
0
def visualizeFeaturesFolder(folder, dimReductionMethod, priorKnowledge="none"):
    '''
    This function generates a chordial visualization for the recordings of the provided path.
    ARGUMENTS:
        - folder:        path of the folder that contains the WAV files to be processed
        - dimReductionMethod:    method used to reduce the dimension of the initial feature space before computing the similarity ("pca", otherwise the LDA branch is used)
        - priorKnowledge:    if set to "artist", the LDA labels are grouped per artist, using the file-name prefix before " --- " as the artist name
    '''
    if dimReductionMethod == "pca":
        allMtFeatures, wavFilesList, _ = aF.dirWavFeatureExtraction(
            folder, 30.0, 30.0, 0.050, 0.050, compute_beat=True)
        if allMtFeatures.shape[0] == 0:
            print("Error: No data found! Check input folder")
            return

        namesCategoryToVisualize = [
            ntpath.basename(w).replace('.wav', '').split(" --- ")[0]
            for w in wavFilesList
        ]
        namesToVisualize = [
            ntpath.basename(w).replace('.wav', '') for w in wavFilesList
        ]

        (F, MEAN, STD) = aT.normalizeFeatures([allMtFeatures])
        F = np.concatenate(F)

        # check that the new PCA dimension is at most equal to the number of samples
        K1 = 2
        K2 = 10
        if K1 > F.shape[0]:
            K1 = F.shape[0]
        if K2 > F.shape[0]:
            K2 = F.shape[0]
        pca1 = sklearn.decomposition.PCA(n_components=K1)
        pca1.fit(F)
        pca2 = sklearn.decomposition.PCA(n_components=K2)
        pca2.fit(F)

        finalDims = pca1.transform(F)
        finalDims2 = pca2.transform(F)
    else:
        allMtFeatures, Ys, wavFilesList = aF.dirWavFeatureExtractionNoAveraging(
            folder, 20.0, 5.0, 0.040, 0.040
        )  # long-term statistics cannot be applied in this context (LDA needs mid-term features)
        if allMtFeatures.shape[0] == 0:
            print("Error: No data found! Check input folder")
            return

        namesCategoryToVisualize = [
            ntpath.basename(w).replace('.wav', '').split(" --- ")[0]
            for w in wavFilesList
        ]
        namesToVisualize = [
            ntpath.basename(w).replace('.wav', '') for w in wavFilesList
        ]

        ldaLabels = Ys
        if priorKnowledge == "artist":
            uNamesCategoryToVisualize = list(set(namesCategoryToVisualize))
            YsNew = np.zeros(Ys.shape)
            for i, uname in enumerate(
                    uNamesCategoryToVisualize):  # for each unique artist name:
                indicesUCategories = [
                    j for j, x in enumerate(namesCategoryToVisualize)
                    if x == uname
                ]
                for j in indicesUCategories:
                    indices = np.nonzero(Ys == j)
                    YsNew[indices] = i
            ldaLabels = YsNew

        (F, MEAN, STD) = aT.normalizeFeatures([allMtFeatures])
        F = np.array(F[0])

        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=10)
        clf.fit(F, ldaLabels)
        reducedDims = clf.transform(F)

        pca = sklearn.decomposition.PCA(n_components=2)
        pca.fit(reducedDims)
        reducedDims = pca.transform(reducedDims)

        # TODO: CHECK THIS ... SHOULD LDA USED IN SEMI-SUPERVISED ONLY????

        uLabels = np.sort(
            np.unique((Ys))
        )  # uLabels must have as many labels as the number of wavFilesList elements
        reducedDimsAvg = np.zeros((uLabels.shape[0], reducedDims.shape[1]))
        finalDims = np.zeros((uLabels.shape[0], 2))
        for i, u in enumerate(uLabels):
            indices = [j for j, x in enumerate(Ys) if x == u]
            f = reducedDims[indices, :]
            finalDims[i, :] = f.mean(axis=0)
        finalDims2 = reducedDims

    for i in range(finalDims.shape[0]):
        plt.text(finalDims[i, 0],
                 finalDims[i, 1],
                 ntpath.basename(wavFilesList[i].replace('.wav', '')),
                 horizontalalignment='center',
                 verticalalignment='center',
                 fontsize=10)
        plt.plot(finalDims[i, 0], finalDims[i, 1], '*r')
    plt.xlim([1.2 * finalDims[:, 0].min(), 1.2 * finalDims[:, 0].max()])
    plt.ylim([1.2 * finalDims[:, 1].min(), 1.2 * finalDims[:, 1].max()])
    plt.show()

    SM = 1.0 - distance.squareform(distance.pdist(finalDims2, 'cosine'))
    for i in range(SM.shape[0]):
        SM[i, i] = 0.0

    chordialDiagram("visualization", SM, 0.50, namesToVisualize,
                    namesCategoryToVisualize)

    SM = 1.0 - distance.squareform(distance.pdist(F, 'cosine'))
    for i in range(SM.shape[0]):
        SM[i, i] = 0.0
    chordialDiagram("visualizationInitial", SM, 0.50, namesToVisualize,
                    namesCategoryToVisualize)

    # plot super-categories (i.e. artist names)
    uNamesCategoryToVisualize = sorted(list(set(namesCategoryToVisualize)))
    finalDimsGroup = np.zeros(
        (len(uNamesCategoryToVisualize), finalDims2.shape[1]))
    for i, uname in enumerate(uNamesCategoryToVisualize):
        indices = [
            j for j, x in enumerate(namesCategoryToVisualize) if x == uname
        ]
        f = finalDims2[indices, :]
        finalDimsGroup[i, :] = f.mean(axis=0)

    SMgroup = 1.0 - distance.squareform(
        distance.pdist(finalDimsGroup, 'cosine'))
    for i in range(SMgroup.shape[0]):
        SMgroup[i, i] = 0.0
    chordialDiagram("visualizationGroup", SMgroup, 0.50,
                    uNamesCategoryToVisualize, uNamesCategoryToVisualize)
Exemple #31
0
def silenceRemoval(x,
                   Fs,
                   stWin,
                   stStep,
                   smoothWindow=0.5,
                   Weight=0.5,
                   plot=False):
    '''
    Event Detection (silence removal)
    ARGUMENTS:
         - x:                the input audio signal
         - Fs:               sampling freq
         - stWin, stStep:    window size and step in seconds
         - smoothWindow:     (optional) smooth window (in seconds)
         - Weight:           (optional) weight factor (0 < Weight < 1); the higher, the more strict
         - plot:             (optional) True if results are to be plotted
    RETURNS:
         - segmentLimits:    list of segment limits in seconds (e.g. [[0.1, 0.9], [1.4, 3.0]] means that
                    the resulting segments are (0.1 - 0.9) seconds and (1.4 - 3.0) seconds)
    '''

    if Weight >= 1:
        Weight = 0.99
    if Weight <= 0:
        Weight = 0.01

    # Step 1: feature extraction
    x = audioBasicIO.stereo2mono(x)  # convert to mono
    ShortTermFeatures = aF.stFeatureExtraction(
        x, Fs, stWin * Fs, stStep * Fs)  # extract short-term features

    # Step 2: train binary SVM classifier of low vs high energy frames
    EnergySt = ShortTermFeatures[
        1, :]  # keep only the energy short-term sequence (2nd feature)
    E = numpy.sort(EnergySt)  # sort the energy feature values:
    L1 = int(len(E) / 10)  # number of 10% of the total short-term windows
    T1 = numpy.mean(
        E[0:L1]) + 0.000000000000001  # compute "lower" 10% energy threshold
    T2 = numpy.mean(
        E[-L1:-1]) + 0.000000000000001  # compute "higher" 10% energy threshold
    Class1 = ShortTermFeatures[:, numpy.where(
        EnergySt <= T1)[0]]  # get all features that correspond to low energy
    Class2 = ShortTermFeatures[:, numpy.where(
        EnergySt >= T2)[0]]  # get all features that correspond to high energy
    featuresSS = [Class1.T,
                  Class2.T]  # form the binary classification task and ...

    [featuresNormSS, MEANSS,
     STDSS] = aT.normalizeFeatures(featuresSS)  # normalize and ...
    SVM = aT.trainSVM(
        featuresNormSS,
        1.0)  # train the respective SVM probabilistic model (ONSET vs SILENCE)

    # Step 3: compute onset probability based on the trained SVM
    ProbOnset = []
    for i in range(ShortTermFeatures.shape[1]):  # for each frame
        curFV = (ShortTermFeatures[:, i] -
                 MEANSS) / STDSS  # normalize feature vector
        ProbOnset.append(
            SVM.predict_proba(curFV.reshape(1, -1))[0]
            [1])  # get SVM probability (that it belongs to the ONSET class)
    ProbOnset = numpy.array(ProbOnset)
    ProbOnset = smoothMovingAvg(ProbOnset,
                                smoothWindow / stStep)  # smooth probability

    # Step 4A: detect onset frame indices:
    ProbOnsetSorted = numpy.sort(
        ProbOnset
    )  # find probability Threshold as a weighted average of top 10% and lower 10% of the values
    Nt = int(ProbOnsetSorted.shape[0] / 10)
    T = (numpy.mean((1 - Weight) * ProbOnsetSorted[0:Nt]) +
         Weight * numpy.mean(ProbOnsetSorted[-Nt::]))

    MaxIdx = numpy.where(ProbOnset > T)[
        0]  # get the indices of the frames that satisfy the thresholding
    i = 0
    timeClusters = []
    segmentLimits = []

    # Step 4B: group frame indices to onset segments
    while i < len(MaxIdx):  # for each of the detected onset indices
        curCluster = [MaxIdx[i]]
        if i == len(MaxIdx) - 1:
            break
        while MaxIdx[i + 1] - curCluster[-1] <= 2:
            curCluster.append(MaxIdx[i + 1])
            i += 1
            if i == len(MaxIdx) - 1:
                break
        i += 1
        timeClusters.append(curCluster)
        segmentLimits.append([curCluster[0] * stStep, curCluster[-1] * stStep])

    # Step 5: Post process: remove very small segments:
    minDuration = 0.2
    segmentLimits2 = []
    for s in segmentLimits:
        if s[1] - s[0] > minDuration:
            segmentLimits2.append(s)
    segmentLimits = segmentLimits2

    if plot:
        timeX = numpy.arange(0, x.shape[0] / float(Fs), 1.0 / Fs)

        plt.subplot(2, 1, 1)
        plt.plot(timeX, x)
        plt.title('Signal')
        for s in segmentLimits:
            plt.axvline(x=s[0])
            plt.axvline(x=s[1])
        plt.subplot(2, 1, 2)
        plt.plot(numpy.arange(0, ProbOnset.shape[0] * stStep, stStep),
                 ProbOnset)
        plt.title('SVM Probability')
        for s in segmentLimits:
            plt.axvline(x=s[0])
            plt.axvline(x=s[1])
        plt.show()

    return segmentLimits
Exemple #32
0
def speakerDiarization(filename,
                       n_speakers,
                       mt_size=2.0,
                       mt_step=0.2,
                       st_win=0.05,
                       lda_dim=35,
                       plot_res=False):
    '''
    ARGUMENTS:
        - filename:        the name of the WAV file to be analyzed
        - n_speakers    the number of speakers (clusters) in the recording (<=0 for unknown)
        - mt_size (opt)     mid-term window size
        - mt_step (opt)     mid-term window step
        - st_win  (opt)     short-term window size
        - lda_dim (opt)     LDA dimension (0 for no LDA)
        - plot_res     (opt)   0 for not plotting the results, 1 for plotting
    '''
    [fs, x] = audioBasicIO.readAudioFile(filename)
    x = audioBasicIO.stereo2mono(x)
    duration = len(x) / fs

    [
        classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1,
        stStep1, computeBEAT1
    ] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "data",
                     "knnSpeakerAll"))
    [
        classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2,
        stStep2, computeBEAT2
    ] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "data",
                     "knnSpeakerFemaleMale"))

    [mt_feats, st_feats, _] = aF.mtFeatureExtraction(x, fs, mt_size * fs,
                                                     mt_step * fs,
                                                     round(fs * st_win),
                                                     round(fs * st_win * 0.5))

    MidTermFeatures2 = numpy.zeros(
        (mt_feats.shape[0] + len(classNames1) + len(classNames2),
         mt_feats.shape[1]))

    for i in range(mt_feats.shape[1]):
        cur_f1 = (mt_feats[:, i] - MEAN1) / STD1
        cur_f2 = (mt_feats[:, i] - MEAN2) / STD2
        [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
        [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
        MidTermFeatures2[0:mt_feats.shape[0], i] = mt_feats[:, i]
        MidTermFeatures2[mt_feats.shape[0]:mt_feats.shape[0] +
                         len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[mt_feats.shape[0] + len(classNames1)::,
                         i] = P2 + 0.0001

    mt_feats = MidTermFeatures2  # TODO
    iFeaturesSelect = [
        8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45,
        46, 47, 48, 49, 50, 51, 52, 53
    ]

    mt_feats = mt_feats[iFeaturesSelect, :]

    (mt_feats_norm, MEAN, STD) = aT.normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    n_wins = mt_feats.shape[1]

    # remove outliers:
    dist_all = numpy.sum(distance.squareform(distance.pdist(mt_feats_norm.T)),
                         axis=0)
    m_dist_all = numpy.mean(dist_all)
    i_non_outliers = numpy.nonzero(dist_all < 1.2 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = numpy.min(mt_feats[1,:])
    #EnergyMean = numpy.mean(mt_feats[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #i_non_outliers = numpy.nonzero(mt_feats[1,:] > Thres)[0]
    #print i_non_outliers

    perOutLier = (100.0 * (n_wins - i_non_outliers.shape[0])) / n_wins
    mt_feats_norm_or = mt_feats_norm
    mt_feats_norm = mt_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        #[mt_feats_to_red, _, _] = aF.mtFeatureExtraction(x, fs, mt_size * fs, st_win * fs, round(fs*st_win), round(fs*st_win));
        # extract mid-term features with minimum step:
        mt_win_ratio = int(round(mt_size / st_win))
        mt_step_ratio = int(round(st_win / st_win))  # always 1: step of one short-term frame
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        #for i in range(num_of_stats * num_of_features + 1):
        for i in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        for i in range(
                num_of_features):  # for each of the short-term features:
            curPos = 0
            N = len(st_feats[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mt_win_ratio
                if N2 > N:
                    N2 = N
                curStFeatures = st_feats[i][N1:N2]
                mt_feats_to_red[i].append(numpy.mean(curStFeatures))
                mt_feats_to_red[i + num_of_features].append(
                    numpy.std(curStFeatures))
                curPos += mt_step_ratio
        mt_feats_to_red = numpy.array(mt_feats_to_red)
        mt_feats_to_red_2 = numpy.zeros(
            (mt_feats_to_red.shape[0] + len(classNames1) + len(classNames2),
             mt_feats_to_red.shape[1]))
        for i in range(mt_feats_to_red.shape[1]):
            cur_f1 = (mt_feats_to_red[:, i] - MEAN1) / STD1
            cur_f2 = (mt_feats_to_red[:, i] - MEAN2) / STD2
            [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
            [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0],
                              i] = mt_feats_to_red[:, i]
            mt_feats_to_red_2[
                mt_feats_to_red.shape[0]:mt_feats_to_red.shape[0] +
                len(classNames1), i] = P1 + 0.0001
            mt_feats_to_red_2[mt_feats_to_red.shape[0] + len(classNames1)::,
                              i] = P2 + 0.0001
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[iFeaturesSelect, :]
        #mt_feats_to_red += numpy.random.rand(mt_feats_to_red.shape[0], mt_feats_to_red.shape[1]) * 0.0000010
        (mt_feats_to_red, MEAN,
         STD) = aT.normalizeFeatures([mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        #dist_all = numpy.sum(distance.squareform(distance.pdist(mt_feats_to_red.T)), axis=0)
        #m_dist_all = numpy.mean(dist_all)
        #iNonOutLiers2 = numpy.nonzero(dist_all < 3.0*m_dist_all)[0]
        #mt_feats_to_red = mt_feats_to_red[:, iNonOutLiers2]
        Labels = numpy.zeros((mt_feats_to_red.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / st_win
        #print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i / LDAstepRatio)  # one label per LDAstep seconds
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, Labels)
        mt_feats_norm = (clf.transform(mt_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    clsAll = []
    sil_all = []
    centersAll = []

    for iSpeakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(mt_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        # Y = distance.squareform(distance.pdist(mt_feats_norm.T))
        clsAll.append(cls)
        centersAll.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(iSpeakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = numpy.nonzero(cls == c)[0].shape[0] / \
                             float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mt_feats_norm[:, cls == c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                Yt = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(numpy.mean(Yt) * clust_per_cent)
                silBs = []
                for c2 in range(iSpeakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = numpy.nonzero(cls == c2)[0].shape[0] /\
                                           float(len(cls))
                        MidTermFeaturesNormTemp2 = mt_feats_norm[:, cls == c2]
                        Yt = distance.cdist(mt_feats_norm_temp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(
                            numpy.mean(Yt) *
                            (clust_per_cent + clust_per_cent_2) / 2.0)
                silBs = numpy.array(silBs)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(silBs))
        sil_1 = numpy.array(sil_1)
        sil_2 = numpy.array(sil_2)
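        # silhouette width per cluster: s(c) = (b_c - a_c) / max(a_c, b_c),
        # with a_c = sil_1[c] (weighted intra-cluster distance) and
        # b_c = sil_2[c] (distance to the nearest other cluster)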
        sil = []
        for c in range(iSpeakers):
            # for each cluster (speaker) compute silhouette
            sil.append(
                (sil_2[c] - sil_1[c]) / (max(sil_2[c], sil_1[c]) + 0.00001))
        # keep the average silhouette over all clusters
        sil_all.append(numpy.mean(sil))

    imax = numpy.argmax(sil_all)
    # optimal number of clusters
    nSpeakersFinal = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = numpy.zeros((n_wins, ))
    for i in range(n_wins):
        j = numpy.argmin(numpy.abs(i - i_non_outliers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):  # a single HMM smoothing pass
        # hmm training
        start_prob, transmat, means, cov = \
            trainHMM_computeStatistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = sil_all[imax]
    class_names = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]

    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if a ground-truth file exists
    if os.path.isfile(gt_file):
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start, seg_end, seg_labs,
                                              mt_step)

    if plot_res:
        fig = plt.figure()
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(numpy.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(numpy.array(range(len(cls))) * mt_step + mt_step / 2.0, cls)

    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(
                numpy.array(range(len(flags_gt))) * mt_step + mt_step / 2.0,
                flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
            evaluateSpeakerDiarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(
                          100 * purity_cluster_m, 100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        #print s_range, sil_all
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's sillouette")
        plt.show()
    return cls
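
A typical call of the function defined above; "meeting.wav" is a placeholder path, and passing n_speakers <= 0 triggers the silhouette-based search over 2 to 9 clusters shown in the code:

# Hypothetical usage of the speakerDiarization() from this example.
flags = speakerDiarization("meeting.wav", n_speakers=0, plot_res=False)
# flags[i] is the speaker label of the i-th mid-term window,
# i.e. the window starting at i * mt_step seconds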
Exemple #33
0
def speakerDiarization(fileName, sRange = xrange(2, 10), mtSize = 2.0, mtStep = 0.2, stWin = 0.05, LDAdim = 35):
	Fs, x = audioBasicIO.readAudioFile(fileName)
	x = audioBasicIO.stereo2mono(x)
	duration = len(x) / Fs

	Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1 = aT.loadKNNModel(os.path.join('/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data', 'knnSpeakerAll'))
	Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2 = aT.loadKNNModel(os.path.join('/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data', 'knnSpeakerFemaleMale'))

	MidTermFeatures, ShortTermFeatures = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stWin * 0.5))
	MidTermFeatures2 = numpy.zeros((MidTermFeatures.shape[0] + len(classNames1) + len(classNames2), MidTermFeatures.shape[1]))

	for i in range(MidTermFeatures.shape[1]):
		curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
		curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2

		Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1)
		Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2)

		MidTermFeatures2[0: MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
		MidTermFeatures2[MidTermFeatures.shape[0]: MidTermFeatures.shape[0] + len(classNames1), i] = P1 + 0.0001
		MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1):, i] = P2 + 0.0001

	MidTermFeatures = MidTermFeatures2
	iFeaturesSelect = range(8, 21) + range(41, 54)
	MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]

	MidTermFeaturesNorm, MEAN, STD = aT.normalizeFeatures([MidTermFeatures.T])
	MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
	numOfWindows = MidTermFeatures.shape[1]

	DistancesAll = numpy.sum(distance.squareform(distance.pdist(MidTermFeaturesNorm.T)), axis = 0)
	MDistancesAll = numpy.mean(DistancesAll)
	iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]

	perOutLier = (100.0 * (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows
	MidTermFeaturesNormOr = MidTermFeaturesNorm
	MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

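	# LDA-based dimensionality reduction, same scheme as in the previous example: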
	if LDAdim > 0:
		mtWinRatio, mtStepRatio, mtFeaturesToReduce, numOfFeatures, numOfStatistics = int(round(mtSize / stWin)), int(round(stWin / stWin)), list(), len(ShortTermFeatures), 2
		for i in range(numOfStatistics * numOfFeatures): mtFeaturesToReduce.append(list())

		for i in range(numOfFeatures):
			curPos = 0
			N = len(ShortTermFeatures[i])
			while (curPos < N):
				N1, N2 = curPos, curPos + mtWinRatio
				if N2 > N: N2 = N
				curStFeatures = ShortTermFeatures[i][N1: N2]
				mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
				mtFeaturesToReduce[i + numOfFeatures].append(numpy.std(curStFeatures))
				curPos += mtStepRatio

		mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
		mtFeaturesToReduce2 = numpy.zeros((mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2), mtFeaturesToReduce.shape[1]))
		for i in range(mtFeaturesToReduce.shape[1]):
			curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
			curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
			Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1)
			Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2)
			mtFeaturesToReduce2[0: mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i]
			mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]: mtFeaturesToReduce.shape[0] + len(classNames1), i] = P1 + 0.0001
			mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] + len(classNames1):, i] = P2 + 0.0001

		mtFeaturesToReduce = mtFeaturesToReduce2
		mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
		mtFeaturesToReduce, MEAN, STD = aT.normalizeFeatures([mtFeaturesToReduce.T])
		mtFeaturesToReduce = mtFeaturesToReduce[0].T
	
		Labels = numpy.zeros((mtFeaturesToReduce.shape[1], ))
		LDAstep = 1.0
		LDAstepRatio = LDAstep / stWin

		for i in range(Labels.shape[0]): Labels[i] = int(i / LDAstepRatio)  # one label per LDAstep seconds
		clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(n_components = LDAdim)
		clf.fit(mtFeaturesToReduce.T, Labels)

		MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

	clsAll, silAll, centersAll = list(), list(), list()

	for iSpeakers in sRange:
		k_means = sklearn.cluster.KMeans(n_clusters = iSpeakers)
		k_means.fit(MidTermFeaturesNorm.T)
		cls = k_means.labels_
		means = k_means.cluster_centers_

		clsAll.append(cls)
		centersAll.append(means)
		silA, silB = list(), list()
		for c in range(iSpeakers):
			clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float(len(cls))
			if clusterPerCent < 0.02:
				silA.append(0.0)
				silB.append(0.0)
			else:
				MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]
				Yt = distance.pdist(MidTermFeaturesNormTemp.T)
				silA.append(numpy.mean(Yt) * clusterPerCent)
				silBs = list()
				for c2 in range(iSpeakers):
					if c2 != c:
						clusterPerCent2 = numpy.nonzero(cls == c2)[0].shape[0] / float(len(cls))
						MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:, cls == c2]
						Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T)
						silBs.append(numpy.mean(Yt) * (clusterPerCent+clusterPerCent2) / 2.0)
				silBs = numpy.array(silBs)
				silB.append(min(silBs))
		silA, silB, sil = numpy.array(silA), numpy.array(silB), list()
		for c in range(iSpeakers): sil.append((silB[c] - silA[c]) / (max(silB[c],  silA[c]) + 0.00001))
		silAll.append(numpy.mean(sil))

	imax = numpy.argmax(silAll)
	nSpeakersFinal = sRange[imax]

	cls = numpy.zeros((numOfWindows, ))
	for i in range(numOfWindows):
		j = numpy.argmin(numpy.abs(i - iNonOutLiers))
		cls[i] = clsAll[imax][j]

	startprob, transmat, means, cov = trainHMM(MidTermFeaturesNormOr, cls)
	hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], 'diag')
	hmm.startprob_ = startprob
	hmm.transmat_ = transmat
	hmm.means_ = means
	hmm.covars_ = cov
	cls = hmm.predict(MidTermFeaturesNormOr.T)
	cls = scipy.signal.medfilt(cls, 13)
	cls = scipy.signal.medfilt(cls, 11)

	sil = silAll[imax]
	classNames = ['SPEAKER{0:d}'.format(c) for c in range(nSpeakersFinal)]

	return cls, classNames, duration, mtStep, silAll
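
The cls array returned above holds one speaker label per mid-term window, spaced mtStep seconds apart. The sketch below (an illustrative assumption about downstream use, not part of the example) merges consecutive identical labels into (start, end, speaker) segments:

# Hypothetical post-processing for the values returned by the
# speakerDiarization() above: collapse per-window labels into segments.
def flagsToSegments(cls, classNames, mtStep):
	segments = []
	segStart = 0
	for i in range(1, len(cls)):
		if cls[i] != cls[i - 1]:
			# close the running segment at the boundary window
			segments.append((segStart * mtStep, i * mtStep, classNames[int(cls[i - 1])]))
			segStart = i
	segments.append((segStart * mtStep, len(cls) * mtStep, classNames[int(cls[-1])]))
	return segments

cls, classNames, duration, mtStep, silAll = speakerDiarization('interview.wav')  # placeholder path
for start, end, speaker in flagsToSegments(cls, classNames, mtStep):
	print('{0:.1f}s - {1:.1f}s: {2}'.format(start, end, speaker))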