def featuresCallback(feat_msg):
    """
    Classify one long-term feature message and publish the decision.

    The long-term mean and deviation statistics of ``feat_msg`` are
    concatenated into a single feature vector, normalized with the stored
    training statistics and classified. A non-silence decision is replaced
    by "silence" when the first feature value is below 90% of the running
    average kept in the global ``energies`` list.
    NOTE(review): the first feature is treated as an energy measure here;
    confirm against the feature extractor.

    :param feat_msg: message exposing ``ltWin1mean`` and ``ltWin1deviation``
    """
    global classifierInfo, energies, classification_publisher

    # merge long term mean and std feature statistics (from the respective topic)
    raw_fv = list(feat_msg.ltWin1mean + feat_msg.ltWin1deviation)

    # feature normalization (the un-normalized vector is kept for the energy test)
    norm_fv = (raw_fv - classifierInfo["MEAN"]) / classifierInfo["STD"]

    # classification
    [Result, P] = audioTrainTest.classifierWrapper(classifierInfo["Classifier"], "svm", norm_fv)
    classResult = list(classifierInfo["classNames"])[int(Result)]

    energy = raw_fv[0]
    # the tiny constant avoids a division by zero while the history is empty
    EnergyThreshold = 0.90 * sum(energies) / float(len(energies) + 0.00000001)
    if classResult == "silence":
        energies.append(energy)
    elif energy < EnergyThreshold:
        # too quiet to be trusted: override the classifier
        classResult = "silence"
        energies.append(energy)
    else:
        # non-silent window: re-append the current average to the history
        energies.append(sum(energies) / (float(len(energies)) + 0.0001))

    class_pub = classificationResult()
    class_pub.header.stamp = rospy.Time.now()
    class_pub.class_result.data = str(classResult)
    class_pub.probability.data = float(P[int(Result)])
    classification_publisher.publish(class_pub)

    # Debug output: NaN indicator of the raw feature mean, then the class.
    # Written as a single-argument print call so the line produces the same
    # text under Python 2 and Python 3.
    nan_positions = numpy.nonzero(numpy.isnan(numpy.array(raw_fv).mean(axis=0)))
    print("{0} {1}".format(nan_positions, classResult))
def callback(data):
    """
    Classify the first box of a message and publish a periodic majority vote.

    A 13-value feature vector is built from ``data.boxes[0].pos``, normalized
    with the global ``Mean``/``Std`` and classified. Each decision increments
    the matching counter in the global ``dicti``; once ``counter`` reaches
    ``fps`` the class with the most votes is published and all counters are
    reset.

    :param data: message with a ``boxes`` sequence whose items expose ``pos``
    """
    global SVM, Mean, Std, ClassNames, counter, dicti, fps, publisher

    # A message without boxes has nothing to classify; indexing boxes[0]
    # would raise IndexError, so such messages are ignored.
    if len(data.boxes) == 0:
        return

    # Classify
    pos = data.boxes[0].pos
    fv = numpy.array([
        pos.ratio,
        pos.ratio_diff,
        pos.distance,
        pos.distance_diff,
        pos.x_diff,
        pos.x_delta,
        pos.y_diff,
        pos.y_delta,
        pos.y_norm,
        pos.y_norm_diff,
        pos.depth_std,
        pos.z_diff,
        pos.z_diff_norm,
    ])
    curFV = (fv - Mean) / Std
    [Result, P] = audioTrainTest.classifierWrapper(SVM, "gradientboosting", curFV)
    dicti[ClassNames[int(Result)]] += 1
    counter += 1

    if counter == fps:
        # items() works on both Python 2 and Python 3 (iteritems() does not)
        m = max(dicti.items(), key=operator.itemgetter(1))[0]
        publisher.publish(m)
        counter = 0
        for key in dicti:
            dicti[key] = 0
def classifierWrapperHead(classifier, classifier_type, test_sample):
    '''
    Classify a single sample, handling logistic regression locally.

    :param classifier: trained model; for "logisticregression" it must
                       provide predict() and predict_proba()
    :param classifier_type: name of the classifier type
    :param test_sample: feature vector supporting reshape(1, -1)
    :return: [R, P] - winner class and class probabilities; every other
             classifier type is delegated to classifierWrapper
    '''
    if classifier_type != "logisticregression":
        return classifierWrapper(classifier, classifier_type, test_sample)

    # the model expects a 2-D input holding a single row
    single_row = test_sample.reshape(1, -1)
    winner = classifier.predict(single_row)[0]
    probabilities = classifier.predict_proba(single_row)[0]
    return [winner, probabilities]
def chunkIterator():
    """
    Lazily yield one class name per feature row of the enclosing ``wav``.

    Relies on ``self`` and ``wav`` from the enclosing scope. Rows for which
    the classifier returns -1 are reported as "UNKNOWN".
    """
    for chunk in self._feature_iterator(wav):
        # normalization
        normalized = (chunk - self.MEAN) / self.STD
        for row in normalized:
            # classification
            winner, _ = aT.classifierWrapper(self.Classifier, "svm", row)
            yield "UNKNOWN" if winner == -1 else self.classNames[int(winner)]
def classify_frames(self, frames):
    """
    Return the predicted class name of every feature row built from ``frames``.

    The rows produced by ``self._frames_featurizer`` are normalized with
    ``self.MEAN``/``self.STD`` and classified one by one; a classifier
    result of -1 is reported as "UNKNOWN".
    """
    # normalization
    normalized = (self._frames_featurizer(frames) - self.MEAN) / self.STD

    labels = []
    for row in normalized:
        # classification
        winner, _ = aT.classifierWrapper(self.Classifier, "svm", row)
        labels.append("UNKNOWN" if winner == -1 else self.classNames[int(winner)])
    return labels
def classifySingleFile(fileName, clf_name, model, MEAN, STD, classNames, filter):
    '''
    Classify a csv using mid-term windows as samples

    Every non-blank row of the file is parsed as "time,emg,label".

    :param fileName: file to classify
    :param clf_name: classifier name, e.g. 'svm'
    :param model: trained model
    :param MEAN: mean of training data
    :param STD: std of training data
    :param classNames: list of unique class-names
    :param filter: when true, apply a 13-sample median filter to the predictions
    :return: Confusion matrix of classified file - mid term windows are the
             samples (all zeros when the file holds no data rows)
    '''
    CM_file = numpy.zeros((len(classNames), len(classNames)))

    # read the data; each row is split once and blank lines are skipped so a
    # trailing newline does not crash the float() conversion
    timestamps = []
    emg_raw = []
    gt_labels = []
    with open(fileName, 'r') as f:
        for row in f:
            if not row.strip():
                continue
            fields = row.split(',')
            timestamps.append(float(fields[0]))
            emg_raw.append(float(fields[1]))
            gt_labels.append(int(fields[2]))
    if not timestamps:
        return CM_file

    # extract the features and mid-term labels
    fVs, labels = featureExtraction(emg_raw, timestamps, gt_labels, 2, 1, 0.25, 0.25)
    MEAN = numpy.array(MEAN)
    STD = numpy.array(STD)

    # classify mid-term windows extracted from test-file (one per column)
    predictions = []
    for i in range(fVs.shape[1]):
        fV = (fVs[:, i] - MEAN) / STD
        [Result, P] = aT.classifierWrapper(model, clf_name, fV)
        predictions.append(Result)

    # perform median filtering
    if filter:
        predictions = medfilt(predictions, 13)

    # compute confusion matrix (rows: ground truth, columns: prediction)
    for idx, p in enumerate(predictions):
        CM_file[int(labels[idx]), int(p)] += 1

    # single-argument print calls give the same output on Python 2 and 3
    print('Classification Results for file: {0}'.format(fileName))
    print(CM_file)
    print('')
    return CM_file
def featuresCallback(feat_msg):
    """
    Classify one long-term feature message, publish and print the decision.

    The long-term mean and deviation statistics of ``feat_msg`` are
    concatenated into a single feature vector, normalized with the stored
    training statistics and classified. A non-silence decision is replaced
    by "silence" when the first feature value is below 90% of the running
    average kept in the global ``energies`` list.
    NOTE(review): the first feature is treated as an energy measure here;
    confirm against the feature extractor.

    :param feat_msg: message exposing ``ltWin1mean`` and ``ltWin1deviation``
    """
    global classifierInfo, energies, classification_publisher

    # merge long term mean and std feature statistics (from the respective topic)
    raw_fv = list(feat_msg.ltWin1mean + feat_msg.ltWin1deviation)

    # feature normalization (the un-normalized vector is kept for the energy test)
    norm_fv = (raw_fv - classifierInfo["MEAN"]) / classifierInfo["STD"]

    # classification
    [Result, P] = audioTrainTest.classifierWrapper(classifierInfo["Classifier"], "svm", norm_fv)
    classResult = list(classifierInfo["classNames"])[int(Result)]

    energy = raw_fv[0]
    # the tiny constant avoids a division by zero while the history is empty
    EnergyThreshold = 0.90 * sum(energies) / float(len(energies) + 0.00000001)
    if classResult == "silence":
        energies.append(energy)
    elif energy < EnergyThreshold:
        # too quiet to be trusted: override the classifier
        classResult = "silence"
        energies.append(energy)
    else:
        # non-silent window: re-append the current average to the history
        energies.append(sum(energies) / (float(len(energies)) + 0.0001))

    class_pub = classificationResult()
    class_pub.header.stamp = rospy.Time.now()
    class_pub.class_result.data = str(classResult)
    class_pub.probability.data = float(P[int(Result)])
    classification_publisher.publish(class_pub)

    # print call form produces the same text under Python 2 and Python 3
    print("class: {0:s}\t{1:.3f}".format(classResult, class_pub.probability.data))
#extract features MidTermFeatures = aF.mtFeatureExtraction(array, Fs, mtWin * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stStep)) MidTermFeatures = MidTermFeatures[0] #classify chunks to speech/music flags = [] Ps = [] flagsInd = [] for i in range( MidTermFeatures[0].shape[0] ): # for each feature vector (i.e. for each fix-sized segment): curFV = (MidTermFeatures[:, i] - MEAN) / STD # normalize current feature vector [Result, P] = aT.classifierWrapper(Classifier, modelType, curFV) # classify vector flagsInd.append(Result) flags.append(classNames[int(Result)]) # update class label matrix Ps.append(numpy.max(P)) # update probability matrix # 1-window smoothing #for i in range(1, len(flagsInd) - 1): # if flagsInd[i - 1] == flagsInd[i + 1]: # flagsInd[i] = flagsInd[i + 1] #(segs, classes) = flags2segs(flags, mtStep) # convert fix-sized flags to segments and classes #segs[-1] = len(data) / float(Fs) flagsInd = numpy.array(flagsInd) #check what the majority is if (len(set(flagsInd)) == 1 and flagsInd[0] == 1): print("music")
os.path.join(os.path.dirname(os.path.realpath(__file__)), "data/models", "knn_speaker_male_female")) [mt_feats, st_feats, _] = aF.mid_feature_extraction(x, fs, mt_size * fs, mt_step * fs, round(fs * st_win), round(fs * st_win * 0.5)) MidTermFeatures2 = np.zeros( (mt_feats.shape[0] + len(classNames1) + len(classNames2), mt_feats.shape[1])) for i in range(mt_feats.shape[1]): cur_f1 = (mt_feats[:, i] - MEAN1) / STD1 cur_f2 = (mt_feats[:, i] - MEAN2) / STD2 [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1) [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2) MidTermFeatures2[0:mt_feats.shape[0], i] = mt_feats[:, i] MidTermFeatures2[mt_feats.shape[0]:mt_feats.shape[0] + len(classNames1), i] = P1 + 0.0001 MidTermFeatures2[mt_feats.shape[0] + len(classNames1)::, i] = P2 + 0.0001 mt_feats = MidTermFeatures2 # TODO iFeaturesSelect = [ 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53 ] mt_feats = mt_feats[iFeaturesSelect, :] (mt_feats_norm, MEAN, STD) = aT.normalizeFeatures([mt_feats.T])
def main(rootName, modelType, classifierParam, signal_type):
    """
    Run a 10-fold fail/success evaluation and print a summary report.

    For every fold directory ``<rootName>/fold_<i>`` (i = 0..9) features are
    extracted from ``train/{fail,success}`` and ``test/{fail,success}``, the
    classifier parameter is selected with aT.evaluateClassifier on the
    training data, a model of type ``modelType`` is trained on the normalized
    training features and every test sample is counted in a 2x2 confusion
    matrix.

    :param rootName: directory that contains the fold_0 ... fold_9 folders
    :param modelType: "svm", "svm_rbf", "randomforest", "gradientboosting"
                      or "extratrees"
    :param classifierParam: candidate classifier parameter (converted to int
                            for the non-SVM model types)
    :param signal_type: forwarded unchanged to dirFeatureExtraction
    :return: (CMall, Acc, Recall, Precision, F1) computed on the confusion
             matrix summed over all folds
    :raises ValueError: if modelType is not one of the supported types
    """
    # Name of the aT training routine for every supported model type.
    # Validating up front avoids running a whole feature-extraction pass
    # before failing on an unbound classifier.
    trainers = {
        "svm": "trainSVM",
        "svm_rbf": "trainSVM_RBF",
        "randomforest": "trainRandomForest",
        "gradientboosting": "trainGradientBoosting",
        "extratrees": "trainExtraTrees",
    }
    if modelType not in trainers:
        raise ValueError("unsupported modelType: {0}".format(modelType))

    CMall = numpy.zeros((2, 2))
    if modelType != "svm" and modelType != "svm_rbf":
        C = [int(classifierParam)]
    else:
        C = [(classifierParam)]

    F1s = []
    Accs = []
    for ifold in range(0, 10):  # for each fold
        # get fold path name
        dirName = rootName + os.sep + "fold_{0:d}".format(ifold)

        # TRAINING data feature extraction
        classNamesTrain, featuresTrain = dirFeatureExtraction([
            os.path.join(dirName, "train", "fail"),
            os.path.join(dirName, "train", "success")
        ], signal_type)
        # internal cross-validation (for param selection)
        bestParam = aT.evaluateClassifier(
            featuresTrain, classNamesTrain, 2, modelType, C, 0, 0.90)
        # TESTING data feature extraction
        classNamesTest, featuresTest = dirFeatureExtraction([
            os.path.join(dirName, "test", "fail"),
            os.path.join(dirName, "test", "success")
        ], signal_type)

        # training features NORMALIZATION
        [featuresTrainNew, MEAN, STD] = aT.normalizeFeatures(featuresTrain)

        # classifier training
        Classifier = getattr(aT, trainers[modelType])(featuresTrainNew, bestParam)

        # evaluation on testing data
        CM = numpy.zeros((2, 2))
        for iC, f in enumerate(featuresTest):  # for each class
            trueClass = classNamesTest[iC]  # get groundtruth class
            for i in range(f.shape[0]):  # for each testing sample (feature vector)
                curF = (f[i, :] - MEAN) / STD  # normalize test feature vector
                # classify and get winner class
                winnerClass = classNamesTrain[int(
                    aT.classifierWrapper(Classifier, modelType, curF)[0])]
                # update confusion matrix
                CM[classNamesTrain.index(trueClass)][classNamesTrain.index(winnerClass)] += 1

        CMall += CM  # update overall confusion matrix
        # get recall, precision and F1 (per class)
        Recall, Precision, F1 = computePreRec(CM, classNamesTrain)
        Acc = numpy.diagonal(CM).sum() / CM.sum()  # get overall accuracy
        F1s.append(numpy.mean(F1))  # append average F1
        Accs.append(Acc)  # append classification accuracy

    # The report uses single-argument print calls so it is emitted
    # identically by Python 2 and Python 3.
    print("")
    print("FINAL RESULTS")
    print("")
    print("----------------------------------")
    print("fold\tacc\tf1")
    print("----------------------------------")
    for i in range(len(F1s)):
        print("{0:d}\t{1:.1f}\t{2:.1f}".format(i, 100 * Accs[i], 100 * F1s[i]))

    Acc = numpy.diagonal(CMall).sum() / CMall.sum()
    Recall, Precision, F1 = computePreRec(CMall, classNamesTrain)
    print("----------------------------------")
    print("{0:s}\t{1:.1f}\t{2:.1f}".format("Avg", 100 * numpy.mean(Accs),
                                           100 * numpy.mean(F1s)))
    print("{0:s}\t{1:.1f}\t{2:.1f}".format("Av CM", 100 * Acc,
                                           100 * numpy.mean(F1)))
    print("----------------------------------")
    print("")
    print("Overal Confusion matrix:")
    aT.printConfusionMatrix(CMall, classNamesTrain)
    print("")
    print("FAIL Recall = {0:.1f}".format(
        100 * Recall[classNamesTrain.index("fail")]))
    print("FAIL Precision = {0:.1f}".format(
        100 * Precision[classNamesTrain.index("fail")]))
    print("SUCCESS Recall = {0:.1f}".format(
        100 * Recall[classNamesTrain.index("success")]))
    print("SUCCESS Precision = {0:.1f}".format(
        100 * Precision[classNamesTrain.index("success")]))

    return CMall, Acc, Recall, Precision, F1
def speakerDiarization(fileName, numOfSpeakers, mtSize=2.0, mtStep=0.2, stWin=0.05, LDAdim=0, PLOT=False):
    '''
    Cluster the mid-term windows of a recording into speakers.

    ARGUMENTS:
        - fileName:        the name of the WAV file to be analyzed
        - numOfSpeakers    the number of speakers (clusters) in the recording (<=0 for unknown)
        - mtSize (opt)     mid-term window size
        - mtStep (opt)     mid-term window step
        - stWin  (opt)     short-term window size
        - LDAdim (opt)     LDA dimension (0 for no LDA)
        - PLOT   (opt)     False for not plotting the results, True for plotting
    RETURNS:
        - cls:             cluster label of every mid-term window, after HMM
                           smoothing and two median filters

    If a "<fileName with .wav replaced by .segments>" file exists, cluster and
    speaker purity are computed against it and printed.
    '''
    [Fs, x] = pyAudioAnalysis.audioBasicIO.readAudioFile(fileName)
    x = pyAudioAnalysis.audioBasicIO.stereo2mono(x)
    Duration = len(x) / Fs

    [Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.loadKNNModel("pyAudioAnalysis/data/knnSpeakerAll")
    [Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.loadKNNModel("pyAudioAnalysis/data/knnSpeakerFemaleMale")

    def _appendKnnPosteriors(features):
        # Stack the class posteriors of both pre-trained KNN models under
        # every column (window) of the given feature matrix.
        nFeats = features.shape[0]
        nCls1 = len(classNames1)
        extended = numpy.zeros((nFeats + nCls1 + len(classNames2), features.shape[1]))
        for i in range(features.shape[1]):
            curF1 = (features[:, i] - MEAN1) / STD1
            curF2 = (features[:, i] - MEAN2) / STD2
            [_, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
            [_, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
            extended[0:nFeats, i] = features[:, i]
            extended[nFeats:nFeats + nCls1, i] = P1 + 0.0001
            extended[nFeats + nCls1::, i] = P2 + 0.0001
        return extended

    [MidTermFeatures, ShortTermFeatures] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin), round(Fs*stWin * 0.5))
    MidTermFeatures = _appendKnnPosteriors(MidTermFeatures)

    # SELECT FEATURES (SET 1A):
    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                       41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]
    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]

    (MidTermFeaturesNorm, MEAN, STD) = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]

    # remove outliers: keep windows whose summed distance to all other
    # windows is below 1.2 times the average summed distance
    DistancesAll = numpy.sum(distance.squareform(distance.pdist(MidTermFeaturesNorm.T)), axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]

    # the full (outliers included) matrix is needed later by the HMM step
    MidTermFeaturesNormOr = MidTermFeaturesNorm
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

    # LDA dimensionality reduction:
    if LDAdim > 0:
        # extract mid-term features with minimum step:
        mtWinRatio = int(round(mtSize / stWin))
        mtStepRatio = int(round(stWin / stWin))
        mtFeaturesToReduce = []
        numOfFeatures = len(ShortTermFeatures)
        numOfStatistics = 2
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append([])

        for i in range(numOfFeatures):  # for each of the short-term features:
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mtWinRatio
                if N2 > N:
                    N2 = N
                curStFeatures = ShortTermFeatures[i][N1:N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i+numOfFeatures].append(numpy.std(curStFeatures))
                curPos += mtStepRatio
        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
        mtFeaturesToReduce = _appendKnnPosteriors(mtFeaturesToReduce)
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
        (mtFeaturesToReduce, MEAN, STD) = aT.normalizeFeatures([mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T

        # windows are grouped into pseudo-classes by their position in time
        Labels = numpy.zeros((mtFeaturesToReduce.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin
        for i in range(Labels.shape[0]):
            Labels[i] = int(i*stWin/LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels)
        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

    if numOfSpeakers <= 0:
        sRange = range(2, 10)
    else:
        sRange = [numOfSpeakers]

    clsAll = []
    silAll = []
    for iSpeakers in sRange:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(MidTermFeaturesNorm.T)
        cls = k_means.labels_
        clsAll.append(cls)
        silA = []
        silB = []
        for c in range(iSpeakers):  # for each speaker (i.e. for each extracted cluster)
            clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float(len(cls))
            if clusterPerCent < 0.020:
                silA.append(0.0)
                silB.append(0.0)
            else:
                # get subset of feature vectors
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]
                # compute average distance between samples that belong to the cluster (a values)
                Yt = distance.pdist(MidTermFeaturesNormTemp.T)
                silA.append(numpy.mean(Yt)*clusterPerCent)
                silBs = []
                for c2 in range(iSpeakers):  # compute distances from samples of other clusters
                    if c2 != c:
                        clusterPerCent2 = numpy.nonzero(cls == c2)[0].shape[0] / float(len(cls))
                        MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:, cls == c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T)
                        silBs.append(numpy.mean(Yt)*(clusterPerCent+clusterPerCent2)/2.0)
                # ... and keep the minimum value (i.e. the distance from the
                # "nearest" cluster). With a single requested speaker there
                # is no other cluster; 0.0 is used instead of calling min()
                # on an empty sequence, which would raise ValueError.
                silB.append(min(silBs) if len(silBs) > 0 else 0.0)
        silA = numpy.array(silA)
        silB = numpy.array(silB)
        sil = []
        for c in range(iSpeakers):  # for each cluster (speaker) compute silhouette
            sil.append((silB[c] - silA[c]) / (max(silB[c], silA[c])+0.00001))
        silAll.append(numpy.mean(sil))  # keep the AVERAGE SILHOUETTE

    imax = numpy.argmax(silAll)  # position of the maximum silhouette value
    nSpeakersFinal = sRange[imax]  # optimal number of clusters

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows: this is achieved by
    # giving them the value of their nearest non-outlier window)
    cls = numpy.zeros((numOfWindows,))
    for i in range(numOfWindows):
        j = numpy.argmin(numpy.abs(i-iNonOutLiers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    startprob, transmat, means, cov = trainHMM_computeStatistics(MidTermFeaturesNormOr, cls)
    hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], "diag")
    hmm.startprob_ = startprob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov
    cls = hmm.predict(MidTermFeaturesNormOr.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    classNames = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]

    # load ground-truth if available
    gtFile = fileName.replace('.wav', '.segments')  # annotation file next to the WAV
    hasGT = os.path.isfile(gtFile)
    if hasGT:
        [segStart, segEnd, segLabels] = readSegmentGT(gtFile)  # read GT data
        flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels, mtStep)  # convert to flags

    if PLOT:
        fig = plt.figure()
        if numOfSpeakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(numpy.array(range(len(classNames))))
        ax1.axis((0, Duration, -1, len(classNames)))
        ax1.set_yticklabels(classNames)
        ax1.plot(numpy.array(range(len(cls)))*mtStep+mtStep/2.0, cls)

    if hasGT:
        if PLOT:
            ax1.plot(numpy.array(range(len(flagsGT)))*mtStep+mtStep/2.0, flagsGT, 'r')
        purityClusterMean, puritySpeakerMean = evaluateSpeakerDiarization(cls, flagsGT)
        # print call form gives the same output on Python 2 and Python 3
        print("{0:.1f}\t{1:.1f}".format(100*purityClusterMean, 100*puritySpeakerMean))
        if PLOT:
            plt.title("Cluster purity: {0:.1f}% - Speaker purity: {1:.1f}%".format(100*purityClusterMean, 100*puritySpeakerMean))

    if PLOT:
        plt.xlabel("time (seconds)")
        if numOfSpeakers <= 0:
            plt.subplot(212)
            plt.plot(sRange, silAll)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's sillouette")
        plt.show()

    return cls
def speakerDiarization(fileName, sRange=tuple(range(2, 10)), mtSize=2.0, mtStep=0.2, stWin=0.05, LDAdim=35,
                       modelDir='/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data'):
    '''
    Cluster the mid-term windows of a recording into speakers.

    ARGUMENTS:
        - fileName:        the name of the WAV file to be analyzed
        - sRange (opt)     candidate numbers of speakers (clusters); the one
                           with the highest average silhouette is selected
        - mtSize (opt)     mid-term window size
        - mtStep (opt)     mid-term window step
        - stWin  (opt)     short-term window size
        - LDAdim (opt)     LDA dimension (0 for no LDA)
        - modelDir (opt)   directory holding the "knnSpeakerAll" and
                           "knnSpeakerFemaleMale" models
    RETURNS:
        - cls:             cluster label of every mid-term window, after HMM
                           smoothing and two median filters
        - classNames:      'SPEAKER<i>' name of every selected cluster
        - duration:        len(x) / Fs of the loaded signal
        - mtStep:          the mid-term step that was used
        - silAll:          average silhouette of every candidate in sRange
    '''
    Fs, x = audioBasicIO.readAudioFile(fileName)
    x = audioBasicIO.stereo2mono(x)
    duration = len(x) / Fs

    Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1 = aT.loadKNNModel(os.path.join(modelDir, 'knnSpeakerAll'))
    Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2 = aT.loadKNNModel(os.path.join(modelDir, 'knnSpeakerFemaleMale'))

    def _appendKnnPosteriors(features):
        # Stack the class posteriors of both pre-trained KNN models under
        # every column (window) of the given feature matrix.
        nFeats = features.shape[0]
        nCls1 = len(classNames1)
        extended = numpy.zeros((nFeats + nCls1 + len(classNames2), features.shape[1]))
        for i in range(features.shape[1]):
            curF1 = (features[:, i] - MEAN1) / STD1
            curF2 = (features[:, i] - MEAN2) / STD2
            _, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1)
            _, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2)
            extended[0: nFeats, i] = features[:, i]
            extended[nFeats: nFeats + nCls1, i] = P1 + 0.0001
            extended[nFeats + nCls1:, i] = P2 + 0.0001
        return extended

    MidTermFeatures, ShortTermFeatures = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stWin * 0.5))
    MidTermFeatures = _appendKnnPosteriors(MidTermFeatures)

    # list(...) keeps the concatenation valid on Python 3, where range()
    # objects cannot be added together
    iFeaturesSelect = list(range(8, 21)) + list(range(41, 54))
    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]
    MidTermFeaturesNorm, MEAN, STD = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]

    # remove outliers: keep windows whose summed distance to all other
    # windows is below 1.2 times the average summed distance
    DistancesAll = numpy.sum(distance.squareform(distance.pdist(MidTermFeaturesNorm.T)), axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]

    # the full (outliers included) matrix is needed later by the HMM step
    MidTermFeaturesNormOr = MidTermFeaturesNorm
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

    # LDA dimensionality reduction
    if LDAdim > 0:
        mtWinRatio = int(round(mtSize / stWin))
        mtStepRatio = int(round(stWin / stWin))
        numOfFeatures = len(ShortTermFeatures)
        numOfStatistics = 2
        mtFeaturesToReduce = list()
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append(list())
        for i in range(numOfFeatures):
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos < N):
                N1, N2 = curPos, curPos + mtWinRatio
                if N2 > N:
                    N2 = N
                curStFeatures = ShortTermFeatures[i][N1: N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i + numOfFeatures].append(numpy.std(curStFeatures))
                curPos += mtStepRatio
        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
        mtFeaturesToReduce = _appendKnnPosteriors(mtFeaturesToReduce)
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
        mtFeaturesToReduce, MEAN, STD = aT.normalizeFeatures([mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T

        # windows are grouped into pseudo-classes by their position in time
        Labels = numpy.zeros((mtFeaturesToReduce.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * stWin / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels)
        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

    clsAll, silAll = list(), list()
    for iSpeakers in sRange:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(MidTermFeaturesNorm.T)
        cls = k_means.labels_
        clsAll.append(cls)
        silA, silB = list(), list()
        for c in range(iSpeakers):
            clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float(len(cls))
            if clusterPerCent < 0.02:
                silA.append(0.0)
                silB.append(0.0)
            else:
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]
                # average distance between samples of the same cluster
                Yt = distance.pdist(MidTermFeaturesNormTemp.T)
                silA.append(numpy.mean(Yt) * clusterPerCent)
                silBs = list()
                for c2 in range(iSpeakers):
                    if c2 != c:
                        clusterPerCent2 = numpy.nonzero(cls == c2)[0].shape[0] / float(len(cls))
                        MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:, cls == c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T)
                        silBs.append(numpy.mean(Yt) * (clusterPerCent + clusterPerCent2) / 2.0)
                # distance to the nearest other cluster; with a single
                # candidate cluster there is none, so 0.0 is used instead of
                # calling min() on an empty sequence (ValueError)
                silB.append(min(silBs) if len(silBs) > 0 else 0.0)
        silA, silB, sil = numpy.array(silA), numpy.array(silB), list()
        for c in range(iSpeakers):
            sil.append((silB[c] - silA[c]) / (max(silB[c], silA[c]) + 0.00001))
        silAll.append(numpy.mean(sil))

    imax = numpy.argmax(silAll)
    nSpeakersFinal = sRange[imax]

    # outlier windows receive the label of their nearest non-outlier window
    cls = numpy.zeros((numOfWindows, ))
    for i in range(numOfWindows):
        j = numpy.argmin(numpy.abs(i - iNonOutLiers))
        cls[i] = clsAll[imax][j]

    # HMM smoothing
    startprob, transmat, means, cov = trainHMM(MidTermFeaturesNormOr, cls)
    hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], 'diag')
    hmm.startprob_ = startprob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov
    cls = hmm.predict(MidTermFeaturesNormOr.T)

    # median filtering
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    classNames = ['SPEAKER{0:d}'.format(c) for c in range(nSpeakersFinal)]
    return cls, classNames, duration, mtStep, silAll
def mtFileClassification(input_file, model_name, model_type, plot_results=False, gt_file=""):
    '''
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used, i.e. a pre-trained classifier.
    ARGUMENTS:
        - input_file:        path of the input WAV file
        - model_name:        name of the classification model
        - model_type:        svm or knn depending on the classifier type
        - plot_results:      True if results are to be plotted using
                             matplotlib along with a set of statistics
        - gt_file:           optional path of a ground-truth segment file
    RETURNS:
        A 4-tuple (flags_ind, class_names, acc, cm):
        - flags_ind:         class index of every mid-term window (after
                             1-window smoothing)
        - class_names:       the ground-truth class names when acc >= 0,
                             otherwise the class names of the model
        - acc:               value returned by plotSegmentationResults
        - cm:                confusion matrix (numpy array) when gt_file
                             exists, otherwise an empty list
        (-1, -1, -1, -1) is returned when the model file is missing, the
        model contains beat features or the input file cannot be read.
    '''
    if not os.path.isfile(model_name):
        # the check is on the model file, so that is what the message names
        print("mtFileClassificationError: input model_name not found!")
        return (-1, -1, -1, -1)

    # Load classifier:
    if model_type == "knn":
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win, st_step,
         compute_beat] = aT.load_model_knn(model_name)
    else:
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win, st_step,
         compute_beat] = aT.load_model(model_name)

    if compute_beat:
        print("Model " + model_name + " contains long-term music features "
              "(beat etc) and cannot be used in "
              "segmentation")
        return (-1, -1, -1, -1)

    [fs, x] = audioBasicIO.readAudioFile(input_file)  # load input file
    if fs == -1:  # could not read file
        return (-1, -1, -1, -1)
    x = audioBasicIO.stereo2mono(x)  # convert stereo (if) to mono

    # mid-term feature extraction:
    [mt_feats, _, _] = aF.mtFeatureExtraction(x, fs, mt_win * fs,
                                              mt_step * fs,
                                              round(fs * st_win),
                                              round(fs * st_step))
    flags = []
    flags_ind = []
    # for each feature vector (i.e. for each fix-sized segment):
    for i in range(mt_feats.shape[1]):
        # normalize current feature vector
        cur_fv = (mt_feats[:, i] - MEAN) / STD
        # classify vector
        [res, P] = aT.classifierWrapper(classifier, model_type, cur_fv)
        flags_ind.append(res)
        flags.append(class_names[int(res)])  # update class label matrix
    flags_ind = numpy.array(flags_ind)

    # 1-window smoothing
    for i in range(1, len(flags_ind) - 1):
        if flags_ind[i - 1] == flags_ind[i + 1]:
            flags_ind[i] = flags_ind[i + 1]

    # convert fix-sized flags to segments and classes
    # NOTE(review): segs/classes are not used afterwards; the call is kept
    # because flags2segs is not visible from here.
    (segs, classes) = flags2segs(flags, mt_step)
    segs[-1] = len(x) / float(fs)

    # Load ground-truth:
    if os.path.isfile(gt_file):
        [seg_start_gt, seg_end_gt, seg_l_gt] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start_gt, seg_end_gt,
                                              seg_l_gt, mt_step)
        # "align" labels with GT: express every ground-truth flag as an index
        # into class_names, or -1 when the model does not know the class
        flags_ind_gt = []
        for fl in flags_gt:
            if class_names_gt[fl] in class_names:
                flags_ind_gt.append(class_names.index(class_names_gt[fl]))
            else:
                flags_ind_gt.append(-1)
        flags_ind_gt = numpy.array(flags_ind_gt)

        cm = numpy.zeros((len(class_names_gt), len(class_names_gt)))
        for i in range(min(flags_ind.shape[0], flags_ind_gt.shape[0])):
            if flags_ind_gt[i] < 0:
                # unknown ground-truth class: a -1 index would silently be
                # counted in the LAST row of the matrix
                continue
            cm[int(flags_ind_gt[i]), int(flags_ind[i])] += 1
    else:
        cm = []
        flags_ind_gt = numpy.array([])

    acc = plotSegmentationResults(flags_ind, flags_ind_gt, class_names,
                                  mt_step, not plot_results)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc))
        return (flags_ind, class_names_gt, acc, cm)
    else:
        return (flags_ind, class_names, acc, cm)
def mtFileClassification(inputFile, modelName, modelType, plotResults=False,
                         gtFile=""):
    '''
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used, i.e. a pre-trained
    classifier.

    ARGUMENTS:
        - inputFile:    path of the input WAV file
        - modelName:    path of the stored classification model
        - modelType:    one of 'svm', 'svm_rbf', 'knn', 'randomforest',
                        'gradientboosting', 'extratrees'
        - plotResults:  True if results are to be plotted using matplotlib
                        along with a set of statistics (forwarded, negated,
                        as the last argument of plotSegmentationResults)
        - gtFile:       optional path of a ground-truth segment file; when
                        the file exists a confusion matrix is also computed
    RETURNS:
        A 4-tuple (flagsInd, classNames, acc, CM):
        - flagsInd:     numpy array with one class index per mid-term
                        window, after 1-window smoothing
        - classNames:   the ground-truth class names when ground truth was
                        loaded and acc >= 0, otherwise the model class names
        - acc:          the value returned by plotSegmentationResults
        - CM:           confusion matrix (numpy array), or [] when no
                        ground-truth file is available
        (-1, -1, -1, -1) is returned when the model file does not exist, the
        model type is not supported, the model was trained with beat
        features, or the audio cannot be read.
    '''

    if not os.path.isfile(modelName):
        print("mtFileClassificationError: input modelName not found!")
        return (-1, -1, -1, -1)

    # Loader function (attribute of aT) for every supported model type.
    loaderNames = {
        'svm': 'loadSVModel',
        'svm_rbf': 'loadSVModel',
        'knn': 'loadKNNModel',
        'randomforest': 'loadRandomForestModel',
        'gradientboosting': 'loadGradientBoostingModel',
        'extratrees': 'loadExtraTreesModel',
    }
    loaderName = loaderNames.get(modelType)
    if loaderName is None:
        # Without this guard an unknown type left every model variable
        # unbound and the function crashed on the computeBEAT check below.
        print("mtFileClassificationError: unsupported modelType "
              "'{0}'!".format(modelType))
        return (-1, -1, -1, -1)

    # Load classifier:
    [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
     computeBEAT] = getattr(aT, loaderName)(modelName)

    if computeBEAT:
        print("Model " + modelName + " contains long-term music features "
              "(beat etc) and cannot be used in segmentation")
        return (-1, -1, -1, -1)

    [Fs, x] = audioBasicIO.readAudioFile(inputFile)  # load input file
    if Fs == -1:  # could not read file
        return (-1, -1, -1, -1)
    x = audioBasicIO.stereo2mono(x)  # convert stereo (if) to mono

    # mid-term feature extraction:
    [MidTermFeatures, _] = aF.mtFeatureExtraction(x, Fs, mtWin * Fs,
                                                  mtStep * Fs,
                                                  round(Fs * stWin),
                                                  round(Fs * stStep))
    flags = []
    flagsInd = []
    # for each feature vector (i.e. for each fix-sized segment):
    for i in range(MidTermFeatures.shape[1]):
        curFV = (MidTermFeatures[:, i] - MEAN) / STD  # normalize vector
        [Result, _] = aT.classifierWrapper(Classifier, modelType, curFV)
        flagsInd.append(Result)
        flags.append(classNames[int(Result)])  # class label per window
    flagsInd = numpy.array(flagsInd)

    # 1-window smoothing: an isolated label between two equal neighbours
    # is replaced by the neighbours' label
    for i in range(1, len(flagsInd) - 1):
        if flagsInd[i - 1] == flagsInd[i + 1]:
            flagsInd[i] = flagsInd[i + 1]

    # convert fix-sized flags to segments and classes
    # NOTE(review): segs/classes are computed from the unsmoothed labels and
    # are not part of the return value; kept so flags2segs is still invoked.
    (segs, classes) = flags2segs(flags, mtStep)
    segs[-1] = len(x) / float(Fs)

    # Load ground-truth:
    hasGT = os.path.isfile(gtFile)
    if hasGT:
        [segStartGT, segEndGT, segLabelsGT] = readSegmentGT(gtFile)
        flagsGT, classNamesGT = segs2flags(segStartGT, segEndGT,
                                           segLabelsGT, mtStep)
        # "align" labels with GT: map every GT label to its index in the
        # model's class list, -1 when the model does not know that class
        flagsIndGT = numpy.array([
            classNames.index(classNamesGT[fl])
            if classNamesGT[fl] in classNames else -1
            for fl in flagsGT
        ])
        CM = numpy.zeros((len(classNamesGT), len(classNamesGT)))
        for i in range(min(flagsInd.shape[0], flagsIndGT.shape[0])):
            if flagsIndGT[i] < 0:
                # a -1 row index would wrap around and be counted in the
                # last row of the matrix, so unmatched windows are skipped
                continue
            CM[int(flagsIndGT[i]), int(flagsInd[i])] += 1
    else:
        CM = []
        flagsIndGT = numpy.array([])

    acc = plotSegmentationResults(flagsInd, flagsIndGT, classNames, mtStep,
                                  not plotResults)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc))
        # fall back to the model class names if no ground truth was loaded
        return (flagsInd, classNamesGT if hasGT else classNames, acc, CM)
    return (flagsInd, classNames, acc, CM)
def speakerDiarization(fileName, sRange=range(2, 10), mtSize=2.0, mtStep=0.2,
                       stWin=0.05, LDAdim=35,
                       modelDir='/home/aaiijmrtt/Code/deepspeech/res/'
                                'pyAudioAnalysis/data'):
    '''
    Cluster the mid-term windows of a recording into speakers.

    ARGUMENTS:
        - fileName:  the name of the WAV file to be analyzed
        - sRange:    indexable sequence of candidate numbers of speakers;
                     the candidate with the highest average silhouette
                     value computed below is selected
        - mtSize:    mid-term window size (seconds)
        - mtStep:    mid-term window step (seconds)
        - stWin:     short-term window size (seconds)
        - LDAdim:    LDA dimension (0 for no LDA)
        - modelDir:  folder containing the 'knnSpeakerAll' and
                     'knnSpeakerFemaleMale' kNN models
                     (defaults to the previously hard-coded location)
    RETURNS:
        (cls, classNames, duration, mtStep, silAll)
        - cls:         speaker label per mid-term window (after HMM and
                       median-filter smoothing)
        - classNames:  'SPEAKER0', 'SPEAKER1', ... for the selected count
        - duration:    len(x) / Fs of the loaded signal
        - mtStep:      the mtStep argument, unchanged
        - silAll:      average silhouette value for each entry of sRange
    '''
    Fs, x = audioBasicIO.readAudioFile(fileName)
    x = audioBasicIO.stereo2mono(x)
    duration = len(x) / Fs

    (Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1,
     stStep1, computeBEAT1) = aT.loadKNNModel(
        os.path.join(modelDir, 'knnSpeakerAll'))
    (Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2,
     stStep2, computeBEAT2) = aT.loadKNNModel(
        os.path.join(modelDir, 'knnSpeakerFemaleMale'))

    def appendClassifierOutputs(feats):
        # Stack, under every feature column, the second value (P) returned
        # by aT.classifierWrapper for each of the two kNN models (+0.0001).
        nBase = feats.shape[0]
        nFirst = len(classNames1)
        augmented = numpy.zeros((nBase + nFirst + len(classNames2),
                                 feats.shape[1]))
        augmented[:nBase, :] = feats
        for col in range(feats.shape[1]):
            curF1 = (feats[:, col] - MEAN1) / STD1
            curF2 = (feats[:, col] - MEAN2) / STD2
            _, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1)
            _, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2)
            augmented[nBase:nBase + nFirst, col] = P1 + 0.0001
            augmented[nBase + nFirst:, col] = P2 + 0.0001
        return augmented

    MidTermFeatures, ShortTermFeatures = aF.mtFeatureExtraction(
        x, Fs, mtSize * Fs, mtStep * Fs,
        round(Fs * stWin), round(Fs * stWin * 0.5))
    MidTermFeatures = appendClassifierOutputs(MidTermFeatures)

    # list(...) keeps the concatenation valid on Python 3, where range
    # objects cannot be added together
    iFeaturesSelect = list(range(8, 21)) + list(range(41, 54))
    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]

    MidTermFeaturesNorm, MEAN, STD = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]

    # remove outliers: keep windows whose summed distance to all other
    # windows is below 1.2 times the mean summed distance
    DistancesAll = numpy.sum(
        distance.squareform(distance.pdist(MidTermFeaturesNorm.T)), axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]

    MidTermFeaturesNormOr = MidTermFeaturesNorm
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

    # LDA dimensionality reduction:
    if LDAdim > 0:
        # extract mid-term statistics (mean, std) with the minimum step
        mtWinRatio = int(round(mtSize / stWin))
        mtStepRatio = int(round(stWin / stWin))  # always 1
        numOfFeatures = len(ShortTermFeatures)
        numOfStatistics = 2
        mtFeaturesToReduce = [[] for _ in
                              range(numOfStatistics * numOfFeatures)]
        for i in range(numOfFeatures):
            N = len(ShortTermFeatures[i])
            curPos = 0
            while curPos < N:
                N2 = min(curPos + mtWinRatio, N)
                curStFeatures = ShortTermFeatures[i][curPos:N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i + numOfFeatures].append(
                    numpy.std(curStFeatures))
                curPos += mtStepRatio
        mtFeaturesToReduce = appendClassifierOutputs(
            numpy.array(mtFeaturesToReduce))
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
        mtFeaturesToReduce, MEAN, STD = aT.normalizeFeatures(
            [mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T

        # pseudo-labels used to fit the LDA projection
        Labels = numpy.zeros((mtFeaturesToReduce.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * stWin / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels)
        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

    clsAll, silAll = [], []
    for iSpeakers in sRange:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(MidTermFeaturesNorm.T)
        cls = k_means.labels_
        clsAll.append(cls)
        silA, silB = [], []
        for c in range(iSpeakers):
            # fraction of the windows assigned to cluster c
            clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float(
                len(cls))
            if clusterPerCent < 0.02:
                silA.append(0.0)
                silB.append(0.0)
                continue
            MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]
            # average distance between samples of the same cluster
            Yt = distance.pdist(MidTermFeaturesNormTemp.T)
            silA.append(numpy.mean(Yt) * clusterPerCent)
            silBs = []
            for c2 in range(iSpeakers):
                # average distance to the samples of every other cluster
                if c2 != c:
                    clusterPerCent2 = numpy.nonzero(
                        cls == c2)[0].shape[0] / float(len(cls))
                    MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:,
                                                                   cls == c2]
                    Yt = distance.cdist(MidTermFeaturesNormTemp.T,
                                        MidTermFeaturesNormTemp2.T)
                    silBs.append(numpy.mean(Yt) *
                                 (clusterPerCent + clusterPerCent2) / 2.0)
            # keep the distance from the "nearest" cluster
            silB.append(min(numpy.array(silBs)))
        silA, silB = numpy.array(silA), numpy.array(silB)
        sil = [(silB[c] - silA[c]) / (max(silB[c], silA[c]) + 0.00001)
               for c in range(iSpeakers)]
        silAll.append(numpy.mean(sil))

    imax = numpy.argmax(silAll)
    nSpeakersFinal = sRange[imax]

    # outlier windows receive the label of their nearest non-outlier window
    cls = numpy.zeros((numOfWindows, ))
    for i in range(numOfWindows):
        j = numpy.argmin(numpy.abs(i - iNonOutLiers))
        cls[i] = clsAll[imax][j]

    # post-processing 1: HMM smoothing
    startprob, transmat, means, cov = trainHMM(MidTermFeaturesNormOr, cls)
    hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], 'diag')
    hmm.startprob_ = startprob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov
    cls = hmm.predict(MidTermFeaturesNormOr.T)

    # post-processing 2: median filtering
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    classNames = ['SPEAKER{0:d}'.format(c) for c in range(nSpeakersFinal)]
    return cls, classNames, duration, mtStep, silAll
def fileGreenwaySpeakerDiarization(filename, output_folder,
                                   speech_key="52fe944f29784ae288482e5eb3092e2a",
                                   service_region="eastus2", n_speakers=2,
                                   mt_size=2.0, mt_step=0.2, st_win=0.05,
                                   lda_dim=35):
    """
    Diarize a WAV file, export one audio snippet per speaker turn and
    transcribe every snippet through the speechsdk recognizer.

    ARGUMENTS:
        - filename:        the name of the WAV file to be analyzed; the
                           file name must contain the marker "_min_"
                           (e.g. ..._min_3), the parts before/after it are
                           used to name the JSON output
        - output_folder:   prefix (folder location) prepended to the names
                           of the exported snippets and of the JSON file
        - speech_key:      subscription key passed to speechsdk.SpeechConfig
        - service_region:  region passed to speechsdk.SpeechConfig
        - n_speakers:      the number of speakers (clusters) in the
                           recording (<=0 for unknown)
        - mt_size (opt):   mid-term window size
        - mt_step (opt):   mid-term window step
        - st_win  (opt):   short-term window size
        - lda_dim (opt):   LDA dimension (0 for no LDA)
    RETURNS:
        (cls, output_json)
        - cls:          vector with the speaker id of every mid-term
                        window, in chronological order
        - output_json:  {<file name>: [segment, ...]} where every segment
                        is a dict with the keys dialogue_id, sequence_id,
                        speaker, start_time, end_time, text, snippet_path.
                        The same structure is written to
                        "<output_folder><name>_<time>.txt".
    RAISES:
        IndexError when the file name does not contain "_min_".
    """
    # SECURITY NOTE(review): a subscription key is hard-coded as the default
    # of speech_key. It is kept only to preserve the call interface; the key
    # should be rotated and supplied by the caller/configuration instead.
    filename_only = filename.split("/")[-1]
    name_parts = filename_only.split("_min_")
    nameoffile = name_parts[0]
    timeoffile = name_parts[1]

    [fs, x] = audioBasicIO.read_audio_file(filename)
    x = audioBasicIO.stereo_to_mono(x)

    models_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                              "pyAudioAnalysis/data/models")
    [classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1,
     stStep1, computeBEAT1] = aT.load_model_knn(
        os.path.join(models_dir, "knn_speaker_10"))
    [classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2,
     stStep2, computeBEAT2] = aT.load_model_knn(
        os.path.join(models_dir, "knn_speaker_male_female"))

    def append_classifier_outputs(feats):
        # Stack, under every feature column, the second value (P) returned
        # by aT.classifierWrapper for each of the two kNN models (+0.0001).
        n_base = feats.shape[0]
        n_first = len(classNames1)
        augmented = np.zeros((n_base + n_first + len(classNames2),
                              feats.shape[1]))
        augmented[:n_base, :] = feats
        for col in range(feats.shape[1]):
            cur_f1 = (feats[:, col] - MEAN1) / STD1
            cur_f2 = (feats[:, col] - MEAN2) / STD2
            [_, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
            [_, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
            augmented[n_base:n_base + n_first, col] = P1 + 0.0001
            augmented[n_base + n_first:, col] = P2 + 0.0001
        return augmented

    [mt_feats, st_feats, _] = aF.mid_feature_extraction(
        x, fs, mt_size * fs, mt_step * fs,
        round(fs * st_win), round(fs * st_win * 0.5))
    mt_feats = append_classifier_outputs(mt_feats)

    # TODO
    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                       41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]
    mt_feats = mt_feats[iFeaturesSelect, :]

    (mt_feats_norm, MEAN, STD) = aT.normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    n_wins = mt_feats.shape[1]

    # remove outliers: keep windows whose summed distance to all other
    # windows is below 1.2 times the mean summed distance
    dist_all = np.sum(distance.squareform(distance.pdist(mt_feats_norm.T)),
                      axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.2 * m_dist_all)[0]

    mt_feats_norm_or = mt_feats_norm
    mt_feats_norm = mt_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        # extract mid-term statistics (mean, std) with the minimum step
        mt_win_ratio = int(round(mt_size / st_win))
        mt_step_ratio = int(round(st_win / st_win))  # always 1
        num_of_features = len(st_feats)
        num_of_stats = 2
        mt_feats_to_red = [[] for _ in
                           range(num_of_stats * num_of_features)]
        # for each of the short-term features:
        for i in range(num_of_features):
            N = len(st_feats[i])
            cur_pos = 0
            while cur_pos < N:
                N2 = min(cur_pos + mt_win_ratio, N)
                cur_st_feats = st_feats[i][cur_pos:N2]
                mt_feats_to_red[i].append(np.mean(cur_st_feats))
                mt_feats_to_red[i + num_of_features].append(
                    np.std(cur_st_feats))
                cur_pos += mt_step_ratio
        mt_feats_to_red = append_classifier_outputs(
            np.array(mt_feats_to_red))
        mt_feats_to_red = mt_feats_to_red[iFeaturesSelect, :]
        (mt_feats_to_red, MEAN, STD) = aT.normalizeFeatures(
            [mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T

        # pseudo-labels used to fit the LDA projection
        Labels = np.zeros((mt_feats_to_red.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / st_win
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * st_win / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, Labels)
        mt_feats_norm = (clf.transform(mt_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]

    clsAll = []
    sil_all = []
    for iSpeakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(mt_feats_norm.T)
        cls = k_means.labels_
        clsAll.append(cls)
        sil_1 = []
        sil_2 = []
        for c in range(iSpeakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / \
                float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
                continue
            # get subset of feature vectors
            mt_feats_norm_temp = mt_feats_norm[:, cls == c]
            # compute average distance between samples
            # that belong to the cluster (a values)
            Yt = distance.pdist(mt_feats_norm_temp.T)
            sil_1.append(np.mean(Yt) * clust_per_cent)
            silBs = []
            for c2 in range(iSpeakers):
                # compute distances from samples of other clusters
                if c2 != c:
                    clust_per_cent_2 = np.nonzero(cls == c2)[0].shape[0] / \
                        float(len(cls))
                    mt_feats_norm_temp_2 = mt_feats_norm[:, cls == c2]
                    Yt = distance.cdist(mt_feats_norm_temp.T,
                                        mt_feats_norm_temp_2.T)
                    silBs.append(np.mean(Yt) *
                                 (clust_per_cent + clust_per_cent_2) / 2.0)
            # ... and keep the minimum value (i.e.
            # the distance from the "nearest" cluster)
            sil_2.append(min(np.array(silBs)))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        # for each cluster (speaker) compute silhouette
        sil = [(sil_2[c] - sil_1[c]) / (max(sil_2[c], sil_1[c]) + 0.00001)
               for c in range(iSpeakers)]
        # keep the AVERAGE SILLOUETTE
        sil_all.append(np.mean(sil))

    imax = np.argmax(sil_all)

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = np.zeros((n_wins,))
    for i in range(n_wins):
        j = np.argmin(np.abs(i - i_non_outliers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    start_prob, transmat, means, cov = \
        trainHMM_computeStatistics(mt_feats_norm_or, cls)
    hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
    hmm.startprob_ = start_prob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov
    cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    # Centre time of every mid-term window
    time_vec = np.arange(len(cls)) * mt_step + mt_step / 2.0

    # Start index of every speaker turn. Window 0 always opens a turn; the
    # previous np.roll comparison only reported it when the first and the
    # last label happened to differ, silently dropping the first turn.
    if len(cls) > 0:
        seg_starts = np.concatenate(
            ([0], np.flatnonzero(cls[1:] != cls[:-1]) + 1))
    else:
        seg_starts = np.array([], dtype=int)

    # Create list of dialogue segments
    output_list = []
    for ind, sc in enumerate(seg_starts):
        if ind + 1 < len(seg_starts):
            end_time = time_vec[seg_starts[ind + 1] - 1]
        else:
            end_time = time_vec[-1]
        output_list.append({
            'dialogue_id': str(datetime.now()).strip(),
            'sequence_id': str(ind),
            # .item()/float() yield plain Python scalars so that the
            # segment can always be serialized by json.dump
            'speaker': cls[sc].item(),
            'start_time': float(time_vec[sc]),
            'end_time': float(end_time),
            'text': "",
        })

    def snip_transcribe(output_list, filename, output_folder=output_folder,
                        speech_key=speech_key,
                        service_region=service_region):
        """Export one WAV snippet per segment and fill in its text."""
        speech_config = speechsdk.SpeechConfig(
            subscription=speech_key, region=service_region)
        # enable_dictation is a method: it has to be called to take effect
        speech_config.enable_dictation()

        # the source recording is the same for every segment: load it once
        full_audio = AudioSegment.from_wav(filename)

        for diag in output_list:
            t1 = diag['start_time']
            t2 = diag['end_time']
            chunk = full_audio[t1 * 1000:t2 * 1000]  # pydub works in ms
            filename_out = output_folder + \
                f"snippet_{diag['sequence_id']}.wav"
            chunk.export(filename_out, format="wav")
            diag['snippet_path'] = filename_out

            done = False

            def recognized_cb(evt, entry=diag):
                # entry is bound at definition time so that a late event
                # can never be appended to another segment
                if evt.result.reason == \
                        speechsdk.ResultReason.RecognizedSpeech:
                    entry['text'] = entry['text'] + str(evt.result.text)
                    print(evt.result.text)

            def stop_cb(evt):
                """callback that signals to stop continuous recognition
                upon receiving an event `evt`"""
                nonlocal done
                print('CLOSING on {}'.format(evt))
                done = True

            audio_input = speechsdk.AudioConfig(filename=filename_out)
            speech_recognizer = speechsdk.SpeechRecognizer(
                speech_config=speech_config, audio_config=audio_input)
            speech_recognizer.recognized.connect(recognized_cb)
            speech_recognizer.session_stopped.connect(stop_cb)
            speech_recognizer.canceled.connect(stop_cb)

            # Start continuous speech recognition and poll until one of
            # the stop callbacks fires
            speech_recognizer.start_continuous_recognition()
            while not done:
                time.sleep(.5)
            speech_recognizer.stop_continuous_recognition()
        return output_list

    output = snip_transcribe(output_list, filename,
                             output_folder=output_folder)
    output_json = {filename_only: output}
    with open(f"{output_folder}{nameoffile}_{timeoffile}.txt",
              "w") as outfile:
        json.dump(output_json, outfile)
    return cls, output_json
def recordAnalyzeAudio(duration, outputWavFile, midTermBufferSizeSec,
                       modelName, modelType):
    '''
    recordAnalyzeAudio(duration, outputWavFile, midTermBufferSizeSec,
                       modelName, modelType)

    This function is used to record and analyze audio segments, in a fix
    window basis.

    ARGUMENTS:
        - duration:              total recording duration
        - outputWavFile:         path of the output WAV file
        - midTermBufferSizeSec:  (fix) segment length in seconds; must be
                                 positive
        - modelName:             classification model name
        - modelType:             classification model type ('svm' or 'knn';
                                 any other value records without
                                 classifying)
    RAISES:
        ValueError when the segment length is not positive (the recording
        loop could otherwise never make progress).

    The sampling rate is read from the module-level name Fs.
    '''
    if midTermBufferSizeSec <= 0:
        raise ValueError("midTermBufferSizeSec must be positive")

    if modelType == 'svm':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
         computeBEAT] = aT.loadSVModel(modelName)
    elif modelType == 'knn':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
         computeBEAT] = aT.loadKNNModel(modelName)
    else:
        Classifier = None

    midTermBufferSize = int(midTermBufferSizeSec * Fs)
    if midTermBufferSize < 1:
        # an empty buffer is always "full": nothing would ever be recorded
        raise ValueError("midTermBufferSizeSec is too small for the "
                         "sampling rate")

    inp = alsaaudio.PCM(alsaaudio.PCM_CAPTURE, alsaaudio.PCM_NONBLOCK)
    inp.setchannels(1)
    inp.setrate(Fs)
    inp.setformat(alsaaudio.PCM_FORMAT_S16_LE)
    inp.setperiodsize(512)

    allData = []        # every sample recorded so far
    midTermBuffer = []  # samples of the segment currently being filled
    curWindow = []      # samples read but not yet moved to midTermBuffer

    while len(allData) < duration * Fs:
        # Read data from device
        numFrames, data = inp.read()
        if numFrames:
            curWindow.extend(audioop.getsample(data, 2, i)
                             for i in range(numFrames))
            # move as many samples as still fit into the current segment
            samplesToCopyToMidBuffer = min(
                len(curWindow), midTermBufferSize - len(midTermBuffer))
            midTermBuffer.extend(curWindow[0:samplesToCopyToMidBuffer])
            del curWindow[0:samplesToCopyToMidBuffer]

        if len(midTermBuffer) == midTermBufferSize:
            # a whole segment is available: classify, store and plot it
            if Classifier is not None:
                [mtFeatures, _] = aF.mtFeatureExtraction(
                    midTermBuffer, Fs, 2.0 * Fs, 2.0 * Fs,
                    0.020 * Fs, 0.020 * Fs)
                curFV = (mtFeatures[:, 0] - MEAN) / STD
                [result, _] = aT.classifierWrapper(Classifier, modelType,
                                                   curFV)
                print(classNames[int(result)])
            allData.extend(midTermBuffer)

            plt.clf()
            plt.plot(midTermBuffer)
            plt.show(block=False)
            plt.draw()

            midTermBuffer = []

    allDataArray = numpy.int16(allData)
    wavfile.write(outputWavFile, Fs, allDataArray)
def speakerDiarization(filename, n_speakers, mt_size=2.0, mt_step=0.2,
                       st_win=0.05, lda_dim=35, plot_res=False):
    '''
    Cluster the mid-term windows of a recording into speakers and return
    the per-window speaker labels.

    ARGUMENTS:
        - filename:        the name of the WAV file to be analyzed
        - n_speakers:      the number of speakers (clusters) in the
                           recording (<=0 for unknown)
        - mt_size (opt):   mid-term window size
        - mt_step (opt):   mid-term window step
        - st_win  (opt):   short-term window size
        - lda_dim (opt):   LDA dimension (0 for no LDA)
        - plot_res (opt):  0 for not plotting the results, 1 for plotting
    RETURNS:
        - cls: speaker label of every mid-term window, after HMM and
               median-filter smoothing

    When a ".segments" file exists next to the WAV file it is loaded as
    ground truth and the purity values returned by
    evaluateSpeakerDiarization are printed.
    '''
    [fs, x] = audioBasicIO.readAudioFile(filename)
    x = audioBasicIO.stereo2mono(x)
    duration = len(x) / fs

    [classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1,
     stStep1, computeBEAT1] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     "data", "knnSpeakerAll"))
    [classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2,
     stStep2, computeBEAT2] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     "data", "knnSpeakerFemaleMale"))

    def with_classifier_outputs(feats):
        # Stack, under every feature column, the second value (P) returned
        # by aT.classifierWrapper for each of the two kNN models (+0.0001).
        n_base = feats.shape[0]
        n_first = len(classNames1)
        stacked = numpy.zeros((n_base + n_first + len(classNames2),
                               feats.shape[1]))
        stacked[0:n_base, :] = feats
        for col in range(feats.shape[1]):
            column = feats[:, col]
            [_, P1] = aT.classifierWrapper(classifier_1, "knn",
                                           (column - MEAN1) / STD1)
            [_, P2] = aT.classifierWrapper(classifier_2, "knn",
                                           (column - MEAN2) / STD2)
            stacked[n_base:n_base + n_first, col] = P1 + 0.0001
            stacked[n_base + n_first::, col] = P2 + 0.0001
        return stacked

    [mt_feats, st_feats, _] = aF.mtFeatureExtraction(
        x, fs, mt_size * fs, mt_step * fs,
        round(fs * st_win), round(fs * st_win * 0.5))
    mt_feats = with_classifier_outputs(mt_feats)

    # TODO
    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                       41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]
    mt_feats = mt_feats[iFeaturesSelect, :]

    (mt_feats_norm, MEAN, STD) = aT.normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    n_wins = mt_feats.shape[1]

    # remove outliers: keep windows whose summed distance to all other
    # windows is below 1.2 times the mean summed distance
    pairwise = distance.squareform(distance.pdist(mt_feats_norm.T))
    dist_all = numpy.sum(pairwise, axis=0)
    i_non_outliers = numpy.nonzero(
        dist_all < 1.2 * numpy.mean(dist_all))[0]
    # percentage of windows flagged as outliers (diagnostic only; the
    # value is not used further)
    outlier_pct = (100.0 * (n_wins - i_non_outliers.shape[0])) / n_wins

    mt_feats_norm_or = mt_feats_norm
    mt_feats_norm = mt_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        # extract mid-term statistics (mean, std) with the minimum step
        win_len = int(round(mt_size / st_win))
        step_len = int(round(st_win / st_win))
        n_st_feats = len(st_feats)
        stat_rows = [[] for _ in range(2 * n_st_feats)]
        for row in range(n_st_feats):  # for each short-term feature
            total = len(st_feats[row])
            start = 0
            while start < total:
                stop = start + win_len
                if stop > total:
                    stop = total
                window = st_feats[row][start:stop]
                stat_rows[row].append(numpy.mean(window))
                stat_rows[row + n_st_feats].append(numpy.std(window))
                start += step_len
        mt_feats_to_red = with_classifier_outputs(numpy.array(stat_rows))
        mt_feats_to_red = mt_feats_to_red[iFeaturesSelect, :]
        (mt_feats_to_red, MEAN, STD) = aT.normalizeFeatures(
            [mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T

        # pseudo-labels used to fit the LDA projection
        lda_step = 1.0
        lda_step_ratio = lda_step / st_win
        Labels = numpy.zeros((mt_feats_to_red.shape[1], ))
        for idx in range(Labels.shape[0]):
            Labels[idx] = int(idx * st_win / lda_step_ratio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, Labels)
        mt_feats_norm = (clf.transform(mt_feats_norm.T)).T

    s_range = range(2, 10) if n_speakers <= 0 else [n_speakers]

    clsAll = []
    sil_all = []
    for k in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=k)
        k_means.fit(mt_feats_norm.T)
        cls = k_means.labels_
        clsAll.append(cls)

        intra = []     # weighted average within-cluster distance
        nearest = []   # weighted distance to the "nearest" other cluster
        for c in range(k):  # for each speaker (i.e. extracted cluster)
            share = numpy.nonzero(cls == c)[0].shape[0] / float(len(cls))
            if share < 0.020:
                intra.append(0.0)
                nearest.append(0.0)
                continue
            # subset of feature vectors that belong to the cluster
            members = mt_feats_norm[:, cls == c]
            intra.append(numpy.mean(distance.pdist(members.T)) * share)
            to_others = []
            for c2 in range(k):
                if c2 == c:
                    continue
                share_2 = numpy.nonzero(cls == c2)[0].shape[0] / \
                    float(len(cls))
                others = mt_feats_norm[:, cls == c2]
                cross = distance.cdist(members.T, others.T)
                to_others.append(
                    numpy.mean(cross) * (share + share_2) / 2.0)
            nearest.append(min(numpy.array(to_others)))
        intra = numpy.array(intra)
        nearest = numpy.array(nearest)
        # silhouette of each cluster (speaker)
        sil = []
        for c in range(k):
            sil.append((nearest[c] - intra[c]) /
                       (max(nearest[c], intra[c]) + 0.00001))
        # keep the AVERAGE SILLOUETTE
        sil_all.append(numpy.mean(sil))

    imax = numpy.argmax(sil_all)
    # optimal number of clusters
    nSpeakersFinal = s_range[imax]

    # generate the final set of cluster labels: outlier windows receive
    # the label of their nearest non-outlier window
    cls = numpy.zeros((n_wins, ))
    for w in range(n_wins):
        closest = numpy.argmin(numpy.abs(w - i_non_outliers))
        cls[w] = clsAll[imax][closest]

    # Post-process method 1: hmm smoothing
    start_prob, transmat, means, cov = \
        trainHMM_computeStatistics(mt_feats_norm_or, cls)
    hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
    hmm.startprob_ = start_prob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov
    cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    class_names = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]

    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    gt_available = os.path.isfile(gt_file)
    if gt_available:
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start, seg_end, seg_labs,
                                              mt_step)

    if plot_res:
        fig = plt.figure()
        ax1 = fig.add_subplot(111 if n_speakers > 0 else 211)
        ax1.set_yticks(numpy.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(numpy.array(range(len(cls))) * mt_step + mt_step / 2.0,
                 cls)

    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(numpy.array(range(len(flags_gt))) *
                     mt_step + mt_step / 2.0, flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
            evaluateSpeakerDiarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(
                          100 * purity_cluster_m, 100 * purity_speaker_m))

    if plot_res:
        plt.xlabel("time (seconds)")
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's sillouette")
        plt.show()
    return cls
def getResultMatrixAndBestParam(features, class_names, classifier_name, parameterMode, perTrain=0.90, model_name='', Params=None):
    '''
    Evaluate a classifier over a list of candidate parameter values using
    repeated random train/test splits, and pick the best candidate.

    ARGUMENTS:
        features:         a list ([numOfClasses x 1]) whose elements contain
                          numpy matrices of features. Each matrix features[i]
                          of class i is [n_samples x numOfDimensions]
        class_names:      list of class names (strings), in the same order
                          as features; only used for warning messages
        classifier_name:  classifier type, forwarded to the training helper
                          and to aT.classifierWrapper
                          (e.g. svm, knn or randomforest)
        parameterMode:    AudioClassifierManager.BEST_ACCURACY: choose the
                          parameter with maximum overall ACCURACY
                          AudioClassifierManager.BEST_F1: choose the
                          parameter with maximum average F1 measure
        perTrain:         value forwarded to aT.randSplitFeatures
                          (presumably the fraction of samples used for
                          training -- confirm against that helper)
        model_name:       not used by this function
        Params:           list of classifier parameter values to evaluate.
                          When None or empty, the list is obtained from
                          AudioClassifierManager.getListParamsForClassifierType
    RETURNS:
        bestParam:                 the parameter value that optimizes the
                                   selected measure (0 if parameterMode
                                   matches neither supported mode)
        resultConfusionMatrix:     accumulated confusion matrix of the best
                                   parameter (None if parameterMode matches
                                   neither supported mode)
        precision_classes_all:     per-parameter array of per-class precision
        recall_classes_all:        per-parameter array of per-class recall
        f1_classes_all:            per-parameter array of per-class F1
        f1_all:                    per-parameter average F1
        ac_all:                    per-parameter overall accuracy
    NOTE:
        numpy.argmax raises ValueError if the list of parameter candidates
        ends up empty.
    '''
    # feature normalization (the returned mean / std are not needed here)
    features_norm, _mean, _std = aT.normalizeFeatures(features)
    n_classes = len(features)

    ac_all = []
    f1_all = []
    precision_classes_all = []
    recall_classes_all = []
    f1_classes_all = []
    cms_all = []

    # small constant added to every confusion-matrix cell so that the
    # precision / recall / F1 ratios below never divide by zero
    smooth = 0.0000000010

    # Optimize number of experiments
    n_exp = AudioClassifierManager.getOptimalNumberExperiment(
        features, AudioClassifierManager.__num_experiment)

    # Fall back to the default parameter list of the classifier type
    if Params is None or len(Params) == 0:
        Params = AudioClassifierManager.getListParamsForClassifierType(
            classifier_name)

    # For each candidate parameter value
    for C in Params:
        # Confusion matrix accumulated over all experiments of this value
        cm = numpy.zeros((n_classes, n_classes))

        for e in range(n_exp):
            # Split features in Train and Test:
            f_train, f_test = aT.randSplitFeatures(features_norm, perTrain)

            countFTrain = sum(len(class_samples) for class_samples in f_train)
            countFTest = sum(len(class_samples) for class_samples in f_test)

            # Warn about every class whose test split came out empty; the
            # class name is looked up by class position (not by the index
            # of the parameter currently being evaluated).
            for class_name, class_samples in zip(class_names, f_test):
                if len(class_samples) == 0:
                    print("WARNING: {0} has no test values".format(class_name))

            # for each cross-validation iteration:
            print("Param = {0:.5f} - classifier Evaluation "
                  "Experiment {1:d} of {2:d} - lenTrainingSet {3} lenTestSet {4}".format(C, e + 1, n_exp, countFTrain, countFTest))

            # Get Classifier for train
            classifier = AudioClassifierManager.getTrainClassifier(
                f_train, classifier_name, C)

            # Confusion matrix of this experiment:
            # rows = true class, columns = predicted class
            cmt = numpy.zeros((n_classes, n_classes))
            for c1 in range(n_classes):
                n_test_samples = len(f_test[c1])
                res = numpy.zeros((n_test_samples, 1))
                for ss in range(n_test_samples):
                    [res[ss], _] = aT.classifierWrapper(classifier,
                                                        classifier_name,
                                                        f_test[c1][ss])
                for c2 in range(n_classes):
                    cmt[c1][c2] = float(numpy.count_nonzero(res == c2))
            cm = cm + cmt

        cm = cm + smooth

        # Calculate Precision, Recall and F1 measure per class
        rec = numpy.zeros((cm.shape[0],))
        pre = numpy.zeros((cm.shape[0],))
        for ci in range(cm.shape[0]):
            rec[ci] = cm[ci, ci] / numpy.sum(cm[ci, :])
            pre[ci] = cm[ci, ci] / numpy.sum(cm[:, ci])
        f1 = 2 * rec * pre / (rec + pre)

        precision_classes_all.append(pre)
        recall_classes_all.append(rec)
        f1_classes_all.append(f1)
        ac_all.append(numpy.sum(numpy.diagonal(cm)) / numpy.sum(cm))
        cms_all.append(cm)
        f1_all.append(numpy.mean(f1))

    best_ac_ind = numpy.argmax(ac_all)
    best_f1_ind = numpy.argmax(f1_all)

    bestParam = 0
    resultConfusionMatrix = None
    if parameterMode == AudioClassifierManager.BEST_ACCURACY:
        bestParam = Params[best_ac_ind]
        resultConfusionMatrix = cms_all[best_ac_ind]
    elif parameterMode == AudioClassifierManager.BEST_F1:
        bestParam = Params[best_f1_ind]
        resultConfusionMatrix = cms_all[best_f1_ind]

    return bestParam, resultConfusionMatrix, precision_classes_all, recall_classes_all, f1_classes_all, f1_all, ac_all