def self_similarity_matrix(feature_vectors):
    """
    This function computes the self-similarity matrix for a sequence
    of feature vectors.
    ARGUMENTS:
     - feature_vectors:    a np matrix (nDims x nVectors) whose i-th column
                           corresponds to the i-th feature vector
    RETURNS:
     - sim_matrix:         the self-similarity matrix (nVectors x nVectors)
    """
    norm_feature_vectors, mean, std = at.normalize_features(
        [feature_vectors.T])
    norm_feature_vectors = norm_feature_vectors[0].T
    sim_matrix = 1.0 - distance.squareform(
        distance.pdist(norm_feature_vectors.T, 'cosine'))
    return sim_matrix
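# Usage sketch for self_similarity_matrix (not part of the original module):
# the random matrix below is a stand-in for real short-term features. It
# relies on the same module-level imports (np, at, distance) used above.
def _demo_self_similarity():
    feats = np.random.rand(20, 100)      # nDims x nVectors, per the docstring
    sim = self_similarity_matrix(feats)  # cosine similarity, values in [-1, 1]
    assert sim.shape == (100, 100)       # nVectors x nVectors
    return sim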
def _get_features(fn, mt_win=1.0, mt_step=0.1, st_win=0.05):
    """Generate the mid-term feature matrix for the audio file located at fn."""
    sr, signal = read_audio_file(fn)
    if len(signal.shape) > 1:
        raise Exception(
            f'Single channel audio only! This audio has the shape: '
            f'{signal.shape}')
    # mt_win was missing from the original signature, leaving mid_window
    # undefined below; a 1.0 s default mid-term window is assumed here.
    mid_window = mt_win * sr
    mid_step = mt_step * sr
    short_window = round(st_win * sr)
    short_step = round(st_win * sr * 0.5)
    feats, short_feats, feat_names = \
        paa.MidTermFeatures.mid_feature_extraction(signal, sr, mid_window,
                                                   mid_step, short_window,
                                                   short_step)
    feats_norm, MEAN, STD = normalize_features([feats.T])
    feats_norm = feats_norm[0].T
    return feats_norm
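# Usage sketch for _get_features ("sample.wav" is a placeholder path, not a
# file shipped with this code); the returned matrix holds one normalized
# feature vector per mid-term window, laid out as n_features x n_windows.
def _demo_get_features():
    feats_norm = _get_features("sample.wav", mt_win=1.0, mt_step=0.1,
                               st_win=0.05)
    print(feats_norm.shape)  # (n_features, n_mid_term_windows)
    return feats_norm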
def visualizeFeaturesFolder(folder, dimReductionMethod, priorKnowledge="none"):
    '''
    This function generates a chordial visualization for the recordings
    of the provided path.
    ARGUMENTS:
        - folder:               path of the folder that contains the WAV
                                files to be processed
        - dimReductionMethod:   method used to reduce the dimension of the
                                initial feature space before computing the
                                similarity.
        - priorKnowledge:       if this is set equal to "artist", the artist
                                names (extracted from the filenames) are used
                                as supervised labels for LDA
    '''
    if dimReductionMethod == "pca":
        allMtFeatures, wavFilesList, _ = aF.directory_feature_extraction(
            folder, 30.0, 30.0, 0.050, 0.050, compute_beat=True)
        if allMtFeatures.shape[0] == 0:
            print("Error: No data found! Check input folder")
            return

        namesCategoryToVisualize = [
            ntpath.basename(w).replace('.wav', '').split(" --- ")[0]
            for w in wavFilesList]
        namesToVisualize = [
            ntpath.basename(w).replace('.wav', '') for w in wavFilesList]

        (F, MEAN, STD) = aT.normalize_features([allMtFeatures])
        F = np.concatenate(F)

        # check that the new PCA dimension is at most equal
        # to the number of samples
        K1 = 2
        K2 = 10
        if K1 > F.shape[0]:
            K1 = F.shape[0]
        if K2 > F.shape[0]:
            K2 = F.shape[0]
        pca1 = sklearn.decomposition.PCA(n_components=K1)
        pca1.fit(F)
        pca2 = sklearn.decomposition.PCA(n_components=K2)
        pca2.fit(F)

        finalDims = pca1.transform(F)
        finalDims2 = pca2.transform(F)
    else:
        # long-term statistics cannot be applied in this context
        # (LDA needs mid-term features)
        allMtFeatures, Ys, wavFilesList = \
            aF.directory_feature_extraction_no_avg(folder, 20.0, 5.0,
                                                   0.040, 0.040)
        if allMtFeatures.shape[0] == 0:
            print("Error: No data found! Check input folder")
            return

        namesCategoryToVisualize = [
            ntpath.basename(w).replace('.wav', '').split(" --- ")[0]
            for w in wavFilesList]
        namesToVisualize = [
            ntpath.basename(w).replace('.wav', '') for w in wavFilesList]

        ldaLabels = Ys
        if priorKnowledge == "artist":
            uNamesCategoryToVisualize = list(set(namesCategoryToVisualize))
            YsNew = np.zeros(Ys.shape)
            for i, uname in enumerate(uNamesCategoryToVisualize):
                # for each unique artist name:
                indicesUCategories = [
                    j for j, x in enumerate(namesCategoryToVisualize)
                    if x == uname]
                for j in indicesUCategories:
                    indices = np.nonzero(Ys == j)
                    YsNew[indices] = i
            ldaLabels = YsNew

        (F, MEAN, STD) = aT.normalize_features([allMtFeatures])
        F = np.array(F[0])

        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=10)
        clf.fit(F, ldaLabels)
        reducedDims = clf.transform(F)

        pca = sklearn.decomposition.PCA(n_components=2)
        pca.fit(reducedDims)
        reducedDims = pca.transform(reducedDims)

        # TODO: check this ... should LDA be used in semi-supervised mode only?
        uLabels = np.sort(np.unique(Ys))
        # uLabels must have as many labels as the number
        # of wavFilesList elements
        # (reducedDimsAvg is kept from the original code; currently unused)
        reducedDimsAvg = np.zeros((uLabels.shape[0], reducedDims.shape[1]))
        finalDims = np.zeros((uLabels.shape[0], 2))
        for i, u in enumerate(uLabels):
            indices = [j for j, x in enumerate(Ys) if x == u]
            f = reducedDims[indices, :]
            finalDims[i, :] = f.mean(axis=0)
        finalDims2 = reducedDims

    for i in range(finalDims.shape[0]):
        plt.text(finalDims[i, 0], finalDims[i, 1],
                 ntpath.basename(wavFilesList[i].replace('.wav', '')),
                 horizontalalignment='center',
                 verticalalignment='center', fontsize=10)
        plt.plot(finalDims[i, 0], finalDims[i, 1], '*r')
    plt.xlim([1.2 * finalDims[:, 0].min(), 1.2 * finalDims[:, 0].max()])
    plt.ylim([1.2 * finalDims[:, 1].min(), 1.2 * finalDims[:, 1].max()])
    plt.show()

    SM = 1.0 - distance.squareform(distance.pdist(finalDims2, 'cosine'))
    for i in range(SM.shape[0]):
        SM[i, i] = 0.0
    chordialDiagram("visualization", SM, 0.50, namesToVisualize,
                    namesCategoryToVisualize)

    SM = 1.0 - distance.squareform(distance.pdist(F, 'cosine'))
    for i in range(SM.shape[0]):
        SM[i, i] = 0.0
    chordialDiagram("visualizationInitial", SM, 0.50, namesToVisualize,
                    namesCategoryToVisualize)

    # plot super-categories (i.e. artist names)
    uNamesCategoryToVisualize = sorted(list(set(namesCategoryToVisualize)))
    finalDimsGroup = np.zeros((len(uNamesCategoryToVisualize),
                               finalDims2.shape[1]))
    for i, uname in enumerate(uNamesCategoryToVisualize):
        indices = [j for j, x in enumerate(namesCategoryToVisualize)
                   if x == uname]
        f = finalDims2[indices, :]
        finalDimsGroup[i, :] = f.mean(axis=0)

    SMgroup = 1.0 - distance.squareform(
        distance.pdist(finalDimsGroup, 'cosine'))
    for i in range(SMgroup.shape[0]):
        SMgroup[i, i] = 0.0
    chordialDiagram("visualizationGroup", SMgroup, 0.50,
                    uNamesCategoryToVisualize, uNamesCategoryToVisualize)
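# Usage sketch for visualizeFeaturesFolder ("data/music/" is a hypothetical
# folder): the WAV files are expected to be named "<artist> --- <title>.wav"
# so the artist prefix can be split off as the category label.
def _demo_visualize_folder():
    visualizeFeaturesFolder("data/music/", "pca")  # unsupervised (PCA)
    visualizeFeaturesFolder("data/music/", "lda", priorKnowledge="artist")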
def speaker_diarization(filename, n_speakers, mid_window=2.0, mid_step=0.2,
                        short_window=0.05, lda_dim=35, plot_res=False):
    """
    ARGUMENTS:
        - filename:        the name of the WAV file to be analyzed
        - n_speakers:      the number of speakers (clusters) in
                           the recording (<=0 for unknown)
        - mid_window:      (opt) mid-term window size
        - mid_step:        (opt) mid-term window step
        - short_window:    (opt) short-term window size
        - lda_dim:         (opt) LDA dimension (0 for no LDA)
        - plot_res:        (opt) 0 for not plotting the results,
                           1 for plotting
    """
    sampling_rate, signal = audioBasicIO.read_audio_file(filename)
    signal = audioBasicIO.stereo_to_mono(signal)
    duration = len(signal) / sampling_rate

    base_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "data/models")

    classifier_all, mean_all, std_all, class_names_all, _, _, _, _, _ = \
        at.load_model_knn(os.path.join(base_dir, "knn_speaker_10"))
    classifier_fm, mean_fm, std_fm, class_names_fm, _, _, _, _, _ = \
        at.load_model_knn(os.path.join(base_dir, "knn_speaker_male_female"))

    mid_feats, st_feats, _ = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mid_window * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * short_window),
                                   round(sampling_rate * short_window * 0.5))

    mid_term_features = np.zeros(
        (mid_feats.shape[0] + len(class_names_all) + len(class_names_fm),
         mid_feats.shape[1]))

    for index in range(mid_feats.shape[1]):
        feature_norm_all = (mid_feats[:, index] - mean_all) / std_all
        feature_norm_fm = (mid_feats[:, index] - mean_fm) / std_fm
        _, p1 = at.classifier_wrapper(classifier_all, "knn", feature_norm_all)
        _, p2 = at.classifier_wrapper(classifier_fm, "knn", feature_norm_fm)
        start = mid_feats.shape[0]
        end = mid_feats.shape[0] + len(class_names_all)
        mid_term_features[0:mid_feats.shape[0], index] = mid_feats[:, index]
        mid_term_features[start:end, index] = p1 + 1e-4
        mid_term_features[end::, index] = p2 + 1e-4

    mid_feats = mid_term_features  # TODO
    feature_selected = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                        41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]

    mid_feats = mid_feats[feature_selected, :]

    mid_feats_norm, mean, std = at.normalize_features([mid_feats.T])
    mid_feats_norm = mid_feats_norm[0].T
    n_wins = mid_feats.shape[1]

    # remove outliers:
    dist_all = np.sum(distance.squareform(distance.pdist(mid_feats_norm.T)),
                      axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.2 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    # EnergyMin = np.min(mt_feats[1,:])
    # EnergyMean = np.mean(mt_feats[1,:])
    # Thres = (1.5 * EnergyMin + 0.5 * EnergyMean) / 2.0
    # i_non_outliers = np.nonzero(mt_feats[1,:] > Thres)[0]
    # print i_non_outliers

    mt_feats_norm_or = mid_feats_norm
    mid_feats_norm = mid_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        # extract mid-term features with minimum step:
        window_ratio = int(round(mid_window / short_window))
        # always 1: mid-term stats are computed at every short-term step
        step_ratio = int(round(short_window / short_window))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        for index in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        # for each of the short-term features:
        for index in range(num_of_features):
            cur_pos = 0
            feat_len = len(st_feats[index])
            while cur_pos < feat_len:
                n1 = cur_pos
                n2 = cur_pos + window_ratio
                if n2 > feat_len:
                    n2 = feat_len
                short_features = st_feats[index][n1:n2]
                mt_feats_to_red[index].append(np.mean(short_features))
                mt_feats_to_red[index + num_of_features].\
                    append(np.std(short_features))
                cur_pos += step_ratio
        mt_feats_to_red = np.array(mt_feats_to_red)
        mt_feats_to_red_2 = np.zeros(
            (mt_feats_to_red.shape[0] +
             len(class_names_all) + len(class_names_fm),
             mt_feats_to_red.shape[1]))
        limit = mt_feats_to_red.shape[0] + len(class_names_all)
        for index in range(mt_feats_to_red.shape[1]):
            feature_norm_all = (mt_feats_to_red[:, index] - mean_all) / std_all
            feature_norm_fm = (mt_feats_to_red[:, index] - mean_fm) / std_fm
            _, p1 = at.classifier_wrapper(classifier_all, "knn",
                                          feature_norm_all)
            _, p2 = at.classifier_wrapper(classifier_fm, "knn",
                                          feature_norm_fm)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0], index] = \
                mt_feats_to_red[:, index]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:limit, index] = \
                p1 + 1e-4
            mt_feats_to_red_2[limit::, index] = p2 + 1e-4
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[feature_selected, :]
        mt_feats_to_red, mean, std = \
            at.normalize_features([mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        labels = np.zeros((mt_feats_to_red.shape[1],))
        lda_step = 1.0
        lda_step_ratio = lda_step / short_window
        for index in range(labels.shape[0]):
            labels[index] = int(index * short_window / lda_step_ratio)
        clf = sklearn.discriminant_analysis.\
            LinearDiscriminantAnalysis(n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, labels)
        mid_feats_norm = (clf.transform(mid_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    cluster_labels = []
    sil_all = []
    cluster_centers = []

    for speakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=speakers)
        k_means.fit(mid_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        cluster_labels.append(cls)
        cluster_centers.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(speakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / \
                float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mid_feats_norm[:, cls == c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                dist = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(np.mean(dist) * clust_per_cent)
                sil_temp = []
                for c2 in range(speakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = np.nonzero(cls == c2)[0].shape[0] /\
                            float(len(cls))
                        mid_features_temp = mid_feats_norm[:, cls == c2]
                        dist = distance.cdist(mt_feats_norm_temp.T,
                                              mid_features_temp.T)
                        sil_temp.append(np.mean(dist) *
                                        (clust_per_cent + clust_per_cent_2)
                                        / 2.0)
                sil_temp = np.array(sil_temp)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(sil_temp))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        sil = []
        for c in range(speakers):
            # for each cluster (speaker) compute silhouette
            sil.append((sil_2[c] - sil_1[c]) /
                       (max(sil_2[c], sil_1[c]) + 1e-5))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(np.mean(sil))

    imax = int(np.argmax(sil_all))
    # optimal number of clusters
    num_speakers = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = np.zeros((n_wins,))
    for index in range(n_wins):
        j = np.argmin(np.abs(index - i_non_outliers))
        cls[index] = cluster_labels[imax][j]

    # Post-process method 1: hmm smoothing
    for index in range(1):
        # hmm training
        start_prob, transmat, means, cov = \
            train_hmm_compute_statistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(n_components=start_prob.shape[0],
                                       covariance_type="diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    class_names = ["speaker{0:d}".format(c) for c in range(num_speakers)]

    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if groundtruth exists
    if os.path.isfile(gt_file):
        seg_start, seg_end, seg_labs = read_segmentation_gt(gt_file)
        flags_gt, class_names_gt = segments_to_labels(seg_start, seg_end,
                                                      seg_labs, mid_step)

    if plot_res:
        fig = plt.figure()
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(np.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(np.array(range(len(cls))) * mid_step + mid_step / 2.0, cls)

    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(np.array(range(len(flags_gt))) *
                     mid_step + mid_step / 2.0, flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
            evaluate_speaker_diarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(
                          100 * purity_cluster_m, 100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's silhouette")
        plt.show()
    return cls
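# Usage sketch for speaker_diarization ("diarizationExample.wav" is a
# placeholder path): cls holds one cluster (speaker) index per mid-term
# window, so window i covers roughly [i * mid_step, i * mid_step + mid_window]
# seconds.
def _demo_speaker_diarization():
    cls = speaker_diarization("diarizationExample.wav", n_speakers=4,
                              mid_window=2.0, mid_step=0.2, lda_dim=0)
    print("windows:", len(cls), "detected speakers:", len(set(cls)))
    return cls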
def silence_removal(signal, sampling_rate, st_win, st_step, smooth_window=0.5,
                    weight=0.5, plot=False):
    """
    Event Detection (silence removal)
    ARGUMENTS:
         - signal:           the input audio signal
         - sampling_rate:    sampling freq
         - st_win, st_step:  window size and step in seconds
         - smooth_window:    (optional) smooth window (in seconds)
         - weight:           (optional) weight factor (0 < weight < 1)
                             the higher, the more strict
         - plot:             (optional) True if results are to be plotted
    RETURNS:
         - seg_limits:    list of segment limits in seconds (e.g [[0.1, 0.9],
                          [1.4, 3.0]] means that the resulting segments are
                          (0.1 - 0.9) seconds and (1.4, 3.0) seconds)
    """
    if weight >= 1:
        weight = 0.99
    if weight <= 0:
        weight = 0.01

    # Step 1: feature extraction
    signal = audioBasicIO.stereo_to_mono(signal)
    st_feats, _ = stf.feature_extraction(signal, sampling_rate,
                                         st_win * sampling_rate,
                                         st_step * sampling_rate)

    # Step 2: train binary svm classifier of low vs high energy frames
    # keep only the energy short-term sequence (2nd feature)
    st_energy = st_feats[1, :]
    en = np.sort(st_energy)
    # number of 10% of the total short-term windows
    st_windows_fraction = int(len(en) / 10)

    # compute "lower" 10% energy threshold
    low_threshold = np.mean(en[0:st_windows_fraction]) + 1e-15

    # compute "higher" 10% energy threshold
    high_threshold = np.mean(en[-st_windows_fraction:-1]) + 1e-15

    # get all features that correspond to low energy
    low_energy = st_feats[:, np.where(st_energy <= low_threshold)[0]]

    # get all features that correspond to high energy
    high_energy = st_feats[:, np.where(st_energy >= high_threshold)[0]]

    # form the binary classification task and ...
    features = [low_energy.T, high_energy.T]
    # normalize and train the respective svm probabilistic model
    # (ONSET vs SILENCE)
    features_norm, mean, std = at.normalize_features(features)
    svm = at.train_svm(features_norm, 1.0)

    # Step 3: compute onset probability based on the trained svm
    prob_on_set = []
    for index in range(st_feats.shape[1]):
        # for each frame
        cur_fv = (st_feats[:, index] - mean) / std
        # get svm probability (that it belongs to the ONSET class)
        prob_on_set.append(svm.predict_proba(cur_fv.reshape(1, -1))[0][1])
    prob_on_set = np.array(prob_on_set)

    # smooth probability:
    prob_on_set = smooth_moving_avg(prob_on_set, smooth_window / st_step)

    # Step 4A: detect onset frame indices:
    prog_on_set_sort = np.sort(prob_on_set)

    # find probability Threshold as a weighted average
    # of top 10% and lower 10% of the values
    nt = int(prog_on_set_sort.shape[0] / 10)
    threshold = (np.mean((1 - weight) * prog_on_set_sort[0:nt]) +
                 weight * np.mean(prog_on_set_sort[-nt::]))

    # get the indices of the frames that satisfy the thresholding
    max_indices = np.where(prob_on_set > threshold)[0]
    index = 0
    seg_limits = []
    time_clusters = []

    # Step 4B: group frame indices to onset segments
    while index < len(max_indices):
        # for each of the detected onset indices
        cur_cluster = [max_indices[index]]
        if index == len(max_indices) - 1:
            break
        while max_indices[index + 1] - cur_cluster[-1] <= 2:
            cur_cluster.append(max_indices[index + 1])
            index += 1
            if index == len(max_indices) - 1:
                break
        index += 1
        time_clusters.append(cur_cluster)
        seg_limits.append([cur_cluster[0] * st_step,
                           cur_cluster[-1] * st_step])

    # Step 5: Post process: remove very small segments:
    min_duration = 0.2
    seg_limits_2 = []
    for s_lim in seg_limits:
        if s_lim[1] - s_lim[0] > min_duration:
            seg_limits_2.append(s_lim)
    seg_limits = seg_limits_2

    if plot:
        time_x = np.arange(0, signal.shape[0] / float(sampling_rate),
                           1.0 / sampling_rate)
        plt.subplot(2, 1, 1)
        plt.plot(time_x, signal)
        plt.title('Signal')
        for s_lim in seg_limits:
            plt.axvline(x=s_lim[0], color='red')
            plt.axvline(x=s_lim[1], color='red')
        plt.subplot(2, 1, 2)
        plt.plot(np.arange(0, prob_on_set.shape[0] * st_step, st_step),
                 prob_on_set)
        plt.title('svm Probability')
        for s_lim in seg_limits:
            plt.axvline(x=s_lim[0], color='red')
            plt.axvline(x=s_lim[1], color='red')
        plt.show()

    return seg_limits
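# Usage sketch for silence_removal ("speech.wav" is a placeholder path):
# prints the detected non-silent intervals in seconds.
def _demo_silence_removal():
    fs, x = audioBasicIO.read_audio_file("speech.wav")
    segments = silence_removal(x, fs, st_win=0.05, st_step=0.05,
                               smooth_window=0.5, weight=0.5, plot=False)
    for start, end in segments:
        print("{:.2f}s - {:.2f}s".format(start, end))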
def QE_speaker_diarization(
        sampling_rate, signal, n_speakers,
        classifier_all, mean_all, std_all, class_names_all,
        classifier_fm, mean_fm, std_fm, class_names_fm,
        # models loaded once by the caller (see QE_ note below)
        mid_window=2.0, mid_step=0.2, short_window=0.05, lda_dim=35,
        plot_res=False):
    """
    ARGUMENTS:
        - sampling_rate, signal:  the audio data to be analyzed
          #QE_: adapted here to receive the data (sampling_rate, signal)
          directly instead of a filename
        - n_speakers:      the number of speakers (clusters) in
                           the recording (<=0 for unknown)
        - mid_window:      (opt) mid-term window size
        - mid_step:        (opt) mid-term window step
        - short_window:    (opt) short-term window size
        - lda_dim:         (opt) LDA dimension (0 for no LDA)
        - plot_res:        (opt) 0 for not plotting the results,
                           1 for plotting
    """
    """
    Other diarization options to explore:
    https://hackernoon.com/speaker-diarization-the-squad-way-2205e0accbda
    https://github.com/YongyuG/s4d-diarization-gao/blob/master/s4d/diar.py,
        looks very promising
    https://pypi.org/project/s4d/
    https://projets-lium.univ-lemans.fr/s4d/
    https://medium.com/datadriveninvestor/speaker-diarization-22121f1264b1
    https://arxiv.org/pdf/2005.08072v1.pdf
    https://github.com/calclavia/tal-asrd
    https://github.com/josepatino/pyBK
    https://github.com/wq2012/awesome-diarization
    https://www.researchgate.net/publication/221480626_The_Detection_of_Overlapping_Speech_with_Prosodic_Features_for_Speaker_Diarization
    """
    # sampling_rate, signal = audioBasicIO.read_audio_file(filename)
    # not needed: sampling_rate and signal are passed in as arguments
    # instead of filename
    signal = audioBasicIO.stereo_to_mono(
        signal)  # remove this if the signal is guaranteed to arrive in mono
    duration = len(signal) / sampling_rate

    """
    #QE_: To avoid reloading the models on every call, they are loaded
    # globally only once and then passed as arguments.
    # So this part is copied to the module above, QE_main:
    base_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "data/models")
    classifier_all, mean_all, std_all, class_names_all, _, _, _, _, _ = \
        at.load_model_knn(os.path.join(base_dir, "knn_speaker_10"))
    classifier_fm, mean_fm, std_fm, class_names_fm, _, _, _, _, _ = \
        at.load_model_knn(os.path.join(base_dir, "knn_speaker_male_female"))
    """

    mid_feats, st_feats, _ = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mid_window * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * short_window),
                                   round(sampling_rate * short_window * 0.5))

    mid_term_features = np.zeros(
        (mid_feats.shape[0] + len(class_names_all) + len(class_names_fm),
         mid_feats.shape[1]))

    for index in range(mid_feats.shape[1]):
        feature_norm_all = (mid_feats[:, index] - mean_all) / std_all
        feature_norm_fm = (mid_feats[:, index] - mean_fm) / std_fm
        _, p1 = at.classifier_wrapper(classifier_all, "knn", feature_norm_all)
        _, p2 = at.classifier_wrapper(classifier_fm, "knn", feature_norm_fm)
        start = mid_feats.shape[0]
        end = mid_feats.shape[0] + len(class_names_all)
        mid_term_features[0:mid_feats.shape[0], index] = mid_feats[:, index]
        mid_term_features[start:end, index] = p1 + 1e-4
        mid_term_features[end::, index] = p2 + 1e-4

    mid_feats = mid_term_features  # TODO
    feature_selected = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                        41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]

    mid_feats = mid_feats[feature_selected, :]

    mid_feats_norm, mean, std = at.normalize_features([mid_feats.T])
    mid_feats_norm = mid_feats_norm[0].T
    n_wins = mid_feats.shape[1]

    # remove outliers:
    dist_all = np.sum(distance.squareform(distance.pdist(mid_feats_norm.T)),
                      axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.2 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    # EnergyMin = np.min(mt_feats[1,:])
    # EnergyMean = np.mean(mt_feats[1,:])
    # Thres = (1.5 * EnergyMin + 0.5 * EnergyMean) / 2.0
    # i_non_outliers = np.nonzero(mt_feats[1,:] > Thres)[0]
    # print i_non_outliers

    mt_feats_norm_or = mid_feats_norm
    mid_feats_norm = mid_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        # extract mid-term features with minimum step:
        window_ratio = int(round(mid_window / short_window))
        step_ratio = int(round(short_window / short_window))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        for index in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        # for each of the short-term features:
        for index in range(num_of_features):
            cur_pos = 0
            feat_len = len(st_feats[index])
            while cur_pos < feat_len:
                n1 = cur_pos
                n2 = cur_pos + window_ratio
                if n2 > feat_len:
                    n2 = feat_len
                short_features = st_feats[index][n1:n2]
                mt_feats_to_red[index].append(np.mean(short_features))
                mt_feats_to_red[index + num_of_features].\
                    append(np.std(short_features))
                cur_pos += step_ratio
        mt_feats_to_red = np.array(mt_feats_to_red)
        mt_feats_to_red_2 = np.zeros(
            (mt_feats_to_red.shape[0] +
             len(class_names_all) + len(class_names_fm),
             mt_feats_to_red.shape[1]))
        limit = mt_feats_to_red.shape[0] + len(class_names_all)
        for index in range(mt_feats_to_red.shape[1]):
            feature_norm_all = (mt_feats_to_red[:, index] - mean_all) / std_all
            feature_norm_fm = (mt_feats_to_red[:, index] - mean_fm) / std_fm
            _, p1 = at.classifier_wrapper(classifier_all, "knn",
                                          feature_norm_all)
            _, p2 = at.classifier_wrapper(classifier_fm, "knn",
                                          feature_norm_fm)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0], index] = \
                mt_feats_to_red[:, index]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:limit, index] = \
                p1 + 1e-4
            mt_feats_to_red_2[limit::, index] = p2 + 1e-4
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[feature_selected, :]
        mt_feats_to_red, mean, std = \
            at.normalize_features([mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        labels = np.zeros((mt_feats_to_red.shape[1],))
        lda_step = 1.0
        lda_step_ratio = lda_step / short_window
        for index in range(labels.shape[0]):
            labels[index] = int(index * short_window / lda_step_ratio)
        clf = sklearn.discriminant_analysis.\
            LinearDiscriminantAnalysis(n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, labels)
        mid_feats_norm = (clf.transform(mid_feats_norm.T)).T

    ###########################################################################
    if n_speakers <= 0:
        # QE_: adapt in this case to range 1-10? We are going to use this
        # diarization on short windows, 250-500 ms
        s_range = range(2, 10)
    ###########################################################################
    else:
        s_range = [n_speakers]
    cluster_labels = []
    sil_all = []
    cluster_centers = []

    for speakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=speakers)
        k_means.fit(mid_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        cluster_labels.append(cls)
        cluster_centers.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(speakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / \
                float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mid_feats_norm[:, cls == c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                dist = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(np.mean(dist) * clust_per_cent)
                sil_temp = []
                for c2 in range(speakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = np.nonzero(cls == c2)[0].shape[0] /\
                            float(len(cls))
                        mid_features_temp = mid_feats_norm[:, cls == c2]
                        dist = distance.cdist(mt_feats_norm_temp.T,
                                              mid_features_temp.T)
                        sil_temp.append(np.mean(dist) *
                                        (clust_per_cent + clust_per_cent_2)
                                        / 2.0)
                sil_temp = np.array(sil_temp)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(sil_temp))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        sil = []
        for c in range(speakers):
            # for each cluster (speaker) compute silhouette
            sil.append((sil_2[c] - sil_1[c]) /
                       (max(sil_2[c], sil_1[c]) + 1e-5))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(np.mean(sil))

    imax = int(np.argmax(sil_all))
    # optimal number of clusters
    num_speakers = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = np.zeros((n_wins,))
    for index in range(n_wins):
        j = np.argmin(np.abs(index - i_non_outliers))
        cls[index] = cluster_labels[imax][j]

    # Post-process method 1: hmm smoothing
    for index in range(1):
        # hmm training
        start_prob, transmat, means, cov = \
            train_hmm_compute_statistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(n_components=start_prob.shape[0],
                                       covariance_type="diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    class_names = ["speaker{0:d}".format(c) for c in range(num_speakers)]

    # Ground-truth loading from the original version is disabled here: this
    # adapted function receives (sampling_rate, signal) instead of a filename,
    # so there is no .segments path to derive (the original code referenced
    # an undefined `filename` and would have raised a NameError).
    # gt_file = filename.replace('.wav', '.segments')
    # if os.path.isfile(gt_file):
    #     seg_start, seg_end, seg_labs = read_segmentation_gt(gt_file)
    #     flags_gt, class_names_gt = segments_to_labels(seg_start, seg_end,
    #                                                   seg_labs, mid_step)

    """
    if plot_res:
        fig = plt.figure()
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(np.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(np.array(range(len(cls))) * mid_step + mid_step / 2.0, cls)

    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(np.array(range(len(flags_gt))) *
                     mid_step + mid_step / 2.0, flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
            evaluate_speaker_diarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(
                          100 * purity_cluster_m, 100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's silhouette")
        plt.show()
    """
    return cls
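# Usage sketch for QE_speaker_diarization, mirroring the QE_ note above: load
# the two kNN models once, then reuse them across calls. "segment.wav" is a
# placeholder path; base_dir assumes the same pyAudioAnalysis data/models
# layout used by speaker_diarization above.
def _demo_qe_speaker_diarization():
    base_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "data/models")
    classifier_all, mean_all, std_all, class_names_all, _, _, _, _, _ = \
        at.load_model_knn(os.path.join(base_dir, "knn_speaker_10"))
    classifier_fm, mean_fm, std_fm, class_names_fm, _, _, _, _, _ = \
        at.load_model_knn(os.path.join(base_dir, "knn_speaker_male_female"))
    fs, x = audioBasicIO.read_audio_file("segment.wav")
    cls = QE_speaker_diarization(fs, x, 2,
                                 classifier_all, mean_all, std_all,
                                 class_names_all,
                                 classifier_fm, mean_fm, std_fm,
                                 class_names_fm,
                                 mid_window=2.0, mid_step=0.2, lda_dim=0)
    return cls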
import os, readchar, sklearn.cluster
from pyAudioAnalysis.MidTermFeatures import mid_feature_extraction as mT
from pyAudioAnalysis.audioBasicIO import read_audio_file, stereo_to_mono
from pyAudioAnalysis.audioSegmentation import labels_to_segments
from pyAudioAnalysis.audioTrainTest import normalize_features

if __name__ == '__main__':
    # read signal and get normalized segment features:
    input_file = "../data/diarizationExample.wav"
    fs, x = read_audio_file(input_file)
    x = stereo_to_mono(x)
    mt_size, mt_step, st_win = 1, 0.1, 0.05
    [mt_feats, st_feats, _] = mT(x, fs, mt_size * fs, mt_step * fs,
                                 round(fs * st_win), round(fs * st_win * 0.5))
    print(mt_feats.shape)
    (mt_feats_norm, MEAN, STD) = normalize_features([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T

    # perform clustering (k = 4)
    n_clusters = 4
    k_means = sklearn.cluster.KMeans(n_clusters=n_clusters)
    k_means.fit(mt_feats_norm.T)
    cls = k_means.labels_
    print(cls.shape)

    # convert flags to segment limits
    segs, c = labels_to_segments(cls, mt_step)
    for sp in range(n_clusters):
        # play each cluster's segments
        for i in range(len(c)):
            if c[i] == sp and segs[i, 1] - segs[i, 0] > 0.5:
                # play long segments of current speaker
                print(c[i], segs[i, 0], segs[i, 1])
                # the command below was truncated in the source; this
                # completion (extract the segment with ffmpeg, play it,
                # wait for a key press) is an assumed reconstruction
                cmd = "ffmpeg -i {} -ss {} -t {} temp.wav " \
                      "-loglevel panic -y".format(input_file, segs[i, 0],
                                                  segs[i, 1] - segs[i, 0])
                os.system(cmd)
                os.system("play temp.wav -q")
                readchar.readchar()