def thumbnailWrapper(inputFile, thumbnailWrapperSize):
    st_window = 0.5
    st_step = 0.5
    if not os.path.isfile(inputFile):
        raise Exception("Input audio file not found!")

    [fs, x] = audioBasicIO.read_audio_file(inputFile)
    if fs == -1:    # could not read file
        return

    [A1, A2, B1, B2, Smatrix] = aS.music_thumbnailing(x, fs, st_window,
                                                      st_step,
                                                      thumbnailWrapperSize)

    # write thumbnails to WAV files:
    if inputFile.endswith(".wav"):
        thumbnailWrapperFileName1 = inputFile.replace(".wav", "_thumb1.wav")
        thumbnailWrapperFileName2 = inputFile.replace(".wav", "_thumb2.wav")
    if inputFile.endswith(".mp3"):
        thumbnailWrapperFileName1 = inputFile.replace(".mp3", "_thumb1.mp3")
        thumbnailWrapperFileName2 = inputFile.replace(".mp3", "_thumb2.mp3")
    wavfile.write(thumbnailWrapperFileName1, fs, x[int(fs * A1):int(fs * A2)])
    wavfile.write(thumbnailWrapperFileName2, fs, x[int(fs * B1):int(fs * B2)])
    print("1st thumbnail (stored in file {0:s}): {1:4.1f}sec"
          " -- {2:4.1f}sec".format(thumbnailWrapperFileName1, A1, A2))
    print("2nd thumbnail (stored in file {0:s}): {1:4.1f}sec"
          " -- {2:4.1f}sec".format(thumbnailWrapperFileName2, B1, B2))

    # Plot self-similarity matrix:
    fig = plt.figure()
    ax = fig.add_subplot(111, aspect="auto")
    plt.imshow(Smatrix)

    # Plot best-similarity diagonal:
    Xcenter = (A1 / st_step + A2 / st_step) / 2.0
    Ycenter = (B1 / st_step + B2 / st_step) / 2.0

    e1 = matplotlib.patches.Ellipse((Ycenter, Xcenter),
                                    thumbnailWrapperSize * 1.4, 3, angle=45,
                                    linewidth=3, fill=False)
    ax.add_patch(e1)

    plt.plot([B1 / st_step, Smatrix.shape[0]],
             [A1 / st_step, A1 / st_step],
             color="k", linestyle="--", linewidth=2)
    plt.plot([B2 / st_step, Smatrix.shape[0]],
             [A2 / st_step, A2 / st_step],
             color="k", linestyle="--", linewidth=2)
    plt.plot([B1 / st_step, B1 / st_step],
             [A1 / st_step, Smatrix.shape[0]],
             color="k", linestyle="--", linewidth=2)
    plt.plot([B2 / st_step, B2 / st_step],
             [A2 / st_step, Smatrix.shape[0]],
             color="k", linestyle="--", linewidth=2)

    plt.xlim([0, Smatrix.shape[0]])
    plt.ylim([Smatrix.shape[1], 0])

    ax.yaxis.set_label_position("right")
    ax.yaxis.tick_right()

    plt.xlabel("frame no")
    plt.ylabel("frame no")
    plt.title("Self-similarity matrix")

    plt.show()
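# Usage sketch (illustrative, not part of the original source): assumes a
# "sample.wav" file exists and that the module-level imports used above
# (os, wavfile, audioBasicIO, aS, matplotlib, plt) are in scope.
def _demo_thumbnail():
    # extract two 20-second music thumbnails; the wrapper writes
    # sample_thumb1.wav / sample_thumb2.wav next to the input and shows
    # the self-similarity matrix plot
    thumbnailWrapper("sample.wav", 20.0)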
def fileChromagramWrapper(wav_file):
    if not os.path.isfile(wav_file):
        raise Exception("Input audio file not found!")
    [fs, x] = audioBasicIO.read_audio_file(wav_file)
    x = audioBasicIO.stereo_to_mono(x)
    specgram, TimeAxis, FreqAxis = sF.chromagram(x, fs, round(fs * 0.040),
                                                 round(fs * 0.040), True)
def get_spectrogram(path, win, step, method):
    """
    get_spectrogram() is a wrapper to
    pyAudioAnalysis.ShortTermFeatures.spectrogram() with a caching
    functionality

    :param path:   path of the WAV file to analyze
    :param win:    short-term window to be used in spectrogram calculation
    :param step:   short-term step to be used in spectrogram calculation
    :param method: one of "pyaudioanalysis", "librosa" or
                   "shorttermfeatures"
    :return: spectrogram matrix, time array, freq array and sampling freq
    """
    fs, s = io.read_audio_file(path)
    if method == "pyaudioanalysis":
        spec_val, spec_time, spec_freq = sF.spectrogram(
            s, fs, round(fs * win), round(fs * step), False, True)
    elif method == "librosa":
        s = np.double(s)
        s = s / (2.0 ** 15)
        spec_val = np.abs(librosa.stft(s, round(fs * win), round(fs * step)))
        spec_freq = [float((f + 1) * fs) / (round(fs * step))
                     for f in range(spec_val.shape[0])]
        spec_time = [float(t * round(fs * step)) / fs
                     for t in range(spec_val.shape[1])]
    elif method == "shorttermfeatures":
        # this branch only benchmarks short-term feature extraction and
        # produces no spectrogram values
        sF.feature_extraction(s, fs, round(fs * win), round(fs * step))
        return None, None, None, fs
    # spectrogram matrix, time/freq axes and sampling rate (see docstring)
    return spec_val, spec_time, spec_freq, fs
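# Usage sketch (illustrative): compares the two spectrogram back-ends on a
# hypothetical "sample.wav", using a 40 ms window and 20 ms step; assumes
# the explicit return values added to get_spectrogram() above.
def _demo_spectrogram_methods():
    spec_pa, t_pa, f_pa, fs = get_spectrogram("sample.wav", 0.040, 0.020,
                                              "pyaudioanalysis")
    spec_lr, t_lr, f_lr, fs = get_spectrogram("sample.wav", 0.040, 0.020,
                                              "librosa")
    # inspect the resulting shapes (axis order differs between back-ends)
    print(spec_pa.shape, spec_lr.shape)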
def beatExtractionWrapper(wav_file, plot):
    if not os.path.isfile(wav_file):
        raise Exception("Input audio file not found!")
    [fs, x] = audioBasicIO.read_audio_file(wav_file)
    F, _ = sF.feature_extraction(x, fs, 0.050 * fs, 0.050 * fs)
    bpm, ratio = aF.beat_extraction(F, 0.050, plot)
    print("Beat: {0:d} bpm ".format(int(bpm)))
    print("Ratio: {0:.2f} ".format(ratio))
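# Usage sketch (illustrative): estimates the tempo of a hypothetical
# "song.wav" and plots the beat histogram.
def _demo_beat():
    beatExtractionWrapper("song.wav", plot=True)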
def extract_extract_audioAnalysis(audio_file, chuncksize=1):
    [Fs, x] = audioBasicIO.read_audio_file(audio_file)
    x = audioBasicIO.stereo_to_mono(x)
    overlap = chuncksize * Fs
    # takes approx. 2.5 mins to complete
    F, f_names = ShortTermFeatures.feature_extraction(x, Fs, Fs, overlap)
    # return Zero Crossing Rate, Spectral Centroid, Spectral Spread,
    # Spectral Entropy, Spectral Flux, Spectral Rolloff
    return F[0], F[3], F[4], F[5], F[6], F[7]
def silenceRemovalWrapper(inputFile, smoothingWindow, weight):
    if not os.path.isfile(inputFile):
        raise Exception("Input audio file not found!")
    [fs, x] = audioBasicIO.read_audio_file(inputFile)
    segmentLimits = aS.silence_removal(x, fs, 0.05, 0.05,
                                       smoothingWindow, weight, True)
    for i, s in enumerate(segmentLimits):
        strOut = "{0:s}_{1:.3f}-{2:.3f}.wav".format(inputFile[0:-4],
                                                    s[0], s[1])
        wavfile.write(strOut, fs, x[int(fs * s[0]):int(fs * s[1])])
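# Usage sketch (illustrative): splits a hypothetical "recording.wav" into
# one WAV file per detected non-silent segment, using a 1-second smoothing
# window and a mid-range probability weight.
def _demo_silence_removal():
    silenceRemovalWrapper("recording.wav", smoothingWindow=1.0, weight=0.3)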
def file_regression(input_file, model_name, model_type):
    # Load classifier:
    if not os.path.isfile(input_file):
        print("fileRegression: wav file not found!")
        return -1, -1, -1

    regression_models = glob.glob(model_name + "_*")
    regression_models2 = []
    for r in regression_models:
        if r[-5::] != "MEANS":
            regression_models2.append(r)
    regression_models = regression_models2
    regression_names = []
    for r in regression_models:
        regression_names.append(r[r.rfind("_") + 1::])

    # FEATURE EXTRACTION
    # LOAD ONLY THE FIRST MODEL (for mid_window, etc.)
    if model_type == 'svm' or model_type == "svm_rbf" or \
            model_type == 'randomforest':
        _, _, _, mid_window, mid_step, short_window, short_step, \
            compute_beat = load_model(regression_models[0], True)

    # read audio file and convert to mono
    sampling_rate, signal = audioBasicIO.read_audio_file(input_file)
    signal = audioBasicIO.stereo_to_mono(signal)
    # feature extraction:
    mid_features, s, _ = \
        aF.mid_feature_extraction(signal, sampling_rate,
                                  mid_window * sampling_rate,
                                  mid_step * sampling_rate,
                                  round(sampling_rate * short_window),
                                  round(sampling_rate * short_step))
    # long term averaging of mid-term statistics
    mid_features = mid_features.mean(axis=1)
    if compute_beat:
        beat, beat_conf = aF.beat_extraction(s, short_step)
        mid_features = np.append(mid_features, beat)
        mid_features = np.append(mid_features, beat_conf)

    # REGRESSION
    R = []
    for ir, r in enumerate(regression_models):
        if not os.path.isfile(r):
            print("fileRegression: input model_name not found!")
            return -1, -1, -1
        if model_type == 'svm' or model_type == "svm_rbf" \
                or model_type == 'randomforest':
            model, mean, std, _, _, _, _, _ = load_model(r, True)
            curFV = (mid_features - mean) / std  # normalization
            R.append(regression_wrapper(model, model_type, curFV))
    return R, regression_names
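# Usage sketch (illustrative): assumes regression models stored with a
# common prefix, e.g. hypothetical files "svmSpeechEmotion_arousal" and
# "svmSpeechEmotion_valence"; file_regression() globs on that prefix and
# returns one predicted value per model.
def _demo_regression():
    values, names = file_regression("speech.wav", "svmSpeechEmotion", "svm")
    for name, value in zip(names, values):
        print("{0:s}: {1:.2f}".format(name, value))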
def mid_feature_extraction_to_file(file_path, mid_window, mid_step,
                                   short_window, short_step, output_file,
                                   store_short_features=False,
                                   store_csv=False, plot=False):
    """
    This function is used as a wrapper to:
    a) read the content of a WAV file
    b) perform mid-term feature extraction on that signal
    c) write the mid-term feature sequences to a np file
    """
    sampling_rate, signal = audioBasicIO.read_audio_file(file_path)
    signal = audioBasicIO.stereo_to_mono(signal)
    if store_short_features:
        mid_features, short_features, _ = \
            mid_feature_extraction(signal, sampling_rate,
                                   round(sampling_rate * mid_window),
                                   round(sampling_rate * mid_step),
                                   round(sampling_rate * short_window),
                                   round(sampling_rate * short_step))
        # save st features to np file
        np.save(output_file + "_st", short_features)
        if plot:
            print("Short-term np file: " + output_file + "_st.npy saved")
        if store_csv:
            # store st features to CSV file
            np.savetxt(output_file + "_st.csv", short_features.T,
                       delimiter=",")
            if plot:
                print("Short-term CSV file: " + output_file +
                      "_st.csv saved")
    else:
        mid_features, _, _ = \
            mid_feature_extraction(signal, sampling_rate,
                                   round(sampling_rate * mid_window),
                                   round(sampling_rate * mid_step),
                                   round(sampling_rate * short_window),
                                   round(sampling_rate * short_step))
        # save mt features to np file
        np.save(output_file, mid_features)
        if plot:
            print("Mid-term np file: " + output_file + ".npy saved")
        if store_csv:
            np.savetxt(output_file + ".csv", mid_features.T, delimiter=",")
            if plot:
                print("Mid-term CSV file: " + output_file + ".csv saved")
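# Usage sketch (illustrative): stores mid-term features of a hypothetical
# "sample.wav" to "sample_mt.npy" and "sample_mt.csv" (1.0 s mid-term and
# 50 ms short-term windows/steps).
def _demo_features_to_file():
    mid_feature_extraction_to_file("sample.wav", 1.0, 1.0, 0.050, 0.050,
                                   "sample_mt", store_short_features=False,
                                   store_csv=True, plot=True)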
def file_classification(input_file, model_name, model_type):
    # Load classifier:
    if not os.path.isfile(model_name):
        print("fileClassification: input model_name not found!")
        return -1, -1, -1

    if not os.path.isfile(input_file):
        print("fileClassification: wav file not found!")
        return -1, -1, -1

    if model_type == 'knn':
        classifier, mean, std, classes, mid_window, mid_step, short_window, \
            short_step, compute_beat = load_model_knn(model_name)
    else:
        classifier, mean, std, classes, mid_window, mid_step, short_window, \
            short_step, compute_beat = load_model(model_name)

    # read audio file and convert to mono
    sampling_rate, signal = audioBasicIO.read_audio_file(input_file)
    signal = audioBasicIO.stereo_to_mono(signal)

    if sampling_rate == 0:
        # audio file IO problem
        return -1, -1, -1
    if signal.shape[0] / float(sampling_rate) <= mid_window:
        return -1, -1, -1

    # feature extraction:
    mid_features, s, _ = \
        aF.mid_feature_extraction(signal, sampling_rate,
                                  mid_window * sampling_rate,
                                  mid_step * sampling_rate,
                                  round(sampling_rate * short_window),
                                  round(sampling_rate * short_step))
    # long term averaging of mid-term statistics
    mid_features = mid_features.mean(axis=1)
    if compute_beat:
        beat, beat_conf = aF.beat_extraction(s, short_step)
        mid_features = np.append(mid_features, beat)
        mid_features = np.append(mid_features, beat_conf)
    feature_vector = (mid_features - mean) / std  # normalization

    # classification
    class_id, probability = classifier_wrapper(classifier, model_type,
                                               feature_vector)
    return class_id, probability, classes
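# Usage sketch (illustrative): assumes a pre-trained model file named
# "svm_musical_genre" (hypothetical) produced by pyAudioAnalysis training.
def _demo_classification():
    class_id, probabilities, classes = file_classification(
        "song.wav", "svm_musical_genre", "svm")
    if class_id != -1:  # -1 signals an I/O or model error above
        print("predicted:", classes[int(class_id)])
        print("posteriors:", probabilities)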
def directory_feature_extraction_no_avg(folder_path, mid_window, mid_step,
                                        short_window, short_step):
    """
    This function extracts the mid-term features of the WAVE
    files of a particular folder without averaging each file.

    ARGUMENTS:
        - folder_path:          the path of the WAVE directory
        - mid_window, mid_step: mid-term window and step (in seconds)
        - short_window, short_step: short-term window and step (in seconds)
    RETURNS:
        - mid_features:  a feature matrix (one row per mid-term window)
        - signal_idx:    an array mapping each feature row to the index of
                         the file it came from
        - wav_file_list: the list of analyzed file paths
    """

    wav_file_list = []
    signal_idx = np.array([])
    mid_features = np.array([])
    types = ('*.wav', '*.aif', '*.aiff', '*.ogg')
    for files in types:
        wav_file_list.extend(glob.glob(os.path.join(folder_path, files)))
    wav_file_list = sorted(wav_file_list)

    for i, file_path in enumerate(wav_file_list):
        sampling_rate, signal = audioBasicIO.read_audio_file(file_path)
        if sampling_rate == 0:
            continue
        signal = audioBasicIO.stereo_to_mono(signal)
        mid_feature_vector, _, _ = \
            mid_feature_extraction(signal, sampling_rate,
                                   round(mid_window * sampling_rate),
                                   round(mid_step * sampling_rate),
                                   round(sampling_rate * short_window),
                                   round(sampling_rate * short_step))

        mid_feature_vector = np.transpose(mid_feature_vector)
        if len(mid_features) == 0:
            # append feature vector
            mid_features = mid_feature_vector
            signal_idx = np.zeros((mid_feature_vector.shape[0], ))
        else:
            mid_features = np.vstack((mid_features, mid_feature_vector))
            signal_idx = np.append(
                signal_idx, i * np.ones((mid_feature_vector.shape[0], )))

    return mid_features, signal_idx, wav_file_list
def train_hmm_from_file(wav_file, gt_file, hmm_model_name, mid_window,
                        mid_step):
    """
    This function trains a HMM model for segmentation-classification
    using a single annotated audio file

    ARGUMENTS:
        - wav_file:       the path of the audio filename
        - gt_file:        the path of the ground truth filename
                          (a csv file of the form
                          <segment start in seconds>,
                          <segment end in seconds>,<segment label>
                          in each row)
        - hmm_model_name: the name of the HMM model to be stored
        - mid_window:     mid-term window size
        - mid_step:       mid-term window step
    RETURNS:
        - hmm:            an object to the resulting HMM
        - class_names:    a list of class_names

    After training, hmm, class_names, along with the mid_window and
    mid_step values are stored in the hmm_model_name file
    """

    seg_start, seg_end, seg_labs = read_segmentation_gt(gt_file)
    flags, class_names = segments_to_labels(seg_start, seg_end, seg_labs,
                                            mid_step)
    sampling_rate, signal = audioBasicIO.read_audio_file(wav_file)
    features, _, _ = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mid_window * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * 0.050),
                                   round(sampling_rate * 0.050))
    class_priors, transition_matrix, means, cov = \
        train_hmm_compute_statistics(features, flags)
    hmm = hmmlearn.hmm.GaussianHMM(class_priors.shape[0], "diag")

    hmm.covars_ = cov
    hmm.means_ = means
    hmm.startprob_ = class_priors
    hmm.transmat_ = transition_matrix

    save_hmm(hmm_model_name, hmm, class_names, mid_window, mid_step)

    return hmm, class_names
def hmm_segmentation(audio_file, hmm_model_name, plot_results=False,
                     gt_file=""):
    sampling_rate, signal = audioBasicIO.read_audio_file(audio_file)

    with open(hmm_model_name, "rb") as f_handle:
        hmm = cpickle.load(f_handle)
        class_names = cpickle.load(f_handle)
        mid_window = cpickle.load(f_handle)
        mid_step = cpickle.load(f_handle)

    features, _, _ = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mid_window * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * 0.050),
                                   round(sampling_rate * 0.050))

    # apply model
    labels = hmm.predict(features.T)
    labels_gt, class_names_gt, accuracy, cm = \
        load_ground_truth(gt_file, labels, class_names, mid_step,
                          plot_results)
    return labels, class_names, accuracy, cm
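# Usage sketch (illustrative): trains an HMM segmenter from one annotated
# file and applies it to another; "train.wav"/"train.segments" and
# "test.wav" are hypothetical paths, and 2.0 s / 1.0 s are example
# mid-term window / step values.
def _demo_hmm():
    train_hmm_from_file("train.wav", "train.segments", "hmm_model",
                        mid_window=2.0, mid_step=1.0)
    labels, class_names, _, _ = hmm_segmentation("test.wav", "hmm_model")
    print([class_names[int(l)] for l in labels])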
def annotation2files(wavFile, csvFile):
    """
    Break an audio stream to segments of interest,
    defined by a csv file
    - wavFile:  path to input wavfile
    - csvFile:  path to csvFile of segment limits
    Input CSV file must be of the format <T1>\t<T2>\t<Label>
    """
    [Fs, x] = audioBasicIO.read_audio_file(wavFile)
    with open(csvFile, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='|')
        for j, row in enumerate(reader):
            T1 = float(row[0].replace(",", "."))
            T2 = float(row[1].replace(",", "."))
            label = "%s_%s_%.2f_%.2f.wav" % (wavFile, row[2], T1, T2)
            label = label.replace(" ", "_")
            xtemp = x[int(round(T1 * Fs)):int(round(T2 * Fs))]
            print(T1, T2, label, xtemp.shape)
            wavfile.write(label, Fs, xtemp)
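# Usage sketch (illustrative): assumes "interview.wav" plus a tab-separated
# "interview.csv" with one "<start>\t<end>\t<label>" row per segment; one
# WAV file is written per row.
def _demo_annotation2files():
    annotation2files("interview.wav", "interview.csv")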
def directory_feature_extraction(folder_path, mid_window, mid_step,
                                 short_window, short_step,
                                 compute_beat=True):
    """
    This function extracts the mid-term features of the WAVE files of a
    particular folder.

    The resulting feature vector is extracted by long-term averaging the
    mid-term features. Therefore ONE FEATURE VECTOR is extracted for each
    WAV file.

    ARGUMENTS:
        - folder_path:          the path of the WAVE directory
        - mid_window, mid_step: mid-term window and step (in seconds)
        - short_window, short_step: short-term window and step (in seconds)
    """

    mid_term_features = np.array([])
    process_times = []

    types = ('*.wav', '*.aif', '*.aiff', '*.mp3', '*.au', '*.ogg')
    wav_file_list = []
    for files in types:
        wav_file_list.extend(glob.glob(os.path.join(folder_path, files)))

    wav_file_list = sorted(wav_file_list)
    wav_file_list2, mid_feature_names = [], []
    for i, file_path in enumerate(wav_file_list):
        print("Analyzing file {0:d} of {1:d}: {2:s}".format(
            i + 1, len(wav_file_list), file_path))
        if os.stat(file_path).st_size == 0:
            print("   (EMPTY FILE -- SKIPPING)")
            continue
        sampling_rate, signal = audioBasicIO.read_audio_file(file_path)
        if sampling_rate == 0:
            continue

        t1 = time.time()
        signal = audioBasicIO.stereo_to_mono(signal)
        if signal.shape[0] < float(sampling_rate) / 5:
            print("   (AUDIO FILE TOO SMALL - SKIPPING)")
            continue
        wav_file_list2.append(file_path)
        if compute_beat:
            mid_features, short_features, mid_feature_names = \
                mid_feature_extraction(signal, sampling_rate,
                                       round(mid_window * sampling_rate),
                                       round(mid_step * sampling_rate),
                                       round(sampling_rate * short_window),
                                       round(sampling_rate * short_step))
            beat, beat_conf = beat_extraction(short_features, short_step)
        else:
            mid_features, _, mid_feature_names = \
                mid_feature_extraction(signal, sampling_rate,
                                       round(mid_window * sampling_rate),
                                       round(mid_step * sampling_rate),
                                       round(sampling_rate * short_window),
                                       round(sampling_rate * short_step))

        mid_features = np.transpose(mid_features)
        # long term averaging of mid-term statistics
        mid_features = mid_features.mean(axis=0)
        if (not np.isnan(mid_features).any()) and \
                (not np.isinf(mid_features).any()):
            if compute_beat:
                mid_features = np.append(mid_features, beat)
                mid_features = np.append(mid_features, beat_conf)
            if len(mid_term_features) == 0:
                # append feature vector
                mid_term_features = mid_features
            else:
                mid_term_features = np.vstack(
                    (mid_term_features, mid_features))
            t2 = time.time()
            duration = float(len(signal)) / sampling_rate
            process_times.append((t2 - t1) / duration)
    if len(process_times) > 0:
        print("Feature extraction complexity ratio: "
              "{0:.1f} x realtime".format(
                  1.0 / np.mean(np.array(process_times))))
    return mid_term_features, wav_file_list2, mid_feature_names
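# Usage sketch (illustrative): extracts one averaged feature vector per
# audio file in a hypothetical "audio_dir/" folder (1.0 s mid-term window
# and step, 50 ms short-term window and step, no beat features).
def _demo_dir_features():
    features, files, feature_names = directory_feature_extraction(
        "audio_dir", 1.0, 1.0, 0.050, 0.050, compute_beat=False)
    # one row per file, one column per mid-term feature statistic
    print(features.shape, len(files), len(feature_names))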
def train_hmm_from_directory(folder_path, hmm_model_name, mid_window,
                             mid_step):
    """
    This function trains a HMM model for segmentation-classification using
    a directory where WAV files and .segments (ground-truth files) are
    stored

    ARGUMENTS:
        - folder_path:    the path of the data directory
        - hmm_model_name: the name of the HMM model to be stored
        - mid_window:     mid-term window size
        - mid_step:       mid-term window step
    RETURNS:
        - hmm:            an object to the resulting HMM
        - class_names:    a list of class_names

    After training, hmm, class_names, along with the mid_window and
    mid_step values are stored in the hmm_model_name file
    """

    flags_all = np.array([])
    class_names_all = []
    for i, f in enumerate(glob.glob(folder_path + os.sep + '*.wav')):
        # for each WAV file
        wav_file = f
        gt_file = f.replace('.wav', '.segments')
        if os.path.isfile(gt_file):
            seg_start, seg_end, seg_labs = read_segmentation_gt(gt_file)
            flags, class_names = \
                segments_to_labels(seg_start, seg_end, seg_labs, mid_step)
            for c in class_names:
                # update class names:
                if c not in class_names_all:
                    class_names_all.append(c)
            sampling_rate, signal = audioBasicIO.read_audio_file(wav_file)
            feature_vector, _, _ = \
                mtf.mid_feature_extraction(signal, sampling_rate,
                                           mid_window * sampling_rate,
                                           mid_step * sampling_rate,
                                           round(sampling_rate * 0.050),
                                           round(sampling_rate * 0.050))
            flag_len = len(flags)
            feat_cols = feature_vector.shape[1]
            min_sm = min(feat_cols, flag_len)
            feature_vector = feature_vector[:, 0:min_sm]
            flags = flags[0:min_sm]

            flags_new = []
            # append features and labels
            # (map each per-file label index to the global class list)
            for j, fl in enumerate(flags):
                flags_new.append(class_names_all.index(class_names[flags[j]]))

            flags_all = np.append(flags_all, np.array(flags_new))
            if i == 0:
                f_all = feature_vector
            else:
                f_all = np.concatenate((f_all, feature_vector), axis=1)

    # compute HMM statistics
    class_priors, transition_matrix, means, cov = \
        train_hmm_compute_statistics(f_all, flags_all)
    # train the HMM
    hmm = hmmlearn.hmm.GaussianHMM(class_priors.shape[0], "diag")
    hmm.covars_ = cov
    hmm.means_ = means
    hmm.startprob_ = class_priors
    hmm.transmat_ = transition_matrix

    save_hmm(hmm_model_name, hmm, class_names_all, mid_window, mid_step)

    return hmm, class_names_all
def mid_term_file_classification(input_file, model_name, model_type,
                                 plot_results=False, gt_file=""):
    """
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used,
    i.e. a pre-trained classifier.

    ARGUMENTS:
        - input_file:   path of the input WAV file
        - model_name:   name of the classification model
        - model_type:   svm or knn depending on the classifier type
        - plot_results: True if results are to be plotted using
                        matplotlib along with a set of statistics
    RETURNS:
        - labels:       a numpy array of class flags, one per fix-sized
                        mid-term segment
        - class_names:  the list of class names
        - accuracy:     classification accuracy (if gt_file is provided)
        - cm:           confusion matrix (if gt_file is provided)
    """
    labels = []
    accuracy = 0.0
    class_names = []
    cm = np.array([])
    if not os.path.isfile(model_name):
        print("mtFileClassificationError: input model_name not found!")
        return labels, class_names, accuracy, cm

    # Load classifier:
    if model_type == "knn":
        classifier, mean, std, class_names, mt_win, mid_step, st_win, \
            st_step, compute_beat = at.load_model_knn(model_name)
    else:
        classifier, mean, std, class_names, mt_win, mid_step, st_win, \
            st_step, compute_beat = at.load_model(model_name)
    if compute_beat:
        print("Model " + model_name + " contains long-term music features "
              "(beat etc) and cannot be used in segmentation")
        return labels, class_names, accuracy, cm

    # load input file
    sampling_rate, signal = audioBasicIO.read_audio_file(input_file)

    # could not read file
    if sampling_rate == 0:
        return labels, class_names, accuracy, cm

    # convert stereo (if) to mono
    signal = audioBasicIO.stereo_to_mono(signal)

    # mid-term feature extraction:
    mt_feats, _, _ = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mt_win * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * st_win),
                                   round(sampling_rate * st_step))
    posterior_matrix = []

    # for each feature vector (i.e. for each fix-sized segment):
    for col_index in range(mt_feats.shape[1]):
        # normalize current feature vector
        feature_vector = (mt_feats[:, col_index] - mean) / std

        # classify vector:
        label_predicted, posterior = \
            at.classifier_wrapper(classifier, model_type, feature_vector)
        labels.append(label_predicted)

        # update probability matrix
        posterior_matrix.append(np.max(posterior))
    labels = np.array(labels)

    # convert fix-sized flags to segments and classes
    segs, classes = labels_to_segments(labels, mid_step)
    segs[-1] = len(signal) / float(sampling_rate)
    # Load ground-truth:
    labels_gt, class_names_gt, accuracy, cm = \
        load_ground_truth(gt_file, labels, class_names, mid_step,
                          plot_results)

    return labels, class_names, accuracy, cm
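# Usage sketch (illustrative): segments a hypothetical "radio.wav" with a
# pre-trained "svm_speech_music" model (hypothetical name) and prints the
# fixed-size window labels.
def _demo_mid_term_classification():
    labels, class_names, accuracy, cm = mid_term_file_classification(
        "radio.wav", "svm_speech_music", "svm")
    print([class_names[int(l)] for l in labels])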
def speaker_diarization(filename, n_speakers, mid_window=2.0, mid_step=0.2,
                        short_window=0.05, lda_dim=35, plot_res=False):
    """
    ARGUMENTS:
        - filename:     the name of the WAV file to be analyzed
        - n_speakers:   the number of speakers (clusters) in
                        the recording (<=0 for unknown)
        - mid_window:   (opt) mid-term window size
        - mid_step:     (opt) mid-term window step
        - short_window: (opt) short-term window size
        - lda_dim:      (opt) LDA dimension (0 for no LDA)
        - plot_res:     (opt) 0 for not plotting the results, 1 for plotting
    """
    sampling_rate, signal = audioBasicIO.read_audio_file(filename)
    signal = audioBasicIO.stereo_to_mono(signal)
    duration = len(signal) / sampling_rate

    base_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "data/models")

    classifier_all, mean_all, std_all, class_names_all, _, _, _, _, _ = \
        at.load_model_knn(os.path.join(base_dir, "knn_speaker_10"))
    classifier_fm, mean_fm, std_fm, class_names_fm, _, _, _, _, _ = \
        at.load_model_knn(os.path.join(base_dir, "knn_speaker_male_female"))

    mid_feats, st_feats, _ = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mid_window * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * short_window),
                                   round(sampling_rate * short_window * 0.5))

    mid_term_features = np.zeros((mid_feats.shape[0] + len(class_names_all) +
                                  len(class_names_fm), mid_feats.shape[1]))

    for index in range(mid_feats.shape[1]):
        feature_norm_all = (mid_feats[:, index] - mean_all) / std_all
        feature_norm_fm = (mid_feats[:, index] - mean_fm) / std_fm
        _, p1 = at.classifier_wrapper(classifier_all, "knn", feature_norm_all)
        _, p2 = at.classifier_wrapper(classifier_fm, "knn", feature_norm_fm)
        start = mid_feats.shape[0]
        end = mid_feats.shape[0] + len(class_names_all)
        mid_term_features[0:mid_feats.shape[0], index] = mid_feats[:, index]
        mid_term_features[start:end, index] = p1 + 1e-4
        mid_term_features[end::, index] = p2 + 1e-4

    mid_feats = mid_term_features    # TODO
    feature_selected = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                        41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]

    mid_feats = mid_feats[feature_selected, :]

    mid_feats_norm, mean, std = at.normalize_features([mid_feats.T])
    mid_feats_norm = mid_feats_norm[0].T
    n_wins = mid_feats.shape[1]

    # remove outliers:
    dist_all = np.sum(distance.squareform(distance.pdist(mid_feats_norm.T)),
                      axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.2 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    # EnergyMin = np.min(mt_feats[1,:])
    # EnergyMean = np.mean(mt_feats[1,:])
    # Thres = (1.5 * EnergyMin + 0.5 * EnergyMean) / 2.0
    # i_non_outliers = np.nonzero(mt_feats[1,:] > Thres)[0]
    # print i_non_outliers

    mt_feats_norm_or = mid_feats_norm
    mid_feats_norm = mid_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        # extract mid-term features with minimum step:
        window_ratio = int(round(mid_window / short_window))
        step_ratio = int(round(short_window / short_window))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        for index in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        # for each of the short-term features:
        for index in range(num_of_features):
            cur_pos = 0
            feat_len = len(st_feats[index])
            while cur_pos < feat_len:
                n1 = cur_pos
                n2 = cur_pos + window_ratio
                if n2 > feat_len:
                    n2 = feat_len
                short_features = st_feats[index][n1:n2]
                mt_feats_to_red[index].append(np.mean(short_features))
                mt_feats_to_red[index + num_of_features].\
                    append(np.std(short_features))
                cur_pos += step_ratio
        mt_feats_to_red = np.array(mt_feats_to_red)
        mt_feats_to_red_2 = \
            np.zeros((mt_feats_to_red.shape[0] +
                      len(class_names_all) + len(class_names_fm),
                      mt_feats_to_red.shape[1]))
        limit = mt_feats_to_red.shape[0] + len(class_names_all)
        for index in range(mt_feats_to_red.shape[1]):
            feature_norm_all = (mt_feats_to_red[:, index] - mean_all) / \
                               std_all
            feature_norm_fm = (mt_feats_to_red[:, index] - mean_fm) / std_fm
            _, p1 = at.classifier_wrapper(classifier_all, "knn",
                                          feature_norm_all)
            _, p2 = at.classifier_wrapper(classifier_fm, "knn",
                                          feature_norm_fm)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0], index] = \
                mt_feats_to_red[:, index]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:limit, index] = \
                p1 + 1e-4
            mt_feats_to_red_2[limit::, index] = p2 + 1e-4
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[feature_selected, :]
        mt_feats_to_red, mean, std = \
            at.normalize_features([mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        labels = np.zeros((mt_feats_to_red.shape[1], ))
        lda_step = 1.0
        lda_step_ratio = lda_step / short_window
        for index in range(labels.shape[0]):
            labels[index] = int(index * short_window / lda_step_ratio)
        clf = sklearn.discriminant_analysis.\
            LinearDiscriminantAnalysis(n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, labels)
        mid_feats_norm = (clf.transform(mid_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    cluster_labels = []
    sil_all = []
    cluster_centers = []

    for speakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=speakers)
        k_means.fit(mid_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        cluster_labels.append(cls)
        cluster_centers.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(speakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / \
                             float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mid_feats_norm[:, cls == c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                dist = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(np.mean(dist) * clust_per_cent)
                sil_temp = []
                for c2 in range(speakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = \
                            np.nonzero(cls == c2)[0].shape[0] / \
                            float(len(cls))
                        mid_features_temp = mid_feats_norm[:, cls == c2]
                        dist = distance.cdist(mt_feats_norm_temp.T,
                                              mid_features_temp.T)
                        sil_temp.append(np.mean(dist) *
                                        (clust_per_cent +
                                         clust_per_cent_2) / 2.0)
                sil_temp = np.array(sil_temp)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(sil_temp))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        sil = []
        for c in range(speakers):
            # for each cluster (speaker) compute silhouette
            sil.append((sil_2[c] - sil_1[c]) /
                       (max(sil_2[c], sil_1[c]) + 1e-5))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(np.mean(sil))

    imax = int(np.argmax(sil_all))
    # optimal number of clusters
    num_speakers = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = np.zeros((n_wins,))
    for index in range(n_wins):
        j = np.argmin(np.abs(index - i_non_outliers))
        cls[index] = cluster_labels[imax][j]

    # Post-process method 1: hmm smoothing
    for index in range(1):
        # hmm training
        start_prob, transmat, means, cov = \
            train_hmm_compute_statistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    class_names = ["speaker{0:d}".format(c) for c in range(num_speakers)]

    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if groundtruth exists
    if os.path.isfile(gt_file):
        seg_start, seg_end, seg_labs = read_segmentation_gt(gt_file)
        flags_gt, class_names_gt = segments_to_labels(seg_start, seg_end,
                                                      seg_labs, mid_step)

    if plot_res:
        fig = plt.figure()
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(np.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(np.array(range(len(cls))) * mid_step + mid_step / 2.0, cls)

    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(np.array(range(len(flags_gt))) *
                     mid_step + mid_step / 2.0, flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
            evaluate_speaker_diarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(
                          100 * purity_cluster_m, 100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's silhouette")
        plt.show()
    return (np.array(range(len(cls))) * mid_step + mid_step / 2.0, cls)
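# Usage sketch (illustrative): diarizes a hypothetical "meeting.wav" with
# the speaker_diarization() defined directly above, using an unknown number
# of speakers (<=0 triggers the silhouette-based search over 2..9 clusters).
def _demo_diarization():
    timestamps, cls = speaker_diarization("meeting.wav", n_speakers=-1,
                                          plot_res=False)
    # cls[i] is the speaker/cluster id of the mid-term window centered at
    # timestamps[i] (seconds)
    print(set(cls))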
def speaker_diarization(filename, n_speakers, mid_window=2.0, mid_step=0.2,
                        short_window=0.05, lda_dim=35, plot_res=False):
    """
    ARGUMENTS:
        - filename:     the name of the WAV file to be analyzed
        - n_speakers:   the number of speakers (clusters) in
                        the recording (<=0 for unknown)
        - mid_window:   (opt) mid-term window size
        - mid_step:     (opt) mid-term window step
        - short_window: (opt) short-term window size
        - lda_dim:      (opt) LDA dimension (0 for no LDA)
        - plot_res:     (opt) 0 for not plotting the results, 1 for plotting
    """
    sampling_rate, signal = audioBasicIO.read_audio_file(filename)
    signal = audioBasicIO.stereo_to_mono(signal)
    duration = len(signal) / sampling_rate

    base_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "data/models")

    classifier_all, mean_all, std_all, class_names_all, _, _, _, _, _ = \
        at.load_model_knn(os.path.join(base_dir, "knn_speaker_10"))
    classifier_fm, mean_fm, std_fm, class_names_fm, _, _, _, _, _ = \
        at.load_model_knn(os.path.join(base_dir, "knn_speaker_male_female"))

    mid_feats, st_feats, _ = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mid_window * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * short_window),
                                   round(sampling_rate * short_window * 0.5))

    mid_term_features = np.zeros((mid_feats.shape[0] + len(class_names_all) +
                                  len(class_names_fm), mid_feats.shape[1]))

    for index in range(mid_feats.shape[1]):
        feature_norm_all = (mid_feats[:, index] - mean_all) / std_all
        feature_norm_fm = (mid_feats[:, index] - mean_fm) / std_fm
        _, p1 = at.classifier_wrapper(classifier_all, "knn", feature_norm_all)
        _, p2 = at.classifier_wrapper(classifier_fm, "knn", feature_norm_fm)
        start = mid_feats.shape[0]
        end = mid_feats.shape[0] + len(class_names_all)
        mid_term_features[0:mid_feats.shape[0], index] = mid_feats[:, index]
        mid_term_features[start:end, index] = p1 + 1e-4
        mid_term_features[end::, index] = p2 + 1e-4

    mid_feats = mid_term_features    # TODO
    feature_selected = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                        41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]

    mid_feats = mid_feats[feature_selected, :]

    mid_feats_norm, mean, std = at.normalize_features([mid_feats.T])
    mid_feats_norm = mid_feats_norm[0].T
    n_wins = mid_feats.shape[1]

    dist_all = np.sum(distance.squareform(distance.pdist(mid_feats_norm.T)),
                      axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.2 * m_dist_all)[0]

    mt_feats_norm_or = mid_feats_norm
    mid_feats_norm = mid_feats_norm[:, i_non_outliers]

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    cluster_labels = []
    sil_all = []
    cluster_centers = []

    for speakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=speakers)
        k_means.fit(mid_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        cluster_labels.append(cls)
        cluster_centers.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(speakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / \
                             float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                mt_feats_norm_temp = mid_feats_norm[:, cls == c]
                dist = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(np.mean(dist) * clust_per_cent)
                sil_temp = []
                for c2 in range(speakers):
                    if c2 != c:
                        clust_per_cent_2 = \
                            np.nonzero(cls == c2)[0].shape[0] / \
                            float(len(cls))
                        mid_features_temp = mid_feats_norm[:, cls == c2]
                        dist = distance.cdist(mt_feats_norm_temp.T,
                                              mid_features_temp.T)
                        sil_temp.append(np.mean(dist) *
                                        (clust_per_cent +
                                         clust_per_cent_2) / 2.0)
                sil_temp = np.array(sil_temp)
                sil_2.append(min(sil_temp))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        sil = []
        for c in range(speakers):
            sil.append((sil_2[c] - sil_1[c]) /
                       (max(sil_2[c], sil_1[c]) + 1e-5))
        sil_all.append(np.mean(sil))

    imax = int(np.argmax(sil_all))
    num_speakers = s_range[imax]

    cls = np.zeros((n_wins,))
    for index in range(n_wins):
        j = np.argmin(np.abs(index - i_non_outliers))
        cls[index] = cluster_labels[imax][j]

    for index in range(1):
        start_prob, transmat, means, cov = \
            train_hmm_compute_statistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    class_names = ["speaker{0:d}".format(c) for c in range(num_speakers)]

    if plot_res:
        fig = plt.figure(figsize=(10, 4))
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(np.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        list_labels = np.array(range(len(cls))) * mid_step + mid_step
        ax1.set_xticks(list_labels[::25])
        ax1.plot(np.array(range(len(cls))) * mid_step + mid_step, cls)

    if plot_res:
        plt.xlabel("time (seconds)")
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's silhouette")
        plt.savefig('foo.png')
    return cls, sampling_rate, len(signal)