def extract_time_start(video_path, bip_ref_path="ref_bip_isolated.wav"):
    # features of the reference beep:
    # extract short-term features using 50 msec non-overlapping windows
    fs, s_ref = aIO.read_audio_file(bip_ref_path)
    duration = len(s_ref) / float(fs)
    win, step = 0.05, 0.05
    win_mid, step_mid = duration, 0.5
    mt_ref, st_ref, mt_n_ref = aFm.mid_feature_extraction(
        s_ref, fs, win_mid * fs, step_mid * fs, win * fs, step * fs)

    # extraction on the long signal
    my_clip1 = mp.VideoFileClip(video_path)
    fs = 44100
    s_long = my_clip1.audio.to_soundarray(fps=fs)
    s_long = s_long[:, 0]
    duration_long = len(s_long) / float(fs)
    # extract short-term features using 50 msec non-overlapping windows
    win, step = 0.05, 0.05
    win_mid, step_mid = 0.4, 0.05
    mt_long, st_long, mt_n_long = aFm.mid_feature_extraction(
        s_long, fs, win_mid * fs, step_mid * fs, win * fs, step * fs)

    # compute the distance and get the minimum
    distances = np.linalg.norm(mt_long - mt_ref, axis=0)
    time_start = np.argmin(distances) * duration_long / mt_long.shape[1]
    return time_start
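# A minimal usage sketch for extract_time_start, under the import assumptions
# the function itself makes (numpy as np, moviepy.editor as mp, pyAudioAnalysis
# audioBasicIO as aIO and MidTermFeatures as aFm). "race.mp4" is a hypothetical
# input video; the reference beep template "ref_bip_isolated.wav" must exist.
import numpy as np
import moviepy.editor as mp
from pyAudioAnalysis import audioBasicIO as aIO
from pyAudioAnalysis import MidTermFeatures as aFm

t0 = extract_time_start("race.mp4")  # placeholder path
print(f"start beep detected at ~{t0:.2f} s")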
def analysisAudio(vid_uuid, analysis_uuid):
    with open("../../../data/processed/" + str(vid_uuid) + "-" +
              str(analysis_uuid) + "_extracted.interest.csv") as interestfile:
        interest_reader = csv.reader(interestfile, delimiter=',')
        interest_header = next(interest_reader, None)
        minFrame = int(list(next(interest_reader, None))[0])
        test = reversed(list(interest_reader))
        maxFrame = int(list(next(test, None))[0])

    startTime = minFrame / 60  # frame index -> seconds (assuming 60 fps footage)
    endTime = maxFrame / 60
    clip = mp.VideoFileClip("../../../data/raw/" + str(vid_uuid) +
                            "/replay.mp4").subclip(startTime, endTime)
    clip.audio.write_audiofile("../../../data/raw/" + str(vid_uuid) + "/" +
                               str(analysis_uuid) + "-audio.wav")

    VIDEOFILE = "../../../data/raw/" + str(vid_uuid) + "/replay.mp4"
    AUDIOFILE = "../../../data/raw/" + str(vid_uuid) + "/" + str(
        analysis_uuid) + "-audio.wav"
    FEATUREFILE = "../../../data/processed/" + str(vid_uuid) + "-" + str(
        analysis_uuid) + "_extracted.ft"

    [Fs, x] = audioBasicIO.read_audio_file(AUDIOFILE)
    x = audioBasicIO.stereo_to_mono(x)
    midF, shortF, midFNames = MidTermFeatures.mid_feature_extraction(
        x, Fs, (1 / 30) * Fs, (1 / 60) * Fs, (1 / 60) * Fs, (1 / 120) * Fs)
    np.save(FEATUREFILE, midF)
    np.savetxt(FEATUREFILE + ".csv", midF.T, delimiter=",",
               header=",".join(midFNames))

    #%%
    audioAnalysis.thumbnailWrapper(AUDIOFILE, 50)
def test_feature_extraction_segment():
    print("Mid-term (segment) feature extraction")
    [fs, x] = audioBasicIO.read_audio_file("test_data/5_sec_wav.wav")
    mt, st, mt_names = MidTermFeatures.mid_feature_extraction(
        x, fs, 1 * fs, 1 * fs, 0.05 * fs, 0.05 * fs)
    assert mt.shape[1] == 5, "Wrong number of mid-term windows"
    assert mt.shape[0] == len(mt_names), "Number of features and feature " \
                                         "names are not the same"
def file_regression(input_file, model_name, model_type):
    # Load classifier:
    if not os.path.isfile(input_file):
        print("file_regression: wav file not found!")
        return -1, -1, -1

    # regression_models = glob.glob(model_name + "_*")
    # changed: model_name is passed in as a list of model paths instead
    regression_models = model_name
    regression_models2 = []
    for r in regression_models:
        if r[-5::] != "MEANS":
            regression_models2.append(r)
    regression_models = regression_models2
    regression_names = []
    for r in regression_models:
        regression_names.append(r[r.rfind("_") + 1::])

    # FEATURE EXTRACTION
    # LOAD ONLY THE FIRST MODEL (for mid_window, etc.)
    if model_type == 'svm' or model_type == "svm_rbf" or \
            model_type == 'randomforest':
        _, _, _, mid_window, mid_step, short_window, short_step, compute_beat \
            = load_model(regression_models[0], True)

    # read audio file and convert to mono
    sampling_rate, signal = audioBasicIO.read_audio_file(input_file)
    signal = audioBasicIO.stereo_to_mono(signal)
    # feature extraction:
    mid_features, s, _ = \
        aF.mid_feature_extraction(signal, sampling_rate,
                                  mid_window * sampling_rate,
                                  mid_step * sampling_rate,
                                  round(sampling_rate * short_window),
                                  round(sampling_rate * short_step))
    # long term averaging of mid-term statistics
    mid_features = mid_features.mean(axis=1)
    if compute_beat:
        beat, beat_conf = aF.beat_extraction(s, short_step)
        mid_features = np.append(mid_features, beat)
        mid_features = np.append(mid_features, beat_conf)

    # REGRESSION
    R = []
    for ir, r in enumerate(regression_models):
        if not os.path.isfile(r):
            print("file_regression: input model_name not found!")
            return -1, -1, -1
        if model_type == 'svm' or model_type == "svm_rbf" \
                or model_type == 'randomforest':
            model, mean, std, _, _, _, _, _ = load_model(r, True)
            curFV = (mid_features - mean) / std  # normalization
            R.append(regression_wrapper(model, model_type, curFV))  # regression
    return R, regression_names
def fileRegression(inputFile, model_name, model_type):
    # Load classifier:
    if not os.path.isfile(inputFile):
        print("fileRegression: wav file not found!")
        return (-1, -1, -1)

    regression_models = glob.glob(model_name + "_*")
    regression_models2 = []
    for r in regression_models:
        if r[-5::] != "MEANS":
            regression_models2.append(r)
    regression_models = regression_models2
    regression_names = []
    for r in regression_models:
        regression_names.append(r[r.rfind("_") + 1::])

    # FEATURE EXTRACTION
    # LOAD ONLY THE FIRST MODEL (for mt_win, etc.)
    if model_type == 'svm' or model_type == "svm_rbf" or \
            model_type == 'randomforest':
        [_, _, _, mt_win, mt_step, st_win, st_step, compute_beat] = \
            load_model(regression_models[0], True)

    # read audio file and convert to mono
    [Fs, x] = audioBasicIO.read_audio_file(inputFile)
    x = audioBasicIO.stereo_to_mono(x)
    # feature extraction:
    [mt_features, s, _] = aF.mid_feature_extraction(x, Fs, mt_win * Fs,
                                                    mt_step * Fs,
                                                    round(Fs * st_win),
                                                    round(Fs * st_step))
    # long term averaging of mid-term statistics
    mt_features = mt_features.mean(axis=1)
    if compute_beat:
        [beat, beatConf] = aF.beat_extraction(s, st_step)
        mt_features = np.append(mt_features, beat)
        mt_features = np.append(mt_features, beatConf)

    # REGRESSION
    R = []
    for ir, r in enumerate(regression_models):
        if not os.path.isfile(r):
            print("fileRegression: input model_name not found!")
            return (-1, -1, -1)
        if model_type == 'svm' or model_type == "svm_rbf" \
                or model_type == 'randomforest':
            [model, MEAN, STD, mt_win, mt_step, st_win, st_step,
             compute_beat] = load_model(r, True)
            curFV = (mt_features - MEAN) / STD  # normalization
            R.append(regressionWrapper(model, model_type, curFV))  # regression
    return R, regression_names
def features(file_path):
    fs, s = aIO.read_audio_file(file_path)
    m_win, m_step, s_win, s_step = 1, 1, 0.1, 0.05
    mid_features, short_features, mid_feature_names = \
        aF.mid_feature_extraction(s, fs, round(fs * m_win),
                                  round(fs * m_step), round(fs * s_win),
                                  round(fs * s_step))
    mid_features = np.transpose(mid_features).mean(axis=0)
    beat, beat_conf = aF.beat_extraction(short_features, s_step)
    mid_features = np.append(mid_features, beat)
    mid_features = np.append(mid_features, beat_conf)
    mid_feature_names.append('beat')
    mid_feature_names.append('beat_conf')
    return mid_features, mid_feature_names
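# Hedged usage sketch for features(): it returns one long-term averaged
# mid-term vector plus the two appended beat features. Assumes pyAudioAnalysis
# is installed with the aliases used above; "sample.wav" is a placeholder.
import numpy as np
from pyAudioAnalysis import audioBasicIO as aIO
from pyAudioAnalysis import MidTermFeatures as aF

fv, names = features("sample.wav")  # placeholder input file
for name, value in zip(names, fv):
    print(f"{name}: {value:.4f}")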
def exp3():
    fs, s = aIO.read_audio_file(AfeExp.wav_file)
    mt, st, mt_n = aMF.mid_feature_extraction(s, fs, 1 * fs, 1 * fs,
                                              0.05 * fs, 0.05 * fs)
    print(f'signal duration {len(s)/fs} seconds')
    print(f'{st.shape[1]} {st.shape[0]}-D short-term feature vectors extracted')
    print(f'{mt.shape[1]} {mt.shape[0]}-D segment feature statistic vectors '
          f'extracted')
    print('mid-term feature names')
    for i, mi in enumerate(mt_n):
        print(f'{i}:{mi}')
def hmmSegmentation(wav_file_name, hmm_model_name, plot_res=False,
                    gt_file_name=""):
    [fs, x] = audioBasicIO.read_audio_file(wav_file_name)
    try:
        fo = open(hmm_model_name, "rb")
    except IOError:
        print("hmmSegmentation: HMM model file not found!")
        return

    try:
        hmm = cPickle.load(fo)
        classes_all = cPickle.load(fo)
        mt_win = cPickle.load(fo)
        mt_step = cPickle.load(fo)
    except:
        fo.close()
    fo.close()

    [Features, _, _] = aF.mid_feature_extraction(x, fs, mt_win * fs,
                                                 mt_step * fs,
                                                 round(fs * 0.050),
                                                 round(fs * 0.050))
    # apply model
    flags_ind = hmm.predict(Features.T)
    if os.path.isfile(gt_file_name):
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file_name)
        flags_gt, class_names_gt = segs2flags(seg_start, seg_end, seg_labs,
                                              mt_step)
        flagsGTNew = []
        for j, fl in enumerate(flags_gt):
            # "align" labels with GT
            if class_names_gt[flags_gt[j]] in classes_all:
                flagsGTNew.append(
                    classes_all.index(class_names_gt[flags_gt[j]]))
            else:
                flagsGTNew.append(-1)
        cm = np.zeros((len(classes_all), len(classes_all)))
        flags_ind_gt = np.array(flagsGTNew)
        for i in range(min(flags_ind.shape[0], flags_ind_gt.shape[0])):
            cm[int(flags_ind_gt[i]), int(flags_ind[i])] += 1
    else:
        flags_ind_gt = np.array([])
    acc = plotSegmentationResults(flags_ind, flags_ind_gt, classes_all,
                                  mt_step, not plot_res)
    if acc >= 0:
        print("Overall Accuracy: {0:.2f}".format(acc))
        return (flags_ind, class_names_gt, acc, cm)
    else:
        return (flags_ind, classes_all, -1, -1)
def fileClassification(inputFile, model_name, model_type):
    # Load classifier:
    if not os.path.isfile(model_name):
        print("fileClassification: input model_name not found!")
        return (-1, -1, -1)

    if not os.path.isfile(inputFile):
        print("fileClassification: wav file not found!")
        return (-1, -1, -1)

    if model_type == 'knn':
        [classifier, MEAN, STD, classNames, mt_win, mt_step, st_win, st_step,
         compute_beat] = load_model_knn(model_name)
    else:
        [classifier, MEAN, STD, classNames, mt_win, mt_step, st_win, st_step,
         compute_beat] = load_model(model_name)

    # read audio file and convert to mono
    [Fs, x] = audioBasicIO.read_audio_file(inputFile)
    x = audioBasicIO.stereo_to_mono(x)

    if Fs == 0:
        # audio file IO problem
        return -1, -1, -1
    if x.shape[0] / float(Fs) <= mt_win:
        return -1, -1, -1

    # feature extraction:
    [mt_features, s, _] = aF.mid_feature_extraction(x, Fs, mt_win * Fs,
                                                    mt_step * Fs,
                                                    round(Fs * st_win),
                                                    round(Fs * st_step))
    # long term averaging of mid-term statistics
    mt_features = mt_features.mean(axis=1)
    if compute_beat:
        [beat, beatConf] = aF.beat_extraction(s, st_step)
        mt_features = np.append(mt_features, beat)
        mt_features = np.append(mt_features, beatConf)
    curFV = (mt_features - MEAN) / STD  # normalization

    # classification
    [Result, P] = classifierWrapper(classifier, model_type, curFV)
    return Result, P, classNames
def extract_segment_features(self, filenames):
    """
    Extract segment features using pyAudioAnalysis

    Parameters
    ----------
    filenames :
        List of input audio filenames

    The framing parameters are read from self.basic_features_params,
    a dictionary that must contain:
        - mid_window: mid-term (segment) window size
        - mid_step: mid-term (segment) window step
        - short_window: short-term (frame) window size
        - short_step: short-term (frame) window step

    Returns
    -------
    segment_features_all:
        List of stats on segment features
    feature_names:
        List of feature names
    """
    print("--> Extracting audio features")
    segment_features_all = []

    sequences, sampling_rate = self.read_files(filenames)

    mid_window = self.basic_features_params['mid_window']
    mid_step = self.basic_features_params['mid_step']
    short_window = self.basic_features_params['short_window']
    short_step = self.basic_features_params['short_step']

    for seq in sequences:
        (segment_features_stats, segment_features,
         feature_names) = aF.mid_feature_extraction(
             seq, sampling_rate,
             round(mid_window * sampling_rate),
             round(mid_step * sampling_rate),
             round(sampling_rate * short_window),
             round(sampling_rate * short_step))
        segment_features_stats = np.asarray(segment_features_stats)
        segment_features_all.append(segment_features_stats)

    return segment_features_all, feature_names
def file_classification(input_file, model_name, model_type):
    # Load classifier:
    if not os.path.isfile(model_name):
        print("fileClassification: input model_name not found!")
        return -1, -1, -1

    if not os.path.isfile(input_file):
        print("fileClassification: wav file not found!")
        return -1, -1, -1

    if model_type == 'knn':
        classifier, mean, std, classes, mid_window, mid_step, short_window, \
            short_step, compute_beat = load_model_knn(model_name)
    else:
        classifier, mean, std, classes, mid_window, mid_step, short_window, \
            short_step, compute_beat = load_model(model_name)

    # read audio file and convert to mono
    sampling_rate, signal = audioBasicIO.read_audio_file(input_file)
    signal = audioBasicIO.stereo_to_mono(signal)

    if sampling_rate == 0:
        # audio file IO problem
        return -1, -1, -1
    if signal.shape[0] / float(sampling_rate) <= mid_window:
        return -1, -1, -1

    # feature extraction:
    mid_features, s, _ = \
        aF.mid_feature_extraction(signal, sampling_rate,
                                  mid_window * sampling_rate,
                                  mid_step * sampling_rate,
                                  round(sampling_rate * short_window),
                                  round(sampling_rate * short_step))
    # long term averaging of mid-term statistics
    mid_features = mid_features.mean(axis=1)
    if compute_beat:
        beat, beat_conf = aF.beat_extraction(s, short_step)
        mid_features = np.append(mid_features, beat)
        mid_features = np.append(mid_features, beat_conf)

    feature_vector = (mid_features - mean) / std  # normalization

    # classification
    class_id, probability = classifier_wrapper(classifier, model_type,
                                               feature_vector)
    return class_id, probability, classes
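# Hedged usage sketch for file_classification with one of the pre-trained
# models shipped with pyAudioAnalysis (the "svm_rbf_sm" speech/music model
# path is an assumption; adjust to your installation).
class_id, probabilities, classes = file_classification(
    "sample.wav",               # placeholder input file
    "data/models/svm_rbf_sm",   # assumed model path
    "svm_rbf")
if class_id != -1:
    print("predicted:", classes[int(class_id)],
          "with probability", probabilities[int(class_id)])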
def compute_features_paa(filename, with_timebase=False, verbose=False):
    """compute_features_paa

    Compute a bag of standard audio features to be used for some
    downstream task.
    """
    if verbose:
        print('compute_features_paa loading from {0}'.format(filename))
    [Fs, x_] = audioBasicIO.read_audio_file(filename)
    if verbose:
        print('compute_features_paa: loaded {1} samples from {0}'.format(
            filename, x_.shape))

    if len(x_.shape) > 1 and x_.shape[1] > 1:
        x = audioBasicIO.stereo_to_mono(x_)
    else:
        x = x_

    x_duration = x.shape[0] / Fs
    if verbose:
        print(f'compute_features_paa: {x_duration} seconds of audio at {Fs}Hz')

    mt_win = 1.0 * Fs
    mt_step = 0.5 * Fs
    st_win = 0.050 * Fs
    st_step = 0.025 * Fs
    # F, F_names = audioFeatureExtraction.stFeatureExtraction(x, Fs, st_win, st_step)
    # G, F, F_names = audioFeatureExtraction.mtFeatureExtraction(x, Fs, mt_win, mt_step, st_win, st_step)
    G, F, F_names = mF.mid_feature_extraction(x, Fs, mt_win, mt_step,
                                              st_win, st_step)
    if with_timebase:
        G_time = np.linspace(0, G.shape[1] * 0.5, G.shape[1] + 1)
        F_time = np.linspace(0, F.shape[1] * 0.025, F.shape[1] + 1)
    else:
        G_time = None
        F_time = None

    if verbose:
        print(f'compute_features_paa: F = {F.shape} {F}')
        print(f'compute_features_paa: {F_time}')
        print(f'compute_features_paa: G = {G.shape} {G}')
        print(f'compute_features_paa: {G_time}')

    if with_timebase:
        return F, F_names, G, F_time, G_time
    else:
        return F, F_names, G
def extract_afs(wav_file):
    fs, s = aIO.read_audio_file(wav_file)
    mt, st, mt_n = aMF.mid_feature_extraction(s, fs, 1 * fs, 1 * fs,
                                              0.05 * fs, 0.05 * fs)
    '''
    print(f'signal duration {len(s)/fs} seconds')
    print(f'{st.shape[1]} {st.shape[0]}-D short-term feature vectors extracted')
    print(f'{mt.shape[1]} {mt.shape[0]}-D segment feature statistic vectors extracted')
    print('mid-term feature names')
    for i, mi in enumerate(mt_n):
        print(f'{i}:{mi}')
    '''
    mtf = np.mean(mt, axis=1)
    feats = np.array([mtf[mt_n.index('spectral_centroid_mean')],
                      mtf[mt_n.index('energy_entropy_mean')]])
    return feats
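# Usage sketch for extract_afs: reads a WAV and returns a 2-D vector of
# (mean spectral centroid, mean energy entropy). "sample.wav" is a
# placeholder file name.
feats = extract_afs("sample.wav")
print("spectral_centroid_mean = {:.4f}, energy_entropy_mean = {:.4f}"
      .format(feats[0], feats[1]))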
def train_hmm_from_file(wav_file, gt_file, hmm_model_name, mid_window,
                        mid_step):
    """
    This function trains a HMM model for segmentation-classification using
    a single annotated audio file
    ARGUMENTS:
     - wav_file:        the path of the audio filename
     - gt_file:         the path of the ground truth filename
                        (a csv file of the form
                        <segment start in seconds>,<segment end in seconds>,
                        <segment label> in each row)
     - hmm_model_name:  the name of the HMM model to be stored
     - mid_window:      mid-term window size
     - mid_step:        mid-term window step
    RETURNS:
     - hmm:             an object to the resulting HMM
     - class_names:     a list of class_names

    After training, hmm, class_names, along with the mid_window and mid_step
    values are stored in the hmm_model_name file
    """

    seg_start, seg_end, seg_labs = read_segmentation_gt(gt_file)
    flags, class_names = segments_to_labels(seg_start, seg_end, seg_labs,
                                            mid_step)
    sampling_rate, signal = audioBasicIO.read_audio_file(wav_file)
    features, _, _ = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mid_window * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * 0.050),
                                   round(sampling_rate * 0.050))
    class_priors, transition_matrix, means, cov = \
        train_hmm_compute_statistics(features, flags)
    hmm = hmmlearn.hmm.GaussianHMM(class_priors.shape[0], "diag")

    hmm.covars_ = cov
    hmm.means_ = means
    hmm.startprob_ = class_priors
    hmm.transmat_ = transition_matrix

    save_hmm(hmm_model_name, hmm, class_names, mid_window, mid_step)

    return hmm, class_names
def mid_term_feat_extraction(wav_file_path):
    sampling_rate, signal = audioBasicIO.read_audio_file(wav_file_path)
    if sampling_rate == 0:
        print('Sampling rate not correct.')
        return None

    signal = audioBasicIO.stereo_to_mono(signal)
    if signal.shape[0] < float(sampling_rate) / 5:
        print("The duration of the audio is too short.")
        return None

    mid_window, mid_step, short_window, short_step = 0.5, 0.5, 0.05, 0.05
    mid_features, _, mid_feature_names = \
        MidTermFeatures.mid_feature_extraction(
            signal, sampling_rate, round(mid_window * sampling_rate),
            round(mid_step * sampling_rate),
            round(sampling_rate * short_window),
            round(sampling_rate * short_step))
    mid_features = np.transpose(mid_features)
    # long term averaging of mid-term statistics
    mid_features = mid_features.mean(axis=0)

    if (not np.isnan(mid_features).any()) and \
            (not np.isinf(mid_features).any()):
        # print('Mid-term features extracted correctly.')
        mid_dict = dict(zip(mid_feature_names, mid_features))
        mid_df = pd.DataFrame([mid_dict.values()], columns=mid_dict.keys())

        # openSMILE audio feature extraction
        smile = opensmile.Smile(
            feature_set=opensmile.FeatureSet.eGeMAPSv01b,
            feature_level=opensmile.FeatureLevel.Functionals,
        )
        smile_features = smile.process_signal(signal, sampling_rate)
        smile_df = pd.DataFrame(smile_features).reset_index().iloc[:, 2:]
        final_df = pd.concat([mid_df, smile_df], axis=1)
        # excel_path = wav_file_path.strip('.') + 'features_extracted.xlsx'
        # final_df.to_excel(excel_path)
        return final_df
    else:
        # print('Mid-term features extracted incorrectly.')
        return None
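# Hedged usage sketch for mid_term_feat_extraction: on success it returns a
# one-row pandas DataFrame that concatenates pyAudioAnalysis mid-term
# statistics with openSMILE eGeMAPS functionals, and None on bad input.
# "speech.wav" is a placeholder path.
df = mid_term_feat_extraction("speech.wav")
if df is not None:
    print(df.shape)        # (1, n_pyAudioAnalysis + n_openSMILE columns)
    print(df.iloc[0].head())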
def trainHMM_fromFile(wav_file, gt_file, hmm_model_name, mt_win, mt_step):
    """
    This function trains a HMM model for segmentation-classification using
    a single annotated audio file
    ARGUMENTS:
     - wav_file:        the path of the audio filename
     - gt_file:         the path of the ground truth filename
                        (a csv file of the form
                        <segment start in seconds>,<segment end in seconds>,
                        <segment label> in each row)
     - hmm_model_name:  the name of the HMM model to be stored
     - mt_win:          mid-term window size
     - mt_step:         mid-term window step
    RETURNS:
     - hmm:             an object to the resulting HMM
     - class_names:     a list of class_names

    After training, hmm, class_names, along with the mt_win and mt_step
    values are stored in the hmm_model_name file
    """

    [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
    flags, class_names = segs2flags(seg_start, seg_end, seg_labs, mt_step)
    [fs, x] = audioBasicIO.read_audio_file(wav_file)
    [F, _, _] = aF.mid_feature_extraction(x, fs, mt_win * fs, mt_step * fs,
                                          round(fs * 0.050),
                                          round(fs * 0.050))
    start_prob, transmat, means, cov = trainHMM_computeStatistics(F, flags)
    hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
    hmm.startprob_ = start_prob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov
    fo = open(hmm_model_name, "wb")
    cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(class_names, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mt_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mt_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()
    return hmm, class_names
def hmm_segmentation(audio_file, hmm_model_name, plot_results=False,
                     gt_file=""):
    sampling_rate, signal = audioBasicIO.read_audio_file(audio_file)

    with open(hmm_model_name, "rb") as f_handle:
        hmm = cpickle.load(f_handle)
        class_names = cpickle.load(f_handle)
        mid_window = cpickle.load(f_handle)
        mid_step = cpickle.load(f_handle)

    features, _, _ = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mid_window * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * 0.050),
                                   round(sampling_rate * 0.050))

    # apply model
    labels = hmm.predict(features.T)
    labels_gt, class_names_gt, accuracy, cm = \
        load_ground_truth(gt_file, labels, class_names, mid_step,
                          plot_results)
    return labels, class_names, accuracy, cm
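# Hedged sketch tying train_hmm_from_file and hmm_segmentation together:
# train on one annotated file, then segment another. File names are
# placeholders; the .segments CSV format is
# <start seconds>,<end seconds>,<label> per row.
hmm, class_names = train_hmm_from_file("train.wav", "train.segments",
                                       "hmm_model", mid_window=1.0,
                                       mid_step=0.5)
labels, class_names, accuracy, cm = hmm_segmentation("test.wav", "hmm_model",
                                                     plot_results=False)
print([class_names[int(l)] for l in labels[:10]])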
[classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1,
 computeBEAT1] = aT.load_model_knn(
     os.path.join(os.path.dirname(os.path.realpath(__file__)),
                  "data/models", "knn_speaker_10"))
[classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2,
 computeBEAT2] = aT.load_model_knn(
     os.path.join(os.path.dirname(os.path.realpath(__file__)),
                  "data/models", "knn_speaker_male_female"))

[mt_feats, st_feats, _] = aF.mid_feature_extraction(x, fs, mt_size * fs,
                                                    mt_step * fs,
                                                    round(fs * st_win),
                                                    round(fs * st_win * 0.5))

MidTermFeatures2 = np.zeros((mt_feats.shape[0] + len(classNames1) +
                             len(classNames2), mt_feats.shape[1]))

for i in range(mt_feats.shape[1]):
    cur_f1 = (mt_feats[:, i] - MEAN1) / STD1
    cur_f2 = (mt_feats[:, i] - MEAN2) / STD2
    [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
    [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
    MidTermFeatures2[0:mt_feats.shape[0], i] = mt_feats[:, i]
    MidTermFeatures2[mt_feats.shape[0]:mt_feats.shape[0] + len(classNames1),
                     i] = P1 + 0.0001
    MidTermFeatures2[mt_feats.shape[0] + len(classNames1)::, i] = P2 + 0.0001
import plotly.graph_objs as go
import plotly
import wavio

# extraction on the ref
fs, s = aIO.read_audio_file("wav/ref_bip.wav")
s_ref = s[3000:18000, 0]
print(fs, s_ref.shape)
duration = len(s_ref) / float(fs)
print(f'duration = {duration} seconds')

# extract short-term features using 50 msec non-overlapping windows
win, step = 0.05, 0.05
win_mid, step_mid = duration, 0.5
mt_ref, st_ref, mt_n_ref = aFm.mid_feature_extraction(s_ref, fs,
                                                      win_mid * fs,
                                                      step_mid * fs,
                                                      win * fs, step * fs)

# print(f'signal duration {len(s)/fs} seconds')
# print(f'{st.shape[1]} {st.shape[0]}-D short-term feature vectors extracted')
# print(f'{mt.shape[1]} {mt.shape[0]}-D segment feature statistic vectors extracted')
# print('mid-term feature names')
# for i, mi in enumerate(mt_n):
#     print(f'{i}:{mi}')

# extraction on the long signal
audio_to_analyse = "50_brasse_stevens.wav"
# fs, s_long = aIO.read_audio_file("wav/200_4n_dames_finaleA_f122020_gauche_lowered.wav")  # 1.9
# fs, s_long = aIO.read_audio_file("wav/50_dos_dames_finaleA_f122020_gauche_lowered.wav")  # 6
fs, s_long = aIO.read_audio_file("wav/" + audio_to_analyse)
s = s_long[:, 0]
print(fs, s.shape)
duration_long = len(s) / float(fs)
def trainHMM_fromDir(dirPath, hmm_model_name, mt_win, mt_step):
    """
    This function trains a HMM model for segmentation-classification using
    a directory where WAV files and .segments (ground-truth) files are stored
    ARGUMENTS:
     - dirPath:         the path of the data directory
     - hmm_model_name:  the name of the HMM model to be stored
     - mt_win:          mid-term window size
     - mt_step:         mid-term window step
    RETURNS:
     - hmm:             an object to the resulting HMM
     - class_names:     a list of class_names

    After training, hmm, class_names, along with the mt_win and mt_step
    values are stored in the hmm_model_name file
    """

    flags_all = np.array([])
    classes_all = []
    for i, f in enumerate(glob.glob(dirPath + os.sep + '*.wav')):
        # for each WAV file
        wav_file = f
        gt_file = f.replace('.wav', '.segments')
        if not os.path.isfile(gt_file):
            continue
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags, class_names = segs2flags(seg_start, seg_end, seg_labs, mt_step)
        for c in class_names:
            # update class names:
            if c not in classes_all:
                classes_all.append(c)
        [fs, x] = audioBasicIO.read_audio_file(wav_file)
        [F, _, _] = aF.mid_feature_extraction(x, fs, mt_win * fs,
                                              mt_step * fs,
                                              round(fs * 0.050),
                                              round(fs * 0.050))
        lenF = F.shape[1]
        lenL = len(flags)
        min_sm = min(lenF, lenL)
        F = F[:, 0:min_sm]
        flags = flags[0:min_sm]

        flagsNew = []
        for j, fl in enumerate(flags):
            # append features and labels
            flagsNew.append(classes_all.index(class_names[flags[j]]))

        flags_all = np.append(flags_all, np.array(flagsNew))

        if i == 0:
            f_all = F
        else:
            f_all = np.concatenate((f_all, F), axis=1)

    # compute HMM statistics
    start_prob, transmat, means, cov = trainHMM_computeStatistics(f_all,
                                                                  flags_all)
    # train the HMM
    hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
    hmm.startprob_ = start_prob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov

    # save HMM model
    fo = open(hmm_model_name, "wb")
    cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(classes_all, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mt_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mt_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()

    return hmm, classes_all
def QE_speaker_diarization(sampling_rate, signal, n_speakers,
                           classifier_all, mean_all, std_all,
                           class_names_all, classifier_fm, mean_fm, std_fm,
                           class_names_fm,  # models loaded once by the caller
                           mid_window=2.0, mid_step=0.2, short_window=0.05,
                           lda_dim=35, plot_res=False):
    """
    ARGUMENTS:
        - sampling_rate, signal  the audio data to be analyzed
          (QE_: adapted to receive the data directly instead of a filename)
        - n_speakers       the number of speakers (clusters) in
                           the recording (<= 0 for unknown)
        - mid_window (opt)    mid-term window size
        - mid_step (opt)      mid-term window step
        - short_window (opt)  short-term window size
        - lda_dim (opt)       LDA dimension (0 for no LDA)
        - plot_res (opt)      0 for not plotting the results, 1 for plotting

    Other diarization options to explore:
    https://hackernoon.com/speaker-diarization-the-squad-way-2205e0accbda
    https://github.com/YongyuG/s4d-diarization-gao/blob/master/s4d/diar.py  (looks very promising)
    https://pypi.org/project/s4d/
    https://projets-lium.univ-lemans.fr/s4d/
    https://medium.com/datadriveninvestor/speaker-diarization-22121f1264b1
    https://arxiv.org/pdf/2005.08072v1.pdf
    https://github.com/calclavia/tal-asrd
    https://github.com/josepatino/pyBK
    https://github.com/wq2012/awesome-diarization
    https://www.researchgate.net/publication/221480626_The_Detection_of_Overlapping_Speech_with_Prosodic_Features_for_Speaker_Diarization
    """
    # sampling_rate, signal = audioBasicIO.read_audio_file(filename)
    # (not needed: the data is passed in as arguments instead of a filename)
    signal = audioBasicIO.stereo_to_mono(signal)  # drop if already mono
    duration = len(signal) / sampling_rate

    # QE_: to avoid reloading the models on every call, they are loaded once
    # in the caller (QE_main) and passed in as arguments:
    #   base_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
    #                           "data/models")
    #   classifier_all, mean_all, std_all, class_names_all, _, _, _, _, _ = \
    #       at.load_model_knn(os.path.join(base_dir, "knn_speaker_10"))
    #   classifier_fm, mean_fm, std_fm, class_names_fm, _, _, _, _, _ = \
    #       at.load_model_knn(os.path.join(base_dir,
    #                                      "knn_speaker_male_female"))

    mid_feats, st_feats, _ = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mid_window * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * short_window),
                                   round(sampling_rate * short_window * 0.5))

    mid_term_features = np.zeros((mid_feats.shape[0] + len(class_names_all) +
                                  len(class_names_fm), mid_feats.shape[1]))

    for index in range(mid_feats.shape[1]):
        feature_norm_all = (mid_feats[:, index] - mean_all) / std_all
        feature_norm_fm = (mid_feats[:, index] - mean_fm) / std_fm
        _, p1 = at.classifier_wrapper(classifier_all, "knn", feature_norm_all)
        _, p2 = at.classifier_wrapper(classifier_fm, "knn", feature_norm_fm)
        start = mid_feats.shape[0]
        end = mid_feats.shape[0] + len(class_names_all)
        mid_term_features[0:mid_feats.shape[0], index] = mid_feats[:, index]
        mid_term_features[start:end, index] = p1 + 1e-4
        mid_term_features[end::, index] = p2 + 1e-4

    mid_feats = mid_term_features  # TODO
    feature_selected = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                        41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]

    mid_feats = mid_feats[feature_selected, :]

    mid_feats_norm, mean, std = at.normalize_features([mid_feats.T])
    mid_feats_norm = mid_feats_norm[0].T
    n_wins = mid_feats.shape[1]

    # remove outliers:
    dist_all = np.sum(distance.squareform(distance.pdist(mid_feats_norm.T)),
                      axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.2 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    # EnergyMin = np.min(mt_feats[1,:])
    # EnergyMean = np.mean(mt_feats[1,:])
    # Thres = (1.5 * EnergyMin + 0.5 * EnergyMean) / 2.0
    # i_non_outliers = np.nonzero(mt_feats[1,:] > Thres)[0]
    # print i_non_outliers

    mt_feats_norm_or = mid_feats_norm
    mid_feats_norm = mid_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        # extract mid-term features with minimum step:
        window_ratio = int(round(mid_window / short_window))
        step_ratio = int(round(short_window / short_window))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        for index in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        # for each of the short-term features:
        for index in range(num_of_features):
            cur_pos = 0
            feat_len = len(st_feats[index])
            while cur_pos < feat_len:
                n1 = cur_pos
                n2 = cur_pos + window_ratio
                if n2 > feat_len:
                    n2 = feat_len
                short_features = st_feats[index][n1:n2]
                mt_feats_to_red[index].append(np.mean(short_features))
                mt_feats_to_red[index + num_of_features].\
                    append(np.std(short_features))
                cur_pos += step_ratio
        mt_feats_to_red = np.array(mt_feats_to_red)
        mt_feats_to_red_2 = np.zeros((mt_feats_to_red.shape[0] +
                                      len(class_names_all) +
                                      len(class_names_fm),
                                      mt_feats_to_red.shape[1]))
        limit = mt_feats_to_red.shape[0] + len(class_names_all)
        for index in range(mt_feats_to_red.shape[1]):
            feature_norm_all = (mt_feats_to_red[:, index] - mean_all) / \
                std_all
            feature_norm_fm = (mt_feats_to_red[:, index] - mean_fm) / std_fm
            _, p1 = at.classifier_wrapper(classifier_all, "knn",
                                          feature_norm_all)
            _, p2 = at.classifier_wrapper(classifier_fm, "knn",
                                          feature_norm_fm)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0], index] = \
                mt_feats_to_red[:, index]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:limit, index] = \
                p1 + 1e-4
            mt_feats_to_red_2[limit::, index] = p2 + 1e-4
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[feature_selected, :]
        mt_feats_to_red, mean, std = \
            at.normalize_features([mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        labels = np.zeros((mt_feats_to_red.shape[1],))
        lda_step = 1.0
        lda_step_ratio = lda_step / short_window
        for index in range(labels.shape[0]):
            labels[index] = int(index * short_window / lda_step_ratio)
        clf = sklearn.discriminant_analysis.\
            LinearDiscriminantAnalysis(n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, labels)
        mid_feats_norm = (clf.transform(mid_feats_norm.T)).T

    if n_speakers <= 0:
        # QE_: adapt in this case to range 1-10? this diarization will be
        # used on short windows (250-500 ms)
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    cluster_labels = []
    sil_all = []
    cluster_centers = []

    for speakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=speakers)
        k_means.fit(mid_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        cluster_labels.append(cls)
        cluster_centers.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(speakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / \
                float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mid_feats_norm[:, cls == c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                dist = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(np.mean(dist) * clust_per_cent)
                sil_temp = []
                for c2 in range(speakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = np.nonzero(cls == c2)[0].shape[0] \
                            / float(len(cls))
                        mid_features_temp = mid_feats_norm[:, cls == c2]
                        dist = distance.cdist(mt_feats_norm_temp.T,
                                              mid_features_temp.T)
                        sil_temp.append(np.mean(dist) *
                                        (clust_per_cent + clust_per_cent_2)
                                        / 2.0)
                sil_temp = np.array(sil_temp)
                # ... and keep the minimum value (i.e. the distance from
                # the "nearest" cluster)
                sil_2.append(min(sil_temp))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        sil = []
        for c in range(speakers):
            # for each cluster (speaker) compute silhouette
            sil.append((sil_2[c] - sil_1[c]) /
                       (max(sil_2[c], sil_1[c]) + 1e-5))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(np.mean(sil))

    imax = int(np.argmax(sil_all))
    # optimal number of clusters
    num_speakers = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = np.zeros((n_wins,))
    for index in range(n_wins):
        j = np.argmin(np.abs(index - i_non_outliers))
        cls[index] = cluster_labels[imax][j]

    # Post-process method 1: hmm smoothing
    for index in range(1):
        # hmm training
        start_prob, transmat, means, cov = \
            train_hmm_compute_statistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    class_names = ["speaker{0:d}".format(c) for c in range(num_speakers)]

    # load ground-truth if available
    # (disabled here: this variant receives the raw signal, so there is no
    # filename from which to derive the .segments ground-truth file)
    # gt_file = filename.replace('.wav', '.segments')
    # if os.path.isfile(gt_file):
    #     seg_start, seg_end, seg_labs = read_segmentation_gt(gt_file)
    #     flags_gt, class_names_gt = segments_to_labels(seg_start, seg_end,
    #                                                   seg_labs, mid_step)

    """
    if plot_res:
        fig = plt.figure()
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(np.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(np.array(range(len(cls))) * mid_step + mid_step / 2.0, cls)
    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(np.array(range(len(flags_gt))) * mid_step +
                     mid_step / 2.0, flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
            evaluate_speaker_diarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(
                          100 * purity_cluster_m, 100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's silhouette")
        plt.show()
    """
    return cls
def mtFileClassification(input_file, model_name, model_type,
                         plot_results=False, gt_file=""):
    """
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used,
    i.e. a pre-trained classifier.
    ARGUMENTS:
        - input_file:    path of the input WAV file
        - model_name:    name of the classification model
        - model_type:    svm or knn depending on the classifier type
        - plot_results:  True if results are to be plotted using
                         matplotlib along with a set of statistics
    RETURNS:
        - segs:    a sequence of segment's endpoints: segs[i] is the
                   endpoint of the i-th segment (in seconds)
        - classes: a sequence of class flags: class[i] is the class ID of
                   the i-th segment
    """

    if not os.path.isfile(model_name):
        print("mtFileClassificationError: input model_type not found!")
        return (-1, -1, -1, -1)
    # Load classifier:
    if model_type == "knn":
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win,
         st_step, compute_beat] = aT.load_model_knn(model_name)
    else:
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win,
         st_step, compute_beat] = aT.load_model(model_name)
    if compute_beat:
        print("Model " + model_name + " contains long-term music features "
              "(beat etc) and cannot be used in segmentation")
        return (-1, -1, -1, -1)
    # load input file
    [fs, x] = audioBasicIO.read_audio_file(input_file)
    # could not read file
    if fs == -1:
        return (-1, -1, -1, -1)
    # convert stereo (if) to mono
    x = audioBasicIO.stereo_to_mono(x)
    # mid-term feature extraction:
    [mt_feats, _, _] = aF.mid_feature_extraction(x, fs, mt_win * fs,
                                                 mt_step * fs,
                                                 round(fs * st_win),
                                                 round(fs * st_step))
    flags = []
    Ps = []
    flags_ind = []
    # for each feature vector (i.e. for each fix-sized segment):
    for i in range(mt_feats.shape[1]):
        # normalize current feature vector
        cur_fv = (mt_feats[:, i] - MEAN) / STD
        # classify vector:
        [res, P] = aT.classifierWrapper(classifier, model_type, cur_fv)
        flags_ind.append(res)
        # update class label matrix
        flags.append(class_names[int(res)])
        # update probability matrix
        Ps.append(np.max(P))
    flags_ind = np.array(flags_ind)

    # 1-window smoothing
    for i in range(1, len(flags_ind) - 1):
        if flags_ind[i - 1] == flags_ind[i + 1]:
            flags_ind[i] = flags_ind[i + 1]
    # convert fix-sized flags to segments and classes
    (segs, classes) = flags2segs(flags, mt_step)
    segs[-1] = len(x) / float(fs)

    # Load ground-truth:
    if os.path.isfile(gt_file):
        [seg_start_gt, seg_end_gt, seg_l_gt] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start_gt, seg_end_gt,
                                              seg_l_gt, mt_step)
        flags_ind_gt = []
        # "align" labels with GT
        for j, fl in enumerate(flags_gt):
            if class_names_gt[flags_gt[j]] in class_names:
                flags_ind_gt.append(class_names.index(
                    class_names_gt[flags_gt[j]]))
            else:
                flags_ind_gt.append(-1)
        flags_ind_gt = np.array(flags_ind_gt)
        cm = np.zeros((len(class_names_gt), len(class_names_gt)))
        for i in range(min(flags_ind.shape[0], flags_ind_gt.shape[0])):
            cm[int(flags_ind_gt[i]), int(flags_ind[i])] += 1
    else:
        cm = []
        flags_ind_gt = np.array([])
    acc = plotSegmentationResults(flags_ind, flags_ind_gt, class_names,
                                  mt_step, not plot_results)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc))
        return (flags_ind, class_names_gt, acc, cm)
    else:
        return (flags_ind, class_names, acc, cm)
def WhatIsThis(self, data):
    # There are two completely separate models: one is a classifier that
    # uses pyAudioAnalysis, the other is a deepspeech model.

    # Convert or cast the raw audio data to numpy array
    log.debug('Converting data to numpy')
    if len(data) % 2 != 0:
        log.critical('Data length: {0}'.format(len(data)))
        log.critical('Data: {0}'.format(data))
        return {  # bullshit
            'loudness': 0.0,
            'class': 'bullshit',
            'probability': 1.0,
            'text': 'fuckitall',
        }
    AccumulatedData_np = np.frombuffer(data, np.int16)

    # Get the loudness, hope this works
    rms = np.sqrt(np.mean(AccumulatedData_np**2))
    log.debug(f'Raw loudness: {rms}')

    # normalize it, make it between 0.0 and 1.0.
    # rms = round((rms - 20.0) / 45, 2)
    # rms = float(np.clip(rms, 0.0, 1.0))

    seg_len = len(AccumulatedData_np)
    log.debug('seg_len ' + str(seg_len))

    # Run the classifier. This is ripped directly out of paura.py and
    # carelessly sutured into place. There's so much blood! Thank you!!!
    log.debug('Running classifier')
    try:
        [mt_feats, _, _] = mF.mid_feature_extraction(
            AccumulatedData_np, self.fs, seg_len, seg_len,
            round(self.fs * self.st_win), round(self.fs * self.st_step))
        cur_fv = (mt_feats[:, 0] - self.MEAN) / self.STD
    except ValueError:
        log.error('Yeah, that thing happened')
        log.critical('Data length: {0}'.format(len(data)))
        log.critical('Data: {0}'.format(data))
        return {  # bullshit
            'loudness': 0.0,
            'class': 'bullshit',
            'probability': 1.0,
            'text': 'fuckitall',
        }

    # classify vector:
    [res, prob] = aT.classifier_wrapper(self.classifier, "svm_rbf", cur_fv)
    win_class = self.class_names[int(res)]
    win_prob = round(prob[int(res)], 2)
    log.info('Classified {0:s} with probability {1:.2f}'.format(
        win_class, win_prob))

    # Run the accumulated audio data through deepspeech, if it's speech
    if win_class == 'lover':
        log.debug('Running deepspeech model')
        text = self.model.stt(AccumulatedData_np)
        log.info('Recognized: %s', text)
    else:
        text = 'undefined'

    # Save the utterance to a wav file. I hope later I'll be able to use
    # this for training a better model, after I learn how to do that.
    # log.debug('Saving wav file')
    # wf = wave.open(os.path.join(self.save_dir, str(int(time.time())) +
    #                '_' + win_class + '_' + text.replace(' ', '_') +
    #                '.wav'), 'wb')
    # wf.setnchannels(1)
    # wf.setsampwidth(2)
    # wf.setframerate(16000)
    # wf.writeframes(data)
    # wf.close()

    # return an object
    return {
        'loudness': rms,
        'class': win_class,
        'probability': win_prob,
        'text': text,
    }
def mid_term_file_classification(input_file, model_name, model_type,
                                 plot_results=False, gt_file=""):
    """
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used,
    i.e. a pre-trained classifier.
    ARGUMENTS:
        - input_file:    path of the input WAV file
        - model_name:    name of the classification model
        - model_type:    svm or knn depending on the classifier type
        - plot_results:  True if results are to be plotted using
                         matplotlib along with a set of statistics
    RETURNS:
        - segs:    a sequence of segment's endpoints: segs[i] is the
                   endpoint of the i-th segment (in seconds)
        - classes: a sequence of class flags: class[i] is the class ID of
                   the i-th segment
    """
    labels = []
    accuracy = 0.0
    class_names = []
    cm = np.array([])
    if not os.path.isfile(model_name):
        print("mtFileClassificationError: input model_type not found!")
        return labels, class_names, accuracy, cm

    # Load classifier:
    if model_type == "knn":
        classifier, mean, std, class_names, mt_win, mid_step, st_win, \
            st_step, compute_beat = at.load_model_knn(model_name)
    else:
        classifier, mean, std, class_names, mt_win, mid_step, st_win, \
            st_step, compute_beat = at.load_model(model_name)
    if compute_beat:
        print("Model " + model_name + " contains long-term music features "
              "(beat etc) and cannot be used in segmentation")
        return labels, class_names, accuracy, cm

    # load input file
    sampling_rate, signal = audioBasicIO.read_audio_file(input_file)

    # could not read file
    if sampling_rate == 0:
        return labels, class_names, accuracy, cm

    # convert stereo (if) to mono
    signal = audioBasicIO.stereo_to_mono(signal)

    # mid-term feature extraction:
    mt_feats, _, _ = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mt_win * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * st_win),
                                   round(sampling_rate * st_step))
    posterior_matrix = []

    # for each feature vector (i.e. for each fix-sized segment):
    for col_index in range(mt_feats.shape[1]):
        # normalize current feature vector
        feature_vector = (mt_feats[:, col_index] - mean) / std

        # classify vector:
        label_predicted, posterior = \
            at.classifier_wrapper(classifier, model_type, feature_vector)
        labels.append(label_predicted)

        # update probability matrix
        posterior_matrix.append(np.max(posterior))
    labels = np.array(labels)

    # convert fix-sized flags to segments and classes
    segs, classes = labels_to_segments(labels, mid_step)
    segs[-1] = len(signal) / float(sampling_rate)

    # Load ground-truth:
    labels_gt, class_names_gt, accuracy, cm = \
        load_ground_truth(gt_file, labels, class_names, mid_step,
                          plot_results)

    return labels, class_names, accuracy, cm
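# Hedged usage sketch for mid_term_file_classification: fixed-size segment
# classification of a long recording with a pre-trained model (the
# "svm_rbf_sm" speech/music model path is an assumption; adjust to your
# installation, and "radio_show.wav" is a placeholder input).
labels, class_names, accuracy, cm = mid_term_file_classification(
    "radio_show.wav",           # placeholder input
    "data/models/svm_rbf_sm",   # assumed model path
    "svm_rbf")
print("predicted per-segment classes:",
      [class_names[int(l)] for l in labels[:20]])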
def speaker_diarization(filename, n_speakers, mid_window=2.0, mid_step=0.2,
                        short_window=0.05, lda_dim=35, plot_res=False):
    """
    ARGUMENTS:
        - filename:        the name of the WAV file to be analyzed
        - n_speakers       the number of speakers (clusters) in
                           the recording (<= 0 for unknown)
        - mid_window (opt)    mid-term window size
        - mid_step (opt)      mid-term window step
        - short_window (opt)  short-term window size
        - lda_dim (opt)       LDA dimension (0 for no LDA)
        - plot_res (opt)      0 for not plotting the results, 1 for plotting
    """
    sampling_rate, signal = audioBasicIO.read_audio_file(filename)
    signal = audioBasicIO.stereo_to_mono(signal)
    duration = len(signal) / sampling_rate

    base_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "data/models")
    classifier_all, mean_all, std_all, class_names_all, _, _, _, _, _ = \
        at.load_model_knn(os.path.join(base_dir, "knn_speaker_10"))
    classifier_fm, mean_fm, std_fm, class_names_fm, _, _, _, _, _ = \
        at.load_model_knn(os.path.join(base_dir, "knn_speaker_male_female"))

    mid_feats, st_feats, _ = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mid_window * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * short_window),
                                   round(sampling_rate * short_window * 0.5))

    mid_term_features = np.zeros((mid_feats.shape[0] + len(class_names_all) +
                                  len(class_names_fm), mid_feats.shape[1]))

    for index in range(mid_feats.shape[1]):
        feature_norm_all = (mid_feats[:, index] - mean_all) / std_all
        feature_norm_fm = (mid_feats[:, index] - mean_fm) / std_fm
        _, p1 = at.classifier_wrapper(classifier_all, "knn", feature_norm_all)
        _, p2 = at.classifier_wrapper(classifier_fm, "knn", feature_norm_fm)
        start = mid_feats.shape[0]
        end = mid_feats.shape[0] + len(class_names_all)
        mid_term_features[0:mid_feats.shape[0], index] = mid_feats[:, index]
        mid_term_features[start:end, index] = p1 + 1e-4
        mid_term_features[end::, index] = p2 + 1e-4

    mid_feats = mid_term_features  # TODO
    feature_selected = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                        41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]

    mid_feats = mid_feats[feature_selected, :]

    mid_feats_norm, mean, std = at.normalize_features([mid_feats.T])
    mid_feats_norm = mid_feats_norm[0].T
    n_wins = mid_feats.shape[1]

    # remove outliers:
    dist_all = np.sum(distance.squareform(distance.pdist(mid_feats_norm.T)),
                      axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.2 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    # EnergyMin = np.min(mt_feats[1,:])
    # EnergyMean = np.mean(mt_feats[1,:])
    # Thres = (1.5 * EnergyMin + 0.5 * EnergyMean) / 2.0
    # i_non_outliers = np.nonzero(mt_feats[1,:] > Thres)[0]
    # print i_non_outliers

    mt_feats_norm_or = mid_feats_norm
    mid_feats_norm = mid_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        # extract mid-term features with minimum step:
        window_ratio = int(round(mid_window / short_window))
        step_ratio = int(round(short_window / short_window))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        for index in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        # for each of the short-term features:
        for index in range(num_of_features):
            cur_pos = 0
            feat_len = len(st_feats[index])
            while cur_pos < feat_len:
                n1 = cur_pos
                n2 = cur_pos + window_ratio
                if n2 > feat_len:
                    n2 = feat_len
                short_features = st_feats[index][n1:n2]
                mt_feats_to_red[index].append(np.mean(short_features))
                mt_feats_to_red[index + num_of_features].\
                    append(np.std(short_features))
                cur_pos += step_ratio
        mt_feats_to_red = np.array(mt_feats_to_red)
        mt_feats_to_red_2 = np.zeros((mt_feats_to_red.shape[0] +
                                      len(class_names_all) +
                                      len(class_names_fm),
                                      mt_feats_to_red.shape[1]))
        limit = mt_feats_to_red.shape[0] + len(class_names_all)
        for index in range(mt_feats_to_red.shape[1]):
            feature_norm_all = (mt_feats_to_red[:, index] - mean_all) / \
                std_all
            feature_norm_fm = (mt_feats_to_red[:, index] - mean_fm) / std_fm
            _, p1 = at.classifier_wrapper(classifier_all, "knn",
                                          feature_norm_all)
            _, p2 = at.classifier_wrapper(classifier_fm, "knn",
                                          feature_norm_fm)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0], index] = \
                mt_feats_to_red[:, index]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:limit, index] = \
                p1 + 1e-4
            mt_feats_to_red_2[limit::, index] = p2 + 1e-4
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[feature_selected, :]
        mt_feats_to_red, mean, std = \
            at.normalize_features([mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        labels = np.zeros((mt_feats_to_red.shape[1],))
        lda_step = 1.0
        lda_step_ratio = lda_step / short_window
        for index in range(labels.shape[0]):
            labels[index] = int(index * short_window / lda_step_ratio)
        clf = sklearn.discriminant_analysis.\
            LinearDiscriminantAnalysis(n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, labels)
        mid_feats_norm = (clf.transform(mid_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    cluster_labels = []
    sil_all = []
    cluster_centers = []

    for speakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=speakers)
        k_means.fit(mid_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        cluster_labels.append(cls)
        cluster_centers.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(speakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / \
                float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mid_feats_norm[:, cls == c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                dist = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(np.mean(dist) * clust_per_cent)
                sil_temp = []
                for c2 in range(speakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = np.nonzero(cls == c2)[0].shape[0] \
                            / float(len(cls))
                        mid_features_temp = mid_feats_norm[:, cls == c2]
                        dist = distance.cdist(mt_feats_norm_temp.T,
                                              mid_features_temp.T)
                        sil_temp.append(np.mean(dist) *
                                        (clust_per_cent + clust_per_cent_2)
                                        / 2.0)
                sil_temp = np.array(sil_temp)
                # ... and keep the minimum value (i.e. the distance from
                # the "nearest" cluster)
                sil_2.append(min(sil_temp))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        sil = []
        for c in range(speakers):
            # for each cluster (speaker) compute silhouette
            sil.append((sil_2[c] - sil_1[c]) /
                       (max(sil_2[c], sil_1[c]) + 1e-5))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(np.mean(sil))

    imax = int(np.argmax(sil_all))
    # optimal number of clusters
    num_speakers = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = np.zeros((n_wins,))
    for index in range(n_wins):
        j = np.argmin(np.abs(index - i_non_outliers))
        cls[index] = cluster_labels[imax][j]

    # Post-process method 1: hmm smoothing
    for index in range(1):
        # hmm training
        start_prob, transmat, means, cov = \
            train_hmm_compute_statistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    class_names = ["speaker{0:d}".format(c) for c in range(num_speakers)]

    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if ground truth exists
    if os.path.isfile(gt_file):
        seg_start, seg_end, seg_labs = read_segmentation_gt(gt_file)
        flags_gt, class_names_gt = segments_to_labels(seg_start, seg_end,
                                                      seg_labs, mid_step)

    if plot_res:
        fig = plt.figure()
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(np.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(np.array(range(len(cls))) * mid_step + mid_step / 2.0, cls)

    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(np.array(range(len(flags_gt))) * mid_step +
                     mid_step / 2.0, flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
            evaluate_speaker_diarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(
                          100 * purity_cluster_m, 100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's silhouette")
        plt.show()
    return cls
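# Hedged usage sketch for speaker_diarization: cluster a two-speaker dialogue
# into per-window speaker labels ("dialogue.wav" is a placeholder). Each entry
# of cls is the speaker index of one mid-term window, mid_step seconds apart.
cls = speaker_diarization("dialogue.wav", n_speakers=2, plot_res=False)
print("windows:", len(cls),
      "speakers found:", sorted(set(int(c) for c in cls)))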
def train_hmm_from_directory(folder_path, hmm_model_name, mid_window,
                             mid_step):
    """
    This function trains a HMM model for segmentation-classification using
    a directory where WAV files and .segments (ground-truth) files are stored
    ARGUMENTS:
     - folder_path:     the path of the data directory
     - hmm_model_name:  the name of the HMM model to be stored
     - mid_window:      mid-term window size
     - mid_step:        mid-term window step
    RETURNS:
     - hmm:             an object to the resulting HMM
     - class_names:     a list of class_names

    After training, hmm, class_names, along with the mid_window and mid_step
    values are stored in the hmm_model_name file
    """

    flags_all = np.array([])
    class_names_all = []
    for i, f in enumerate(glob.glob(folder_path + os.sep + '*.wav')):
        # for each WAV file
        wav_file = f
        gt_file = f.replace('.wav', '.segments')
        if os.path.isfile(gt_file):
            seg_start, seg_end, seg_labs = read_segmentation_gt(gt_file)
            flags, class_names = \
                segments_to_labels(seg_start, seg_end, seg_labs, mid_step)
            for c in class_names:
                # update class names:
                if c not in class_names_all:
                    class_names_all.append(c)
            sampling_rate, signal = audioBasicIO.read_audio_file(wav_file)
            feature_vector, _, _ = \
                mtf.mid_feature_extraction(signal, sampling_rate,
                                           mid_window * sampling_rate,
                                           mid_step * sampling_rate,
                                           round(sampling_rate * 0.050),
                                           round(sampling_rate * 0.050))
            flag_len = len(flags)
            feat_cols = feature_vector.shape[1]
            min_sm = min(feat_cols, flag_len)
            feature_vector = feature_vector[:, 0:min_sm]
            flags = flags[0:min_sm]

            flags_new = []
            # append features and labels
            # (map the per-file label indices to the global class list;
            #  fixed: the original indexed class_names_all with the local flag)
            for j, fl in enumerate(flags):
                flags_new.append(
                    class_names_all.index(class_names[flags[j]]))

            flags_all = np.append(flags_all, np.array(flags_new))

            if i == 0:
                f_all = feature_vector
            else:
                f_all = np.concatenate((f_all, feature_vector), axis=1)

    # compute HMM statistics
    class_priors, transition_matrix, means, cov = \
        train_hmm_compute_statistics(f_all, flags_all)
    # train the HMM
    hmm = hmmlearn.hmm.GaussianHMM(class_priors.shape[0], "diag")
    hmm.covars_ = cov
    hmm.means_ = means
    hmm.startprob_ = class_priors
    hmm.transmat_ = transition_matrix

    save_hmm(hmm_model_name, hmm, class_names_all, mid_window, mid_step)

    return hmm, class_names_all
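# Hedged usage sketch for train_hmm_from_directory: every WAV in the folder
# needs a sibling .segments ground-truth file; the trained model is pickled
# to "hmm_radio" ("segments_data" is a placeholder folder name).
hmm, class_names = train_hmm_from_directory("segments_data", "hmm_radio",
                                            mid_window=1.0, mid_step=0.5)
print("trained HMM over classes:", class_names)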
def speakerDiarization(filename, n_speakers, mt_size=2.0, mt_step=0.2,
                       st_win=0.05, lda_dim=35, plot_res=False):
    """
    ARGUMENTS:
        - filename:        the name of the WAV file to be analyzed
        - n_speakers       the number of speakers (clusters) in
                           the recording (<=0 for unknown)
        - mt_size (opt)    mid-term window size
        - mt_step (opt)    mid-term window step
        - st_win  (opt)    short-term window size
        - lda_dim (opt)    LDA dimension (0 for no LDA)
        - plot_res (opt)   0 for not plotting the results, 1 for plotting
    """
    [fs, x] = audioBasicIO.read_audio_file(filename)
    x = audioBasicIO.stereo_to_mono(x)
    duration = len(x) / fs

    [classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1,
     computeBEAT1] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     "data", "knnSpeakerAll"))
    [classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2,
     computeBEAT2] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     "data", "knnSpeakerFemaleMale"))

    [mt_feats, st_feats, _] = aF.mid_feature_extraction(
        x, fs, mt_size * fs, mt_step * fs,
        round(fs * st_win), round(fs * st_win * 0.5))

    # append the posterior probabilities of the two speaker models
    # to each mid-term feature vector:
    MidTermFeatures2 = np.zeros((mt_feats.shape[0] + len(classNames1) +
                                 len(classNames2), mt_feats.shape[1]))
    for i in range(mt_feats.shape[1]):
        cur_f1 = (mt_feats[:, i] - MEAN1) / STD1
        cur_f2 = (mt_feats[:, i] - MEAN2) / STD2
        [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
        [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
        MidTermFeatures2[0:mt_feats.shape[0], i] = mt_feats[:, i]
        MidTermFeatures2[mt_feats.shape[0]:
                         mt_feats.shape[0] + len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[mt_feats.shape[0] + len(classNames1)::, i] = \
            P2 + 0.0001

    mt_feats = MidTermFeatures2    # TODO
    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41,
                       42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]
    mt_feats = mt_feats[iFeaturesSelect, :]

    (mt_feats_norm, MEAN, STD) = aT.normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    n_wins = mt_feats.shape[1]

    # remove outliers:
    dist_all = np.sum(distance.squareform(distance.pdist(mt_feats_norm.T)),
                      axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.2 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    # EnergyMin = np.min(mt_feats[1, :])
    # EnergyMean = np.mean(mt_feats[1, :])
    # Thres = (1.5 * EnergyMin + 0.5 * EnergyMean) / 2.0
    # i_non_outliers = np.nonzero(mt_feats[1, :] > Thres)[0]

    perOutLier = (100.0 * (n_wins - i_non_outliers.shape[0])) / n_wins
    mt_feats_norm_or = mt_feats_norm
    mt_feats_norm = mt_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        # extract mid-term features with minimum step:
        mt_win_ratio = int(round(mt_size / st_win))
        mt_step_ratio = int(round(st_win / st_win))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        for i in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        # for each of the short-term features:
        for i in range(num_of_features):
            curPos = 0
            N = len(st_feats[i])
            while curPos < N:
                N1 = curPos
                N2 = curPos + mt_win_ratio
                if N2 > N:
                    N2 = N
                curStFeatures = st_feats[i][N1:N2]
                mt_feats_to_red[i].append(np.mean(curStFeatures))
                mt_feats_to_red[i + num_of_features].append(
                    np.std(curStFeatures))
                curPos += mt_step_ratio
        mt_feats_to_red = np.array(mt_feats_to_red)

        # append the speaker-model posteriors, as above:
        mt_feats_to_red_2 = np.zeros((mt_feats_to_red.shape[0] +
                                      len(classNames1) + len(classNames2),
                                      mt_feats_to_red.shape[1]))
        for i in range(mt_feats_to_red.shape[1]):
            cur_f1 = (mt_feats_to_red[:, i] - MEAN1) / STD1
            cur_f2 = (mt_feats_to_red[:, i] - MEAN2) / STD2
            [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
            [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0], i] = \
                mt_feats_to_red[:, i]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:
                              mt_feats_to_red.shape[0] + len(classNames1),
                              i] = P1 + 0.0001
            mt_feats_to_red_2[mt_feats_to_red.shape[0] + len(classNames1)::,
                              i] = P2 + 0.0001
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[iFeaturesSelect, :]
        (mt_feats_to_red, MEAN, STD) = \
            aT.normalizeFeatures([mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T

        Labels = np.zeros((mt_feats_to_red.shape[1],))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / st_win
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * st_win / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, Labels)
        mt_feats_norm = (clf.transform(mt_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    clsAll = []
    sil_all = []
    centersAll = []

    for iSpeakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(mt_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        clsAll.append(cls)
        centersAll.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(iSpeakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / \
                float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mt_feats_norm[:, cls == c]
                # compute the average distance between samples
                # that belong to the cluster (a values)
                Yt = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(np.mean(Yt) * clust_per_cent)
                silBs = []
                for c2 in range(iSpeakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = np.nonzero(cls == c2)[0].shape[0] \
                            / float(len(cls))
                        MidTermFeaturesNormTemp2 = mt_feats_norm[:, cls == c2]
                        Yt = distance.cdist(mt_feats_norm_temp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(np.mean(Yt) * (clust_per_cent +
                                                    clust_per_cent_2) / 2.0)
                silBs = np.array(silBs)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(silBs))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        sil = []
        for c in range(iSpeakers):
            # for each cluster (speaker) compute the silhouette
            sil.append((sil_2[c] - sil_1[c]) /
                       (max(sil_2[c], sil_1[c]) + 0.00001))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(np.mean(sil))

    imax = np.argmax(sil_all)
    # optimal number of clusters
    nSpeakersFinal = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = np.zeros((n_wins,))
    for i in range(n_wins):
        j = np.argmin(np.abs(i - i_non_outliers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):
        # hmm training
        start_prob, transmat, means, cov = \
            trainHMM_computeStatistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = sil_all[imax]
    class_names = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]

    # load ground truth if available
    gt_file = filename.replace('.wav', '.segments')
    if os.path.isfile(gt_file):
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start, seg_end,
                                              seg_labs, mt_step)

    if plot_res:
        fig = plt.figure()
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(np.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(np.array(range(len(cls))) * mt_step + mt_step / 2.0, cls)

    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(np.array(range(len(flags_gt))) * mt_step +
                     mt_step / 2.0, flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
            evaluateSpeakerDiarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(
                          100 * purity_cluster_m, 100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering silhouette")
        plt.show()
    return cls
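# A minimal usage sketch for speakerDiarization() above. The file name
# "interview.wav" and the speaker count are placeholders, not part of the
# original code; n_speakers=-1 would instead let the silhouette search pick
# the cluster count from range(2, 10). The 0.2 below is the mt_step default.
if __name__ == "__main__":
    flags = speakerDiarization("interview.wav", n_speakers=2, plot_res=True)
    # flags[i] is the speaker label of the i-th mid-term window, so
    # segment boundaries sit where the label changes:
    for i in range(1, len(flags)):
        if flags[i] != flags[i - 1]:
            print("speaker change at {:.1f}s".format(i * 0.2))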
def extract(label, out_dir):
    df = pd.read_csv(os.path.join("data", config.dataset) + ".csv")
    df = df[df.sex == label]
    for filename in df.filename:
        if args.ml:
            if not os.path.exists(os.path.join(out_dir, filename) + ".csv"):
                try:
                    sound = AudioSegment.from_mp3(os.path.join(
                        "./data", "recordings", filename) + ".mp3")
                    sound.export("tmp.wav", format="wav")
                    # signal, sampling rate
                    fs, s = aIO.read_audio_file("tmp.wav")
                    # get all mid-term features over the first 10 seconds,
                    # returning an array of features
                    mid_term_window = 10
                    mt, st, mt_n = aFm.mid_feature_extraction(
                        s, fs, mid_term_window * fs, mid_term_window * fs,
                        0.05 * fs, 0.05 * fs)
                    # Mid-term feature vector layout (mean statistics):
                    #   0: zcr_mean              1: energy_mean
                    #   2: energy_entropy_mean   3: spectral_centroid_mean
                    #   4: spectral_spread_mean  5: spectral_entropy_mean
                    #   6: spectral_flux_mean    7: spectral_rolloff_mean
                    #   8-20:  mfcc_1..13_mean
                    #   21-32: chroma_1..12_mean  33: chroma_std_mean
                    #   34-67: deltas of features 0-33 (mean)
                    # and the corresponding std statistics:
                    #   68-101:  features 0-33 (std)
                    #   102-135: deltas of features 0-33 (std)
                    features = {mt_n[i]: [mt[i][0]] for i in range(len(mt_n))}
                    ftDf = pd.DataFrame.from_dict(features)
                    ftDf.to_csv(os.path.join(out_dir, filename) + ".csv")
                except Exception as e:
                    print(e)
        elif args.nn:
            if not os.path.exists(os.path.join(out_dir, filename) + ".png"):
                try:
                    sound = AudioSegment.from_mp3(os.path.join(
                        "./data", "recordings", filename) + ".mp3")
                    sound.export("tmp.wav", format="wav")
                    y, sr = librosa.load(
                        "tmp.wav", offset=2.0, duration=8.0, sr=22050)
                    # extract a fixed-length window:
                    # number of samples per time-step in spectrogram
                    hop_length = 512
                    # number of bins in spectrogram; height of image
                    n_mels = config.cnn_input_size[0]
                    # number of time-steps; width of image
                    time_steps = config.cnn_input_size[1]
                    # starting at the beginning
                    start_sample = 0
                    length_samples = time_steps * hop_length
                    window = y[start_sample:start_sample + length_samples]
                    # use log-melspectrogram (note: the original passed the
                    # full signal y here, leaving window unused; the
                    # fixed-length window is what sizes the image correctly)
                    mels = librosa.feature.melspectrogram(
                        y=window, sr=sr, n_mels=n_mels,
                        n_fft=hop_length * 2, hop_length=hop_length)
                    # add a small number to avoid log(0)
                    mels = np.log(mels + 1e-9)
                    # min-max scale to fit inside the 8-bit range
                    img = scale_minmax(mels, 0, 255).astype(np.uint8)
                    # put low frequencies at the bottom of the image
                    img = np.flip(img, axis=0)
                    # invert: black == more energy
                    img = 255 - img
                    # save as PNG
                    skimage.io.imsave(os.path.join(
                        out_dir, filename) + ".png", img)
                except Exception as e:
                    print(e)
        elif args.rnn:
            if not os.path.exists(os.path.join(out_dir, filename) + ".csv"):
                try:
                    sound = AudioSegment.from_mp3(os.path.join(
                        "./data", "recordings", filename) + ".mp3")
                    sound.export("tmp.wav", format="wav")
                    # signal, sampling rate
                    fs, s = aIO.read_audio_file("tmp.wav")
                    # extract all short-term features using
                    # 50 msec non-overlapping windows
                    duration = len(s) / float(fs)
                    win, step = 0.050, 0.050
                    [f, fn] = aFs.feature_extraction(s, fs, int(fs * win),
                                                     int(fs * step))
                    print(f'{f.shape[1]} frames, '
                          f'{f.shape[0]} short-term features')
                    # Short-term feature vector layout:
                    #   0: zcr               1: energy
                    #   2: energy_entropy    3: spectral_centroid
                    #   4: spectral_spread   5: spectral_entropy
                    #   6: spectral_flux     7: spectral_rolloff
                    #   8-20:  mfcc_1..13
                    #   21-32: chroma_1..12  33: chroma_std
                    #   34-67: deltas of features 0-33
                    features = {fn[i]: f[i] for i in range(len(fn))}
                    ftDf = pd.DataFrame.from_dict(features)
                    ftDf.to_csv(os.path.join(out_dir, filename) + ".csv")
                except Exception as e:
                    print(e)
        else:
            pass
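# A minimal driver sketch for extract() above. The argparse flags (args.ml,
# args.nn, args.rnn) and config values are assumed to be defined at module
# level, as the function body requires; the labels "male"/"female" are
# placeholders matching the df.sex column the function filters on.
if __name__ == "__main__":
    import os
    for sex in ("male", "female"):
        out = os.path.join("data", "features", sex)
        os.makedirs(out, exist_ok=True)
        extract(sex, out)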
def vadFolderWrapperMergedByTh(inputFolder, outFolder, smoothingWindow,
                               weight, model_name, threshold):
    if not os.path.isfile(model_name):
        print("fileClassification: input model_name not found!")
        return
    classifier, mean, std, classes, mid_window, mid_step, short_window, \
        short_step, compute_beat = aT.load_model(model_name)

    types = ('*.wav', '*.mp3')
    wavFilesList = []
    for files in types:
        print(inputFolder + files)
        wavFilesList.extend(glob.glob(inputFolder + files))
    wavFilesList = sorted(wavFilesList)
    if len(wavFilesList) == 0:
        print("No WAV files found!")
        return

    for wavFile in wavFilesList:
        if not os.path.isfile(wavFile):
            raise Exception("Input audio file not found!")
        base = os.path.splitext(os.path.basename(wavFile))[0]
        folder = outFolder + base + '/'
        if not os.path.exists(folder):
            os.makedirs(folder)
        segfile = open(os.path.join(folder, 'segments'), 'w+')
        segfile2 = open(os.path.join(folder, 'segments_details'), 'w+')
        stack = deque()
        [fs, x] = audioBasicIO.read_audio_file(wavFile)
        segmentLimits = aS.silence_removal(x, fs, 0.05, 0.05,
                                           smoothingWindow, weight, False)
        merge = True
        for i, st in enumerate(segmentLimits):
            signal = audioBasicIO.stereo_to_mono(
                x[int(fs * st[0]):int(fs * st[1])])
            if fs == 0:
                continue    # audio file IO problem
            if signal.shape[0] / float(fs) < mid_window:
                mid_window = signal.shape[0] / float(fs)

            # feature extraction:
            mid_features, s, _ = \
                aF.mid_feature_extraction(signal, fs,
                                          mid_window * fs, mid_step * fs,
                                          round(fs * short_window),
                                          round(fs * short_step))
            # long-term averaging of mid-term statistics
            mid_features = mid_features.mean(axis=1)
            if compute_beat:
                beat, beat_conf = aF.beat_extraction(s, short_step)
                mid_features = np.append(mid_features, beat)
                mid_features = np.append(mid_features, beat_conf)
            # normalization
            feature_vector = (mid_features - mean) / std
            class_id = classifier.predict(feature_vector.reshape(1, -1))[0]
            label = classes[int(class_id)]

            if label == 'speech':
                # merge the current speech segment into the previous one,
                # unless the merged segment would exceed the threshold
                if merge:
                    seg_prev = []
                    if len(stack) > 0:
                        seg_prev = stack.pop()
                    if len(seg_prev) > 0 and st[1] - seg_prev[0] > threshold:
                        stack.append(seg_prev)
                        stack.append([st[0], st[1], label])
                    elif len(seg_prev) > 0:
                        stack.append([seg_prev[0], st[1], label])
                    else:
                        stack.append([st[0], st[1], label])
                else:
                    stack.append([st[0], st[1], label])
                merge = True
            else:
                merge = False

        for sn in stack:
            strName = base + "_" + "{:.3f}".format(sn[0]) + "_" + \
                "{:.3f}".format(sn[1])
            if sn[2] == 'speech':
                strOut = folder + strName + ".wav"
                wavfile.write(strOut, fs, x[int(fs * sn[0]):int(fs * sn[1])])
                segfile.write(strName + ' ' + base + ' ' +
                              "{:.3f}".format(sn[0]) + ' ' +
                              "{:.3f}".format(sn[1]) + "\n")
                segfile2.write(strName + ' ' + "{:.3f}".format(sn[0]) + ' ' +
                               "{:.3f}".format(sn[1]) + ' ' + sn[2] + "\n")
        segfile.close()
        segfile2.close()
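# A minimal usage sketch for the VAD wrapper above. The folder names and the
# model path "svm_rbf_sm" are placeholders; any pyAudioAnalysis
# speech/non-speech classifier loadable with aT.load_model() would work.
# With threshold=2.0, consecutive speech segments are merged as long as the
# merged segment stays under two seconds.
if __name__ == "__main__":
    vadFolderWrapperMergedByTh("recordings/", "segments_out/",
                               smoothingWindow=0.5, weight=0.3,
                               model_name="svm_rbf_sm", threshold=2.0)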
# Add types of features
data_dir = "C:/Users/MADHUKAR/Desktop/test/abc/*.wav"
audio_files = glob(data_dir)
for audio_file in audio_files:
    [Fs, x] = audioBasicIO.read_audio_file(audio_file)
    Mono_Signal = audioBasicIO.stereo_to_mono(x)
    print(Fs)
    # short-term features
    [Feature, Feature_Names] = ShortTermFeatures.feature_extraction(
        Mono_Signal, Fs, 0.050 * Fs, 0.025 * Fs, deltas=True)
    # mid-term features
    # mid_feature_extraction(signal, sampling_rate, mid_window, mid_step,
    #                        short_window, short_step)
    [mid_features, short_features, mid_feature_names] = \
        MidTermFeatures.mid_feature_extraction(
            Mono_Signal, Fs, 1.0 * Fs, 0.75 * Fs, 0.050 * Fs, 0.005 * Fs)
    print(Feature_Names)
    print(Feature)
    print(mid_feature_names)
    print(mid_features)
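# A quick shape sanity check on the results above (a sketch; it assumes the
# glob matched at least one WAV file, so the loop variables still exist).
# Short-term: one column per 25 msec step; mid-term: the mean and std of
# every short-term feature, hence twice as many rows as feature names.
assert Feature.shape[0] == len(Feature_Names)
assert mid_features.shape[0] == len(mid_feature_names)
assert len(mid_feature_names) == 2 * len(Feature_Names)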