def featureExtractionDirWrapper(directory, mt_win, mt_step, st_win, st_step):
    if not os.path.isdir(directory):
        raise Exception("Input path not found!")
    aF.mid_feature_extraction_file_dir(directory, mt_win, mt_step,
                                       st_win, st_step, True, True, True)
def check_duration(directory):
    # record the file count before removal; the original snippet used
    # original_len without defining it, so it is captured here
    original_len = len(os.listdir(directory))
    for audio in os.listdir(directory):
        wav_file_path = os.path.join(directory, audio)
        with sf.SoundFile(wav_file_path) as f:
            duration = len(f) / f.samplerate
        if duration < 1:
            os.remove(wav_file_path)
    remaining_len = len(os.listdir(directory))
    print('In total, {} audios have been removed due to short '
          'duration'.format(original_len - remaining_len))


check_duration(pos_path)
check_duration(neg_path)

[mid_term_features_pos, wav_file_list_pos, mid_feature_names] = \
    mF.directory_feature_extraction(pos_path, 0.5, 0.5, 0.05, 0.05,
                                    compute_beat=False)
[mid_term_features_neg, wav_file_list_neg, mid_feature_names] = \
    mF.directory_feature_extraction(neg_path, 0.5, 0.5, 0.05, 0.05,
                                    compute_beat=False)

filenames_pos = []
for file in wav_file_list_pos:
    filenames_pos.append(file.split('/')[-1].split('\\')[-1].split('.')[0])
filenames_neg = []
for file in wav_file_list_neg:
    filenames_neg.append(file.split('/')[-1].split('\\')[-1].split('.')[0])

df_pos = pd.DataFrame(mid_term_features_pos, columns=mid_feature_names)
df_pos['filename'] = filenames_pos
df_pos['label'] = np.ones(len(df_pos))
df_neg = pd.DataFrame(mid_term_features_neg, columns=mid_feature_names)
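# Hedged completion: the snippet stops right after building df_neg. By
# symmetry with the df_pos lines above, the negative class presumably gets
# its filenames and a zero label as well (inferred, not in the original):
df_neg['filename'] = filenames_neg
df_neg['label'] = np.zeros(len(df_neg))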
def speakerDiarization(filename, n_speakers, mt_size=2.0, mt_step=0.2,
                       st_win=0.05, lda_dim=35, plot_res=False):
    """
    ARGUMENTS:
        - filename:        the name of the WAV file to be analyzed
        - n_speakers       the number of speakers (clusters) in
                           the recording (<=0 for unknown)
        - mt_size (opt)    mid-term window size
        - mt_step (opt)    mid-term window step
        - st_win  (opt)    short-term window size
        - lda_dim (opt)    LDA dimension (0 for no LDA)
        - plot_res (opt)   0 for not plotting the results 1 for plotting
    """
    [fs, x] = audioBasicIO.read_audio_file(filename)
    x = audioBasicIO.stereo_to_mono(x)
    duration = len(x) / fs

    [classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1,
     stWin1, stStep1, computeBEAT1] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     "data/models", "knn_speaker_10"))
    [classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2,
     stWin2, stStep2, computeBEAT2] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     "data/models", "knn_speaker_male_female"))

    [mt_feats, st_feats, _] = aF.mid_feature_extraction(
        x, fs, mt_size * fs, mt_step * fs,
        round(fs * st_win), round(fs * st_win * 0.5))

    MidTermFeatures2 = np.zeros((mt_feats.shape[0] + len(classNames1) +
                                 len(classNames2), mt_feats.shape[1]))

    for i in range(mt_feats.shape[1]):
        cur_f1 = (mt_feats[:, i] - MEAN1) / STD1
        cur_f2 = (mt_feats[:, i] - MEAN2) / STD2
        [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
        [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
        MidTermFeatures2[0:mt_feats.shape[0], i] = mt_feats[:, i]
        MidTermFeatures2[mt_feats.shape[0]:
                         mt_feats.shape[0] + len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[mt_feats.shape[0] + len(classNames1)::, i] = \
            P2 + 0.0001

    mt_feats = MidTermFeatures2  # TODO
    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                       41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]
    mt_feats = mt_feats[iFeaturesSelect, :]

    (mt_feats_norm, MEAN, STD) = aT.normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    n_wins = mt_feats.shape[1]

    # remove outliers:
    dist_all = np.sum(distance.squareform(distance.pdist(mt_feats_norm.T)),
                      axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.2 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    # EnergyMin = np.min(mt_feats[1, :])
    # EnergyMean = np.mean(mt_feats[1, :])
    # Thres = (1.5 * EnergyMin + 0.5 * EnergyMean) / 2.0
    # i_non_outliers = np.nonzero(mt_feats[1, :] > Thres)[0]
    # print(i_non_outliers)

    perOutLier = (100.0 * (n_wins - i_non_outliers.shape[0])) / n_wins
    mt_feats_norm_or = mt_feats_norm
    mt_feats_norm = mt_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        # [mt_feats_to_red, _, _] = aF.mtFeatureExtraction(
        #     x, fs, mt_size * fs, st_win * fs,
        #     round(fs * st_win), round(fs * st_win))
        # extract mid-term features with minimum step:
        mt_win_ratio = int(round(mt_size / st_win))
        mt_step_ratio = int(round(st_win / st_win))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        # for i in range(num_of_stats * num_of_features + 1):
        for i in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        # for each of the short-term features:
        for i in range(num_of_features):
            curPos = 0
            N = len(st_feats[i])
            while curPos < N:
                N1 = curPos
                N2 = curPos + mt_win_ratio
                if N2 > N:
                    N2 = N
                curStFeatures = st_feats[i][N1:N2]
                mt_feats_to_red[i].append(np.mean(curStFeatures))
                mt_feats_to_red[i + num_of_features].append(
                    np.std(curStFeatures))
                curPos += mt_step_ratio
        mt_feats_to_red = np.array(mt_feats_to_red)

        mt_feats_to_red_2 = np.zeros((mt_feats_to_red.shape[0] +
                                      len(classNames1) + len(classNames2),
                                      mt_feats_to_red.shape[1]))
        for i in range(mt_feats_to_red.shape[1]):
            cur_f1 = (mt_feats_to_red[:, i] - MEAN1) / STD1
            cur_f2 = (mt_feats_to_red[:, i] - MEAN2) / STD2
            [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
            [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0], i] = \
                mt_feats_to_red[:, i]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:
                              mt_feats_to_red.shape[0] + len(classNames1),
                              i] = P1 + 0.0001
            mt_feats_to_red_2[mt_feats_to_red.shape[0] + len(classNames1)::,
                              i] = P2 + 0.0001
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[iFeaturesSelect, :]
        # mt_feats_to_red += np.random.rand(mt_feats_to_red.shape[0],
        #                                   mt_feats_to_red.shape[1]) * 0.0000010
        (mt_feats_to_red, MEAN, STD) = aT.normalizeFeatures(
            [mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        # dist_all = np.sum(distance.squareform(
        #     distance.pdist(mt_feats_to_red.T)), axis=0)
        # m_dist_all = np.mean(dist_all)
        # iNonOutLiers2 = np.nonzero(dist_all < 3.0 * m_dist_all)[0]
        # mt_feats_to_red = mt_feats_to_red[:, iNonOutLiers2]
        Labels = np.zeros((mt_feats_to_red.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / st_win
        # print(LDAstep, LDAstepRatio)
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * st_win / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, Labels)
        mt_feats_norm = (clf.transform(mt_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    clsAll = []
    sil_all = []
    centersAll = []

    for iSpeakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(mt_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_
        # Y = distance.squareform(distance.pdist(mt_feats_norm.T))
        clsAll.append(cls)
        centersAll.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(iSpeakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / \
                float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mt_feats_norm[:, cls == c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                Yt = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(np.mean(Yt) * clust_per_cent)
                silBs = []
                for c2 in range(iSpeakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = np.nonzero(cls == c2)[0].shape[0] / \
                            float(len(cls))
                        MidTermFeaturesNormTemp2 = mt_feats_norm[:, cls == c2]
                        Yt = distance.cdist(mt_feats_norm_temp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(np.mean(Yt) * (clust_per_cent +
                                                    clust_per_cent_2) / 2.0)
                silBs = np.array(silBs)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(silBs))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        sil = []
        for c in range(iSpeakers):
            # for each cluster (speaker) compute silhouette
            sil.append((sil_2[c] - sil_1[c]) /
                       (max(sil_2[c], sil_1[c]) + 0.00001))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(np.mean(sil))

    imax = np.argmax(sil_all)
    # optimal number of clusters
    nSpeakersFinal = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = np.zeros((n_wins,))
    for i in range(n_wins):
        j = np.argmin(np.abs(i - i_non_outliers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):
        # hmm training
        start_prob, transmat, means, cov = \
            trainHMM_computeStatistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = sil_all[imax]
    class_names = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]

    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if ground truth exists
    if os.path.isfile(gt_file):
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start, seg_end,
                                              seg_labs, mt_step)

    if plot_res:
        fig = plt.figure()
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(np.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(np.array(range(len(cls))) * mt_step + mt_step / 2.0, cls)

    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(np.array(range(len(flags_gt))) *
                     mt_step + mt_step / 2.0, flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
            evaluateSpeakerDiarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(
                          100 * purity_cluster_m, 100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        # print(s_range, sil_all)
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's silhouette")
        plt.show()
    return cls
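if __name__ == '__main__':
    # Usage sketch for speakerDiarization() above; "dialogue.wav" is an
    # illustrative file name (assumption, not from the original code).
    # cls holds one speaker id per mid-term window, mt_step seconds apart.
    cls = speakerDiarization("dialogue.wav", n_speakers=2, plot_res=True)
    print(cls)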
def extract_features_and_train(paths, mid_window, mid_step, short_window,
                               short_step, classifier_type, model_name,
                               compute_beat=False, train_percentage=0.90):
    """
    This function is used as a wrapper to segment-based audio feature
    extraction and classifier training.
    ARGUMENTS:
        paths:                      list of paths of directories. Each
                                    directory contains a single audio class
                                    whose samples are stored in separate
                                    WAV files.
        mid_window, mid_step:       mid-term window length and step
        short_window, short_step:   short-term window and step
        classifier_type:            "svm" or "knn" or "randomforest" or
                                    "gradientboosting" or "extratrees"
        model_name:                 name of the model to be saved
    RETURNS:
        None. Resulting classifier along with the respective model
        parameters are saved on files.
    """
    # STEP A: Feature Extraction:
    features, class_names, _ = \
        aF.multiple_directory_feature_extraction(paths, mid_window, mid_step,
                                                 short_window, short_step,
                                                 compute_beat=compute_beat)
    if len(features) == 0:
        print("trainSVM_feature ERROR: No data found in any input folder!")
        return

    n_feats = features[0].shape[1]
    feature_names = ["features" + str(d + 1) for d in range(n_feats)]
    write_train_data_arff(model_name, features, class_names, feature_names)

    for i, feat in enumerate(features):
        if len(feat) == 0:
            print("trainSVM_feature ERROR: " + paths[i] +
                  " folder is empty or non-existing!")
            return

    # STEP B: classifier Evaluation and Parameter Selection:
    if classifier_type == "svm" or classifier_type == "svm_rbf":
        classifier_par = np.array([0.001, 0.01, 0.5, 1.0, 5.0, 10.0, 20.0])
    elif classifier_type == "randomforest":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])
    elif classifier_type == "knn":
        classifier_par = np.array([1, 3, 5, 7, 9, 11, 13, 15])
    elif classifier_type == "gradientboosting":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])
    elif classifier_type == "extratrees":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])

    # get optimal classifier parameter:
    temp_features = []
    for feat in features:
        temp = []
        for i in range(feat.shape[0]):
            temp_fv = feat[i, :]
            if (not np.isnan(temp_fv).any()) and \
                    (not np.isinf(temp_fv).any()):
                temp.append(temp_fv.tolist())
            else:
                print("NaN Found! Feature vector not used for training")
        temp_features.append(np.array(temp))
    features = temp_features

    best_param = evaluate_classifier(features, class_names, 100,
                                     classifier_type, classifier_par, 0,
                                     train_percentage)

    print("Selected params: {0:.5f}".format(best_param))

    features_norm, mean, std = normalize_features(features)
    mean = mean.tolist()
    std = std.tolist()

    # STEP C: Save the classifier to file
    if classifier_type == "svm":
        classifier = train_svm(features_norm, best_param)
    elif classifier_type == "svm_rbf":
        classifier = train_svm(features_norm, best_param, kernel='rbf')
    elif classifier_type == "randomforest":
        classifier = train_random_forest(features_norm, best_param)
    elif classifier_type == "gradientboosting":
        classifier = train_gradient_boosting(features_norm, best_param)
    elif classifier_type == "extratrees":
        classifier = train_extra_trees(features_norm, best_param)

    if classifier_type == "knn":
        feature_matrix, labels = features_to_matrix(features_norm)
        feature_matrix = feature_matrix.tolist()
        labels = labels.tolist()
        save_path = model_name
        save_parameters(save_path, feature_matrix, labels, mean, std,
                        class_names, best_param, mid_window, mid_step,
                        short_window, short_step, compute_beat)
    elif classifier_type == "svm" or classifier_type == "svm_rbf" or \
            classifier_type == "randomforest" or \
            classifier_type == "gradientboosting" or \
            classifier_type == "extratrees":
        with open(model_name, 'wb') as fid:
            cPickle.dump(classifier, fid)
        save_path = model_name + "MEANS"
        save_parameters(save_path, mean, std, class_names,
                        mid_window, mid_step, short_window, short_step,
                        compute_beat)
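if __name__ == '__main__':
    # Usage sketch for the wrapper above; the directory names and model
    # file name are illustrative assumptions, not part of the original code.
    extract_features_and_train(["data/speech", "data/music"],
                               1.0, 1.0, 0.05, 0.05,
                               "svm", "svm_speech_music")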
def record_audio(block_size, fs=8000, show_spec=False, show_chroma=False,
                 log_sounds=False, logs_all=False):
    # initialize recording process
    mid_buf_size = int(fs * block_size)
    pa = pyaudio.PyAudio()
    stream = pa.open(format=FORMAT, channels=1, rate=fs, input=True,
                     frames_per_buffer=mid_buf_size)
    mid_buf = []
    count = 0
    global all_data
    global outstr
    all_data = []
    # initialize counters etc.
    time_start = time.time()
    outstr = datetime.datetime.now().strftime("%Y_%m_%d_%I:%M%p")
    out_folder = outstr + "_segments"
    if log_sounds:
        if not os.path.exists(out_folder):
            os.makedirs(out_folder)

    # load segment model
    [classifier, MEAN, STD, class_names, mt_win, mt_step,
     st_win, st_step, _] = aT.load_model("model")

    while 1:
        try:
            block = stream.read(mid_buf_size)
            count_b = len(block) // 2
            fmt = "%dh" % count_b
            shorts = struct.unpack(fmt, block)
            cur_win = list(shorts)
            mid_buf = mid_buf + cur_win
            del cur_win

            # time since recording started:
            e_time = (time.time() - time_start)
            # data-driven time
            data_time = (count + 1) * block_size
            x = numpy.int16(mid_buf)
            seg_len = len(x)

            # extract features:
            # We are using the signal length as mid term window and step,
            # in order to guarantee a mid-term feature sequence of len 1
            [mt_feats, _, _] = mF.mid_feature_extraction(
                x, fs, seg_len, seg_len,
                round(fs * st_win), round(fs * st_step))
            cur_fv = (mt_feats[:, 0] - MEAN) / STD
            # classify vector:
            [res, prob] = aT.classifier_wrapper(classifier, "svm_rbf", cur_fv)
            win_class = class_names[int(res)]
            win_prob = prob[int(res)]

            if logs_all:
                all_data += mid_buf
            mid_buf = numpy.double(mid_buf)

            # Compute spectrogram
            if show_spec:
                (spec, t_axis, freq_axis_s) = sF.spectrogram(
                    mid_buf, fs, 0.050 * fs, 0.050 * fs)
                freq_axis_s = numpy.array(freq_axis_s)  # frequency axis
                # most dominant frequencies (for each short-term window):
                dominant_freqs = freq_axis_s[numpy.argmax(spec, axis=1)]
                # get average most dominant freq
                max_freq = numpy.mean(dominant_freqs)
                max_freq_std = numpy.std(dominant_freqs)

            # Compute chromagram
            if show_chroma:
                (chrom, TimeAxisC, freq_axis_c) = sF.chromagram(
                    mid_buf, fs, 0.050 * fs, 0.050 * fs)
                freq_axis_c = numpy.array(freq_axis_c)
                # most dominant chroma classes:
                dominant_freqs_c = freq_axis_c[numpy.argmax(chrom, axis=1)]
                # get most common among all short-term windows
                max_freqC = most_common(dominant_freqs_c)[0]

            # Plot signal window
            signalPlotCV = plotCV(
                scipy.signal.resample(mid_buf + 16000, plot_w),
                plot_w, plot_h, 32000)
            cv2.imshow('Signal', signalPlotCV)
            cv2.moveWindow('Signal', 50, status_h + 50)

            # Show spectrogram
            if show_spec:
                i_spec = numpy.array(spec.T * 255, dtype=numpy.uint8)
                i_spec2 = cv2.resize(i_spec, (plot_w, plot_h),
                                     interpolation=cv2.INTER_CUBIC)
                i_spec2 = cv2.applyColorMap(i_spec2, cv2.COLORMAP_JET)
                cv2.putText(i_spec2, "max_freq: %.0f Hz" % max_freq, (0, 11),
                            cv2.FONT_HERSHEY_PLAIN, 1, (200, 200, 200))
                cv2.imshow('Spectrogram', i_spec2)
                cv2.moveWindow('Spectrogram', 50, plot_h + status_h + 60)

            # Show chromagram
            if show_chroma:
                i_chroma = numpy.array((chrom.T / chrom.max()) * 255,
                                       dtype=numpy.uint8)
                i_chroma2 = cv2.resize(i_chroma, (plot_w, plot_h),
                                       interpolation=cv2.INTER_CUBIC)
                i_chroma2 = cv2.applyColorMap(i_chroma2, cv2.COLORMAP_JET)
                cv2.putText(i_chroma2, "max_freqC: %s" % max_freqC, (0, 11),
                            cv2.FONT_HERSHEY_PLAIN, 1, (200, 200, 200))
                cv2.imshow('Chroma', i_chroma2)
                cv2.moveWindow('Chroma', 50, 2 * plot_h + status_h + 60)

            # Activity Detection:
            print("{0:.2f}\t{1:s}\t{2:.2f}".format(e_time, win_class,
                                                   win_prob))

            if log_sounds:
                # TODO: log audio files
                out_file = os.path.join(
                    out_folder,
                    "{0:.2f}_".format(e_time).zfill(8) + win_class + ".wav")
                # shutil.copyfile("temp.wav", out_file)
                wavfile.write(out_file, fs, x)

            textIm = numpy.zeros((status_h, plot_w, 3))
            statusStrTime = "time: %.1f sec" % e_time + \
                            " - data time: %.1f sec" % data_time + \
                            " - loss : %.1f sec" % (e_time - data_time)
            cv2.putText(textIm, statusStrTime, (0, 11),
                        cv2.FONT_HERSHEY_PLAIN, 1, (200, 200, 200))
            cv2.putText(textIm, win_class, (0, 33),
                        cv2.FONT_HERSHEY_PLAIN, 1, (0, 0, 255))
            cv2.imshow("Status", textIm)
            cv2.moveWindow("Status", 50, 0)
            mid_buf = []
            ch = cv2.waitKey(10)
            count += 1
        except IOError:
            print("Error recording")
# Python 3 does not allow raising bare strings, so proper exceptions are
# raised below; the first condition is reconstructed by symmetry with the
# ground-truth check, and the second ground-truth check now actually tests
# parse.groundtruth instead of re-testing parse.audio.
if parse.audio is None:
    raise Exception('Input directory is Empty')
if not os.path.isdir(parse.audio):
    raise Exception('Input path is not a directory')
if parse.groundtruth is None:
    raise Exception('Ground truth directory is Empty')
if not os.path.isdir(parse.groundtruth):
    raise Exception('Ground truth path is not a directory')

files, labels = read_data(parse.audio, parse.groundtruth)
one_hot = MultiLabelBinarizer()
labels = one_hot.fit_transform(labels)
class_names = [str(c) for c in one_hot.classes_]

mid_window, mid_step, short_window, short_step = 1, 1, 0.1, 0.1
f, fn, feature_names = mF.directory_feature_extraction(
    parse.audio, mid_window, mid_step, short_window, short_step)

x_train, y_train, x_test, y_test = split_data(f, labels, fn)
x_sub, y_sub = get_minority_instace(pd.DataFrame(x_train),
                                    pd.DataFrame(y_train))
x_res, y_res = MLSMOTE(x_sub, y_sub, parse.resampled)
print("Resampled")
x = pd.concat([pd.DataFrame(x_train), x_res], ignore_index=True)
y = pd.concat([pd.DataFrame(y_train), y_res], ignore_index=True)
print('Synthetic data have been added to the train set')
class_names = [str(c) for c in y.columns]

print("LinearSVC Classifier")
classifier = OneVsRestClassifier(LinearSVC(max_iter=50000,
                                           class_weight='balanced'),
                                 n_jobs=-1)
classifier.fit(x, y)
def audio_features_extraction(dir_name="../data", mt_win=1.0, mt_step=1.0,
                              st_win=0.050, st_step=0.050,
                              features_audio_file='Audio2Features.pkl'):
    audio_dir = dir_name + '/' + 'audio'
    # first, extract audio from video
    v2a.video2audio(dir_name)
    features = []
    file_names = []
    mid_term_features = np.array([])
    process_times = []
    # type is WAVE file, converted using the function video_to_audio.py
    suffix = ".wav"
    index_df = pd.read_csv(dir_name + '/' + 'index.csv', sep=';')
    wav_file_list, mid_feature_names = [], []

    # iterate over each audio file
    print('Extracting features from audio files...')
    bar = progressbar.ProgressBar(
        maxval=len(index_df),
        widgets=[progressbar.Bar('=', '[', ']'), ' ',
                 progressbar.Percentage()])
    bar.start()
    bar_index = 0
    for ind in index_df.index:
        name = index_df['FILE'][ind]
        seg = str(index_df['SEG'][ind])
        file_path = audio_dir + '/' + name + '/' + seg + suffix
        # print("Analyzing file {0:d} of {1:d}: {2:s}".format(
        #     ind + 1, len(index_df), file_path))
        if os.stat(file_path).st_size == 0:
            logging.warning("WARNING: EMPTY FILE -- SKIPPING")
            continue
        [sampling_rate, signal] = audioBasicIO.read_audio_file(file_path)
        if sampling_rate == 0:
            logging.warning("WARNING: NO SAMPLING RATE -- SKIPPING")
            continue
        # time.clock() was removed in Python 3.8; use perf_counter() instead
        t1 = time.perf_counter()
        signal = audioBasicIO.stereo_to_mono(signal)
        if signal.shape[0] < float(sampling_rate) / 5:
            logging.warning("WARNING: AUDIO FILE TOO SMALL -- SKIPPING")
            continue
        wav_file_list.append(file_path)
        mid_features, _, mid_feature_names = \
            aF.mid_feature_extraction(signal, sampling_rate,
                                      round(mt_win * sampling_rate),
                                      round(mt_step * sampling_rate),
                                      round(st_win * sampling_rate),
                                      round(st_step * sampling_rate))
        mid_features = np.transpose(mid_features)
        # long term averaging of mid-term statistics
        mid_features = mid_features.mean(axis=0)
        if (not np.isnan(mid_features).any()) and \
                (not np.isinf(mid_features).any()):
            if len(mid_term_features) == 0:
                # append feature vector
                mid_term_features = mid_features
            else:
                mid_term_features = np.vstack((mid_term_features,
                                               mid_features))
        t2 = time.perf_counter()
        duration = float(len(signal)) / sampling_rate
        process_times.append((t2 - t1) / duration)
        # update progress bar index
        bar_index += 1
        bar.update(bar_index)
    bar.finish()

    if len(process_times) > 0:
        print("Audio feature extraction completed. Complexity ratio: "
              "{0:.1f} x realtime".format(
                  1.0 / np.mean(np.array(process_times))))
    print('Shape: ' + str(mid_term_features.shape))

    ftr_df = pd.DataFrame(data=mid_term_features)
    df = index_df.copy()
    df = pd.concat([df, ftr_df], axis=1)
    df.to_pickle(dir_name + '/' + features_audio_file)
    return mid_term_features, wav_file_list, mid_feature_names
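if __name__ == '__main__':
    # Usage sketch; assumes ../data contains an index.csv (with FILE and
    # SEG columns) and an audio/ sub-folder as read by the function above.
    # The directory layout here is illustrative, not part of the original.
    feats, wav_files, names = audio_features_extraction(dir_name="../data")
    print(feats.shape, len(wav_files), len(names))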
def extract_features_and_train(paths, mid_window, mid_step, short_window,
                               short_step, classifier_type, model_name,
                               compute_beat=False, train_percentage=0.90,
                               dict_of_ids=None, use_smote=False):
    """
    This function is used as a wrapper to segment-based audio feature
    extraction and classifier training.
    ARGUMENTS:
        paths:                      list of paths of directories. Each
                                    directory contains a single audio class
                                    whose samples are stored in separate
                                    WAV files.
        mid_window, mid_step:       mid-term window length and step
        short_window, short_step:   short-term window and step
        classifier_type:            "svm" or "knn" or "randomforest" or
                                    "gradientboosting" or "extratrees"
        model_name:                 name of the model to be saved
        dict_of_ids:                a dictionary which has as keys the full
                                    path of audio files and as values the
                                    respective group ids
    RETURNS:
        None. Resulting classifier along with the respective model
        parameters are saved on files.
    """
    # STEP A: Feature Extraction:
    features, class_names, file_names = \
        aF.multiple_directory_feature_extraction(paths, mid_window, mid_step,
                                                 short_window, short_step,
                                                 compute_beat=compute_beat)
    file_names = [item for sublist in file_names for item in sublist]
    if dict_of_ids:
        list_of_ids = [dict_of_ids[file] for file in file_names]
    else:
        list_of_ids = None
    if len(features) == 0:
        print("trainSVM_feature ERROR: No data found in any input folder!")
        return

    n_feats = features[0].shape[1]
    feature_names = ["features" + str(d + 1) for d in range(n_feats)]

    for i, feat in enumerate(features):
        if len(feat) == 0:
            print("trainSVM_feature ERROR: " + paths[i] +
                  " folder is empty or non-existing!")
            return

    # STEP B: classifier Evaluation and Parameter Selection:
    if classifier_type == "svm" or classifier_type == "svm_rbf":
        classifier_par = np.array([0.001, 0.01, 0.5, 1.0, 5.0, 10.0, 20.0])
    elif classifier_type == "randomforest":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])
    elif classifier_type == "knn":
        classifier_par = np.array([1, 3, 5, 7, 9, 11, 13, 15])
    elif classifier_type == "gradientboosting":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])
    elif classifier_type == "extratrees":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])

    # get optimal classifier parameter:
    temp_features = []
    for feat in features:
        temp = []
        for i in range(feat.shape[0]):
            temp_fv = feat[i, :]
            if (not np.isnan(temp_fv).any()) and \
                    (not np.isinf(temp_fv).any()):
                temp.append(temp_fv.tolist())
            else:
                print("NaN Found! Feature vector not used for training")
        temp_features.append(np.array(temp))
    features = temp_features

    best_param = evaluate_classifier(features, class_names, classifier_type,
                                     classifier_par, 1, list_of_ids,
                                     n_exp=-1,
                                     train_percentage=train_percentage,
                                     smote=use_smote)

    print("Selected params: {0:.5f}".format(best_param))

    # STEP C: Train and Save the classifier to file
    # Get features in the X, y format:
    features, labels = features_to_matrix(features)

    # Apply smote if necessary:
    if use_smote:
        sm = SMOTE(random_state=2)
        features, labels = sm.fit_resample(features, labels)

    # Use mean/std standard feature scaling:
    scaler = StandardScaler()
    features = scaler.fit_transform(features)
    mean = scaler.mean_.tolist()
    std = scaler.scale_.tolist()

    # Then train the final classifier
    if classifier_type == "svm":
        classifier = train_svm(features, labels, best_param)
    elif classifier_type == "svm_rbf":
        classifier = train_svm(features, labels, best_param, kernel='rbf')
    elif classifier_type == "randomforest":
        classifier = train_random_forest(features, labels, best_param)
    elif classifier_type == "gradientboosting":
        classifier = train_gradient_boosting(features, labels, best_param)
    elif classifier_type == "extratrees":
        classifier = train_extra_trees(features, labels, best_param)

    # And save the model to a file, along with
    # - the scaling -mean/std- vectors
    # - the feature extraction parameters
    if classifier_type == "knn":
        feature_matrix = features.tolist()
        labels = labels.tolist()
        save_path = model_name
        save_parameters(save_path, feature_matrix, labels, mean, std,
                        class_names, best_param, mid_window, mid_step,
                        short_window, short_step, compute_beat)
    elif classifier_type == "svm" or classifier_type == "svm_rbf" or \
            classifier_type == "randomforest" or \
            classifier_type == "gradientboosting" or \
            classifier_type == "extratrees":
        with open(model_name, 'wb') as fid:
            cPickle.dump(classifier, fid)
        save_path = model_name + "MEANS"
        save_parameters(save_path, mean, std, class_names, mid_window,
                        mid_step, short_window, short_step, compute_beat)
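if __name__ == '__main__':
    # Usage sketch for the group-aware variant above: dict_of_ids maps each
    # WAV path to a group id so that evaluate_classifier can split train and
    # test sets by group. All paths and names here are illustrative
    # assumptions, not part of the original code.
    ids = {"data/pos/a.wav": 0, "data/pos/b.wav": 0,
           "data/neg/c.wav": 1, "data/neg/d.wav": 1}
    extract_features_and_train(["data/pos", "data/neg"],
                               1.0, 1.0, 0.05, 0.05,
                               "svm_rbf", "svm_pos_neg",
                               dict_of_ids=ids, use_smote=True)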
def create_feature_from_audio(filename):
    import ctypes
    import numpy as np
    import pyogg

    # https://github.com/Zuzu-Typ/PyOgg/issues/19
    file = pyogg.OpusFile(filename)
    # the buffer holds 16-bit samples, hence the division by 2
    target_datatype = ctypes.c_short * (file.buffer_length // 2)
    buffer_as_array = ctypes.cast(file.buffer,
                                  ctypes.POINTER(target_datatype)).contents
    if file.channels == 1:
        wav = np.array(buffer_as_array)
    elif file.channels == 2:
        # de-interleave the left/right channels of the stereo buffer
        wav = np.array((buffer_as_array[0::2], buffer_as_array[1::2]))
    else:
        raise NotImplementedError()

    # This is the final numpy array
    signal = np.transpose(wav)
    sampling_rate = 48000
    print(np.shape(wav))
    # plt.figure()
    # plt.title("Signal Wave...")
    # plt.plot(signal)
    # plt.show()

    # Calculating features from the decoded signal
    from pyAudioAnalysis import MidTermFeatures as mF
    from pyAudioAnalysis import audioBasicIO
    mid_window = round(0.1 * sampling_rate)
    mid_step = round(0.1 * sampling_rate)
    short_window = round(sampling_rate * 0.01)
    short_step = round(sampling_rate * 0.01)
    signal = audioBasicIO.stereo_to_mono(signal)
    print(type(signal))
    # print(np.shape(signal))
    # librosa requires float input, so cast away from int16
    signal = signal.astype('float64')
    [mid_features, short_features, mid_feature_names] = \
        mF.mid_feature_extraction(signal, sampling_rate, mid_window,
                                  mid_step, short_window, short_step)
    mid_features = np.transpose(mid_features)
    mid_term_features = mid_features.mean(axis=0)
    mid_term_features = np.reshape(mid_term_features, (-1, 1))
    mid_term_features = np.transpose(mid_term_features)
    # print(np.shape(mid_term_features))
    # len(mid_feature_names)

    # Getting the classification result with Cough=0, No_Cough=1
    from joblib import load
    from sklearn import preprocessing
    cough_classifier = load('Cough_NoCough_classifier.joblib')
    features = preprocessing.StandardScaler().fit_transform(mid_term_features)
    prediction = cough_classifier.predict(features)  # cough=0, no_cough=1
    return prediction, mid_term_features
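if __name__ == '__main__':
    # Usage sketch; "cough_sample.opus" is an illustrative file name.
    # The joblib model path is the one hard-coded in the function above.
    prediction, feats = create_feature_from_audio("cough_sample.opus")
    print("cough" if prediction[0] == 0 else "no cough")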
def mid_term_file_classification(input_file, model_name, model_type,
                                 plot_results=False, gt_file=""):
    """
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used,
    i.e. a pre-trained classifier.
    ARGUMENTS:
        - input_file:    path of the input WAV file
        - model_name:    name of the classification model
        - model_type:    svm or knn depending on the classifier type
        - plot_results:  True if results are to be plotted using
                         matplotlib along with a set of statistics
    RETURNS:
        - segs:     a sequence of segment's endpoints: segs[i] is the
                    endpoint of the i-th segment (in seconds)
        - classes:  a sequence of class flags: class[i] is the class ID of
                    the i-th segment
    """
    labels = []
    accuracy = 0.0
    class_names = []
    cm = np.array([])
    if not os.path.isfile(model_name):
        print("mtFileClassificationError: input model_type not found!")
        return labels, class_names, accuracy, cm

    # Load classifier:
    if model_type == "knn":
        classifier, mean, std, class_names, mt_win, mid_step, st_win, \
            st_step, compute_beat = at.load_model_knn(model_name)
    else:
        classifier, mean, std, class_names, mt_win, mid_step, st_win, \
            st_step, compute_beat = at.load_model(model_name)
    if compute_beat:
        print("Model " + model_name + " contains long-term music features "
              "(beat etc) and cannot be used in segmentation")
        return labels, class_names, accuracy, cm

    # load input file
    sampling_rate, signal = audioBasicIO.read_audio_file(input_file)

    # could not read file
    if sampling_rate == 0:
        return labels, class_names, accuracy, cm

    # convert stereo (if) to mono
    signal = audioBasicIO.stereo_to_mono(signal)

    # mid-term feature extraction:
    mt_feats, _, _ = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mt_win * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * st_win),
                                   round(sampling_rate * st_step))
    posterior_matrix = []

    # for each feature vector (i.e. for each fix-sized segment):
    for col_index in range(mt_feats.shape[1]):
        # normalize current feature vector
        feature_vector = (mt_feats[:, col_index] - mean) / std

        # classify vector:
        label_predicted, posterior = \
            at.classifier_wrapper(classifier, model_type, feature_vector)
        labels.append(label_predicted)

        # update probability matrix
        posterior_matrix.append(np.max(posterior))
    labels = np.array(labels)

    # convert fix-sized flags to segments and classes
    segs, classes = labels_to_segments(labels, mid_step)
    for i in range(len(segs)):
        print(segs[i], classes[i])
    segs[-1] = len(signal) / float(sampling_rate)

    # Load ground-truth:
    labels_gt, class_names_gt, accuracy, cm = \
        load_ground_truth(gt_file, labels, class_names, mid_step,
                          plot_results)

    return labels, class_names, accuracy, cm
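if __name__ == '__main__':
    # Usage sketch; the WAV file and model names are illustrative
    # assumptions (the model must have been trained and saved beforehand).
    labels, class_names, acc, cm = mid_term_file_classification(
        "recording.wav", "svm_speech_music", "svm", plot_results=False)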
def speaker_diarization(filename, n_speakers, mid_window=1.0, mid_step=0.1,
                        short_window=0.1, lda_dim=0, plot_res=False):
    """
    ARGUMENTS:
        - filename:           the name of the WAV file to be analyzed
        - n_speakers          the number of speakers (clusters) in
                              the recording (<=0 for unknown)
        - mid_window (opt)    mid-term window size
        - mid_step (opt)      mid-term window step
        - short_window (opt)  short-term window size
        - lda_dim (opt)       LDA dimension (0 for no LDA)
        - plot_res (opt)      0 for not plotting the results 1 for plotting
    """
    sampling_rate, signal = audioBasicIO.read_audio_file(filename)
    signal = audioBasicIO.stereo_to_mono(signal)
    duration = len(signal) / sampling_rate

    base_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "data/models")
    classifier_all, mean_all, std_all, class_names_all, _, _, _, _, _ = \
        at.load_model(os.path.join(base_dir, "svm_rbf_speaker_10"))
    classifier_fm, mean_fm, std_fm, class_names_fm, _, _, _, _, _ = \
        at.load_model(os.path.join(base_dir, "svm_rbf_speaker_male_female"))

    mid_feats, st_feats, a = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mid_window * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * 0.05),
                                   round(sampling_rate * 0.05))

    mid_term_features = np.zeros((mid_feats.shape[0] + len(class_names_all) +
                                  len(class_names_fm), mid_feats.shape[1]))

    for index in range(mid_feats.shape[1]):
        feature_norm_all = (mid_feats[:, index] - mean_all) / std_all
        feature_norm_fm = (mid_feats[:, index] - mean_fm) / std_fm
        _, p1 = at.classifier_wrapper(classifier_all, "svm_rbf",
                                      feature_norm_all)
        _, p2 = at.classifier_wrapper(classifier_fm, "svm_rbf",
                                      feature_norm_fm)
        start = mid_feats.shape[0]
        end = mid_feats.shape[0] + len(class_names_all)
        mid_term_features[0:mid_feats.shape[0], index] = mid_feats[:, index]
        mid_term_features[start:end, index] = p1 + 1e-4
        mid_term_features[end::, index] = p2 + 1e-4

    # normalize features:
    scaler = StandardScaler()
    mid_feats_norm = scaler.fit_transform(mid_term_features.T)

    # remove outliers:
    dist_all = np.sum(distance.squareform(distance.pdist(mid_feats_norm.T)),
                      axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.1 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    # EnergyMin = np.min(mt_feats[1, :])
    # EnergyMean = np.mean(mt_feats[1, :])
    # Thres = (1.5 * EnergyMin + 0.5 * EnergyMean) / 2.0
    # i_non_outliers = np.nonzero(mt_feats[1, :] > Thres)[0]
    # print(i_non_outliers)

    mt_feats_norm_or = mid_feats_norm
    mid_feats_norm = mid_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        # extract mid-term features with minimum step:
        window_ratio = int(round(mid_window / short_window))
        step_ratio = int(round(short_window / short_window))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        for index in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        # for each of the short-term features:
        for index in range(num_of_features):
            cur_pos = 0
            feat_len = len(st_feats[index])
            while cur_pos < feat_len:
                n1 = cur_pos
                n2 = cur_pos + window_ratio
                if n2 > feat_len:
                    n2 = feat_len
                short_features = st_feats[index][n1:n2]
                mt_feats_to_red[index].append(np.mean(short_features))
                mt_feats_to_red[index + num_of_features].\
                    append(np.std(short_features))
                cur_pos += step_ratio
        mt_feats_to_red = np.array(mt_feats_to_red)

        mt_feats_to_red_2 = np.zeros((mt_feats_to_red.shape[0] +
                                      len(class_names_all) +
                                      len(class_names_fm),
                                      mt_feats_to_red.shape[1]))
        limit = mt_feats_to_red.shape[0] + len(class_names_all)
        for index in range(mt_feats_to_red.shape[1]):
            feature_norm_all = (mt_feats_to_red[:, index] - mean_all) / \
                std_all
            feature_norm_fm = (mt_feats_to_red[:, index] - mean_fm) / std_fm
            _, p1 = at.classifier_wrapper(classifier_all, "svm_rbf",
                                          feature_norm_all)
            _, p2 = at.classifier_wrapper(classifier_fm, "svm_rbf",
                                          feature_norm_fm)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0], index] = \
                mt_feats_to_red[:, index]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:limit, index] = \
                p1 + 1e-4
            mt_feats_to_red_2[limit::, index] = p2 + 1e-4
        mt_feats_to_red = mt_feats_to_red_2

        scaler = StandardScaler()
        mt_feats_to_red = scaler.fit_transform(mt_feats_to_red.T).T

        labels = np.zeros((mt_feats_to_red.shape[1], ))
        lda_step = 1.0
        lda_step_ratio = lda_step / short_window
        for index in range(labels.shape[0]):
            labels[index] = int(index * short_window / lda_step_ratio)
        clf = sklearn.discriminant_analysis.\
            LinearDiscriminantAnalysis(n_components=lda_dim)
        mid_feats_norm = clf.fit_transform(mt_feats_to_red.T, labels)
        # clf.fit(mt_feats_to_red.T, labels)
        # mid_feats_norm = (clf.transform(mid_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    cluster_labels = []
    sil_all = []
    cluster_centers = []

    for speakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=speakers)
        k_means.fit(mid_feats_norm)
        cls = k_means.labels_
        cluster_labels.append(cls)
        # cluster_centers.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(speakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / \
                float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mid_feats_norm[cls == c, :]
                # compute average distance between samples
                # that belong to the cluster (a values)
                dist = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(np.mean(dist) * clust_per_cent)
                sil_temp = []
                for c2 in range(speakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = np.nonzero(cls == c2)[0].shape[0] / \
                            float(len(cls))
                        mid_features_temp = mid_feats_norm[cls == c2, :]
                        dist = distance.cdist(mt_feats_norm_temp,
                                              mid_features_temp)
                        sil_temp.append(np.mean(dist) * (clust_per_cent +
                                                         clust_per_cent_2) / 2.0)
                sil_temp = np.array(sil_temp)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(sil_temp))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        sil = []
        for c in range(speakers):
            # for each cluster (speaker) compute silhouette
            sil.append((sil_2[c] - sil_1[c]) /
                       (max(sil_2[c], sil_1[c]) + 1e-5))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(np.mean(sil))

    imax = int(np.argmax(sil_all))
    # optimal number of clusters
    num_speakers = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    # print(cls)
    # cls = np.zeros((n_wins,))
    # for index in range(n_wins):
    #     j = np.argmin(np.abs(index - i_non_outliers))
    #     cls[index] = cluster_labels[imax][j]

    # Post-process method 1: hmm smoothing
    if lda_dim <= 0:
        for index in range(1):
            # hmm training
            start_prob, transmat, means, cov = \
                train_hmm_compute_statistics(mt_feats_norm_or.T, cls)
            hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
            hmm.startprob_ = start_prob
            hmm.transmat_ = transmat
            hmm.means_ = means
            hmm.covars_ = cov
            cls = hmm.predict(mt_feats_norm_or)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 5)

    class_names = ["speaker{0:d}".format(c) for c in range(num_speakers)]

    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if ground truth exists
    if os.path.isfile(gt_file):
        seg_start, seg_end, seg_labs = read_segmentation_gt(gt_file)
        flags_gt, class_names_gt = segments_to_labels(seg_start, seg_end,
                                                      seg_labs, mid_step)

    if plot_res:
        fig = plt.figure()
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(np.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(np.array(range(len(cls))) * mid_step + mid_step / 2.0, cls)

    purity_cluster_m, purity_speaker_m = -1, -1
    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(np.array(range(len(flags_gt))) *
                     mid_step + mid_step / 2.0, flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
            evaluate_speaker_diarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(
                          100 * purity_cluster_m, 100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's silhouette")
        plt.show()
    return cls, purity_cluster_m, purity_speaker_m
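if __name__ == '__main__':
    # Usage sketch for the refactored diarizer above; the WAV name is an
    # illustrative assumption. With n_speakers=0 the number of speakers is
    # estimated via the silhouette criterion; if a same-named .segments
    # ground-truth file exists, the purity metrics are computed, otherwise
    # they stay at -1.
    cls, purity_cluster, purity_speaker = \
        speaker_diarization("dialogue.wav", n_speakers=0, plot_res=False)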
def train_hmm_from_directory(folder_path, hmm_model_name, mid_window,
                             mid_step):
    """
    This function trains a HMM model for segmentation-classification using
    a directory where WAV files and .segments (ground-truth) files are
    stored
    ARGUMENTS:
        - folder_path:     the path of the data directory
        - hmm_model_name:  the name of the HMM model to be stored
        - mid_window:      mid-term window size
        - mid_step:        mid-term window step
    RETURNS:
        - hmm:            an object to the resulting HMM
        - class_names:    a list of class_names

    After training, hmm, class_names, along with the mid_window and
    mid_step values are stored in the hmm_model_name file
    """
    flags_all = np.array([])
    class_names_all = []
    for i, f in enumerate(glob.glob(folder_path + os.sep + '*.wav')):
        # for each WAV file
        wav_file = f
        gt_file = f.replace('.wav', '.segments')
        if os.path.isfile(gt_file):
            seg_start, seg_end, seg_labs = read_segmentation_gt(gt_file)
            flags, class_names = \
                segments_to_labels(seg_start, seg_end, seg_labs, mid_step)
            for c in class_names:
                # update class names:
                if c not in class_names_all:
                    class_names_all.append(c)
            sampling_rate, signal = audioBasicIO.read_audio_file(wav_file)
            feature_vector, _, _ = \
                mtf.mid_feature_extraction(signal, sampling_rate,
                                           mid_window * sampling_rate,
                                           mid_step * sampling_rate,
                                           round(sampling_rate * 0.050),
                                           round(sampling_rate * 0.050))

            flag_len = len(flags)
            feat_cols = feature_vector.shape[1]
            min_sm = min(feat_cols, flag_len)
            feature_vector = feature_vector[:, 0:min_sm]
            flags = flags[0:min_sm]

            flags_new = []
            # append features and labels
            # (map each per-file flag to the global class-name index)
            for j, fl in enumerate(flags):
                flags_new.append(class_names_all.index(class_names[flags[j]]))
            flags_all = np.append(flags_all, np.array(flags_new))
            if i == 0:
                f_all = feature_vector
            else:
                f_all = np.concatenate((f_all, feature_vector), axis=1)

    # compute HMM statistics
    class_priors, transmutation_matrix, means, cov = \
        train_hmm_compute_statistics(f_all, flags_all)
    # train the HMM
    hmm = hmmlearn.hmm.GaussianHMM(class_priors.shape[0], "diag")
    hmm.covars_ = cov
    hmm.means_ = means
    hmm.startprob_ = class_priors
    hmm.transmat_ = transmutation_matrix

    save_hmm(hmm_model_name, hmm, class_names_all, mid_window, mid_step)
    return hmm, class_names_all
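if __name__ == '__main__':
    # Usage sketch; assumes a folder of WAV files with matching .segments
    # ground-truth files (folder and model names are illustrative).
    hmm, class_names = train_hmm_from_directory("data/segments",
                                                "hmm_radio", 1.0, 1.0)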
def featureAndTrain(list_of_dirs, mt_win, mt_step, st_win, st_step,
                    classifier_type, model_name,
                    compute_beat=False, perTrain=0.90):
    """
    This function is used as a wrapper to segment-based audio feature
    extraction and classifier training.
    ARGUMENTS:
        list_of_dirs:     list of paths of directories. Each directory
                          contains a single audio class whose samples
                          are stored in separate WAV files.
        mt_win, mt_step:  mid-term window length and step
        st_win, st_step:  short-term window and step
        classifier_type:  "svm" or "knn" or "randomforest" or
                          "gradientboosting" or "extratrees"
        model_name:       name of the model to be saved
    RETURNS:
        None. Resulting classifier along with the respective model
        parameters are saved on files.
    """
    # STEP A: Feature Extraction:
    [features, classNames, _] = aF.multiple_directory_feature_extraction(
        list_of_dirs, mt_win, mt_step, st_win, st_step,
        compute_beat=compute_beat)

    if len(features) == 0:
        print("trainSVM_feature ERROR: No data found in any input folder!")
        return

    n_feats = features[0].shape[1]
    feature_names = ["features" + str(d + 1) for d in range(n_feats)]

    writeTrainDataToARFF(model_name, features, classNames, feature_names)

    for i, f in enumerate(features):
        if len(f) == 0:
            print("trainSVM_feature ERROR: " + list_of_dirs[i] +
                  " folder is empty or non-existing!")
            return

    # STEP B: classifier Evaluation and Parameter Selection:
    if classifier_type == "svm" or classifier_type == "svm_rbf":
        classifier_par = np.array([0.001, 0.01, 0.5, 1.0, 5.0, 10.0, 20.0])
    elif classifier_type == "randomforest":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])
    elif classifier_type == "knn":
        classifier_par = np.array([1, 3, 5, 7, 9, 11, 13, 15])
    elif classifier_type == "gradientboosting":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])
    elif classifier_type == "extratrees":
        classifier_par = np.array([10, 25, 50, 100, 200, 500])

    # get optimal classifier parameter:
    features2 = []
    for f in features:
        fTemp = []
        for i in range(f.shape[0]):
            temp = f[i, :]
            if (not np.isnan(temp).any()) and (not np.isinf(temp).any()):
                fTemp.append(temp.tolist())
            else:
                print("NaN Found! Feature vector not used for training")
        features2.append(np.array(fTemp))
    features = features2

    bestParam = evaluateclassifier(features, classNames, 100,
                                   classifier_type, classifier_par, 0,
                                   perTrain)

    print("Selected params: {0:.5f}".format(bestParam))

    C = len(classNames)
    [features_norm, MEAN, STD] = normalizeFeatures(features)
    MEAN = MEAN.tolist()
    STD = STD.tolist()
    featuresNew = features_norm

    # STEP C: Save the classifier to file
    if classifier_type == "svm":
        classifier = trainSVM(featuresNew, bestParam)
    elif classifier_type == "svm_rbf":
        classifier = trainSVM_RBF(featuresNew, bestParam)
    elif classifier_type == "randomforest":
        classifier = trainRandomForest(featuresNew, bestParam)
    elif classifier_type == "gradientboosting":
        classifier = trainGradientBoosting(featuresNew, bestParam)
    elif classifier_type == "extratrees":
        classifier = trainExtraTrees(featuresNew, bestParam)

    if classifier_type == "knn":
        [X, Y] = listOfFeatures2Matrix(featuresNew)
        X = X.tolist()
        Y = Y.tolist()
        fo = open(model_name, "wb")
        cPickle.dump(X, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(Y, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(MEAN, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(STD, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(bestParam, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mt_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mt_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(st_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(st_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(compute_beat, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        fo.close()
    elif classifier_type == "svm" or classifier_type == "svm_rbf" or \
            classifier_type == "randomforest" or \
            classifier_type == "gradientboosting" or \
            classifier_type == "extratrees":
        with open(model_name, 'wb') as fid:
            cPickle.dump(classifier, fid)
        fo = open(model_name + "MEANS", "wb")
        cPickle.dump(MEAN, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(STD, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mt_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(mt_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(st_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(st_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        cPickle.dump(compute_beat, fo, protocol=cPickle.HIGHEST_PROTOCOL)
        fo.close()
"""! @brief Example 12 @details pyAudioAnalysis feature extraction for classes organized in folders and feature histogram representation (per feature and class). Binary classification task: male vs female speech segments @author Theodoros Giannakopoulos {[email protected]} """ from pyAudioAnalysis import MidTermFeatures as aF import os.path import utilities as ut if __name__ == '__main__': dirs = ["../data/gender/male", "../data/gender/female"] class_names = [os.path.basename(d) for d in dirs] m_win, m_step, s_win, s_step = 1, 1, 0.1, 0.05 features = [] for d in dirs: # get feature matrix for each directory (class) f, files, fn = aF.directory_feature_extraction(d, m_win, m_step, s_win, s_step) features.append(f) ut.plot_feature_histograms(features, fn, class_names)
# extract short-term features using 50 msec non-overlapping windows
win, step = 0.050, 0.050
[f, fn] = aFs.feature_extraction(s, fs, int(fs * win), int(fs * step))
print(f'{f.shape[1]} frames, {f.shape[0]} short-term features')
print('Feature names:')
for i, nam in enumerate(fn):
    print(f'{i}:{nam}')

# plot short-term energy
# create time axis in seconds
time = np.arange(0, duration - step, win)
# get the feature whose name is 'energy'
energy = f[fn.index('energy'), :]
mylayout = go.Layout(yaxis=dict(title="frame energy value"),
                     xaxis=dict(title="time (sec)"))
plotly.offline.iplot(go.Figure(data=[go.Scatter(x=time, y=energy)],
                               layout=mylayout))

# get mid-term (segment) feature statistics
# and respective short-term features:
mt, st, mt_n = aFm.mid_feature_extraction(s, fs, 1 * fs, 1 * fs,
                                          0.05 * fs, 0.05 * fs)
print(f'signal duration {len(s)/fs} seconds')
print(f'{st.shape[1]} {st.shape[0]}-D short-term feature vectors extracted')
print(f'{mt.shape[1]} {mt.shape[0]}-D segment feature statistic vectors '
      f'extracted')
print('mid-term feature names')
for i, mi in enumerate(mt_n):
    print(f'{i}:{mi}')
def audio_based_feature_extraction(input_file, models_directory,
                                   raudio_features_discard=0,
                                   pyaudio_num_features="all", mode=0,
                                   pyaudio_params=None):
    """
    Export all features for a wav file (silence based + classifiers based)
    :param input_file: the audio file
    :param models_directory: the directory which contains all trained
        classifiers (models' files + MEANS files)
    :param raudio_features_discard: number of leading silence-based
        features to discard from the output
    :param pyaudio_num_features: "all", or the number of pyAudioAnalysis
        features to keep
    :param mode: 0 for silence + model-based features, 1 to additionally
        append pyAudioAnalysis features, 2 for pyAudioAnalysis features only
    :param pyaudio_params: dict with the mid/short window and step sizes
        used for pyAudioAnalysis feature extraction
    :return: features, feature_names, metadata
    """
    # A. silence features
    fs, dur = get_wav_properties(input_file)
    fs, x = aio.read_audio_file(input_file)
    print(input_file)
    print(len(x) / fs)
    # get the silence estimates using pyAudioAnalysis semi-supervised
    # approach for different windows and steps
    if dur < 6.2:
        seg_limits_short = [[0, dur]]
        seg_limits_long = [[0, dur]]
    else:
        seg_limits_short = aS.silence_removal(x, fs, 0.5, 0.25, 0.5)
        seg_limits_long = aS.silence_removal(x, fs, 1.0, 0.25, 0.5)
    # short windows
    silence_features_short, number_of_pauses_short, total_speech_short = \
        silence_features(seg_limits_short, dur)
    # long windows
    silence_features_long, number_of_pauses_long, total_speech_long = \
        silence_features(seg_limits_long, dur)

    features = []
    feature_names = []
    if mode < 2:
        # B. segment model-based features
        # Load classifier:
        dictionaries = []
        for filename in os.listdir(models_directory):
            model_path = os.path.join(models_directory, filename)
            dictionary = predict_audio_labels(input_file, model_path)[0]
            dictionaries.append(dictionary)

        # list of features and feature names
        feature_names = [
            "Average silence duration short (sec)",
            "Average silence duration long (sec)",
            "Silence segments per minute short (segments/min)",
            "Silence segments per minute long (segments/min)",
            "Std short", "Std long",
            "Speech ratio short (sec)", "Speech ratio long (sec)",
            "Word rate in speech short (words/sec)",
            "Word rate in speech long (words/sec)"
        ]
        for i in range(len(silence_features_short)):
            features.append(silence_features_short[i])
            features.append(silence_features_long[i])
        for dictionary in dictionaries:
            for label in dictionary:
                feature_string = label + "(%)"
                feature_value = dictionary[label]
                feature_names.append(feature_string)
                features.append(feature_value)
        if raudio_features_discard != 0:
            features = features[raudio_features_discard:]
            feature_names = feature_names[raudio_features_discard:]

    # C. pyaudio features
    if mode > 0:
        (segment_features_stats, segment_features,
         pyaudio_feature_names) = aF.mid_feature_extraction(
            x, fs, round(pyaudio_params['mid_window'] * fs),
            round(pyaudio_params['mid_step'] * fs),
            round(fs * pyaudio_params['short_window']),
            round(fs * pyaudio_params['short_step']))
        pyaudio_list = list(segment_features_stats.mean(axis=1))
        if pyaudio_num_features != "all":
            # pyaudio_num_features = int(pyaudio_num_features)
            pyaudio_list = pyaudio_list[:pyaudio_num_features - 1]
            pyaudio_feature_names = \
                pyaudio_feature_names[:pyaudio_num_features - 1]
        features = features + pyaudio_list
        feature_names = feature_names + pyaudio_feature_names

    metadata = {
        "Number of pauses short": number_of_pauses_short,
        "Number of pauses long": number_of_pauses_long,
        "Total speech duration short (sec)": total_speech_short,
        "Total speech duration long (sec)": total_speech_long
    }
    return features, feature_names, metadata
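if __name__ == '__main__':
    # Usage sketch; the pyaudio_params keys are the ones the function body
    # reads above, while the file name and model directory are illustrative
    # assumptions.
    params = {'mid_window': 1.0, 'mid_step': 1.0,
              'short_window': 0.05, 'short_step': 0.05}
    feats, names, meta = audio_based_feature_extraction(
        "speech.wav", "models/", mode=2, pyaudio_params=params)
    print(len(feats), meta)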
import numpy as np

# extract some audio
VIDEOFILE = "../data/raw/8/replay.mp4"
AUDIOFILE = "./extracted.wav"
FEATUREFILE = "./extracted.ft"

command = f"ffmpeg -i {VIDEOFILE} -vn {AUDIOFILE} -y"
subprocess.call(command, shell=True)

[Fs, x] = audioBasicIO.read_audio_file(AUDIOFILE)
x = audioBasicIO.stereo_to_mono(x)
midF, shortF, midFNames = MidTermFeatures.mid_feature_extraction(
    x, Fs, 0.1 * Fs, 0.05 * Fs, 0.05 * Fs, 0.025 * Fs)
np.save(FEATUREFILE, midF)
np.savetxt(FEATUREFILE + ".csv", midF.T, delimiter=",",
           header=",".join(midFNames))

# %%
# explore the audio
audioAnalysis.thumbnailWrapper(AUDIOFILE, 50)
audioAnalysis.fileSpectrogramWrapper(AUDIOFILE)
audioAnalysis.fileChromagramWrapper(AUDIOFILE)
audioAnalysis.beatExtractionWrapper(AUDIOFILE, True)

# %%
var = 48
print(f"{var} : {3/199:.5f}")
def exp5():
    print('pyAudioAnalysis example 5')
    dirs = [
        '{0}music/classical'.format(AfeExp.data_folder),
        '{0}music/metal'.format(AfeExp.data_folder)
    ]
    class_names = ['classical', 'metal']
    m_win, m_step, s_win, s_step = 1, 1, 0.1, 0.05
    features = []
    for d in dirs:
        # get feature matrix for each directory (class)
        f, files, fn = aMF.directory_feature_extraction(
            d, m_win, m_step, s_win, s_step)
        features.append(f)
    print(features[0].shape, features[1].shape)

    f1 = np.array([
        features[0][:, fn.index('spectral_centroid_mean')],
        features[0][:, fn.index('energy_entropy_mean')]
    ])
    f2 = np.array([
        features[1][:, fn.index('spectral_centroid_mean')],
        features[1][:, fn.index('energy_entropy_mean')]
    ])
    print('f1 type:{0}; shape:{1}; value:{2};'.format(
        type(f1), f1.shape, f1))
    print('f2 type:{0}; shape:{1}; value:{2};'.format(
        type(f2), f2.shape, f2))

    y = np.concatenate((np.zeros(f1.shape[1]), np.ones(f2.shape[1])))
    f = np.concatenate((f1.T, f2.T), axis=0)
    print('y: {0}; {1};'.format(y.shape, y))
    print('X: {0}; {1};'.format(f.shape, f))

    # train the svm classifier
    cl = sks.SVC(kernel='rbf', C=20)
    cl.fit(f, y)

    p1 = go.Scatter(x=f1[0, :], y=f1[1, :], name=class_names[0],
                    marker=dict(size=10,
                                color='rgba(255, 182, 193, .9)'),
                    mode='markers')
    p2 = go.Scatter(x=f2[0, :], y=f2[1, :], name=class_names[1],
                    marker=dict(size=10,
                                color='rgba(100, 100, 220, .9)'),
                    mode='markers')

    # apply the trained model on the points of a grid
    x_ = np.arange(f[:, 0].min(), f[:, 0].max(), 0.002)
    y_ = np.arange(f[:, 1].min(), f[:, 1].max(), 0.002)
    xx, yy = np.meshgrid(x_, y_)
    X_t = np.c_[xx.ravel(), yy.ravel()]
    print('X_t: {0};'.format(X_t.shape))
    Z = cl.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape) / 2

    # and visualize the grid on the same plot (decision surfaces)
    cs = go.Heatmap(x=x_, y=y_, z=Z, showscale=False,
                    colorscale=[[0, 'rgba(255, 182, 193, .3)'],
                                [1, 'rgba(100, 100, 220, .3)']])
    mylayout = go.Layout(xaxis=dict(title="spectral_centroid_mean"),
                         yaxis=dict(title="energy_entropy_mean"))
    # plotly.offline.iplot(go.Figure(data=[p1, p2, cs], layout=mylayout))
    plotly.offline.plot({
        'data': [p1, p2, cs],
        'layout': mylayout
    }, auto_open=True)
def fileGreenwaySpeakerDiarization(filename, output_folder,
                                   speech_key="52fe944f29784ae288482e5eb3092e2a",
                                   service_region="eastus2",
                                   n_speakers=2, mt_size=2.0, mt_step=0.2,
                                   st_win=0.05, lda_dim=35):
    """
    ARGUMENTS:
        - filename:       the name of the WAV file to be analyzed; the
                          filename should have a suffix of the form
                          ..._min_3, which informs the service that the
                          audio file corresponds to the 3rd minute of the
                          dialogue
        - output_folder:  the folder location for saving the audio snippets
                          generated from diarization
        - speech_key:     the Azure Speech service subscription key
        - service_region: the Azure service region
        - n_speakers:     the number of speakers (clusters) in the
                          recording (<=0 for unknown)
        - mt_size (opt):  mid-term window size
        - mt_step (opt):  mid-term window step
        - st_win (opt):   short-term window size
        - lda_dim (opt):  LDA dimension (0 for no LDA)

    OUTPUTS:
        - cls:    a vector with speaker ids in chronological sequence of
                  speaker dialogue.
        - output: a list of python dictionaries containing dialogue
                  sequence information:
                      - dialogue_id
                      - sequence_id
                      - start_time
                      - end_time
                      - text
    """
    # (module-level imports such as audioBasicIO, aT, aF, np, scipy,
    # sklearn, hmmlearn, speechsdk, AudioSegment, datetime, json and time
    # are assumed from the enclosing file)
    filename_only = filename if "/" not in filename else filename.split("/")[-1]
    nameoffile = filename_only.split("_min_")[0]
    timeoffile = filename_only.split("_min_")[1]

    [fs, x] = audioBasicIO.read_audio_file(filename)
    x = audioBasicIO.stereo_to_mono(x)
    duration = len(x) / fs

    [classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1,
     stStep1, computeBEAT1] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     "pyAudioAnalysis/data/models", "knn_speaker_10"))
    [classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2,
     stStep2, computeBEAT2] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     "pyAudioAnalysis/data/models", "knn_speaker_male_female"))

    [mt_feats, st_feats, _] = aF.mid_feature_extraction(x, fs,
                                                        mt_size * fs,
                                                        mt_step * fs,
                                                        round(fs * st_win),
                                                        round(fs * st_win * 0.5))

    MidTermFeatures2 = np.zeros((mt_feats.shape[0] + len(classNames1) +
                                 len(classNames2), mt_feats.shape[1]))
    for i in range(mt_feats.shape[1]):
        cur_f1 = (mt_feats[:, i] - MEAN1) / STD1
        cur_f2 = (mt_feats[:, i] - MEAN2) / STD2
        [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
        [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
        MidTermFeatures2[0:mt_feats.shape[0], i] = mt_feats[:, i]
        MidTermFeatures2[mt_feats.shape[0]:
                         mt_feats.shape[0] + len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[mt_feats.shape[0] + len(classNames1)::, i] = P2 + 0.0001

    mt_feats = MidTermFeatures2  # TODO
    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41,
                       42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]
    mt_feats = mt_feats[iFeaturesSelect, :]

    (mt_feats_norm, MEAN, STD) = aT.normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    n_wins = mt_feats.shape[1]

    # remove outliers:
    dist_all = np.sum(distance.squareform(distance.pdist(mt_feats_norm.T)),
                      axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.2 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    # EnergyMin = np.min(mt_feats[1, :])
    # EnergyMean = np.mean(mt_feats[1, :])
    # Thres = (1.5 * EnergyMin + 0.5 * EnergyMean) / 2.0
    # i_non_outliers = np.nonzero(mt_feats[1, :] > Thres)[0]

    perOutLier = (100.0 * (n_wins - i_non_outliers.shape[0])) / n_wins
    mt_feats_norm_or = mt_feats_norm
    mt_feats_norm = mt_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        # extract mid-term features with minimum step:
        mt_win_ratio = int(round(mt_size / st_win))
        mt_step_ratio = int(round(st_win / st_win))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        for i in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        # for each of the short-term features:
        for i in range(num_of_features):
            curPos = 0
            N = len(st_feats[i])
            while curPos < N:
                N1 = curPos
                N2 = curPos + mt_win_ratio
                if N2 > N:
                    N2 = N
                curStFeatures = st_feats[i][N1:N2]
                mt_feats_to_red[i].append(np.mean(curStFeatures))
                mt_feats_to_red[i + num_of_features].append(
                    np.std(curStFeatures))
                curPos += mt_step_ratio
        mt_feats_to_red = np.array(mt_feats_to_red)

        mt_feats_to_red_2 = np.zeros((mt_feats_to_red.shape[0] +
                                      len(classNames1) + len(classNames2),
                                      mt_feats_to_red.shape[1]))
        for i in range(mt_feats_to_red.shape[1]):
            cur_f1 = (mt_feats_to_red[:, i] - MEAN1) / STD1
            cur_f2 = (mt_feats_to_red[:, i] - MEAN2) / STD2
            [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
            [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0], i] = \
                mt_feats_to_red[:, i]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:
                              mt_feats_to_red.shape[0] + len(classNames1),
                              i] = P1 + 0.0001
            mt_feats_to_red_2[mt_feats_to_red.shape[0] + len(classNames1)::,
                              i] = P2 + 0.0001
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[iFeaturesSelect, :]
        # mt_feats_to_red += np.random.rand(mt_feats_to_red.shape[0],
        #                                   mt_feats_to_red.shape[1]) * 0.0000010
        (mt_feats_to_red, MEAN, STD) = aT.normalizeFeatures(
            [mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T

        Labels = np.zeros((mt_feats_to_red.shape[1],))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / st_win
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * st_win / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, Labels)
        mt_feats_norm = (clf.transform(mt_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    clsAll = []
    sil_all = []
    centersAll = []

    for iSpeakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(mt_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_
        clsAll.append(cls)
        centersAll.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(iSpeakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / \
                float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mt_feats_norm[:, cls == c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                Yt = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(np.mean(Yt) * clust_per_cent)
                silBs = []
                for c2 in range(iSpeakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = np.nonzero(cls == c2)[0].shape[0] / \
                            float(len(cls))
                        MidTermFeaturesNormTemp2 = mt_feats_norm[:, cls == c2]
                        Yt = distance.cdist(mt_feats_norm_temp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(np.mean(Yt) *
                                     (clust_per_cent + clust_per_cent_2) / 2.0)
                silBs = np.array(silBs)
                # ... and keep the minimum value (i.e. the distance
                # from the "nearest" cluster)
                sil_2.append(min(silBs))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        sil = []
        for c in range(iSpeakers):
            # for each cluster (speaker) compute silhouette
            sil.append((sil_2[c] - sil_1[c]) /
                       (max(sil_2[c], sil_1[c]) + 0.00001))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(np.mean(sil))

    imax = np.argmax(sil_all)
    # optimal number of clusters
    nSpeakersFinal = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = np.zeros((n_wins,))
    for i in range(n_wins):
        j = np.argmin(np.abs(i - i_non_outliers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):
        # hmm training
        start_prob, transmat, means, cov = \
            trainHMM_computeStatistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = sil_all[imax]
    class_names = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]

    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    if os.path.isfile(gt_file):
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start, seg_end, seg_labs,
                                              mt_step)

    # (the plotting/evaluation block is disabled in this version; it drew
    # the cluster sequence against the ground-truth flags, printed cluster
    # and speaker purity, and plotted the silhouette curve when the number
    # of speakers was unknown)

    # Create Time Vector
    time_vec = np.array(range(len(cls))) * mt_step + mt_step / 2.0

    # Find Change Points
    speaker_change_index = np.where(np.roll(cls, 1) != cls)[0]

    # Create List of dialogue convos
    output_list = []
    temp = {}
    for ind, sc in enumerate(speaker_change_index):
        temp['dialogue_id'] = str(datetime.now()).strip()
        temp['sequence_id'] = str(ind)
        temp['speaker'] = list(cls)[sc]
        temp['start_time'] = time_vec[sc]
        temp['end_time'] = time_vec[speaker_change_index[ind + 1] - 1] \
            if ind + 1 < len(speaker_change_index) else time_vec[-1]
        temp["text"] = ""
        output_list.append(temp)
        temp = {}

    def snip_transcribe(output_list, filename, output_folder=output_folder,
                        speech_key=speech_key, service_region=service_region):
        speech_config = speechsdk.SpeechConfig(subscription=speech_key,
                                               region=service_region)
        # enable dictation mode for continuous recognition
        speech_config.enable_dictation()

        def recognized_cb(evt):
            if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
                # append the recognized text to the current dialogue snippet
                output_list[ind]['text'] = output_list[ind]['text'] + \
                    str(evt.result.text)
                print(evt.result.text)

        for ind, diag in enumerate(output_list):
            t1 = diag['start_time']
            t2 = diag['end_time']
            newAudio = AudioSegment.from_wav(filename)
            chunk = newAudio[t1 * 1000:t2 * 1000]
            filename_out = output_folder + f"snippet_{diag['sequence_id']}.wav"
            # Exports to a wav file in the current path.
            chunk.export(filename_out, format="wav")
            done = False

            def stop_cb(evt):
                """callback that signals to stop continuous recognition
                upon receiving an event `evt`"""
                print('CLOSING on {}'.format(evt))
                nonlocal done
                done = True

            audio_input = speechsdk.AudioConfig(filename=filename_out)
            speech_recognizer = speechsdk.SpeechRecognizer(
                speech_config=speech_config, audio_config=audio_input)
            output_list[ind]['snippet_path'] = filename_out
            speech_recognizer.recognized.connect(recognized_cb)
            speech_recognizer.session_stopped.connect(stop_cb)
            speech_recognizer.canceled.connect(stop_cb)
            # Start continuous speech recognition
            speech_recognizer.start_continuous_recognition()
            while not done:
                time.sleep(.5)
            speech_recognizer.stop_continuous_recognition()

        return output_list

    output = snip_transcribe(output_list, filename,
                             output_folder=output_folder)
    output_json = {filename_only: output}

    with open(f"{output_folder}{nameoffile}_{timeoffile}.txt", "w") as outfile:
        json.dump(output_json, outfile)

    return cls, output_json
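A minimal, hypothetical invocation of the function above follows; the file name respects the `_min_<k>` suffix convention from the docstring, while the paths and the key are placeholders, not values from the source.

# Hypothetical usage (placeholder paths and key):
cls, dialogue = fileGreenwaySpeakerDiarization(
    filename="consult_min_3.wav",      # 3rd minute of the dialogue
    output_folder="./snippets/",
    speech_key="<your-azure-speech-key>",
    service_region="eastus2",
    n_speakers=2)
for turn in dialogue["consult_min_3.wav"]:
    print(turn["speaker"], turn["start_time"], turn["end_time"], turn["text"])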
'''
pyAudioAnalysis provides two functions for extracting a rich set of useful
features from WAV files (per file and per directory).
'''
from pyAudioAnalysis import MidTermFeatures as mF
import numpy as np
import pandas as pd
import os

basepath_train_cough = 'C:/Users/Guillem/Desktop/HACKATHON 2020/Unlabeled audio/TRAIN/Cough/'
basepath_train_nocough = 'C:/Users/Guillem/Desktop/HACKATHON 2020/Unlabeled audio/TRAIN/No_Cough/'

[mid_term_features_cough, wav_file_list_cough, mid_feature_names] = \
    mF.directory_feature_extraction(basepath_train_cough, 0.1, 0.1,
                                    0.01, 0.01, compute_beat=False)
[mid_term_features_nocough, wav_file_list_nocough, mid_feature_names] = \
    mF.directory_feature_extraction(basepath_train_nocough, 0.1, 0.1,
                                    0.01, 0.01, compute_beat=False)

label_nocough = np.zeros(np.shape(mid_term_features_nocough)[0])
label_cough = np.ones(np.shape(mid_term_features_cough)[0])
features = np.concatenate((mid_term_features_cough,
                           mid_term_features_nocough))
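The snippet stops right after concatenating the two feature matrices. Below is a minimal sketch of how it might continue into a cough / no-cough classifier; the label vector and the scikit-learn SVM are assumptions, not part of the original.

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# labels in the same row order as the concatenated feature matrix (assumed)
labels = np.concatenate((label_cough, label_nocough))

X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.25, random_state=42)
clf = SVC(kernel='rbf', C=10)
clf.fit(X_train, y_train)
print('held-out accuracy: {:.3f}'.format(clf.score(X_test, y_test)))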
import sys
import time

from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import MidTermFeatures
from pyAudioAnalysis import audioTrainTest as aT
from pyAudioAnalysis import audioSegmentation as aS

# number of repetitions per benchmark (value assumed; the original script
# defines nExp elsewhere)
nExp = 4


def main(argv):
    if argv[1] == "-shortTerm":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            # time.clock() was removed in Python 3.8; use perf_counter()
            t1 = time.perf_counter()
            F = MidTermFeatures.short_term_feature_extraction(
                x, Fs, 0.050 * Fs, 0.050 * Fs)
            t2 = time.perf_counter()
            perTime1 = duration / (t2 - t1)
            print("short-term feature extraction: "
                  "{0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-classifyFile":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.perf_counter()
            aT.fileClassification("diarizationExample.wav", "svmSM", "svm")
            t2 = time.perf_counter()
            perTime1 = duration / (t2 - t1)
            print("Mid-term feature extraction + classification \t "
                  "{0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-mtClassify":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.perf_counter()
            [flagsInd, classesAll, acc] = aS.mtFileClassification(
                "diarizationExample.wav", "svmSM", "svm", False, '')
            t2 = time.perf_counter()
            perTime1 = duration / (t2 - t1)
            print("Fix-sized classification - segmentation \t "
                  "{0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-hmmSegmentation":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.perf_counter()
            aS.hmmSegmentation('diarizationExample.wav', 'hmmRadioSM',
                               False, '')
            t2 = time.perf_counter()
            perTime1 = duration / (t2 - t1)
            print("HMM-based classification - segmentation \t "
                  "{0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-silenceRemoval":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.perf_counter()
            segments = aS.silenceRemoval(x, Fs, 0.050, 0.050,
                                         smoothWindow=1.0, Weight=0.3,
                                         plot=False)
            t2 = time.perf_counter()
            perTime1 = duration / (t2 - t1)
            print("Silence removal \t {0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-thumbnailing":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.read_audio_file("scottish.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.perf_counter()
            # find thumbnail endpoints
            [A1, A2, B1, B2, Smatrix] = aS.musicThumbnailing(x1, Fs1, 1.0,
                                                             1.0, 15.0)
            t2 = time.perf_counter()
            perTime1 = duration1 / (t2 - t1)
            print("Thumbnail \t {0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-diarization-noLDA":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.perf_counter()
            aS.speakerDiarization("diarizationExample.wav", 4, LDAdim=0,
                                  PLOT=False)
            t2 = time.perf_counter()
            perTime1 = duration1 / (t2 - t1)
            print("Diarization \t {0:.1f} x realtime".format(perTime1))
    elif argv[1] == "-diarization-LDA":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.perf_counter()
            aS.speakerDiarization("diarizationExample.wav", 4, PLOT=False)
            t2 = time.perf_counter()
            perTime1 = duration1 / (t2 - t1)
            print("Diarization \t {0:.1f} x realtime".format(perTime1))


if __name__ == '__main__':
    main(sys.argv)
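The benchmark is driven by a single command-line flag; for example (the script name is a placeholder):

# python benchmark.py -shortTerm
# python benchmark.py -silenceRemoval
# python benchmark.py -diarization-LDA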
def trainHMM_fromDir(dirPath, hmm_model_name, mt_win, mt_step):
    """
    This function trains a HMM model for segmentation-classification using
    a directory where WAV files and .segments (ground-truth) files are
    stored.
    ARGUMENTS:
     - dirPath:        the path of the data directory
     - hmm_model_name: the name of the HMM model to be stored
     - mt_win:         mid-term window size
     - mt_step:        mid-term window step
    RETURNS:
     - hmm:            an object to the resulting HMM
     - class_names:    a list of class_names

    After training, hmm, class_names, along with the mt_win and mt_step
    values are stored in the hmm_model_name file.
    """
    flags_all = np.array([])
    classes_all = []
    for i, f in enumerate(glob.glob(dirPath + os.sep + '*.wav')):
        # for each WAV file
        wav_file = f
        gt_file = f.replace('.wav', '.segments')
        if not os.path.isfile(gt_file):
            continue
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags, class_names = segs2flags(seg_start, seg_end, seg_labs, mt_step)
        for c in class_names:
            # update class names:
            if c not in classes_all:
                classes_all.append(c)
        [fs, x] = audioBasicIO.read_audio_file(wav_file)
        [F, _, _] = aF.mid_feature_extraction(x, fs, mt_win * fs,
                                              mt_step * fs,
                                              round(fs * 0.050),
                                              round(fs * 0.050))
        lenF = F.shape[1]
        lenL = len(flags)
        min_sm = min(lenF, lenL)
        F = F[:, 0:min_sm]
        flags = flags[0:min_sm]

        flagsNew = []
        for j, fl in enumerate(flags):
            # append features and labels
            flagsNew.append(classes_all.index(class_names[flags[j]]))

        flags_all = np.append(flags_all, np.array(flagsNew))
        if i == 0:
            f_all = F
        else:
            f_all = np.concatenate((f_all, F), axis=1)

    # compute HMM statistics
    start_prob, transmat, means, cov = \
        trainHMM_computeStatistics(f_all, flags_all)
    # train the HMM
    hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
    hmm.startprob_ = start_prob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov

    # save HMM model
    fo = open(hmm_model_name, "wb")
    cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(classes_all, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mt_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mt_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()

    return hmm, classes_all
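A brief usage sketch for the trainer above; the directory name is a placeholder, and the reload step simply mirrors the pickle dump order of the function (one pickle.load per dumped object).

# Hypothetical usage (layout assumed: one .segments file per WAV):
hmm, classes = trainHMM_fromDir("data/radio", "hmmRadioSM",
                                mt_win=1.0, mt_step=1.0)
print("trained HMM with classes:", classes)

# the stored model can be reloaded in the same order it was dumped:
import pickle
with open("hmmRadioSM", "rb") as fo:
    hmm = pickle.load(fo)
    classes = pickle.load(fo)
    mt_win = pickle.load(fo)
    mt_step = pickle.load(fo)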
def feature_extraction_train_regression(folder_name, mid_window, mid_step,
                                        short_window, short_step,
                                        model_type, model_name,
                                        compute_beat=False):
    """
    This function is used as a wrapper to segment-based audio feature
    extraction and classifier training.
    ARGUMENTS:
        folder_name:              path of directory containing the WAV
                                  files and regression CSVs
        mid_window, mid_step:     mid-term window length and step
        short_window, short_step: short-term window and step
        model_type:               "svm" or "knn" or "randomforest"
        model_name:               name of the model to be saved
    RETURNS:
        None. The resulting regression model, along with the respective
        model parameters, is saved on files.
    """
    # STEP A: Feature Extraction:
    features, _, filenames = \
        aF.multiple_directory_feature_extraction([folder_name], mid_window,
                                                 mid_step, short_window,
                                                 short_step,
                                                 compute_beat=compute_beat)
    features = features[0]
    filenames = [ntpath.basename(f) for f in filenames[0]]

    # Read CSVs:
    csv_files = glob.glob(folder_name + os.sep + "*.csv")
    regression_labels = []
    regression_names = []
    f_final = []
    for c in csv_files:
        cur_regression_labels = []
        f_temp = []
        # open the csv file that contains the current target value's
        # annotations
        with open(c, 'rt') as csvfile:
            csv_reader = csv.reader(csvfile, delimiter=',', quotechar='|')
            for row in csv_reader:
                if len(row) == 2:
                    # ... and if the current filename exists
                    # in the list of filenames
                    if row[0] in filenames:
                        index = filenames.index(row[0])
                        cur_regression_labels.append(float(row[1]))
                        f_temp.append(features[index, :])
                    else:
                        print("Warning: {} not found "
                              "in list of files.".format(row[0]))
                else:
                    print("Warning: Row with unknown format "
                          "in regression file")
        f_final.append(np.array(f_temp))
        # cur_regression_labels is the list of values
        # for the current regression problem
        regression_labels.append(np.array(cur_regression_labels))
        # regression task name
        regression_names.append(ntpath.basename(c).replace(".csv", ""))

    if len(features) == 0:
        print("ERROR: No data found in any input folder!")
        return
    # TODO: ARFF write?
    # STEP B: classifier evaluation and parameter selection:
    if model_type == "svm" or model_type == "svm_rbf":
        model_params = np.array([0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5,
                                 1.0, 5.0, 10.0])
    elif model_type == "randomforest":
        model_params = np.array([5, 10, 25, 50, 100])

    errors = []
    errors_base = []
    best_params = []

    for iRegression, r in enumerate(regression_names):
        # get optimal classifier parameter:
        print("Regression task " + r)
        bestParam, error, berror = evaluate_regression(
            f_final[iRegression], regression_labels[iRegression],
            100, model_type, model_params)
        errors.append(error)
        errors_base.append(berror)
        best_params.append(bestParam)
        print("Selected params: {0:.5f}".format(bestParam))

        features_norm, mean, std = normalize_features([f_final[iRegression]])

        # STEP C: Save the model to file
        if model_type == "svm":
            classifier, _ = train_svm_regression(
                features_norm[0], regression_labels[iRegression], bestParam)
        if model_type == "svm_rbf":
            classifier, _ = train_svm_regression(
                features_norm[0], regression_labels[iRegression],
                bestParam, kernel='rbf')
        if model_type == "randomforest":
            classifier, _ = train_random_forest_regression(
                features_norm[0], regression_labels[iRegression], bestParam)

        if model_type == "svm" or model_type == "svm_rbf" \
                or model_type == "randomforest":
            with open(model_name + "_" + r, 'wb') as fid:
                cPickle.dump(classifier, fid)
            save_path = model_name + "_" + r + "MEANS"
            save_parameters(save_path, mean, std, mid_window, mid_step,
                            short_window, short_step, compute_beat)

    return errors, errors_base, best_params
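For context, each regression CSV in the folder is expected to hold `filename,value` rows (this follows from the row parsing in STEP A, which reads row[0] as a filename and row[1] as a float target). A hypothetical call, with placeholder folder and task names:

# arousal.csv inside ./speech_emotion/ might look like:
#   a0001.wav,0.72
#   a0002.wav,-0.31
errors, base_errors, params = feature_extraction_train_regression(
    "./speech_emotion", 1.0, 1.0, 0.05, 0.05,
    model_type="svm_rbf", model_name="emotion_model")
# => one model file per CSV task, e.g. emotion_model_arousal
#    plus a matching "MEANS" parameters file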
def mtFileClassification(input_file, model_name, model_type,
                         plot_results=False, gt_file=""):
    """
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used,
    i.e. a pre-trained classifier.
    ARGUMENTS:
        - input_file:   path of the input WAV file
        - model_name:   name of the classification model
        - model_type:   svm or knn depending on the classifier type
        - plot_results: True if results are to be plotted using
                        matplotlib along with a set of statistics
        - gt_file:      path of a ground-truth .segments file (optional)
    RETURNS:
        - segs:    a sequence of segment's endpoints: segs[i] is the
                   endpoint of the i-th segment (in seconds)
        - classes: a sequence of class flags: class[i] is the class ID of
                   the i-th segment
    """
    if not os.path.isfile(model_name):
        print("mtFileClassificationError: input model_type not found!")
        return (-1, -1, -1, -1)

    # Load classifier:
    if model_type == "knn":
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win,
         st_step, compute_beat] = aT.load_model_knn(model_name)
    else:
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win,
         st_step, compute_beat] = aT.load_model(model_name)
    if compute_beat:
        print("Model " + model_name + " contains long-term music features "
              "(beat etc) and cannot be used in segmentation")
        return (-1, -1, -1, -1)

    [fs, x] = audioBasicIO.read_audio_file(input_file)  # load input file
    if fs == -1:  # could not read file
        return (-1, -1, -1, -1)
    x = audioBasicIO.stereo_to_mono(x)  # convert stereo (if) to mono

    # mid-term feature extraction:
    [mt_feats, _, _] = aF.mid_feature_extraction(x, fs, mt_win * fs,
                                                 mt_step * fs,
                                                 round(fs * st_win),
                                                 round(fs * st_step))
    flags = []
    Ps = []
    flags_ind = []
    # for each feature vector (i.e. for each fix-sized segment):
    for i in range(mt_feats.shape[1]):
        # normalize current feature vector
        cur_fv = (mt_feats[:, i] - MEAN) / STD
        # classify vector:
        [res, P] = aT.classifierWrapper(classifier, model_type, cur_fv)
        flags_ind.append(res)
        flags.append(class_names[int(res)])  # update class label matrix
        Ps.append(np.max(P))  # update probability matrix
    flags_ind = np.array(flags_ind)

    # 1-window smoothing
    for i in range(1, len(flags_ind) - 1):
        if flags_ind[i - 1] == flags_ind[i + 1]:
            flags_ind[i] = flags_ind[i + 1]
    # convert fix-sized flags to segments and classes
    (segs, classes) = flags2segs(flags, mt_step)
    segs[-1] = len(x) / float(fs)

    # Load ground-truth, if it exists:
    if os.path.isfile(gt_file):
        [seg_start_gt, seg_end_gt, seg_l_gt] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start_gt, seg_end_gt,
                                              seg_l_gt, mt_step)
        flags_ind_gt = []
        for j, fl in enumerate(flags_gt):
            # "align" labels with GT
            if class_names_gt[flags_gt[j]] in class_names:
                flags_ind_gt.append(
                    class_names.index(class_names_gt[flags_gt[j]]))
            else:
                flags_ind_gt.append(-1)
        flags_ind_gt = np.array(flags_ind_gt)
        cm = np.zeros((len(class_names_gt), len(class_names_gt)))
        for i in range(min(flags_ind.shape[0], flags_ind_gt.shape[0])):
            cm[int(flags_ind_gt[i]), int(flags_ind[i])] += 1
    else:
        cm = []
        flags_ind_gt = np.array([])

    acc = plotSegmentationResults(flags_ind, flags_ind_gt, class_names,
                                  mt_step, not plot_results)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc))
        return (flags_ind, class_names_gt, acc, cm)
    else:
        return (flags_ind, class_names, acc, cm)
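A short usage sketch for the segmenter above; the WAV, model and ground-truth file names are placeholders.

# Hypothetical usage (placeholder file/model names):
flags, names, acc, cm = mtFileClassification(
    "radio_show.wav", "svm_speech_music", "svm",
    plot_results=False, gt_file="radio_show.segments")
# flags: per-window class indices; cm: confusion matrix vs. ground truth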
# TODO:
#  - add number of features
#  - add types of features
#  - CSV headers (if possible)
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import MidTermFeatures

# mid-term features saved in a CSV for each individual song file
# signature: (folder_path, mid_window, mid_step, short_window, short_step,
#             store_short_features=False, store_csv=False, plot=False)
MidTermFeatures.mid_feature_extraction_file_dir(
    "D:/Capstone/Testing/New_directory", 1.0, 0.75, 0.050, 0.005,
    store_short_features=True, store_csv=True, plot=False)
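Once stored, the per-file CSVs can be read back for downstream processing. A minimal sketch, assuming the CSVs are written into the processed directory alongside the WAV files; since the TODO above notes that headers are not yet written, the files are read header-less.

import glob
import pandas as pd

# collect every stored feature CSV in the processed directory (assumed layout)
for csv_path in glob.glob("D:/Capstone/Testing/New_directory/*.csv"):
    df = pd.read_csv(csv_path, header=None)  # no header row yet (see TODO)
    print(csv_path, df.shape)  # rows = mid-term segments, cols = features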
import os

import numpy as np
import torch
from sklearn.metrics import f1_score
from pyAudioAnalysis import MidTermFeatures as mt
from pyAudioAnalysis import audioTrainTest as aT


def evaluate(test_result, test_labels):
    # (the function header was truncated in the source; this signature is
    # a plausible reconstruction)
    # pick the class with the highest network output per sample
    values, labels = torch.max(test_result, 1)
    y_pred = labels.data.numpy()
    # f1_score expects the ground truth first, then the predictions
    return f1_score(test_labels, y_pred)


if os.path.isfile("features.npy"):
    with open('features.npy', 'rb') as f:
        X = np.load(f)
        y = np.load(f)
else:
    features, class_names, file_names = \
        mt.multiple_directory_feature_extraction(
            ["audio/speech", "audio/noise"], 1, 1, 0.1, 0.1, False)
    X, y = aT.features_to_matrix(features)
    with open('features.npy', 'wb') as f:
        np.save(f, np.array(X))
        np.save(f, np.array(y))

dimensions = X.shape[1]

# Split to train/test
X_train = X[::2, :]
y_train = y[::2]
X_test = X[1::2, :]
y_test = y[1::2]

n_nodes = 256
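The snippet cuts off at `n_nodes = 256`. Below is a minimal sketch of how the network part might continue, assuming a single-hidden-layer PyTorch classifier; the architecture, epoch count and learning rate are assumptions, not taken from the source.

# Hypothetical continuation: a one-hidden-layer classifier on the features.
n_classes = len(np.unique(y))
model = torch.nn.Sequential(
    torch.nn.Linear(dimensions, n_nodes),
    torch.nn.ReLU(),
    torch.nn.Linear(n_nodes, n_classes))
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = torch.nn.CrossEntropyLoss()

X_train_t = torch.tensor(X_train, dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.long)
for epoch in range(100):
    optimizer.zero_grad()
    loss = loss_fn(model(X_train_t), y_train_t)
    loss.backward()
    optimizer.step()

with torch.no_grad():
    test_result = model(torch.tensor(X_test, dtype=torch.float32))
print('F1 on held-out half:', evaluate(test_result, y_test))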