"""! @brief Example 29 @details: Music segmentation example @author Theodoros Giannakopoulos {[email protected]} """ import os, readchar, sklearn.cluster from pyAudioAnalysis.MidTermFeatures import mid_feature_extraction as mT from pyAudioAnalysis.audioBasicIO import read_audio_file, stereo_to_mono from pyAudioAnalysis.audioSegmentation import labels_to_segments from pyAudioAnalysis.audioTrainTest import normalize_features if __name__ == '__main__': # read signal and get normalized segment features: input_file = "../data/song1.mp3" fs, x = read_audio_file(input_file) x = stereo_to_mono(x) mt_size, mt_step, st_win = 5, 0.5, 0.05 [mt_feats, st_feats, _] = mT(x, fs, mt_size * fs, mt_step * fs, round(fs * st_win), round(fs * st_win * 0.5)) (mt_feats_norm, MEAN, STD) = normalize_features([mt_feats.T]) mt_feats_norm = mt_feats_norm[0].T # perform clustering (k = 4) n_clusters = 4 k_means = sklearn.cluster.KMeans(n_clusters=n_clusters) k_means.fit(mt_feats_norm.T) cls = k_means.labels_ segs, c = labels_to_segments(cls, mt_step) # convert flags to segment limits for sp in range(n_clusters): # play each cluster's segment for i in range(len(c)): if c[i] == sp and segs[i, 1] - segs[i, 0] > 5:
def silenceRemoval(x, fs, st_win, st_step, smoothWindow=0.5, weight=0.5,
                   plot=False):
    """
    Event Detection (silence removal)
    ARGUMENTS:
         - x:                the input audio signal
         - fs:               sampling freq
         - st_win, st_step:  window size and step in seconds
         - smoothWindow:     (optional) smooth window (in seconds)
         - weight:           (optional) weight factor (0 < weight < 1)
                             the higher, the more strict
         - plot:             (optional) True if results are to be plotted
    RETURNS:
         - seg_limits:       list of segment limits in seconds (e.g
                             [[0.1, 0.9], [1.4, 3.0]] means that the resulting
                             segments are (0.1 - 0.9) seconds
                             and (1.4, 3.0) seconds
    """
    if weight >= 1:
        weight = 0.99
    if weight <= 0:
        weight = 0.01

    # Step 1: feature extraction
    x = audioBasicIO.stereo_to_mono(x)
    st_feats, _ = sF.feature_extraction(x, fs, st_win * fs, st_step * fs)

    # Step 2: train binary svm classifier of low vs high energy frames
    # keep only the energy short-term sequence (2nd feature)
    st_energy = st_feats[1, :]
    en = np.sort(st_energy)
    # number of 10% of the total short-term windows
    l1 = int(len(en) / 10)
    # compute "lower" 10% energy threshold
    t1 = np.mean(en[0:l1]) + 0.000000000000001
    # compute "higher" 10% energy threshold
    t2 = np.mean(en[-l1:-1]) + 0.000000000000001
    # get all features that correspond to low energy
    class1 = st_feats[:, np.where(st_energy <= t1)[0]]
    # get all features that correspond to high energy
    class2 = st_feats[:, np.where(st_energy >= t2)[0]]
    # form the binary classification task and ...
    faets_s = [class1.T, class2.T]
    # normalize and train the respective svm probabilistic model
    # (ONSET vs SILENCE)
    [faets_s_norm, means_s, stds_s] = aT.normalizeFeatures(faets_s)
    svm = aT.trainSVM(faets_s_norm, 1.0)

    # Step 3: compute onset probability based on the trained svm
    prob_on_set = []
    for i in range(st_feats.shape[1]):
        # for each frame
        cur_fv = (st_feats[:, i] - means_s) / stds_s
        # get svm probability (that it belongs to the ONSET class)
        prob_on_set.append(svm.predict_proba(cur_fv.reshape(1, -1))[0][1])
    prob_on_set = np.array(prob_on_set)
    # smooth probability:
    prob_on_set = smoothMovingAvg(prob_on_set, smoothWindow / st_step)

    # Step 4A: detect onset frame indices:
    prog_on_set_sort = np.sort(prob_on_set)
    # find probability Threshold as a weighted average
    # of top 10% and lower 10% of the values
    Nt = int(prog_on_set_sort.shape[0] / 10)
    T = (np.mean((1 - weight) * prog_on_set_sort[0:Nt]) +
         weight * np.mean(prog_on_set_sort[-Nt::]))

    max_idx = np.where(prob_on_set > T)[0]
    # get the indices of the frames that satisfy the thresholding
    i = 0
    time_clusters = []
    seg_limits = []

    # Step 4B: group frame indices to onset segments
    while i < len(max_idx):
        # for each of the detected onset indices
        cur_cluster = [max_idx[i]]
        if i == len(max_idx) - 1:
            break
        while max_idx[i + 1] - cur_cluster[-1] <= 2:
            cur_cluster.append(max_idx[i + 1])
            i += 1
            if i == len(max_idx) - 1:
                break
        i += 1
        time_clusters.append(cur_cluster)
        seg_limits.append([cur_cluster[0] * st_step,
                           cur_cluster[-1] * st_step])

    # Step 5: Post process: remove very small segments:
    min_dur = 0.2
    seg_limits_2 = []
    for s in seg_limits:
        if s[1] - s[0] > min_dur:
            seg_limits_2.append(s)
    seg_limits = seg_limits_2

    if plot:
        timeX = np.arange(0, x.shape[0] / float(fs), 1.0 / fs)
        plt.subplot(2, 1, 1)
        plt.plot(timeX, x)
        for s in seg_limits:
            plt.axvline(x=s[0], color='red')
            plt.axvline(x=s[1], color='red')
        plt.subplot(2, 1, 2)
        plt.plot(np.arange(0, prob_on_set.shape[0] * st_step, st_step),
                 prob_on_set)
        plt.title('Signal')
        for s in seg_limits:
            plt.axvline(x=s[0], color='red')
            plt.axvline(x=s[1], color='red')
        plt.title('svm Probability')
        plt.show()

    return seg_limits
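# Usage sketch (illustration, not part of the library source): apply
# silenceRemoval above to a hypothetical "speech_sample.wav"; the weight of
# 0.3 is an arbitrary, fairly permissive choice.
if __name__ == '__main__':
    from pyAudioAnalysis import audioBasicIO
    fs_demo, x_demo = audioBasicIO.read_audio_file("speech_sample.wav")
    voiced = silenceRemoval(x_demo, fs_demo, st_win=0.05, st_step=0.05,
                            smoothWindow=0.5, weight=0.3, plot=False)
    for seg in voiced:
        print("voiced segment: {0:.2f}s - {1:.2f}s".format(seg[0], seg[1]))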
def speakerDiarization(filename, n_speakers, mt_size=2.0, mt_step=0.2,
                       st_win=0.05, lda_dim=35, plot_res=False):
    """
    ARGUMENTS:
        - filename:        the name of the WAV file to be analyzed
        - n_speakers       the number of speakers (clusters) in
                           the recording (<=0 for unknown)
        - mt_size (opt)    mid-term window size
        - mt_step (opt)    mid-term window step
        - st_win  (opt)    short-term window size
        - lda_dim (opt)    LDA dimension (0 for no LDA)
        - plot_res (opt)   0 for not plotting the results 1 for plotting
    """
    [fs, x] = audioBasicIO.read_audio_file(filename)
    x = audioBasicIO.stereo_to_mono(x)
    duration = len(x) / fs

    [classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1,
     computeBEAT1] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     "data/models", "knn_speaker_10"))
    [classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2,
     computeBEAT2] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     "data/models", "knn_speaker_male_female"))

    [mt_feats, st_feats, _] = aF.mid_feature_extraction(x, fs,
                                                        mt_size * fs,
                                                        mt_step * fs,
                                                        round(fs * st_win),
                                                        round(fs * st_win * 0.5))

    MidTermFeatures2 = np.zeros((mt_feats.shape[0] + len(classNames1) +
                                 len(classNames2), mt_feats.shape[1]))

    for i in range(mt_feats.shape[1]):
        cur_f1 = (mt_feats[:, i] - MEAN1) / STD1
        cur_f2 = (mt_feats[:, i] - MEAN2) / STD2
        [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
        [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
        MidTermFeatures2[0:mt_feats.shape[0], i] = mt_feats[:, i]
        MidTermFeatures2[mt_feats.shape[0]:
                         mt_feats.shape[0] + len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[mt_feats.shape[0] + len(classNames1)::, i] = \
            P2 + 0.0001

    mt_feats = MidTermFeatures2    # TODO
    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41,
                       42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]
    mt_feats = mt_feats[iFeaturesSelect, :]

    (mt_feats_norm, MEAN, STD) = aT.normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    n_wins = mt_feats.shape[1]

    # remove outliers:
    dist_all = np.sum(distance.squareform(distance.pdist(mt_feats_norm.T)),
                      axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.2 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = np.min(mt_feats[1,:])
    #EnergyMean = np.mean(mt_feats[1,:])
    #Thres = (1.5 * EnergyMin + 0.5 * EnergyMean) / 2.0
    #i_non_outliers = np.nonzero(mt_feats[1,:] > Thres)[0]
    #print i_non_outliers

    perOutLier = (100.0 * (n_wins - i_non_outliers.shape[0])) / n_wins
    mt_feats_norm_or = mt_feats_norm
    mt_feats_norm = mt_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        #[mt_feats_to_red, _, _] = aF.mtFeatureExtraction(x, fs, mt_size * fs,
        #    st_win * fs, round(fs * st_win), round(fs * st_win))
        # extract mid-term features with minimum step:
        mt_win_ratio = int(round(mt_size / st_win))
        mt_step_ratio = int(round(st_win / st_win))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        #for i in range(num_of_stats * num_of_features + 1):
        for i in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        # for each of the short-term features:
        for i in range(num_of_features):
            curPos = 0
            N = len(st_feats[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mt_win_ratio
                if N2 > N:
                    N2 = N
                curStFeatures = st_feats[i][N1:N2]
                mt_feats_to_red[i].append(np.mean(curStFeatures))
                mt_feats_to_red[i + num_of_features].\
                    append(np.std(curStFeatures))
                curPos += mt_step_ratio
        mt_feats_to_red = np.array(mt_feats_to_red)
        mt_feats_to_red_2 = np.zeros((mt_feats_to_red.shape[0] +
                                      len(classNames1) + len(classNames2),
                                      mt_feats_to_red.shape[1]))
        for i in range(mt_feats_to_red.shape[1]):
            cur_f1 = (mt_feats_to_red[:, i] - MEAN1) / STD1
            cur_f2 = (mt_feats_to_red[:, i] - MEAN2) / STD2
            [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
            [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0], i] = \
                mt_feats_to_red[:, i]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:
                              mt_feats_to_red.shape[0] + len(classNames1),
                              i] = P1 + 0.0001
            mt_feats_to_red_2[mt_feats_to_red.shape[0] + len(classNames1)::,
                              i] = P2 + 0.0001
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[iFeaturesSelect, :]
        #mt_feats_to_red += np.random.rand(mt_feats_to_red.shape[0],
        #                                  mt_feats_to_red.shape[1]) * 0.0000010
        (mt_feats_to_red, MEAN, STD) = \
            aT.normalizeFeatures([mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        #dist_all = np.sum(distance.squareform(
        #    distance.pdist(mt_feats_to_red.T)), axis=0)
        #m_dist_all = np.mean(dist_all)
        #iNonOutLiers2 = np.nonzero(dist_all < 3.0 * m_dist_all)[0]
        #mt_feats_to_red = mt_feats_to_red[:, iNonOutLiers2]
        Labels = np.zeros((mt_feats_to_red.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / st_win
        #print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * st_win / LDAstepRatio)
        clf = sklearn.discriminant_analysis.\
            LinearDiscriminantAnalysis(n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, Labels)
        mt_feats_norm = (clf.transform(mt_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    clsAll = []
    sil_all = []
    centersAll = []

    for iSpeakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(mt_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        # Y = distance.squareform(distance.pdist(mt_feats_norm.T))
        clsAll.append(cls)
        centersAll.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(iSpeakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / \
                             float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mt_feats_norm[:, cls == c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                Yt = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(np.mean(Yt) * clust_per_cent)
                silBs = []
                for c2 in range(iSpeakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = np.nonzero(cls == c2)[0].shape[0] /\
                                           float(len(cls))
                        MidTermFeaturesNormTemp2 = mt_feats_norm[:, cls == c2]
                        Yt = distance.cdist(mt_feats_norm_temp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(np.mean(Yt) * (clust_per_cent +
                                                    clust_per_cent_2) / 2.0)
                silBs = np.array(silBs)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(silBs))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        sil = []
        for c in range(iSpeakers):
            # for each cluster (speaker) compute silhouette
            sil.append((sil_2[c] - sil_1[c]) /
                       (max(sil_2[c], sil_1[c]) + 0.00001))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(np.mean(sil))

    imax = np.argmax(sil_all)
    # optimal number of clusters
    nSpeakersFinal = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = np.zeros((n_wins,))
    for i in range(n_wins):
        j = np.argmin(np.abs(i - i_non_outliers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):
        # hmm training
        start_prob, transmat, means, cov = \
            trainHMM_computeStatistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = sil_all[imax]
    class_names = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]

    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if ground-truth exists
    if os.path.isfile(gt_file):
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start, seg_end, seg_labs,
                                              mt_step)

    if plot_res:
        fig = plt.figure()
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(np.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(np.array(range(len(cls))) * mt_step + mt_step / 2.0, cls)

    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(np.array(range(len(flags_gt))) *
                     mt_step + mt_step / 2.0, flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
            evaluateSpeakerDiarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(
                          100 * purity_cluster_m, 100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        #print s_range, sil_all
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's silhouette")
        plt.show()
    return cls
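# Usage sketch (illustration; "dialogue.wav" is a hypothetical recording and
# the bundled knn speaker models must be present): returns one speaker label
# per mid-term window.
if __name__ == '__main__':
    cls_demo = speakerDiarization("dialogue.wav", n_speakers=2,
                                  plot_res=False)
    print(cls_demo)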
def musicThumbnailing(x, fs, short_term_size=1.0, short_term_step=0.5,
                      thumb_size=10.0, limit_1=0, limit_2=1):
    """
    This function detects instances of the most representative part of a
    music recording, also called "music thumbnails".
    A technique similar to the one proposed in [1], however a wider set of
    audio features is used instead of chroma features.
    In particular the following steps are followed:
     - Extract short-term audio features. Typical short-term window size:
       1 second
     - Compute the self-similarity matrix, i.e. all pairwise similarities
       between feature vectors
     - Apply a diagonal mask as a moving average filter on the values of the
       self-similarity matrix. The size of the mask is equal to the desirable
       thumbnail length.
     - Find the position of the maximum value of the new (filtered)
       self-similarity matrix. The audio segments that correspond to the
       diagonal around that position are the selected thumbnails

    ARGUMENTS:
     - x:                input signal
     - fs:               sampling frequency
     - short_term_size:  window size (in seconds)
     - short_term_step:  window step (in seconds)
     - thumb_size:       desired thumbnail size (in seconds)

    RETURNS:
     - A1:               beginning of 1st thumbnail (in seconds)
     - A2:               ending of 1st thumbnail (in seconds)
     - B1:               beginning of 2nd thumbnail (in seconds)
     - B2:               ending of 2nd thumbnail (in seconds)

    USAGE EXAMPLE:
        import audioFeatureExtraction as aF
        [fs, x] = basicIO.readAudioFile(input_file)
        [A1, A2, B1, B2] = musicThumbnailing(x, fs)

    [1] Bartsch, M. A., & Wakefield, G. H. (2005). Audio thumbnailing
    of popular music using chroma-based representations.
    Multimedia, IEEE Transactions on, 7(1), 96-104.
    """
    x = audioBasicIO.stereo_to_mono(x)
    # feature extraction:
    st_feats, _ = sF.feature_extraction(x, fs, fs * short_term_size,
                                        fs * short_term_step)

    # self-similarity matrix
    S = selfSimilarityMatrix(st_feats)

    # moving filter:
    M = int(round(thumb_size / short_term_step))
    B = np.eye(M, M)
    S = scipy.signal.convolve2d(S, B, 'valid')

    # post-processing (remove main diagonal elements)
    min_sm = np.min(S)
    for i in range(S.shape[0]):
        for j in range(S.shape[1]):
            if abs(i - j) < 5.0 / short_term_step or i > j:
                S[i, j] = min_sm

    # find max position:
    S[0:int(limit_1 * S.shape[0]), :] = min_sm
    S[:, 0:int(limit_1 * S.shape[0])] = min_sm
    S[int(limit_2 * S.shape[0])::, :] = min_sm
    S[:, int(limit_2 * S.shape[0])::] = min_sm
    maxVal = np.max(S)
    [I, J] = np.unravel_index(S.argmax(), S.shape)
    #plt.imshow(S)
    #plt.show()
    # expand:
    i1 = I
    i2 = I
    j1 = J
    j2 = J
    while i2 - i1 < M:
        if i1 <= 0 or j1 <= 0 or i2 >= S.shape[0] - 2 or j2 >= S.shape[1] - 2:
            break
        if S[i1 - 1, j1 - 1] > S[i2 + 1, j2 + 1]:
            i1 -= 1
            j1 -= 1
        else:
            i2 += 1
            j2 += 1

    return short_term_step * i1, short_term_step * i2, \
        short_term_step * j1, short_term_step * j2, S
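# Minimal sketch of the diagonal moving-average step used above: convolving
# with np.eye(M) sums M consecutive diagonal entries, so the argmax of the
# filtered matrix marks the strongest self-similar "stripe". The 4x4 matrix
# below is toy data, not real audio features.
if __name__ == '__main__':
    toy_sim = np.arange(16.0).reshape(4, 4)
    toy_filtered = scipy.signal.convolve2d(toy_sim, np.eye(2), 'valid')
    # toy_filtered[i, j] == toy_sim[i, j] + toy_sim[i + 1, j + 1]
    print(toy_filtered)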
def mtFileClassification(input_file, model_name, model_type,
                         plot_results=False, gt_file=""):
    """
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used,
    i.e. a pre-trained classifier.
    ARGUMENTS:
        - input_file:    path of the input WAV file
        - model_name:    name of the classification model
        - model_type:    svm or knn depending on the classifier type
        - plot_results:  True if results are to be plotted using
                         matplotlib along with a set of statistics
    RETURNS:
        - segs:          a sequence of segment's endpoints: segs[i] is the
                         endpoint of the i-th segment (in seconds)
        - classes:       a sequence of class flags: class[i] is the class ID
                         of the i-th segment
    """
    if not os.path.isfile(model_name):
        print("mtFileClassificationError: input model_type not found!")
        return (-1, -1, -1, -1)
    # Load classifier:
    if model_type == "knn":
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win, st_step,
         compute_beat] = aT.load_model_knn(model_name)
    else:
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win, st_step,
         compute_beat] = aT.load_model(model_name)
    if compute_beat:
        print("Model " + model_name + " contains long-term music features "
              "(beat etc) and cannot be used in segmentation")
        return (-1, -1, -1, -1)
    [fs, x] = audioBasicIO.read_audio_file(input_file)  # load input file
    if fs == -1:  # could not read file
        return (-1, -1, -1, -1)
    x = audioBasicIO.stereo_to_mono(x)  # convert stereo (if) to mono
    # mid-term feature extraction:
    [mt_feats, _, _] = aF.mid_feature_extraction(x, fs, mt_win * fs,
                                                 mt_step * fs,
                                                 round(fs * st_win),
                                                 round(fs * st_step))
    flags = []
    Ps = []
    flags_ind = []
    # for each feature vector (i.e. for each fix-sized segment):
    for i in range(mt_feats.shape[1]):
        # normalize current feature vector
        cur_fv = (mt_feats[:, i] - MEAN) / STD
        # classify vector:
        [res, P] = aT.classifierWrapper(classifier, model_type, cur_fv)
        flags_ind.append(res)
        flags.append(class_names[int(res)])  # update class label matrix
        Ps.append(np.max(P))                 # update probability matrix
    flags_ind = np.array(flags_ind)

    # 1-window smoothing
    for i in range(1, len(flags_ind) - 1):
        if flags_ind[i - 1] == flags_ind[i + 1]:
            flags_ind[i] = flags_ind[i + 1]
    # convert fix-sized flags to segments and classes
    (segs, classes) = flags2segs(flags, mt_step)
    segs[-1] = len(x) / float(fs)

    # Load ground-truth:
    if os.path.isfile(gt_file):
        [seg_start_gt, seg_end_gt, seg_l_gt] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start_gt, seg_end_gt,
                                              seg_l_gt, mt_step)
        flags_ind_gt = []
        # "align" labels with GT
        for j, fl in enumerate(flags_gt):
            if class_names_gt[flags_gt[j]] in class_names:
                flags_ind_gt.append(
                    class_names.index(class_names_gt[flags_gt[j]]))
            else:
                flags_ind_gt.append(-1)
        flags_ind_gt = np.array(flags_ind_gt)
        cm = np.zeros((len(class_names_gt), len(class_names_gt)))
        for i in range(min(flags_ind.shape[0], flags_ind_gt.shape[0])):
            cm[int(flags_ind_gt[i]), int(flags_ind[i])] += 1
    else:
        cm = []
        flags_ind_gt = np.array([])
    acc = plotSegmentationResults(flags_ind, flags_ind_gt,
                                  class_names, mt_step, not plot_results)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc))
        return (flags_ind, class_names_gt, acc, cm)
    else:
        return (flags_ind, class_names, acc, cm)
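# Usage sketch: "radio.wav" and the "svm_sm" speech/music model path are
# assumptions; any segment classifier trained with pyAudioAnalysis can be
# passed by path in the same way.
if __name__ == '__main__':
    flags_demo, classes_demo, acc_demo, cm_demo = mtFileClassification(
        "radio.wav", "svm_sm", "svm", plot_results=False, gt_file="")
    print(classes_demo)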
def audio_features_extraction(dir_name="../data", mt_win=1.0, mt_step=1.0,
                              st_win=0.050, st_step=0.050,
                              features_audio_file='Audio2Features.pkl'):
    audio_dir = dir_name + '/' + 'audio'
    # first, extract audio from video
    v2a.video2audio(dir_name)
    features = []
    file_names = []
    mid_term_features = np.array([])
    process_times = []
    # type is WAVE file, converted using the function video_to_audio.py
    suffix = ".wav"
    index_df = pd.read_csv(dir_name + '/' + 'index.csv', sep=';')
    wav_file_list, mid_feature_names = [], []
    # iterate over each audio file
    print('Extracting features from audio files...')
    bar = progressbar.ProgressBar(maxval=len(index_df),
                                  widgets=[progressbar.Bar('=', '[', ']'),
                                           ' ', progressbar.Percentage()])
    bar.start()
    bar_index = 0
    for ind in index_df.index:
        name = index_df['FILE'][ind]
        seg = str(index_df['SEG'][ind])
        file_path = audio_dir + '/' + name + '/' + seg + suffix
        # print("Analyzing file {0:d} of {1:d}: {2:s}".format(
        #     ind + 1, len(index_df), file_path))
        if os.stat(file_path).st_size == 0:
            logging.warning("WARNING: EMPTY FILE -- SKIPPING")
            continue
        [sampling_rate, signal] = audioBasicIO.read_audio_file(file_path)
        if sampling_rate == 0:
            logging.warning("WARNING: NO SAMPLING RATE -- SKIPPING")
            continue
        t1 = time.time()
        signal = audioBasicIO.stereo_to_mono(signal)
        if signal.shape[0] < float(sampling_rate) / 5:
            logging.warning("WARNING: AUDIO FILE TOO SMALL -- SKIPPING")
            continue
        wav_file_list.append(file_path)
        mid_features, _, mid_feature_names = \
            aF.mid_feature_extraction(signal, sampling_rate,
                                      round(mt_win * sampling_rate),
                                      round(mt_step * sampling_rate),
                                      round(st_win * sampling_rate),
                                      round(st_step * sampling_rate))
        mid_features = np.transpose(mid_features)
        # long term averaging of mid-term statistics
        mid_features = mid_features.mean(axis=0)
        if (not np.isnan(mid_features).any()) and \
                (not np.isinf(mid_features).any()):
            if len(mid_term_features) == 0:
                # append feature vector
                mid_term_features = mid_features
            else:
                mid_term_features = np.vstack((mid_term_features,
                                               mid_features))
        t2 = time.time()
        duration = float(len(signal)) / sampling_rate
        process_times.append((t2 - t1) / duration)
        # update progress bar index
        bar_index += 1
        bar.update(bar_index)
    bar.finish()
    if len(process_times) > 0:
        print("Audio feature extraction completed. Complexity ratio: "
              "{0:.1f} x realtime".format(
                  (1.0 / np.mean(np.array(process_times)))))
    print('Shape: ' + str(mid_term_features.shape))
    ftr_df = pd.DataFrame(data=mid_term_features)
    df = index_df.copy()
    df = pd.concat([df, ftr_df], axis=1)
    df.to_pickle(dir_name + '/' + features_audio_file)
    return mid_term_features, wav_file_list, mid_feature_names
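# Usage sketch: the layout under dir_name is an assumption -- it must contain
# index.csv (';'-separated, with FILE and SEG columns) plus an audio/ folder,
# exactly as the function above expects, and the v2a helper module must be
# importable.
if __name__ == '__main__':
    feats_demo, wavs_demo, names_demo = audio_features_extraction("../data")
    print(feats_demo.shape, len(wavs_demo))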
def music_thumbnailing(signal, sampling_rate, short_window=1.0,
                       short_step=0.5, thumb_size=10.0, limit_1=0, limit_2=1):
    """
    This function detects instances of the most representative part of a
    music recording, also called "music thumbnails".
    A technique similar to the one proposed in [1], however a wider set of
    audio features is used instead of chroma features.
    In particular the following steps are followed:
     - Extract short-term audio features. Typical short-term window size:
       1 second
     - Compute the self-similarity matrix, i.e. all pairwise similarities
       between feature vectors
     - Apply a diagonal mask as a moving average filter on the values of the
       self-similarity matrix. The size of the mask is equal to the desirable
       thumbnail length.
     - Find the position of the maximum value of the new (filtered)
       self-similarity matrix. The audio segments that correspond to the
       diagonal around that position are the selected thumbnails

    ARGUMENTS:
     - signal:           input signal
     - sampling_rate:    sampling frequency
     - short_window:     window size (in seconds)
     - short_step:       window step (in seconds)
     - thumb_size:       desired thumbnail size (in seconds)

    RETURNS:
     - A1:               beginning of 1st thumbnail (in seconds)
     - A2:               ending of 1st thumbnail (in seconds)
     - B1:               beginning of 2nd thumbnail (in seconds)
     - B2:               ending of 2nd thumbnail (in seconds)

    USAGE EXAMPLE:
        import audioFeatureExtraction as aF
        [fs, x] = basicIO.readAudioFile(input_file)
        [A1, A2, B1, B2] = musicThumbnailing(x, fs)

    [1] Bartsch, M. A., & Wakefield, G. H. (2005). Audio thumbnailing
    of popular music using chroma-based representations.
    Multimedia, IEEE Transactions on, 7(1), 96-104.
    """
    signal = audioBasicIO.stereo_to_mono(signal)
    # feature extraction:
    st_feats, _ = stf.feature_extraction(signal, sampling_rate,
                                         sampling_rate * short_window,
                                         sampling_rate * short_step)

    # self-similarity matrix
    sim_matrix = self_similarity_matrix(st_feats)

    # moving filter:
    m_filter = int(round(thumb_size / short_step))
    diagonal = np.eye(m_filter, m_filter)
    sim_matrix = scipy.signal.convolve2d(sim_matrix, diagonal, 'valid')

    # post-processing (remove main diagonal elements)
    min_sm = np.min(sim_matrix)
    for i in range(sim_matrix.shape[0]):
        for j in range(sim_matrix.shape[1]):
            if abs(i - j) < 5.0 / short_step or i > j:
                sim_matrix[i, j] = min_sm

    # find max position:
    sim_matrix[0:int(limit_1 * sim_matrix.shape[0]), :] = min_sm
    sim_matrix[:, 0:int(limit_1 * sim_matrix.shape[0])] = min_sm
    sim_matrix[int(limit_2 * sim_matrix.shape[0])::, :] = min_sm
    sim_matrix[:, int(limit_2 * sim_matrix.shape[0])::] = min_sm

    rows, cols = np.unravel_index(sim_matrix.argmax(), sim_matrix.shape)
    i1 = rows
    i2 = rows
    j1 = cols
    j2 = cols
    while i2 - i1 < m_filter:
        if i1 <= 0 or j1 <= 0 or i2 >= sim_matrix.shape[0] - 2 or \
                j2 >= sim_matrix.shape[1] - 2:
            break
        if sim_matrix[i1 - 1, j1 - 1] > sim_matrix[i2 + 1, j2 + 1]:
            i1 -= 1
            j1 -= 1
        else:
            i2 += 1
            j2 += 1

    return short_step * i1, short_step * i2, short_step * j1, \
        short_step * j2, sim_matrix
def silenceRemoval(x, fs, st_win, st_step, smoothWindow=0.5, weight=0.5,
                   plot=False):
    """
    Event Detection (silence removal)
    ARGUMENTS:
         - x:                the input audio signal
         - fs:               sampling freq
         - st_win, st_step:  window size and step in seconds
         - smoothWindow:     (optional) smooth window (in seconds)
         - weight:           (optional) weight factor (0 < weight < 1)
                             the higher, the more strict
         - plot:             (optional) True if results are to be plotted
    RETURNS:
         - seg_limits:       list of segment limits in seconds (e.g
                             [[0.1, 0.9], [1.4, 3.0]] means that the resulting
                             segments are (0.1 - 0.9) seconds
                             and (1.4, 3.0) seconds
    """
    if weight >= 1:
        weight = 0.99
    if weight <= 0:
        weight = 0.01

    # Step 1: feature extraction
    x = audioBasicIO.stereo_to_mono(x)
    st_feats, _ = sF.feature_extraction(x, fs, st_win * fs, st_step * fs)
    # st_feats: e.g. (68 features, 966 frames)

    # Step 2: train binary svm classifier of low vs high energy frames
    # keep only the energy short-term sequence (2nd feature)
    st_energy = st_feats[1, :]    # st_energy: e.g. (966,)
    en = np.sort(st_energy)       # sort frames by energy
    # number of 10% of the total short-term windows
    l1 = int(len(en) / 10)
    # compute "lower" 10% energy threshold (mean of the lowest 10%)
    t1 = np.mean(en[0:l1]) + 0.000000000000001
    # compute "higher" 10% energy threshold (mean of the highest 10%)
    t2 = np.mean(en[-l1:-1]) + 0.000000000000001
    # get all features that correspond to low energy
    class1 = st_feats[:, np.where(st_energy <= t1)[0]]
    # get all features that correspond to high energy
    class2 = st_feats[:, np.where(st_energy >= t2)[0]]
    # form the binary classification task and ...
    # e.g. class1.T: (58, 68), class2.T: (38, 68)
    faets_s = [class1.T, class2.T]
    # normalize (subtract mean, divide by std) and train the respective
    # svm probabilistic model (ONSET vs SILENCE)
    [faets_s_norm, means_s, stds_s] = aT.normalizeFeatures(faets_s)
    svm = aT.trainSVM(faets_s_norm, 1.0)

    # Step 3: compute onset probability based on the trained svm
    prob_on_set = []
    for i in range(st_feats.shape[1]):
        # for each frame, take its feature vector, e.g. (68,)
        cur_fv = (st_feats[:, i] - means_s) / stds_s
        # get svm probability (that it belongs to the ONSET class)
        prob_on_set.append(svm.predict_proba(cur_fv.reshape(1, -1))[0][1])
    prob_on_set = np.array(prob_on_set)
    # smooth probability:
    prob_on_set = smoothMovingAvg(prob_on_set, smoothWindow / st_step)

    # Step 4A: detect onset frame indices:
    # sort the detection probabilities
    prog_on_set_sort = np.sort(prob_on_set)
    # find probability Threshold as a weighted average
    # of top 10% and lower 10% of the values
    Nt = int(prog_on_set_sort.shape[0] / 10)
    # weighted average of the lowest 10% and the highest 10% of the sorted
    # probabilities gives the threshold
    T = (np.mean((1 - weight) * prog_on_set_sort[0:Nt]) +
         weight * np.mean(prog_on_set_sort[-Nt::]))

    # get the indices of the frames that satisfy the thresholding
    max_idx = np.where(prob_on_set > T)[0]
    i = 0
    time_clusters = []
    seg_limits = []

    # Step 4B: group frame indices to onset segments
    while i < len(max_idx):
        # for each of the detected onset indices
        cur_cluster = [max_idx[i]]
        if i == len(max_idx) - 1:
            break
        while max_idx[i + 1] - cur_cluster[-1] <= 2:
            cur_cluster.append(max_idx[i + 1])
            i += 1
            if i == len(max_idx) - 1:
                break
        i += 1
        time_clusters.append(cur_cluster)
        seg_limits.append([cur_cluster[0] * st_step,
                           cur_cluster[-1] * st_step])
        # e.g. seg_limits = [[0.12, 1.73], [3.65, 5.29], [7.72, 9.35]]

    # Step 5: Post process: remove very small segments (shorter than 0.2 s):
    min_dur = 0.2
    seg_limits_2 = []
    for s in seg_limits:
        if s[1] - s[0] > min_dur:
            seg_limits_2.append(s)
    seg_limits = seg_limits_2

    if plot:
        timeX = np.arange(0, x.shape[0] / float(fs), 1.0 / fs)
        plt.subplot(2, 1, 1)
        plt.plot(timeX, x / x.max())
        for s in seg_limits:
            plt.axvline(x=s[0], color='red')
            plt.axvline(x=s[1], color='red')
        plt.subplot(2, 1, 2)
        plt.plot(np.arange(0, prob_on_set.shape[0] * st_step, st_step),
                 prob_on_set)
        plt.title('Signal')
        for s in seg_limits:
            plt.axvline(x=s[0], color='red')
            plt.axvline(x=s[1], color='red')
        plt.ylim(0, 1)
        plt.title('svm Probability')
        plt.tight_layout()
        plt.show()

    return seg_limits
def audio_to_asr_text(audio_path, google_credentials_file):
    """
    Audio to asr using google speech API
    :param audio_path: wav audio file to analyze
    :param google_credentials_file: path to google api credentials file
    :return: my_results: list of dicts of the format
             {'word': ..., 'st': ..., 'et': ...}
             data: raw text output (not structured)
    """
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = google_credentials_file
    language_code = "en-US"
    fs, dur = get_wav_properties(audio_path)
    cur_pos = 0
    my_results = []
    data = ""
    number_of_words = 0
    # stereo to mono
    sampling_rate, signal = audioBasicIO.read_audio_file(audio_path)
    signal = audioBasicIO.stereo_to_mono(signal)
    wavfile.write(audio_path, fs, signal)
    #command1 = f"ffmpeg -i {audio_path} -ac 1 {audio_path} -y"
    #os.system(command1)
    while cur_pos < dur:
        encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
        client = speech.SpeechClient()
        config = {
            "language_code": language_code,
            "sample_rate_hertz": fs,
            "enable_word_time_offsets": True,
            "encoding": encoding,
        }
        cur_end = cur_pos + MAX_FILE_DURATION
        if dur < cur_end:
            cur_end = dur
        command = f"ffmpeg -i {audio_path} -ss {cur_pos} -to " \
                  f"{cur_end} temp.wav -loglevel panic -y"
        os.system(command)
        with io.open("temp.wav", "rb") as f:
            content = f.read()
        audio = {"content": content}
        response = client.long_running_recognize(config, audio).result()
        for flag, result in enumerate(response.results):
            alternative = result.alternatives[0]
            data += alternative.transcript
            number_of_words = number_of_words + len(alternative.words)
            for w in alternative.words:
                my_results.append({
                    "word": w.word,
                    "st": w.start_time.seconds +
                          float(w.start_time.nanos) / 10**9 + cur_pos,
                    "et": w.end_time.seconds +
                          float(w.end_time.nanos) / 10**9 + cur_pos
                })
        cur_pos += MAX_FILE_DURATION
    return my_results, data, number_of_words, dur
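# Usage sketch: both paths are hypothetical; a valid Google Cloud credentials
# JSON is required and MAX_FILE_DURATION must be defined at module level, as
# in the original source.
if __name__ == '__main__':
    words_demo, text_demo, n_words_demo, dur_demo = audio_to_asr_text(
        "interview.wav", "google_credentials.json")
    print("{0} words in {1:.1f}s".format(n_words_demo, dur_demo))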
def long_feature_wav(wav_file, mid_window, mid_step, short_window, short_step,
                     accept_small_wavs=False, compute_beat=True,
                     librosa_features=False, surfboard_features=False):
    """
    This function computes the long-term feature per WAV file. It is identical
    to directory_feature_extraction, with simple modifications in order to be
    applied to singular files. Very useful to create a collection of json
    files (1 song -> 1 json). Genre as a feature should be added (very
    simple).

    ARGUMENTS:
        - wav_file:                  the path of the WAV file
        - mid_window, mid_step:      mid-term window and step (in seconds)
        - short_window, short_step:  short-term window and step (in seconds)
    RETURNS:
        - mid_term_features:         The feature vector of a singular wav file
        - mid_feature_names:         The feature names, useful for formatting
    """
    mid_term_features = np.array([])
    sampling_rate, signal = audioBasicIO.read_audio_file(wav_file)
    if sampling_rate == 0:
        return -1

    signal = audioBasicIO.stereo_to_mono(signal)
    size_tolerance = 5
    if accept_small_wavs:
        size_tolerance = 100
    if signal.shape[0] < float(sampling_rate) / size_tolerance:
        print("  (AUDIO FILE TOO SMALL - SKIPPING)")
        return -1

    if compute_beat:
        mid_features, short_features, mid_feature_names = \
            mid_feature_extraction(signal, sampling_rate,
                                   round(mid_window * sampling_rate),
                                   round(mid_step * sampling_rate),
                                   round(sampling_rate * short_window),
                                   round(sampling_rate * short_step))
        beat, beat_conf = beat_extraction(short_features, short_step)
    else:
        mid_features, _, mid_feature_names = \
            mid_feature_extraction(signal, sampling_rate,
                                   round(mid_window * sampling_rate),
                                   round(mid_step * sampling_rate),
                                   round(sampling_rate * short_window),
                                   round(sampling_rate * short_step))
    mid_features = np.transpose(mid_features)
    # long term averaging of mid-term statistics
    mid_features = mid_features.mean(axis=0)
    if (not np.isnan(mid_features).any()) and \
            (not np.isinf(mid_features).any()):
        if compute_beat:
            mid_features = np.append(mid_features, beat)
            mid_features = np.append(mid_features, beat_conf)
            mid_feature_names.append("beat")
            mid_feature_names.append("beat_conf")

        # Block of code responsible for extra features
        if librosa_features:
            librosa_feat, librosa_feat_names = _audio_to_librosa_features(
                wav_file, sampling_rate=sampling_rate)
            mid_features = np.append(mid_features, librosa_feat)
            for element in librosa_feat_names:
                mid_feature_names.append(element)
        if surfboard_features:
            surfboard_feat, surfboard_feat_names = \
                _audio_to_surfboard_features(wav_file,
                                             sampling_rate=sampling_rate)
            mid_features = np.append(mid_features, surfboard_feat)
            for element in surfboard_feat_names:
                mid_feature_names.append(element)
        if len(mid_term_features) == 0:
            # append feature vector
            mid_term_features = mid_features
        else:
            mid_term_features = np.vstack((mid_term_features, mid_features))

    return mid_term_features, mid_feature_names
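# Usage sketch ("song.wav" is a hypothetical path): one long-term averaged
# feature vector per file, with beat features appended by default; the
# function returns -1 on unreadable or too-small files.
if __name__ == '__main__':
    result = long_feature_wav("song.wav", mid_window=1.0, mid_step=1.0,
                              short_window=0.05, short_step=0.05)
    if result != -1:
        vector_demo, names_demo = result
        print(len(names_demo), "features")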
def create_feature_from_audio(filename):
    import ctypes
    import numpy as np
    import pyogg

    # decode the Opus file into a raw 16-bit sample buffer
    # https://github.com/Zuzu-Typ/PyOgg/issues/19
    file = pyogg.OpusFile(filename)
    # always divide by 2 for some reason
    target_datatype = ctypes.c_short * (file.buffer_length // 2)
    buffer_as_array = ctypes.cast(file.buffer,
                                  ctypes.POINTER(target_datatype)).contents
    if file.channels == 1:
        wav = np.array(buffer_as_array)
    elif file.channels == 2:
        # de-interleave the stereo samples into two channels
        wav = np.array((buffer_as_array[0::2], buffer_as_array[1::2]))
    else:
        raise NotImplementedError()

    # This is the final numpy array
    signal = np.transpose(wav)
    sampling_rate = 48000
    print(np.shape(wav))
    #plt.figure()
    #plt.title("Signal Wave...")
    #plt.plot(signal)
    #plt.show()

    # Calculating features from final_data
    from pyAudioAnalysis import MidTermFeatures as mF
    from pyAudioAnalysis import audioBasicIO
    mid_window = round(0.1 * sampling_rate)
    mid_step = round(0.1 * sampling_rate)
    short_window = round(sampling_rate * 0.01)
    short_step = round(sampling_rate * 0.01)

    signal = audioBasicIO.stereo_to_mono(signal)
    print(type(signal))
    # print(np.shape(signal))
    # librosa needs floats, otherwise it raises an error
    signal = signal.astype('float64')
    [mid_features, short_features, mid_feature_names] = \
        mF.mid_feature_extraction(signal, sampling_rate, mid_window,
                                  mid_step, short_window, short_step)
    mid_features = np.transpose(mid_features)
    mid_term_features = mid_features.mean(axis=0)
    mid_term_features = np.reshape(mid_term_features, (-1, 1))
    mid_term_features = np.transpose(mid_term_features)
    # print(np.shape(mid_term_features))
    # len(mid_feature_names)

    # Getting the classification result with Cough=0, No_Cough=1
    from joblib import load
    from sklearn import preprocessing
    cough_classifier = load('Cough_NoCough_classifier.joblib')
    features = preprocessing.StandardScaler().fit_transform(mid_term_features)
    prediction = cough_classifier.predict(features)  # cough=0, no_cough=1
    return prediction, mid_term_features
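# Usage sketch: "cough.opus" is a hypothetical Opus recording, and
# 'Cough_NoCough_classifier.joblib' must exist in the working directory, as
# hard-coded by the function above.
if __name__ == '__main__':
    label_demo, feats_demo = create_feature_from_audio("cough.opus")
    print("prediction (0 = cough, 1 = no cough):", label_demo)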
def speaker_diarization(filename, n_speakers, mid_window=1.0, mid_step=0.1,
                        short_window=0.1, lda_dim=0, plot_res=False):
    """
    ARGUMENTS:
        - filename:           the name of the WAV file to be analyzed
        - n_speakers          the number of speakers (clusters) in
                              the recording (<=0 for unknown)
        - mid_window (opt)    mid-term window size
        - mid_step (opt)      mid-term window step
        - short_window (opt)  short-term window size
        - lda_dim (opt)       LDA dimension (0 for no LDA)
        - plot_res (opt)      0 for not plotting the results 1 for plotting
    """
    sampling_rate, signal = audioBasicIO.read_audio_file(filename)
    signal = audioBasicIO.stereo_to_mono(signal)
    duration = len(signal) / sampling_rate

    base_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "data/models")
    classifier_all, mean_all, std_all, class_names_all, _, _, _, _, _ = \
        at.load_model(os.path.join(base_dir, "svm_rbf_speaker_10"))
    classifier_fm, mean_fm, std_fm, class_names_fm, _, _, _, _, _ = \
        at.load_model(os.path.join(base_dir, "svm_rbf_speaker_male_female"))

    mid_feats, st_feats, _ = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mid_window * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * 0.05),
                                   round(sampling_rate * 0.05))

    mid_term_features = np.zeros((mid_feats.shape[0] + len(class_names_all) +
                                  len(class_names_fm), mid_feats.shape[1]))

    for index in range(mid_feats.shape[1]):
        feature_norm_all = (mid_feats[:, index] - mean_all) / std_all
        feature_norm_fm = (mid_feats[:, index] - mean_fm) / std_fm
        _, p1 = at.classifier_wrapper(classifier_all, "svm_rbf",
                                      feature_norm_all)
        _, p2 = at.classifier_wrapper(classifier_fm, "svm_rbf",
                                      feature_norm_fm)
        start = mid_feats.shape[0]
        end = mid_feats.shape[0] + len(class_names_all)
        mid_term_features[0:mid_feats.shape[0], index] = mid_feats[:, index]
        mid_term_features[start:end, index] = p1 + 1e-4
        mid_term_features[end::, index] = p2 + 1e-4

    # normalize features:
    scaler = StandardScaler()
    mid_feats_norm = scaler.fit_transform(mid_term_features.T)

    # remove outliers:
    dist_all = np.sum(distance.squareform(distance.pdist(mid_feats_norm.T)),
                      axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.1 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    # EnergyMin = np.min(mt_feats[1,:])
    # EnergyMean = np.mean(mt_feats[1,:])
    # Thres = (1.5 * EnergyMin + 0.5 * EnergyMean) / 2.0
    # i_non_outliers = np.nonzero(mt_feats[1,:] > Thres)[0]
    # print i_non_outliers

    mt_feats_norm_or = mid_feats_norm
    mid_feats_norm = mid_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        # extract mid-term features with minimum step:
        window_ratio = int(round(mid_window / short_window))
        step_ratio = int(round(short_window / short_window))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        for index in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        # for each of the short-term features:
        for index in range(num_of_features):
            cur_pos = 0
            feat_len = len(st_feats[index])
            while cur_pos < feat_len:
                n1 = cur_pos
                n2 = cur_pos + window_ratio
                if n2 > feat_len:
                    n2 = feat_len
                short_features = st_feats[index][n1:n2]
                mt_feats_to_red[index].append(np.mean(short_features))
                mt_feats_to_red[index + num_of_features].\
                    append(np.std(short_features))
                cur_pos += step_ratio
        mt_feats_to_red = np.array(mt_feats_to_red)
        mt_feats_to_red_2 = np.zeros((mt_feats_to_red.shape[0] +
                                      len(class_names_all) +
                                      len(class_names_fm),
                                      mt_feats_to_red.shape[1]))
        limit = mt_feats_to_red.shape[0] + len(class_names_all)
        for index in range(mt_feats_to_red.shape[1]):
            feature_norm_all = (mt_feats_to_red[:, index] - mean_all) / std_all
            feature_norm_fm = (mt_feats_to_red[:, index] - mean_fm) / std_fm
            _, p1 = at.classifier_wrapper(classifier_all, "svm_rbf",
                                          feature_norm_all)
            _, p2 = at.classifier_wrapper(classifier_fm, "svm_rbf",
                                          feature_norm_fm)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0], index] = \
                mt_feats_to_red[:, index]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:limit, index] = \
                p1 + 1e-4
            mt_feats_to_red_2[limit::, index] = p2 + 1e-4
        mt_feats_to_red = mt_feats_to_red_2
        scaler = StandardScaler()
        mt_feats_to_red = scaler.fit_transform(mt_feats_to_red.T).T
        labels = np.zeros((mt_feats_to_red.shape[1], ))
        lda_step = 1.0
        lda_step_ratio = lda_step / short_window
        for index in range(labels.shape[0]):
            labels[index] = int(index * short_window / lda_step_ratio)
        clf = sklearn.discriminant_analysis.\
            LinearDiscriminantAnalysis(n_components=lda_dim)
        mid_feats_norm = clf.fit_transform(mt_feats_to_red.T, labels)
        #clf.fit(mt_feats_to_red.T, labels)
        #mid_feats_norm = (clf.transform(mid_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    cluster_labels = []
    sil_all = []
    cluster_centers = []

    for speakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=speakers)
        k_means.fit(mid_feats_norm)
        cls = k_means.labels_
        cluster_labels.append(cls)
        # cluster_centers.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(speakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / \
                             float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mid_feats_norm[cls == c, :]
                # compute average distance between samples
                # that belong to the cluster (a values)
                dist = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(np.mean(dist) * clust_per_cent)
                sil_temp = []
                for c2 in range(speakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = np.nonzero(cls == c2)[0].shape[0] /\
                                           float(len(cls))
                        mid_features_temp = mid_feats_norm[cls == c2, :]
                        dist = distance.cdist(mt_feats_norm_temp,
                                              mid_features_temp)
                        sil_temp.append(np.mean(dist) * (clust_per_cent +
                                                         clust_per_cent_2) / 2.0)
                sil_temp = np.array(sil_temp)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(sil_temp))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        sil = []
        for c in range(speakers):
            # for each cluster (speaker) compute silhouette
            sil.append((sil_2[c] - sil_1[c]) /
                       (max(sil_2[c], sil_1[c]) + 1e-5))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(np.mean(sil))

    imax = int(np.argmax(sil_all))
    # optimal number of clusters
    num_speakers = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    # print(cls)
    # cls = np.zeros((n_wins,))
    # for index in range(n_wins):
    #     j = np.argmin(np.abs(index - i_non_outliers))
    #     cls[index] = cluster_labels[imax][j]

    # Post-process method 1: hmm smoothing
    if lda_dim <= 0:
        for index in range(1):
            # hmm training
            start_prob, transmat, means, cov = \
                train_hmm_compute_statistics(mt_feats_norm_or.T, cls)
            hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
            hmm.startprob_ = start_prob
            hmm.transmat_ = transmat
            hmm.means_ = means
            hmm.covars_ = cov
            cls = hmm.predict(mt_feats_norm_or)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 5)

    class_names = ["speaker{0:d}".format(c) for c in range(num_speakers)]

    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if ground-truth exists
    if os.path.isfile(gt_file):
        seg_start, seg_end, seg_labs = read_segmentation_gt(gt_file)
        flags_gt, class_names_gt = segments_to_labels(seg_start, seg_end,
                                                      seg_labs, mid_step)

    if plot_res:
        fig = plt.figure()
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(np.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(np.array(range(len(cls))) * mid_step + mid_step / 2.0, cls)

    purity_cluster_m, purity_speaker_m = -1, -1
    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(np.array(range(len(flags_gt))) *
                     mid_step + mid_step / 2.0, flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
            evaluate_speaker_diarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(
                          100 * purity_cluster_m, 100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's silhouette")
        plt.show()
    return cls, purity_cluster_m, purity_speaker_m
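# Usage sketch ("meeting.wav" is an assumption): per-window speaker labels
# plus cluster/speaker purities, which stay -1 unless a .segments
# ground-truth file sits next to the WAV.
if __name__ == '__main__':
    labels_demo, pur_cluster_demo, pur_speaker_demo = speaker_diarization(
        "meeting.wav", n_speakers=2, plot_res=False)
    print(labels_demo[:20])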
def silence_removal(signal, sampling_rate, st_win, st_step, smooth_window=0.5,
                    weight=0.5, plot=False):
    """
    Event Detection (silence removal)
    ARGUMENTS:
         - signal:           the input audio signal
         - sampling_rate:    sampling freq
         - st_win, st_step:  window size and step in seconds
         - smooth_window:    (optional) smooth window (in seconds)
         - weight:           (optional) weight factor (0 < weight < 1)
                             the higher, the more strict
         - plot:             (optional) True if results are to be plotted
    RETURNS:
         - seg_limits:       list of segment limits in seconds (e.g
                             [[0.1, 0.9], [1.4, 3.0]] means that the resulting
                             segments are (0.1 - 0.9) seconds
                             and (1.4, 3.0) seconds
    """
    if weight >= 1:
        weight = 0.99
    if weight <= 0:
        weight = 0.01

    # Step 1: feature extraction
    signal = audioBasicIO.stereo_to_mono(signal)
    st_feats, _ = stf.feature_extraction(signal, sampling_rate,
                                         st_win * sampling_rate,
                                         st_step * sampling_rate)

    # Step 2: train binary svm classifier of low vs high energy frames
    # keep only the energy short-term sequence (2nd feature)
    st_energy = st_feats[1, :]
    en = np.sort(st_energy)
    # number of 10% of the total short-term windows
    st_windows_fraction = int(len(en) / 10)

    # compute "lower" 10% energy threshold
    low_threshold = np.mean(en[0:st_windows_fraction]) + 1e-15

    # compute "higher" 10% energy threshold
    high_threshold = np.mean(en[-st_windows_fraction:-1]) + 1e-15

    # get all features that correspond to low energy
    low_energy = st_feats[:, np.where(st_energy <= low_threshold)[0]]

    # get all features that correspond to high energy
    high_energy = st_feats[:, np.where(st_energy >= high_threshold)[0]]

    # form the binary classification task and ...
    features = [low_energy.T, high_energy.T]

    # normalize and train the respective svm probabilistic model
    # (ONSET vs SILENCE)
    features, labels = at.features_to_matrix(features)
    scaler = StandardScaler()
    features_norm = scaler.fit_transform(features)
    mean = scaler.mean_
    std = scaler.scale_
    svm = at.train_svm(features_norm, labels, 1.0)

    # Step 3: compute onset probability based on the trained svm
    prob_on_set = []
    for index in range(st_feats.shape[1]):
        # for each frame
        cur_fv = (st_feats[:, index] - mean) / std
        # get svm probability (that it belongs to the ONSET class)
        prob_on_set.append(svm.predict_proba(cur_fv.reshape(1, -1))[0][1])
    prob_on_set = np.array(prob_on_set)

    # smooth probability:
    prob_on_set = smooth_moving_avg(prob_on_set, smooth_window / st_step)

    # Step 4A: detect onset frame indices:
    prog_on_set_sort = np.sort(prob_on_set)

    # find probability Threshold as a weighted average
    # of top 10% and lower 10% of the values
    nt = int(prog_on_set_sort.shape[0] / 10)
    threshold = (np.mean((1 - weight) * prog_on_set_sort[0:nt]) +
                 weight * np.mean(prog_on_set_sort[-nt::]))

    # get the indices of the frames that satisfy the thresholding
    max_indices = np.where(prob_on_set > threshold)[0]
    index = 0
    seg_limits = []
    time_clusters = []

    # Step 4B: group frame indices to onset segments
    while index < len(max_indices):
        # for each of the detected onset indices
        cur_cluster = [max_indices[index]]
        if index == len(max_indices) - 1:
            break
        while max_indices[index + 1] - cur_cluster[-1] <= 2:
            cur_cluster.append(max_indices[index + 1])
            index += 1
            if index == len(max_indices) - 1:
                break
        index += 1
        time_clusters.append(cur_cluster)
        seg_limits.append([cur_cluster[0] * st_step,
                           cur_cluster[-1] * st_step])

    # Step 5: Post process: remove very small segments:
    min_duration = 0.2
    seg_limits_2 = []
    for s_lim in seg_limits:
        if s_lim[1] - s_lim[0] > min_duration:
            seg_limits_2.append(s_lim)
    seg_limits = seg_limits_2

    if plot:
        time_x = np.arange(0, signal.shape[0] / float(sampling_rate),
                           1.0 / sampling_rate)
        plt.subplot(2, 1, 1)
        plt.plot(time_x, signal)
        for s_lim in seg_limits:
            plt.axvline(x=s_lim[0], color='red')
            plt.axvline(x=s_lim[1], color='red')
        plt.subplot(2, 1, 2)
        plt.plot(np.arange(0, prob_on_set.shape[0] * st_step, st_step),
                 prob_on_set)
        plt.title('Signal')
        for s_lim in seg_limits:
            plt.axvline(x=s_lim[0], color='red')
            plt.axvline(x=s_lim[1], color='red')
        plt.title('svm Probability')
        plt.show()

    return seg_limits
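# Usage sketch, mirroring the older silenceRemoval example above;
# "speech_sample.wav" is a hypothetical input.
if __name__ == '__main__':
    from pyAudioAnalysis import audioBasicIO
    sr_demo, sig_demo = audioBasicIO.read_audio_file("speech_sample.wav")
    print(silence_removal(sig_demo, sr_demo, 0.05, 0.05,
                          smooth_window=0.5, weight=0.3))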
def mid_term_file_classification(input_file, model_name, model_type,
                                 plot_results=False, gt_file=""):
    """
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used,
    i.e. a pre-trained classifier.
    ARGUMENTS:
        - input_file:    path of the input WAV file
        - model_name:    name of the classification model
        - model_type:    svm or knn depending on the classifier type
        - plot_results:  True if results are to be plotted using
                         matplotlib along with a set of statistics
    RETURNS:
        - segs:          a sequence of segment's endpoints: segs[i] is the
                         endpoint of the i-th segment (in seconds)
        - classes:       a sequence of class flags: class[i] is the class ID
                         of the i-th segment
    """
    labels = []
    accuracy = 0.0
    class_names = []
    cm = np.array([])
    if not os.path.isfile(model_name):
        print("mtFileClassificationError: input model_type not found!")
        return labels, class_names, accuracy, cm

    # Load classifier:
    if model_type == "knn":
        classifier, mean, std, class_names, mt_win, mid_step, st_win, \
            st_step, compute_beat = at.load_model_knn(model_name)
    else:
        classifier, mean, std, class_names, mt_win, mid_step, st_win, \
            st_step, compute_beat = at.load_model(model_name)
    if compute_beat:
        print("Model " + model_name + " contains long-term music features "
              "(beat etc) and cannot be used in segmentation")
        return labels, class_names, accuracy, cm

    # load input file
    sampling_rate, signal = audioBasicIO.read_audio_file(input_file)

    # could not read file
    if sampling_rate == 0:
        return labels, class_names, accuracy, cm

    # convert stereo (if) to mono
    signal = audioBasicIO.stereo_to_mono(signal)

    # mid-term feature extraction:
    mt_feats, _, _ = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mt_win * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * st_win),
                                   round(sampling_rate * st_step))
    posterior_matrix = []

    # for each feature vector (i.e. for each fix-sized segment):
    for col_index in range(mt_feats.shape[1]):
        # normalize current feature vector
        feature_vector = (mt_feats[:, col_index] - mean) / std

        # classify vector:
        label_predicted, posterior = \
            at.classifier_wrapper(classifier, model_type, feature_vector)
        labels.append(label_predicted)

        # update probability matrix
        posterior_matrix.append(np.max(posterior))
    labels = np.array(labels)

    # convert fix-sized flags to segments and classes
    segs, classes = labels_to_segments(labels, mid_step)
    for i in range(len(segs)):
        print(segs[i], classes[i])
    segs[-1] = len(signal) / float(sampling_rate)

    # Load ground-truth:
    labels_gt, class_names_gt, accuracy, cm = \
        load_ground_truth(gt_file, labels, class_names, mid_step,
                          plot_results)

    return labels, class_names, accuracy, cm
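# Usage sketch: same contract as mtFileClassification above, under the newer
# API names; "radio.wav" and the "svm_sm" model path are assumptions.
if __name__ == '__main__':
    labels_demo, names_demo, acc_demo, cm_demo = \
        mid_term_file_classification("radio.wav", "svm_sm", "svm")
    print(names_demo, acc_demo)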
import matplotlib.pyplot as plt
import subprocess
import numpy as np
from pyAudioAnalysis import audioBasicIO, MidTermFeatures, audioAnalysis

# extract some audio
VIDEOFILE = "../data/raw/8/replay.mp4"
AUDIOFILE = "./extracted.wav"
FEATUREFILE = "./extracted.ft"

command = f"ffmpeg -i {VIDEOFILE} -vn {AUDIOFILE} -y"
subprocess.call(command, shell=True)

[Fs, x] = audioBasicIO.read_audio_file(AUDIOFILE)
x = audioBasicIO.stereo_to_mono(x)
midF, shortF, midFNames = MidTermFeatures.mid_feature_extraction(
    x, Fs, 0.1 * Fs, 0.05 * Fs, 0.05 * Fs, 0.025 * Fs)
np.save(FEATUREFILE, midF)
np.savetxt(FEATUREFILE + ".csv", midF.T, delimiter=",",
           header=",".join(midFNames))

#%%
audioAnalysis.thumbnailWrapper(AUDIOFILE, 50)

# explore the audio
audioAnalysis.fileSpectrogramWrapper(AUDIOFILE)
audioAnalysis.fileChromagramWrapper(AUDIOFILE)
audioAnalysis.beatExtractionWrapper(AUDIOFILE, True)
#%%
def dirWavFeatureExtraction(dirName, mt_win, mt_step, st_win, st_step,
                            compute_beat=False):
    """
    This function extracts the mid-term features of the WAVE files of a
    particular folder.

    The resulting feature vector is extracted by long-term averaging the
    mid-term features. Therefore ONE FEATURE VECTOR is extracted for each
    WAV file.

    ARGUMENTS:
        - dirName:          the path of the WAVE directory
        - mt_win, mt_step:  mid-term window and step (in seconds)
        - st_win, st_step:  short-term window and step (in seconds)
    """
    all_mt_feats = np.array([])
    process_times = []

    types = ('*.wav', '*.aif', '*.aiff', '*.mp3', '*.au', '*.ogg')
    wav_file_list = []
    for files in types:
        wav_file_list.extend(glob.glob(os.path.join(dirName, files)))

    wav_file_list = sorted(wav_file_list)
    wav_file_list2, mt_feature_names = [], []
    for i, wavFile in enumerate(wav_file_list):
        print("Analyzing file {0:d} of "
              "{1:d}: {2:s}".format(i + 1, len(wav_file_list), wavFile))
        if os.stat(wavFile).st_size == 0:
            print("   (EMPTY FILE -- SKIPPING)")
            continue
        [fs, x] = audioBasicIO.read_audio_file(wavFile)
        if isinstance(x, int):
            continue

        t1 = time.time()
        x = audioBasicIO.stereo_to_mono(x)
        if x.shape[0] < float(fs) / 5:
            print("  (AUDIO FILE TOO SMALL - SKIPPING)")
            continue
        wav_file_list2.append(wavFile)
        if compute_beat:
            [mt_term_feats, st_features, mt_feature_names] = \
                mid_term_feature_extraction(x, fs, round(mt_win * fs),
                                            round(mt_step * fs),
                                            round(fs * st_win),
                                            round(fs * st_step))
            [beat, beat_conf] = beatExtraction(st_features, st_step)
        else:
            [mt_term_feats, _, mt_feature_names] = \
                mid_term_feature_extraction(x, fs, round(mt_win * fs),
                                            round(mt_step * fs),
                                            round(fs * st_win),
                                            round(fs * st_step))

        mt_term_feats = np.transpose(mt_term_feats)
        # long term averaging of mid-term statistics
        mt_term_feats = mt_term_feats.mean(axis=0)
        if (not np.isnan(mt_term_feats).any()) and \
                (not np.isinf(mt_term_feats).any()):
            if compute_beat:
                mt_term_feats = np.append(mt_term_feats, beat)
                mt_term_feats = np.append(mt_term_feats, beat_conf)
            if len(all_mt_feats) == 0:
                # append feature vector
                all_mt_feats = mt_term_feats
            else:
                all_mt_feats = np.vstack((all_mt_feats, mt_term_feats))
            t2 = time.time()
            duration = float(len(x)) / fs
            process_times.append((t2 - t1) / duration)
    if len(process_times) > 0:
        print("Feature extraction complexity ratio: "
              "{0:.1f} x realtime".format(
                  (1.0 / np.mean(np.array(process_times)))))
    return (all_mt_feats, wav_file_list2, mt_feature_names)
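# Usage sketch ("../data/music" is a hypothetical folder of audio files): one
# averaged mid-term feature vector per readable file.
if __name__ == '__main__':
    feats_demo, files_demo, names_demo = dirWavFeatureExtraction(
        "../data/music", 1.0, 1.0, 0.05, 0.05)
    print(len(files_demo), "files analyzed")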
def fileGreenwaySpeakerDiarization(filename, output_folder, speech_key="52fe944f29784ae288482e5eb3092e2a", service_region="eastus2", n_speakers=2, mt_size=2.0, mt_step=0.2, st_win=0.05, lda_dim=35): """ ARGUMENTS: - filename: the name of the WAV file to be analyzed the filename should have a suffix of the form: ..._min_3 this informs the service that audio file corresponds to the 3rd minute of the dialogue - output_folder the folder location for saving the audio snippets generated from diarization - speech_key mid-term window size - service_region the number of speakers (clusters) in the recording (<=0 for unknown) - n_speakers the number of speakers (clusters) in the recording (<=0 for unknown) - mt_size (opt) mid-term window size - mt_step (opt) mid-term window step - st_win (opt) short-term window size - lda_dim (opt LDA dimension (0 for no LDA) - plot_res (opt) 0 for not plotting the results 1 for plotting - save_plot (opt) 1|True for saving plot in output folder """ ''' OUTPUTS: - cls: this is a vector with speaker ids in chronological sequence of speaker dialogue. - output: a list of python dictionaries containing dialogue sequence information. - dialogue_id - sequence_id - start_time - end_time - text ''' filename_only = filename if "/" not in filename else filename.split("/")[-1] nameoffile = filename_only.split("_min_")[0] timeoffile = filename_only.split("_min_")[1] [fs, x] = audioBasicIO.read_audio_file(filename) x = audioBasicIO.stereo_to_mono(x) duration = len(x) / fs [classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.load_model_knn( os.path.join(os.path.dirname(os.path.realpath(__file__)), "pyAudioAnalysis/data/models", "knn_speaker_10")) [classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.load_model_knn( os.path.join(os.path.dirname(os.path.realpath(__file__)), "pyAudioAnalysis/data/models", "knn_speaker_male_female")) [mt_feats, st_feats, _] = aF.mid_feature_extraction(x, fs, mt_size * fs, mt_step * fs, round(fs * st_win), round(fs*st_win * 0.5)) MidTermFeatures2 = np.zeros((mt_feats.shape[0] + len(classNames1) + len(classNames2), mt_feats.shape[1])) for i in range(mt_feats.shape[1]): cur_f1 = (mt_feats[:, i] - MEAN1) / STD1 cur_f2 = (mt_feats[:, i] - MEAN2) / STD2 [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1) [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2) MidTermFeatures2[0:mt_feats.shape[0], i] = mt_feats[:, i] MidTermFeatures2[mt_feats.shape[0]:mt_feats.shape[0] + len(classNames1), i] = P1 + 0.0001 MidTermFeatures2[mt_feats.shape[0] + len(classNames1)::, i] = P2 + 0.0001 mt_feats = MidTermFeatures2 # TODO iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53] mt_feats = mt_feats[iFeaturesSelect, :] (mt_feats_norm, MEAN, STD) = aT.normalizeFeatures([mt_feats.T]) mt_feats_norm = mt_feats_norm[0].T n_wins = mt_feats.shape[1] # remove outliers: dist_all = np.sum(distance.squareform(distance.pdist(mt_feats_norm.T)), axis=0) m_dist_all = np.mean(dist_all) i_non_outliers = np.nonzero(dist_all < 1.2 * m_dist_all)[0] # TODO: Combine energy threshold for outlier removal: #EnergyMin = np.min(mt_feats[1,:]) #EnergyMean = np.mean(mt_feats[1,:]) #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0 #i_non_outliers = np.nonzero(mt_feats[1,:] > Thres)[0] # print i_non_outliers perOutLier = (100.0 * (n_wins - i_non_outliers.shape[0])) / n_wins mt_feats_norm_or = mt_feats_norm mt_feats_norm = mt_feats_norm[:, 
    # LDA dimensionality reduction:
    if lda_dim > 0:
        # [mt_feats_to_red, _, _] = aF.mtFeatureExtraction(
        #     x, fs, mt_size * fs, st_win * fs,
        #     round(fs * st_win), round(fs * st_win))
        # extract mid-term features with minimum step:
        mt_win_ratio = int(round(mt_size / st_win))
        mt_step_ratio = int(round(st_win / st_win))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        # for i in range(num_of_stats * num_of_features + 1):
        for i in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        # for each of the short-term features:
        for i in range(num_of_features):
            curPos = 0
            N = len(st_feats[i])
            while curPos < N:
                N1 = curPos
                N2 = curPos + mt_win_ratio
                if N2 > N:
                    N2 = N
                curStFeatures = st_feats[i][N1:N2]
                mt_feats_to_red[i].append(np.mean(curStFeatures))
                mt_feats_to_red[i + num_of_features].append(
                    np.std(curStFeatures))
                curPos += mt_step_ratio

        mt_feats_to_red = np.array(mt_feats_to_red)
        mt_feats_to_red_2 = np.zeros((mt_feats_to_red.shape[0] +
                                      len(classNames1) + len(classNames2),
                                      mt_feats_to_red.shape[1]))
        for i in range(mt_feats_to_red.shape[1]):
            cur_f1 = (mt_feats_to_red[:, i] - MEAN1) / STD1
            cur_f2 = (mt_feats_to_red[:, i] - MEAN2) / STD2
            [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
            [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0], i] = \
                mt_feats_to_red[:, i]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:
                              mt_feats_to_red.shape[0] + len(classNames1),
                              i] = P1 + 0.0001
            mt_feats_to_red_2[mt_feats_to_red.shape[0] + len(classNames1)::,
                              i] = P2 + 0.0001
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[iFeaturesSelect, :]
        # mt_feats_to_red += np.random.rand(mt_feats_to_red.shape[0],
        #     mt_feats_to_red.shape[1]) * 0.0000010
        (mt_feats_to_red, MEAN, STD) = \
            aT.normalizeFeatures([mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        # dist_all = np.sum(distance.squareform(
        #     distance.pdist(mt_feats_to_red.T)), axis=0)
        # m_dist_all = np.mean(dist_all)
        # iNonOutLiers2 = np.nonzero(dist_all < 3.0 * m_dist_all)[0]
        # mt_feats_to_red = mt_feats_to_red[:, iNonOutLiers2]

        Labels = np.zeros((mt_feats_to_red.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / st_win
        # print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * st_win / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, Labels)
        mt_feats_norm = (clf.transform(mt_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]

    clsAll = []
    sil_all = []
    centersAll = []

    for iSpeakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(mt_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_
        # Y = distance.squareform(distance.pdist(mt_feats_norm.T))
        clsAll.append(cls)
        centersAll.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(iSpeakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / \
                             float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mt_feats_norm[:, cls == c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                Yt = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(np.mean(Yt) * clust_per_cent)
                silBs = []
                for c2 in range(iSpeakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = \
                            np.nonzero(cls == c2)[0].shape[0] / \
                            float(len(cls))
                        MidTermFeaturesNormTemp2 = mt_feats_norm[:, cls == c2]
                        Yt = distance.cdist(mt_feats_norm_temp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(np.mean(Yt) *
                                     (clust_per_cent +
                                      clust_per_cent_2) / 2.0)
                silBs = np.array(silBs)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(silBs))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        sil = []
        for c in range(iSpeakers):
            # for each cluster (speaker) compute silhouette
            sil.append((sil_2[c] - sil_1[c]) /
                       (max(sil_2[c], sil_1[c]) + 0.00001))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(np.mean(sil))

    imax = np.argmax(sil_all)
    # optimal number of clusters
    nSpeakersFinal = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = np.zeros((n_wins,))
    for i in range(n_wins):
        j = np.argmin(np.abs(i - i_non_outliers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):
        # hmm training
        start_prob, transmat, means, cov = \
            trainHMM_computeStatistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = sil_all[imax]
    class_names = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]

    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if ground-truth exists
    if os.path.isfile(gt_file):
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start, seg_end,
                                              seg_labs, mt_step)

    # if plot_res:
    #     fig = plt.figure()
    #     if n_speakers > 0:
    #         ax1 = fig.add_subplot(111)
    #     else:
    #         ax1 = fig.add_subplot(211)
    #     ax1.set_yticks(np.array(range(len(class_names))))
    #     ax1.axis((0, duration, -1, len(class_names)))
    #     ax1.set_yticklabels(class_names)
    #     ax1.plot(np.array(range(len(cls))) * mt_step + mt_step / 2.0, cls)
    # if os.path.isfile(gt_file):
    #     if plot_res:
    #         ax1.plot(np.array(range(len(flags_gt))) *
    #                  mt_step + mt_step / 2.0, flags_gt, 'r')
    #     purity_cluster_m, purity_speaker_m = \
    #         evaluateSpeakerDiarization(cls, flags_gt)
    #     print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
    #                                     100 * purity_speaker_m))
    #     if plot_res:
    #         plt.title("Cluster purity: {0:.1f}% - "
    #                   "Speaker purity: {1:.1f}%".format(
    #                       100 * purity_cluster_m, 100 * purity_speaker_m))
    # if plot_res:
    #     plt.xlabel("time (seconds)")
    #     # print s_range, sil_all
    #     if n_speakers <= 0:
    #         plt.subplot(212)
    #         plt.plot(s_range, sil_all)
    #         plt.xlabel("number of clusters")
    #         plt.ylabel("average clustering's silhouette")
    #     if save_plot:
    #         plt.savefig(
    #             f"{output_folder}{filename_only}".replace(".wav", ".png"))
    #     else:
    #         pass
    #     plt.show()
    # Create Time Vector
    time_vec = np.array(range(len(cls))) * mt_step + mt_step / 2.0

    # Find Change Points
    speaker_change_index = np.where(np.roll(cls, 1) != cls)[0]

    # Create List of dialogue convos
    output_list = []
    temp = {}
    for ind, sc in enumerate(speaker_change_index):
        temp['dialogue_id'] = str(datetime.now()).strip()
        temp['sequence_id'] = str(ind)
        temp['speaker'] = list(cls)[sc]
        temp['start_time'] = time_vec[sc]
        temp['end_time'] = time_vec[speaker_change_index[ind + 1] - 1] \
            if ind + 1 < len(speaker_change_index) else time_vec[-1]
        temp["text"] = ""
        output_list.append(temp)
        temp = {}

    def snip_transcribe(output_list, filename,
                        output_folder=output_folder,
                        speech_key=speech_key,
                        service_region=service_region):
        speech_config = speechsdk.SpeechConfig(subscription=speech_key,
                                               region=service_region)
        # enable_dictation is a method and must be called
        # (the bare attribute reference was a no-op)
        speech_config.enable_dictation()

        def recognized_cb(evt):
            # `ind` is bound to the loop variable below at call time
            if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
                # append the recognized text to the current snippet
                output_list[ind]['text'] = output_list[ind]['text'] + \
                    str(evt.result.text)
                print(evt.result.text)

        for ind, diag in enumerate(output_list):
            t1 = diag['start_time']
            t2 = diag['end_time']
            newAudio = AudioSegment.from_wav(filename)
            chunk = newAudio[t1 * 1000:t2 * 1000]
            filename_out = output_folder + \
                f"snippet_{diag['sequence_id']}.wav"
            # Exports to a wav file in the current path.
            chunk.export(filename_out, format="wav")
            done = False

            def stop_cb(evt):
                """callback that signals to stop continuous recognition
                upon receiving an event `evt`"""
                print('CLOSING on {}'.format(evt))
                nonlocal done
                done = True

            audio_input = speechsdk.AudioConfig(filename=filename_out)
            speech_recognizer = speechsdk.SpeechRecognizer(
                speech_config=speech_config, audio_config=audio_input)
            output_list[ind]['snippet_path'] = filename_out
            speech_recognizer.recognized.connect(recognized_cb)
            speech_recognizer.session_stopped.connect(stop_cb)
            speech_recognizer.canceled.connect(stop_cb)

            # Start continuous speech recognition
            speech_recognizer.start_continuous_recognition()
            while not done:
                time.sleep(.5)
            speech_recognizer.stop_continuous_recognition()

        return output_list

    output = snip_transcribe(output_list, filename,
                             output_folder=output_folder)
    output_json = {filename_only: output}

    with open(f"{output_folder}{nameoffile}_{timeoffile}.txt",
              "w") as outfile:
        json.dump(output_json, outfile)

    return cls, output_json
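
# Usage sketch (hypothetical file and folder names; a valid Azure Speech key
# and region are assumed). The "_min_3" suffix marks the clip as the 3rd
# minute of the dialogue:
# cls, output_json = fileGreenwaySpeakerDiarization(
#     "../data/interview_min_3.wav", "../output/",
#     speech_key="<your-azure-speech-key>", service_region="eastus2",
#     n_speakers=2)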

def loadAudio(path):
    """Read an audio file and down-mix it to a single channel."""
    Fs, x = audioBasicIO.read_audio_file(path)
    x = audioBasicIO.stereo_to_mono(x)
    return Fs, x
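
# Usage sketch (hypothetical path):
# Fs, x = loadAudio("../data/song1.mp3")
# print("loaded {0:.1f} s of audio at {1:d} Hz".format(len(x) / Fs, Fs))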

def directory_feature_extraction(folder_path, mid_window, mid_step,
                                 short_window, short_step,
                                 compute_beat=True):
    """
    This function extracts the mid-term features of the WAVE files of a
    particular folder.

    The resulting feature vector is extracted by long-term averaging the
    mid-term features. Therefore ONE FEATURE VECTOR is extracted for each
    WAV file.

    ARGUMENTS:
        - folder_path:               the path of the WAVE directory
        - mid_window, mid_step:      mid-term window and step (in seconds)
        - short_window, short_step:  short-term window and step (in seconds)
    """
    mid_term_features = np.array([])
    process_times = []

    types = ('*.wav', '*.aif', '*.aiff', '*.mp3', '*.au', '*.ogg')
    wav_file_list = []
    for files in types:
        wav_file_list.extend(glob.glob(os.path.join(folder_path, files)))

    wav_file_list = sorted(wav_file_list)
    wav_file_list2, mid_feature_names = [], []
    for i, file_path in enumerate(wav_file_list):
        print("Analyzing file {0:d} of {1:d}: {2:s}".format(
            i + 1, len(wav_file_list), file_path))
        if os.stat(file_path).st_size == 0:
            print("   (EMPTY FILE -- SKIPPING)")
            continue
        sampling_rate, signal = audioBasicIO.read_audio_file(file_path)
        if sampling_rate == 0:
            continue

        t1 = time.time()
        signal = audioBasicIO.stereo_to_mono(signal)
        if signal.shape[0] < float(sampling_rate) / 5:
            print("  (AUDIO FILE TOO SMALL - SKIPPING)")
            continue
        wav_file_list2.append(file_path)
        if compute_beat:
            mid_features, short_features, mid_feature_names = \
                mid_feature_extraction(signal, sampling_rate,
                                       round(mid_window * sampling_rate),
                                       round(mid_step * sampling_rate),
                                       round(sampling_rate * short_window),
                                       round(sampling_rate * short_step))
            beat, beat_conf = beat_extraction(short_features, short_step)
        else:
            mid_features, _, mid_feature_names = \
                mid_feature_extraction(signal, sampling_rate,
                                       round(mid_window * sampling_rate),
                                       round(mid_step * sampling_rate),
                                       round(sampling_rate * short_window),
                                       round(sampling_rate * short_step))

        mid_features = np.transpose(mid_features)
        # long term averaging of mid-term statistics
        mid_features = mid_features.mean(axis=0)
        if (not np.isnan(mid_features).any()) and \
                (not np.isinf(mid_features).any()):
            if compute_beat:
                mid_features = np.append(mid_features, beat)
                mid_features = np.append(mid_features, beat_conf)
            if len(mid_term_features) == 0:
                # append feature vector
                mid_term_features = mid_features
            else:
                mid_term_features = np.vstack(
                    (mid_term_features, mid_features))
            t2 = time.time()
            duration = float(len(signal)) / sampling_rate
            process_times.append((t2 - t1) / duration)
    if len(process_times) > 0:
        print("Feature extraction complexity ratio: "
              "{0:.1f} x realtime".format(
                  1.0 / np.mean(np.array(process_times))))
    return mid_term_features, wav_file_list2, mid_feature_names
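
# Usage sketch (hypothetical folder name): same long-term averaging as the
# function above; with the default compute_beat=True every vector is also
# extended by the estimated beat and beat confidence from beat_extraction.
if __name__ == '__main__':
    feats, files, feat_names = directory_feature_extraction(
        "../data/speech", 1.0, 1.0, 0.05, 0.05, compute_beat=False)
    print("{0:d} files, {1:d} features per file".format(
        len(files), feats.shape[1]))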

import os

import numpy as np
import opensmile
import pandas as pd
from pyAudioAnalysis import audioBasicIO
from tqdm import tqdm

# NOTE: pos_path / neg_path are assumed to point at folders of positive and
# negative audio examples; adjust these placeholder paths to your data layout.
pos_path = "../data/positive"
neg_path = "../data/negative"

smile = opensmile.Smile(feature_set=opensmile.FeatureSet.eGeMAPSv01b,
                        feature_level=opensmile.FeatureLevel.Functionals,
                        )

data = pd.DataFrame([])
for wave_file in tqdm(os.listdir(pos_path)):
    wav_file_path = os.path.join(pos_path, wave_file)
    sampling_rate, signal = audioBasicIO.read_audio_file(wav_file_path)
    signal = audioBasicIO.stereo_to_mono(signal)
    ps = smile.process_signal(signal, sampling_rate)
    # drop the (start, end) index columns produced by openSMILE
    ps = pd.DataFrame(ps).reset_index().iloc[:, 2:]
    ps['filename'] = wave_file.split('.')[0]
    ps['label'] = np.ones(len(ps))
    data = pd.concat([data, ps])
data_pos = data.reset_index().iloc[:, 1:]

data = pd.DataFrame([])
for idx, wave_file in enumerate(os.listdir(neg_path)):
    wav_file_path = os.path.join(neg_path, wave_file)
    sampling_rate, signal = audioBasicIO.read_audio_file(wav_file_path)
    # mirror the positive-class loop above, with label 0
    signal = audioBasicIO.stereo_to_mono(signal)
    ps = smile.process_signal(signal, sampling_rate)
    ps = pd.DataFrame(ps).reset_index().iloc[:, 2:]
    ps['filename'] = wave_file.split('.')[0]
    ps['label'] = np.zeros(len(ps))
    data = pd.concat([data, ps])
data_neg = data.reset_index().iloc[:, 1:]
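
# Sketch (assumes the two loops above have completed): stack both classes
# into a single labelled table; X and y can then feed any scikit-learn
# classifier.
full_data = pd.concat([data_pos, data_neg]).reset_index(drop=True)
X = full_data.drop(columns=['filename', 'label']).values
y = full_data['label'].values
print("feature matrix:", X.shape, "labels:", y.shape)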