Example 1
"""! 
@brief Example 29
@details: Music segmentation example
@author Theodoros Giannakopoulos {[email protected]}
"""
import os, readchar, sklearn.cluster
from pyAudioAnalysis.MidTermFeatures import mid_feature_extraction as mT
from pyAudioAnalysis.audioBasicIO import read_audio_file, stereo_to_mono
from pyAudioAnalysis.audioSegmentation import labels_to_segments
from pyAudioAnalysis.audioTrainTest import normalize_features

if __name__ == '__main__':
    # read signal and get normalized segment features:
    input_file = "../data/song1.mp3"
    fs, x = read_audio_file(input_file)
    x = stereo_to_mono(x)
    mt_size, mt_step, st_win = 5, 0.5, 0.05
    [mt_feats, st_feats, _] = mT(x, fs, mt_size * fs, mt_step * fs,
                                 round(fs * st_win), round(fs * st_win * 0.5))
    (mt_feats_norm, MEAN, STD) = normalize_features([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    # perform clustering (k = 4)
    n_clusters = 4
    k_means = sklearn.cluster.KMeans(n_clusters=n_clusters)
    k_means.fit(mt_feats_norm.T)
    cls = k_means.labels_
    segs, c = labels_to_segments(cls,
                                 mt_step)  # convert flags to segment limits
    for sp in range(n_clusters):  # play each cluster's segment
        for i in range(len(c)):
            if c[i] == sp and segs[i, 1] - segs[i, 0] > 5:
                # assumed completion (the original listing is truncated here):
                # extract the segment to a temp wav with ffmpeg, play it with
                # sox's `play`, and wait for a keypress before continuing
                print("cluster {}: {:.1f}s - {:.1f}s".format(
                    sp, segs[i, 0], segs[i, 1]))
                cmd = "ffmpeg -i {} -ss {} -t {} temp.wav " \
                      "-loglevel panic -y".format(input_file, segs[i, 0],
                                                  segs[i, 1] - segs[i, 0])
                os.system(cmd)
                os.system("play temp.wav -q")
                readchar.readchar()
Example 2
def silenceRemoval(x, fs, st_win, st_step, smoothWindow=0.5, weight=0.5, plot=False):
    """
    Event Detection (silence removal)
    ARGUMENTS:
         - x:                the input audio signal
         - fs:               sampling freq
         - st_win, st_step:    window size and step in seconds
         - smoothWindow:     (optinal) smooth window (in seconds)
         - weight:           (optinal) weight factor (0 < weight < 1)
                              the higher, the more strict
         - plot:             (optinal) True if results are to be plotted
    RETURNS:
         - seg_limits:    list of segment limits in seconds (e.g [[0.1, 0.9],
                          [1.4, 3.0]] means that
                          the resulting segments are (0.1 - 0.9) seconds
                          and (1.4, 3.0) seconds
    """

    if weight >= 1:
        weight = 0.99
    if weight <= 0:
        weight = 0.01

    # Step 1: feature extraction
    x = audioBasicIO.stereo_to_mono(x)
    st_feats, _ = sF.feature_extraction(x, fs, st_win * fs, st_step * fs)

    # Step 2: train binary svm classifier of low vs high energy frames
    # keep only the energy short-term sequence (2nd feature)
    st_energy = st_feats[1, :]
    en = np.sort(st_energy)
    # number of 10% of the total short-term windows
    l1 = int(len(en) / 10)
    # compute "lower" 10% energy threshold
    t1 = np.mean(en[0:l1]) + 1e-15
    # compute "higher" 10% energy threshold
    t2 = np.mean(en[-l1:-1]) + 1e-15
    # get all features that correspond to low energy
    class1 = st_feats[:, np.where(st_energy <= t1)[0]]
    # get all features that correspond to high energy
    class2 = st_feats[:, np.where(st_energy >= t2)[0]]
    # form the binary classification task and ...
    faets_s = [class1.T, class2.T]
    # normalize and train the respective svm probabilistic model
    # (ONSET vs SILENCE)
    [faets_s_norm, means_s, stds_s] = aT.normalizeFeatures(faets_s)
    svm = aT.trainSVM(faets_s_norm, 1.0)

    # Step 3: compute onset probability based on the trained svm
    prob_on_set = []
    for i in range(st_feats.shape[1]):
        # for each frame
        cur_fv = (st_feats[:, i] - means_s) / stds_s
        # get svm probability (that it belongs to the ONSET class)
        prob_on_set.append(svm.predict_proba(cur_fv.reshape(1, -1))[0][1])
    prob_on_set = np.array(prob_on_set)
    # smooth probability:
    prob_on_set = smoothMovingAvg(prob_on_set, smoothWindow / st_step)

    # Step 4A: detect onset frame indices:
    prog_on_set_sort = np.sort(prob_on_set)
    # find probability Threshold as a weighted average
    # of top 10% and lower 10% of the values
    Nt = int(prog_on_set_sort.shape[0] / 10)
    T = ((1 - weight) * np.mean(prog_on_set_sort[0:Nt]) +
         weight * np.mean(prog_on_set_sort[-Nt::]))

    max_idx = np.where(prob_on_set > T)[0]
    # get the indices of the frames that satisfy the thresholding
    i = 0
    time_clusters = []
    seg_limits = []

    # Step 4B: group frame indices to onset segments
    while i < len(max_idx):
        # for each of the detected onset indices
        cur_cluster = [max_idx[i]]
        if i == len(max_idx)-1:
            break
        while max_idx[i+1] - cur_cluster[-1] <= 2:
            cur_cluster.append(max_idx[i+1])
            i += 1
            if i == len(max_idx)-1:
                break
        i += 1
        time_clusters.append(cur_cluster)
        seg_limits.append([cur_cluster[0] * st_step,
                           cur_cluster[-1] * st_step])

    # Step 5: Post process: remove very small segments:
    min_dur = 0.2
    seg_limits_2 = []
    for s in seg_limits:
        if s[1] - s[0] > min_dur:
            seg_limits_2.append(s)
    seg_limits = seg_limits_2

    if plot:
        timeX = np.arange(0, x.shape[0] / float(fs), 1.0 / fs)

        plt.subplot(2, 1, 1)
        plt.plot(timeX, x)
        plt.title('Signal')
        for s in seg_limits:
            plt.axvline(x=s[0], color='red')
            plt.axvline(x=s[1], color='red')
        plt.subplot(2, 1, 2)
        plt.plot(np.arange(0, prob_on_set.shape[0] * st_step, st_step),
                 prob_on_set)
        plt.title('svm Probability')
        for s in seg_limits:
            plt.axvline(x=s[0], color='red')
            plt.axvline(x=s[1], color='red')
        plt.show()

    return seg_limits
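
A minimal usage sketch for silenceRemoval above ("speech.wav" is a placeholder path; audioBasicIO is the same pyAudioAnalysis module the function itself relies on):

from pyAudioAnalysis import audioBasicIO

fs, x = audioBasicIO.read_audio_file("speech.wav")
segments = silenceRemoval(x, fs, st_win=0.05, st_step=0.05,
                          smoothWindow=0.5, weight=0.5, plot=False)
for start, end in segments:
    print("voiced segment: {:.2f}s - {:.2f}s".format(start, end))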
Example 3
def speakerDiarization(filename, n_speakers, mt_size=2.0, mt_step=0.2, 
                       st_win=0.05, lda_dim=35, plot_res=False):
    """
    ARGUMENTS:
        - filename:        the name of the WAV file to be analyzed
        - n_speakers       the number of speakers (clusters) in
                           the recording (<=0 for unknown)
        - mt_size (opt)    mid-term window size
        - mt_step (opt)    mid-term window step
        - st_win  (opt)    short-term window size
        - lda_dim (opt     LDA dimension (0 for no LDA)
        - plot_res         (opt)   0 for not plotting the results 1 for plotting
    """
    [fs, x] = audioBasicIO.read_audio_file(filename)
    x = audioBasicIO.stereo_to_mono(x)
    duration = len(x) / fs

    model_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             "data/models")
    [classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1,
     computeBEAT1] = aT.load_model_knn(os.path.join(model_dir,
                                                    "knn_speaker_10"))
    [classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2,
     computeBEAT2] = aT.load_model_knn(os.path.join(model_dir,
                                                    "knn_speaker_male_female"))

    [mt_feats, st_feats, _] = aF.mid_feature_extraction(x, fs, mt_size * fs,
                                                        mt_step * fs,
                                                        round(fs * st_win),
                                                        round(fs*st_win * 0.5))

    MidTermFeatures2 = np.zeros((mt_feats.shape[0] + len(classNames1) +
                                    len(classNames2), mt_feats.shape[1]))

    for i in range(mt_feats.shape[1]):
        cur_f1 = (mt_feats[:, i] - MEAN1) / STD1
        cur_f2 = (mt_feats[:, i] - MEAN2) / STD2
        [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
        [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
        MidTermFeatures2[0:mt_feats.shape[0], i] = mt_feats[:, i]
        MidTermFeatures2[mt_feats.shape[0]:mt_feats.shape[0]+len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[mt_feats.shape[0] + len(classNames1)::, i] = P2 + 0.0001

    mt_feats = MidTermFeatures2    # TODO
    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41,
                       42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]

    mt_feats = mt_feats[iFeaturesSelect, :]

    (mt_feats_norm, MEAN, STD) = aT.normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    n_wins = mt_feats.shape[1]

    # remove outliers:
    dist_all = np.sum(distance.squareform(distance.pdist(mt_feats_norm.T)),
                         axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.2 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = np.min(mt_feats[1,:])
    #EnergyMean = np.mean(mt_feats[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #i_non_outliers = np.nonzero(mt_feats[1,:] > Thres)[0]
    #print i_non_outliers

    perOutLier = (100.0 * (n_wins - i_non_outliers.shape[0])) / n_wins
    mt_feats_norm_or = mt_feats_norm
    mt_feats_norm = mt_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        #[mt_feats_to_red, _, _] = aF.mtFeatureExtraction(x, fs, mt_size * fs,
        # st_win * fs, round(fs*st_win), round(fs*st_win));
        # extract mid-term features with minimum step:
        mt_win_ratio = int(round(mt_size / st_win))
        mt_step_ratio = int(round(st_win / st_win))  # always 1 (minimum step)
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        #for i in range(num_of_stats * num_of_features + 1):
        for i in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        for i in range(num_of_features):  # for each of the short-term features:
            curPos = 0
            N = len(st_feats[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mt_win_ratio
                if N2 > N:
                    N2 = N
                curStFeatures = st_feats[i][N1:N2]
                mt_feats_to_red[i].append(np.mean(curStFeatures))
                mt_feats_to_red[i+num_of_features].append(np.std(curStFeatures))
                curPos += mt_step_ratio
        mt_feats_to_red = np.array(mt_feats_to_red)
        mt_feats_to_red_2 = np.zeros((mt_feats_to_red.shape[0] +
                                        len(classNames1) + len(classNames2),
                                         mt_feats_to_red.shape[1]))
        for i in range(mt_feats_to_red.shape[1]):
            cur_f1 = (mt_feats_to_red[:, i] - MEAN1) / STD1
            cur_f2 = (mt_feats_to_red[:, i] - MEAN2) / STD2
            [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
            [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0], i] = mt_feats_to_red[:, i]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:mt_feats_to_red.shape[0] + len(classNames1), i] = P1 + 0.0001
            mt_feats_to_red_2[mt_feats_to_red.shape[0]+len(classNames1)::, i] = P2 + 0.0001
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[iFeaturesSelect, :]
        #mt_feats_to_red += np.random.rand(mt_feats_to_red.shape[0], mt_feats_to_red.shape[1]) * 0.0000010
        (mt_feats_to_red, MEAN, STD) = aT.normalizeFeatures([mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        #dist_all = np.sum(distance.squareform(distance.pdist(mt_feats_to_red.T)), axis=0)
        #m_dist_all = np.mean(dist_all)
        #iNonOutLiers2 = np.nonzero(dist_all < 3.0*m_dist_all)[0]
        #mt_feats_to_red = mt_feats_to_red[:, iNonOutLiers2]
        Labels = np.zeros((mt_feats_to_red.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / st_win
        #print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * st_win / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, Labels)
        mt_feats_norm = (clf.transform(mt_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    clsAll = []
    sil_all = []
    centersAll = []
    
    for iSpeakers in s_range:        
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(mt_feats_norm.T)
        cls = k_means.labels_        
        means = k_means.cluster_centers_

        # Y = distance.squareform(distance.pdist(mt_feats_norm.T))
        clsAll.append(cls)
        centersAll.append(means)
        sil_1, sil_2 = [], []
        for c in range(iSpeakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / \
                             float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mt_feats_norm[:, cls==c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                Yt = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(np.mean(Yt)*clust_per_cent)
                silBs = []
                for c2 in range(iSpeakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = np.nonzero(cls == c2)[0].shape[0] /\
                                           float(len(cls))
                        MidTermFeaturesNormTemp2 = mt_feats_norm[:, cls == c2]
                        Yt = distance.cdist(mt_feats_norm_temp.T, 
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(np.mean(Yt)*(clust_per_cent
                                                     + clust_per_cent_2)/2.0)
                silBs = np.array(silBs)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(silBs))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        sil = []
        for c in range(iSpeakers):
            # for each cluster (speaker) compute silhouette
            sil.append((sil_2[c] - sil_1[c]) / (max(sil_2[c],
                                                    sil_1[c]) + 0.00001))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(np.mean(sil))

    imax = np.argmax(sil_all)
    # optimal number of clusters
    nSpeakersFinal = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = np.zeros((n_wins,))
    for i in range(n_wins):
        j = np.argmin(np.abs(i-i_non_outliers))        
        cls[i] = clsAll[imax][j]
        
    # Post-process method 1: hmm smoothing
    for i in range(1):
        # hmm training
        start_prob, transmat, means, cov = \
            trainHMM_computeStatistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat            
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)                    
    
    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = sil_all[imax]
    class_names = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)];


    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if ground-truth exists
    if os.path.isfile(gt_file):
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start, seg_end, seg_labs, mt_step)

    if plot_res:
        fig = plt.figure()    
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(np.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(np.array(range(len(cls)))*mt_step+mt_step/2.0, cls)

    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(np.array(range(len(flags_gt))) *
                     mt_step + mt_step / 2.0, flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
            evaluateSpeakerDiarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(100 * purity_cluster_m,
                                                        100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        #print s_range, sil_all    
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's silhouette")
        plt.show()
    return cls
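
A possible invocation of speakerDiarization ("meeting.wav" is a placeholder; a "meeting.segments" ground-truth file, if present, triggers the purity evaluation shown above):

cls = speakerDiarization("meeting.wav", n_speakers=4, plot_res=False)
# cls[i] is the speaker label assigned to the i-th mid-term window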
Example 4
def musicThumbnailing(x, fs, short_term_size=1.0, short_term_step=0.5, 
                      thumb_size=10.0, limit_1 = 0, limit_2 = 1):
    """
    This function detects instances of the most representative part of a
    music recording, also called "music thumbnails".
    A technique similar to the one proposed in [1], however a wider set of
    audio features is used instead of chroma features.
    In particular the following steps are followed:
     - Extract short-term audio features. Typical short-term window size: 1 second
     - Compute the self-similarity matrix, i.e. all pairwise similarities
       between feature vectors
     - Apply a diagonal mask is as a moving average filter on the values of the
       self-similarty matrix.
       The size of the mask is equal to the desirable thumbnail length.
     - Find the position of the maximum value of the new (filtered)
       self-similarity matrix. The audio segments that correspond to the
       diagonial around that position are the selected thumbnails
    

    ARGUMENTS:
     - x:            input signal
     - fs:            sampling frequency
     - short_term_size:     window size (in seconds)
     - short_term_step:    window step (in seconds)
     - thumb_size:    desider thumbnail size (in seconds)
    
    RETURNS:
     - A1:            beginning of 1st thumbnail (in seconds)
     - A2:            ending of 1st thumbnail (in seconds)
     - B1:            beginning of 2nd thumbnail (in seconds)
     - B2:            ending of 2nd thumbnail (in seconds)

    USAGE EXAMPLE:
       import audioFeatureExtraction as aF
     [fs, x] = basicIO.readAudioFile(input_file)
     [A1, A2, B1, B2] = musicThumbnailing(x, fs)

    [1] Bartsch, M. A., & Wakefield, G. H. (2005). Audio thumbnailing
    of popular music using chroma-based representations.
    Multimedia, IEEE Transactions on, 7(1), 96-104.
    """
    x = audioBasicIO.stereo_to_mono(x)
    # feature extraction:
    st_feats, _ = sF.feature_extraction(x, fs, fs * short_term_size,
                                        fs * short_term_step)

    # self-similarity matrix
    S = selfSimilarityMatrix(st_feats)

    # moving filter:
    M = int(round(thumb_size / short_term_step))
    B = np.eye(M,M)
    S = scipy.signal.convolve2d(S, B, 'valid')


    # post-processing (remove main diagonal elements)
    min_sm = np.min(S)
    for i in range(S.shape[0]):
        for j in range(S.shape[1]):
            if abs(i-j) < 5.0 / short_term_step or i > j:
                S[i, j] = min_sm

    # find max position:
    S[0:int(limit_1 * S.shape[0]), :] = min_sm
    S[:, 0:int(limit_1 * S.shape[0])] = min_sm
    S[int(limit_2 * S.shape[0])::, :] = min_sm
    S[:, int(limit_2 * S.shape[0])::] = min_sm

    maxVal = np.max(S)        
    [I, J] = np.unravel_index(S.argmax(), S.shape)
    #plt.imshow(S)
    #plt.show()
    # expand:
    i1 = I
    i2 = I
    j1 = J
    j2 = J

    while i2 - i1 < M:
        if i1 <= 0 or j1 <= 0 or i2 >= S.shape[0]-2 or j2 >= S.shape[1]-2:
            break
        if S[i1-1, j1-1] > S[i2 + 1, j2 + 1]:
            i1 -= 1
            j1 -= 1            
        else:            
            i2 += 1
            j2 += 1            

    return short_term_step * i1, short_term_step * i2, \
           short_term_step * j1, short_term_step * j2, S
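
For reference, a minimal sketch of what selfSimilarityMatrix is expected to compute here (z-normalize the feature sequence, then pairwise cosine similarity between frames; the library's own helper may differ in details):

import numpy as np
from scipy.spatial import distance

def self_similarity_sketch(feature_vectors):
    # feature_vectors: (n_features, n_frames) short-term feature matrix
    mu = feature_vectors.mean(axis=1, keepdims=True)
    sigma = feature_vectors.std(axis=1, keepdims=True) + 1e-15
    norm = (feature_vectors - mu) / sigma
    # similarity = 1 - cosine distance, for every pair of frames
    return 1.0 - distance.squareform(distance.pdist(norm.T, 'cosine'))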
Example 5
def mtFileClassification(input_file, model_name, model_type,
                         plot_results=False, gt_file=""):
    """
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used,
    i.e. a pre-trained classifier.
    ARGUMENTS:
        - input_file:        path of the input WAV file
        - model_name:        name of the classification model
        - model_type:        svm or knn depending on the classifier type
        - plot_results:      True if results are to be plotted using
                             matplotlib along with a set of statistics
        - gt_file:           path to a ground-truth .segments file (optional)

    RETURNS:
          - segs:           a sequence of segment endpoints: segs[i] is the
                            endpoint of the i-th segment (in seconds)
          - classes:        a sequence of class flags: classes[i] is the
                            class ID of the i-th segment
    """

    if not os.path.isfile(model_name):
        print("mtFileClassificationError: input model_type not found!")
        return (-1, -1, -1, -1)
    # Load classifier:
    if model_type == "knn":
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win, st_step,
         compute_beat] = aT.load_model_knn(model_name)
    else:
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win, st_step,
         compute_beat] = aT.load_model(model_name)

    if compute_beat:
        print("Model " + model_name + " contains long-term music features "
                                      "(beat etc) and cannot be used in "
                                      "segmentation")
        return (-1, -1, -1, -1)
    [fs, x] = audioBasicIO.read_audio_file(input_file) # load input file
    if fs == -1:  # could not read file
        return (-1, -1, -1, -1)
    x = audioBasicIO.stereo_to_mono(x)  # convert stereo (if) to mono
    # mid-term feature extraction:
    [mt_feats, _, _] = aF.mid_feature_extraction(x, fs, mt_win * fs,
                                                 mt_step * fs,
                                                 round(fs * st_win),
                                                 round(fs * st_step))
    flags = []
    Ps = []
    flags_ind = []
    # for each feature vector (i.e. for each fix-sized segment):
    for i in range(mt_feats.shape[1]):
        cur_fv = (mt_feats[:, i] - MEAN) / STD  # normalize current feature v
        # classify vector:
        [res, P] = aT.classifierWrapper(classifier, model_type, cur_fv)
        flags_ind.append(res)
        flags.append(class_names[int(res)])  # update class label matrix
        Ps.append(np.max(P))   # update probability matrix
    flags_ind = np.array(flags_ind)
    # 1-window smoothing
    for i in range(1, len(flags_ind) - 1):
        if flags_ind[i-1] == flags_ind[i + 1]:
            flags_ind[i] = flags_ind[i + 1]
    # convert fix-sized flags to segments and classes
    (segs, classes) = flags2segs(flags, mt_step)
    segs[-1] = len(x) / float(fs)

    # Load ground-truth:
    if os.path.isfile(gt_file):
        [seg_start_gt, seg_end_gt, seg_l_gt] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start_gt, seg_end_gt,
                                              seg_l_gt, mt_step)
        flags_ind_gt = []
        for j, fl in enumerate(flags_gt):
            # "align" labels with GT
            if class_names_gt[flags_gt[j]] in class_names:
                flags_ind_gt.append(class_names.index(class_names_gt[
                                                          flags_gt[j]]))
            else:
                flags_ind_gt.append(-1)
        flags_ind_gt = np.array(flags_ind_gt)        
        cm = np.zeros((len(class_names_gt), len(class_names_gt)))
        for i in range(min(flags_ind.shape[0], flags_ind_gt.shape[0])):
            cm[int(flags_ind_gt[i]),int(flags_ind[i])] += 1        
    else:
        cm = []
        flags_ind_gt = np.array([])
    acc = plotSegmentationResults(flags_ind, flags_ind_gt,
                                  class_names, mt_step, not plot_results)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc)  )
        return (flags_ind, class_names_gt, acc, cm)
    else:
        return (flags_ind, class_names, acc, cm)
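
A hypothetical call (both paths and the model name are placeholders; the model must have been trained with the matching aT routines beforehand):

flags_ind, classes, acc, cm = mtFileClassification(
    "radio.wav", "svm_speech_music", "svm",
    plot_results=False, gt_file="radio.segments")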
Example 6
def audio_features_extraction(dir_name="../data",
                              mt_win=1.0,
                              mt_step=1.0,
                              st_win=0.050,
                              st_step=0.050,
                              features_audio_file='Audio2Features.pkl'):

    audio_dir = dir_name + '/' + 'audio'

    # first, extract audio from video
    v2a.video2audio(dir_name)

    features = []
    file_names = []

    mid_term_features = np.array([])
    process_times = []

    # type is WAVE file, convert using the function video_to_audio.py
    suffix = ".wav"
    index_df = pd.read_csv(dir_name + '/' + 'index.csv', sep=';')

    wav_file_list, mid_feature_names = [], []

    # iterate each audio file
    print('Extracting features from audio files...')

    bar = progressbar.ProgressBar(maxval=len(index_df), \
                                  widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
    bar.start()
    bar_index = 0
    for ind in index_df.index:
        name = index_df['FILE'][ind]
        seg = str(index_df['SEG'][ind])

        file_path = audio_dir + '/' + name + '/' + seg + suffix
        # print("Analyzing file {0:d} of {1:d}: {2:s}".format(ind+1,len(index_df),file_path))

        if os.stat(file_path).st_size == 0:
            logging.warning("WARNING: EMPTY FILE -- SKIPPING")
            continue
        [sampling_rate, signal] = audioBasicIO.read_audio_file(file_path)
        if sampling_rate == 0:
            logging.warning("WARNING: NO SAMPLING RATE -- SKIPPING")
            continue

        t1 = time.perf_counter()  # time.clock() was removed in Python 3.8
        signal = audioBasicIO.stereo_to_mono(signal)
        if signal.shape[0] < float(sampling_rate) / 5:
            logging.warning("WARNING: AUDIO FILE TOO SMALL -- SKIPPING")
            continue
        wav_file_list.append(file_path)

        mid_features, _, mid_feature_names = \
            aF.mid_feature_extraction(signal, sampling_rate,
                                    round(mt_win * sampling_rate),
                                    round(mt_step * sampling_rate),
                                    round(st_win * sampling_rate),
                                    round(st_step * sampling_rate))
        mid_features = np.transpose(mid_features)
        mid_features = mid_features.mean(axis=0)
        # long term averaging of mid-term statistics
        if (not np.isnan(mid_features).any()) and \
                (not np.isinf(mid_features).any()):
            if len(mid_term_features) == 0:
                # append feature vector
                mid_term_features = mid_features
            else:
                mid_term_features = np.vstack(
                    (mid_term_features, mid_features))

        t2 = time.perf_counter()
        duration = float(len(signal)) / sampling_rate
        process_times.append((t2 - t1) / duration)

        # update progress bar index
        bar_index += 1
        bar.update(bar_index)

    bar.finish()

    if len(process_times) > 0:
        print("Audio feature extraction completed. Complexity ratio: "
              "{0:.1f} x realtime".format(
                  (1.0 / np.mean(np.array(process_times)))))

    print('Shape: ' + str(mid_term_features.shape))

    ftr_df = pd.DataFrame(data=mid_term_features)
    df = index_df.copy()
    df = pd.concat([df, ftr_df], axis=1)
    df.to_pickle(dir_name + '/' + features_audio_file)

    return mid_term_features, wav_file_list, mid_feature_names
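
A hypothetical invocation (assumes dir_name contains the audio/ tree and the index.csv with FILE and SEG columns that the code above reads):

feats, wav_files, names = audio_features_extraction(dir_name="../data")
print("{} files, {} features each".format(len(wav_files), feats.shape[1]))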
Example 7
def music_thumbnailing(signal, sampling_rate, short_window=1.0, short_step=0.5,
                       thumb_size=10.0, limit_1=0, limit_2=1):
    """
    This function detects instances of the most representative part of a
    music recording, also called "music thumbnails".
    A technique similar to the one proposed in [1], however a wider set of
    audio features is used instead of chroma features.
    In particular the following steps are followed:
     - Extract short-term audio features. Typical short-term window size: 1
       second
     - Compute the self-similarity matrix, i.e. all pairwise similarities
       between feature vectors
     - Apply a diagonal mask is as a moving average filter on the values of the
       self-similarty matrix.
       The size of the mask is equal to the desirable thumbnail length.
     - Find the position of the maximum value of the new (filtered)
       self-similarity matrix. The audio segments that correspond to the
       diagonial around that position are the selected thumbnails
    

    ARGUMENTS:
     - signal:            input signal
     - sampling_rate:            sampling frequency
     - short_window:     window size (in seconds)
     - short_step:    window step (in seconds)
     - thumb_size:    desider thumbnail size (in seconds)
    
    RETURNS:
     - A1:            beginning of 1st thumbnail (in seconds)
     - A2:            ending of 1st thumbnail (in seconds)
     - B1:            beginning of 2nd thumbnail (in seconds)
     - B2:            ending of 2nd thumbnail (in seconds)

    USAGE EXAMPLE:
       import audioFeatureExtraction as aF
     [fs, x] = basicIO.readAudioFile(input_file)
     [A1, A2, B1, B2] = musicThumbnailing(x, fs)

    [1] Bartsch, M. A., & Wakefield, G. H. (2005). Audio thumbnailing
    of popular music using chroma-based representations.
    Multimedia, IEEE Transactions on, 7(1), 96-104.
    """
    signal = audioBasicIO.stereo_to_mono(signal)
    # feature extraction:
    st_feats, _ = stf.feature_extraction(signal, sampling_rate,
                                         sampling_rate * short_window,
                                         sampling_rate * short_step)

    # self-similarity matrix
    sim_matrix = self_similarity_matrix(st_feats)

    # moving filter:
    m_filter = int(round(thumb_size / short_step))
    diagonal = np.eye(m_filter, m_filter)
    sim_matrix = scipy.signal.convolve2d(sim_matrix, diagonal, 'valid')

    # post-processing (remove main diagonal elements)
    min_sm = np.min(sim_matrix)
    for i in range(sim_matrix.shape[0]):
        for j in range(sim_matrix.shape[1]):
            if abs(i-j) < 5.0 / short_step or i > j:
                sim_matrix[i, j] = min_sm

    # find max position:
    sim_matrix[0:int(limit_1 * sim_matrix.shape[0]), :] = min_sm
    sim_matrix[:, 0:int(limit_1 * sim_matrix.shape[0])] = min_sm
    sim_matrix[int(limit_2 * sim_matrix.shape[0])::, :] = min_sm
    sim_matrix[:, int(limit_2 * sim_matrix.shape[0])::] = min_sm

    rows, cols = np.unravel_index(sim_matrix.argmax(), sim_matrix.shape)
    i1 = rows
    i2 = rows
    j1 = cols
    j2 = cols

    while i2-i1 < m_filter:
        if i1 <= 0 or j1 <= 0 or i2 >= sim_matrix.shape[0]-2 or \
                j2 >= sim_matrix.shape[1]-2:
            break
        if sim_matrix[i1-1, j1-1] > sim_matrix[i2 + 1, j2 + 1]:
            i1 -= 1
            j1 -= 1            
        else:            
            i2 += 1
            j2 += 1            

    return short_step * i1, short_step * i2, short_step * j1, short_step * j2, \
        sim_matrix
Example 8
def silenceRemoval(x,
                   fs,
                   st_win,
                   st_step,
                   smoothWindow=0.5,
                   weight=0.5,
                   plot=False):
    """
    Event Detection (silence removal)
    ARGUMENTS:
         - x:                the input audio signal
         - fs:               sampling freq
         - st_win, st_step:    window size and step in seconds
         - smoothWindow:     (optinal) smooth window (in seconds)
         - weight:           (optinal) weight factor (0 < weight < 1)
                              the higher, the more strict
         - plot:             (optinal) True if results are to be plotted
    RETURNS:
         - seg_limits:    list of segment limits in seconds (e.g [[0.1, 0.9],
                          [1.4, 3.0]] means that
                          the resulting segments are (0.1 - 0.9) seconds
                          and (1.4, 3.0) seconds
    """

    if weight >= 1:
        weight = 0.99
    if weight <= 0:
        weight = 0.01

    # Step 1: feature extraction
    x = audioBasicIO.stereo_to_mono(x)
    st_feats, _ = sF.feature_extraction(x, fs, st_win * fs,
                                        st_step * fs)  # e.g. (68 features, 966 frames)

    # Step 2: train binary svm classifier of low vs high energy frames
    # keep only the energy short-term sequence (2nd feature)
    st_energy = st_feats[1, :]  # shape: (n_frames,), e.g. (966,)
    en = np.sort(st_energy)  # sort the frames by energy
    # number of 10% of the total short-term windows
    l1 = int(len(en) / 10)
    # compute "lower" 10% energy threshold (mean of the lowest energies)
    t1 = np.mean(en[0:l1]) + 1e-15
    # compute "higher" 10% energy threshold (mean of the highest energies)
    t2 = np.mean(en[-l1:-1]) + 1e-15
    # get all features that correspond to low energy
    class1 = st_feats[:, np.where(st_energy <= t1)[0]]
    # get all features that correspond to high energy
    class2 = st_feats[:, np.where(st_energy >= t2)[0]]
    # form the binary classification task and ...
    faets_s = [class1.T, class2.T]  # e.g. class1.T: (58, 68), class2.T: (38, 68)
    # normalize (subtract mean, divide by std) and train the respective
    # svm probabilistic model (ONSET vs SILENCE)
    [faets_s_norm, means_s, stds_s] = aT.normalizeFeatures(faets_s)
    svm = aT.trainSVM(faets_s_norm, 1.0)

    # Step 3: compute onset probability based on the trained svm
    prob_on_set = []
    for i in range(st_feats.shape[1]):
        # for each frame
        cur_fv = (st_feats[:, i] - means_s) / stds_s  # per-frame features, e.g. (68,)
        # get svm probability (that it belongs to the ONSET class)
        prob_on_set.append(svm.predict_proba(cur_fv.reshape(1, -1))[0][1])
    prob_on_set = np.array(prob_on_set)
    # smooth probability:
    prob_on_set = smoothMovingAvg(prob_on_set, smoothWindow / st_step)

    # Step 4A: detect onset frame indices:
    prog_on_set_sort = np.sort(prob_on_set)  # sort the onset probabilities
    # find probability Threshold as a weighted average
    # of top 10% and lower 10% of the values
    Nt = int(prog_on_set_sort.shape[0] / 10)
    T = ((1 - weight) * np.mean(prog_on_set_sort[0:Nt]) +  # lowest 10% of frames
         weight * np.mean(prog_on_set_sort[-Nt::]))  # highest 10% of frames
    # the threshold is their weighted average

    # get the indices of the frames that satisfy the thresholding
    max_idx = np.where(prob_on_set > T)[0]
    i = 0
    time_clusters = []
    seg_limits = []

    # Step 4B: group frame indices to onset segments
    while i < len(max_idx):
        # for each of the detected onset indices
        cur_cluster = [max_idx[i]]
        if i == len(max_idx) - 1:
            break
        while max_idx[i + 1] - cur_cluster[-1] <= 2:
            cur_cluster.append(max_idx[i + 1])
            i += 1
            if i == len(max_idx) - 1:
                break
        i += 1
        time_clusters.append(cur_cluster)
        seg_limits.append(
            [cur_cluster[0] * st_step, cur_cluster[-1] * st_step])

    # example: seg_limits = [[0.12, 1.73], [3.65, 5.29], [7.72, 9.35]]

    # Step 5: Post process: remove very small segments
    # (shorter than min_dur = 0.2 seconds):
    min_dur = 0.2
    seg_limits_2 = []
    for s in seg_limits:
        if s[1] - s[0] > min_dur:
            seg_limits_2.append(s)
    seg_limits = seg_limits_2

    if plot:
        timeX = np.arange(0, x.shape[0] / float(fs), 1.0 / fs)

        plt.subplot(2, 1, 1)
        plt.plot(timeX, x / x.max())
        plt.title('Signal')
        for s in seg_limits:
            plt.axvline(x=s[0], color='red')
            plt.axvline(x=s[1], color='red')
        plt.subplot(2, 1, 2)
        plt.plot(np.arange(0, prob_on_set.shape[0] * st_step, st_step),
                 prob_on_set)
        plt.title('svm Probability')
        for s in seg_limits:
            plt.axvline(x=s[0], color='red')
            plt.axvline(x=s[1], color='red')
        plt.ylim(0, 1)
        plt.tight_layout()
        plt.show()

    return seg_limits
Example 9
def audio_to_asr_text(audio_path, google_credentials_file):
    """
    Audio to asr using google speech API
    :param audio_path: wav audio file to analyze
    :param google_credentials_file:  path to google api credentials file
    :return:
        my_results: output dict of the format: ['word':..., 'st': ..., 'et':...]
        data: raw text output (not structured)
    """

    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = google_credentials_file
    language_code = "en-US"

    fs, dur = get_wav_properties(audio_path)

    cur_pos = 0
    my_results = []
    data = ""

    number_of_words = 0

    # stereo to mono
    sampling_rate, signal = audioBasicIO.read_audio_file(audio_path)
    signal = audioBasicIO.stereo_to_mono(signal)
    wavfile.write(audio_path, fs, signal)
    #command1 = f"ffmpeg -i {audio_path} -ac 1 {audio_path} -y"
    #os.system(command1)

    while cur_pos < dur:

        encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
        client = speech.SpeechClient()
        config = {
                "language_code": language_code,
                "sample_rate_hertz": fs,
                "enable_word_time_offsets": True,
                "encoding": encoding,
            }
        cur_end = cur_pos + MAX_FILE_DURATION
        if dur < cur_end:
            cur_end = dur
            
        command = f"ffmpeg -i {audio_path} -ss {cur_pos} -to " \
                  f"{cur_end} temp.wav -loglevel panic -y"
        os.system(command)
        with io.open("temp.wav", "rb") as f:
            content = f.read()

       
        audio = {"content": content}

        response = client.long_running_recognize(config,audio).result()

        for flag, result in enumerate(response.results):
            alternative = result.alternatives[0]
            data += alternative.transcript
            number_of_words = number_of_words + len(alternative.words)
            for w in alternative.words:
                my_results.append({"word": w.word,
                                   "st": w.start_time.seconds +
                                         float(w.start_time.nanos) / 10**9 +
                                         cur_pos,
                                   "et": w.end_time.seconds +
                                         float(w.end_time.nanos) / 10**9 +
                                         cur_pos
                                   })

        cur_pos += MAX_FILE_DURATION
    return my_results, data, number_of_words, dur
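
A hypothetical usage (both paths are placeholders; the google-cloud-speech client and an ffmpeg binary on the PATH are required, as the function shells out to ffmpeg):

words, raw_text, n_words, dur = audio_to_asr_text("interview.wav",
                                                  "google_credentials.json")
for w in words[:5]:
    print("{}: {:.2f}s - {:.2f}s".format(w["word"], w["st"], w["et"]))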
Example 10
def long_feature_wav(wav_file,
                     mid_window,
                     mid_step,
                     short_window,
                     short_step,
                     accept_small_wavs=False,
                     compute_beat=True,
                     librosa_features=False,
                     surfboard_features=False):
    """
    This function computes the long-term feature per WAV file.
    It is identical to directory_feature_extraction, with simple
    modifications in order to be applied to singular files.
    Very useful to create a collection of json files (1 song -> 1 json).
    Genre as a feature should be added (very simple).

    ARGUMENTS:
        - wav_file:        the path of the WAV file
        - mid_window, mid_step:    mid-term window and step (in seconds)
        - short_window, short_step:    short-term window and step (in seconds)

    RETURNS:
        - mid_term_features: The feature vector of a singular wav file
        - mid_feature_names: The feature names, useful for formatting
    """

    mid_term_features = np.array([])

    sampling_rate, signal = audioBasicIO.read_audio_file(wav_file)
    if sampling_rate == 0:
        return -1

    signal = audioBasicIO.stereo_to_mono(signal)

    size_tolerance = 5
    if accept_small_wavs:
        size_tolerance = 100
    if signal.shape[0] < float(sampling_rate) / size_tolerance:
        print("  (AUDIO FILE TOO SMALL - SKIPPING)")
        return -1

    if compute_beat:
        mid_features, short_features, mid_feature_names = \
            mid_feature_extraction(signal, sampling_rate,
                                   round(mid_window * sampling_rate),
                                   round(mid_step * sampling_rate),
                                   round(sampling_rate * short_window),
                                   round(sampling_rate * short_step))
        beat, beat_conf = beat_extraction(short_features, short_step)
    else:
        mid_features, _, mid_feature_names = \
            mid_feature_extraction(signal, sampling_rate,
                                   round(mid_window * sampling_rate),
                                   round(mid_step * sampling_rate),
                                   round(sampling_rate * short_window),
                                   round(sampling_rate * short_step))

    mid_features = np.transpose(mid_features)
    mid_features = mid_features.mean(axis=0)
    # long term averaging of mid-term statistics
    if (not np.isnan(mid_features).any()) and \
        (not np.isinf(mid_features).any()):
        if compute_beat:
            mid_features = np.append(mid_features, beat)
            mid_features = np.append(mid_features, beat_conf)
            mid_feature_names.append("beat")
            mid_feature_names.append("beat_conf")

        # Block of code responsible for extra features

        if librosa_features:
            librosa_feat, librosa_feat_names = _audio_to_librosa_features(
                wav_file, sampling_rate=sampling_rate)
            mid_features = np.append(mid_features, librosa_feat)
            for element in librosa_feat_names:
                mid_feature_names.append(element)

        if surfboard_features:
            surfboard_feat, surfboard_feat_names = _audio_to_surfboard_features(
                wav_file, sampling_rate=sampling_rate)
            mid_features = np.append(mid_features, surfboard_feat)
            for element in surfboard_feat_names:
                mid_feature_names.append(element)

        if len(mid_term_features) == 0:
            # append feature vector
            mid_term_features = mid_features
        else:
            mid_term_features = np.vstack((mid_term_features, mid_features))

    return mid_term_features, mid_feature_names
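
A minimal usage sketch ("song.wav" is a placeholder path):

feature_vector, feature_names = long_feature_wav(
    "song.wav", mid_window=1.0, mid_step=1.0,
    short_window=0.05, short_step=0.05)
print(list(zip(feature_names[:3], feature_vector[:3])))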
Example 11
def create_feature_from_audio(filename):
    import ctypes
    import numpy
    import numpy as np
    import pyogg
    import matplotlib.pyplot as plt

    # https://github.com/Zuzu-Typ/PyOgg/issues/19
    # file = pyogg.OpusFile(filename)  # stereo
    # audio_path_opus = "./"
    file = pyogg.OpusFile(filename)
    # divide by 2: buffer_length is (presumably) in bytes and c_short is 2 bytes
    target_datatype = ctypes.c_short * (file.buffer_length // 2)
    buffer_as_array = ctypes.cast(file.buffer,
                                  ctypes.POINTER(target_datatype)).contents
    if file.channels == 1:
        wav = numpy.array(buffer_as_array)
    elif file.channels == 2:
        wav = numpy.array((buffer_as_array[0::2], buffer_as_array[1::2]))
    else:
        raise NotImplementedError()
    # This is the final numpy array
    signal = numpy.transpose(wav)
    sampling_rate = 48000
    print(numpy.shape(wav))

    #plt.figure
    #plt.title("Signal Wave...")
    #plt.plot(signal)
    #plt.show()

    # Calculating features from final_data
    from pyAudioAnalysis import MidTermFeatures as mF
    from pyAudioAnalysis import ShortTermFeatures as sF
    from pyAudioAnalysis import audioBasicIO

    mid_window = round(0.1 * sampling_rate)
    mid_step = round(0.1 * sampling_rate)
    short_window = round(sampling_rate * 0.01)
    short_step = round(sampling_rate * 0.01)

    signal = audioBasicIO.stereo_to_mono(signal)
    print(type(signal))
    # print(np.shape(signal))
    signal = signal.astype('float64')  # librosa needs float input

    [mid_features, short_features,
     mid_feature_names] = mF.mid_feature_extraction(signal, sampling_rate,
                                                    mid_window, mid_step,
                                                    short_window, short_step)
    mid_features = np.transpose(mid_features)
    mid_term_features = mid_features.mean(axis=0)
    mid_term_features = np.reshape(mid_term_features, (-1, 1))
    mid_term_features = np.transpose(mid_term_features)
    # print(np.shape(mid_term_features))
    # len(mid_feature_names)

    # Getting the classification result with Cough=0, No_Cough=1
    from joblib import dump, load
    from sklearn import preprocessing
    cough_classifier = load('Cough_NoCough_classifier.joblib')
    features = preprocessing.StandardScaler().fit_transform(mid_term_features)
    prediction = cough_classifier.predict(features)  # coughs=0 , no_cough = 1
    return prediction, mid_term_features
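
A hypothetical call ("recording.opus" is a placeholder; the pre-trained 'Cough_NoCough_classifier.joblib' is the assumption already baked into the code above):

prediction, features = create_feature_from_audio("recording.opus")
print("cough" if prediction[0] == 0 else "no cough")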
Example 12
def speaker_diarization(filename, n_speakers, mid_window=1.0, mid_step=0.1,
                        short_window=0.1, lda_dim=0, plot_res=False):
    """
    ARGUMENTS:
        - filename:        the name of the WAV file to be analyzed
        - n_speakers       the number of speakers (clusters) in
                           the recording (<=0 for unknown)
        - mid_window (opt)    mid-term window size
        - mid_step (opt)    mid-term window step
        - short_window  (opt)    short-term window size
        - lda_dim (opt     LDA dimension (0 for no LDA)
        - plot_res         (opt)   0 for not plotting the results 1 for plotting
    """
    sampling_rate, signal = audioBasicIO.read_audio_file(filename)
    signal = audioBasicIO.stereo_to_mono(signal)
    duration = len(signal) / sampling_rate

    base_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "data/models")

    classifier_all, mean_all, std_all, class_names_all, _, _, _, _, _ = \
        at.load_model(os.path.join(base_dir, "svm_rbf_speaker_10"))
    classifier_fm, mean_fm, std_fm, class_names_fm, _, _, _, _,  _ = \
        at.load_model(os.path.join(base_dir, "svm_rbf_speaker_male_female"))


    mid_feats, st_feats, _ = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mid_window * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * 0.05),
                                   round(sampling_rate * 0.05))
    # note: the short-term window and step are hard-coded to 0.05 s here

    mid_term_features = np.zeros((mid_feats.shape[0] + len(class_names_all) +
                                  len(class_names_fm), mid_feats.shape[1]))
    for index in range(mid_feats.shape[1]):
        feature_norm_all = (mid_feats[:, index] - mean_all) / std_all
        feature_norm_fm = (mid_feats[:, index] - mean_fm) / std_fm
        _, p1 = at.classifier_wrapper(classifier_all, "svm_rbf", feature_norm_all)
        _, p2 = at.classifier_wrapper(classifier_fm, "svm_rbf", feature_norm_fm)
        start = mid_feats.shape[0]
        end = mid_feats.shape[0] + len(class_names_all)
        mid_term_features[0:mid_feats.shape[0], index] = mid_feats[:, index]
        mid_term_features[start:end, index] = p1 + 1e-4
        mid_term_features[end::, index] = p2 + 1e-4
    # normalize features:
    scaler = StandardScaler()
    mid_feats_norm = scaler.fit_transform(mid_term_features.T)

    # remove outliers:
    dist_all = np.sum(distance.squareform(distance.pdist(mid_feats_norm.T)),
                      axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.1 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    # EnergyMin = np.min(mt_feats[1,:])
    # EnergyMean = np.mean(mt_feats[1,:])
    # Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    # i_non_outliers = np.nonzero(mt_feats[1,:] > Thres)[0]
    # print i_non_outliers

    mt_feats_norm_or = mid_feats_norm
    mid_feats_norm = mid_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:

        # extract mid-term features with minimum step:
        window_ratio = int(round(mid_window / short_window))
        step_ratio = int(round(short_window / short_window))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        for index in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        # for each of the short-term features:
        for index in range(num_of_features):
            cur_pos = 0
            feat_len = len(st_feats[index])
            while cur_pos < feat_len:
                n1 = cur_pos
                n2 = cur_pos + window_ratio
                if n2 > feat_len:
                    n2 = feat_len
                short_features = st_feats[index][n1:n2]
                mt_feats_to_red[index].append(np.mean(short_features))
                mt_feats_to_red[index + num_of_features].\
                    append(np.std(short_features))
                cur_pos += step_ratio
        mt_feats_to_red = np.array(mt_feats_to_red)
        mt_feats_to_red_2 = np.zeros((mt_feats_to_red.shape[0] +
                                      len(class_names_all) +
                                      len(class_names_fm),
                                      mt_feats_to_red.shape[1]))
        limit = mt_feats_to_red.shape[0] + len(class_names_all)
        for index in range(mt_feats_to_red.shape[1]):
            feature_norm_all = (mt_feats_to_red[:, index] - mean_all) / std_all
            feature_norm_fm = (mt_feats_to_red[:, index] - mean_fm) / std_fm
            _, p1 = at.classifier_wrapper(classifier_all, "svm_rbf",
                                          feature_norm_all)
            _, p2 = at.classifier_wrapper(classifier_fm, "svm_rbf", feature_norm_fm)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0], index] = \
                mt_feats_to_red[:, index]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:limit, index] = p1 + 1e-4
            mt_feats_to_red_2[limit::, index] = p2 + 1e-4
        mt_feats_to_red = mt_feats_to_red_2
        scaler = StandardScaler()
        mt_feats_to_red = scaler.fit_transform(mt_feats_to_red.T).T
        labels = np.zeros((mt_feats_to_red.shape[1], ))
        lda_step = 1.0
        lda_step_ratio = lda_step / short_window
        for index in range(labels.shape[0]):
            labels[index] = int(index * short_window / lda_step_ratio)
        clf = sklearn.discriminant_analysis.\
            LinearDiscriminantAnalysis(n_components=lda_dim)
        mid_feats_norm = clf.fit_transform(mt_feats_to_red.T, labels)
        #clf.fit(mt_feats_to_red.T, labels)
        #mid_feats_norm = (clf.transform(mid_feats_norm.T)).T
    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    cluster_labels = []
    sil_all = []
    cluster_centers = []
    
    for speakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=speakers)
        k_means.fit(mid_feats_norm)
        cls = k_means.labels_ 
        cluster_labels.append(cls)
#        cluster_centers.append(means)
        sil_1, sil_2 = [], []
        for c in range(speakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mid_feats_norm[cls == c, :]
                # compute average distance between samples
                # that belong to the cluster (a values)
                dist = distance.pdist(mt_feats_norm_temp)  # rows are windows here
                sil_1.append(np.mean(dist)*clust_per_cent)
                sil_temp = []
                for c2 in range(speakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = np.nonzero(cls == c2)[0].shape[0] /\
                                           float(len(cls))
                        mid_features_temp = mid_feats_norm[cls == c2, :]
                        dist = distance.cdist(mt_feats_norm_temp,
                                              mid_features_temp)
                        sil_temp.append(np.mean(dist)*(clust_per_cent
                                                       + clust_per_cent_2)/2.0)
                sil_temp = np.array(sil_temp)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(sil_temp))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        sil = []
        for c in range(speakers):
            # for each cluster (speaker) compute silhouette
            sil.append((sil_2[c] - sil_1[c]) / (max(sil_2[c], sil_1[c]) + 1e-5))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(np.mean(sil))

    imax = int(np.argmax(sil_all))
    # optimal number of clusters
    num_speakers = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
#    print(cls)
#    cls = np.zeros((n_wins,))
#    for index in range(n_wins):
#        j = np.argmin(np.abs(index-i_non_outliers))
#        cls[index] = cluster_labels[imax][j]
    # Post-process method 1: hmm smoothing
    if lda_dim <= 0:
        for index in range(1):
            # hmm training
            start_prob, transmat, means, cov = \
                train_hmm_compute_statistics(mt_feats_norm_or.T, cls)
            hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
            hmm.startprob_ = start_prob
            hmm.transmat_ = transmat
            hmm.means_ = means
            hmm.covars_ = cov
            cls = hmm.predict(mt_feats_norm_or)
    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 5)

    class_names = ["speaker{0:d}".format(c) for c in range(num_speakers)]

    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if groundtruth exists
    if os.path.isfile(gt_file):
        seg_start, seg_end, seg_labs = read_segmentation_gt(gt_file)
        flags_gt, class_names_gt = segments_to_labels(seg_start, seg_end,
                                                      seg_labs, mid_step)

    if plot_res:
        fig = plt.figure()    
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(np.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(np.array(range(len(cls))) * mid_step + mid_step / 2.0, cls)

    purity_cluster_m, purity_speaker_m = -1, -1
    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(np.array(range(len(flags_gt))) *
                     mid_step + mid_step / 2.0, flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
            evaluate_speaker_diarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(100 * purity_cluster_m,
                                                        100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's sillouette")
        plt.show()
    return cls, purity_cluster_m, purity_speaker_m
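The hand-rolled, percentage-weighted silhouette above can be cross-checked against scikit-learn's built-in metric. A minimal sketch, assuming `mt_feats_norm` is a features-by-windows matrix as in the function above; the helper name `pick_num_speakers` is ours, and the scores will differ in detail because `silhouette_score` does not apply the per-cluster percentage weighting:

import numpy as np
import sklearn.cluster
from sklearn.metrics import silhouette_score

def pick_num_speakers(feature_matrix, s_range=range(2, 10)):
    # feature_matrix: (n_features, n_windows), as in the function above
    X = feature_matrix.T
    scores = []
    for k in s_range:
        labels = sklearn.cluster.KMeans(n_clusters=k, n_init=10).fit_predict(X)
        scores.append(silhouette_score(X, labels))
    # return the cluster count with the best average silhouette
    return list(s_range)[int(np.argmax(scores))]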
Ejemplo n.º 13
0
def silence_removal(signal, sampling_rate, st_win, st_step, smooth_window=0.5,
                    weight=0.5, plot=False):
    """
    Event Detection (silence removal)
    ARGUMENTS:
         - signal:                the input audio signal
         - sampling_rate:               sampling freq
         - st_win, st_step:    window size and step in seconds
         - smoothWindow:     (optinal) smooth window (in seconds)
         - weight:           (optinal) weight factor (0 < weight < 1)
                              the higher, the more strict
         - plot:             (optinal) True if results are to be plotted
    RETURNS:
         - seg_limits:    list of segment limits in seconds (e.g [[0.1, 0.9],
                          [1.4, 3.0]] means that
                          the resulting segments are (0.1 - 0.9) seconds
                          and (1.4, 3.0) seconds
    """

    if weight >= 1:
        weight = 0.99
    if weight <= 0:
        weight = 0.01

    # Step 1: feature extraction
    signal = audioBasicIO.stereo_to_mono(signal)
    st_feats, _ = stf.feature_extraction(signal, sampling_rate,
                                         st_win * sampling_rate,
                                         st_step * sampling_rate)

    # Step 2: train binary svm classifier of low vs high energy frames
    # keep only the energy short-term sequence (2nd feature)
    st_energy = st_feats[1, :]
    en = np.sort(st_energy)
    # number of short-term windows that corresponds to 10% of the total
    st_windows_fraction = int(len(en) / 10)

    # compute "lower" 10% energy threshold
    low_threshold = np.mean(en[0:st_windows_fraction]) + 1e-15

    # compute "higher" 10% energy threshold
    # (note: the -1 stop index leaves out the single largest energy value)
    high_threshold = np.mean(en[-st_windows_fraction:-1]) + 1e-15

    # get all features that correspond to low energy
    low_energy = st_feats[:, np.where(st_energy <= low_threshold)[0]]

    # get all features that correspond to high energy
    high_energy = st_feats[:, np.where(st_energy >= high_threshold)[0]]

    # form the binary classification task and ...
    features = [low_energy.T, high_energy.T]
    # normalize and train the respective svm probabilistic model

    # (ONSET vs SILENCE)
    features, labels = at.features_to_matrix(features)
    scaler = StandardScaler()
    features_norm = scaler.fit_transform(features)
    mean = scaler.mean_
    std = scaler.scale_
    svm = at.train_svm(features_norm, labels, 1.0)

    # Step 3: compute onset probability based on the trained svm
    prob_on_set = []
    for index in range(st_feats.shape[1]):
        # for each frame
        cur_fv = (st_feats[:, index] - mean) / std
        # get svm probability (that it belongs to the ONSET class)
        prob_on_set.append(svm.predict_proba(cur_fv.reshape(1, -1))[0][1])
    prob_on_set = np.array(prob_on_set)

    # smooth probability:
    prob_on_set = smooth_moving_avg(prob_on_set, smooth_window / st_step)

    # Step 4A: detect onset frame indices:
    prob_on_set_sorted = np.sort(prob_on_set)

    # find the probability threshold as a weighted average
    # of the bottom 10% and top 10% of the values
    nt = int(prob_on_set_sorted.shape[0] / 10)
    threshold = ((1 - weight) * np.mean(prob_on_set_sorted[0:nt]) +
                 weight * np.mean(prob_on_set_sorted[-nt:]))

    max_indices = np.where(prob_on_set > threshold)[0]
    # get the indices of the frames that satisfy the thresholding
    index = 0
    seg_limits = []
    time_clusters = []

    # Step 4B: group frame indices to onset segments
    while index < len(max_indices):
        # for each of the detected onset indices
        cur_cluster = [max_indices[index]]
        # grow the cluster while successive onset indices are
        # at most 2 frames apart
        while index < len(max_indices) - 1 and \
                max_indices[index + 1] - cur_cluster[-1] <= 2:
            cur_cluster.append(max_indices[index + 1])
            index += 1
        index += 1
        time_clusters.append(cur_cluster)
        seg_limits.append([cur_cluster[0] * st_step,
                           cur_cluster[-1] * st_step])

    # Step 5: Post process: remove very small segments:
    min_duration = 0.2
    seg_limits_2 = []
    for s_lim in seg_limits:
        if s_lim[1] - s_lim[0] > min_duration:
            seg_limits_2.append(s_lim)
    seg_limits = seg_limits_2

    if plot:
        time_x = np.arange(0, signal.shape[0] / float(sampling_rate), 1.0 /
                           sampling_rate)

        plt.subplot(2, 1, 1)
        plt.plot(time_x, signal)
        plt.title('Signal')
        for s_lim in seg_limits:
            plt.axvline(x=s_lim[0], color='red')
            plt.axvline(x=s_lim[1], color='red')
        plt.subplot(2, 1, 2)
        plt.plot(np.arange(0, prob_on_set.shape[0] * st_step, st_step),
                 prob_on_set)
        plt.title('SVM probability')
        for s_lim in seg_limits:
            plt.axvline(x=s_lim[0], color='red')
            plt.axvline(x=s_lim[1], color='red')
        plt.show()

    return seg_limits
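A minimal usage sketch for the function above; the file name is hypothetical and the window/weight values are just reasonable starting points:

from pyAudioAnalysis import audioBasicIO

fs, x = audioBasicIO.read_audio_file("speech_sample.wav")  # hypothetical file
segments = silence_removal(x, fs, st_win=0.05, st_step=0.05,
                           smooth_window=0.5, weight=0.3, plot=False)
for start, end in segments:
    print("voiced segment: {:.2f}s - {:.2f}s".format(start, end))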
Ejemplo n.º 14
0
def mid_term_file_classification(input_file, model_name, model_type,
                                 plot_results=False, gt_file=""):
    """
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used,
    i.e. a pre-trained classifier.
    ARGUMENTS:
        - input_file:        path of the input WAV file
        - model_name:        name of the classification model
        - model_type:        svm or knn depending on the classifier type
        - plot_results:      True if results are to be plotted using
                             matplotlib along with a set of statistics
        - gt_file:           (optional) path of a ground-truth segmentation file

    RETURNS:
          - labels:         a sequence of class labels (one per mid-term window)
          - class_names:    the names of the classes
          - accuracy:       overall accuracy (computed if ground truth is given)
          - cm:             confusion matrix (computed if ground truth is given)
    """
    labels = []
    accuracy = 0.0
    class_names = []
    cm = np.array([])
    if not os.path.isfile(model_name):
        print("mtFileClassificationError: input model_type not found!")
        return labels, class_names, accuracy, cm

    # Load classifier:
    if model_type == "knn":
        classifier, mean, std, class_names, mt_win, mid_step, st_win, \
         st_step, compute_beat = at.load_model_knn(model_name)
    else:
        classifier, mean, std, class_names, mt_win, mid_step, st_win, \
         st_step, compute_beat = at.load_model(model_name)
    if compute_beat:
        print("Model " + model_name + " contains long-term music features "
                                      "(beat etc) and cannot be used in "
                                      "segmentation")
        return labels, class_names, accuracy, cm
    # load input file
    sampling_rate, signal = audioBasicIO.read_audio_file(input_file)

    # could not read file
    if sampling_rate == 0:
        return labels, class_names, accuracy, cm

    # convert stereo (if) to mono
    signal = audioBasicIO.stereo_to_mono(signal)

    # mid-term feature extraction:
    mt_feats, _, _ = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mt_win * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * st_win),
                                   round(sampling_rate * st_step))
    posterior_matrix = []

    # for each feature vector (i.e. for each fix-sized segment):
    for col_index in range(mt_feats.shape[1]):
        # normalize current feature v
        feature_vector = (mt_feats[:, col_index] - mean) / std

        # classify vector:
        label_predicted, posterior = \
            at.classifier_wrapper(classifier, model_type, feature_vector)
        labels.append(label_predicted)

        # update probability matrix
        posterior_matrix.append(np.max(posterior))
    labels = np.array(labels)

    # convert fix-sized flags to segments and classes
    segs, classes = labels_to_segments(labels, mid_step)
    for i in range(len(segs)):
        print(segs[i], classes[i])
    segs[-1] = len(signal) / float(sampling_rate)
    # Load ground-truth:
    labels_gt, class_names_gt, accuracy, cm = \
        load_ground_truth(gt_file, labels, class_names, mid_step, plot_results)

    return labels, class_names, accuracy, cm
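A minimal call sketch for the function above; the input file and model path are hypothetical, and the model type string must match how the model was trained:

labels, class_names, accuracy, cm = mid_term_file_classification(
    "radio_show.wav",           # hypothetical input file
    "data/models/svm_rbf_sm",   # hypothetical pre-trained model
    "svm_rbf",
    plot_results=False,
    gt_file="")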
Ejemplo n.º 15
0
import matplotlib.pyplot as plt
import subprocess
import numpy as np
from pyAudioAnalysis import audioBasicIO, MidTermFeatures, audioAnalysis

# extract the audio track from the video file using ffmpeg

VIDEOFILE = "../data/raw/8/replay.mp4"
AUDIOFILE = "./extracted.wav"
FEATUREFILE = "./extracted.ft"

command = f"ffmpeg -i {VIDEOFILE} -vn {AUDIOFILE} -y"

subprocess.call(command, shell=True)

[Fs, x] = audioBasicIO.read_audio_file(AUDIOFILE)
x = audioBasicIO.stereo_to_mono(x)

midF, shortF, midFNames = MidTermFeatures.mid_feature_extraction(
    x, Fs, 0.1 * Fs, 0.05 * Fs, 0.05 * Fs, 0.025 * Fs)

np.save(FEATUREFILE, midF)
np.savetxt(FEATUREFILE + ".csv", midF.T, delimiter=",", header=",".join(midFNames))
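# sanity check (sketch): np.save appends ".npy" to a filename without that
# suffix, so the saved features can be reloaded and compared in place
reloaded = np.load(FEATUREFILE + ".npy")
assert reloaded.shape == midF.shape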
#%%
# explore the audio
audioAnalysis.thumbnailWrapper(AUDIOFILE, 50)

audioAnalysis.fileSpectrogramWrapper(AUDIOFILE)

audioAnalysis.fileChromagramWrapper(AUDIOFILE)

audioAnalysis.beatExtractionWrapper(AUDIOFILE, True)
#%%
Ejemplo n.º 16
0
def dirWavFeatureExtraction(dirName, mt_win, mt_step, st_win, st_step,
                            compute_beat=False):
    """
    This function extracts the mid-term features of the WAVE files of a 
    particular folder.

    The resulting feature vector is extracted by long-term averaging the
    mid-term features.
    Therefore ONE FEATURE VECTOR is extracted for each WAV file.

    ARGUMENTS:
        - dirName:            the path of the WAVE directory
        - mt_win, mt_step:    mid-term window and step (in seconds)
        - st_win, st_step:    short-term window and step (in seconds)
        - compute_beat:       (optional) also extract beat-related features
    """

    all_mt_feats = np.array([])
    process_times = []

    types = ('*.wav', '*.aif',  '*.aiff', '*.mp3', '*.au', '*.ogg')
    wav_file_list = []
    for files in types:
        wav_file_list.extend(glob.glob(os.path.join(dirName, files)))

    wav_file_list = sorted(wav_file_list)    
    wav_file_list2, mt_feature_names = [], []
    for i, wavFile in enumerate(wav_file_list):        
        print("Analyzing file {0:d} of "
              "{1:d}: {2:s}".format(i+1,
                                    len(wav_file_list),
                                    wavFile))
        if os.stat(wavFile).st_size == 0:
            print("   (EMPTY FILE -- SKIPPING)")
            continue        
        [fs, x] = audioBasicIO.read_audio_file(wavFile)
        if isinstance(x, int):
            continue        

        t1 = time.time()
        x = audioBasicIO.stereo_to_mono(x)
        if x.shape[0] < float(fs) / 5:
            print("  (AUDIO FILE TOO SMALL - SKIPPING)")
            continue
        wav_file_list2.append(wavFile)
        if compute_beat:
            [mt_term_feats, st_features, mt_feature_names] = \
                mid_term_feature_extraction(x, fs, round(mt_win * fs),
                                            round(mt_step * fs),
                                            round(fs * st_win), 
                                            round(fs * st_step))
            [beat, beat_conf] = beatExtraction(st_features, st_step)
        else:
            [mt_term_feats, _, mt_feature_names] = \
                mid_term_feature_extraction(x, fs, round(mt_win * fs),
                                            round(mt_step * fs),
                                            round(fs * st_win), 
                                            round(fs * st_step))

        mt_term_feats = np.transpose(mt_term_feats)
        mt_term_feats = mt_term_feats.mean(axis=0)
        # long term averaging of mid-term statistics
        if (not np.isnan(mt_term_feats).any()) and \
                (not np.isinf(mt_term_feats).any()):
            if compute_beat:
                mt_term_feats = np.append(mt_term_feats, beat)
                mt_term_feats = np.append(mt_term_feats, beat_conf)
            if len(all_mt_feats) == 0:
                # append feature vector
                all_mt_feats = mt_term_feats
            else:
                all_mt_feats = np.vstack((all_mt_feats, mt_term_feats))
            t2 = time.time()
            duration = float(len(x)) / fs
            process_times.append((t2 - t1) / duration)
    if len(process_times) > 0:
        print("Feature extraction complexity ratio: "
              "{0:.1f} x realtime".format((1.0 / 
                                           np.mean(np.array(process_times)))))
    return (all_mt_feats, wav_file_list2, mt_feature_names)
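A usage sketch for the function above; the folder path is hypothetical and the window sizes are typical values:

features, file_list, feature_names = dirWavFeatureExtraction(
    "data/music_wavs",  # hypothetical folder of audio files
    mt_win=1.0, mt_step=1.0, st_win=0.05, st_step=0.05)
print(features.shape)   # one row (long-term averaged feature vector) per file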
Ejemplo n.º 17
0
def fileGreenwaySpeakerDiarization(filename, output_folder,
                                   speech_key="YOUR_SPEECH_KEY",
                                   service_region="eastus2",
                                   n_speakers=2, mt_size=2.0, mt_step=0.2,
                                   st_win=0.05, lda_dim=35):
    """
    ARGUMENTS:
        - filename:        the name of the WAV file to be analyzed
                            the filename should have a suffix of the form: ..._min_3
                            this informs the service that audio file corresponds to the 3rd minute of the dialogue
        - output_folder    the folder location for saving the audio snippets generated from diarization                           
        - speech_key       mid-term window size            
        - service_region       the number of speakers (clusters) in
                           the recording (<=0 for unknown)
        - n_speakers       the number of speakers (clusters) in
                           the recording (<=0 for unknown)
        - mt_size (opt)    mid-term window size
        - mt_step (opt)    mid-term window step
        - st_win  (opt)    short-term window size
        - lda_dim (opt     LDA dimension (0 for no LDA)
        - plot_res         (opt)   0 for not plotting the results 1 for plotting
        - save_plot        (opt)   1|True for saving plot in output folder
    """
    '''
    OUTPUTS:
        - cls:             this is a vector with speaker ids in chronological sequence of speaker dialogue.
        - output:          a list of python dictionaries containing dialogue sequence information.
                            - dialogue_id
                            - sequence_id
                            - start_time
                            - end_time
                            - text
    '''

    filename_only = filename if "/" not in filename else filename.split("/")[-1]
    nameoffile = filename_only.split("_min_")[0]
    timeoffile = filename_only.split("_min_")[1]

    [fs, x] = audioBasicIO.read_audio_file(filename)
    x = audioBasicIO.stereo_to_mono(x)
    duration = len(x) / fs

    [classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "pyAudioAnalysis/data/models", "knn_speaker_10"))
    [classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "pyAudioAnalysis/data/models", "knn_speaker_male_female"))

    [mt_feats, st_feats, _] = aF.mid_feature_extraction(x, fs, mt_size * fs,
                                                        mt_step * fs,
                                                        round(fs * st_win),
                                                        round(fs*st_win * 0.5))

    MidTermFeatures2 = np.zeros((mt_feats.shape[0] + len(classNames1) +
                                 len(classNames2), mt_feats.shape[1]))

    for i in range(mt_feats.shape[1]):
        cur_f1 = (mt_feats[:, i] - MEAN1) / STD1
        cur_f2 = (mt_feats[:, i] - MEAN2) / STD2
        [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
        [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
        MidTermFeatures2[0:mt_feats.shape[0], i] = mt_feats[:, i]
        MidTermFeatures2[mt_feats.shape[0]:mt_feats.shape[0] +
                         len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[mt_feats.shape[0] +
                         len(classNames1)::, i] = P2 + 0.0001

    mt_feats = MidTermFeatures2    # TODO
    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41,
                       42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]

    mt_feats = mt_feats[iFeaturesSelect, :]

    (mt_feats_norm, MEAN, STD) = aT.normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    n_wins = mt_feats.shape[1]

    # remove outliers:
    dist_all = np.sum(distance.squareform(distance.pdist(mt_feats_norm.T)),
                      axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.2 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = np.min(mt_feats[1,:])
    #EnergyMean = np.mean(mt_feats[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #i_non_outliers = np.nonzero(mt_feats[1,:] > Thres)[0]
    # print i_non_outliers

    perOutLier = (100.0 * (n_wins - i_non_outliers.shape[0])) / n_wins
    mt_feats_norm_or = mt_feats_norm
    mt_feats_norm = mt_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        # [mt_feats_to_red, _, _] = aF.mtFeatureExtraction(x, fs, mt_size * fs,
        # st_win * fs, round(fs*st_win), round(fs*st_win));
        # extract mid-term features with minimum step:
        mt_win_ratio = int(round(mt_size / st_win))
        mt_step_ratio = int(round(st_win / st_win))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        # for i in range(num_of_stats * num_of_features + 1):
        for i in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        # for each of the short-term features:
        for i in range(num_of_features):
            curPos = 0
            N = len(st_feats[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mt_win_ratio
                if N2 > N:
                    N2 = N
                curStFeatures = st_feats[i][N1:N2]
                mt_feats_to_red[i].append(np.mean(curStFeatures))
                mt_feats_to_red[i +
                                num_of_features].append(np.std(curStFeatures))
                curPos += mt_step_ratio
        mt_feats_to_red = np.array(mt_feats_to_red)
        mt_feats_to_red_2 = np.zeros((mt_feats_to_red.shape[0] +
                                      len(classNames1) + len(classNames2),
                                      mt_feats_to_red.shape[1]))
        for i in range(mt_feats_to_red.shape[1]):
            cur_f1 = (mt_feats_to_red[:, i] - MEAN1) / STD1
            cur_f2 = (mt_feats_to_red[:, i] - MEAN2) / STD2
            [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
            [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0],
                              i] = mt_feats_to_red[:, i]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:
                              mt_feats_to_red.shape[0] + len(classNames1),
                              i] = P1 + 0.0001
            mt_feats_to_red_2[mt_feats_to_red.shape[0] +
                              len(classNames1)::, i] = P2 + 0.0001
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[iFeaturesSelect, :]
        #mt_feats_to_red += np.random.rand(mt_feats_to_red.shape[0], mt_feats_to_red.shape[1]) * 0.0000010
        (mt_feats_to_red, MEAN, STD) = aT.normalizeFeatures(
            [mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        #dist_all = np.sum(distance.squareform(distance.pdist(mt_feats_to_red.T)), axis=0)
        #m_dist_all = np.mean(dist_all)
        #iNonOutLiers2 = np.nonzero(dist_all < 3.0*m_dist_all)[0]
        #mt_feats_to_red = mt_feats_to_red[:, iNonOutLiers2]
        Labels = np.zeros((mt_feats_to_red.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / st_win
        # print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i*st_win/LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, Labels)
        mt_feats_norm = (clf.transform(mt_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    clsAll = []
    sil_all = []
    centersAll = []

    for iSpeakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(mt_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        # Y = distance.squareform(distance.pdist(mt_feats_norm.T))
        clsAll.append(cls)
        centersAll.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(iSpeakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / \
                float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mt_feats_norm[:, cls == c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                Yt = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(np.mean(Yt)*clust_per_cent)
                silBs = []
                for c2 in range(iSpeakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = np.nonzero(cls == c2)[0].shape[0] /\
                            float(len(cls))
                        MidTermFeaturesNormTemp2 = mt_feats_norm[:, cls == c2]
                        Yt = distance.cdist(mt_feats_norm_temp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(np.mean(Yt)*(clust_per_cent
                                                  + clust_per_cent_2)/2.0)
                silBs = np.array(silBs)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(silBs))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        sil = []
        for c in range(iSpeakers):
            # for each cluster (speaker) compute silhouette
            sil.append((sil_2[c] - sil_1[c]) / (max(sil_2[c],
                                                    sil_1[c]) + 0.00001))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(np.mean(sil))

    imax = np.argmax(sil_all)
    # optimal number of clusters
    nSpeakersFinal = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = np.zeros((n_wins,))
    for i in range(n_wins):
        j = np.argmin(np.abs(i-i_non_outliers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):
        # hmm training
        start_prob, transmat, means, cov = \
            trainHMM_computeStatistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = sil_all[imax]
    class_names = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]

    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if ground truth exists
    if os.path.isfile(gt_file):
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(
            seg_start, seg_end, seg_labs, mt_step)

    # if plot_res:
    #     fig = plt.figure()
    #     if n_speakers > 0:
    #         ax1 = fig.add_subplot(111)
    #     else:
    #         ax1 = fig.add_subplot(211)
    #     ax1.set_yticks(np.array(range(len(class_names))))
    #     ax1.axis((0, duration, -1, len(class_names)))
    #     ax1.set_yticklabels(class_names)
    #     ax1.plot(np.array(range(len(cls)))*mt_step+mt_step/2.0, cls)

    # if os.path.isfile(gt_file):
    #     if plot_res:
    #         ax1.plot(np.array(range(len(flags_gt))) *
    #                  mt_step + mt_step / 2.0, flags_gt, 'r')
    #     purity_cluster_m, purity_speaker_m = \
    #         evaluateSpeakerDiarization(cls, flags_gt)
    #     print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
    #                                     100 * purity_speaker_m))
        # if plot_res:
        #     plt.title("Cluster purity: {0:.1f}% - "
        #               "Speaker purity: {1:.1f}%".format(100 * purity_cluster_m,
        #                                                 100 * purity_speaker_m))
    # if plot_res:
    #     plt.xlabel("time (seconds)")
    #     # print s_range, sil_all
    #     if n_speakers <= 0:
    #         plt.subplot(212)
    #         plt.plot(s_range, sil_all)
    #         plt.xlabel("number of clusters")
    #         plt.ylabel("average clustering's sillouette")
    #     if save_plot:
    #         plt.savefig(
    #             f"{output_folder}{filename_only}".replace(".wav", ".png"))
    #     else:
    #         pass
    #     plt.show()

    # Create Time Vector
    time_vec = np.array(range(len(cls))) * mt_step + mt_step / 2.0

    # Find Change Points
    speaker_change_index = np.where(np.roll(cls, 1) != cls)[0]

    # Create List of dialogue convos
    output_list = []
    temp = {}
    for ind, sc in enumerate(speaker_change_index):
        temp['dialogue_id'] = str(datetime.now()).strip()
        temp['sequence_id'] = str(ind)
        temp['speaker'] = list(cls)[sc]
        temp['start_time'] = time_vec[sc]
        temp['end_time'] = (time_vec[speaker_change_index[ind + 1] - 1]
                            if ind + 1 < len(speaker_change_index)
                            else time_vec[-1])
        temp["text"] = ""
        output_list.append(temp)
        temp = {}

    def snip_transcribe(output_list, filename, output_folder=output_folder,
                        speech_key=speech_key, service_region=service_region):
        speech_config = speechsdk.SpeechConfig(
            subscription=speech_key, region=service_region)
        speech_config.enable_dictation()

        def recognized_cb(evt):
            if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
                # Do something with the recognized text
                output_list[ind]['text'] = output_list[ind]['text'] + \
                    str(evt.result.text)
                print(evt.result.text)

        for ind, diag in enumerate(output_list):
            t1 = diag['start_time']
            t2 = diag['end_time']
            newAudio = AudioSegment.from_wav(filename)
            chunk = newAudio[t1*1000:t2*1000]
            filename_out = output_folder + f"snippet_{diag['sequence_id']}.wav"
            # Exports to a wav file in the current path.
            chunk.export(filename_out, format="wav")
            done = False

            def stop_cb(evt):
                """callback that signals to stop continuous recognition upon receiving an event `evt`"""
                print('CLOSING on {}'.format(evt))
                nonlocal done
                done = True

            audio_input = speechsdk.AudioConfig(filename=filename_out)
            speech_recognizer = speechsdk.SpeechRecognizer(
                speech_config=speech_config, audio_config=audio_input)
            output_list[ind]['snippet_path'] = filename_out

            speech_recognizer.recognized.connect(recognized_cb)

            speech_recognizer.session_stopped.connect(stop_cb)
            speech_recognizer.canceled.connect(stop_cb)

            # Start continuous speech recognition
            speech_recognizer.start_continuous_recognition()
            while not done:
                time.sleep(.5)

            speech_recognizer.stop_continuous_recognition()

        return output_list

    output = snip_transcribe(output_list, filename,
                             output_folder=output_folder)
    output_json = {filename_only: output}

    with open(f"{output_folder}{nameoffile}_{timeoffile}.txt", "w") as outfile:
        json.dump(output_json, outfile)

    return cls, output_json
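A call sketch for the function above. It needs a valid Azure Speech key and region, and the input filename must contain the `_min_` suffix that the function parses; all values below are placeholders:

cls, dialogue = fileGreenwaySpeakerDiarization(
    "audio/visit_min_3.wav",       # hypothetical input (3rd minute of a dialogue)
    "output/",                     # snippets and the JSON transcript land here
    speech_key="YOUR_SPEECH_KEY",  # placeholder credential
    service_region="eastus2",
    n_speakers=2)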
Ejemplo n.º 18
0
def loadAudio(path):
    Fs, x = audioBasicIO.read_audio_file(path)
    x = audioBasicIO.stereo_to_mono(x)
    return Fs, x
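A one-line usage sketch (file name hypothetical); `stereo_to_mono` guarantees the returned signal is single-channel:

Fs, x = loadAudio("sample.wav")  # hypothetical file
print(Fs, x.shape)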
Ejemplo n.º 19
0
def directory_feature_extraction(folder_path,
                                 mid_window,
                                 mid_step,
                                 short_window,
                                 short_step,
                                 compute_beat=True):
    """
    This function extracts the mid-term features of the WAVE files of a 
    particular folder.

    The resulting feature vector is extracted by long-term averaging the
    mid-term features.
    Therefore ONE FEATURE VECTOR is extracted for each WAV file.

    ARGUMENTS:
        - folder_path:        the path of the WAVE directory
        - mid_window, mid_step:    mid-term window and step (in seconds)
        - short_window, short_step:    short-term window and step (in seconds)
    """

    mid_term_features = np.array([])
    process_times = []

    types = ('*.wav', '*.aif', '*.aiff', '*.mp3', '*.au', '*.ogg')
    wav_file_list = []
    for files in types:
        wav_file_list.extend(glob.glob(os.path.join(folder_path, files)))

    wav_file_list = sorted(wav_file_list)
    wav_file_list2, mid_feature_names = [], []
    for i, file_path in enumerate(wav_file_list):
        print("Analyzing file {0:d} of {1:d}: {2:s}".format(
            i + 1, len(wav_file_list), file_path))
        if os.stat(file_path).st_size == 0:
            print("   (EMPTY FILE -- SKIPPING)")
            continue
        sampling_rate, signal = audioBasicIO.read_audio_file(file_path)
        if sampling_rate == 0:
            continue

        t1 = time.time()
        signal = audioBasicIO.stereo_to_mono(signal)
        if signal.shape[0] < float(sampling_rate) / 5:
            print("  (AUDIO FILE TOO SMALL - SKIPPING)")
            continue
        wav_file_list2.append(file_path)
        if compute_beat:
            mid_features, short_features, mid_feature_names = \
                mid_feature_extraction(signal, sampling_rate,
                                       round(mid_window * sampling_rate),
                                       round(mid_step * sampling_rate),
                                       round(sampling_rate * short_window),
                                       round(sampling_rate * short_step))
            beat, beat_conf = beat_extraction(short_features, short_step)
        else:
            mid_features, _, mid_feature_names = \
                mid_feature_extraction(signal, sampling_rate,
                                       round(mid_window * sampling_rate),
                                       round(mid_step * sampling_rate),
                                       round(sampling_rate * short_window),
                                       round(sampling_rate * short_step))

        mid_features = np.transpose(mid_features)
        mid_features = mid_features.mean(axis=0)
        # long term averaging of mid-term statistics
        if (not np.isnan(mid_features).any()) and \
                (not np.isinf(mid_features).any()):
            if compute_beat:
                mid_features = np.append(mid_features, beat)
                mid_features = np.append(mid_features, beat_conf)
            if len(mid_term_features) == 0:
                # append feature vector
                mid_term_features = mid_features
            else:
                mid_term_features = np.vstack(
                    (mid_term_features, mid_features))
            t2 = time.time()
            duration = float(len(signal)) / sampling_rate
            process_times.append((t2 - t1) / duration)
    if len(process_times) > 0:
        print("Feature extraction complexity ratio: "
              "{0:.1f} x realtime".format(
                  (1.0 / np.mean(np.array(process_times)))))
    return mid_term_features, wav_file_list2, mid_feature_names
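The matrix returned above (one row per file) can be fed directly into scikit-learn. A sketch assuming two labeled folders, with hypothetical paths:

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

pos_feats, _, _ = directory_feature_extraction(
    "data/pos", 1.0, 1.0, 0.05, 0.05, compute_beat=False)
neg_feats, _, _ = directory_feature_extraction(
    "data/neg", 1.0, 1.0, 0.05, 0.05, compute_beat=False)
X = np.vstack((pos_feats, neg_feats))
y = np.concatenate((np.ones(len(pos_feats)), np.zeros(len(neg_feats))))
clf = SVC(kernel="rbf", probability=True)
clf.fit(StandardScaler().fit_transform(X), y)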
Ejemplo n.º 20
0


import os
import numpy as np
import pandas as pd
import opensmile
from pyAudioAnalysis import audioBasicIO
from tqdm import tqdm

# paths to the positive/negative example folders
# (hypothetical placeholders -- point these at your own data)
pos_path = "data/positive"
neg_path = "data/negative"

smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv01b,
    feature_level=opensmile.FeatureLevel.Functionals,
)

data = pd.DataFrame([])
for wave_file in tqdm(os.listdir(pos_path)):

    wav_file_path = os.path.join(pos_path, wave_file)
    sampling_rate, signal = audioBasicIO.read_audio_file(wav_file_path)
    signal = audioBasicIO.stereo_to_mono(signal)

    ps = smile.process_signal(signal, sampling_rate)
    ps = pd.DataFrame(ps).reset_index().iloc[:, 2:]
    ps['filename'] = wave_file.split('.')[0]
    ps['label'] = np.ones(len(ps))

    data = pd.concat([data, ps])

data_pos = data.reset_index().iloc[:, 1:]

data = pd.DataFrame([])
for idx, wave_file in enumerate(os.listdir(neg_path)):

    wav_file_path = os.path.join(neg_path, wave_file)
    sampling_rate, signal = audioBasicIO.read_audio_file(wav_file_path)