def self_similarity_matrix(feature_vectors):
    """
    This function computes the self-similarity matrix for a sequence
    of feature vectors.
    ARGUMENTS:
     - feature_vectors:    a np matrix (nDims x nVectors) whose i-th column
                           corresponds to the i-th feature vector
    RETURNS:
     - sim_matrix:         the self-similarity matrix (nVectors x nVectors)
    """
    norm_feature_vectors, mean, std = at.normalize_features(
        [feature_vectors.T])
    norm_feature_vectors = norm_feature_vectors[0].T
    sim_matrix = 1.0 - distance.squareform(
        distance.pdist(norm_feature_vectors.T, 'cosine'))
    return sim_matrix
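# A minimal usage sketch for self_similarity_matrix (assumes this module's
# imports: numpy as np, scipy.spatial.distance as distance, and
# pyAudioAnalysis.audioTrainTest as at):
import numpy as np

features = np.random.rand(20, 50)  # 20-dim features for 50 frames (columns)
sim = self_similarity_matrix(features)
print(sim.shape)  # (50, 50); diagonal entries equal 1.0 (cosine self-similarity)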
def _get_features(fn, mt_win=1.0, mt_step=0.1, st_win=0.05):
    '''
    Generate the normalized mid-term feature matrix for the audio file at fn.
    Note: the original snippet referenced an undefined mid_window; an mt_win
    argument (mid-term window size in seconds; the 1.0 s default is an
    assumption) is added here to define it.
    '''
    sr, signal = read_audio_file(fn)
    if len(signal.shape) > 1:
        raise Exception(
            f'Single-channel audio only! This audio has shape: {signal.shape}'
        )
    mid_window = mt_win * sr
    mid_step = mt_step * sr
    short_window = round(st_win * sr)
    short_step = round(st_win * sr * 0.5)
    feats, short_feats, feat_names = \
        paa.MidTermFeatures.mid_feature_extraction(signal, sr, mid_window,
                                                   mid_step, short_window,
                                                   short_step)
    feats_norm, MEAN, STD = normalize_features([feats.T])
    feats_norm = feats_norm[0].T
    return feats_norm
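# Hypothetical usage (the file path is illustrative; read_audio_file, paa and
# normalize_features are assumed imported as in the snippet above):
feats = _get_features("recording.wav", mt_win=1.0, mt_step=0.1, st_win=0.05)
print(feats.shape)  # (n_features, n_mid_term_windows)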
def visualizeFeaturesFolder(folder, dimReductionMethod, priorKnowledge="none"):
    '''
    This function generates a chordial visualization for the recordings in the provided path.
    ARGUMENTS:
        - folder:        path of the folder that contains the WAV files to be processed
        - dimReductionMethod:    method used to reduce the dimensionality of the initial
                                 feature space before computing the similarity ("pca",
                                 otherwise LDA is used)
        - priorKnowledge:    if set to "artist", the artist name (the part of the
                             filename before " --- ") is used as the class label
                             for the LDA step
    '''
    if dimReductionMethod == "pca":
        allMtFeatures, wavFilesList, _ = aF.directory_feature_extraction(
            folder, 30.0, 30.0, 0.050, 0.050, compute_beat=True)
        if allMtFeatures.shape[0] == 0:
            print("Error: No data found! Check input folder")
            return

        namesCategoryToVisualize = [
            ntpath.basename(w).replace('.wav', '').split(" --- ")[0]
            for w in wavFilesList
        ]
        namesToVisualize = [
            ntpath.basename(w).replace('.wav', '') for w in wavFilesList
        ]

        (F, MEAN, STD) = aT.normalize_features([allMtFeatures])
        F = np.concatenate(F)

        # check that the new PCA dimension is at most equal to the number of samples
        K1 = 2
        K2 = 10
        if K1 > F.shape[0]:
            K1 = F.shape[0]
        if K2 > F.shape[0]:
            K2 = F.shape[0]
        pca1 = sklearn.decomposition.PCA(n_components=K1)
        pca1.fit(F)
        pca2 = sklearn.decomposition.PCA(n_components=K2)
        pca2.fit(F)

        finalDims = pca1.transform(F)
        finalDims2 = pca2.transform(F)
    else:
        allMtFeatures, Ys, wavFilesList = aF.directory_feature_extraction_no_avg(
            folder, 20.0, 5.0, 0.040, 0.040
        )  # long-term statistics cannot be applied in this context (LDA needs mid-term features)
        if allMtFeatures.shape[0] == 0:
            print("Error: No data found! Check input folder")
            return

        namesCategoryToVisualize = [
            ntpath.basename(w).replace('.wav', '').split(" --- ")[0]
            for w in wavFilesList
        ]
        namesToVisualize = [
            ntpath.basename(w).replace('.wav', '') for w in wavFilesList
        ]

        ldaLabels = Ys
        if priorKnowledge == "artist":
            uNamesCategoryToVisualize = list(set(namesCategoryToVisualize))
            YsNew = np.zeros(Ys.shape)
            for i, uname in enumerate(
                    uNamesCategoryToVisualize):  # for each unique artist name:
                indicesUCategories = [
                    j for j, x in enumerate(namesCategoryToVisualize)
                    if x == uname
                ]
                for j in indicesUCategories:
                    indices = np.nonzero(Ys == j)
                    YsNew[indices] = i
            ldaLabels = YsNew

        (F, MEAN, STD) = aT.normalize_features([allMtFeatures])
        F = np.array(F[0])

        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=10)
        clf.fit(F, ldaLabels)
        reducedDims = clf.transform(F)

        pca = sklearn.decomposition.PCA(n_components=2)
        pca.fit(reducedDims)
        reducedDims = pca.transform(reducedDims)

        # TODO: check this... should LDA be used in the semi-supervised case only?

        uLabels = np.sort(
            np.unique(Ys)
        )  # uLabels must have as many labels as the number of wavFilesList elements
        finalDims = np.zeros((uLabels.shape[0], 2))
        for i, u in enumerate(uLabels):
            indices = [j for j, x in enumerate(Ys) if x == u]
            f = reducedDims[indices, :]
            # average the 2-D reduced vectors of all windows of this recording
            finalDims[i, :] = f.mean(axis=0)
        finalDims2 = reducedDims

    for i in range(finalDims.shape[0]):
        plt.text(finalDims[i, 0],
                 finalDims[i, 1],
                 ntpath.basename(wavFilesList[i].replace('.wav', '')),
                 horizontalalignment='center',
                 verticalalignment='center',
                 fontsize=10)
        plt.plot(finalDims[i, 0], finalDims[i, 1], '*r')
    plt.xlim([1.2 * finalDims[:, 0].min(), 1.2 * finalDims[:, 0].max()])
    plt.ylim([1.2 * finalDims[:, 1].min(), 1.2 * finalDims[:, 1].max()])
    plt.show()

    SM = 1.0 - distance.squareform(distance.pdist(finalDims2, 'cosine'))
    for i in range(SM.shape[0]):
        SM[i, i] = 0.0

    chordialDiagram("visualization", SM, 0.50, namesToVisualize,
                    namesCategoryToVisualize)

    SM = 1.0 - distance.squareform(distance.pdist(F, 'cosine'))
    for i in range(SM.shape[0]):
        SM[i, i] = 0.0
    chordialDiagram("visualizationInitial", SM, 0.50, namesToVisualize,
                    namesCategoryToVisualize)

    # plot super-categories (i.e. artist names)
    uNamesCategoryToVisualize = sorted(set(namesCategoryToVisualize))
    finalDimsGroup = np.zeros(
        (len(uNamesCategoryToVisualize), finalDims2.shape[1]))
    for i, uname in enumerate(uNamesCategoryToVisualize):
        indices = [
            j for j, x in enumerate(namesCategoryToVisualize) if x == uname
        ]
        f = finalDims2[indices, :]
        finalDimsGroup[i, :] = f.mean(axis=0)

    SMgroup = 1.0 - distance.squareform(
        distance.pdist(finalDimsGroup, 'cosine'))
    for i in range(SMgroup.shape[0]):
        SMgroup[i, i] = 0.0
    chordialDiagram("visualizationGroup", SMgroup, 0.50,
                    uNamesCategoryToVisualize, uNamesCategoryToVisualize)
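# A minimal usage sketch for visualizeFeaturesFolder (the folder path is
# hypothetical; filenames are assumed to follow the "<artist> --- <title>.wav"
# convention the code splits on):
# unsupervised visualization via PCA:
visualizeFeaturesFolder("music/wavs", "pca")
# semi-supervised visualization via LDA, with artist names as prior labels:
visualizeFeaturesFolder("music/wavs", "lda", priorKnowledge="artist")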
def speaker_diarization(filename,
                        n_speakers,
                        mid_window=2.0,
                        mid_step=0.2,
                        short_window=0.05,
                        lda_dim=35,
                        plot_res=False):
    """
    ARGUMENTS:
        - filename:        the name of the WAV file to be analyzed
        - n_speakers       the number of speakers (clusters) in
                           the recording (<=0 for unknown)
        - mid_window (opt)    mid-term window size
        - mid_step (opt)    mid-term window step
        - short_window  (opt)    short-term window size
        - lda_dim (opt     LDA dimension (0 for no LDA)
        - plot_res         (opt)   0 for not plotting the results 1 for plotting
    """
    sampling_rate, signal = audioBasicIO.read_audio_file(filename)
    signal = audioBasicIO.stereo_to_mono(signal)
    duration = len(signal) / sampling_rate

    base_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "data/models")

    classifier_all, mean_all, std_all, class_names_all, _, _, _, _, _ = \
        at.load_model_knn(os.path.join(base_dir, "knn_speaker_10"))
    classifier_fm, mean_fm, std_fm, class_names_fm, _, _, _, _,  _ = \
        at.load_model_knn(os.path.join(base_dir, "knn_speaker_male_female"))

    mid_feats, st_feats, _ = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mid_window * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * short_window),
                                   round(sampling_rate * short_window * 0.5))

    mid_term_features = np.zeros(
        (mid_feats.shape[0] + len(class_names_all) + len(class_names_fm),
         mid_feats.shape[1]))

    for index in range(mid_feats.shape[1]):
        feature_norm_all = (mid_feats[:, index] - mean_all) / std_all
        feature_norm_fm = (mid_feats[:, index] - mean_fm) / std_fm
        _, p1 = at.classifier_wrapper(classifier_all, "knn", feature_norm_all)
        _, p2 = at.classifier_wrapper(classifier_fm, "knn", feature_norm_fm)
        start = mid_feats.shape[0]
        end = mid_feats.shape[0] + len(class_names_all)
        mid_term_features[0:mid_feats.shape[0], index] = mid_feats[:, index]
        mid_term_features[start:end, index] = p1 + 1e-4
        mid_term_features[end::, index] = p2 + 1e-4

    mid_feats = mid_term_features  # TODO
    feature_selected = [
        8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45,
        46, 47, 48, 49, 50, 51, 52, 53
    ]

    mid_feats = mid_feats[feature_selected, :]

    mid_feats_norm, mean, std = at.normalize_features([mid_feats.T])
    mid_feats_norm = mid_feats_norm[0].T
    n_wins = mid_feats.shape[1]

    # remove outliers:
    dist_all = np.sum(distance.squareform(distance.pdist(mid_feats_norm.T)),
                      axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.2 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    # EnergyMin = np.min(mt_feats[1,:])
    # EnergyMean = np.mean(mt_feats[1,:])
    # Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    # i_non_outliers = np.nonzero(mt_feats[1,:] > Thres)[0]
    # print i_non_outliers

    mt_feats_norm_or = mid_feats_norm
    mid_feats_norm = mid_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:

        # extract mid-term features with minimum step:
        window_ratio = int(round(mid_window / short_window))
        # minimum step: advance one short-term frame at a time
        step_ratio = 1
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        for index in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        # for each of the short-term features:
        for index in range(num_of_features):
            cur_pos = 0
            feat_len = len(st_feats[index])
            while cur_pos < feat_len:
                n1 = cur_pos
                n2 = cur_pos + window_ratio
                if n2 > feat_len:
                    n2 = feat_len
                short_features = st_feats[index][n1:n2]
                mt_feats_to_red[index].append(np.mean(short_features))
                mt_feats_to_red[index + num_of_features].\
                    append(np.std(short_features))
                cur_pos += step_ratio
        mt_feats_to_red = np.array(mt_feats_to_red)
        mt_feats_to_red_2 = np.zeros(
            (mt_feats_to_red.shape[0] + len(class_names_all) +
             len(class_names_fm), mt_feats_to_red.shape[1]))
        limit = mt_feats_to_red.shape[0] + len(class_names_all)
        for index in range(mt_feats_to_red.shape[1]):
            feature_norm_all = (mt_feats_to_red[:, index] - mean_all) / std_all
            feature_norm_fm = (mt_feats_to_red[:, index] - mean_fm) / std_fm
            _, p1 = at.classifier_wrapper(classifier_all, "knn",
                                          feature_norm_all)
            _, p2 = at.classifier_wrapper(classifier_fm, "knn",
                                          feature_norm_fm)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0], index] = \
                mt_feats_to_red[:, index]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:limit,
                              index] = p1 + 1e-4
            mt_feats_to_red_2[limit::, index] = p2 + 1e-4
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[feature_selected, :]
        mt_feats_to_red, mean, std = at.normalize_features([mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        labels = np.zeros((mt_feats_to_red.shape[1], ))
        lda_step = 1.0
        lda_step_ratio = lda_step / short_window
        for index in range(labels.shape[0]):
            labels[index] = int(index * short_window / lda_step_ratio)
        clf = sklearn.discriminant_analysis.\
            LinearDiscriminantAnalysis(n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, labels)
        mid_feats_norm = (clf.transform(mid_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    cluster_labels = []
    sil_all = []
    cluster_centers = []

    for speakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=speakers)
        k_means.fit(mid_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        cluster_labels.append(cls)
        cluster_centers.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(speakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mid_feats_norm[:, cls == c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                dist = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(np.mean(dist) * clust_per_cent)
                sil_temp = []
                for c2 in range(speakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = np.nonzero(cls == c2)[0].shape[0] /\
                                           float(len(cls))
                        mid_features_temp = mid_feats_norm[:, cls == c2]
                        dist = distance.cdist(mt_feats_norm_temp.T,
                                              mid_features_temp.T)
                        sil_temp.append(
                            np.mean(dist) *
                            (clust_per_cent + clust_per_cent_2) / 2.0)
                sil_temp = np.array(sil_temp)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(sil_temp))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        sil = []
        for c in range(speakers):
            # for each cluster (speaker) compute silhouette
            sil.append(
                (sil_2[c] - sil_1[c]) / (max(sil_2[c], sil_1[c]) + 1e-5))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(np.mean(sil))

    imax = int(np.argmax(sil_all))
    # optimal number of clusters
    num_speakers = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = np.zeros((n_wins, ))
    for index in range(n_wins):
        j = np.argmin(np.abs(index - i_non_outliers))
        cls[index] = cluster_labels[imax][j]

    # Post-process method 1: hmm smoothing
    for index in range(1):
        # hmm training
        start_prob, transmat, means, cov = \
            train_hmm_compute_statistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(n_components=start_prob.shape[0],
                                       covariance_type="diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    class_names = ["speaker{0:d}".format(c) for c in range(num_speakers)]

    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if groundtruth exists
    if os.path.isfile(gt_file):
        seg_start, seg_end, seg_labs = read_segmentation_gt(gt_file)
        flags_gt, class_names_gt = segments_to_labels(seg_start, seg_end,
                                                      seg_labs, mid_step)

    if plot_res:
        fig = plt.figure()
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(np.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(np.array(range(len(cls))) * mid_step + mid_step / 2.0, cls)

    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(
                np.array(range(len(flags_gt))) * mid_step + mid_step / 2.0,
                flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
            evaluate_speaker_diarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(
                          100 * purity_cluster_m, 100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's silhouette")
        plt.show()
    return cls
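# A minimal usage sketch (the WAV path is hypothetical; if a matching
# "<name>.segments" ground-truth file exists next to it, purity scores are
# printed as well):
labels = speaker_diarization("meeting.wav", n_speakers=4, plot_res=False)
# one cluster label per mid-term window, mid_step (0.2 s) apart
print(len(labels), set(labels))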
def silence_removal(signal,
                    sampling_rate,
                    st_win,
                    st_step,
                    smooth_window=0.5,
                    weight=0.5,
                    plot=False):
    """
    Event Detection (silence removal)
    ARGUMENTS:
         - signal:                the input audio signal
         - sampling_rate:               sampling freq
         - st_win, st_step:    window size and step in seconds
         - smoothWindow:     (optinal) smooth window (in seconds)
         - weight:           (optinal) weight factor (0 < weight < 1)
                              the higher, the more strict
         - plot:             (optinal) True if results are to be plotted
    RETURNS:
         - seg_limits:    list of segment limits in seconds (e.g [[0.1, 0.9],
                          [1.4, 3.0]] means that
                          the resulting segments are (0.1 - 0.9) seconds
                          and (1.4, 3.0) seconds
    """

    if weight >= 1:
        weight = 0.99
    if weight <= 0:
        weight = 0.01

    # Step 1: feature extraction
    signal = audioBasicIO.stereo_to_mono(signal)
    st_feats, _ = stf.feature_extraction(signal, sampling_rate,
                                         st_win * sampling_rate,
                                         st_step * sampling_rate)

    # Step 2: train binary svm classifier of low vs high energy frames
    # keep only the energy short-term sequence (2nd feature)
    st_energy = st_feats[1, :]
    en = np.sort(st_energy)
    # number of 10% of the total short-term windows
    st_windows_fraction = int(len(en) / 10)

    # compute "lower" 10% energy threshold
    low_threshold = np.mean(en[0:st_windows_fraction]) + 1e-15

    # compute "higher" 10% energy threshold
    high_threshold = np.mean(en[-st_windows_fraction:-1]) + 1e-15

    # get all features that correspond to low energy
    low_energy = st_feats[:, np.where(st_energy <= low_threshold)[0]]

    # get all features that correspond to high energy
    high_energy = st_feats[:, np.where(st_energy >= high_threshold)[0]]

    # form the binary classification task and ...
    features = [low_energy.T, high_energy.T]
    # normalize and train the respective svm probabilistic model

    # (ONSET vs SILENCE)
    features_norm, mean, std = at.normalize_features(features)
    svm = at.train_svm(features_norm, 1.0)

    # Step 3: compute onset probability based on the trained svm
    prob_on_set = []
    for index in range(st_feats.shape[1]):
        # for each frame
        cur_fv = (st_feats[:, index] - mean) / std
        # get svm probability (that it belongs to the ONSET class)
        prob_on_set.append(svm.predict_proba(cur_fv.reshape(1, -1))[0][1])
    prob_on_set = np.array(prob_on_set)

    # smooth probability:
    prob_on_set = smooth_moving_avg(prob_on_set, smooth_window / st_step)

    # Step 4A: detect onset frame indices:
    prob_on_set_sort = np.sort(prob_on_set)

    # find the probability threshold as a weighted average
    # of the top 10% and the bottom 10% of the values
    nt = int(prob_on_set_sort.shape[0] / 10)
    threshold = (np.mean((1 - weight) * prob_on_set_sort[0:nt]) +
                 weight * np.mean(prob_on_set_sort[-nt::]))

    max_indices = np.where(prob_on_set > threshold)[0]
    # get the indices of the frames that satisfy the thresholding
    index = 0
    seg_limits = []
    time_clusters = []

    # Step 4B: group frame indices to onset segments
    while index < len(max_indices):
        # for each of the detected onset indices
        cur_cluster = [max_indices[index]]
        if index == len(max_indices) - 1:
            break
        while max_indices[index + 1] - cur_cluster[-1] <= 2:
            cur_cluster.append(max_indices[index + 1])
            index += 1
            if index == len(max_indices) - 1:
                break
        index += 1
        time_clusters.append(cur_cluster)
        seg_limits.append(
            [cur_cluster[0] * st_step, cur_cluster[-1] * st_step])

    # Step 5: Post process: remove very small segments:
    min_duration = 0.2
    seg_limits_2 = []
    for s_lim in seg_limits:
        if s_lim[1] - s_lim[0] > min_duration:
            seg_limits_2.append(s_lim)
    seg_limits = seg_limits_2

    if plot:
        time_x = np.arange(0, signal.shape[0] / float(sampling_rate),
                           1.0 / sampling_rate)

        plt.subplot(2, 1, 1)
        plt.plot(time_x, signal)
        plt.title('Signal')
        for s_lim in seg_limits:
            plt.axvline(x=s_lim[0], color='red')
            plt.axvline(x=s_lim[1], color='red')
        plt.subplot(2, 1, 2)
        plt.plot(np.arange(0, prob_on_set.shape[0] * st_step, st_step),
                 prob_on_set)
        plt.title('SVM Probability')
        for s_lim in seg_limits:
            plt.axvline(x=s_lim[0], color='red')
            plt.axvline(x=s_lim[1], color='red')
        plt.show()

    return seg_limits
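# A minimal usage sketch (the file path is hypothetical; audioBasicIO is
# assumed imported as in the functions above):
fs, x = audioBasicIO.read_audio_file("speech.wav")
segments = silence_removal(x, fs, st_win=0.05, st_step=0.05,
                           smooth_window=0.5, weight=0.3, plot=False)
for start, end in segments:
    print("voiced segment: {:.2f}s - {:.2f}s".format(start, end))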
def QE_speaker_diarization(
        sampling_rate,
        signal,
        n_speakers,
        classifier_all,
        mean_all,
        std_all,
        class_names_all,
        classifier_fm,
        mean_fm,
        std_fm,
        class_names_fm,  # models loaded above and passed in
        mid_window=2.0,
        mid_step=0.2,
        short_window=0.05,
        lda_dim=35,
        plot_res=False):
    """
    ARGUMENTS:
        - filename:        the name of the WAV file to be analyzed  #QE_:-> ADAPTED HERE TO RECEIVE DIRECTLY THE DATA sampling_rate, signal INSTEAD OF filename
        - n_speakers       the number of speakers (clusters) in
                           the recording (<=0 for unknown)
        - mid_window (opt)    mid-term window size
        - mid_step (opt)    mid-term window step
        - short_window  (opt)    short-term window size
        - lda_dim (opt     LDA dimension (0 for no LDA)
        - plot_res         (opt)   0 for not plotting the results 1 for plotting

    """
    """
        Otras opciones a explorar para diarization
        https://hackernoon.com/speaker-diarization-the-squad-way-2205e0accbda
        https://github.com/YongyuG/s4d-diarization-gao/blob/master/s4d/diar.py, muy buena pinta https://pypi.org/project/s4d/ https://projets-lium.univ-lemans.fr/s4d/
        https://medium.com/datadriveninvestor/speaker-diarization-22121f1264b1
        https://arxiv.org/pdf/2005.08072v1.pdf
        https://github.com/calclavia/tal-asrd
        https://github.com/josepatino/pyBK
        https://github.com/wq2012/awesome-diarization
        https://www.researchgate.net/publication/221480626_The_Detection_of_Overlapping_Speech_with_Prosodic_Features_for_Speaker_Diarization


    """
    # sampling_rate, signal = audioBasicIO.read_audio_file(filename)  # not needed,
    # since sampling_rate and signal are passed as input arguments instead of filename
    signal = audioBasicIO.stereo_to_mono(
        signal)  # remove this if the input is already guaranteed to be mono
    duration = len(signal) / sampling_rate
    """
    #QE_: In order to avoid a recurrent load of the models, they are loaded more globally only once and then passed as arguments
    # So this part is copied in the module avove , QE_main:
    base_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 
                            "data/models")

    classifier_all, mean_all, std_all, class_names_all, _, _, _, _, _ = \
        at.load_model_knn(os.path.join(base_dir, "knn_speaker_10"))
    classifier_fm, mean_fm, std_fm, class_names_fm, _, _, _, _,  _ = \
        at.load_model_knn(os.path.join(base_dir, "knn_speaker_male_female"))
    """

    mid_feats, st_feats, _ = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mid_window * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * short_window),
                                   round(sampling_rate * short_window * 0.5))

    mid_term_features = np.zeros(
        (mid_feats.shape[0] + len(class_names_all) + len(class_names_fm),
         mid_feats.shape[1]))

    for index in range(mid_feats.shape[1]):
        feature_norm_all = (mid_feats[:, index] - mean_all) / std_all
        feature_norm_fm = (mid_feats[:, index] - mean_fm) / std_fm
        _, p1 = at.classifier_wrapper(classifier_all, "knn", feature_norm_all)
        _, p2 = at.classifier_wrapper(classifier_fm, "knn", feature_norm_fm)
        start = mid_feats.shape[0]
        end = mid_feats.shape[0] + len(class_names_all)
        mid_term_features[0:mid_feats.shape[0], index] = mid_feats[:, index]
        mid_term_features[start:end, index] = p1 + 1e-4
        mid_term_features[end::, index] = p2 + 1e-4

    mid_feats = mid_term_features  # TODO
    feature_selected = [
        8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45,
        46, 47, 48, 49, 50, 51, 52, 53
    ]

    mid_feats = mid_feats[feature_selected, :]

    mid_feats_norm, mean, std = at.normalize_features([mid_feats.T])
    mid_feats_norm = mid_feats_norm[0].T
    n_wins = mid_feats.shape[1]

    # remove outliers:
    dist_all = np.sum(distance.squareform(distance.pdist(mid_feats_norm.T)),
                      axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.2 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    # EnergyMin = np.min(mt_feats[1,:])
    # EnergyMean = np.mean(mt_feats[1,:])
    # Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    # i_non_outliers = np.nonzero(mt_feats[1,:] > Thres)[0]
    # print i_non_outliers

    mt_feats_norm_or = mid_feats_norm
    mid_feats_norm = mid_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:

        # extract mid-term features with minimum step:
        window_ratio = int(round(mid_window / short_window))
        # minimum step: advance one short-term frame at a time
        step_ratio = 1
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        for index in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        # for each of the short-term features:
        for index in range(num_of_features):
            cur_pos = 0
            feat_len = len(st_feats[index])
            while cur_pos < feat_len:
                n1 = cur_pos
                n2 = cur_pos + window_ratio
                if n2 > feat_len:
                    n2 = feat_len
                short_features = st_feats[index][n1:n2]
                mt_feats_to_red[index].append(np.mean(short_features))
                mt_feats_to_red[index + num_of_features].\
                    append(np.std(short_features))
                cur_pos += step_ratio
        mt_feats_to_red = np.array(mt_feats_to_red)
        mt_feats_to_red_2 = np.zeros(
            (mt_feats_to_red.shape[0] + len(class_names_all) +
             len(class_names_fm), mt_feats_to_red.shape[1]))
        limit = mt_feats_to_red.shape[0] + len(class_names_all)
        for index in range(mt_feats_to_red.shape[1]):
            feature_norm_all = (mt_feats_to_red[:, index] - mean_all) / std_all
            feature_norm_fm = (mt_feats_to_red[:, index] - mean_fm) / std_fm
            _, p1 = at.classifier_wrapper(classifier_all, "knn",
                                          feature_norm_all)
            _, p2 = at.classifier_wrapper(classifier_fm, "knn",
                                          feature_norm_fm)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0], index] = \
                mt_feats_to_red[:, index]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:limit,
                              index] = p1 + 1e-4
            mt_feats_to_red_2[limit::, index] = p2 + 1e-4
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[feature_selected, :]
        mt_feats_to_red, mean, std = at.normalize_features([mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        labels = np.zeros((mt_feats_to_red.shape[1], ))
        lda_step = 1.0
        lda_step_ratio = lda_step / short_window
        for index in range(labels.shape[0]):
            labels[index] = int(index * short_window / lda_step_ratio)
        clf = sklearn.discriminant_analysis.\
            LinearDiscriminantAnalysis(n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, labels)
        mid_feats_norm = (clf.transform(mid_feats_norm.T)).T

##########################################################################################################################################
    if n_speakers <= 0:
        # QE_: adapt this to range(1, 10)? We are going to use this diarization
        # on short windows, 250-500 ms
        s_range = range(2, 10)
###########################################################################################################################################
    else:
        s_range = [n_speakers]

    cluster_labels = []
    sil_all = []
    cluster_centers = []

    for speakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=speakers)
        k_means.fit(mid_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        cluster_labels.append(cls)
        cluster_centers.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(speakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mid_feats_norm[:, cls == c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                dist = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(np.mean(dist) * clust_per_cent)
                sil_temp = []
                for c2 in range(speakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = np.nonzero(cls == c2)[0].shape[0] /\
                                           float(len(cls))
                        mid_features_temp = mid_feats_norm[:, cls == c2]
                        dist = distance.cdist(mt_feats_norm_temp.T,
                                              mid_features_temp.T)
                        sil_temp.append(
                            np.mean(dist) *
                            (clust_per_cent + clust_per_cent_2) / 2.0)
                sil_temp = np.array(sil_temp)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(sil_temp))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        sil = []
        for c in range(speakers):
            # for each cluster (speaker) compute silhouette
            sil.append(
                (sil_2[c] - sil_1[c]) / (max(sil_2[c], sil_1[c]) + 1e-5))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(np.mean(sil))

    imax = int(np.argmax(sil_all))
    # optimal number of clusters
    num_speakers = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = np.zeros((n_wins, ))
    for index in range(n_wins):
        j = np.argmin(np.abs(index - i_non_outliers))
        cls[index] = cluster_labels[imax][j]

    # Post-process method 1: hmm smoothing
    for index in range(1):
        # hmm training
        start_prob, transmat, means, cov = \
            train_hmm_compute_statistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(n_components=start_prob.shape[0],
                                       covariance_type="diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    class_names = ["speaker{0:d}".format(c) for c in range(num_speakers)]

    # ground-truth loading is skipped in this adapted version: the function
    # receives the signal directly, so there is no filename from which to
    # derive a ".segments" ground-truth file
    """
    if plot_res:
        fig = plt.figure()    
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(np.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(np.array(range(len(cls))) * mid_step + mid_step / 2.0, cls)

    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(np.array(range(len(flags_gt))) *
                     mid_step + mid_step / 2.0, flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
            evaluate_speaker_diarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(100 * purity_cluster_m,
                                                        100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's silhouette")
        plt.show()
        """

    return cls
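# A usage sketch for QE_speaker_diarization, mirroring the model-loading block
# quoted inside the function (done once, outside per-window calls); the WAV
# path is hypothetical:
import os
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioTrainTest as at

base_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                        "data/models")
classifier_all, mean_all, std_all, class_names_all, _, _, _, _, _ = \
    at.load_model_knn(os.path.join(base_dir, "knn_speaker_10"))
classifier_fm, mean_fm, std_fm, class_names_fm, _, _, _, _, _ = \
    at.load_model_knn(os.path.join(base_dir, "knn_speaker_male_female"))

fs, x = audioBasicIO.read_audio_file("meeting.wav")  # hypothetical file
cls = QE_speaker_diarization(fs, x, n_speakers=2,
                             classifier_all=classifier_all, mean_all=mean_all,
                             std_all=std_all, class_names_all=class_names_all,
                             classifier_fm=classifier_fm, mean_fm=mean_fm,
                             std_fm=std_fm, class_names_fm=class_names_fm)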
import os, readchar, sklearn.cluster
from pyAudioAnalysis.MidTermFeatures import mid_feature_extraction as mT
from pyAudioAnalysis.audioBasicIO import read_audio_file, stereo_to_mono
from pyAudioAnalysis.audioSegmentation import labels_to_segments
from pyAudioAnalysis.audioTrainTest import normalize_features

if __name__ == '__main__':
    # read signal and get normalized segment features:
    input_file = "../data/diarizationExample.wav"
    fs, x = read_audio_file(input_file)
    x = stereo_to_mono(x)
    mt_size, mt_step, st_win = 1, 0.1, 0.05
    [mt_feats, st_feats, _] = mT(x, fs, mt_size * fs, mt_step * fs,
                                 round(fs * st_win), round(fs * st_win * 0.5))
    print(mt_feats.shape)
    (mt_feats_norm, MEAN, STD) = normalize_features([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    # perform clustering (k = 4)
    n_clusters = 4
    k_means = sklearn.cluster.KMeans(n_clusters=n_clusters)
    k_means.fit(mt_feats_norm.T)
    cls = k_means.labels_
    print(cls.shape)
    segs, c = labels_to_segments(cls,
                                 mt_step)  # convert flags to segment limits
    for sp in range(n_clusters):  # play each cluster's segment
        for i in range(len(c)):
            if c[i] == sp and segs[i, 1] - segs[i, 0] > 0.5:
                # play long segments of current speaker
                print(c[i], segs[i, 0], segs[i, 1])
                cmd = "ffmpeg -i {} -ss {} -t {} temp.wav " \