Example #1
0
def get_speaker_spans(fn):
    '''
    Predict speaker segments for the audio file `fn`.

    Features are extracted with `_get_features`, grouped into 4 clusters by
    `_get_clusters`, and the per-window cluster labels are then merged by
    `labels_to_segments` using the 0.1 mid-term step.

    Returns the pair produced by `labels_to_segments`: the segments and
    the cluster label assigned to each of them.
    '''
    # mid-term step and short-term window, as passed to `_get_features`
    mid_term_step, short_term_win = 0.1, 0.05
    window_features = _get_features(fn, mid_term_step, short_term_win)
    window_labels = _get_clusters(window_features, k=4)
    spans, span_labels = labels_to_segments(window_labels, mid_term_step)
    return spans, span_labels
Example #2
0
def test(audiofile_path):
    '''
    Classify `audiofile_path` with the "svm_rbf" model stored at
    models/svm_male_female and merge the per-window labels into segments.

    Plotting is disabled. The number of window labels and the number of
    segment probabilities are printed as a quick sanity check.

    Returns a 4-tuple: (segment probabilities, segments converted to a
    plain list, class names reported by the classifier, segment classes).
    '''
    window_labels, class_names, step, window_probs = mid_term_file_classification(
        audiofile_path, r"models/svm_male_female", "svm_rbf", False)
    print("labels: ", len(window_labels))

    # merge consecutive windows into segments, carrying the probabilities
    merged_segs, seg_classes, seg_probs = labels_to_segments(
        window_labels, window_probs, step)
    print("prob test: ", len(seg_probs))

    return seg_probs, merged_segs.tolist(), class_names, seg_classes


# if __name__ == "__main__":
#
#     audiofile_path = r"/Users/taanhtuan/Desktop/workproject/basic_audio_analysis-master/data/test.mp3"
#     model_path = r"models/svm_male_female"
#     model_type = "svm_rbf"
#     plot_results = False
#
#     labels, class_names, mt_step, class_probabilities = mid_term_file_classification(audiofile_path, model_path,
#                                                              model_type, plot_results)
#
#     print("labels: ", len(labels))
#     # print "merged" segments (use labels_to_segments())
#     print("\nSegments:")
#     segs, c, probs = labels_to_segments(labels, class_probabilities, mt_step)
#
#     print("segs: ", len(segs))
#     print("prob: ", len(probs))
#
#     for iS, seg in enumerate(segs):
#         if probs[iS] > 0.58:
#             print(f'segment {iS} {seg[0]} sec - {seg[1]} sec: {class_names[int(c[iS])]} pro: {probs[iS]}')
Example #3
0
def FileClassification(input_file, model_name, model_type, gt=False,
                       gt_file=""):
    '''
    Classify each mid-term window of an audio file with a stored model and
    keep only the decisions whose confidence exceeds a per-class threshold.

    TODO: This function needs to be refactored according to the code in
    audioSegmentation.mid_term_file_classification()

    ARGUMENTS:
     - input_file:  path of the audio file to classify
     - model_name:  path of the stored model (loaded with aT.load_model)
     - model_type:  classifier type, forwarded to aT.classifier_wrapper
     - gt:          if truthy, compare the result with the ground truth
                    stored in gt_file
     - gt_file:     path of the ground-truth segmentation file

    RETURNS:
     (-1, -1, -1, -1) when the model file does not exist or the audio file
     cannot be read. Otherwise the tuple (flags_ind, names, acc, cm):
     - flags_ind:   numpy array with one entry per kept window: the
                    predicted class index, or -1 when the confidence was
                    not above that class's threshold
     - names:       the ground-truth class names when a ground-truth file
                    was loaded and acc >= 0, else the model's class names
     - acc:         value returned by aS.plot_segmentation_results
     - cm:          confusion matrix when a ground-truth file was loaded,
                    else an empty list
    '''

    if not os.path.isfile(model_name):
        print("mtFileClassificationError: input model_type not found!")
        return (-1, -1, -1, -1)

    # Load classifier with load_model:
    [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win, st_step,
     compute_beat] = aT.load_model(model_name)

    # Load the input audio stream
    [fs, x] = audioBasicIO.read_audio_file(input_file)
    if fs == -1:  # could not read file
        return (-1, -1, -1, -1)
    x = audioBasicIO.stereo_to_mono(x)  # convert stereo (if) to mono

    # mid-term feature extraction:
    [mt_feats, _, _] = mF.mid_feature_extraction(x, fs, mt_win * fs,
                                                 mt_step * fs,
                                                 round(fs * st_win),
                                                 round(fs * st_step))

    # (predicted label, minimum confidence) pairs: a decision is accepted
    # only if max(P) is strictly greater than the threshold of its label.
    min_confidence = ((0.0, 0.5), (1.0, 0.9), (2.0, 0.6),
                      (3.0, 0.3), (4.0, 0.3))

    flags = []
    flags_ind = []
    for i in range(mt_feats.shape[1]):
        # for each feature vector (i.e. for each fix-sized segment):
        cur_fv = (mt_feats[:, i] - MEAN) / STD
        [res, P] = aT.classifier_wrapper(classifier, model_type, cur_fv)
        threshold = next((thr for label, thr in min_confidence
                          if res == label), None)
        if threshold is None:
            # NOTE(review): windows whose label is not one of 0-4 add no
            # entry at all, so flags/flags_ind can be shorter than the
            # number of windows. Confirm whether this is intended.
            continue
        if numpy.max(P) > threshold:
            flags_ind.append(res)
            flags.append(class_names[int(res)])  # update class label list
        else:
            flags_ind.append(-1)  # rejected: confidence too low
            flags.append('None')
    flags_ind = numpy.array(flags_ind)

    # 1-window smoothing (applied to flags_ind only, not to flags)
    for i in range(1, len(flags_ind) - 1):
        if flags_ind[i - 1] == flags_ind[i + 1]:
            flags_ind[i] = flags_ind[i + 1]

    # convert fix-sized flags to segments and classes
    # NOTE(review): segs and classes are not used after this point.
    (segs, classes) = aS.labels_to_segments(flags, mt_step)
    segs[-1] = len(x) / float(fs)

    # Defaults used when no ground truth is requested or available
    cm = []
    flags_ind_gt = numpy.array([])
    class_names_gt = None

    if gt and os.path.isfile(gt_file):
        # Load ground-truth:
        [seg_start_gt, seg_end_gt, seg_l_gt] = aS.read_segmentation_gt(gt_file)
        flags_gt, class_names_gt = aS.segments_to_labels(seg_start_gt,
                                                         seg_end_gt,
                                                         seg_l_gt, mt_step)
        # "align" labels with GT: map each ground-truth class name to its
        # index in the model's class list, or -1 if the model lacks it
        gt_names = [class_names_gt[fl] for fl in flags_gt]
        flags_ind_gt = numpy.array(
            [class_names.index(name) if name in class_names else -1
             for name in gt_names])
        cm = numpy.zeros((len(class_names_gt), len(class_names_gt)))
        # NOTE(review): -1 entries index the last row/column of cm
        # (negative indexing), and the matrix is sized by the ground-truth
        # class count rather than the model's. Confirm this is intended.
        for i in range(min(flags_ind.shape[0], flags_ind_gt.shape[0])):
            cm[int(flags_ind_gt[i]), int(flags_ind[i])] += 1

    acc = aS.plot_segmentation_results(flags_ind, flags_ind_gt,
                                       class_names, mt_step, False)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc))
        # Only hand back the ground-truth names if they were actually
        # loaded; previously this path could hit an unbound name.
        if class_names_gt is not None:
            return (flags_ind, class_names_gt, acc, cm)
    return (flags_ind, class_names, acc, cm)
Example #4
0
if __name__ == '__main__':
    # Load the recording and collapse it to a single channel.
    input_file = "../data/diarizationExample.wav"
    fs, x = read_audio_file(input_file)
    x = stereo_to_mono(x)

    # Mid-term window/step and short-term window are given in seconds and
    # scaled by fs; the short-term step is half the short-term window.
    mt_size, mt_step, st_win = 1, 0.1, 0.05
    st_win_samples = round(fs * st_win)
    st_step_samples = round(fs * st_win * 0.5)
    [mt_feats, st_feats, _] = mT(x, fs, mt_size * fs, mt_step * fs,
                                 st_win_samples, st_step_samples)
    print(mt_feats.shape)

    # Normalize the mid-term features (transposed in, transposed back out).
    (mt_feats_norm, MEAN, STD) = normalize_features([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T

    # Cluster the mid-term windows with k-means (k = 4).
    n_clusters = 4
    k_means = sklearn.cluster.KMeans(n_clusters=n_clusters)
    k_means.fit(mt_feats_norm.T)
    cls = k_means.labels_
    print(cls.shape)

    # Merge the per-window cluster labels into segment limits.
    segs, c = labels_to_segments(cls, mt_step)

    cmd_template = "ffmpeg -i {} -ss {} -t {} temp.wav -loglevel panic -y"
    for sp in range(n_clusters):  # go through the clusters one by one
        for i, seg_label in enumerate(c):
            if seg_label != sp:
                continue
            seg_start, seg_end = segs[i, 0], segs[i, 1]
            if not (seg_end - seg_start > 0.5):
                continue  # only long segments of the current cluster
            print(seg_label, seg_start, seg_end)
            # Cut the segment, skipping its first second, then play it and
            # wait for a key press before moving on.
            cmd = cmd_template.format(input_file, seg_start + 1,
                                      seg_end - seg_start - 1)
            os.system(cmd)
            os.system("play temp.wav -q")
            readchar.readchar()