Code example #1
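Note: the snippets below were collected from different projects and omit their import lines. As a rough sketch (not part of any original example; aliases vary between snippets), they assume imports along these lines:

import numpy as np
import moviepy.editor as mp                         # moviepy 1.x, for the video-based examples
from pyAudioAnalysis import audioBasicIO as aIO     # also used unaliased as audioBasicIO
from pyAudioAnalysis import MidTermFeatures as aF   # also aliased as aFm, aMF, mF, mtf, MidTermFeatures
from pyAudioAnalysis import audioTrainTest as aT    # also aliased as at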
def extract_time_start(video_path, bip_ref_path="ref_bip_isolated.wav"):
    # features of the ref
    # extract short-term features using 50 msec non-overlapping windows
    fs, s_ref = aIO.read_audio_file(bip_ref_path)
    duration = len(s_ref) / float(fs)
    win, step = 0.05, 0.05
    win_mid, step_mid = duration, 0.5
    mt_ref, st_ref, mt_n_ref = aFm.mid_feature_extraction(
        s_ref, fs, win_mid * fs, step_mid * fs, win * fs, step * fs)
    # extraction on the long signal
    my_clip1 = mp.VideoFileClip(video_path)
    fs = 44100
    s_long = my_clip1.audio.to_soundarray(fps=fs)
    s_long = s_long[:, 0]
    duration_long = len(s_long) / float(fs)

    # extract short-term features using 50 msec non-overlapping windows
    win, step = 0.05, 0.05
    win_mid, step_mid = 0.4, 0.05
    mt_long, st_long, mt_n_long = aFm.mid_feature_extraction(
        s_long, fs, win_mid * fs, step_mid * fs, win * fs, step * fs)

    # compute the distance and get the minimum
    distances = np.linalg.norm(mt_long - mt_ref, axis=0)
    time_start = np.argmin(distances) * duration_long / mt_long.shape[1]
    return time_start
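A possible usage sketch (the video path is hypothetical); since duration_long / mt_long.shape[1] is roughly the mid-term step step_mid, the returned value is the offset, in seconds, of the 0.4 s window that best matches the reference bip:

# hypothetical input video; the reference bip wav is the function's default argument
t0 = extract_time_start("race_video.mp4")
print(f"start bip detected around {t0:.2f} s")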
Code example #2
File: AudioExtractor.py Project: Nikkunemufr/Python
def analysisAudio(vid_uuid, analysis_uuid):
    with open("../../../data/processed/" + str(vid_uuid) + "-" +
              str(analysis_uuid) + "_extracted.interest.csv") as interestfile:
        interest_reader = csv.reader(interestfile, delimiter=',')
        interest_header = next(interest_reader, None)
        minFrame = int(list(next(interest_reader, None))[0])
        test = reversed(list(interest_reader))
        maxFrame = int(list(next(test, None))[0])
    startTime = minFrame / 60
    endTime = maxFrame / 60

    clip = mp.VideoFileClip("../../../data/raw/" + str(vid_uuid) +
                            "/replay.mp4").subclip(startTime, endTime)
    clip.audio.write_audiofile("../../../data/raw/" + str(vid_uuid) + "/" +
                               str(analysis_uuid) + "-audio.wav")

    VIDEOFILE = "../../../data/raw/" + str(vid_uuid) + "/replay.mp4"
    AUDIOFILE = "../../../data/raw/" + str(vid_uuid) + "/" + str(
        analysis_uuid) + "-audio.wav"
    FEATUREFILE = "../../../data/processed/" + str(vid_uuid) + "-" + str(
        analysis_uuid) + "_extracted.ft"
    [Fs, x] = audioBasicIO.read_audio_file(AUDIOFILE)
    x = audioBasicIO.stereo_to_mono(x)

    midF, shortF, midFNames = MidTermFeatures.mid_feature_extraction(
        x, Fs, (1 / 30) * Fs, (1 / 60) * Fs, (1 / 60) * Fs, (1 / 120) * Fs)

    np.save(FEATUREFILE, midF)
    np.savetxt(FEATUREFILE + ".csv",
               midF.T,
               delimiter=",",
               header=",".join(midFNames))
    #%%
    audioAnalysis.thumbnailWrapper(AUDIOFILE, 50)
Code example #3
def test_feature_extraction_segment():
    print("Short-term feature extraction")
    [fs, x] = audioBasicIO.read_audio_file("test_data/5_sec_wav.wav")
    mt, st, mt_names = MidTermFeatures.mid_feature_extraction(
        x, fs, 1 * fs, 1 * fs, 0.05 * fs, 0.05 * fs)
    assert mt.shape[1] == 5, "Wrong number of mid-term windows"
    assert mt.shape[0] == len(mt_names),  "Number of features and feature " \
                                          "names are not the same"
Code example #4
def file_regression(input_file, model_name, model_type):
    # Load classifier:

    if not os.path.isfile(input_file):
        print("fileClassification: wav file not found!")
        return -1, -1, -1

    # originally: regression_models = glob.glob(model_name + "_*")
    # changed so that model_name is already expected to be a list of model paths
    regression_models = model_name
    regression_models2 = []
    for r in regression_models:
        if r[-5::] != "MEANS":
            regression_models2.append(r)
    regression_models = regression_models2
    regression_names = []
    for r in regression_models:
        regression_names.append(r[r.rfind("_") + 1::])

    # FEATURE EXTRACTION
    # LOAD ONLY THE FIRST MODEL (for mt_win, etc)
    if model_type == 'svm' or model_type == "svm_rbf" or \
            model_type == 'randomforest':
        _, _, _, mid_window, mid_step, short_window, short_step, compute_beat \
            = load_model(regression_models[0], True)

    # read audio file and convert to mono
    sampling_rate, signal = audioBasicIO.read_audio_file(input_file)
    signal = audioBasicIO.stereo_to_mono(signal)
    # feature extraction:
    mid_features, s, _ = \
        aF.mid_feature_extraction(signal, sampling_rate,
                                  mid_window * sampling_rate,
                                  mid_step * sampling_rate,
                                  round(sampling_rate * short_window),
                                  round(sampling_rate * short_step))
    # long term averaging of mid-term statistics
    mid_features = mid_features.mean(axis=1)
    if compute_beat:
        beat, beat_conf = aF.beat_extraction(s, short_step)
        mid_features = np.append(mid_features, beat)
        mid_features = np.append(mid_features, beat_conf)

    # REGRESSION
    R = []
    for ir, r in enumerate(regression_models):
        if not os.path.isfile(r):
            print("fileClassification: input model_name not found!")
            return (-1, -1, -1)
        if model_type == 'svm' or model_type == "svm_rbf" \
                or model_type == 'randomforest':
            model, mean, std, _, _, _, _, _ = load_model(r, True)
        curFV = (mid_features - mean) / std  # normalization
        R.append(regression_wrapper(model, model_type,
                                    curFV))  # classification
    return R, regression_names
Code example #5
File: audioTrainTest.py Project: 799609164/VAD
def fileRegression(inputFile, model_name, model_type):
    # Load classifier:

    if not os.path.isfile(inputFile):
        print("fileClassification: wav file not found!")
        return (-1, -1, -1)

    regression_models = glob.glob(model_name + "_*")
    regression_models2 = []
    for r in regression_models:
        if r[-5::] != "MEANS":
            regression_models2.append(r)
    regression_models = regression_models2
    regression_names = []
    for r in regression_models:
        regression_names.append(r[r.rfind("_") + 1::])

    # FEATURE EXTRACTION
    # LOAD ONLY THE FIRST MODEL (for mt_win, etc)
    if model_type == 'svm' or model_type == "svm_rbf" or \
            model_type == 'randomforest':
        [_, _, _, mt_win, mt_step, st_win, st_step, compute_beat] = \
            load_model(regression_models[0], True)

    # read audio file and convert to mono
    [Fs, x] = audioBasicIO.read_audio_file(inputFile)
    x = audioBasicIO.stereo_to_mono(x)
    # feature extraction:
    [mt_features, s, _] = aF.mid_feature_extraction(x, Fs, mt_win * Fs,
                                                    mt_step * Fs,
                                                    round(Fs * st_win),
                                                    round(Fs * st_step))
    # long term averaging of mid-term statistics
    mt_features = mt_features.mean(axis=1)
    if compute_beat:
        [beat, beatConf] = aF.beat_extraction(s, st_step)
        mt_features = np.append(mt_features, beat)
        mt_features = np.append(mt_features, beatConf)

    # REGRESSION
    R = []
    for ir, r in enumerate(regression_models):
        if not os.path.isfile(r):
            print("fileClassification: input model_name not found!")
            return (-1, -1, -1)
        if model_type == 'svm' or model_type == "svm_rbf" \
                or model_type == 'randomforest':
            [model, MEAN, STD, mt_win, mt_step, st_win, st_step,
             compute_beat] = load_model(r, True)
        curFV = (mt_features - MEAN) / STD  # normalization
        R.append(regressionWrapper(model, model_type, curFV))  # classification
    return R, regression_names
Code example #6
def features(file_path):
    fs, s = aIO.read_audio_file(file_path)
    m_win, m_step, s_win, s_step = 1, 1, 0.1, 0.05
    mid_features, short_features, mid_feature_names = aF.mid_feature_extraction(
        s, fs, round(fs * m_win), round(fs * m_step), round(fs * s_win),
        round(fs * s_step))
    mid_features = np.transpose(mid_features).mean(axis=0)
    beat, beat_conf = aF.beat_extraction(short_features, s_step)
    mid_features = np.append(mid_features, beat)
    mid_features = np.append(mid_features, beat_conf)
    mid_feature_names.append('beat')
    mid_feature_names.append('beat_conf')
    return mid_features, mid_feature_names
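A usage sketch with a hypothetical file path, pairing the returned vector with its feature names:

feats, names = features("some_track.wav")   # hypothetical path
feature_dict = dict(zip(names, feats))      # e.g. feature_dict['spectral_centroid_mean'], feature_dict['beat']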
Code example #7
File: afe_exp.py Project: yt7589/mgs
def exp3():
    fs, s = aIO.read_audio_file(AfeExp.wav_file)
    mt, st, mt_n = aMF.mid_feature_extraction(s, fs, 1 * fs, 1 * fs,
                                              0.05 * fs, 0.05 * fs)
    print(f'signal duration {len(s)/fs} seconds')
    print(
        f'{st.shape[1]} {st.shape[0]}-D short-term feature vectors extracted'
    )
    print(
        f'{mt.shape[1]} {mt.shape[0]}-D segment feature statistic vectors extracted'
    )
    print('mid-term feature names')
    for i, mi in enumerate(mt_n):
        print(f'{i}:{mi}')
Code example #8
def hmmSegmentation(wav_file_name,
                    hmm_model_name,
                    plot_res=False,
                    gt_file_name=""):
    [fs, x] = audioBasicIO.read_audio_file(wav_file_name)
    try:
        fo = open(hmm_model_name, "rb")
    except IOError:
        print("didn't find file")
        return

    try:
        hmm = cPickle.load(fo)
        classes_all = cPickle.load(fo)
        mt_win = cPickle.load(fo)
        mt_step = cPickle.load(fo)
    except Exception:
        # the model file could not be unpickled: close it and bail out
        fo.close()
        return
    fo.close()

    [Features, _, _] = aF.mid_feature_extraction(x, fs,
                                                 mt_win * fs, mt_step * fs,
                                                 round(fs * 0.050),
                                                 round(fs * 0.050))
    flags_ind = hmm.predict(Features.T)  # apply model
    if os.path.isfile(gt_file_name):
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file_name)
        flags_gt, class_names_gt = segs2flags(seg_start, seg_end, seg_labs,
                                              mt_step)
        flagsGTNew = []
        for j, fl in enumerate(flags_gt):
            # "align" labels with GT
            if class_names_gt[flags_gt[j]] in classes_all:
                flagsGTNew.append(
                    classes_all.index(class_names_gt[flags_gt[j]]))
            else:
                flagsGTNew.append(-1)
        cm = np.zeros((len(classes_all), len(classes_all)))
        flags_ind_gt = np.array(flagsGTNew)
        for i in range(min(flags_ind.shape[0], flags_ind_gt.shape[0])):
            cm[int(flags_ind_gt[i]), int(flags_ind[i])] += 1
    else:
        flags_ind_gt = np.array([])
    acc = plotSegmentationResults(flags_ind, flags_ind_gt, classes_all,
                                  mt_step, not plot_res)
    if acc >= 0:
        print("Overall Accuracy: {0:.2f}".format(acc))
        return (flags_ind, class_names_gt, acc, cm)
    else:
        return (flags_ind, classes_all, -1, -1)
Code example #9
File: audioTrainTest.py Project: 799609164/VAD
def fileClassification(inputFile, model_name, model_type):
    # Load classifier:

    if not os.path.isfile(model_name):
        print("fileClassification: input model_name not found!")
        return (-1, -1, -1)

    if not os.path.isfile(inputFile):
        print("fileClassification: wav file not found!")
        return (-1, -1, -1)

    if model_type == 'knn':
        [
            classifier, MEAN, STD, classNames, mt_win, mt_step, st_win,
            st_step, compute_beat
        ] = load_model_knn(model_name)
    else:
        [
            classifier, MEAN, STD, classNames, mt_win, mt_step, st_win,
            st_step, compute_beat
        ] = load_model(model_name)

    # read audio file and convert to mono
    [Fs, x] = audioBasicIO.read_audio_file(inputFile)
    x = audioBasicIO.stereo_to_mono(x)

    if Fs == 0:
        # audio file IO problem
        return -1, -1, -1
    if x.shape[0] / float(Fs) <= mt_win:
        return -1, -1, -1

    # feature extraction:
    [mt_features, s, _] = aF.mid_feature_extraction(x, Fs, mt_win * Fs,
                                                    mt_step * Fs,
                                                    round(Fs * st_win),
                                                    round(Fs * st_step))
    # long term averaging of mid-term statistics
    mt_features = mt_features.mean(axis=1)
    if compute_beat:
        [beat, beatConf] = aF.beat_extraction(s, st_step)
        mt_features = np.append(mt_features, beat)
        mt_features = np.append(mt_features, beatConf)
    curFV = (mt_features - MEAN) / STD  # normalization

    # classification
    [Result, P] = classifierWrapper(classifier, model_type, curFV)
    return Result, P, classNames
Code example #10
File: feature_extraction.py Project: tyiannak/readys
    def extract_segment_features(self, filenames):
        """
        Extract segment features using pyAudioAnalysis

        Parameters
        ----------

        filenames :
            List of input audio filenames

        basic_features_params (read from self, not passed in):
            Dictionary of parameters to consider.
            It must contain:
                - mid_window: mid-term (segment) window size
                - mid_step: mid-term (segment) window step
                - short_window: short-term window size
                - short_step: short-term window step

        Returns
        -------

        segment_features_all:
            List of stats on segment features
        feature_names:
            List of feature names

        """
        print("--> Extracting audio features")
        segment_features_all = []

        sequences, sampling_rate = self.read_files(filenames)

        mid_window = self.basic_features_params['mid_window']
        mid_step = self.basic_features_params['mid_step']
        short_window = self.basic_features_params['short_window']
        short_step = self.basic_features_params['short_step']

        for seq in sequences:
            (segment_features_stats, segment_features,
             feature_names) = aF.mid_feature_extraction(
                seq, sampling_rate, round(mid_window * sampling_rate),
                round(mid_step * sampling_rate),
                round(sampling_rate * short_window),
                round(sampling_rate * short_step))
            segment_features_stats = np.asarray(segment_features_stats)
            segment_features_all.append(segment_features_stats)

        return segment_features_all, feature_names
Code example #11
def file_classification(input_file, model_name, model_type):
    # Load classifier:

    if not os.path.isfile(model_name):
        print("fileClassification: input model_name not found!")
        return -1, -1, -1

    if not os.path.isfile(input_file):
        print("fileClassification: wav file not found!")
        return -1, -1, -1

    if model_type == 'knn':
        classifier, mean, std, classes, mid_window, mid_step, short_window, \
            short_step, compute_beat = load_model_knn(model_name)
    else:
        classifier, mean, std, classes, mid_window, mid_step, short_window, \
            short_step, compute_beat = load_model(model_name)

    # read audio file and convert to mono
    sampling_rate, signal = audioBasicIO.read_audio_file(input_file)
    signal = audioBasicIO.stereo_to_mono(signal)

    if sampling_rate == 0:
        # audio file IO problem
        return -1, -1, -1
    if signal.shape[0] / float(sampling_rate) <= mid_window:
        return -1, -1, -1

    # feature extraction:
    mid_features, s, _ = \
        aF.mid_feature_extraction(signal, sampling_rate,
                                  mid_window * sampling_rate,
                                  mid_step * sampling_rate,
                                  round(sampling_rate * short_window),
                                  round(sampling_rate * short_step))
    # long term averaging of mid-term statistics
    mid_features = mid_features.mean(axis=1)
    if compute_beat:
        beat, beat_conf = aF.beat_extraction(s, short_step)
        mid_features = np.append(mid_features, beat)
        mid_features = np.append(mid_features, beat_conf)
    feature_vector = (mid_features - mean) / std  # normalization

    # classification
    class_id, probability = classifier_wrapper(classifier, model_type,
                                               feature_vector)
    return class_id, probability, classes
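A hedged usage sketch; the model path and type are placeholders for whatever a pyAudioAnalysis training run (e.g. audioTrainTest.extract_features_and_train) produced:

# "svm_speech_music" is a hypothetical model file produced by a previous training step
class_id, probabilities, class_names = file_classification("query.wav",
                                                           "svm_speech_music",
                                                           "svm")
if class_id != -1:
    print(class_names[int(class_id)], probabilities)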
Code example #12
File: audio_features_paa.py Project: x75/smp_audio
def compute_features_paa(filename, with_timebase=False, verbose=False):
    """compute_features_paa

    Compute a bag of standard audio features to be used for some
    downstream task.
    """
    if verbose:
        print('compute_features_paa loading from {0}'.format(filename))
    [Fs, x_] = audioBasicIO.read_audio_file(filename)
    if verbose:
        print('compute_features_paa: loaded {1} samples from {0}'.format(
            filename, x_.shape))
    if len(x_.shape) > 1 and x_.shape[1] > 1:
        x = audioBasicIO.stereo_to_mono(x_)
    else:
        x = x_
    x_duration = x.shape[0] / Fs
    if verbose:
        print(f'compute_features_paa: {x_duration} seconds of audio at {Fs}Hz')

    mt_win = 1.0 * Fs
    mt_step = 0.5 * Fs
    st_win = 0.050 * Fs
    st_step = 0.025 * Fs
    # F, F_names = audioFeatureExtraction.stFeatureExtraction(x, Fs, st_win, st_step)
    # G, F, F_names = audioFeatureExtraction.mtFeatureExtraction(x, Fs, mt_win, mt_step, st_win, st_step)
    G, F, F_names = mF.mid_feature_extraction(x, Fs, mt_win, mt_step, st_win,
                                              st_step)

    if with_timebase:
        G_time = np.linspace(0, G.shape[1] * 0.5, G.shape[1] + 1)
        F_time = np.linspace(0, F.shape[1] * 0.025, F.shape[1] + 1)
    else:
        G_time = None
        F_time = None

    if verbose:
        print(f'compute_features_paa: F = {F.shape} {F}')
        print(f'compute_features_paa:     {F_time}')
        print(f'compute_features_paa: G = {G.shape} {G}')
        print(f'compute_features_paa:     {G_time}')

    if with_timebase:
        return F, F_names, G, F_time, G_time
    else:
        return F, F_names, G
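Usage sketch with a hypothetical filename; with with_timebase=True the two extra return values are the mid-term and short-term time axes:

F, F_names, G, F_time, G_time = compute_features_paa("loop.wav", with_timebase=True)
print(G.shape)   # (num_mid_features, num_mid_windows), one column per 0.5 s mid-term step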
Code example #13
def extract_afs(wav_file):
    fs, s = aIO.read_audio_file(wav_file)
    mt, st, mt_n = aMF.mid_feature_extraction(s, fs, 1 * fs, 1 * fs,
                                              0.05 * fs, 0.05 * fs)
    '''
    print(f'signal duration {len(s)/fs} seconds')
    print(f'{st.shape[1]} {st.shape[0]}-D short-term feature vectors extracted')
    print(f'{mt.shape[1]} {mt.shape[0]}-D segment feature statistic vectors extracted')
    print('mid-term feature names')
    for i, mi in enumerate(mt_n):
        print(f'{i}:{mi}')
    '''
    mtf = np.mean(mt, axis=1)
    feats = np.array([
        mtf[mt_n.index('spectral_centroid_mean')],
        mtf[mt_n.index('energy_entropy_mean')]
    ])
    return feats
Code example #14
def train_hmm_from_file(wav_file, gt_file, hmm_model_name, mid_window,
                        mid_step):
    """
    This function trains an HMM model for segmentation-classification
    using a single annotated audio file
    ARGUMENTS:
     - wav_file:        the path of the audio file
     - gt_file:         the path of the ground-truth file
                       (a csv file of the form <segment start in seconds>,
                       <segment end in seconds>,<segment label> in each row)
     - hmm_model_name:   the name of the HMM model to be stored
     - mid_window:      mid-term window size
     - mid_step:        mid-term window step
    RETURNS:
     - hmm:            an object to the resulting HMM
     - class_names:     a list of class_names

    After training, hmm, class_names, along with the mid_window and mid_step
    values are stored in the hmm_model_name file
    """

    seg_start, seg_end, seg_labs = read_segmentation_gt(gt_file)
    flags, class_names = segments_to_labels(seg_start, seg_end, seg_labs,
                                            mid_step)
    sampling_rate, signal = audioBasicIO.read_audio_file(wav_file)
    features, _, _ = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mid_window * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * 0.050),
                                   round(sampling_rate * 0.050))
    class_priors, transition_matrix, means, cov = \
        train_hmm_compute_statistics(features, flags)
    hmm = hmmlearn.hmm.GaussianHMM(class_priors.shape[0], "diag")

    hmm.covars_ = cov
    hmm.means_ = means
    hmm.startprob_ = class_priors
    hmm.transmat_ = transition_matrix

    save_hmm(hmm_model_name, hmm, class_names, mid_window, mid_step)

    return hmm, class_names
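A sketch of the expected ground-truth file (one <start>,<end>,<label> row per segment, as the docstring describes) and a training call with hypothetical paths:

# contents of a hypothetical "interview.segments" file:
#   0.0,11.5,speech
#   11.5,14.2,music
#   14.2,30.0,speech
hmm, class_names = train_hmm_from_file("interview.wav", "interview.segments",
                                        "hmm_interview", 1.0, 1.0)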
Code example #15
def mid_term_feat_extraction(wav_file_path):

    sampling_rate, signal = audioBasicIO.read_audio_file(wav_file_path)
    if sampling_rate == 0:
        print('Sampling rate not correct.')
        return None

    signal = audioBasicIO.stereo_to_mono(signal)
    if signal.shape[0] < float(sampling_rate) / 5:
        print("The duration of the audio is too short.")
        return None

    mid_window, mid_step, short_window, short_step = 0.5, 0.5, 0.05, 0.05
    mid_features, _, mid_feature_names = MidTermFeatures.mid_feature_extraction(
        signal, sampling_rate, round(mid_window * sampling_rate),
        round(mid_step * sampling_rate), round(sampling_rate * short_window),
        round(sampling_rate * short_step))
    mid_features = np.transpose(mid_features)
    mid_features = mid_features.mean(axis=0)
    # long term averaging of mid-term statistics
    if (not np.isnan(mid_features).any()) and (
            not np.isinf(mid_features).any()):
        #print('Mid-Terms features extracted correctly.')
        mid_dict = dict(zip(mid_feature_names, mid_features))
        mid_df = pd.DataFrame([mid_dict.values()], columns=mid_dict.keys())

        # Smile library audio extraction
        smile = opensmile.Smile(
            feature_set=opensmile.FeatureSet.eGeMAPSv01b,
            feature_level=opensmile.FeatureLevel.Functionals,
        )
        smile_features = smile.process_signal(signal, sampling_rate)
        smile_df = pd.DataFrame(smile_features).reset_index().iloc[:, 2:]

        final_df = pd.concat([mid_df, smile_df], axis=1)

        #excel_path = wav_file_path.strip('.') + 'features_extracted.xlsx'
        #final_df.to_excel(excel_path)
        return final_df
    else:
        #print('Mid-Terms features extracted incorrectly.')
        return None
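Usage sketch (hypothetical path); note that this helper needs opensmile and pandas in addition to pyAudioAnalysis:

features_df = mid_term_feat_extraction("speech_sample.wav")   # hypothetical file
if features_df is not None:
    print(features_df.shape)   # one row: pyAudioAnalysis mid-term stats + eGeMAPS functionals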
Code example #16
def trainHMM_fromFile(wav_file, gt_file, hmm_model_name, mt_win, mt_step):
    """
    This function trains an HMM model for segmentation-classification
    using a single annotated audio file
    ARGUMENTS:
     - wav_file:        the path of the audio file
     - gt_file:         the path of the ground-truth file
                       (a csv file of the form <segment start in seconds>,
                       <segment end in seconds>,<segment label> in each row)
     - hmm_model_name:   the name of the HMM model to be stored
     - mt_win:          mid-term window size
     - mt_step:         mid-term window step
    RETURNS:
     - hmm:            an object to the resulting HMM
     - class_names:     a list of class_names

    After training, hmm, class_names, along with the mt_win and mt_step
    values are stored in the hmm_model_name file
    """

    [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
    flags, class_names = segs2flags(seg_start, seg_end, seg_labs, mt_step)
    [fs, x] = audioBasicIO.read_audio_file(wav_file)
    [F, _, _] = aF.mid_feature_extraction(x, fs, mt_win * fs, mt_step * fs,
                                          round(fs * 0.050), round(fs * 0.050))
    start_prob, transmat, means, cov = trainHMM_computeStatistics(F, flags)
    hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")

    hmm.startprob_ = start_prob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov

    fo = open(hmm_model_name, "wb")
    cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(class_names, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mt_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mt_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()

    return hmm, class_names
Code example #17
def hmm_segmentation(audio_file, hmm_model_name, plot_results=False,
                     gt_file=""):
    sampling_rate, signal = audioBasicIO.read_audio_file(audio_file)

    with open(hmm_model_name, "rb") as f_handle:
        hmm = cpickle.load(f_handle)
        class_names = cpickle.load(f_handle)
        mid_window = cpickle.load(f_handle)
        mid_step = cpickle.load(f_handle)

    features, _, _ = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mid_window * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * 0.050),
                                   round(sampling_rate * 0.050))

    # apply model
    labels = hmm.predict(features.T)
    labels_gt, class_names_gt, accuracy, cm = \
        load_ground_truth(gt_file, labels, class_names, mid_step, plot_results)
    return labels, class_names, accuracy, cm
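Applying a stored model, sketched with hypothetical file names (the model file is assumed to be a pickle written by one of the HMM training functions above):

labels, class_names, accuracy, cm = hmm_segmentation("new_recording.wav",
                                                     "hmm_interview",
                                                     plot_results=False)
print([class_names[int(l)] for l in labels[:10]])   # first ten predicted segment labels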
Code example #18
[
    classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1,
    computeBEAT1
] = aT.load_model_knn(
    os.path.join(os.path.dirname(os.path.realpath(__file__)), "data/models",
                 "knn_speaker_10"))
[
    classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2,
    computeBEAT2
] = aT.load_model_knn(
    os.path.join(os.path.dirname(os.path.realpath(__file__)), "data/models",
                 "knn_speaker_male_female"))

[mt_feats, st_feats, _] = aF.mid_feature_extraction(x, fs, mt_size * fs,
                                                    mt_step * fs,
                                                    round(fs * st_win),
                                                    round(fs * st_win * 0.5))

MidTermFeatures2 = np.zeros(
    (mt_feats.shape[0] + len(classNames1) + len(classNames2),
     mt_feats.shape[1]))

for i in range(mt_feats.shape[1]):
    cur_f1 = (mt_feats[:, i] - MEAN1) / STD1
    cur_f2 = (mt_feats[:, i] - MEAN2) / STD2
    [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
    [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
    MidTermFeatures2[0:mt_feats.shape[0], i] = mt_feats[:, i]
    MidTermFeatures2[mt_feats.shape[0]:mt_feats.shape[0] + len(classNames1),
                     i] = P1 + 0.0001
    MidTermFeatures2[mt_feats.shape[0] + len(classNames1)::, i] = P2 + 0.0001
Code example #19
import plotly.graph_objs as go
import plotly
import wavio
from pyAudioAnalysis import audioBasicIO as aIO       # used below as aIO
from pyAudioAnalysis import MidTermFeatures as aFm    # used below as aFm


# extraction on the ref
fs, s = aIO.read_audio_file("wav/ref_bip.wav")
s_ref = s[3000:18000, 0]
print(fs, s_ref.shape)
duration = len(s_ref) / float(fs)
print(f'duration = {duration} seconds')

# extract short-term features using 50 msec non-overlapping windows
win, step = 0.05, 0.05
win_mid, step_mid = duration, 0.5
mt_ref, st_ref, mt_n_ref = aFm.mid_feature_extraction(s_ref, fs, win_mid * fs, step_mid * fs,
                                         win * fs, step * fs)
# print(f'signal duration {len(s)/fs} seconds')
# print(f'{st.shape[1]} {st.shape[0]}-D short-term feature vectors extracted')
# print(f'{mt.shape[1]} {mt.shape[0]}-D segment feature statistic vectors extracted')
# print('mid-term feature names')
# for i, mi in enumerate(mt_n):
#     print(f'{i}:{mi}')

# extraction on the long signal
audio_to_analyse = "50_brasse_stevens.wav"
# fs, s_long = aIO.read_audio_file("wav/200_4n_dames_finaleA_f122020_gauche_lowered.wav") # 1.9
# fs, s_long = aIO.read_audio_file("wav/50_dos_dames_finaleA_f122020_gauche_lowered.wav") # 6
fs, s_long = aIO.read_audio_file("wav/" + audio_to_analyse)
s = s_long[:, 0]
print(fs, s.shape)
duration_long = len(s) / float(fs)
Code example #20
def trainHMM_fromDir(dirPath, hmm_model_name, mt_win, mt_step):
    """
    This function trains an HMM model for segmentation-classification using
    a directory where WAV files and .segments (ground-truth) files are stored
    ARGUMENTS:
     - dirPath:        the path of the data directory
     - hmm_model_name:    the name of the HMM model to be stored
     - mt_win:        mid-term window size
     - mt_step:        mid-term window step
    RETURNS:
     - hmm:            an object to the resulting HMM
     - class_names:        a list of class_names

    After training, hmm, class_names, along with the mt_win
    and mt_step values are stored in the hmm_model_name file
    """

    flags_all = np.array([])
    classes_all = []

    for i, f in enumerate(glob.glob(dirPath + os.sep + '*.wav')):
        # for each WAV file
        wav_file = f
        gt_file = f.replace('.wav', '.segments')
        if not os.path.isfile(gt_file):
            continue
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags, class_names = segs2flags(seg_start, seg_end, seg_labs, mt_step)
        for c in class_names:
            # update class names:
            if c not in classes_all:
                classes_all.append(c)
        [fs, x] = audioBasicIO.read_audio_file(wav_file)
        [F, _, _] = aF.mid_feature_extraction(x, fs, mt_win * fs,
                                              mt_step * fs, round(fs * 0.050),
                                              round(fs * 0.050))

        lenF = F.shape[1]
        lenL = len(flags)
        min_sm = min(lenF, lenL)
        F = F[:, 0:min_sm]
        flags = flags[0:min_sm]

        flagsNew = []
        for j, fl in enumerate(flags):      # append features and labels
            flagsNew.append(classes_all.index(class_names[flags[j]]))

        flags_all = np.append(flags_all, np.array(flagsNew))

        if i == 0:
            f_all = F
        else:
            f_all = np.concatenate((f_all, F), axis=1)

    # compute HMM statistics
    start_prob, transmat, means, cov = trainHMM_computeStatistics(f_all,
                                                                  flags_all)
    # train the HMM
    hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
    hmm.startprob_ = start_prob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov

    fo = open(hmm_model_name, "wb")   # save HMM model
    cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(classes_all, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mt_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mt_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()

    return hmm, classes_all
Code example #21
File: QE_audioUtils.py Project: rebotlucion/QuickEmo
def QE_speaker_diarization(
        sampling_rate,
        signal,
        n_speakers,
        classifier_all,
        mean_all,
        std_all,
        class_names_all,
        classifier_fm,
        mean_fm,
        std_fm,
        class_names_fm,  # models loaded from above and passed in as arguments
        mid_window=2.0,
        mid_step=0.2,
        short_window=0.05,
        lda_dim=35,
        plot_res=False):
    """
    ARGUMENTS:
        - sampling_rate, signal:   the audio data to be analyzed  #QE_: adapted here to receive the data directly instead of a filename
        - n_speakers       the number of speakers (clusters) in
                           the recording (<=0 for unknown)
        - mid_window (opt)    mid-term window size
        - mid_step (opt)    mid-term window step
        - short_window  (opt)    short-term window size
        - lda_dim (opt)    LDA dimension (0 for no LDA)
        - plot_res         (opt)   0 for not plotting the results 1 for plotting

    """
    """
        Other options to explore for diarization
        https://hackernoon.com/speaker-diarization-the-squad-way-2205e0accbda
        https://github.com/YongyuG/s4d-diarization-gao/blob/master/s4d/diar.py, looks very promising: https://pypi.org/project/s4d/ https://projets-lium.univ-lemans.fr/s4d/
        https://medium.com/datadriveninvestor/speaker-diarization-22121f1264b1
        https://arxiv.org/pdf/2005.08072v1.pdf
        https://github.com/calclavia/tal-asrd
        https://github.com/josepatino/pyBK
        https://github.com/wq2012/awesome-diarization
        https://www.researchgate.net/publication/221480626_The_Detection_of_Overlapping_Speech_with_Prosodic_Features_for_Speaker_Diarization


    """
    # sampling_rate, signal = audioBasicIO.read_audio_file(filename)  # not needed, since they are passed in as arguments instead of a filename
    signal = audioBasicIO.stereo_to_mono(
        signal)  # drop this if the signal is already guaranteed to be mono
    duration = len(signal) / sampling_rate
    """
    #QE_: to avoid reloading the models on every call, they are loaded once at a
    # higher level and passed in as arguments; so this part was moved to the module above, QE_main:
    base_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 
                            "data/models")

    classifier_all, mean_all, std_all, class_names_all, _, _, _, _, _ = \
        at.load_model_knn(os.path.join(base_dir, "knn_speaker_10"))
    classifier_fm, mean_fm, std_fm, class_names_fm, _, _, _, _,  _ = \
        at.load_model_knn(os.path.join(base_dir, "knn_speaker_male_female"))
    """

    mid_feats, st_feats, _ = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mid_window * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * short_window),
                                   round(sampling_rate * short_window * 0.5))

    mid_term_features = np.zeros(
        (mid_feats.shape[0] + len(class_names_all) + len(class_names_fm),
         mid_feats.shape[1]))

    for index in range(mid_feats.shape[1]):
        feature_norm_all = (mid_feats[:, index] - mean_all) / std_all
        feature_norm_fm = (mid_feats[:, index] - mean_fm) / std_fm
        _, p1 = at.classifier_wrapper(classifier_all, "knn", feature_norm_all)
        _, p2 = at.classifier_wrapper(classifier_fm, "knn", feature_norm_fm)
        start = mid_feats.shape[0]
        end = mid_feats.shape[0] + len(class_names_all)
        mid_term_features[0:mid_feats.shape[0], index] = mid_feats[:, index]
        mid_term_features[start:end, index] = p1 + 1e-4
        mid_term_features[end::, index] = p2 + 1e-4

    mid_feats = mid_term_features  # TODO
    feature_selected = [
        8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45,
        46, 47, 48, 49, 50, 51, 52, 53
    ]

    mid_feats = mid_feats[feature_selected, :]

    mid_feats_norm, mean, std = at.normalize_features([mid_feats.T])
    mid_feats_norm = mid_feats_norm[0].T
    n_wins = mid_feats.shape[1]

    # remove outliers:
    dist_all = np.sum(distance.squareform(distance.pdist(mid_feats_norm.T)),
                      axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.2 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    # EnergyMin = np.min(mt_feats[1,:])
    # EnergyMean = np.mean(mt_feats[1,:])
    # Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    # i_non_outliers = np.nonzero(mt_feats[1,:] > Thres)[0]
    # print i_non_outliers

    mt_feats_norm_or = mid_feats_norm
    mid_feats_norm = mid_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:

        # extract mid-term features with minimum step:
        window_ratio = int(round(mid_window / short_window))
        step_ratio = int(round(short_window / short_window))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        for index in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        # for each of the short-term features:
        for index in range(num_of_features):
            cur_pos = 0
            feat_len = len(st_feats[index])
            while cur_pos < feat_len:
                n1 = cur_pos
                n2 = cur_pos + window_ratio
                if n2 > feat_len:
                    n2 = feat_len
                short_features = st_feats[index][n1:n2]
                mt_feats_to_red[index].append(np.mean(short_features))
                mt_feats_to_red[index + num_of_features].\
                    append(np.std(short_features))
                cur_pos += step_ratio
        mt_feats_to_red = np.array(mt_feats_to_red)
        mt_feats_to_red_2 = np.zeros(
            (mt_feats_to_red.shape[0] + len(class_names_all) +
             len(class_names_fm), mt_feats_to_red.shape[1]))
        limit = mt_feats_to_red.shape[0] + len(class_names_all)
        for index in range(mt_feats_to_red.shape[1]):
            feature_norm_all = (mt_feats_to_red[:, index] - mean_all) / std_all
            feature_norm_fm = (mt_feats_to_red[:, index] - mean_fm) / std_fm
            _, p1 = at.classifier_wrapper(classifier_all, "knn",
                                          feature_norm_all)
            _, p2 = at.classifier_wrapper(classifier_fm, "knn",
                                          feature_norm_fm)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0], index] = \
                mt_feats_to_red[:, index]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:limit,
                              index] = p1 + 1e-4
            mt_feats_to_red_2[limit::, index] = p2 + 1e-4
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[feature_selected, :]
        mt_feats_to_red, mean, std = at.normalize_features([mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        labels = np.zeros((mt_feats_to_red.shape[1], ))
        lda_step = 1.0
        lda_step_ratio = lda_step / short_window
        for index in range(labels.shape[0]):
            labels[index] = int(index * short_window / lda_step_ratio)
        clf = sklearn.discriminant_analysis.\
            LinearDiscriminantAnalysis(n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, labels)
        mid_feats_norm = (clf.transform(mid_feats_norm.T)).T

    if n_speakers <= 0:
        # QE_: adapt in this case to a range of 1-10? We are going to use this
        # diarization on short windows, 250-500 ms
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]

    cluster_labels = []
    sil_all = []
    cluster_centers = []

    for speakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=speakers)
        k_means.fit(mid_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        cluster_labels.append(cls)
        cluster_centers.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(speakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mid_feats_norm[:, cls == c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                dist = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(np.mean(dist) * clust_per_cent)
                sil_temp = []
                for c2 in range(speakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = np.nonzero(cls == c2)[0].shape[0] /\
                                           float(len(cls))
                        mid_features_temp = mid_feats_norm[:, cls == c2]
                        dist = distance.cdist(mt_feats_norm_temp.T,
                                              mid_features_temp.T)
                        sil_temp.append(
                            np.mean(dist) *
                            (clust_per_cent + clust_per_cent_2) / 2.0)
                sil_temp = np.array(sil_temp)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(sil_temp))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        sil = []
        for c in range(speakers):
            # for each cluster (speaker) compute silhouette
            sil.append(
                (sil_2[c] - sil_1[c]) / (max(sil_2[c], sil_1[c]) + 1e-5))
        # keep the AVERAGE SILLOUETTE
        sil_all.append(np.mean(sil))

    imax = int(np.argmax(sil_all))
    # optimal number of clusters
    num_speakers = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = np.zeros((n_wins, ))
    for index in range(n_wins):
        j = np.argmin(np.abs(index - i_non_outliers))
        cls[index] = cluster_labels[imax][j]

    # Post-process method 1: hmm smoothing
    for index in range(1):
        # hmm training
        start_prob, transmat, means, cov = \
            train_hmm_compute_statistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    class_names = ["speaker{0:d}".format(c) for c in range(num_speakers)]

    # load ground-truth if available: not applicable in this adapted version,
    # since the signal is passed in directly and there is no filename to map to
    # a .segments file (flags_gt was only used by the plotting code kept below)
    # gt_file = filename.replace('.wav', '.segments')
    # if os.path.isfile(gt_file):
    #     seg_start, seg_end, seg_labs = read_segmentation_gt(gt_file)
    #     flags_gt, class_names_gt = segments_to_labels(seg_start, seg_end,
    #                                                   seg_labs, mid_step)
    """
    if plot_res:
        fig = plt.figure()    
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(np.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(np.array(range(len(cls))) * mid_step + mid_step / 2.0, cls)

    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(np.array(range(len(flags_gt))) *
                     mid_step + mid_step / 2.0, flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
            evaluate_speaker_diarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(100 * purity_cluster_m,
                                                        100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's sillouette")
        plt.show()
        """

    return cls
Code example #22
def mtFileClassification(input_file, model_name, model_type,
                         plot_results=False, gt_file=""):
    """
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used,
    i.e. a pre-trained classifier.
    ARGUMENTS:
        - input_file:        path of the input WAV file
        - model_name:        name of the classification model
        - model_type:        svm or knn depending on the classifier type
        - plot_results:      True if results are to be plotted using
                             matplotlib along with a set of statistics

    RETURNS:
          - segs:           a sequence of segment's endpoints: segs[i] is the
                            endpoint of the i-th segment (in seconds)
          - classes:        a sequence of class flags: class[i] is the
                            class ID of the i-th segment
    """

    if not os.path.isfile(model_name):
        print("mtFileClassificationError: input model_type not found!")
        return (-1, -1, -1, -1)
    # Load classifier:
    if model_type == "knn":
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win, st_step,
         compute_beat] = aT.load_model_knn(model_name)
    else:
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win, st_step,
         compute_beat] = aT.load_model(model_name)

    if compute_beat:
        print("Model " + model_name + " contains long-term music features "
                                      "(beat etc) and cannot be used in "
                                      "segmentation")
        return (-1, -1, -1, -1)
    [fs, x] = audioBasicIO.read_audio_file(input_file) # load input file
    if fs == -1:  # could not read file
        return (-1, -1, -1, -1)
    x = audioBasicIO.stereo_to_mono(x)  # convert stereo (if) to mono
    # mid-term feature extraction:
    [mt_feats, _, _] = aF.mid_feature_extraction(x, fs, mt_win * fs,
                                                 mt_step * fs,
                                                 round(fs * st_win),
                                                 round(fs * st_step))
    flags = []
    Ps = []
    flags_ind = []
    # for each feature vector (i.e. for each fix-sized segment):
    for i in range(mt_feats.shape[1]):
        cur_fv = (mt_feats[:, i] - MEAN) / STD  # normalize current feature v
        # classify vector:
        [res, P] = aT.classifierWrapper(classifier, model_type, cur_fv)
        flags_ind.append(res)
        flags.append(class_names[int(res)])  # update class label matrix
        Ps.append(np.max(P))   # update probability matrix
    flags_ind = np.array(flags_ind)

    # 1-window smoothing
    for i in range(1, len(flags_ind) - 1):
        if flags_ind[i-1] == flags_ind[i + 1]:
            flags_ind[i] = flags_ind[i + 1]
    # convert fix-sized flags to segments and classes
    (segs, classes) = flags2segs(flags, mt_step)
    segs[-1] = len(x) / float(fs)

    # Load ground-truth:
    if os.path.isfile(gt_file):
        [seg_start_gt, seg_end_gt, seg_l_gt] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start_gt, seg_end_gt,
                                              seg_l_gt, mt_step)
        flags_ind_gt = []
        for j, fl in enumerate(flags_gt):
            # "align" labels with GT
            if class_names_gt[flags_gt[j]] in class_names:
                flags_ind_gt.append(class_names.index(class_names_gt[
                                                          flags_gt[j]]))
            else:
                flags_ind_gt.append(-1)
        flags_ind_gt = np.array(flags_ind_gt)
        cm = np.zeros((len(class_names_gt), len(class_names_gt)))
        for i in range(min(flags_ind.shape[0], flags_ind_gt.shape[0])):
            cm[int(flags_ind_gt[i]),int(flags_ind[i])] += 1
    else:
        cm = []
        flags_ind_gt = np.array([])
    acc = plotSegmentationResults(flags_ind, flags_ind_gt,
                                  class_names, mt_step, not plot_results)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc)  )
        return (flags_ind, class_names_gt, acc, cm)
    else:
        return (flags_ind, class_names, acc, cm)
Code example #23
    def WhatIsThis(self, data):
        # There are two completely separate models: a classifier that uses pyAudioAnalysis, and a DeepSpeech speech-to-text model

        # Convert or cast the raw audio data to numpy array
        log.debug('Converting data to numpy')
        if len(data) % 2 != 0:
            log.critical('Data length: {0}'.format(len(data)))
            log.critical('Data: {0}'.format(data))
            return { #bullshit
                'loudness': 0.0,
                'class': 'bullshit',
                'probability': 1.0,
                'text': 'fuckitall',
            }
        AccumulatedData_np = np.frombuffer(data, np.int16)

        # Get the loudness, hope this works
        rms = np.sqrt(np.mean(AccumulatedData_np**2))
        log.debug(f'Raw loudness: {rms}')
        # normalize it, make it between 0.0 and 1.0.
        # rms = round((rms - 20.0) / 45, 2)
        # rms = float(np.clip(rms, 0.0, 1.0))

        seg_len = len(AccumulatedData_np)
        log.debug('seg_len ' + str(seg_len))

        # Run the classifier. This is ripped directly out of paura.py and carelessly sutured into place. There's so much blood! Thank you!!!
        log.debug('Running classifier')
        try:
            [mt_feats, _,
             _] = mF.mid_feature_extraction(AccumulatedData_np, self.fs,
                                            seg_len, seg_len,
                                            round(self.fs * self.st_win),
                                            round(self.fs * self.st_step))
            cur_fv = (mt_feats[:, 0] - self.MEAN) / self.STD
        except ValueError:
            log.error('Yeah, that thing happened')
            log.critical('Data length: {0}'.format(len(data)))
            log.critical('Data: {0}'.format(data))
            return { #bullshit
                'loudness': 0.0,
                'class': 'bullshit',
                'probability': 1.0,
                'text': 'fuckitall',
            }
        # classify vector:
        [res, prob] = aT.classifier_wrapper(self.classifier, "svm_rbf", cur_fv)
        win_class = self.class_names[int(res)]
        win_prob = round(prob[int(res)], 2)

        log.info('Classified {0:s} with probability {1:.2f}'.format(
            win_class, win_prob))

        # Run the accumulated audio data through deepspeech, if it's speech
        if win_class == 'lover':
            log.debug('Running deepspeech model')
            text = self.model.stt(AccumulatedData_np)
            log.info('Recognized: %s', text)
        else:
            text = 'undefined'

        # Save the utterance to a wav file. I hope later I'll be able to use this for training a better model, after I learn how to do that.

        # log.debug('Saving wav file')
        # wf = wave.open(os.path.join(self.save_dir, str(int(time.time())) + '_' + win_class + '_' + text.replace(' ', '_') + '.wav'), 'wb')
        # wf.setnchannels(1)
        # wf.setsampwidth(2)
        # wf.setframerate(16000)
        # wf.writeframes(data)
        # wf.close()

        # return an object
        return {
            'loudness': rms,
            'class': win_class,
            'probability': win_prob,
            'text': text,
        }
Code example #24
def mid_term_file_classification(input_file,
                                 model_name,
                                 model_type,
                                 plot_results=False,
                                 gt_file=""):
    """
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used,
    i.e. a pre-trained classifier.
    ARGUMENTS:
        - input_file:        path of the input WAV file
        - model_name:        name of the classification model
        - model_type:        svm or knn depending on the classifier type
        - plot_results:      True if results are to be plotted using
                             matplotlib along with a set of statistics
    RETURNS:
          - segs:           a sequence of segment's endpoints: segs[i] is the
                            endpoint of the i-th segment (in seconds)
          - classes:        a sequence of class flags: class[i] is the
                            class ID of the i-th segment
    """
    labels = []
    accuracy = 0.0
    class_names = []
    cm = np.array([])
    if not os.path.isfile(model_name):
        print("mtFileClassificationError: input model_type not found!")
        return labels, class_names, accuracy, cm

    # Load classifier:
    if model_type == "knn":
        classifier, mean, std, class_names, mt_win, mid_step, st_win, \
         st_step, compute_beat = at.load_model_knn(model_name)
    else:
        classifier, mean, std, class_names, mt_win, mid_step, st_win, \
         st_step, compute_beat = at.load_model(model_name)
    if compute_beat:
        print("Model " + model_name + " contains long-term music features "
              "(beat etc) and cannot be used in "
              "segmentation")
        return labels, class_names, accuracy, cm
    # load input file
    sampling_rate, signal = audioBasicIO.read_audio_file(input_file)

    # could not read file
    if sampling_rate == 0:
        return labels, class_names, accuracy, cm

    # convert stereo (if) to mono
    signal = audioBasicIO.stereo_to_mono(signal)

    # mid-term feature extraction:
    mt_feats, _, _ = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mt_win * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * st_win),
                                   round(sampling_rate * st_step))
    posterior_matrix = []

    # for each feature vector (i.e. for each fix-sized segment):
    for col_index in range(mt_feats.shape[1]):
        # normalize current feature v
        feature_vector = (mt_feats[:, col_index] - mean) / std

        # classify vector:
        label_predicted, posterior = \
            at.classifier_wrapper(classifier, model_type, feature_vector)
        labels.append(label_predicted)

        # update probability matrix
        posterior_matrix.append(np.max(posterior))
    labels = np.array(labels)

    # convert fix-sized flags to segments and classes
    segs, classes = labels_to_segments(labels, mid_step)
    segs[-1] = len(signal) / float(sampling_rate)
    # Load ground-truth:
    labels_gt, class_names_gt, accuracy, cm = \
        load_ground_truth(gt_file, labels, class_names, mid_step, plot_results)

    return labels, class_names, accuracy, cm
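A usage sketch with placeholder paths; labels_to_segments (already used inside the function) can turn the per-window labels into (start, end) segments, assuming the mid-term step stored with the model is known (1.0 s here is an assumption):

labels, class_names, acc, cm = mid_term_file_classification("radio_show.wav",
                                                            "svm_speech_music",
                                                            "svm")
segs, seg_classes = labels_to_segments(labels, 1.0)   # 1.0 s mid-term step assumed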
Code example #25
def speaker_diarization(filename,
                        n_speakers,
                        mid_window=2.0,
                        mid_step=0.2,
                        short_window=0.05,
                        lda_dim=35,
                        plot_res=False):
    """
    ARGUMENTS:
        - filename:        the name of the WAV file to be analyzed
        - n_speakers       the number of speakers (clusters) in
                           the recording (<=0 for unknown)
        - mid_window (opt)    mid-term window size
        - mid_step (opt)    mid-term window step
        - short_window  (opt)    short-term window size
        - lda_dim (opt)    LDA dimension (0 for no LDA)
        - plot_res         (opt)   0 for not plotting the results 1 for plotting
    """
    sampling_rate, signal = audioBasicIO.read_audio_file(filename)
    signal = audioBasicIO.stereo_to_mono(signal)
    duration = len(signal) / sampling_rate

    base_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "data/models")

    classifier_all, mean_all, std_all, class_names_all, _, _, _, _, _ = \
        at.load_model_knn(os.path.join(base_dir, "knn_speaker_10"))
    classifier_fm, mean_fm, std_fm, class_names_fm, _, _, _, _,  _ = \
        at.load_model_knn(os.path.join(base_dir, "knn_speaker_male_female"))

    mid_feats, st_feats, _ = \
        mtf.mid_feature_extraction(signal, sampling_rate,
                                   mid_window * sampling_rate,
                                   mid_step * sampling_rate,
                                   round(sampling_rate * short_window),
                                   round(sampling_rate * short_window * 0.5))

    mid_term_features = np.zeros(
        (mid_feats.shape[0] + len(class_names_all) + len(class_names_fm),
         mid_feats.shape[1]))

    for index in range(mid_feats.shape[1]):
        feature_norm_all = (mid_feats[:, index] - mean_all) / std_all
        feature_norm_fm = (mid_feats[:, index] - mean_fm) / std_fm
        _, p1 = at.classifier_wrapper(classifier_all, "knn", feature_norm_all)
        _, p2 = at.classifier_wrapper(classifier_fm, "knn", feature_norm_fm)
        start = mid_feats.shape[0]
        end = mid_feats.shape[0] + len(class_names_all)
        mid_term_features[0:mid_feats.shape[0], index] = mid_feats[:, index]
        mid_term_features[start:end, index] = p1 + 1e-4
        mid_term_features[end::, index] = p2 + 1e-4

    mid_feats = mid_term_features  # TODO
    feature_selected = [
        8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45,
        46, 47, 48, 49, 50, 51, 52, 53
    ]

    mid_feats = mid_feats[feature_selected, :]

    mid_feats_norm, mean, std = at.normalize_features([mid_feats.T])
    mid_feats_norm = mid_feats_norm[0].T
    n_wins = mid_feats.shape[1]

    # remove outliers:
    dist_all = np.sum(distance.squareform(distance.pdist(mid_feats_norm.T)),
                      axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.2 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    # EnergyMin = np.min(mt_feats[1,:])
    # EnergyMean = np.mean(mt_feats[1,:])
    # Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    # i_non_outliers = np.nonzero(mt_feats[1,:] > Thres)[0]
    # print i_non_outliers

    mt_feats_norm_or = mid_feats_norm
    mid_feats_norm = mid_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:

        # extract mid-term features with minimum step:
        window_ratio = int(round(mid_window / short_window))
        step_ratio = int(round(short_window / short_window))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        for index in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        # for each of the short-term features:
        for index in range(num_of_features):
            cur_pos = 0
            feat_len = len(st_feats[index])
            while cur_pos < feat_len:
                n1 = cur_pos
                n2 = cur_pos + window_ratio
                if n2 > feat_len:
                    n2 = feat_len
                short_features = st_feats[index][n1:n2]
                mt_feats_to_red[index].append(np.mean(short_features))
                mt_feats_to_red[index + num_of_features].\
                    append(np.std(short_features))
                cur_pos += step_ratio
        mt_feats_to_red = np.array(mt_feats_to_red)
        mt_feats_to_red_2 = np.zeros(
            (mt_feats_to_red.shape[0] + len(class_names_all) +
             len(class_names_fm), mt_feats_to_red.shape[1]))
        limit = mt_feats_to_red.shape[0] + len(class_names_all)
        for index in range(mt_feats_to_red.shape[1]):
            feature_norm_all = (mt_feats_to_red[:, index] - mean_all) / std_all
            feature_norm_fm = (mt_feats_to_red[:, index] - mean_fm) / std_fm
            _, p1 = at.classifier_wrapper(classifier_all, "knn",
                                          feature_norm_all)
            _, p2 = at.classifier_wrapper(classifier_fm, "knn",
                                          feature_norm_fm)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0], index] = \
                mt_feats_to_red[:, index]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:limit,
                              index] = p1 + 1e-4
            mt_feats_to_red_2[limit::, index] = p2 + 1e-4
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[feature_selected, :]
        mt_feats_to_red, mean, std = at.normalize_features([mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        labels = np.zeros((mt_feats_to_red.shape[1], ))
        lda_step = 1.0
        lda_step_ratio = lda_step / short_window
        for index in range(labels.shape[0]):
            labels[index] = int(index * short_window / lda_step_ratio)
        clf = sklearn.discriminant_analysis.\
            LinearDiscriminantAnalysis(n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, labels)
        mid_feats_norm = (clf.transform(mid_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    cluster_labels = []
    sil_all = []
    cluster_centers = []

    for speakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=speakers)
        k_means.fit(mid_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        cluster_labels.append(cls)
        cluster_centers.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(speakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mid_feats_norm[:, cls == c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                dist = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(np.mean(dist) * clust_per_cent)
                sil_temp = []
                for c2 in range(speakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = np.nonzero(cls == c2)[0].shape[0] /\
                                           float(len(cls))
                        mid_features_temp = mid_feats_norm[:, cls == c2]
                        dist = distance.cdist(mt_feats_norm_temp.T,
                                              mid_features_temp.T)
                        sil_temp.append(
                            np.mean(dist) *
                            (clust_per_cent + clust_per_cent_2) / 2.0)
                sil_temp = np.array(sil_temp)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(sil_temp))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        sil = []
        for c in range(speakers):
            # for each cluster (speaker) compute silhouette
            sil.append(
                (sil_2[c] - sil_1[c]) / (max(sil_2[c], sil_1[c]) + 1e-5))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(np.mean(sil))

    imax = int(np.argmax(sil_all))
    # optimal number of clusters
    num_speakers = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = np.zeros((n_wins, ))
    for index in range(n_wins):
        j = np.argmin(np.abs(index - i_non_outliers))
        cls[index] = cluster_labels[imax][j]

    # Post-process method 1: hmm smoothing
    for index in range(1):
        # hmm training
        start_prob, transmat, means, cov = \
            train_hmm_compute_statistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    class_names = ["speaker{0:d}".format(c) for c in range(num_speakers)]

    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if groundtruth exists
    if os.path.isfile(gt_file):
        seg_start, seg_end, seg_labs = read_segmentation_gt(gt_file)
        flags_gt, class_names_gt = segments_to_labels(seg_start, seg_end,
                                                      seg_labs, mid_step)

    if plot_res:
        fig = plt.figure()
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(np.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(np.array(range(len(cls))) * mid_step + mid_step / 2.0, cls)

    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(
                np.array(range(len(flags_gt))) * mid_step + mid_step / 2.0,
                flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
            evaluate_speaker_diarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(
                          100 * purity_cluster_m, 100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's sillouette")
        plt.show()
    return cls
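A short usage sketch for the function above; the filename, speaker count and printing loop are illustrative and not from the original example. The returned array holds one speaker label per mid-term window, spaced mid_step seconds apart.

cls = speaker_diarization("meeting.wav", n_speakers=2, mid_step=0.2,
                          plot_res=False)
for win_idx, speaker in enumerate(cls):
    # each window starts at win_idx * mid_step seconds
    print("{:6.1f}s  speaker{:d}".format(win_idx * 0.2, int(speaker)))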
コード例 #26
0
def train_hmm_from_directory(folder_path, hmm_model_name, mid_window,
                             mid_step):
    """
    This function trains an HMM model for segmentation-classification using
    a directory where WAV files and .segments (ground-truth) files are stored
    ARGUMENTS:
     - folder_path:     the path of the data directory
     - hmm_model_name:  the name of the HMM model to be stored
     - mid_window:      mid-term window size
     - mid_step:        mid-term window step
    RETURNS:
     - hmm:             the resulting HMM object
     - class_names:     a list of class names
    After training, hmm and class_names, along with the mid_window
    and mid_step values, are stored in the hmm_model_name file
    """

    flags_all = np.array([])
    class_names_all = []
    for i, f in enumerate(glob.glob(folder_path + os.sep + '*.wav')):
        # for each WAV file
        wav_file = f
        gt_file = f.replace('.wav', '.segments')
        if os.path.isfile(gt_file):
            seg_start, seg_end, seg_labs = read_segmentation_gt(gt_file)
            flags, class_names = \
                segments_to_labels(seg_start, seg_end, seg_labs, mid_step)
            for c in class_names:
                # update class names:
                if c not in class_names_all:
                    class_names_all.append(c)
            sampling_rate, signal = audioBasicIO.read_audio_file(wav_file)
            feature_vector, _, _ = \
                mtf.mid_feature_extraction(signal, sampling_rate,
                                           mid_window * sampling_rate,
                                           mid_step * sampling_rate,
                                           round(sampling_rate * 0.050),
                                           round(sampling_rate * 0.050))

            flag_len = len(flags)
            feat_cols = feature_vector.shape[1]
            min_sm = min(feat_cols, flag_len)
            feature_vector = feature_vector[:, 0:min_sm]
            flags = flags[0:min_sm]

            flags_new = []
            # append labels, remapped from the per-file class list
            # to the global class list
            for j, fl in enumerate(flags):
                flags_new.append(
                    class_names_all.index(class_names[flags[j]]))

            flags_all = np.append(flags_all, np.array(flags_new))

            if i == 0:
                f_all = feature_vector
            else:
                f_all = np.concatenate((f_all, feature_vector), axis=1)

    # compute HMM statistics
    class_priors, transmutation_matrix, means, cov = \
        train_hmm_compute_statistics(f_all, flags_all)
    # train the HMM
    hmm = hmmlearn.hmm.GaussianHMM(class_priors.shape[0], "diag")
    hmm.covars_ = cov
    hmm.means_ = means
    hmm.startprob_ = class_priors
    hmm.transmat_ = transmutation_matrix

    save_hmm(hmm_model_name, hmm, class_names_all, mid_window, mid_step)

    return hmm, class_names_all
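A hedged usage sketch for the trainer above: the folder path, model name and window sizes are placeholders. Each WAV file in the folder is expected to come with a matching .segments ground-truth file, which in pyAudioAnalysis is typically a comma-separated list of start_sec,end_sec,label rows.

hmm, class_names = train_hmm_from_directory("data/hmm_train",
                                             "hmm_segmenter",
                                             mid_window=1.0,
                                             mid_step=1.0)
print("trained HMM over classes:", class_names)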
コード例 #27
0
def speakerDiarization(filename, n_speakers, mt_size=2.0, mt_step=0.2,
                       st_win=0.05, lda_dim=35, plot_res=False):
    """
    ARGUMENTS:
        - filename:        the name of the WAV file to be analyzed
        - n_speakers       the number of speakers (clusters) in
                           the recording (<=0 for unknown)
        - mt_size (opt)    mid-term window size
        - mt_step (opt)    mid-term window step
        - st_win  (opt)    short-term window size
        - lda_dim (opt)    LDA dimension (0 for no LDA)
        - plot_res (opt)   0 for not plotting the results, 1 for plotting
    """
    [fs, x] = audioBasicIO.read_audio_file(filename)
    x = audioBasicIO.stereo_to_mono(x)
    duration = len(x) / fs

    [classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.load_model_knn(os.path.join(os.path.dirname(os.path.realpath(__file__)), "data", "knnSpeakerAll"))
    [classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.load_model_knn(os.path.join(os.path.dirname(os.path.realpath(__file__)), "data", "knnSpeakerFemaleMale"))

    [mt_feats, st_feats, _] = aF.mid_feature_extraction(x, fs, mt_size * fs,
                                                        mt_step * fs,
                                                        round(fs * st_win),
                                                        round(fs*st_win * 0.5))

    MidTermFeatures2 = np.zeros((mt_feats.shape[0] + len(classNames1) +
                                    len(classNames2), mt_feats.shape[1]))

    for i in range(mt_feats.shape[1]):
        cur_f1 = (mt_feats[:, i] - MEAN1) / STD1
        cur_f2 = (mt_feats[:, i] - MEAN2) / STD2
        [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
        [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
        MidTermFeatures2[0:mt_feats.shape[0], i] = mt_feats[:, i]
        MidTermFeatures2[mt_feats.shape[0]:mt_feats.shape[0]+len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[mt_feats.shape[0] + len(classNames1)::, i] = P2 + 0.0001

    mt_feats = MidTermFeatures2    # TODO
    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41,
                       42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]

    mt_feats = mt_feats[iFeaturesSelect, :]

    (mt_feats_norm, MEAN, STD) = aT.normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    n_wins = mt_feats.shape[1]

    # remove outliers:
    dist_all = np.sum(distance.squareform(distance.pdist(mt_feats_norm.T)),
                         axis=0)
    m_dist_all = np.mean(dist_all)
    i_non_outliers = np.nonzero(dist_all < 1.2 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = np.min(mt_feats[1,:])
    #EnergyMean = np.mean(mt_feats[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #i_non_outliers = np.nonzero(mt_feats[1,:] > Thres)[0]
    #print i_non_outliers

    perOutLier = (100.0 * (n_wins - i_non_outliers.shape[0])) / n_wins
    mt_feats_norm_or = mt_feats_norm
    mt_feats_norm = mt_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        #[mt_feats_to_red, _, _] = aF.mtFeatureExtraction(x, fs, mt_size * fs,
        # st_win * fs, round(fs*st_win), round(fs*st_win));
        # extract mid-term features with minimum step:
        mt_win_ratio = int(round(mt_size / st_win))
        mt_step_ratio = int(round(st_win / st_win))
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        #for i in range(num_of_stats * num_of_features + 1):
        for i in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        for i in range(num_of_features):  # for each of the short-term features:
            curPos = 0
            N = len(st_feats[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mt_win_ratio
                if N2 > N:
                    N2 = N
                curStFeatures = st_feats[i][N1:N2]
                mt_feats_to_red[i].append(np.mean(curStFeatures))
                mt_feats_to_red[i+num_of_features].append(np.std(curStFeatures))
                curPos += mt_step_ratio
        mt_feats_to_red = np.array(mt_feats_to_red)
        mt_feats_to_red_2 = np.zeros((mt_feats_to_red.shape[0] +
                                        len(classNames1) + len(classNames2),
                                         mt_feats_to_red.shape[1]))
        for i in range(mt_feats_to_red.shape[1]):
            cur_f1 = (mt_feats_to_red[:, i] - MEAN1) / STD1
            cur_f2 = (mt_feats_to_red[:, i] - MEAN2) / STD2
            [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
            [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0], i] = mt_feats_to_red[:, i]
            mt_feats_to_red_2[mt_feats_to_red.shape[0]:mt_feats_to_red.shape[0] + len(classNames1), i] = P1 + 0.0001
            mt_feats_to_red_2[mt_feats_to_red.shape[0]+len(classNames1)::, i] = P2 + 0.0001
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[iFeaturesSelect, :]
        #mt_feats_to_red += np.random.rand(mt_feats_to_red.shape[0], mt_feats_to_red.shape[1]) * 0.0000010
        (mt_feats_to_red, MEAN, STD) = aT.normalizeFeatures([mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        #dist_all = np.sum(distance.squareform(distance.pdist(mt_feats_to_red.T)), axis=0)
        #m_dist_all = np.mean(dist_all)
        #iNonOutLiers2 = np.nonzero(dist_all < 3.0*m_dist_all)[0]
        #mt_feats_to_red = mt_feats_to_red[:, iNonOutLiers2]
        Labels = np.zeros((mt_feats_to_red.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / st_win
        #print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * st_win / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, Labels)
        mt_feats_norm = (clf.transform(mt_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    clsAll = []
    sil_all = []
    centersAll = []

    for iSpeakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(mt_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        # Y = distance.squareform(distance.pdist(mt_feats_norm.T))
        clsAll.append(cls)
        centersAll.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(iSpeakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = np.nonzero(cls == c)[0].shape[0] / \
                             float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mt_feats_norm[:, cls==c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                Yt = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(np.mean(Yt)*clust_per_cent)
                silBs = []
                for c2 in range(iSpeakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = np.nonzero(cls == c2)[0].shape[0] /\
                                           float(len(cls))
                        MidTermFeaturesNormTemp2 = mt_feats_norm[:, cls == c2]
                        Yt = distance.cdist(mt_feats_norm_temp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(np.mean(Yt)*(clust_per_cent
                                                     + clust_per_cent_2)/2.0)
                silBs = np.array(silBs)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(silBs))
        sil_1 = np.array(sil_1)
        sil_2 = np.array(sil_2)
        sil = []
        for c in range(iSpeakers):
            # for each cluster (speaker) compute silhouette
            sil.append((sil_2[c] - sil_1[c]) /
                       (max(sil_2[c], sil_1[c]) + 0.00001))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(np.mean(sil))

    imax = np.argmax(sil_all)
    # optimal number of clusters
    nSpeakersFinal = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = np.zeros((n_wins,))
    for i in range(n_wins):
        j = np.argmin(np.abs(i-i_non_outliers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):
        # hmm training
        start_prob, transmat, means, cov = \
            trainHMM_computeStatistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = sil_all[imax]
    class_names = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)];


    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if ground-truth exists
    if os.path.isfile(gt_file):
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start, seg_end, seg_labs, mt_step)

    if plot_res:
        fig = plt.figure()
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(np.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(np.array(range(len(cls)))*mt_step+mt_step/2.0, cls)

    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(np.array(range(len(flags_gt))) *
                     mt_step + mt_step / 2.0, flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
            evaluateSpeakerDiarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(100 * purity_cluster_m,
                                                        100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        #print s_range, sil_all
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's silhouette")
        plt.show()
    return cls
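Both diarization examples pick the number of clusters with a hand-rolled silhouette; as a cross-check, the same selection can be written with scikit-learn's silhouette_score. The sketch below is not code from the original source and assumes feats is the (n_features, n_windows) matrix being clustered (e.g. mt_feats_norm).

import numpy as np
import sklearn.cluster
from sklearn.metrics import silhouette_score

def pick_num_speakers(feats, candidate_range=range(2, 10)):
    # feats: (n_features, n_windows); KMeans expects samples as rows
    scores = []
    for k in candidate_range:
        labels = sklearn.cluster.KMeans(n_clusters=k).fit_predict(feats.T)
        scores.append(silhouette_score(feats.T, labels))
    return list(candidate_range)[int(np.argmax(scores))]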
コード例 #28
0
    def extract(label, out_dir):
        df = pd.read_csv(os.path.join("data", config.dataset) + ".csv")
        df = df[df.sex == label]
        for filename in df.filename:
            if args.ml:
                if not os.path.exists(os.path.join(out_dir, filename) + ".csv"):
                    try:
                        sound = AudioSegment.from_mp3(os.path.join(
                            "./data", "recordings", filename) + ".mp3")
                        sound.export("tmp.wav", format="wav")

                        # signal, sampling rate
                        fs, s = aIO.read_audio_file("tmp.wav")

                        # get all mid-term features, returning an array of features
                        # Look at the first 10 seconds
                        mid_term_window = 10
                        mt, st, mt_n = aFm.mid_feature_extraction(s, fs, mid_term_window * fs, mid_term_window * fs,
                                                                  0.05 * fs, 0.05 * fs)
                        # Mid-Term Features:
                        # 0:zcr_mean
                        # 1:energy_mean
                        # 2:energy_entropy_mean
                        # 3:spectral_centroid_mean
                        # 4:spectral_spread_mean
                        # 5:spectral_entropy_mean
                        # 6:spectral_flux_mean
                        # 7:spectral_rolloff_mean
                        # 8:mfcc_1_mean
                        # 9:mfcc_2_mean
                        # 10:mfcc_3_mean
                        # 11:mfcc_4_mean
                        # 12:mfcc_5_mean
                        # 13:mfcc_6_mean
                        # 14:mfcc_7_mean
                        # 15:mfcc_8_mean
                        # 16:mfcc_9_mean
                        # 17:mfcc_10_mean
                        # 18:mfcc_11_mean
                        # 19:mfcc_12_mean
                        # 20:mfcc_13_mean
                        # 21:chroma_1_mean
                        # 22:chroma_2_mean
                        # 23:chroma_3_mean
                        # 24:chroma_4_mean
                        # 25:chroma_5_mean
                        # 26:chroma_6_mean
                        # 27:chroma_7_mean
                        # 28:chroma_8_mean
                        # 29:chroma_9_mean
                        # 30:chroma_10_mean
                        # 31:chroma_11_mean
                        # 32:chroma_12_mean
                        # 33:chroma_std_mean
                        # 34:delta zcr_mean
                        # 35:delta energy_mean
                        # 36:delta energy_entropy_mean
                        # 37:delta spectral_centroid_mean
                        # 38:delta spectral_spread_mean
                        # 39:delta spectral_entropy_mean
                        # 40:delta spectral_flux_mean
                        # 41:delta spectral_rolloff_mean
                        # 42:delta mfcc_1_mean
                        # 43:delta mfcc_2_mean
                        # 44:delta mfcc_3_mean
                        # 45:delta mfcc_4_mean
                        # 46:delta mfcc_5_mean
                        # 47:delta mfcc_6_mean
                        # 48:delta mfcc_7_mean
                        # 49:delta mfcc_8_mean
                        # 50:delta mfcc_9_mean
                        # 51:delta mfcc_10_mean
                        # 52:delta mfcc_11_mean
                        # 53:delta mfcc_12_mean
                        # 54:delta mfcc_13_mean
                        # 55:delta chroma_1_mean
                        # 56:delta chroma_2_mean
                        # 57:delta chroma_3_mean
                        # 58:delta chroma_4_mean
                        # 59:delta chroma_5_mean
                        # 60:delta chroma_6_mean
                        # 61:delta chroma_7_mean
                        # 62:delta chroma_8_mean
                        # 63:delta chroma_9_mean
                        # 64:delta chroma_10_mean
                        # 65:delta chroma_11_mean
                        # 66:delta chroma_12_mean
                        # 67:delta chroma_std_mean
                        # 68:zcr_std
                        # 69:energy_std
                        # 70:energy_entropy_std
                        # 71:spectral_centroid_std
                        # 72:spectral_spread_std
                        # 73:spectral_entropy_std
                        # 74:spectral_flux_std
                        # 75:spectral_rolloff_std
                        # 76:mfcc_1_std
                        # 77:mfcc_2_std
                        # 78:mfcc_3_std
                        # 79:mfcc_4_std
                        # 80:mfcc_5_std
                        # 81:mfcc_6_std
                        # 82:mfcc_7_std
                        # 83:mfcc_8_std
                        # 84:mfcc_9_std
                        # 85:mfcc_10_std
                        # 86:mfcc_11_std
                        # 87:mfcc_12_std
                        # 88:mfcc_13_std
                        # 89:chroma_1_std
                        # 90:chroma_2_std
                        # 91:chroma_3_std
                        # 92:chroma_4_std
                        # 93:chroma_5_std
                        # 94:chroma_6_std
                        # 95:chroma_7_std
                        # 96:chroma_8_std
                        # 97:chroma_9_std
                        # 98:chroma_10_std
                        # 99:chroma_11_std
                        # 100:chroma_12_std
                        # 101:chroma_std_std
                        # 102:delta zcr_std
                        # 103:delta energy_std
                        # 104:delta energy_entropy_std
                        # 105:delta spectral_centroid_std
                        # 106:delta spectral_spread_std
                        # 107:delta spectral_entropy_std
                        # 108:delta spectral_flux_std
                        # 109:delta spectral_rolloff_std
                        # 110:delta mfcc_1_std
                        # 111:delta mfcc_2_std
                        # 112:delta mfcc_3_std
                        # 113:delta mfcc_4_std
                        # 114:delta mfcc_5_std
                        # 115:delta mfcc_6_std
                        # 116:delta mfcc_7_std
                        # 117:delta mfcc_8_std
                        # 118:delta mfcc_9_std
                        # 119:delta mfcc_10_std
                        # 120:delta mfcc_11_std
                        # 121:delta mfcc_12_std
                        # 122:delta mfcc_13_std
                        # 123:delta chroma_1_std
                        # 124:delta chroma_2_std
                        # 125:delta chroma_3_std
                        # 126:delta chroma_4_std
                        # 127:delta chroma_5_std
                        # 128:delta chroma_6_std
                        # 129:delta chroma_7_std
                        # 130:delta chroma_8_std
                        # 131:delta chroma_9_std
                        # 132:delta chroma_10_std
                        # 133:delta chroma_11_std
                        # 134:delta chroma_12_std
                        # 135:delta chroma_std_std
                        features = {mt_n[i]: [mt[i][0]]
                                    for i in range(len(mt_n))}
                        ftDf = pd.DataFrame.from_dict(features)
                        ftDf.to_csv(os.path.join(out_dir, filename) + ".csv")
                    except Exception as e:
                        print(e)
            elif args.nn:
                if not os.path.exists(os.path.join(out_dir, filename) + ".png"):
                    try:
                        sound = AudioSegment.from_mp3(os.path.join(
                            "./data", "recordings", filename) + ".mp3")
                        sound.export("tmp.wav", format="wav")

                        y, sr = librosa.load(
                            "tmp.wav", offset=2.0, duration=8.0, sr=22050)

                        # extract a fixed length window

                        # number of samples per time-step in spectrogram
                        hop_length = 512
                        # number of bins in spectrogram. Height of image
                        n_mels = config.cnn_input_size[0]
                        # number of time-steps. Width of image
                        time_steps = config.cnn_input_size[1]
                        # starting at beginning
                        start_sample = 0
                        length_samples = time_steps*hop_length
                        window = y[start_sample:start_sample+length_samples]

                        # use log-melspectrogram
                        mels = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels,
                                                              n_fft=hop_length*2, hop_length=hop_length)
                        # add small number to avoid log(0)
                        mels = np.log(mels + 1e-9)

                        # min-max scale to fit inside 8-bit range
                        img = scale_minmax(mels, 0, 255).astype(np.uint8)
                        # put low frequencies at the bottom in image
                        img = np.flip(img, axis=0)
                        img = 255-img  # invert. make black==more energy

                        # save as PNG
                        skimage.io.imsave(os.path.join(
                            out_dir, filename) + ".png", img)
                    except Exception as e:
                        print(e)
                pass
            elif args.rnn:
                if not os.path.exists(os.path.join(out_dir, filename) + ".csv"):
                    try:
                        sound = AudioSegment.from_mp3(os.path.join(
                            "./data", "recordings", filename) + ".mp3")
                        sound.export("tmp.wav", format="wav")

                        # signal, sampling rate
                        fs, s = aIO.read_audio_file("tmp.wav")

                        # get all short-term features, returning an array of features
                        # extract short-term features using a 50msec non-overlapping windows
                        duration = len(s) / float(fs)
                        win, step = 0.050, 0.050
                        [f, fn] = aFs.feature_extraction(s, fs, int(fs * win),
                                                         int(fs * step))
                        print(
                            f'{f.shape[1]} frames, {f.shape[0]} short-term features')
                        # Short-Term Features:
                        # 0:zcr
                        # 1:energy
                        # 2:energy_entropy
                        # 3:spectral_centroid
                        # 4:spectral_spread
                        # 5:spectral_entropy
                        # 6:spectral_flux
                        # 7:spectral_rolloff
                        # 8:mfcc_1
                        # 9:mfcc_2
                        # 10:mfcc_3
                        # 11:mfcc_4
                        # 12:mfcc_5
                        # 13:mfcc_6
                        # 14:mfcc_7
                        # 15:mfcc_8
                        # 16:mfcc_9
                        # 17:mfcc_10
                        # 18:mfcc_11
                        # 19:mfcc_12
                        # 20:mfcc_13
                        # 21:chroma_1
                        # 22:chroma_2
                        # 23:chroma_3
                        # 24:chroma_4
                        # 25:chroma_5
                        # 26:chroma_6
                        # 27:chroma_7
                        # 28:chroma_8
                        # 29:chroma_9
                        # 30:chroma_10
                        # 31:chroma_11
                        # 32:chroma_12
                        # 33:chroma_std
                        # 34:delta zcr
                        # 35:delta energy
                        # 36:delta energy_entropy
                        # 37:delta spectral_centroid
                        # 38:delta spectral_spread
                        # 39:delta spectral_entropy
                        # 40:delta spectral_flux
                        # 41:delta spectral_rolloff
                        # 42:delta mfcc_1
                        # 43:delta mfcc_2
                        # 44:delta mfcc_3
                        # 45:delta mfcc_4
                        # 46:delta mfcc_5
                        # 47:delta mfcc_6
                        # 48:delta mfcc_7
                        # 49:delta mfcc_8
                        # 50:delta mfcc_9
                        # 51:delta mfcc_10
                        # 52:delta mfcc_11
                        # 53:delta mfcc_12
                        # 54:delta mfcc_13
                        # 55:delta chroma_1
                        # 56:delta chroma_2
                        # 57:delta chroma_3
                        # 58:delta chroma_4
                        # 59:delta chroma_5
                        # 60:delta chroma_6
                        # 61:delta chroma_7
                        # 62:delta chroma_8
                        # 63:delta chroma_9
                        # 64:delta chroma_10
                        # 65:delta chroma_11
                        # 66:delta chroma_12
                        # 67:delta chroma_std
                        features = {fn[i]: f[i]
                                    for i in range(len(fn))}
                        ftDf = pd.DataFrame.from_dict(features)
                        ftDf.to_csv(os.path.join(out_dir, filename) + ".csv")
                    except Exception as e:
                        print(e)
            else:
                pass
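The snippet above calls scale_minmax, which is not defined in this excerpt. One plausible definition (an assumption; the project's own helper may differ) is:

def scale_minmax(x, min_val=0.0, max_val=1.0):
    # linearly rescale an array into [min_val, max_val]
    x_std = (x - x.min()) / (x.max() - x.min())
    return x_std * (max_val - min_val) + min_val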
コード例 #29
0
ファイル: vad_simple.py プロジェクト: shammur/pyVAD
def vadFolderWrapperMergedByTh(inputFolder, outFolder, smoothingWindow, weight, model_name, threshold):

    if not os.path.isfile(model_name):
        print("fileClassification: input model_name not found!")
        return

    classifier, mean, std, classes, mid_window, mid_step, short_window, \
    short_step, compute_beat = aT.load_model(model_name)

    types = ('*.wav', '*.mp3')

    wavFilesList = []
    for files in types:
        print(inputFolder + files)
        wavFilesList.extend(glob.glob((inputFolder + files)))
    wavFilesList = sorted(wavFilesList)
    if len(wavFilesList) == 0:
        print("No WAV files found!")
        return
    for wavFile in wavFilesList:
        # print(wavFile)
        if not os.path.isfile(wavFile):
            raise Exception("Input audio file not found!")
        base = os.path.splitext(os.path.basename(wavFile))[0]
        folder = outFolder + base + '/'
        if not os.path.exists(folder):
            os.makedirs(folder)
        segfile = open(os.path.join(folder, 'segments'), 'w+')
        segfile2 = open(os.path.join(folder, 'segments_details'), 'w+')

        stack = deque()

        [fs, x] = audioBasicIO.read_audio_file(wavFile)
        segmentLimits = aS.silence_removal(x, fs, 0.05, 0.05, smoothingWindow, weight, False)
        merge = True

        for i, st in enumerate(segmentLimits):
            signal = audioBasicIO.stereo_to_mono(x[int(fs * st[0]):int(fs * st[1])])
            # print('in here', len(segmentLimits), st[0],st[1],classes, type(st))
            if fs == 0:
                continue
                # audio file IO problem
                # return -1, -1, -1

            if signal.shape[0] / float(fs) < mid_window:
                mid_window = signal.shape[0] / float(fs)

            # feature extraction:
            mid_features, s, _ = \
                aF.mid_feature_extraction(signal, fs,
                                          mid_window * fs,
                                          mid_step * fs,
                                          round(fs * short_window),
                                          round(fs * short_step))
            # long term averaging of mid-term statistics
            mid_features = mid_features.mean(axis=1)
            if compute_beat:
                # print('in here3')
                beat, beat_conf = aF.beat_extraction(s, short_step)
                mid_features = np.append(mid_features, beat)
                mid_features = np.append(mid_features, beat_conf)
            feature_vector = (mid_features - mean) / std  # normalization
            # class_id = -1
            # probability = -1
            class_id = classifier.predict(feature_vector.reshape(1, -1))[0]
            # probability = classifier.predict_proba(feature_vector.reshape(1, -1))[0]
            print(class_id, type(class_id))
            label = classes[int(class_id)]

            print(label)
            if label == 'speech':
                dur = st[1] - st[0]
                # print('in hereas')
                if merge:
                    seg_prev = []
                    # print('in hereasq12')
                    if len(stack) > 0:
                        seg_prev = stack.pop()

                    if len(seg_prev) > 0 and st[1] - seg_prev[0] > threshold:
                        # print('in hereas4')
                        seg = [st[0], st[1], label]
                        stack.append(seg_prev)
                        stack.append(seg)
                        merge = True
                    elif len(seg_prev) > 0:
                        # print('in hereasqw345')
                        seg = [seg_prev[0], st[1], label]
                        stack.append(seg)
                        merge = True
                    else:
                        seg = [st[0], st[1], label]
                        stack.append(seg)
                        merge = True
                else:
                    # print('in hereas2')
                    seg = [st[0], st[1], label]
                    stack.append(seg)
                    merge = True

            else:
                merge = False
            print(i, merge)
        # print(len(segmentLimits), len(stack))
        for sn in stack:
            # print(type(wavFile), sn[0].shape, sn[1].shape, type(sn[0]), type(sn[1]))

            strName = base + "_" + "{:.3f}".format(sn[0]) + "_" + "{:.3f}".format(sn[1])
            if sn[2] == 'speech':
                strOut = folder + base + "_" + "{:.3f}".format(sn[0]) + "_" + "{:.3f}".format(sn[1]) + ".wav"

                wavfile.write(strOut, fs, x[int(fs * sn[0]):int(fs * sn[1])])
                segfile.write(strName + ' ' + base + ' ' + "{:.3f}".format(sn[0]) + ' ' + "{:.3f}".format(sn[1]) + "\n")
            segfile2.write(strName + ' ' + "{:.3f}".format(sn[0]) + ' ' + "{:.3f}".format(sn[1]) + ' ' + sn[2] + "\n")
        # close the per-file segment lists before moving to the next file
        segfile.close()
        segfile2.close()
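A usage sketch for the wrapper above; every argument below is an illustrative placeholder rather than a value from the original project, and the model path assumes one of the pretrained pyAudioAnalysis classifiers. Note that inputFolder and outFolder should end with a path separator, because the function concatenates them directly with file patterns and base names.

vadFolderWrapperMergedByTh(inputFolder="recordings/",
                           outFolder="vad_out/",
                           smoothingWindow=0.5,
                           weight=0.3,
                           model_name="data/models/svm_rbf_sm",
                           threshold=10.0)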
コード例 #30
0
'''
Add types of Features
'''

data_dir = "C:/Users/MADHUKAR/Desktop/test/abc/*.wav"
audio_files = glob(data_dir)

for filename in range(0, len(audio_files), 1):
    [Fs, x] = audioBasicIO.read_audio_file(audio_files[filename])
    Mono_Signal = audioBasicIO.stereo_to_mono(x)
    print(Fs)

    #short term features
    [Feature,
     Feature_Names] = ShortTermFeatures.feature_extraction(Mono_Signal,
                                                           Fs,
                                                           0.050 * Fs,
                                                           0.025 * Fs,
                                                           deltas=True)

    #mid term features
    [mid_features, short_features, mid_feature_names
     ] = MidTermFeatures.mid_feature_extraction(Mono_Signal, Fs, 1.0 * Fs,
                                                0.75 * Fs, 0.050 * Fs,
                                                0.005 * Fs)
    #mid_feature_extraction(signal, sampling_rate, mid_window, mid_step, short_window, short_step)

    print(Feature_Names)
    print(Feature)
    print(mid_feature_names)
    print(mid_features)
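As an optional follow-up (not part of the original example), the mid-term statistics of each file can be written to CSV instead of printed, one row per mid-term window; this would sit at the end of the loop body, and out_path is a placeholder.

import pandas as pd

out_path = "mid_features.csv"
pd.DataFrame(mid_features.T, columns=mid_feature_names).to_csv(out_path,
                                                               index=False)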