Example #1
def clean_keyword(audiofile, keyword):
    '''
    taken from https://github.com/F-Tag/python-vad/blob/master/example.ipynb
    '''
    show = False
    curdir = os.getcwd()
    data, fs = librosa.core.load(audiofile)
    time = np.linspace(0, len(data) / fs, len(data))
    keptfiles = list()
    try:
        vact = vad(data, fs, fs_vad=16000, hop_length=30, vad_mode=3)
        vact = list(vact)
        while len(time) > len(vact):
            vact.append(0.0)
        utterances = list()

        # find voiced regions as [start, end] index pairs
        start = 0
        for i in range(1, len(vact)):
            if vact[i] != vact[i - 1]:
                # voice activity shift
                if vact[i] == 1:
                    start = i
                else:
                    # end of a voiced region
                    utterances.append([start, i])

        print(utterances)
        vact = np.array(vact)

        tempfiles = list()
        for i in range(len(utterances)):
            trimmed = data[utterances[i][0]:utterances[i][1]]
            tempfile = str(uuid.uuid4()) + '.wav'
            # librosa.output.write_wav was removed in librosa 0.8; see the
            # soundfile-based sketch after this example for a replacement
            librosa.output.write_wav(tempfile, trimmed, fs)
            tempfiles.append(tempfile)

        for i in range(len(tempfiles)):
            if os.path.getsize(tempfiles[i]) > 20000:
                transcript = transcribe_audiofile(tempfiles[i])
                print('TRANSCRIPT --> %s' % (transcript))
                if transcript == keyword:
                    keptfiles.append(tempfiles[i])
                else:
                    os.remove(tempfiles[i])
            else:
                os.remove(tempfiles[i])
    except Exception as error:
        # typically ValueError: when data is float, it must satisfy -1.0 <= data <= 1.0
        print('ERROR - %s' % error)

    os.remove(audiofile)
    return keptfiles
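Note: librosa.output.write_wav was removed in librosa 0.8, so the write step above only runs on older librosa releases. A minimal sketch of the same step using soundfile instead (the input file name is a placeholder):

import uuid

import librosa
import soundfile as sf

# placeholder input file; any mono wav works
data, fs = librosa.load('speech.wav', sr=None)

# write a trimmed slice the way clean_keyword writes each utterance,
# but with soundfile instead of the removed librosa.output.write_wav
trimmed = data[:fs]  # e.g. the first second of audio
tempfile = str(uuid.uuid4()) + '.wav'
sf.write(tempfile, trimmed, fs)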
Example #2
def webrtc_segmentation(file_path: str, sample_rate: int, hop_length: int,
                        aggressiveness: int) -> tuple:
    data, fs = librosa.load(file_path, mono=True, sr=sample_rate)
    data /= np.max(np.abs(data))  # peak-normalise to [-1, 1]
    vact = vad(data,
               fs,
               fs_vad=sample_rate,
               hop_length=hop_length,
               vad_mode=aggressiveness)
    segments = list()

    previous = 0
    for idx, pred in enumerate(vact):
        if previous == 0 and pred == 1:
            start = idx
            previous = 1
        if previous == 1 and pred == 0:
            end = idx
            previous = 0
            segments.append((start, end))
    return segments, data
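A minimal usage sketch for webrtc_segmentation (the file name is a placeholder; pyvad.vad labels every sample, so start/end index directly into data):

segments, data = webrtc_segmentation('speech.wav',
                                     sample_rate=16000,
                                     hop_length=30,
                                     aggressiveness=3)

for start, end in segments:
    chunk = data[start:end]  # samples of one voiced region
    print('voiced region: %.2fs - %.2fs (%d samples)' %
          (start / 16000, end / 16000, len(chunk)))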
Example #3
def clean_utterances(audiofile):
    '''
    taken from https://github.com/F-Tag/python-vad/blob/master/example.ipynb
    '''
    show = False
    curdir = os.getcwd()
    data, fs = librosa.core.load(audiofile)
    time = np.linspace(0, len(data) / fs, len(data))
    vact = vad(data, fs, fs_vad=16000, hop_length=30, vad_mode=3)
    vact = list(vact)
    while len(time) > len(vact):
        vact.append(0.0)
    utterances = list()

    # find voiced regions as [start, end] index pairs
    start = 0
    for i in range(1, len(vact)):
        if vact[i] != vact[i - 1]:
            # voice activity shift
            if vact[i] == 1:
                start = i
            else:
                # end of a voiced region
                utterances.append([start, i])

    print(utterances)
    vact = np.array(vact)
    files = list()
    for i in range(len(utterances)):
        trimmed = data[utterances[i][0]:utterances[i][1]]
        tempfile = str(uuid.uuid4()) + '.wav'
        librosa.output.write_wav(tempfile, trimmed, fs)
        files.append(tempfile)

    os.remove(audiofile)
    return files
Example #4
def compute_WAPRQ(ref_path,
                  test_path,
                  sr=16000,
                  n_mfcc=12,
                  fmax=5000,
                  patch_size=0.4,
                  sigma=np.array([[1, 1], [3, 2], [1, 3]])):

    # Inputs:
    # ref_path: path of the reference speech
    # test_path: path of the degraded speech
    # sr: sampling frequency, Hz
    # n_mfcc: number of MFCCs
    # fmax: cutoff frequency
    # patch_size: size of each patch in s
    # sigma: step size condition for DTW

    # Output:
    # WARP-Q quality score between ref_path and test_path

    ####################### Load speech files #################################
    # Load Ref Speech
    if ref_path[-4:] == '.wav':
        speech_Ref, sr_Ref = librosa.load(ref_path, sr=sr)
    elif ref_path[-4:] == '.SRC':  # for the ITU-T database, if applicable
        speech_Ref, sr_Ref = sf.read(ref_path,
                                     format='RAW',
                                     channels=1,
                                     samplerate=16000,
                                     subtype='PCM_16',
                                     endian='LITTLE')
        if sr_Ref != sr:
            speech_Ref = librosa.resample(speech_Ref, sr_Ref, sr)
            sr_Ref = sr

    # Load Coded Speech
    if test_path[-4:] == '.wav':
        speech_Coded, sr_Coded = librosa.load(test_path, sr=sr)
    elif test_path[-4:] == '.OUT':  # for the ITU-T database, if applicable
        speech_Coded, sr_Coded = sf.read(test_path,
                                         format='RAW',
                                         channels=1,
                                         samplerate=16000,
                                         subtype='PCM_16',
                                         endian='LITTLE')
        if sr_Coded != sr:
            speech_Coded = librosa.resample(speech_Coded, sr_Coded, sr)
            sr_Coded = sr

    if sr_Ref != sr_Coded:
        raise ValueError(
            "Reference and degraded signals should have same sampling rate!")

    # Clip amplitudes to [-1, 1] in case resampling (if applicable) pushed them
    # slightly out of range; observed with the TCD-VOIP database only
    speech_Ref[speech_Ref > 1] = 1.0
    speech_Ref[speech_Ref < -1] = -1.0

    speech_Coded[speech_Coded > 1] = 1.0
    speech_Coded[speech_Coded < -1] = -1.0

    ###########################################################################

    win_length = int(0.032 * sr)  # 32 ms frame
    hop_length = int(0.004 * sr)  # 4 ms hop
    #hop_length = int(0.016*sr)

    n_fft = 2 * win_length
    lifter = 3

    # DTW Parameters
    Metric = 'euclidean'

    # VAD Parameters
    hop_size_vad = 30
    sr_vad = sr
    aggresive = 0

    # VAD for Ref speech
    vact1 = vad(speech_Ref,
                sr,
                fs_vad=sr_vad,
                hop_length=hop_size_vad,
                vad_mode=aggresive)
    speech_Ref_vad = speech_Ref[vact1 == 1]

    # VAD for Coded speech
    vact2 = vad(speech_Coded,
                sr,
                fs_vad=sr_vad,
                hop_length=hop_size_vad,
                vad_mode=aggresive)
    speech_Coded_vad = speech_Coded[vact2 == 1]

    # Compute MFCC features for the two signals

    mfcc_Ref = librosa.feature.mfcc(speech_Ref_vad,
                                    sr=sr,
                                    n_mfcc=n_mfcc,
                                    fmax=fmax,
                                    n_fft=n_fft,
                                    win_length=win_length,
                                    hop_length=hop_length,
                                    lifter=lifter)
    mfcc_Coded = librosa.feature.mfcc(speech_Coded_vad,
                                      sr=sr,
                                      n_mfcc=n_mfcc,
                                      fmax=fmax,
                                      n_fft=n_fft,
                                      win_length=win_length,
                                      hop_length=hop_length,
                                      lifter=lifter)

    # Feature Normalisation using CMVNW method
    mfcc_Ref = speechpy.processing.cmvnw(mfcc_Ref.T,
                                         win_size=201,
                                         variance_normalization=True).T
    mfcc_Coded = speechpy.processing.cmvnw(mfcc_Coded.T,
                                           win_size=201,
                                           variance_normalization=True).T

    # Divide MFCC features of Coded speech into patches
    cols = int(patch_size / (hop_length / sr))
    window_shape = (np.size(mfcc_Ref, 0), cols)
    step = int(cols / 2)

    mfcc_Coded_patch = view_as_windows(mfcc_Coded, window_shape, step)

    Acc = []
    band_rad = 0.25
    weights_mul = np.array([1, 1, 1])

    # Compute the alignment cost between each patch and the Ref MFCC
    for i in range(mfcc_Coded_patch.shape[1]):

        patch = mfcc_Coded_patch[0][i]

        D, P = librosa.sequence.dtw(X=patch,
                                    Y=mfcc_Ref,
                                    metric=Metric,
                                    step_sizes_sigma=sigma,
                                    weights_mul=weights_mul,
                                    band_rad=band_rad,
                                    subseq=True,
                                    backtrack=True)

        P_librosa = P[::-1, :]
        b_ast = P_librosa[-1, 1]

        Acc.append(D[-1, b_ast] / D.shape[0])

    # Final score
    return np.median(Acc)
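A minimal usage sketch for compute_WAPRQ, assuming the imports the function relies on are already in scope (librosa, numpy as np, soundfile as sf, speechpy, pyvad.vad and skimage.util.view_as_windows); the file names are placeholders:

# the returned score is the median normalised subsequence-DTW cost over all patches
score = compute_WAPRQ('reference.wav', 'degraded.wav', sr=16000)
print('WARP-Q score: %.3f' % score)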
Example #5
def detect_voice(audio, sr):
    # note: current pyvad spells this keyword hop_length; older releases used hoplength
    vact = vad(audio, sr, vad_mode=3, hop_length=30)
    return vact
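A minimal usage sketch for detect_voice (the file name is a placeholder; pyvad.vad returns one 0/1 label per input sample, so the result can be used as a mask on the signal):

import librosa
import numpy as np

audio, sr = librosa.load('speech.wav', sr=16000)
vact = detect_voice(audio, sr)

voiced = audio[np.asarray(vact) == 1]  # keep only the voiced samples
print('%.2fs of speech out of %.2fs' % (len(voiced) / sr, len(audio) / sr))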
Example #6
        temp = np.load(target_path + '/vocals/' + file[:-9] + '.npz')
        ye_vocals = temp['arr_0']
        temp = np.load(target_path + '/accompaniment/' + file[:-9] + '.npz')
        ye_accomp = temp['arr_0']

        # calculation of PES (predicted energy at silence) per track, segment by segment

        print("Estimating predicted energy at silence...")

        ln = len(ye_vocals)
        seglen = int(sys.argv[3])

        for i in range(0, ln - seglen, seglen):

            y = librosa.resample(yr_vocals[i:i + seglen], 22050, 16000)
            vact_labels = pyvad.vad(y, 16000, hop_length=20, vad_mode=3)

            # vocal activity labels of the reference track (one label kept per 320 samples)
            vact_labels = vact_labels[0:-1:320]

            ye = librosa.resample(ye_vocals[i:i + seglen], 22050, 16000)
            veact_labels = pyvad.vad(ye, 16000, hop_length=20, vad_mode=3)

            # vocal activity labels of the estimated track
            veact_labels = veact_labels[0:-1:320]

            l = len(vact_labels)
            # mean vact_label of the reference segment, thresholded at 0.5
            vact_temp = np.mean(vact_labels) > 0.5
            if (vact_temp == 0):
Example #7
if __name__ == '__main__':

    listen_th = threading.Thread(target=listen)
    listen_th.daemon = True
    listen_th.start()
    times = list()  # Profile the prediction inference speed

    try:
        print("Listening...")
        while True:
            if len(buffer) < buffer.maxlen:
                continue
            audio = np.hstack(list(buffer))
            try:
                vact = vad(audio, RATE, fs_vad=RATE, hop_length=30, vad_mode=1)
            except Exception:
                # Skip when audio clips
                continue
            if np.mean(vact) > 0.5:
                audio /= np.max(np.abs(audio))
                audio = audio[None, :]
                now = time.time()
                embeddings = embedder.run(audio)
                prediction = cluster_obj.update_predict(np.squeeze(embeddings))
                times.append(time.time() - now)
                print("                                 ", end="\r")
                print(f"Detected speaker {prediction}", end="\r")
            else:
                print("                                 ", end="\r")
Example #8
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
from librosa import load
from pyvad import vad

fs_vads = (8000, 16000, 32000, 48000)
hops = (10, 20, 30)
vad_modes = (0, 1, 2, 3)

#name ='E:/项目/ASRInLesson/lessons/test05mi.wav'
name = 'E:/项目/ASRInLesson/same_segment/audio77_T.wav'
data, fs = load(name, sr=None)
time = np.linspace(0, len(data) / fs, len(data))  # time axis
fig, ax0 = plt.subplots()
plt.plot(time, data)

#plt.show()

for fs_vad, hop, vad_mode in product(fs_vads, hops, vad_modes):
    vact = vad(data, fs, fs_vad=fs_vad, hop_length=hop, vad_mode=vad_mode)

fig, ax1 = plt.subplots()
ax1.plot(time, data, color='b', label='speech waveform')
ax1.set_xlabel("TIME [s]")

ax2 = ax1.twinx()
ax2.plot(time, vact, color="r", label='vad')
plt.yticks([0, 1], ('unvoice', 'voice'))
ax2.set_ylim([-0.01, 1.01])

plt.legend()

#plt.show()

# output the trimmed audio
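A sketch of the trimming step hinted at by the comment above, continuing the script with its data, vact and fs (the output file name is a placeholder):

import soundfile as sf

voiced = data[np.asarray(vact) == 1]  # keep only the samples marked as voiced
sf.write('audio77_T_trimmed.wav', voiced, fs)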
Example #9
from itertools import product

import numpy as np
from librosa import load
from pyvad import vad

fs_vads = (8000, 16000, 32000, 48000)
hops = (10, 20, 30)
vad_modes = (0, 1, 2, 3)
fss = [16000, 22050]

name = "voice/arctic_a0007.wav"

for fs in fss:

    data, fs_r = load(name, sr=fs)
    for fs_vad, hop, vad_mode in product(fs_vads, hops, vad_modes):
        # print(fs, fs_vad, hop, vad_mode)
        vact = vad(data,
                   fs_r,
                   fs_vad=fs_vad,
                   hop_length=hop,
                   vad_mode=vad_mode)
        assert vact.sum() > data.size // 2, vact.sum()
        """
        import matplotlib.pyplot as plt
        plt.plot(data)
        plt.plot(vact)
        plt.savefig(("voice_"+str(fs_r)+str(fs_vad)+str(hop)+str(vad_mode)+".png"))
        plt.close()
        """
    """
    data = (np.random.rand(fs*3)-0.5)*0.1
    for fs_vad, hop, vad_mode in product(fs_vads, hops, vad_modes):
        print(fs, fs_vad, hop, vad_mode)
        vact = vad(data, fs, fs_vad=fs_vad, hop_length=hop, vad_mode=vad_mode)