def preprocessing(y, sr):
    # NOTE: relies on module-level constants pre_emphasis, Stride, Window_size,
    # Num_mels and Num_Frame (typical values appear in extract_feature below).

    # Resampling to 16 kHz
    if sr != 16000:
        sr_re = 16000  # target sampling rate
        y = librosa.resample(y, sr, sr_re)
        sr = sr_re

    # Denoising
    y[np.argwhere(y == 0)] = 1e-10
    y_denoise = scipy.signal.wiener(y, mysize=None, noise=None)

    # Pre-emphasis filter
    y_Emphasis = np.append(y_denoise[0], y_denoise[1:] - pre_emphasis * y_denoise[:-1])

    # Normalization (peak)
    y_max = max(y_Emphasis)
    y_Emphasis = y_Emphasis / y_max  # normalize for VAD

    # Voice Activity Detection (VAD)
    vad_mode = 2  # VAD mode = 0 ~ 3
    y_vad = trim(y_Emphasis, sr, vad_mode=vad_mode, thr=0.01)  # trim using VAD
    if y_vad is None:
        y_vad = y_Emphasis

    # De-normalization
    y_vad = y_vad * y_max

    # Obtain the mel spectrogram
    S = librosa.feature.melspectrogram(y=y_vad, sr=sr, hop_length=int(sr * Stride),
                                       n_fft=int(sr * Window_size),
                                       n_mels=Num_mels, power=2.0)
    EPS = 1e-8
    S = np.log(S + EPS)
    r, Frame_length = S.shape

    # Obtain the normalized mel spectrogram
    S_norm = (S - np.mean(S)) / np.std(S)

    # Zero padding
    Input_Mels = np.zeros((r, Num_Frame), dtype=float)
    if Frame_length < Num_Frame:
        Input_Mels[:, :Frame_length] = S_norm[:, :Frame_length]
    else:
        Input_Mels[:, :Num_Frame] = S_norm[:, :Num_Frame]

    # Input_Mels = np.expand_dims(Input_Mels, axis=0)
    # Input_Mels = np.transpose(Input_Mels, (0, 2, 1))
    # Input_Mels = np.expand_dims(Input_Mels, axis=-1)
    return Input_Mels, Frame_length
def preprocessing(y, sr, num_frame=1500, stride=0.01, window_size=0.025,
                  num_mels=40, pre_emphasis=0.97, is_mel=False):
    # Resampling to 16 kHz
    if sr != 16000:
        sr_re = 16000  # target sampling rate
        y = librosa.resample(y, sr, sr_re)
        sr = sr_re

    # Denoising
    y[np.argwhere(y == 0)] = 1e-10
    y_denoise = scipy.signal.wiener(y, mysize=None, noise=None)

    # Pre-emphasis filter
    y_emphasis = np.append(y_denoise[0],
                           y_denoise[1:] - pre_emphasis * y_denoise[:-1])

    # Normalization (peak)
    y_max = max(y_emphasis)
    y_emphasis = y_emphasis / y_max

    # Voice Activity Detection (VAD)
    vad_mode = 2  # VAD mode = 0 ~ 3
    y_vad = trim(y_emphasis, sr, vad_mode=vad_mode)
    if y_vad is None:
        y_vad = y_emphasis

    # De-normalization
    y_vad = y_vad * y_max

    # Obtain the mel spectrogram
    S = librosa.feature.melspectrogram(y=y_vad, sr=sr,
                                       hop_length=int(sr * stride),
                                       n_fft=int(sr * window_size),
                                       n_mels=num_mels, power=2.0)

    # Mel or log mel
    EPS = 1e-8
    S = np.log(S + EPS)
    r, frame_length = S.shape
    print('\n** log mel **')
    print('S.shape', S.shape)

    # Obtain the normalized mel spectrogram
    s_norm = (S - np.mean(S)) / np.std(S)

    # Zero padding
    input_mels = np.zeros((r, num_frame), dtype=float)
    if frame_length < num_frame:
        input_mels[:, :frame_length] = s_norm[:, :frame_length]
    else:
        input_mels[:, :num_frame] = s_norm[:, :num_frame]

    return input_mels, frame_length
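# A minimal driver for the parameterized variant above, as a sketch only:
# "speech.wav" is a placeholder path, and the imports mirror what these
# snippets rely on (librosa, numpy, scipy, and pyvad's trim).
import librosa
import numpy as np
import scipy.signal
from pyvad import trim

y, sr = librosa.load("speech.wav", sr=None)  # keep the native sampling rate
input_mels, frame_length = preprocessing(y, sr, num_frame=1500, num_mels=40)
print(input_mels.shape, frame_length)  # (40, 1500) and the number of valid frames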
def _load_wav(self, wav_file):
    sig, sample_rate = librosa.load(wav_file, sr=self.config.sample_rate)
    tmp = sig
    if self.config.use_vad:
        sig = trim(sig, sample_rate, fs_vad=self.config.sample_rate,
                   hoplength=30, thr=0, vad_mode=2)
    if sig is None:
        return tmp
    else:
        return sig
def extract_feature(y, sr):
    ### Pre-processing
    Num_Frame = 1500     # max wave length (15 sec)
    Stride = 0.01        # stride (10 ms)
    Window_size = 0.025  # filter window size (25 ms)
    Num_data = 1
    Num_mels = 40        # number of mel filters
    pre_emphasis = 0.97  # pre-emphasis filter coefficient

    # Resampling to 16 kHz
    if sr != 16000:
        sr_re = 16000  # target sampling rate
        y = librosa.resample(y, sr, sr_re)
        sr = sr_re

    # Denoising
    y[np.argwhere(y == 0)] = 1e-10
    y_denoise = scipy.signal.wiener(y, mysize=None, noise=None)

    # Pre-emphasis filter: boosts the high-frequency components
    y_Emphasis = np.append(y_denoise[0], y_denoise[1:] - pre_emphasis * y_denoise[:-1])

    # Normalization (peak)
    y_max = max(y_Emphasis)
    y_Emphasis = y_Emphasis / y_max  # normalize for VAD

    # Voice Activity Detection (VAD)
    vad_mode = 2  # VAD mode = 0 ~ 3
    y_vad = trim(y_Emphasis, sr, vad_mode=vad_mode, thr=0.01)  # trim using VAD
    if y_vad is None:
        y_vad = y_Emphasis

    # De-normalization
    y_vad = y_vad * y_max

    # Obtain the mel spectrogram
    S = librosa.feature.melspectrogram(y=y_vad, sr=sr, hop_length=int(sr * Stride),
                                       n_fft=int(sr * Window_size),
                                       n_mels=Num_mels, power=2.0)
    r, Frame_length = S.shape
    S = np.log(S + 1e-8)

    # Obtain the normalized mel spectrogram
    S_norm = (S - np.mean(S)) / np.std(S)
    return S_norm
import torchaudio as ta
from pyvad import trim

def load(file, do_vad=True):
    sig, sr = ta.load(file, channels_first=True, normalization=True)
    assert sr == 16000
    if do_vad:
        # This pyvad version returns (start, end) sample indices
        start, end = trim(sig.transpose(0, 1).numpy(), fs=sr, fs_vad=16000,
                          hop_length=30, vad_mode=2)
        if start != 0 and end != 0:
            return sig[:, start:end]
        else:
            return sig
    else:
        return sig
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from itertools import product

import numpy as np
from librosa import load
from pyvad import trim

fs_vads = (8000, 16000, 32000, 48000)
hops = (10, 20, 30)
vad_modes = (0, 1, 2, 3)

name = "voice/arctic_a0007.wav"
data, fs = load(name, sr=None)
# NOTE: this version calls trim with fixed arguments, ignoring the swept
# fs_vad/hop/vad_mode values (compare the variant below).
for fs_vad, hop, vad_mode in product(fs_vads, hops, vad_modes):
    vact = trim(data, fs, fs_vad=16000, hoplength=30, vad_mode=0)

fs = 16000
data = (np.random.rand(fs * 3) - 0.5) * 0.1
for fs_vad, hop, vad_mode in product(fs_vads, hops, vad_modes):
    vact = trim(data, fs, fs_vad=16000, hoplength=30, vad_mode=0)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from itertools import product

import numpy as np
from librosa import load
from pyvad import trim

fs_vads = (8000, 16000, 32000, 48000)
hops = (10, 20, 30)
vad_modes = (0, 1, 2, 3)

# Real speech: trim should find a non-empty voiced region
name = "voice/arctic_a0007.wav"
data, fs = load(name, sr=None)
for fs_vad, hop, vad_mode in product(fs_vads, hops, vad_modes):
    vact = trim(data, fs, fs_vad=fs_vad, hop_length=hop, vad_mode=vad_mode)
    assert vact[1] - vact[0] > 0, vact

# Low-level noise: trim should find no voiced region
data = (np.random.rand(fs * 3) - 0.5) * 0.05
for fs_vad, hop, vad_mode in product(fs_vads, hops, vad_modes):
    vact = trim(data, fs, fs_vad=fs_vad, hop_length=hop, vad_mode=vad_mode)
    assert vact[1] - vact[0] == 0, vact
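# As the asserts above imply, in this pyvad version trim returns the
# (start, end) sample indices of the detected voiced region rather than the
# trimmed waveform, so callers slice the signal themselves. A minimal sketch,
# assuming the same test file:
from librosa import load
from pyvad import trim

data, fs = load("voice/arctic_a0007.wav", sr=None)
start, end = trim(data, fs, fs_vad=16000, hop_length=30, vad_mode=2)
voiced = data[start:end]  # keep only the detected speech segment
print(f"kept {end - start} of {len(data)} samples")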
def preprocessing(y, sr):
    # NOTE: relies on module-level constants (Stride, Window_size, Num_mels,
    # Num_Frame) and on the helpers Feature_DCT_DST, Feature_Hist and
    # Feature_Moments defined elsewhere.

    # Resampling to 16 kHz
    if sr != 16000:
        sr_re = 16000  # target sampling rate
        y = librosa.resample(y, sr, sr_re)
        sr = sr_re

    # Denoising
    y[np.argwhere(y == 0)] = 1e-10
    y_denoise = scipy.signal.wiener(y, mysize=None, noise=None)

    # Pre-emphasis filter
    y_Emphasis = np.append(y_denoise[0], y_denoise[1:] - pre_emphasis * y_denoise[:-1])

    # Normalization (peak)
    y_max = max(y_Emphasis)
    y_Emphasis = y_Emphasis / y_max  # normalize for VAD

    # Voice Activity Detection (VAD)
    vad_mode = 2  # VAD mode = 0 ~ 3
    y_vad = trim(y_Emphasis, sr, vad_mode=vad_mode, thr=0.01)  # trim using VAD module
    if y_vad is None:
        y_vad = y_Emphasis

    # De-normalization
    y_vad = y_vad * y_max

    # Obtain the mel spectrogram
    S = librosa.feature.melspectrogram(y=y_vad, sr=sr, hop_length=int(sr * Stride),
                                       n_fft=int(sr * Window_size),
                                       n_mels=Num_mels, power=2.0)
    r, Frame_length = S.shape

    # Obtain the normalized mel spectrogram
    S_norm = (S - np.mean(S)) / np.std(S)

    # Zero padding
    Input_Mels = np.zeros((r, Num_Frame), dtype=float)
    if Frame_length < Num_Frame:
        Input_Mels[:, :Frame_length] = S_norm[:, :Frame_length]
    else:
        Input_Mels[:, :Num_Frame] = S_norm[:, :Num_Frame]

    # Obtain the log mel spectrogram
    w = 1e+6
    S_mel_log = np.log(1 + w * S)

    # Features
    Input_DCT, Input_DST = Feature_DCT_DST(S_mel_log)
    Input_DCT = np.expand_dims(np.expand_dims(Input_DCT, axis=0), axis=-1)
    Input_DST = np.expand_dims(np.expand_dims(Input_DST, axis=0), axis=-1)
    Input_Hist = np.expand_dims(np.expand_dims(Feature_Hist(S_mel_log), axis=0), axis=-1)
    Input_Moments = np.expand_dims(np.expand_dims(Feature_Moments(S_mel_log), axis=0), axis=-1)

    return Input_Mels, Input_DCT, Input_DST, Input_Hist, Input_Moments, Frame_length
# plt.show()

# (Fragment: assumes data, fs, time and the fs_vads/hops/vad_modes grids from
# the script above, plus matplotlib.pyplot as plt and pyvad's vad and trim.)

# Plot the waveform against the frame-level VAD decisions
for fs_vad, hop, vad_mode in product(fs_vads, hops, vad_modes):
    vact = vad(data, fs, fs_vad=16000, hoplength=30, vad_mode=0)
    fig, ax1 = plt.subplots()
    ax1.plot(time, data, color='b', label='speech waveform')
    ax1.set_xlabel("TIME [s]")
    ax2 = ax1.twinx()
    ax2.plot(time, vact, color="r", label='vad')
    plt.yticks([0, 1], ('unvoice', 'voice'))
    ax2.set_ylim([-0.01, 1.01])
    plt.legend()
    # plt.show()

# Output the trimmed audio
trimed = trim(data, fs, fs_vad=16000, hoplength=30, vad_mode=3)
time = np.linspace(0, len(trimed) / fs, len(trimed))  # time axis
fig, ax1 = plt.subplots()
ax1.plot(time, trimed, color='b', label='speech waveform')
ax1.set_xlabel("TIME [s]")
plt.show()

plt.plot(trimed)
plt.show()
def preprocessing(wav, sampling_rate):
    """
    Args:
        wav: waveform
        sampling_rate: sampling rate

    Returns:
        input_mels
    """
    # Resampling to 16 kHz
    if sampling_rate != 16000:
        sampling_rate_re = 16000  # target sampling rate
        wav = librosa.resample(wav, sampling_rate, sampling_rate_re)
        sampling_rate = sampling_rate_re

    # Denoising
    wav[np.argwhere(wav == 0)] = 1e-10
    wav_denoise = scipy.signal.wiener(wav, mysize=None, noise=None)

    # Pre-emphasis filter
    wav_emphasis = np.append(
        wav_denoise[0], wav_denoise[1:] - PRE_EMPHASIS_COEFF * wav_denoise[:-1])

    # Normalization (peak)
    wav_max = np.abs(wav_emphasis).max() / 0.9
    wav_emphasis = wav_emphasis / wav_max  # normalize for VAD

    # Voice Activity Detection (VAD)
    vad_mode = 2  # VAD mode = 0 ~ 3
    wav_vad = trim(wav_emphasis, sampling_rate, vad_mode=vad_mode, thr=0.01)  # trim
    if wav_vad is None:
        wav_vad = wav_emphasis

    # De-normalization
    wav_vad = wav_vad * wav_max

    # Obtain the spectrogram
    stft_vad = librosa.core.stft(y=wav_vad,
                                 hop_length=int(sampling_rate * STRIDE),
                                 n_fft=int(sampling_rate * WINDOW_SIZE))
    spec = np.abs(stft_vad) ** 2

    # Mel spectrogram
    mel_spec = librosa.feature.melspectrogram(S=spec, n_mels=NUM_MELS)

    # Log-scaled mel spectrogram
    log_weight = 1e+6
    log_mel_spec = np.log(1 + log_weight * mel_spec)
    frame_length = log_mel_spec.shape[1]

    # Zero padding
    input_mels = np.zeros((NUM_MELS, MAX_FRAME_LENGTH), dtype=float)
    if frame_length < MAX_FRAME_LENGTH:
        input_mels[:, :frame_length] = log_mel_spec[:, :frame_length]
    else:
        input_mels[:, :MAX_FRAME_LENGTH] = log_mel_spec[:, :MAX_FRAME_LENGTH]

    return input_mels
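# The variant above relies on module-level constants. A plausible set of
# definitions, mirroring the values hard-coded in extract_feature earlier
# (these are assumptions, not part of the original module):
PRE_EMPHASIS_COEFF = 0.97  # pre-emphasis filter coefficient
STRIDE = 0.01              # hop length (10 ms)
WINDOW_SIZE = 0.025        # analysis window (25 ms)
NUM_MELS = 40              # number of mel filters
MAX_FRAME_LENGTH = 1500    # max number of frames (15 s at a 10 ms hop)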