Example #1
def get_coefs(wav_file_path):
    sample_rate, x = wavfile.read(wav_file_path)
    # al.play(x.astype(float) / x.max(), fs=sample_rate)

    frames = librosa.util.frame(
        x,
        frame_length=frame_length,
        hop_length=hop_length).astype(np.float64).T
    frames *= pysptk.blackman(frame_length)
    f0 = pysptk.swipe(
        x.astype(np.float64),
        fs=sample_rate,
        hopsize=hop_length,
        min=50,
        max=500)

    # order = 40
    # alpha = 0.41
    mc = np.apply_along_axis(
        pysptk.mcep,
        1,
        frames,
        order,
        alpha)

    return sample_rate, f0, mc  # sample rate, F0 contour, mel-cepstral coefficients
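The function above relies on module-level names (frame_length, hop_length, order, alpha) that are not shown in the excerpt. A minimal usage sketch with plausible definitions, assuming the values hinted at by the commented-out lines; the real project may use different settings:

import numpy as np
import librosa
import pysptk
from scipy.io import wavfile

# Assumed module-level settings (not part of the original snippet)
frame_length = 1024   # hypothetical analysis window length in samples
hop_length = 80       # hypothetical hop size in samples
order = 40            # mel-cepstral order, per the commented-out "# order = 40"
alpha = 0.41          # frequency-warping factor, per "# alpha = 0.41"

sample_rate, f0, mc = get_coefs("speech.wav")  # "speech.wav" is a placeholder path
print(mc.shape)  # (n_frames, order + 1)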
Example #2
def get_synt_wav(wav_file_path):
    # Synthesis from mel-cepstrum
    sample_rate, x = wavfile.read(wav_file_path)
    # assert sample_rate == 16000
    # al.play(x.astype(float) / x.max(), fs=sample_rate)  # Audio(x, rate=sample_rate)

    # all pysptk functions assume the input array is C-contiguous
    # and has np.float64 element type
    frames = librosa.util.frame(
        x,
        frame_length=frame_length,
        hop_length=hop_length).astype(np.float64).T

    # Windowing
    frames *= pysptk.blackman(frame_length)

    # assert frames.shape[1] == frame_length

    # F0 estimation
    f0 = pysptk.swipe(
        x.astype(np.float64),
        fs=sample_rate,
        hopsize=hop_length,
        min=50,
        max=500)

    generator = excite.ExcitePulse(sample_rate, hop_length, False)
    source_excitation = generator.gen(f0)

    # apply function along with `time` axis (=1)
    mc = np.apply_along_axis(
        pysptk.mcep,
        1,
        frames,
        order,
        alpha)

    # Convert mel-cesptrum to MLSADF coefficients
    b = np.apply_along_axis(pysptk.mc2b, 1, mc, alpha)

    synthesizer = pysptk.synthesis.Synthesizer(
        pysptk.synthesis.MLSADF(
            order=order, alpha=alpha),
        hop_length)

    x_synthesized = synthesizer.synthesis(source_excitation, b)
    # Audio(x_synthesized, rate=sample_rate)
    # al.play(x_synthesized.astype(float) / x_synthesized.max(), fs=sample_rate)
    return x_synthesized
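A possible way to save the resynthesized waveform, as a sketch; the output file name and the int16 scaling are assumptions, not part of the original code:

sr, _ = wavfile.read("speech.wav")            # "speech.wav" is a placeholder path
y = get_synt_wav("speech.wav")
y = y / np.max(np.abs(y))                     # normalize to [-1, 1]
wavfile.write("resynth.wav", sr, (y * 32767).astype(np.int16))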
Example #3
def get_random_peseudo_mcep(order=24, alpha=0.42):
    T, N = 100, 513
    frames = np.random.rand(T, N) * pysptk.blackman(N)
    mc = pysptk.mcep(frames, order=order, alpha=alpha)
    return mc
Example #4
MIN_F0 = 60
MAX_F0 = 240
ORDER = 20

IN_WAVE_FILE = "in.wav"  # input speech
OUT_WAVE_FILE = "out.wav"  # analysis-resynthesized speech

# Load the speech waveform
fs, x = wavfile.read(IN_WAVE_FILE)
x = x.astype(np.float64)

# Framing and windowing
frames = librosa.util.frame(x,
                            frame_length=FRAME_LENGTH,
                            hop_length=HOP_LENGTH).astype(np.float64).T
frames *= pysptk.blackman(FRAME_LENGTH)  # windowing (Blackman window)

# Pitch extraction
pitch = pysptk.swipe(x,
                     fs=fs,
                     hopsize=HOP_LENGTH,
                     min=MIN_F0,
                     max=MAX_F0,
                     otype="pitch")

# Generate the excitation signal (glottal source)
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# Extract linear predictive coding (LPC) coefficients by linear prediction analysis
lpc = pysptk.lpc(frames, ORDER)
lpc[:, 0] = np.log(lpc[:, 0])
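The excerpt stops right after log-compressing the LPC gain term. A plausible continuation, sketched from pysptk's synthesis API (Synthesizer and AllPoleDF), drives an all-pole filter with the pulse/noise excitation and writes OUT_WAVE_FILE; the int16 scaling is an assumption:

from pysptk.synthesis import Synthesizer, AllPoleDF

# Resynthesize by exciting an all-pole filter with the generated source signal
synthesizer = Synthesizer(AllPoleDF(order=ORDER), HOP_LENGTH)
y = synthesizer.synthesis(source_excitation, lpc)

# Write the analysis-resynthesized speech
y = y / np.max(np.abs(y))
wavfile.write(OUT_WAVE_FILE, fs, (y * 32767).astype(np.int16))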
Example #5
def __dummy_windowed_frames(source, frame_len=512, hopsize=80):
    np.random.seed(98765)
    n_frames = int(len(source) / hopsize) + 1
    windowed = np.random.randn(n_frames,
                               frame_len) * pysptk.blackman(frame_len)
    return 0.5 * 32768.0 * windowed
Example #6
import numpy as np
import librosa
import librosa.util
import pysptk
from scipy.io import wavfile
frame_length = 2048
hop_length = 512

order = 20

path = '../ETTS_newdata/data/wav/lmy00001.wav'
# LPC
sr, x = wavfile.read(path)
x = x.astype(np.float64)
librosa.util.valid_audio(x)
x = np.pad(x, int(frame_length // 2), mode='reflect')
frames = librosa.util.frame(x,
                            frame_length=frame_length,
                            hop_length=hop_length).astype(np.float64).T
frames *= pysptk.blackman(frame_length)

lpc = pysptk.lpc(frames, order)
lpc[:, 0] = np.log(lpc[:, 0])

#MFCC

y, sr = librosa.load(path)
y = y.astype(np.float64)
mfcc = librosa.feature.mfcc(y=y,
                            sr=sr,
                            n_fft=frame_length,
                            hop_length=hop_length)
print(mfcc)
Example #7
order = 25
alpha = 0.42
gamma = -0.35

# Loading pyrenn Model
net = pyrenn.loadNN('pyrennweights_2.csv')

# Input
sr, sx = wavfile.read(sourcefile)
l = len(sx)

# framing
sourceframes = librosa.util.frame(sx, frame_length=frameLength, hop_length=hop_length).astype(np.float64).T

# Windowing
sourceframes *= pysptk.blackman(frameLength)

# extract MCEPs
sourcemcepvectors = np.apply_along_axis(pysptk.mcep, 1, sourceframes, order, alpha)
# provide the source MCEPs as input to the trained neural network which gives the target MCEPs
mgc = pyrenn.NNOut(sourcemcepvectors.transpose(), net).transpose()
mgc = mgc.copy(order="C")

# Finding Log Spectrum.
logspec = np.apply_along_axis(pysptk.mgc2sp, 1, mgc, 0.41, 0.0, frameLength)
# Convert to FFT Domain.
spec = np.exp(logspec).T
# Convert to Time Domain.
output_speechover = librosa.core.istft(spec, hop_length=hop_length, win_length=frameLength, window=pysptk.blackman(frameLength))

# Output.
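The excerpt ends at the "# Output." comment. A minimal sketch of writing the converted speech to disk, assuming a hypothetical output file name:

output_speechover = output_speechover / np.max(np.abs(output_speechover))
wavfile.write("converted.wav", sr, (output_speechover * 32767).astype(np.int16))  # "converted.wav" is a placeholder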
Example #8
def __dummy_windowed_frames(source, frame_len=512, hopsize=80):
    np.random.seed(98765)
    n_frames = int(len(source) / hopsize) + 1
    windowed = np.random.randn(n_frames, frame_len) * pysptk.blackman(frame_len)
    return 0.5 * 32768.0 * windowed
Example #9
# Parameters.
frameLength = 1024
overlap = 0.25
hop_length = int(frameLength * overlap)  # librosa.util.frame expects an integer hop length
order = 25
alpha = 0.42
gamma = -0.35

# Feature Extraction.
sr, sx = wavfile.read(sourcefile)
sourceframes = librosa.util.frame(
    sx,
    frame_length=frameLength,  # framing the source audio
    hop_length=hop_length).astype(np.float64).T
sourceframes *= pysptk.blackman(frameLength)  # windowing
sourcemcepvectors = np.apply_along_axis(
    pysptk.mcep, 1, sourceframes, order,
    alpha)  # extract MCEPs of the source frames
sr, tx = wavfile.read(targetfile)
targetframes = librosa.util.frame(
    tx,
    frame_length=frameLength,  # framing the target audio
    hop_length=hop_length).astype(np.float64).T
targetframes *= pysptk.blackman(frameLength)  # windowing
targetmcepvectors = np.apply_along_axis(
    pysptk.mcep, 1, targetframes, order,
    alpha)  # extract mceps of target frames

# Normalising for feeding into RNN.
norm = min(len(sourcemcepvectors), len(targetmcepvectors))
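The excerpt stops after computing the common frame count. A sketch of the remaining step, following the pattern of Example #10 below (the variable names here are illustrative): truncate both MCEP sequences to the common length and transpose so the network sees one frame per column.

trans_source_mceps = np.transpose(sourcemcepvectors[0:norm])
trans_target_mceps = np.transpose(targetmcepvectors[0:norm])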
Example #10
frameLength = 1024
overlap = 0.25
hop_length=256
subFrameLength = frameLength * overlap
net=pyrenn.CreateNN([26,30,30,26])
order = 25
alpha = 0.41
gamma = -0.35
count=0
for sourcefile,targetfile in zip(source,target) :
    print(sourcefile,targetfile)
    sr, sx = wavfile.read(sourcefile)
    sourceframes = librosa.util.frame(sx, frame_length=frameLength,  # framing the source audio
                                      hop_length=hop_length).astype(np.float64).T
    sourceframes *= pysptk.blackman(frameLength)  # windowing
    sourcemcepvectors = np.apply_along_axis(pysptk.mcep, 1, sourceframes, order, alpha)  # extract MCEPs of the source frames
    sr, tx = wavfile.read(targetfile)
    targetframes = librosa.util.frame(tx, frame_length=frameLength,  # framing the target audio
                                      hop_length=hop_length).astype(np.float64).T
    targetframes *= pysptk.blackman(frameLength)  # windowing
    targetmcepvectors = np.apply_along_axis(pysptk.mcep, 1, targetframes, order, alpha)  # extract MCEPs of the target frames
    reslen=min(len(sourcemcepvectors),len(targetmcepvectors)) 
    transsourcemcepvectorsmod=np.empty([26,reslen])
    transtargetmcepvectorsmod=np.empty([26,reslen])
    transsourcemcepvectorsmod=np.transpose(sourcemcepvectors[0:reslen])
    transtargetmcepvectorsmod=np.transpose(targetmcepvectors[0:reslen])
    print(len(sourcemcepvectors),len(targetmcepvectors))
    # to find if there are any NANs in the MCEP vectors 
    for i in range(len(sourcemcepvectors)):
        for j in range(len(sourcemcepvectors[i])):
            if np.isnan(sourcemcepvectors[i][j]):
                print("NaN at frame {}, coefficient {}".format(i, j))
Example #11
def load_mfcc_mceps(path_to_data, config_mfcc_mceps):
    '''extract normalized mfcc and mceps from list of data path
  input:
    list of paths to data, 
    mfcc_mceps setting dictionary
  return:
    dictionary:
      key: speaker code + _ + audio name
      value: tuple (mfcc normalized, mceps normalized)
    target scaler:
      contains mcep mean and variance of target speaker in order to scale back mcep results
  '''
    _data_x = {}
    path_audios = os.listdir(path_to_data)
    total_mceps = np.empty(
        (0, config_mfcc_mceps['order_mcep'] + 1),
        float)  #used to store mean and std for denormalize results
    target_scaler = {}
    for p in path_audios:
        if p.split(".")[-1] != "wav":
            continue
        x, _ = librosa.load(path_to_data + '/' + p,
                            sr=config_mfcc_mceps["sampling_frequency"])

        mfcc_l = math.ceil(x.shape[0] / config_mfcc_mceps["hop_length"])
        mcep_l = math.ceil((x.shape[0] - config_mfcc_mceps["n_fft"]) /
                           config_mfcc_mceps["hop_length"])
        final_shape = x.shape[0] + config_mfcc_mceps["hop_length"] * (mfcc_l -
                                                                      mcep_l)
        x.resize((final_shape, ))
        frames = librosa.util.frame(
            x,
            frame_length=config_mfcc_mceps["n_fft"],
            hop_length=config_mfcc_mceps["hop_length"]).astype(np.float64).T
        # Windowing
        frames *= pysptk.blackman(config_mfcc_mceps["n_fft"], normalize=1)
        mceps = pysptk.mcep(frames, config_mfcc_mceps['order_mcep'])  #,alpha)
        total_mceps = np.vstack((total_mceps, mceps))
        mfccs = librosa.feature.mfcc(
            y=x,
            sr=config_mfcc_mceps["sampling_frequency"],
            n_mfcc=config_mfcc_mceps["order_mfcc"],
            n_fft=config_mfcc_mceps["n_fft"],
            hop_length=config_mfcc_mceps["hop_length"])
        mfccs = normalize_mfcc(
            mfccs.T).T  #transpose twice in order to normalize on right axis
        id_ = "_" + p
        _data_x[id_] = (
            mfccs.T, mceps
        )  #Don't forget mfcc.T -> now both have shape (#frames, #mfcc/mceps)

    target_scaler["mean"] = list(np.mean(total_mceps, 0))
    target_scaler["std"] = list(np.std(total_mceps, 0))

    #apply normalization
    for k, v in _data_x.items():
        mcep = v[1]
        mcep = (mcep - target_scaler["mean"]) / target_scaler["std"]
        _data_x[k] = (v[0], mcep)

    return _data_x, target_scaler
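The keys that config_mfcc_mceps must provide can be read off the function body; a hypothetical config and call, with illustrative values only:

config_mfcc_mceps = {
    "sampling_frequency": 16000,  # sr passed to librosa.load
    "n_fft": 1024,                # frame length used for framing and MFCC
    "hop_length": 256,            # hop size in samples
    "order_mcep": 24,             # mel-cepstral order
    "order_mfcc": 20,             # number of MFCC coefficients
}
data_x, target_scaler = load_mfcc_mceps("path/to/wavs", config_mfcc_mceps)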
Example #12
import numpy as np
import pysptk as SPTK
from scipy.io import wavfile

fs, x = wavfile.read('test.wav')
assert fs == 16000

x = 1. * x  #change to float64

from cle.cle.utils import segment_axis

frame_length = 1024
hopsize = 80
noverlap = frame_length - hopsize

frames = segment_axis(x, frame_length, noverlap).astype('float64').T
frames = frames * SPTK.blackman(frame_length).reshape((1024, 1))

#frames = frames.T
#frames = frames.copy(order='C')
frames = frames.T

order = 20
alpha = 0.41
stage = 4
gamma = -1.0 / stage

mgc = np.apply_along_axis(SPTK.mgcep, 1, frames, order, alpha, gamma)
mgc_sp = np.apply_along_axis(SPTK.mgc2sp, 1, mgc, alpha, gamma,
                             frame_length).real

mgc_sp_test = np.hstack([mgc_sp, mgc_sp[:, ::-1][:, 1:-1]])
Example #13
def extract_features(audiodata, speaker, sr=16000, feat='mcep'):
    """
    Feature extraction. For each type of feature, see corresponding case below
    Currently support: MCEP, MFCC. Will be updated when needed
    :param audiodata: 162 files time-series data
    :param sr: sampling rate.
    :param feat: type of currently supported feature
    :return:
    """
    print("=======================")
    logging.info("Extracting MCEP from {}'s data ...".format(speaker))

    if feat.lower() == 'mfcc':
        """
            extract mfcc from audio time series data (from librosa.load)
        """
        mfccs = []
        for audio in tqdm(audiodata):
            mfccs.append(
                lbr.util.normalize(lbr.feature.mfcc(y=audio,
                                                    sr=sr,
                                                    n_fft=frame_length,
                                                    hop_length=hop_length),
                                   norm=1,
                                   axis=0))

        # return np.stack(mfccs)
        return mfccs, feat

    elif feat.lower() == 'mcep' or feat.lower() == 'mcc':
        """ 
            MCEP is extracted via pysptk. See the link below for more details
            https://github.com/eYSIP-2017/eYSIP-2017_Speech_Spoofing_and_Verification/wiki/Feature-Extraction-for-Speech-Spoofing
            
            Example of using pysptk to extract mcep (copied from the above link):             
                frameLength = 1024
                overlap = 0.25
                hop_length = frameLength * overlap
                order = 25
                alpha = 0.42
                gamma = -0.35
            
                sourceframes = librosa.util.frame(speech, frame_length=frameLength, hop_length=hop_length).astype(np.float64).T
                sourceframes *= pysptk.blackman(frameLength)
                sourcemcepvectors = np.apply_along_axis(pysptk.mcep, 1, sourceframes, order, alpha)
        """
        # Check if data exists
        temp_filename = os.path.join(feature_path,
                                     "{}_{}.pkl".format(speaker, feat))

        if os.path.isfile(temp_filename):
            logging.info("Found {}. Load data from {}_{}".format(
                temp_filename, speaker, feat))

            with open(temp_filename, "rb") as f:
                return pickle.load(f), feat
        else:
            mceps = []
            logging.info("Calculating ...")

            for audio in tqdm(audiodata):
                frame = lbr.util.frame(audio,
                                       frame_length=frame_length,
                                       hop_length=hop_length).T
                frame *= pysptk.blackman(frame_length)

                mceps.append(
                    np.apply_along_axis(pysptk.mcep,
                                        1,
                                        frame,
                                        order=order,
                                        alpha=alpha).T)

            # Save to .pkl for later load
            with open(temp_filename, "wb") as f:
                pickle.dump(mceps, f, protocol=3)

            return mceps, feat
    else:
        logging.critical('{} feature is not supported yet, exiting ...'.format(feat))
        exit()
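A hypothetical call, assuming audiodata is a list of time series returned by librosa.load and that the module-level names the function uses (frame_length, hop_length, order, alpha, feature_path) are defined elsewhere:

audiodata = [lbr.load(p, sr=16000)[0] for p in ["a.wav", "b.wav"]]  # placeholder paths
mceps, feat = extract_features(audiodata, speaker="spk01", sr=16000, feat="mcep")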
Example #14
import numpy as np
import pysptk as SPTK
from scipy.io import wavfile

fs, x = wavfile.read('test.wav')
assert fs == 16000

x = 1.*x #change to float64

from cle.cle.utils import segment_axis

frame_length = 1024
hopsize = 80
noverlap = frame_length - hopsize

frames = segment_axis(x, frame_length, noverlap).astype('float64').T
frames = frames * SPTK.blackman(frame_length).reshape((1024, 1))

#frames = frames.T
#frames = frames.copy(order='C')
frames = frames.T

order = 20
alpha = 0.41
stage = 4
gamma = -1.0 / stage

mgc = np.apply_along_axis(SPTK.mgcep, 1, frames, order, alpha, gamma)
mgc_sp = np.apply_along_axis(SPTK.mgc2sp, 1, mgc, alpha, gamma, frame_length).real

mgc_sp_test = np.hstack([mgc_sp, mgc_sp[:, ::-1][:, 1:-1]])
mgc_sp_test = mgc_sp_test.copy(order='C')


def framing_windowing(np_data):
    frames = librosa.util.frame(np_data,
                                frame_length=FRAME_LENGTH,
                                hop_length=HOP_LENGTH).astype(np.float64).T
    frames *= ps.blackman(FRAME_LENGTH)
    return frames