Exemple #1
0
def wav2mgcf0(x, order=34, frame_window=512, zerofill_width=1024, shift_window=64, pass_const=0.4, min_pitch=20, max_pitch=500, mgcep_gamma=2):
    """Extract MGC coefficients, a pitch track and band voicing strengths.

    Pitch and voicing come from ``pysptk.rapt`` called with a fixed 16000 Hz
    sample rate; the MGC coefficients are produced by piping the samples
    through the SPTK command-line pipeline ``frame | window | mgcep``.

    Args:
        x: array of audio samples; cast to float32 without rescaling.
        order: ``-m`` argument of ``mgcep``; every frame yields ``order + 1``
            coefficients.
        frame_window: length passed to ``frame -l`` and ``window -l``.
        zerofill_width: length passed to ``window -L`` and ``mgcep -l``.
        shift_window: hop size used for ``rapt`` and for ``frame -p``.
        pass_const: ``-a`` argument of ``mgcep``.
        min_pitch: lower pitch bound handed to every ``rapt`` call.
        max_pitch: upper pitch bound handed to every ``rapt`` call.
        mgcep_gamma: ``-c`` argument of ``mgcep``.

    Returns:
        Tuple ``(mgc, f0, voicing_str)``. All three are cut just before the
        first frame whose summed absolute voicing strength exceeds 10; when no
        such frame exists they are returned in full.

    Raises:
        ValueError: if the ``mgcep`` output cannot be reshaped to
            ``(len(f0), order + 1)``.
    """
    # Only the dtype changes here; sample magnitudes are left untouched.
    x = x.astype(np.float32)

    # Pitch track of the full-band signal.
    f0 = pysptk.rapt(x, 16000, shift_window, otype="pitch", min=min_pitch, max=max_pitch)

    # One voicing-strength column per band filter in the module-level
    # h_filters (five bands are read).
    voicing_str = np.empty((f0.shape[0], 5), dtype=np.float32)

    for i in range(5):
        xf = lfilter(h_filters[i], 1, x)
        # Use the caller's pitch range here as well; the bounds used to be
        # hard-coded to 20/500 (the defaults), silently ignoring min_pitch
        # and max_pitch.
        # NOTE(review): otype="mixed" and the meaning of column 3 are not
        # visible in this file -- confirm against the rapt build in use.
        voicing_str[:, i] = pysptk.rapt(xf.astype(np.float32), 16000, shift_window, otype="mixed", min=min_pitch, max=max_pitch)[:, 3]

    # Compute MGC coefficients with the SPTK command-line tools.
    mgc_cmd = 'frame -l {} -p {} | window -l {} -L {} | mgcep -m {} -a {} -c {} -l {} -e 0.0012'.format(frame_window, shift_window, frame_window, zerofill_width, order, pass_const, mgcep_gamma, zerofill_width)

    # shell=True is required because the command is a shell pipeline; every
    # substituted value is one of this function's numeric parameters.
    p = subprocess.Popen(mgc_cmd, stdout=subprocess.PIPE, stdin=subprocess.PIPE, shell=True)
    stdout, _ = p.communicate(x.tobytes())
    # np.frombuffer replaces the deprecated binary mode of np.fromstring; the
    # copy keeps the returned array writable, as it was before.
    mgc = np.frombuffer(stdout, dtype=np.float32).reshape((len(f0), order + 1)).copy()

    # Index of the first frame whose total absolute voicing strength is > 10.
    strong_frames = np.where(np.abs(voicing_str).sum(axis=1) > 10)[0]
    idx = strong_frames[0] if strong_frames.size else len(f0)

    return mgc[:idx], f0[:idx], voicing_str[:idx]
Exemple #2
0
def test_rapt_regression():
    """Compare pysptk.rapt with a stored reference track for the example audio."""
    # Ground truth data is generated by:
    #
    # $ wav2raw pysptk/example_audio_data/arctic_a0007.wav
    #
    # $ x2x +sf ./pysptk/example_audio_data/arctic_a0007.raw | \
    #    pitch -a 0 -s 16 -p 80 -L 60 -H 240 -o 0 > \
    #    arctic_a007_p16_L60_H240_o0_rapt.pitch
    #
    # $ dmp +f arctic_a007_p16_L60_H240_o0_rapt.pitch | awk '{print $2}' >\
    #    arctic_a007_p16_L60_H240_o0_rapt.txt
    #
    # $ pitch -h
    #  ...
    #
    #  SPTK: version 3.8
    #  CVS Info: $Id: pitch.c,v 1.46 2014/12/11 08:30:43 uratec Exp $

    reference_file = join(dirname(__file__), "data",
                          "arctic_a007_p16_L60_H240_o0_rapt.txt")
    # The reference file holds one value per line.
    with open(reference_file) as handle:
        expected = np.asarray([float(line) for line in handle])
    expected = expected.astype(np.float32)

    fs, x = wavfile.read(pysptk.util.example_audio_file())
    assert fs == 16000

    # SPTK might have a memory corruption bug that makes the result
    # non-deterministic, so the comparison is repeated several times.
    samples = x.astype(np.float32)
    for _ in range(5):
        estimate = pysptk.rapt(samples, fs=fs, hopsize=80,
                               min=60, max=240, voice_bias=0.0, otype=0)
        assert np.allclose(expected, estimate)
def pysptk_featurize(audiofile):
    """Build a feature vector and matching labels for one audio file.

    Statistics (via the module-level ``stats`` helper) are collected for a
    SWIPE F0 track, a RAPT F0 track and an MGC analysis, in that order;
    ``stats_labels`` extends the label list for each of them.

    Args:
        audiofile: path handed to ``wavfile.read``.

    Returns:
        Tuple ``(features, labels)``.
    """
    fs, x = wavfile.read(audiofile)
    features = []
    labels = []

    # Both pitch trackers run with identical analysis settings; only the
    # sample dtype handed to them differs.
    pitch_settings = dict(fs=fs, hopsize=80, min=60, max=200, otype="f0")
    trackers = (
        ('f0_swipe', pysptk.swipe, np.float64),
        ('f0_rapt', pysptk.rapt, np.float32),
    )
    for label_prefix, tracker, sample_dtype in trackers:
        track = tracker(x.astype(sample_dtype), **pitch_settings)
        features = features + stats(track)
        labels = stats_labels(label_prefix, labels)

    # NOTE(review): `xw` is not defined anywhere in this function; unless a
    # module-level `xw` exists this line raises NameError. It presumably
    # should be a windowed frame of `x` -- confirm the intended input.
    mgc = pysptk.mgcep(xw, 20, 0.0, 0.0)
    features = features + stats(mgc)
    labels = stats_labels('mel-spectrum envelope', labels)

    return features, labels
Exemple #4
0
def test_rapt_regression():
    """Regression test: pysptk.rapt must reproduce the stored SPTK output."""
    # Ground truth data is generated by:
    #
    # $ wav2raw pysptk/example_audio_data/arctic_a0007.wav
    #
    # $ x2x +sf ./pysptk/example_audio_data/arctic_a0007.raw | \
    #    pitch -a 0 -s 16 -p 80 -L 60 -H 240 -o 0 > \
    #    arctic_a007_p16_L60_H240_o0_rapt.pitch
    #
    # $ dmp +f arctic_a007_p16_L60_H240_o0_rapt.pitch | awk '{print $2}' >\
    #    arctic_a007_p16_L60_H240_o0_rapt.txt
    #
    # $ pitch -h
    #  ...
    #
    #  SPTK: version 3.8
    #  CVS Info: $Id: pitch.c,v 1.46 2014/12/11 08:30:43 uratec Exp $

    data_dir = join(dirname(__file__), "data")
    ground_truth_path = join(data_dir, "arctic_a007_p16_L60_H240_o0_rapt.txt")
    # Each line of the text file is parsed as one float.
    with open(ground_truth_path) as source:
        values = [float(row) for row in source]
    ground_truth = np.asarray(values).astype(np.float32)

    fs, x = wavfile.read(pysptk.util.example_audio_file())
    assert fs == 16000

    # The check runs several times because SPTK might have a memory
    # corruption bug that makes the result non-deterministic.
    attempts = 5
    for _ in range(attempts):
        track = pysptk.rapt(x.astype(np.float32), fs=fs, hopsize=80,
                            min=60, max=240, voice_bias=0.0, otype=0)
        assert np.allclose(ground_truth, track)
def f0gram(filename,hparams,index):
    """Return the RAPT F0 track of a slice of a resampled wav file.

    Args:
        filename: path handed to ``wavfile.read``.
        hparams: object whose ``sample_rate`` attribute is the target rate
            for resampling and the rate passed to ``rapt``.
        index: pair ``(start, stop)`` of sample positions, applied to the
            signal after resampling.

    Returns:
        The array returned by ``pysptk.rapt`` with ``otype="f0"``.
    """
    fs, x = wavfile.read(filename)
    # Average the channels of a multi-channel file down to one signal.
    if x.ndim > 1:
        x = np.mean(x, axis=1)

    # orig_sr / target_sr / res_type are passed by keyword: recent librosa
    # releases no longer accept them positionally, and older releases accept
    # the same keyword names.
    x = librosa.core.resample(x.astype(np.float32),
                              orig_sr=fs,
                              target_sr=hparams.sample_rate,
                              res_type='kaiser_best')
    x = x[index[0]:index[1]]
    f0_rapt = pysptk.rapt(x, fs=hparams.sample_rate, hopsize=256, min=10, max=7600, otype="f0")
    return f0_rapt
 def wav2f0(self, wav_file, f0_dir):
     """Write the RAPT F0 track of ``wav_file`` into ``f0_dir``.

     The output is raw float32 data in a file named after the part of the
     wav file's base name that precedes its first dot, with a ``.f0``
     suffix. Hop size and pitch range come from ``self.hop_length``,
     ``self.pitch_floor`` and ``self.pitch_ceiling``.

     Returns:
         Path of the file that was written.
     """
     sample_rate, samples = io.wavfile.read(wav_file)

     # Everything after the first dot of the base name is dropped.
     stem = os.path.basename(wav_file).split('.')[0]
     f0_file_path = os.path.join(f0_dir, stem + ".f0")

     contour = pysptk.rapt(samples.astype(np.float32),
                           fs=sample_rate,
                           hopsize=self.hop_length,
                           min=self.pitch_floor,
                           max=self.pitch_ceiling,
                           otype="f0")
     contour.astype(np.float32).tofile(f0_file_path)
     return f0_file_path
def process(filename):
    '''
    Decompose a wav file into log-F0, mel-cepstral coefficients and band
    aperiodicity, and write each of them to disk.

    F0 is estimated with pysptk.rapt; the spectral envelope and aperiodicity
    come from pyworld (cheaptrick / d4c) on a time axis obtained from
    pyworld.harvest. Output goes to the module-level directories lf0_dir,
    mgc_dir and bap_dir as float32 data via write_binfile.

    :param filename: path to wav file
    :return: None; the .lf0, .mgc and .bap files are written as a side effect
    '''
    # The id is the base name up to its first dot.
    file_id = os.path.basename(filename).split(".")[0]
    print('\n' + file_id)

    ### WORLD ANALYSIS -- extract vocoder parameters ###
    fs, x = wavfile.read(filename)
    # Important parameter: the all-pass constant is chosen from the sample rate.
    alpha = pysptk.util.mcepalpha(fs)
    # 5 ms hop, the same period that is passed to harvest below.
    hop = int(0.005 * fs)
    f0 = pysptk.rapt(x.astype(np.float32),
                     fs=fs,
                     hopsize=hop,
                     min=60,
                     max=600,
                     voice_bias=0.0,
                     otype=1).astype(np.float64)

    # NOTE(review): dividing by 2**15 presumably assumes 16-bit PCM input --
    # confirm for other sample formats.
    x = x.astype(np.float64) / (2**15)
    # Only the time axis of harvest is used; its own F0 estimate is discarded.
    _, timeaxis = pyworld.harvest(x,
                                  fs,
                                  frame_period=5,
                                  f0_floor=60.0,
                                  f0_ceil=600)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    # Log-F0 as a column vector: frames with f0 == 0 get the -1e10 marker,
    # every other frame gets log(f0).
    f0 = f0[:, None]
    voiced = f0 != 0
    lf0 = np.full(f0.shape, -1.0E+10, dtype=np.float32)
    lf0[voiced] = np.log(f0[voiced])
    write_binfile(lf0,
                  os.path.join(lf0_dir, file_id + '.lf0'),
                  dtype=np.float32)

    mc = pysptk.sp2mc(spectrogram, mcsize, alpha=alpha).astype(np.float32)
    write_binfile(mc,
                  os.path.join(mgc_dir, file_id + '.mgc'),
                  dtype=np.float32)

    bap = pyworld.code_aperiodicity(aperiodicity, fs).astype(np.float32)
    write_binfile(bap,
                  os.path.join(bap_dir, file_id + '.bap'),
                  dtype=np.float32)
Exemple #8
0
def get_f0(wav, sr, fmin=60, fmax=400, spec_len=None):
    """Return a normalized, frame-reduced F0 track for ``wav``.

    Args:
        wav: audio samples; float32 input is multiplied by 32768.0 before
            pitch tracking, any other dtype is passed to ``rapt`` as is.
        sr: sample rate handed to ``pysptk.rapt``.
        fmin: lower pitch bound for ``rapt``.
        fmax: upper pitch bound for ``rapt``; also the ``xmax`` used by the
            module-level ``normalize`` helper.
        spec_len: optional target number of frames. A shorter track is
            zero-padded at the end; a longer one is truncated.

    Returns:
        The result of ``padding_reduction`` (with ``r=args.r``) applied to
        the normalized, length-adjusted track.
    """
    if wav.dtype == np.float32:
        wav = wav * 32768.0
    f0 = pysptk.rapt(wav,
                     fs=sr,
                     hopsize=args.hop_length,
                     min=fmin,
                     max=fmax,
                     otype='f0')
    f0norm = normalize(f0, xmin=0, xmax=fmax)
    if spec_len is not None and spec_len != f0.shape[0]:
        # assumes normalize() keeps the frame count of f0 -- TODO confirm
        n_pad = spec_len - f0.shape[0]
        if n_pad > 0:
            # pad into spec length
            f0norm = np.pad(f0norm, [0, n_pad], mode='constant')
        else:
            # A negative pad width used to make np.pad raise ValueError;
            # cut the track down to the spectrogram length instead.
            f0norm = f0norm[:spec_len]
    f0norm = padding_reduction(f0norm, r=args.r)
    return f0norm
Exemple #9
0
def process_file(data, fs, window_len, log):
    """Compute a per-window pitch quotient (std / mean of voiced F0).

    ``data`` is cut into consecutive windows of ``window_len`` milliseconds
    (the last one may be shorter). For each window the F0 track is estimated
    with ``pysptk.rapt`` (hop size from the module-level ``HOP_SIZE``) and
    the quotient std / mean of its positive values is stored; windows with no
    positive F0 value get 0.

    Args:
        data: sequence of samples supporting ``len`` and slicing; non-empty
            input must provide ``astype`` on its slices.
        fs: sample rate in Hz.
        window_len: window length in milliseconds.
        log: callable receiving one summary message.

    Returns:
        dict with ``'values'`` (list of floats, one per window),
        ``'frame_length'`` (``window_len``) and ``'avg'`` (mean of the
        values, 0.0 when there are no windows).

    Raises:
        ValueError: if ``window_len`` and ``fs`` give a window of less than
            one sample.
    """
    chunk_size = int(window_len / 1000 * fs)
    if chunk_size <= 0:
        # Previously a zero step surfaced as an opaque range() error.
        raise ValueError(
            f"window_len={window_len} ms at fs={fs} Hz yields an empty window")

    pdqs = []
    for start in range(0, len(data), chunk_size):
        chunk = data[start:start + chunk_size]
        f0 = pysptk.rapt(chunk.astype(np.float32),
                         fs=fs,
                         hopsize=HOP_SIZE,
                         min=60,
                         max=600,
                         otype="f0")

        # Only strictly positive values count as voiced, so their mean is
        # never zero and the division below is safe.
        f0_positive = f0[f0 > 0]
        if f0_positive.size != 0:
            pdq = np.std(f0_positive) / np.average(f0_positive)
        else:
            pdq = 0

        pdqs.append(float(pdq))

    # Averaging an empty list would yield NaN (plus a RuntimeWarning);
    # report 0.0 for empty input, matching the value used for unvoiced windows.
    avg_pdq = np.average(pdqs) if pdqs else 0.0
    log(f"Avg pitch quotient: {avg_pdq}")

    return {
        'values': pdqs,
        'frame_length': window_len,  # ms
        'avg': float(avg_pdq)
    }
Exemple #10
0
    def extract_emphasis(self, index_chunk):
        """Return a ``(pitch, dbs)`` emphasis pair for one audio chunk.

        The chunk is read from ``self.chunks_path`` as ``chunk-NN.wav``
        (single-digit indices are zero-padded) and must be sampled at 16 kHz.

        Args:
            index_chunk: number of the chunk to analyse.

        Returns:
            Tuple ``(pitch, dbs)``: ``pitch`` is the median of the non-zero
            RAPT output values (0 if there are none); ``dbs`` is a loudness
            score derived from the chunk's RMS and weighted by
            ``self.n_chunks - index_chunk`` (0 if it evaluates to NaN).
        """
        # NOTE(review): this local is never read; the paths below are built
        # from self.chunks_path instead -- confirm which one is intended.
        chunks_path = self.video_path + "chunks/"

        prefix = "chunk-0" if index_chunk <= 9 else "chunk-"
        wav_path = self.chunks_path + prefix + str(index_chunk) + ".wav"

        fs, x = wavfile.read(wav_path)
        assert fs == 16000

        # NOTE(review): otype="pitch" is documented by pysptk as the pitch
        # output format rather than F0 in Hz -- confirm this is intended.
        track = pysptk.rapt(x.astype(np.float32),
                            fs=fs,
                            hopsize=20,
                            min=60,
                            otype="pitch")

        # Zero entries are excluded from the median.
        voiced = track[track != 0]
        pitch = float(np.median(voiced))
        if np.isnan(pitch):
            pitch = 0

        # Square in float64: squaring integer PCM samples in their own dtype
        # overflows and corrupts the mean.
        samples = x.astype(np.float64)
        rms = np.sqrt(np.mean(samples ** 2))
        dbs = 2000 * np.log10(rms) / 5 * (self.n_chunks - index_chunk)

        if np.isnan(dbs):
            dbs = 0

        return pitch, dbs
Exemple #11
0
 def __test(x, fs, hopsize, min, max, otype):
     """Run RAPT once and sanity-check the values it returns."""
     track = pysptk.rapt(x, fs, hopsize, min=min, max=max, otype=otype)
     # No NaN or infinity may appear, whatever the output type.
     assert np.isfinite(track).all()
     # For otype 1 every value must additionally be non-negative.
     if otype == 1:
         assert (track >= 0).all()
def feature_extraction(x, fs, feats_df, lp_ord, ID, label):
    """Extract features from signal ``x`` and concatenate them to ``feats_df``.

    Feature references (see their appendices):
      [1] https://link.springer.com/article/10.1007/s10439-013-0741-6
      [2] https://espace.library.uq.edu.au/data/UQ_344963/s41943203_phd_submission.pdf

    Args:
        x: signal to analyse.
        fs: sample rate of ``x``.
        feats_df: DataFrame the new feature rows are concatenated to.
        lp_ord: not read by this function; the LP order used for formant
            estimation comes from ``config.lp_ord``.
        ID: identifier stored in the ``Id`` column of the new rows.
        label: value stored in the ``label`` column of the new rows.

    Returns:
        A new DataFrame holding the rows of ``feats_df`` followed by the
        feature rows computed for ``x``, re-indexed from 0.
    """
    # Features are computed on a frame basis.
    # NOTE(review): open question from the author -- should a window be
    # applied here? At least formant estimation should use one.
    x_frames = spe_feats.sigproc.framesig(
        x, config.frame_len, config.frame_step,
        config.win_func)

    nr_frames = x_frames.shape[0]

    # 0) Wavelets  # TODO

    # 1) MFCC
    # NOTE(review): if the log-energy feature is included, should the first
    # MFCC coefficient (c0) be included as well?
    mfcc_feat = spe_feats.mfcc(x,
                               fs,
                               winlen=config.frame_len_s,
                               winstep=config.frame_step_s,
                               numcep=config.cep_num,
                               winfunc=config.win_func)

    # Apply z-score normalization.
    mfcc_feat = zscore(mfcc_feat, axis=1, ddof=1)

    # First- and second-order deltas of the normalized MFCCs.
    mfcc_delta_feat = spe_feats.delta(mfcc_feat, 1)
    mfcc_deltadelta_feat = spe_feats.delta(mfcc_delta_feat, 1)

    # 2) Zero-crossing rate
    zcr_feat = np.apply_along_axis(get_zcr, 1, x_frames)

    # 3) Formant frequencies, using the LP-coefficient-based method.
    # Some frames appear to be ill-conditioned for LP computation; those are
    # skipped and keep their NaN fill value.
    # The buffer width follows config.nr_formants so that it always matches
    # the formant column names built below (it used to be hard-coded to 4).
    formants_feat = np.full((nr_frames, config.nr_formants), np.nan)

    for i_frame in range(nr_frames):
        try:
            formants_feat[i_frame] = get_formants(x_frames[i_frame],
                                                  config.lp_ord,
                                                  config.nr_formants)
        except Exception:
            # Best effort: leave NaN for this frame. Unlike a bare `except`,
            # this lets KeyboardInterrupt / SystemExit propagate.
            continue

    # 4) Log-energy
    logEnergy_feat = np.apply_along_axis(get_logEnergy, 1, x_frames)

    # 5) Pitch (F0) via RAPT from pysptk (a Python wrapper for SPTK):
    # https://github.com/r9y9/pysptk/blob/master
    # A per-frame get_F0 estimate used to be computed first and was then
    # unconditionally overwritten by this result, so it is no longer computed.
    # TODO: compare against swipe, e.g.
    #   pysptk.swipe(x.astype(np.float64), fs=fs, hopsize=config.frame_step,
    #                min=50, max=500, otype="f0")
    # NOTE(review): open question from the author -- is this the right frame
    # size? (window size was changed from 450 to 40 to 100)
    F0_feat = pysptk.rapt(x.astype(np.float32),
                          fs=fs,
                          hopsize=config.frame_step,
                          min=50,
                          max=500,
                          otype="f0")

    # 6) Kurtosis
    kurt_feat = np.apply_along_axis(kurtosis, 1, x_frames)

    # 7) Bispectrum Score (BGS)
    # TODO: see PhD thesis [2] for more info on this feature

    # 8) Non-Gaussianity Score (NGS)
    # TODO: see PhD thesis [2] for more info on this feature

    # 9) Skewness as a measure of non-gaussianity (not in the paper)
    skew_feat = np.apply_along_axis(skew, 1, x_frames)

    # 10) Shannon entropy
    # NOTE(review): the author reports -inf in all cases; adding a small
    # value to all entries may fix it. Do not rely on this column until fixed.
    entropy_feat = np.apply_along_axis(get_entropy, 1, x_frames)

    mfcc_cols = ['mfcc_%s' % s for s in range(0, config.cep_num)]
    mfcc_delta_cols = ['mfcc_d%s' % s for s in range(0, config.cep_num)]
    mfcc_deltadelta_cols = ['mfcc_dd%s' % s for s in range(0, config.cep_num)]
    formants_cols = ['F%s' % s for s in range(1, config.nr_formants + 1)]

    feats_segment = pd.concat(
        [
            pd.DataFrame({
                'Id': ID,
                'kurt': kurt_feat,
                'logEnergy': logEnergy_feat,
                'zcr': zcr_feat,
                'F0': F0_feat,
                'skewness': skew_feat,
                'label': label,
                'entropy': entropy_feat
            }),
            pd.DataFrame(mfcc_feat, columns=mfcc_cols),
            pd.DataFrame(mfcc_delta_feat, columns=mfcc_delta_cols),
            pd.DataFrame(mfcc_deltadelta_feat, columns=mfcc_deltadelta_cols),
            pd.DataFrame(formants_feat, columns=formants_cols)
        ],
        axis=1)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # equivalent row-wise concatenation.
    feats_df = pd.concat([feats_df, feats_segment],
                         ignore_index=True,
                         sort=False)

    return feats_df
Exemple #13
0
 def __test(x, fs, hopsize, min, max, otype):
     """Check that one RAPT call yields a finite (and, for otype 1, non-negative) track."""
     result = pysptk.rapt(x, fs, hopsize, min=min, max=max, otype=otype)
     all_finite = np.all(np.isfinite(result))
     assert all_finite
     # Negative values are only ruled out when otype is 1.
     if otype == 1:
         non_negative = np.all(result >= 0)
         assert non_negative
Exemple #14
0
# Scratch script: analysis settings followed by a RAPT call and the start of
# an SPTK `frame` pipeline. `raw`, `SPTK` and `f0` are not defined in the
# visible part of the file and must already exist when this runs.
order = 34
frame_window = 512
zerofill_width = 1024
shift_window = 64
pass_const = 0.4
min_pitch = 20
max_pitch = 500
mgcep_gamma = 2
e = 0.0012

# Presumably flags for the SPTK `pitch` command -- confirm: -a 0 -s 16
#-a 0 -s 16

# NOTE(review): the result of this call is not assigned, yet `f0` is read
# right below -- it looks like `f0 = SPTK.rapt(...)` was intended; confirm.
SPTK.rapt(raw.astype(np.float32),
          fs=16000,
          hopsize=shift_window,
          min=min_pitch,
          max=max_pitch,
          otype="pitch")

# Bare expression: only displays something in an interactive session.
f0[1200:1300]

# NOTE(review): `pipes` was removed from the standard library in Python 3.13;
# none of pipes/os/tempfile is used in the visible part of this script.
import pipes, os, subprocess, tempfile
import numpy as np

# Shell command that frames its input with the configured length and shift.
frame_cmd = 'frame -l {} -p {}'.format(frame_window, shift_window)

# float32 copy of the raw samples.
raw2 = raw.astype('float32')

# Alternative synthetic input (a 0..4999 ramp), left disabled:
#raw2 = np.arange(5000).astype('float32')

p = subprocess.Popen(frame_cmd,
Exemple #15
0
# Scratch script: runs wav2mgcf0 on `raw`, repeats its default settings as
# globals, then pipes the samples through the SPTK `frame` command.
# `raw`, `SPTK`, `np` (at the first call) and `f0` are not defined in the
# visible part of the file and must already exist when this runs.
# NOTE(review): two values are unpacked here -- confirm this matches the
# number of values wav2mgcf0 returns.
mgc2, f02 = wav2mgcf0(raw)

order=34
frame_window=512
zerofill_width=1024
shift_window=64
pass_const=0.4
min_pitch=20
max_pitch=500
mgcep_gamma=2
e = 0.0012

# Presumably flags for the SPTK `pitch` command -- confirm: -a 0 -s 16
#-a 0 -s 16


# NOTE(review): the result of this call is not assigned, yet `f0` is read
# right below -- it looks like `f0 = SPTK.rapt(...)` was intended; confirm.
SPTK.rapt(raw.astype(np.float32), fs=16000,
	hopsize=shift_window, min=min_pitch, max=max_pitch, otype="pitch")

# Bare expression: only displays something in an interactive session.
f0[1200:1300]


# NOTE(review): `pipes` was removed from the standard library in Python 3.13;
# none of pipes/os/tempfile is used in the visible part of this script.
import pipes, os, subprocess, tempfile
import numpy as np

# Shell command that frames its input with the configured length and shift.
frame_cmd = 'frame -l {} -p {}'.format(frame_window, shift_window)

# float32 copy of the raw samples; its bytes are fed to the command below.
raw2 = raw.astype('float32')

# Alternative synthetic input (a 0..4999 ramp), left disabled:
#raw2 = np.arange(5000).astype('float32')

# Run the command through the shell, writing the samples to its stdin and
# collecting its stdout; stderr is not piped, so `stderr` is None.
p = subprocess.Popen(frame_cmd, stdout=subprocess.PIPE, stdin=subprocess.PIPE, shell=True)
stdout, stderr = p.communicate(raw2.tobytes())
Exemple #16
0
def get_f0(waveform,
           sample_rate,
           hop_length_seconds=0.01,
           method='swipe',
           f0_min=60,
           f0_max=300):
    """Compute the F0 contour using PYSPTK: https://github.com/r9y9/pysptk/.

    Args:
        waveform (np.array, [T, ]): waveform over which to compute f0.
        sample_rate (int > 0): number of samples per second in waveform.
        hop_length_seconds (float): hop between analysis positions, in
            seconds; converted to a number of samples with
            ``numseconds_to_numsamples`` and passed on as ``hopsize``.
        method (str): one of 'swipe' or 'rapt'; selects the estimator.
            For 'rapt' the waveform is multiplied by 32767 and rounded first.
        f0_min: lower bound handed to the estimator as ``min``.
        f0_max: upper bound handed to the estimator as ``max``.

    Returns:
        dict: Dictionary containing keys:
            "contour" (np.array, [1, t1]): f0 contour of waveform, including
                unvoiced (zero) frames.
            "values" (np.array, [1, t2]): the nonzero entries of the contour,
                i.e. without unvoiced frames.
            "mean" (float): mean of the nonzero f0 values.
            "std" (float): standard deviation of the nonzero f0 values.
    """
    assert method in (
        'swipe',
        'rapt'), "The method argument should be one of 'swipe' or 'rapt'."

    hop_length = numseconds_to_numsamples(hop_length_seconds, sample_rate)

    # Settings shared by both estimators.
    estimator_kwargs = dict(
        fs=sample_rate,
        hopsize=hop_length,
        min=f0_min,
        max=f0_max,
        otype="f0",
    )
    if method == 'swipe':
        track = swipe(waveform.astype(np.float64), **estimator_kwargs)
    elif method == 'rapt':
        # For this estimation, waveform needs to be in the int PCM format.
        pcm = np.round(waveform * 32767).astype(np.float32)
        track = rapt(pcm, **estimator_kwargs)

    # Add a leading axis so the contour has shape [1, t1].
    f0_contour = track[np.newaxis, :]

    # Keep only the voiced (nonzero) frames; the result stays 2-D.
    voiced_mask = f0_contour[0, :] != 0
    f0_values = f0_contour[:, voiced_mask]

    voiced = f0_values[0]
    return {
        "contour": f0_contour,
        "values": f0_values,
        "mean": np.mean(voiced),
        "std": np.std(voiced),
    }