def main(args):
    """Run the PyWORLD analysis/synthesis demo and write results to ``test/``.

    Recreates the ``test`` directory, analyses ``utterance/vaiueo2d.wav`` with
    DIO (raw and StoneMask-refined) and with Harvest, resynthesises each
    variant to a WAV file and saves comparison plots through ``savefig``.

    Args:
        args: object providing ``frame_period`` and ``speed``; both are
            forwarded to ``pw.dio`` and ``frame_period`` to ``pw.synthesize``.
    """
    # Always start from an empty output directory.
    if os.path.isdir('test'):
        rmtree('test')
    os.mkdir('test')

    wave, rate = sf.read('utterance/vaiueo2d.wav')

    # 1. A convenient way: one-call analysis with the default options.
    #    The resynthesised signal is not written anywhere; the call only
    #    demonstrates the API.
    quick_f0, quick_sp, quick_ap = pw.wav2world(wave, rate)
    pw.synthesize(quick_f0, quick_sp, quick_ap, rate, pw.default_frame_period)

    # 2. Step by step
    # 2-1 DIO without F0 refinement
    raw_f0, times = pw.dio(
        wave, rate,
        f0_floor=50.0, f0_ceil=600.0,
        channels_in_octave=2,
        frame_period=args.frame_period,
        speed=args.speed,
    )
    raw_sp = pw.cheaptrick(wave, raw_f0, times, rate)
    raw_ap = pw.d4c(wave, raw_f0, times, rate)
    raw_wave = pw.synthesize(raw_f0, raw_sp, raw_ap, rate, args.frame_period)
    sf.write('test/y_without_f0_refinement.wav', raw_wave, rate)

    # 2-2 DIO with F0 refinement (using StoneMask)
    fine_f0 = pw.stonemask(wave, raw_f0, times, rate)
    fine_sp = pw.cheaptrick(wave, fine_f0, times, rate)
    fine_ap = pw.d4c(wave, fine_f0, times, rate)
    fine_wave = pw.synthesize(
        fine_f0, fine_sp, fine_ap, rate, args.frame_period)
    sf.write('test/y_with_f0_refinement.wav', fine_wave, rate)

    # 2-3 Harvest with F0 refinement (using StoneMask); Harvest runs with its
    #     default frame period, so synthesis uses the default as well.
    harvest_raw_f0, harvest_times = pw.harvest(wave, rate)
    harvest_f0 = pw.stonemask(wave, harvest_raw_f0, harvest_times, rate)
    harvest_sp = pw.cheaptrick(wave, harvest_f0, harvest_times, rate)
    harvest_ap = pw.d4c(wave, harvest_f0, harvest_times, rate)
    harvest_wave = pw.synthesize(
        harvest_f0, harvest_sp, harvest_ap, rate, pw.default_frame_period)
    sf.write('test/y_harvest_with_f0_refinement.wav', harvest_wave, rate)

    # Comparison plots: original vs. raw-DIO vs. refined-DIO results.
    savefig('test/wavform.png', [wave, raw_wave, fine_wave])
    savefig('test/sp.png', [raw_sp, fine_sp])
    savefig('test/ap.png', [raw_ap, fine_ap], log=False)
    savefig('test/f0.png', [raw_f0, fine_f0])

    print('Please check "test" directory for output files')
def world_decompose(wav, fs, frame_period = 5.0):
    """Decompose a speech signal into WORLD parameters.

    The signal is cast to float64, F0 is estimated with Harvest (search range
    71-800 Hz) and the spectral envelope / aperiodicity are then computed with
    CheapTrick and D4C on the same time axis.

    Args:
        wav: waveform array (anything exposing ``astype``).
        fs: sampling rate handed to the WORLD analysers.
        frame_period: frame shift passed to ``pyworld.harvest``.

    Returns:
        Tuple ``(f0, timeaxis, sp, ap)``.
    """
    signal = wav.astype(np.float64)

    f0, timeaxis = pyworld.harvest(
        signal,
        fs,
        frame_period=frame_period,
        f0_floor=71.0,
        f0_ceil=800.0,
    )
    envelope = pyworld.cheaptrick(signal, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(signal, f0, timeaxis, fs)

    return f0, timeaxis, envelope, aperiodicity
Beispiel #3
0
def wav2pw(x, fs=16000, fft_size=FFT_SIZE):
    """Extract WORLD features (F0, spectral envelope, aperiodicity) from ``x``.

    F0 is estimated with DIO (ceiling taken from the module-level
    ``args.f0_ceil``) and refined with StoneMask; CheapTrick and D4C both use
    ``fft_size``.

    Returns:
        dict with the keys ``'f0'``, ``'sp'`` and ``'ap'``.
    """
    raw_f0, positions = pw.dio(x, fs, f0_ceil=args.f0_ceil)  # raw pitch
    refined_f0 = pw.stonemask(x, raw_f0, positions, fs)      # refinement

    # Both spectral analysers share the same positional arguments.
    analysis_args = (x, refined_f0, positions, fs)
    envelope = pw.cheaptrick(*analysis_args, fft_size=fft_size)
    aperiodicity = pw.d4c(*analysis_args, fft_size=fft_size)

    return dict(f0=refined_f0, sp=envelope, ap=aperiodicity)
    def collect_features(self, wav_path, label_path):
        """Build the acoustic feature matrix for one utterance.

        Reads ``wav_path``, runs WORLD analysis (Harvest, or DIO + StoneMask,
        depending on ``hp_acoustic.use_harvest``), derives mel-cepstrum,
        interpolated log-F0, a voiced/unvoiced flag and coded aperiodicity,
        appends delta features and finally drops the silence frames reported
        by the HTS label file at ``label_path``.

        Returns:
            float32 array: ``hstack((mgc, lf0, vuv, bap))`` truncated to the
            label's frame count, with silence frames removed.
        """
        fs, samples = wavfile.read(wav_path)
        samples = samples.astype(np.float64)

        use_harvest = hp_acoustic.use_harvest
        estimate_f0 = pyworld.harvest if use_harvest else pyworld.dio
        f0, timeaxis = estimate_f0(
            samples, fs, frame_period=hp_acoustic.frame_period,
            f0_floor=hp_acoustic.f0_floor, f0_ceil=hp_acoustic.f0_ceil)
        if not use_harvest:
            # Only the DIO estimate is refined with StoneMask.
            f0 = pyworld.stonemask(samples, f0, timeaxis, fs)

        spectrogram = pyworld.cheaptrick(samples, f0, timeaxis, fs)
        aperiodicity = pyworld.d4c(samples, f0, timeaxis, fs)
        bap = pyworld.code_aperiodicity(aperiodicity, fs)

        # The all-pass constant is derived once from the first file's rate
        # and then cached on the instance.
        if self.alpha is None:
            self.alpha = pysptk.util.mcepalpha(fs)
        mgc = pysptk.sp2mc(
            spectrogram, order=hp_acoustic.order, alpha=self.alpha)

        # Log-F0 as a column vector; zero (unvoiced) frames stay at zero.
        f0 = f0[:, None]
        lf0 = f0.copy()
        voiced_mask = f0 != 0
        lf0[voiced_mask] = np.log(f0[voiced_mask])

        if use_harvest:
            # https://github.com/mmorise/World/issues/35#issuecomment-306521887
            vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
        else:
            vuv = (lf0 != 0).astype(np.float32)
        lf0 = P.interp1d(lf0, kind=hp_acoustic.f0_interpolation_kind)

        # Parameter trajectory smoothing
        if hp_acoustic.mod_spec_smoothing:
            hop_length = int(fs * (hp_acoustic.frame_period * 0.001))
            mgc = P.modspec_smoothing(
                mgc, fs / hop_length,
                cutoff=hp_acoustic.mod_spec_smoothing_cutoff)

        # Append dynamic features to every stream except the V/UV flag.
        mgc, lf0, bap = (
            P.delta_features(stream, hp_acoustic.windows)
            for stream in (mgc, lf0, bap)
        )
        features = np.hstack((mgc, lf0, vuv, bap))

        # Cut silence frames by HTS alignment
        labels = hts.load(label_path)
        features = features[:labels.num_frames()]
        silence = labels.silence_frame_indices()
        features = np.delete(features, silence, axis=0)

        return features.astype(np.float32)
    def __call__(self, data: Wave, test=None):
        """Convert a ``Wave`` into a validated ``AcousticFeature``.

        F0 is estimated with ``pyworld.dio`` when the configured method is
        ``'dio'`` and with ``world4py``'s Harvest otherwise, then refined with
        StoneMask. Spectrogram, aperiodicity and mel-cepstrum are computed on
        the refined F0. ``test`` is accepted but not used here.
        """
        samples = data.wave.astype(numpy.float64)
        rate = data.sampling_rate

        if self._f0_estimating_method == 'dio':
            estimate_f0 = pyworld.dio
        else:
            # Imported lazily: only needed for the Harvest path.
            from world4py.np import apis
            estimate_f0 = apis.harvest

        raw_f0, positions = estimate_f0(
            samples,
            rate,
            frame_period=self._frame_period,
            f0_floor=self._f0_floor,
            f0_ceil=self._f0_ceil,
        )
        f0 = pyworld.stonemask(samples, raw_f0, positions, rate)
        spectrogram = pyworld.cheaptrick(samples, f0, positions, rate)
        aperiodicity = pyworld.d4c(samples, f0, positions, rate)

        mfcc = pysptk.sp2mc(spectrogram, order=self._order, alpha=self._alpha)
        voiced = f0 != 0  # type: numpy.ndarray

        dtype = self._dtype
        feature = AcousticFeature(
            f0=f0[:, None].astype(dtype),
            spectrogram=spectrogram.astype(dtype),
            aperiodicity=aperiodicity.astype(dtype),
            mfcc=mfcc.astype(dtype),
            voiced=voiced[:, None],
        )
        feature.validate()
        return feature
Beispiel #6
0
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    """Convert waveform ``x`` with ``model`` and resynthesise the result.

    Mel-cepstrum (without the 0th coefficient) is extracted with WORLD/SPTK,
    smoothed, extended with delta features, normalised with
    ``data_mean``/``data_std`` and fed to the model; static features are
    recovered with MLPG and denormalised.

    Args:
        model: network; put into eval mode here.
        x: source waveform.
        fs: sampling rate of ``x``.
        data_mean, data_std: normalisation statistics.
        diffvc: if true, filter ``x`` with the differential mel-cepstrum
            (MLSA); otherwise synthesise with the WORLD vocoder.

    Returns:
        Tuple ``(waveform, inputs, outputs)``: the synthesised signal, the
        static input mel-cepstrum and the predicted static mel-cepstrum.
    """
    model.eval()

    hop = int(fs * (hp.frame_period * 0.001))
    x = x.astype(np.float64)

    # WORLD analysis
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    # Mel-cepstrum; the power term is kept aside and re-attached later.
    alpha = pysptk.util.mcepalpha(fs)
    mcep = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    power = mcep[:, 0]
    feats = mcep[:, 1:]
    n_static = feats.shape[-1]
    feats = P.modspec_smoothing(feats, fs / hop, cutoff=50)
    feats = P.delta_features(feats, hp.windows).astype(np.float32)

    n_frames = feats.shape[0]
    inputs = feats[:, :n_static].copy()

    # Normalization, then wrap as a single-item batch.
    batch = Variable(torch.from_numpy(P.scale(feats, data_mean, data_std)))
    lengths = [len(batch)]
    batch = batch.view(1, -1, batch.size(-1))

    # For MLPG
    R = torch.from_numpy(unit_variance_mlpg_matrix(hp.windows, n_frames))

    # Apply model
    if model.include_parameter_generation():
        # Case: models include parameter generation in itself
        # Multistream features cannot be used in this case
        y_hat, y_hat_static = model(batch, R, lengths=lengths)
    else:
        # Case: generic models (can be sequence model)
        assert hp.has_dynamic_features is not None
        y_hat = model(batch, lengths=lengths)
        y_hat_static = multi_stream_mlpg(
            y_hat, R, hp.stream_sizes, hp.has_dynamic_features)

    predicted = y_hat_static.data.cpu().numpy().reshape(-1, n_static)

    # Denormalize
    predicted = P.inv_scale(
        predicted, data_mean[:n_static], data_std[:n_static])
    outputs = predicted.copy()

    if diffvc:
        # Differential filter: predicted minus source, power removed.
        differential = predicted - feats[:, :n_static]
        mc = np.hstack((power[:, None], differential))
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        mc = np.hstack((power[:, None], predicted))
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs
Beispiel #7
0
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    """Run voice conversion on ``x`` and return the converted waveform.

    Pipeline: WORLD analysis, mel-cepstrum (0th coefficient set aside),
    modulation-spectrum smoothing, delta features, normalisation, model
    forward pass, MLPG, denormalisation, synthesis.

    Args:
        model: conversion network (switched to eval mode).
        x: source waveform.
        fs: sampling rate of ``x``.
        data_mean, data_std: statistics used to scale and unscale features.
        diffvc: choose MLSA differential filtering of ``x`` (true) or WORLD
            vocoder synthesis (false).

    Returns:
        ``(waveform, inputs, outputs)`` where ``inputs`` / ``outputs`` are
        the static mel-cepstra before and after conversion.
    """
    model.eval()

    frame_shift = int(fs * (hp.frame_period * 0.001))
    x = x.astype(np.float64)

    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)

    full_mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0 = full_mc[:, 0]
    src_mc = full_mc[:, 1:]
    static_dim = src_mc.shape[-1]
    src_mc = P.modspec_smoothing(src_mc, fs / frame_shift, cutoff=50)
    src_mc = P.delta_features(src_mc, hp.windows).astype(np.float32)

    inputs = src_mc[:, :static_dim].copy()

    # Normalization
    scaled = P.scale(src_mc, data_mean, data_std)
    scaled = Variable(torch.from_numpy(scaled))
    lengths = [len(scaled)]
    # Add batch axis
    scaled = scaled.view(1, -1, scaled.size(-1))

    # For MLPG
    R = unit_variance_mlpg_matrix(hp.windows, src_mc.shape[0])
    R = torch.from_numpy(R)

    # Apply model
    if not model.include_parameter_generation():
        # Case: generic models (can be sequence model)
        assert hp.has_dynamic_features is not None
        y_hat = model(scaled, lengths=lengths)
        y_hat_static = multi_stream_mlpg(y_hat, R, hp.stream_sizes,
                                         hp.has_dynamic_features)
    else:
        # Case: models include parameter generation in itself
        # Multistream features cannot be used in this case
        y_hat, y_hat_static = model(scaled, R, lengths=lengths)

    converted = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    converted = P.inv_scale(converted, data_mean[:static_dim],
                            data_std[:static_dim])
    outputs = converted.copy()

    if not diffvc:
        # Vocoder path: rebuild the envelope from c0 + converted cepstrum.
        mc = np.hstack((c0[:, None], converted))
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(mc.astype(np.float64),
                                   alpha=alpha,
                                   fftlen=fftlen)
        waveform = pyworld.synthesize(f0, spectrogram, aperiodicity, fs,
                                      hp.frame_period)
        return waveform, inputs, outputs

    # Differential path: filter the source with (converted - source).
    mc = np.hstack((c0[:, None], converted - src_mc[:, :static_dim]))
    mc[:, 0] = 0  # remove power coefficients
    engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                         hopsize=frame_shift)
    b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
    waveform = engine.synthesis(x, b)

    return waveform, inputs, outputs
Beispiel #8
0
from python_speech_features import mfcc
from sklearn.model_selection import train_test_split
from keras.utils import np_utils



##############################################################
# Test one audio file
# Single-file smoke test: WORLD analysis, MFCC extraction and resynthesis.
path='/Users/adaezeadigwe/Desktop/Research/project_ml/Data/anger/anger_0001.wav'
Data_Directory = '/Users/adaezeadigwe/Desktop/Research/project_ml/Data/'

x, fs = sf.read(path)
_f0, t = pw.dio(x, fs)    # raw pitch extractor
f0 = pw.stonemask(x, _f0, t, fs)  # pitch refinement
sp = pw.cheaptrick(x, f0, t, fs)  # extract smoothed spectrogram
ap = pw.d4c(x, f0, t, fs)         # extract aperiodicity
# Use the sampling rate actually read from the file (it was hard-coded to
# 16000) and pass the signal by keyword: recent librosa releases no longer
# accept it positionally.
# NOTE(review): this rebinds the name ``mfcc`` imported from
# python_speech_features above; kept to preserve the module-level names.
mfcc = librosa.feature.mfcc(y=x, sr=fs)
y = pw.synthesize(f0, sp, ap, fs)
#wav.write('neutral_syn.wav',fs, y)
# The trailing comments record shapes seen in an earlier run.
print(f0.shape)	  #1071,
print(sp.shape)	  #1071 , 513	
print(ap.shape)	  #1071, 513
print(mfcc.shape) #20,168

##############################################################

#A. LOOP to extract vector by sample of source fundamental frequency

def get_labels(path=Data_Directory):
	"""List the entries of ``path`` and build a matching range of indices.

	NOTE(review): the visible body stops after ``label_indices`` and has no
	``return``; the rest of the function appears to be cut off here.
	"""
	# One label per directory entry (order is whatever os.listdir yields).
	labels = os.listdir(path)
	# Integer index 0..len(labels)-1 for each label.
	label_indices = np.arange(0, len(labels))
# Making directories for speech features: data/<speaker>/<feature>
for s in spklist:
    for f in featlist:
        if not os.path.exists("data/{}/{}".format(s, f)):
            os.mkdir("data/{}/{}".format(s, f))

# WORLD analysis of every file found in data/<speaker>/wav
for s in spklist:
    wavlist = os.listdir("data/{}/wav".format(s))
    for wf in wavlist:
        # WORLD analysis for each file
        print("speaker: {} file: {}".format(s, wf))
        fs, data = wavfile.read("data/{}/wav/{}".format(s, wf))
        # ``np.float`` was only an alias of the builtin float and has been
        # removed from NumPy; float64 is the same dtype and is what the
        # pyworld analysers are given throughout this file.
        data = data.astype(np.float64)

        f0, t = pw.harvest(data, fs)
        sp = pw.cheaptrick(data, f0, t, fs)
        ap = pw.d4c(data, f0, t, fs)

        # Mel-generalised cepstrum settings (order and all-pass constant).
        alpha = 0.42
        dim = 24
        mgc = sptk.sp2mc(sp, dim, alpha)

        bn, _ = os.path.splitext(wf)

        # Raw binary dumps (ndarray.tofile), one file per feature stream.
        with open("data/{}/mgc/{}.mgc".format(s, bn), "wb") as f:
            mgc.tofile(f)
        with open("data/{}/f0/{}.f0".format(s, bn), "wb") as f:
            f0.tofile(f)
        with open("data/{}/ap/{}.ap".format(s, bn), "wb") as f:
            ap.tofile(f)
Beispiel #10
0
    def analysisf(self,
                  fwav,
                  ff0,
                  f0_min,
                  f0_max,
                  fspec,
                  faper,
                  fvuv,
                  preproc_hp=None):
        """Extract WORLD features from ``fwav`` and dump them as float32 files.

        Writes, in this order: interpolated log-F0 to ``ff0``, the
        voiced/unvoiced flag to ``fvuv``, the compressed spectrum to ``fspec``
        and the band aperiodicity in dB to ``faper``. Parent directories are
        created through ``makedirs`` before each write.

        Args:
            fwav: path of the input waveform.
            ff0, fspec, faper, fvuv: output file paths.
            f0_min, f0_max: F0 search range handed to DIO.
            preproc_hp: high-pass setting forwarded to ``self.preprocwav``;
                the string ``'auto'`` selects ``f0_min``.
        """
        print('Extracting WORLD features from: ' + fwav)

        wav, fs, _ = sp.wavread(fwav)

        if preproc_hp == 'auto':
            preproc_hp = f0_min
        # NOTE(review): the return value is ignored, so this presumably
        # modifies ``wav`` in place; confirm against preprocwav.
        self.preprocwav(wav, fs, highpass=preproc_hp)

        import pyworld as pw

        def _dump(values, path):
            # Make sure the target directory exists, then write raw float32.
            makedirs(os.path.dirname(path))
            values.astype('float32').tofile(path)

        if 0:
            # Disabled debugging aid: direct copy re-synthesis without
            # compression/encoding, followed by a debugger breakpoint.
            print(pw.__file__)
            dbg_f0, dbg_ts = pw.dio(wav, fs,
                                    f0_floor=f0_min, f0_ceil=f0_max,
                                    channels_in_octave=2,
                                    frame_period=self.shift * 1000.0)
            dbg_f0 = pw.stonemask(wav, dbg_f0, dbg_ts, fs)
            dbg_spec = pw.cheaptrick(wav, dbg_f0, dbg_ts, fs,
                                     fft_size=self.dftlen)
            dbg_aper = pw.d4c(wav, dbg_f0, dbg_ts, fs, fft_size=self.dftlen)
            resyn = pw.synthesize(dbg_f0.astype('float64'),
                                  dbg_spec.astype('float64'),
                                  dbg_aper.astype('float64'),
                                  fs, self.shift * 1000.0)
            sp.wavwrite('resynth.wav', resyn, fs,
                        norm_abs=True, force_norm_abs=True, verbose=1)
            from IPython.core.debugger import Pdb
            Pdb().set_trace()

        # WORLD analysis: DIO + StoneMask, CheapTrick, D4C.
        raw_f0, ts = pw.dio(wav, fs,
                            f0_floor=f0_min, f0_ceil=f0_max,
                            channels_in_octave=2,
                            frame_period=self.shift * 1000.0)
        f0 = pw.stonemask(wav, raw_f0, ts, fs)
        spectrum = pw.cheaptrick(wav, f0, ts, fs, fft_size=self.dftlen)
        aperiodicity = pw.d4c(wav, f0, ts, fs, fft_size=self.dftlen)

        # Remember the unvoiced frames before F0 is interpolated over them.
        is_unvoiced = f0 < 20
        has_pitch = f0 > 0
        f0 = np.log(np.interp(ts, ts[has_pitch], f0[has_pitch]))
        _dump(f0, ff0)

        vuv = np.ones(len(f0))
        vuv[is_unvoiced] = 0
        _dump(vuv, fvuv)

        spectrum = self.compress_spectrum(spectrum, fs, self.spec_size)
        _dump(spectrum, fspec)

        aperiodicity = sp.mag2db(
            sp.linbnd2fwbnd(aperiodicity, fs, self.dftlen, self.aper_size))
        _dump(aperiodicity, faper)

        if 0:
            # Disabled debugging aid. ``CMP`` is not defined in this method,
            # so this branch cannot be enabled as-is.
            import matplotlib.pyplot as plt
            plt.ion()
            resyn = self.synthesis(fs, CMP)
            sp.wavwrite('resynth.wav', resyn, fs,
                        norm_abs=True, force_norm_abs=True, verbose=1)
            from IPython.core.debugger import Pdb
            Pdb().set_trace()
Beispiel #11
0
def get_con2(x, words):
    """Extract per-word notes, a compact spectral feature and quantised F0.

    WORLD analysis (DIO + StoneMask, CheapTrick, D4C) is run on ``x`` using
    the module-level sampling rate ``fs``. F0 is converted to note numbers
    with ``herz2note``; every entry of ``words`` is summarised by
    ``cal_note`` over its frame range.

    Args:
        x: waveform handed to the pyworld analysers.
        words: sequence whose items carry a start frame at index 0 and an
            end frame (exclusive) at index 1.

    Returns:
        Tuple ``(con2, mel, f0_note)``:
        ``con2`` is the array of per-word notes; ``mel`` is the frame-wise
        concatenation of 32 resampled aperiodicity values (scaled by
        ``ap * 20 - 18``) and 128 resampled log-spectrum values; ``f0_note``
        is ``round((note - 40) * 5)`` as int32, with values outside
        ``(0, 200)`` set to 0.
    """
    _f0, t = pw.dio(x, fs, f0_floor=120.0, f0_ceil=750.0, frame_period=8.0)
    f0_herz = pw.stonemask(x, _f0, t, fs)
    sp = pw.cheaptrick(x, f0_herz, t, fs)
    ap = pw.d4c(x, f0_herz, t, fs)

    # Unvoiced frames (F0 == 0) map to note 0.0. Kept as a list because
    # cal_note receives list slices.
    f0_note = [0.0 if hz == 0 else herz2note(hz) for hz in f0_herz]
    con2 = [cal_note(f0_note[word[0]:word[1]]) for word in words]

    # Quantise to 1/5-note steps relative to note 40 and zero everything
    # outside the open interval (0, 200).
    f0_note = np.round((np.array(f0_note) - 40.0) * 5)
    f0_note[~((f0_note > 0.0) & (f0_note < 200))] = 0

    # Resample each frame along the frequency axis. The bin count comes from
    # the analysis result rather than a fixed 1025, so sampling rates whose
    # CheapTrick FFT size is not 2048 no longer make np.interp fail.
    n_bins = sp.shape[1]
    bin_index = np.arange(n_bins)

    def _resample(frames, n_points):
        grid = np.linspace(0, n_bins, n_points)
        return np.stack([np.interp(grid, bin_index, row) for row in frames])

    _ap = _resample(ap * 20 - 18, 32)
    _sp = _resample(np.log(sp), 128)

    mel = np.concatenate([_ap, _sp], axis=1)

    return np.array(con2), mel, f0_note.astype(np.int32)
    # F0 estimation: Harvest, or DIO followed by StoneMask refinement.
    # The chosen method name is appended to out_filename.
    if opts.harvest:
        print("Begin harvest ...")
        f0_x, tp_x = pw.harvest(x, RATE, f0_floor, f0_ceil, frame_period)
        out_filename += 'harvest'
    else:
        print("Begin stonemask ...")
        f0_x, tp_x = pw.dio(x, RATE, f0_floor, f0_ceil, channels_in_octave,
                            frame_period, speed, allowed_range)
        f0_x = pw.stonemask(x, f0_x, tp_x, RATE)
        out_filename += 'dio'

    # Spectral envelope and aperiodicity on the same time axis.
    print("Begin cheaptrick ...")
    sp_x = pw.cheaptrick(x, f0_x, tp_x, RATE, q1, f0_floor, fft_size)
    print("Begin d4c ...")
    ap_x = pw.d4c(x, f0_x, tp_x, RATE, threshold, fft_size)

    # NOTE(review): trim_zeros_frames is defined elsewhere; presumably it
    # drops leading/trailing silent frames and reports how many — confirm.
    lz, tz, f0_x, sp_x, ap_x = trim_zeros_frames(f0_x, sp_x, ap_x, 0.7)

    # Unvoiced flag: 1 where F0 is exactly zero.
    uv = (f0_x == 0).astype(int)

    print("Begin f0 transform ...")
    lf0_x = toquefrency(f0_x)
    print("Begin sp transform ...")
    mgc_x = pysptk.conversion.sp2mc(sp_x, order=mcsize, alpha=alpha)
    print("Begin ap transform ...")
    # Aperiodicity is encoded with the same sp2mc transform as the envelope.
    bap_x = pysptk.conversion.sp2mc(ap_x, order=mcsize, alpha=alpha)

    # Locations of the saved statistics and model parameters.
    statsdir = 'model_saves/ST_STATS_mlpg.npy'
    savedir = 'model_saves/theta_best_mlpg.dat'
Beispiel #13
0
def world_feature_extract(wav_list, spk_list, feat_param_list, args):
    """EXTRACT WORLD FEATURE VECTOR

    For every file in ``wav_list`` the speaker is taken from the name of its
    parent directory, WORLD features are computed with that speaker's
    parameters and two float32 binary files are written below
    ``args.bindir``: ``noVAD/<spk>/<name>`` with all frames and
    ``VAD/<spk>/<name>`` trimmed to the span between the first and last
    frame whose F0 exceeds 10.

    Each row is ``[log10 sp, mcc[1:], ap, f0, en_sp, en_mcc, speaker index]``.

    Args:
        wav_list: paths of the wav files to process.
        spk_list: speaker names; the position is used as the label.
        feat_param_list: per-speaker parameter dicts, parallel to
            ``spk_list`` (keys read here: ``highpass_cutoff``, ``fs``,
            ``f0min``, ``f0max``, ``shift_ms``, ``fftl``, ``mcep_dim``,
            ``mcep_alpha``).
        args: needs ``bindir`` and ``overwrite``.

    Exits the process with status 1 when a file's sampling rate differs from
    the speaker's configured ``fs``.
    """
    for i, wav_name in enumerate(wav_list):
        # NOTE(review): str.replace rewrites every "wav" substring of the
        # file name, not only the extension; left as-is so existing output
        # names do not change.
        bin_basename = os.path.basename(wav_name).replace('wav', 'bin')
        spk = os.path.dirname(wav_name).split('/')[-1]
        bin_name = os.path.join(args.bindir, 'noVAD', spk, bin_basename)
        vad_bin_name = os.path.join(args.bindir, 'VAD', spk, bin_basename)

        if os.path.exists(bin_name):
            if args.overwrite:
                logging.info("overwrite %s (%d/%d)" % (wav_name, i + 1, len(wav_list)))
            else:
                logging.info("skip %s (%d/%d)" % (wav_name, i + 1, len(wav_list)))
                continue
        else:
            logging.info("now processing %s (%d/%d)" % (wav_name, i + 1, len(wav_list)))

        feat_param = feat_param_list[spk_list.index(spk)]

        # load wavfile and apply low cut filter
        fs, x = wavfile.read(wav_name)
        x = np.array(x, dtype=np.float64)
        x = low_cut_filter(x, fs, cutoff=feat_param['highpass_cutoff'])

        # check sampling frequency
        if not fs == feat_param['fs']:
            logging.error("sampling frequency is not matched: %s" % wav_name)
            sys.exit(1)

        # extract features
        f0, time_axis = pw.harvest(x, feat_param['fs'],
                                   f0_floor=feat_param['f0min'],
                                   f0_ceil=feat_param['f0max'],
                                   frame_period=feat_param['shift_ms'])
        sp = pw.cheaptrick(x, f0, time_axis, feat_param['fs'],
                           fft_size=feat_param['fftl'])
        ap = pw.d4c(x, f0, time_axis, feat_param['fs'], fft_size=feat_param['fftl'])
        # Mel-cepstrum is computed before the envelope is energy-normalised.
        mcc = pysptk.sp2mc(sp, feat_param['mcep_dim'], feat_param['mcep_alpha'])
        en_sp, sp = energy_norm(sp)
        sp = np.log10(sp)
        en_mcc = mcc[:, 0]

        # expand dimensions for concatenation
        f0 = np.expand_dims(f0, axis=-1)
        en_mcc = np.expand_dims(en_mcc, axis=-1)

        # concatenation
        world_feats = np.concatenate([sp, mcc[:, 1:], ap, f0, en_sp, en_mcc], axis=1)
        labels = spk_list.index(spk) * np.ones(
                [sp.shape[0], 1], np.float32)

        # concatenate all features
        feats = np.concatenate(
                    [world_feats, labels],
                    axis=1).astype(np.float32)

        # VAD: keep the span from the first to the last voiced frame
        vad_idx = np.where(f0.copy().reshape([-1])>10)[0]
        if len(vad_idx) < 1:
            logging.info("invalid wave file: %s" % wav_name)
            continue
        vad_feats = feats[vad_idx[0] : vad_idx[-1]+1]

        # write to bin; tobytes() replaces the deprecated tostring() alias,
        # which no longer exists in NumPy 2.0 (the bytes are identical).
        with open(bin_name, 'wb') as fp:
            fp.write(feats.tobytes())
        with open(vad_bin_name, 'wb') as fp:
            fp.write(vad_feats.tobytes())
Beispiel #14
0
def estimate(letter, name):
    """Analyse the begin/mid/end recordings of ``letter`` for speaker ``name``.

    For each of ``zvucni_glasovi_wav/novi_{letter}_{beg|mid|end}_{name}.wav``
    that exists, the file is analysed with Harvest + StoneMask, CheapTrick
    and D4C, resynthesised into ``zvucni_glasovi_after_sint/``, and plots of
    F0, waveform, spectral envelope and aperiodicity are saved. When all
    three recordings exist, a bar chart of the maximum Harvest F0 per
    position is saved as well.

    NOTE(review): the function continues beyond the part documented here.
    """
    # Presence flags (fb/fm/fe) and maximum F0 for each word position.
    fb = 0
    fm = 0
    fe = 0
    max_beg = 0
    max_mid = 0
    max_end = 0

    # --- sound at the beginning of the word ---
    if os.path.exists(f'zvucni_glasovi_wav/novi_{letter}_beg_{name}.wav'):
        beg, fs = sf.read(f'zvucni_glasovi_wav/novi_{letter}_beg_{name}.wav')
        # The bare string below is a disabled DIO estimate, kept as-is.
        """
        f0_dio, timeaxis_dio = pw.dio(beg, fs, f0_floor=70.0, f0_ceil=800.0, channels_in_octave=3.0,
                                      frame_period=args.frame_period,
                                      speed=args.speed)
        """
        f0, timeaxis = pw.harvest(beg, fs)
        f0_mask = pw.stonemask(beg, f0, timeaxis, fs)
        sp = pw.cheaptrick(beg, f0_mask, timeaxis, fs)
        ap = pw.d4c(beg, f0_mask, timeaxis, fs)
        y = pw.synthesize(f0_mask, sp, ap, fs, pw.default_frame_period)

        sf.write(
            f'zvucni_glasovi_after_sint/{letter}_beg_{name}_after_sint.wav', y,
            fs)

        # Raw Harvest F0 (red) against the StoneMask-refined F0 (green).
        plt.figure()
        plt.title(f'Glas {letter} na početku riječi')
        plt.plot(timeaxis,
                 f0,
                 'r',
                 label='Procjenjena f0 pomoću harvest() funkcije')
        # Disabled: overlay of the DIO estimate from the string block above.
        plt.plot(timeaxis,
                 f0_mask,
                 'g--',
                 label='Pročišćena f0 pomoću stonemaska')

        plt.ylabel('frekvencija (Hz)')
        plt.xlabel('vrijeme (s)')
        plt.legend()
        plt.savefig(f'slike_f0_usporedbe/f0_{letter}_{name}_beg.png')
        # savefig (no plt. prefix) is the project's own plotting helper.
        savefig(
            f'slike_before_after_sint_usporedbe/before_after_synt_{letter}_{name}_beg.png',
            [beg, y], letter + '_beg')
        savefig(f'slike_sp_usporedbe/sp_{letter}_{name}_beg.png', [sp], letter)
        savefig(f'slike_ap_usporedbe/ap_{letter}_{name}_beg.png', [ap],
                letter,
                log=False)
        plt.close()
        # The maximum is taken over the raw Harvest estimate, not f0_mask.
        max_beg = np.max(f0)
        fb = 1

    # --- sound in the middle of the word ---
    if os.path.exists(f'zvucni_glasovi_wav/novi_{letter}_mid_{name}.wav'):
        mid, fs = sf.read(f'zvucni_glasovi_wav/novi_{letter}_mid_{name}.wav')
        # The bare string below is a disabled DIO estimate, kept as-is.
        """
        f0_dio, timeaxis_dio = pw.dio(mid, fs, f0_floor=70.0, f0_ceil=800.0, channels_in_octave=2.0,
                                      frame_period=args.frame_period,
                                      speed=args.speed)
        """
        f0, timeaxis = pw.harvest(mid, fs)
        f0_mask = pw.stonemask(mid, f0, timeaxis, fs)
        sp = pw.cheaptrick(mid, f0_mask, timeaxis, fs)
        ap = pw.d4c(mid, f0_mask, timeaxis, fs)
        y = pw.synthesize(f0_mask, sp, ap, fs, pw.default_frame_period)

        sf.write(
            f'zvucni_glasovi_after_sint/{letter}_mid_{name}_after_sint.wav', y,
            fs)

        # Raw Harvest F0 (red) against the StoneMask-refined F0 (green).
        plt.figure()
        plt.title(f'Glas {letter} u sredini riječi')
        plt.plot(timeaxis,
                 f0,
                 'r',
                 label='Procjenjena f0 pomoću harvest() funkcije')
        # Disabled: overlay of the DIO estimate from the string block above.
        plt.plot(timeaxis,
                 f0_mask,
                 'g--',
                 label='Pročišćena f0 pomoću stonemaska')

        plt.ylabel('frekvencija (Hz)')
        plt.xlabel('vrijeme (s)')
        plt.legend()
        plt.savefig(f'slike_f0_usporedbe/f0_{letter}_{name}_mid.png')
        savefig(
            f'slike_before_after_sint_usporedbe/before_after_synt_{letter}_{name}_mid.png',
            [mid, y], letter + '_mid')
        savefig(f'slike_sp_usporedbe/sp_{letter}_{name}_mid.png', [sp], letter)
        savefig(f'slike_ap_usporedbe/ap_{letter}_{name}_mid.png', [ap],
                letter,
                log=False)
        plt.close()
        max_mid = np.max(f0)
        fm = 1

    # --- sound at the end of the word ---
    if os.path.exists(f'zvucni_glasovi_wav/novi_{letter}_end_{name}.wav'):
        end, fs = sf.read(f'zvucni_glasovi_wav/novi_{letter}_end_{name}.wav')
        # The bare string below is a disabled DIO estimate, kept as-is.
        """
        f0_dio, timeaxis_dio = pw.dio(end, fs, f0_floor=70.0, f0_ceil=800.0, channels_in_octave=2.0,
                                      frame_period=args.frame_period,
                                      speed=args.speed)
        """
        f0, timeaxis = pw.harvest(end, fs)
        f0_mask = pw.stonemask(end, f0, timeaxis, fs)
        sp = pw.cheaptrick(end, f0_mask, timeaxis, fs)
        ap = pw.d4c(end, f0_mask, timeaxis, fs)
        y = pw.synthesize(f0_mask, sp, ap, fs, pw.default_frame_period)

        sf.write(
            f'zvucni_glasovi_after_sint/{letter}_end_{name}_after_sint.wav', y,
            fs)

        # Raw Harvest F0 (red) against the StoneMask-refined F0 (green).
        plt.figure()
        plt.title(f'Glas {letter} na kraju riječi')
        plt.plot(timeaxis,
                 f0,
                 'r',
                 label='Procjenjena f0 pomoću harvest() funkcije')
        # Disabled: overlay of the DIO estimate from the string block above.
        plt.plot(timeaxis,
                 f0_mask,
                 'g--',
                 label='Pročišćena f0 pomoću stonemaska')

        plt.ylabel('frekvencija (Hz)')
        plt.xlabel('vrijeme (s)')
        plt.legend()
        plt.savefig(f'slike_f0_usporedbe/f0_{letter}_{name}_end.png')
        savefig(
            f'slike_before_after_sint_usporedbe/before_after_synt_{letter}_{name}_end.png',
            [end, y], letter + '_end')
        savefig(f'slike_sp_usporedbe/sp_{letter}_{name}_end.png', [sp], letter)
        savefig(f'slike_ap_usporedbe/ap_{letter}_{name}_end.png', [ap],
                letter,
                log=False)
        plt.close()
        max_end = np.max(f0)
        fe = 1

    # Summary bar chart, only when all three positions were available.
    # NOTE(review): no plt.figure() is called before plt.bar here.
    if fb and fm and fe:
        x = [5, 10, 15]
        max_f0 = [max_beg, max_mid, max_end]
        plt.bar(x, height=max_f0)
        # Dashed horizontal line at the mean of the three maxima.
        plt.axhline(np.average(max_f0),
                    color='lightblue',
                    linestyle='--',
                    label='prosjek')
        plt.xticks(x, [f'{letter}_beg', f'{letter}_mid', f'{letter}_end'])
        plt.xlabel('pozicija')
        plt.ylabel('frekvencija (Hz)')
        plt.legend()
        plt.savefig(f'slike_f0_usporedbe/f0_{letter}_{name}_hist.png')
        plt.close()
    """
Beispiel #15
0
            # MFCC -> converter -> transformer gives the converted
            # mel-cepstrum sequence for this file.
            mfcc = dp.load_mfcc(filename, config_mfcc_mcep)
            ppg_sentence = converter.predict(mfcc)
            result = transformer.predict(ppg_sentence).numpy()

            # Extract F0 and aperiodicity from the source file.
            x, _ = librosa.load(filename,
                                sr=config_mfcc_mcep["sampling_frequency"])
            x = x.astype(np.float64)
            _f0, t = pw.dio(
                x, config_mfcc_mcep["sampling_frequency"])  # default frame period (frame_period=10 disabled)
            f0_try = pw.stonemask(x, _f0, t,
                                  config_mfcc_mcep["sampling_frequency"]
                                  )  # refinement of F0 using StoneMask
            # NOTE(review): D4C receives the raw DIO estimate (_f0) while
            # synthesis below uses the refined f0_try — confirm this is
            # intended.
            ap_try = pw.d4c(x=x,
                            f0=_f0,
                            temporal_positions=t,
                            fs=config_mfcc_mcep["sampling_frequency"],
                            fft_size=config_mfcc_mcep["n_fft"])

            # Use the transformed result: every predicted frame is repeated
            # twice (indices 0, 0, 1, 1, ...), converted to a spectral
            # envelope and cut to the number of aperiodicity frames.
            indices = sorted(np.concatenate([np.arange(len(result))] * 2))
            alpha = 0.35
            spc = pysptk.mc2sp(result[indices], alpha,
                               config_mfcc_mcep["n_fft"]).astype(
                                   np.float64)[:len(ap_try)]
            y2 = pw.synthesize(f0_try, spc, ap_try,
                               config_mfcc_mcep["sampling_frequency"])
            # Write the converted waveform to final_directory + l.
            endfile = final_directory + l
            scipy.io.wavfile.write(endfile,
                                   config_mfcc_mcep["sampling_frequency"], y2)
Beispiel #16
0
def wav2world(
        wave, fs,
        mcep_order=25, f0_smoothing=0,
        ap_smoothing=0, mcep_smoothing=0,
        frame_period=None, f0_floor=None, f0_ceil=None,
        f0_mode="harvest"):
    """Extract WORLD-based acoustic features from a waveform.

    Args:
        wave: waveform array; cast to float64 before analysis.
        fs: sampling rate.
        mcep_order: mel-cepstrum order.
        f0_smoothing, ap_smoothing, mcep_smoothing: modulation-spectrum
            cut-offs for the respective stream; ``0`` disables smoothing.
        frame_period, f0_floor, f0_ceil: analysis settings; ``None`` selects
            the pyworld default.
        f0_mode: F0 estimator, one of ``"harvest"``, ``"reaper"``, ``"dio"``.

    Returns:
        Tuple ``(mcep, clf0, vuv, cap, sp, fbin, t)``: mel-cepstrum,
        continuous log-F0, voiced/unvoiced flags (int), coded aperiodicity,
        spectral envelope, number of envelope bins and the time axis.

    Raises:
        ValueError: if ``f0_mode`` is not a supported estimator.
    """
    # setup default values
    wave = wave.astype('float64')

    frame_period = pyworld.default_frame_period \
        if frame_period is None else frame_period
    f0_floor = pyworld.default_f0_floor if f0_floor is None else f0_floor
    f0_ceil = pyworld.default_f0_ceil if f0_ceil is None else f0_ceil
    alpha = pysptk.util.mcepalpha(fs)

    # f0 (each estimator comes with its own D4C voicing threshold)
    if f0_mode == "harvest":
        f0, t = pyworld.harvest(
            wave, fs,
            f0_floor=f0_floor, f0_ceil=f0_ceil,
            frame_period=frame_period)
        threshold = 0.85

    elif f0_mode == "reaper":
        # reaper receives 16-bit integer samples and a frame period in
        # seconds.
        # NOTE(review): the int16 scaling assumes ``wave`` lies within
        # [-1, 1] — confirm against callers.
        _, _, t, f0, _ = reaper(
            (wave * (2**15 - 1)).astype("int16"),
            fs, frame_period=frame_period / 1000,
            do_hilbert_transform=True)
        t, f0 = t.astype('float64'), f0.astype('float64')
        threshold = 0.1

    elif f0_mode == "dio":
        # Forward the analysis settings: DIO used to run with pyworld's
        # defaults, so a non-default frame_period produced frames that did
        # not match the 1000 / frame_period rate assumed by the smoothing
        # below, and f0_floor / f0_ceil were silently ignored.
        _f0, t = pyworld.dio(
            wave, fs,
            f0_floor=f0_floor, f0_ceil=f0_ceil,
            frame_period=frame_period)
        f0 = pyworld.stonemask(wave, _f0, t, fs)
        threshold = 0.0

    else:
        raise ValueError(
            "unsupported f0_mode: {!r} "
            "(expected 'harvest', 'reaper' or 'dio')".format(f0_mode))

    # world
    sp = pyworld.cheaptrick(wave,  f0, t, fs)
    ap = pyworld.d4c(wave, f0, t, fs, threshold=threshold)

    # extract vuv from ap: voiced means low aperiodicity in the first band
    # and F0 above 1 Hz
    vuv_flag = (ap[:, 0] < 0.5) * (f0 > 1.0)
    vuv = vuv_flag.astype('int')

    # continuous log f0
    clf0 = np.zeros_like(f0)
    if vuv_flag.any():
        # Anchor both ends with the nearest voiced value so that the
        # interpolation below never has to extrapolate.
        if not vuv_flag[0]:
            f0[0] = f0[vuv_flag][0]
            vuv_flag[0] = True
        if not vuv_flag[-1]:
            f0[-1] = f0[vuv_flag][-1]
            vuv_flag[-1] = True

        idx = np.arange(len(f0))
        clf0[idx[vuv_flag]] = np.log(
            np.clip(f0[idx[vuv_flag]], f0_floor / 2, f0_ceil * 2))
        clf0[idx[~vuv_flag]] = interp1d(
            idx[vuv_flag], clf0[idx[vuv_flag]]
        )(idx[~vuv_flag])

        if f0_smoothing > 0:
            clf0 = modspec_smoothing(
                clf0, 1000 / frame_period, cut_off=f0_smoothing)
    else:
        # NOTE(review): for fully unvoiced input this stores f0_floor itself
        # rather than log(f0_floor), although clf0 is otherwise in the log
        # domain. Left unchanged; confirm the intended value.
        clf0 = np.ones_like(f0) * f0_floor

    # continuous coded ap
    cap = pyworld.code_aperiodicity(ap, fs)

    if ap_smoothing > 0:
        cap = modspec_smoothing(cap, 1000 / frame_period, cut_off=ap_smoothing)

    # mcep
    mcep = pysptk.mcep(sp, order=mcep_order, alpha=alpha, itype=4)

    if mcep_smoothing > 0:
        mcep = modspec_smoothing(
            mcep, 1000 / frame_period, cut_off=mcep_smoothing)

    fbin = sp.shape[1]
    return mcep, clf0, vuv, cap, sp, fbin, t
Beispiel #17
0
    fs = sr
    fft_len = 1024
    hop_length = 256
    frame_period = hop_length / sr * 1000  # hop_length in ms
    f0_floor = 71.  # default
    f0_ceil = 800.  # default

    f0, timeaxis = pyworld.dio(x,
                               fs=sr,
                               f0_floor=f0_floor,
                               f0_ceil=f0_ceil,
                               frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    sp = pyworld.cheaptrick(x, f0, timeaxis, fs,
                            fft_size=fft_len)  # Spectrogram
    ap = pyworld.d4c(x, f0, timeaxis, fs, fft_size=fft_len)  # Aperiodicity

    plt.subplot(3, 1, 1)
    plt.plot(f0)
    plt.subplot(3, 1, 2)
    plt.plot(lf0)
    plt.subplot(3, 1, 3)
    librosa.display(sp.T, sr=sr, hop_length=hop_length, y_axis='linear')
    plt.show()

    y = pyworld.synthesize(f0, sp, ap, fs, frame_period)

    play_audio(y)

    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    mgc = pysptk.sp2mc(spectrogram,
import peakutils
import soundfile as sf
import argparse
from audiolazy import *
from audiolazy.lazy_stream import Stream
from audiolazy import Stream
# Print arrays in full instead of NumPy's summarised form.
np.set_printoptions(threshold=np.inf)

# Inline CSS for bordered, full-width HTML tables.
styletext = '<style>table {width:100%;}table, th, td {border: 1px solid black;border-collapse: collapse;}</style>'
# Input recording analysed by this script.
path = "vaiueo2d.wav"
x, fs = sf.read(path)
print(x.shape)
# WORLD analysis with the raw DIO estimate (no StoneMask refinement):
# F0 in the 50-600 Hz range, then spectral envelope and aperiodicity.
f0, t = pw.dio(x, fs, f0_floor=50.0, f0_ceil=600.0)
sp = pw.cheaptrick(x, f0, t, fs)
ap = pw.d4c(x, f0, t, fs)


#_y = pw.synthesize(f0, sp, ap, fs)
#if it has less that temporal position pop off the last element
#################################
#DEFINE FORMANT DETECTION FUNCTION
def cam_formants(x, fs):
    ms10 = math.ceil(fs * 0.005)
    ms30 = math.floor(fs * 0.03)
    ncoeff = 2 + fs / 1000
    t = np.arange(0, len(x) - 1)
    t = t / fs
    pos = 1
    fm = []
    ft = []