Example #1
0
def convertWavIntoF0seqMCEPseq(wav, fs, frame_period=5.0, MCEPdim=24):
    """
    Analyse one waveform with WORLD and return its F0 contour and coded envelope.

    Args:
        wav (np.ndarray): waveform samples; cast to float64 before analysis
        fs: sampling rate handed to every pyworld call
        frame_period (float): analysis frame shift [ms], forwarded to Harvest
        MCEPdim (int): number of dimensions of the coded spectral envelope

    Returns:
        tuple: (f0seq, MCEPseq) - the Harvest F0 output unchanged, and the
        coded envelope transposed and cast to float32
    """
    samples = wav.astype(np.float64)
    f0seq, timeaxis = pyworld.harvest(
        samples, fs, frame_period=frame_period, f0_floor=71.0, f0_ceil=800.0)
    envelope = pyworld.cheaptrick(samples, f0seq, timeaxis, fs)
    MCEPseq = pyworld.code_spectral_envelope(envelope, fs, MCEPdim)
    # The shape reported here is the one *before* the transpose applied below.
    print(
        f"F0&MCEP-nized! {wav.shape[0] / fs} [sec] wav => {f0seq.shape}, {MCEPseq.shape}"
    )
    mcep_transposed = MCEPseq.T
    return f0seq, mcep_transposed.astype(np.float32)
Example #2
0
def world_spectrogram(wav, sr=_sr, dim_num=32, **kwargs):
    """Analyse a waveform with the WORLD vocoder and return coded features.

    Args:
        wav: waveform array; cast to double precision before analysis.
        sr: sampling rate passed to every pyworld call.
        dim_num: number of dimensions of the coded spectral envelope.
        **kwargs: optional overrides:
            frame_period: frame shift (default ``pw.default_frame_period``).
            f0_floor, f0_ceil: F0 search range (pyworld defaults).
            fft_size: FFT size for CheapTrick and D4C (default derived from
                ``sr`` and ``f0_floor``).
            ap_threshold: threshold passed to D4C (default 0.85).
            f0_extractor: ``"dio"`` (default), ``"harvest"``, or a callable
                accepting ``(x, sr, f0_floor=, f0_ceil=, frame_period=)``
                and returning ``(f0, t)``.

    Returns:
        tuple: ``(f0, sp_enc, ap_enc)`` - F0 contour, coded spectral envelope
        and coded aperiodicity.
    """
    # Collect the analysis parameters, falling back to pyworld defaults.
    frame_period = kwargs.get("frame_period", pw.default_frame_period)
    f0_floor = kwargs.get("f0_floor", pw.default_f0_floor)
    f0_ceil = kwargs.get("f0_ceil", pw.default_f0_ceil)
    fft_size = kwargs.get("fft_size", pw.get_cheaptrick_fft_size(sr, f0_floor))
    ap_threshold = kwargs.get("ap_threshold", 0.85)
    f0_extractor = kwargs.get("f0_extractor", "dio")
    x = wav.astype(np.double)

    # Estimate F0. Every extractor receives the same frame_period so that a
    # caller-supplied value is honoured regardless of the algorithm chosen
    # (previously the DIO branch silently fell back to its own default).
    if f0_extractor == "dio":
        f0, t = pw.dio(x, sr, f0_floor=f0_floor, f0_ceil=f0_ceil, frame_period=frame_period)
    elif f0_extractor == "harvest":
        f0, t = pw.harvest(x, sr, f0_floor=f0_floor, f0_ceil=f0_ceil, frame_period=frame_period)
    else:
        f0, t = f0_extractor(x, sr, f0_floor=f0_floor, f0_ceil=f0_ceil, frame_period=frame_period)

    # Spectral envelope via CheapTrick, then dimensionality reduction.
    sp = pw.cheaptrick(x, f0, t, sr, f0_floor=f0_floor, fft_size=fft_size)
    sp_enc = pw.code_spectral_envelope(sp, sr, number_of_dimensions=dim_num)

    # Aperiodicity via D4C, then dimensionality reduction.
    ap = pw.d4c(x, f0, t, sr, threshold=ap_threshold, fft_size=fft_size)
    ap_enc = pw.code_aperiodicity(ap, sr)
    return f0, sp_enc, ap_enc
Example #3
0
def cal_mcep(wav_ori, fs=SAMPLE_RATE, ispad=False, frame_period=0.005, dim=FEATURE_DIM, fft_size=FFTSIZE):
    '''Compute WORLD features (f0, ap, sp and coded sp) for a waveform.

    frame_period is only forwarded to pad_wav_to_get_fixed_frames; the
    pyworld calls below do not receive it.

    Returns a dict with the keys 'f0', 'ap', 'sp' and 'coded_sp', where
    'coded_sp' is the coded envelope after transposition.
    '''
    wav = wav_ori
    if ispad:
        # The second value returned by the padding helper is not needed here.
        wav, _ = pad_wav_to_get_fixed_frames(wav_ori, frames=FRAMES, frame_period=frame_period, sr=fs)

    # Harvest: F0 contour and the matching frame positions.
    f0, timeaxis = pyworld.harvest(wav, fs)
    # CheapTrick: harmonic spectral envelope.
    sp = pyworld.cheaptrick(wav, f0, timeaxis, fs, fft_size=fft_size)
    # D4C: aperiodicity.
    ap = pyworld.d4c(wav, f0, timeaxis, fs, fft_size=fft_size)
    # Reduce the envelope to `dim` coefficients and transpose the result.
    coded_sp = pyworld.code_spectral_envelope(sp, fs, dim).T

    return {'f0': f0, 'ap': ap, 'sp': sp, 'coded_sp': coded_sp}
Example #4
0
def world_features(wav, sr, fft_size, dim):
    """Run Harvest, CheapTrick and D4C on `wav`, then code the envelope.

    Returns the tuple (f0, timeaxis, sp, ap, coded_sp).
    """
    f0, timeaxis = pyworld.harvest(wav, sr)
    # CheapTrick and D4C share the same positional inputs.
    analysis_args = (wav, f0, timeaxis, sr)
    sp = pyworld.cheaptrick(*analysis_args, fft_size=fft_size)
    ap = pyworld.d4c(*analysis_args, fft_size=fft_size)
    coded_sp = pyworld.code_spectral_envelope(sp, sr, dim)

    return f0, timeaxis, sp, ap, coded_sp
Example #5
0
def world_encode_spectral_envelop(sp, fs, dim=24):
    """Code the spectral envelope `sp` down to `dim` coefficients (MCEPs)."""
    return pyworld.code_spectral_envelope(sp, fs, dim)
Example #6
0
def wav2mcep(filepath):
    '''
    Load a wav file, trim it with librosa and extract WORLD features.

    Relies on the module-level settings sampling_rate, n_fft and n_mels.

    return (in this order, all float64):
      f0: shape [ T, ]
      ap: shape [ T, n_fft/2 + 1 ]
      sp: shape [ T, n_fft/2 + 1 ]
      mcep: shape [ n_mels, T ]
    '''
    y, sr = librosa.load(filepath, sr=sampling_rate)
    y, _ = librosa.effects.trim(y)
    y = np.asarray(y, dtype=np.double)

    # Use the rate reported by librosa for every pyworld call so that the
    # F0 estimate and the envelope/aperiodicity analysis can never disagree.
    f0, timeaxis = pyworld.harvest(y, sr)
    sp = pyworld.cheaptrick(y, f0, timeaxis, sr, fft_size=n_fft)
    ap = pyworld.d4c(y, f0, timeaxis, sr, fft_size=n_fft)
    mcep = pyworld.code_spectral_envelope(sp, sr, n_mels)
    mcep = mcep.T  # dim x n

    f0 = f0.astype(np.float64)
    sp = sp.astype(np.float64)
    ap = ap.astype(np.float64)
    mcep = mcep.astype(np.float64)
    return f0, ap, sp, mcep
Example #7
0
def cal_mcep(wav_ori, fs, dim, fft_size):
    """Return (f0, ap, coded_sp) for a waveform using the WORLD analysis chain.

    coded_sp is the coded spectral envelope after transposition.
    """
    f0, timeaxis = pyworld.harvest(wav_ori, fs)
    envelope = pyworld.cheaptrick(wav_ori, f0, timeaxis, fs, fft_size=fft_size)
    ap = pyworld.d4c(wav_ori, f0, timeaxis, fs, fft_size=fft_size)
    coded_sp = pyworld.code_spectral_envelope(envelope, fs, dim).T
    return f0, ap, coded_sp
def world_encode_spectral_envelop(sp, fs, dim = 24):
    """Return the `dim`-dimensional coded form (MCEPs) of the envelope `sp`."""
    return pyworld.code_spectral_envelope(sp, fs, dim)
def world_features(wav, sr, fft_size, dim):
    """Extract WORLD features from a waveform.

    Returns (f0, timeaxis, sp, ap, coded_sp):
      f0, timeaxis - Harvest output (F0 = 1/T0, where T0 is the time between
                     two successive laryngeal pulses of voiced speech)
      sp           - smoothed spectrogram from CheapTrick
      ap           - aperiodicity from D4C (power ratio between the speech
                     signal and its aperiodic component)
      coded_sp     - `sp` reduced to `dim` coefficients
    """
    f0, timeaxis = pyworld.harvest(wav, sr)
    shared = dict(fft_size=fft_size)
    sp = pyworld.cheaptrick(wav, f0, timeaxis, sr, **shared)
    ap = pyworld.d4c(wav, f0, timeaxis, sr, **shared)
    coded_sp = pyworld.code_spectral_envelope(sp, sr, dim)

    return f0, timeaxis, sp, ap, coded_sp
Example #10
0
def _process_utterance(out_dir, index, wav_path, text, phone):
    '''Preprocess one utterance and store its features as .npy files.

    A linear-scale spectrogram is always written. The second file holds
    either the stacked WORLD features (when hparams.vocoder == "world") or a
    mel-scale spectrogram.

    Args:
      out_dir: Directory the .npy files are written to
      index: Number used to build the output filenames
      wav_path: Path of the audio file to load
      text: Passed through unchanged into the returned tuple
      phone: Passed through unchanged into the returned tuple

    Returns:
      A (spectrogram_filename, encoded_filename, n_frames, text, phone) tuple
    '''
    wav = audio.load_wav(wav_path)
    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Both modes start from the same linear-scale spectrogram.
    spectrogram = audio.spectrogram(wav).astype(np.float32)

    if hparams.vocoder == "world":
        f0, sp, ap = pw.wav2world(wav.astype(np.double), hparams.sample_rate)
        ap_coded = pw.code_aperiodicity(ap, hparams.sample_rate)
        sp_coded = pw.code_spectral_envelope(sp, hparams.sample_rate, hparams.coded_env_dim)
        # One row per frame: [f0 | coded envelope | coded aperiodicity].
        encoded = np.hstack([f0[:, np.newaxis], sp_coded, ap_coded])
        n_frames = encoded.shape[0]
        encoded_filename = 'synpaflex-world-%05d.npy' % index
    else:
        n_frames = spectrogram.shape[1]
        encoded = audio.melspectrogram(wav).astype(np.float32).T
        encoded_filename = 'synpaflex-mel-%05d.npy' % index

    # Write both arrays only after every feature has been computed.
    spectrogram_filename = 'synpaflex-spec-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, encoded_filename), encoded, allow_pickle=False)

    return (spectrogram_filename, encoded_filename, n_frames, text, phone)
Example #11
0
def get_sp(fpath: str):
    """Return the transposed coded spectral envelope of the audio at `fpath`.

    The file is loaded as mono float64 at hp.SR, analysed with Harvest and
    CheapTrick (fft_size=hp.N_FFT) and coded down to hp.CODED_DIM dimensions.
    """
    # librosa.load returns (signal, sample rate); only the signal is used.
    signal, _ = librosa.load(fpath, sr=hp.SR, mono=True, dtype=np.float64)
    f0, timeaxis = pw.harvest(signal, hp.SR)
    # CheapTrick estimates the harmonic spectral envelope.
    envelope = pw.cheaptrick(signal, f0, timeaxis, hp.SR, fft_size=hp.N_FFT)
    # code_spectral_envelope reduces the dimensionality of the envelope.
    coded = pw.code_spectral_envelope(envelope, hp.SR, hp.CODED_DIM)
    return np.array(coded.T)
Example #12
0
    def compute_world_cmvn(self, enable_load_from_disk, entries_person_wavs,
                           sp_dim, fft_size, fs, speakers):
        """Compute per-speaker mean/variance of coded sp and log-F0.

        For every speaker the coded spectral envelopes of all their files are
        concatenated along axis 1 and the F0 values are stacked; the
        statistics are stored in ``self.cmvn_dict[speaker]`` as
        ``(coded_sps_mean, coded_sps_var, log_f0s_mean, log_f0s_var)``.

        Args:
            enable_load_from_disk: when true, each entry is opened with
                ``np.load`` and must provide the keys ``"f0"`` and
                ``"coded_sp"``; otherwise each entry is an audio file that is
                analysed with pyworld.
            entries_person_wavs: mapping from speaker to an iterable of file
                paths.
            sp_dim: number of dimensions of the coded spectral envelope.
            fft_size: FFT size passed to CheapTrick.
            fs: sampling rate used for loading and analysis.
            speakers: iterable of speakers to process.

        Returns:
            None; results are written to ``self.cmvn_dict``.
        """
        start = time.time()
        for speaker in speakers:
            coded_sps, f0s = [], []
            for audio_file in entries_person_wavs[speaker]:
                if enable_load_from_disk:
                    # Pre-computed features: do NOT decode the file as audio
                    # (the previous unconditional librosa.load did exactly
                    # that, on a file that holds features rather than audio).
                    samples = np.load(audio_file)
                    f0, coded_sp = samples["f0"], samples["coded_sp"]
                else:
                    wav, _ = librosa.load(audio_file,
                                          sr=fs,
                                          mono=True,
                                          dtype=np.float64)
                    # WORLD analysis: F0 contour, then the harmonic spectral
                    # envelope (CheapTrick), then dimensionality reduction.
                    f0, timeaxis = pyworld.harvest(wav, fs)
                    sp = pyworld.cheaptrick(wav,
                                            f0,
                                            timeaxis,
                                            fs,
                                            fft_size=fft_size)
                    coded_sp = pyworld.code_spectral_envelope(sp, fs, sp_dim).T
                coded_sps.append(coded_sp)
                f0s.append(np.reshape(f0, [-1, 1]))

            # Mean and variance of the coded envelope, one value per row.
            coded_sps_concatenated = np.concatenate(coded_sps, axis=1)
            coded_sps_mean = list(
                np.mean(coded_sps_concatenated, axis=1, keepdims=False))
            coded_sps_var = list(
                np.var(coded_sps_concatenated, axis=1, keepdims=False))
            # np.ma.log masks non-positive entries, so frames with F0 == 0
            # do not contribute to the log-F0 statistics.
            log_f0s_concatenated = np.ma.log(np.concatenate(f0s))
            log_f0s_mean = log_f0s_concatenated.mean()
            log_f0s_var = log_f0s_concatenated.var()
            self.cmvn_dict[speaker] = (coded_sps_mean, coded_sps_var,
                                       log_f0s_mean, log_f0s_var)

        logging.info("finished compute cmvn, which cost %.4f s",
                     time.time() - start)
def world_features(wav, sr, fft_size, dim):
    """Extract the WORLD feature set from a waveform.

    Steps:
      1. pyworld.harvest   - F0 and the time position of every frame
      2. pyworld.cheaptrick - harmonic spectral envelope
      3. pyworld.d4c       - aperiodicity
      4. pyworld.code_spectral_envelope - envelope reduced to `dim` dimensions

    Returns (f0, timeaxis, sp, ap, coded_sp).
    """
    f0, timeaxis = pyworld.harvest(wav, sr)
    sp, ap = (
        estimate(wav, f0, timeaxis, sr, fft_size=fft_size)
        for estimate in (pyworld.cheaptrick, pyworld.d4c)
    )
    coded_sp = pyworld.code_spectral_envelope(sp, sr, dim)
    return f0, timeaxis, sp, ap, coded_sp
Example #14
0
def wav2coded_sp(wav_file, dim=35):
    """Read a wav file and return its coded spectral envelope and friends.

    F0 is estimated with Harvest and refined with StoneMask; CheapTrick and
    D4C then run on the refined F0.

    Args:
        wav_file: path (or file object) accepted by ``sf.read``.
        dim: number of dimensions of the coded spectral envelope. Defaults to
            35, the value that used to be hard-coded.

    Returns:
        tuple: ``(coded_sp, f0_h, ap_h, x)`` - coded envelope, refined F0,
        aperiodicity and the samples as read from the file.
    """
    # load wav
    x, fs = sf.read(wav_file)

    # Harvest with F0 refinement (using StoneMask)
    _f0_h, t_h = pw.harvest(x, fs)
    f0_h = pw.stonemask(x, _f0_h, t_h, fs)
    sp_h = pw.cheaptrick(x, f0_h, t_h, fs)  # harmonic spectral envelope
    ap_h = pw.d4c(x, f0_h, t_h, fs)

    # Get Mel-cepstral coefficients (MCEPs)
    coded_sp = pw.code_spectral_envelope(sp_h, fs, dim)

    return coded_sp, f0_h, ap_h, x
Example #15
0
    def _encode_sp(sp, sr, mel_bins):
        """Code a spectral envelope into `mel_bins` dimensions.

        Args:
            sp: Spectral envelope
            sr: Sampling rate
            mel_bins: Number of dimensions of the coded envelope

        Returns:
            The result of pw.code_spectral_envelope(sp, sr, mel_bins)
        """
        return pw.code_spectral_envelope(sp, sr, mel_bins)
Example #16
0
def convertWavIntoFeatures(wav, fs, frame_period=5.0, MCEPdim=24):
    """Extract F0, coded spectral envelope and aperiodicity from a waveform.

    Returns (f0seq, MCEPseq, APseq) where MCEPseq is the coded envelope
    transposed and cast to float32; f0seq and APseq are returned as produced
    by pyworld.
    """
    samples = wav.astype(np.float64)
    f0seq, timeaxis = pyworld.harvest(
        samples, fs, frame_period=frame_period, f0_floor=71.0, f0_ceil=800.0)
    envelope = pyworld.cheaptrick(samples, f0seq, timeaxis, fs)
    coded = pyworld.code_spectral_envelope(envelope, fs, MCEPdim)
    APseq = pyworld.d4c(samples, f0seq, timeaxis, fs)
    MCEPseq = coded.T.astype(np.float32)
    return f0seq, MCEPseq, APseq
Example #17
0
def world_encode_spectral_envelop(sp, fs, dim=24):
    '''
    SP -> MCEPs: code a spectral envelope into Mel-cepstral coefficients.

    :param sp: spectral envelope
    :param fs: sampling rate
    :param dim: number of coded dimensions
    :return: the coded spectral envelope returned by pyworld
    '''
    return pyworld.code_spectral_envelope(sp, fs, dim)
Example #18
0
    def world_feature_extract(self, wav):
        """Extract WORLD vocoder features and return them as float32 tensors.

        The WORLD vocoder parameterizes speech into three components:
            Pitch (fundamental frequency, F0) contour
            Harmonic spectral envelope (sp)
            Aperiodic spectral envelope (relative to the harmonic one, ap)
        See https://github.com/JeremyCCHsu/Python-Wrapper-for-World-Vocoder

        Returns (f0, sp, ap, coded_sp).
        """
        f0, timeaxis = pyworld.harvest(wav, self.fs)
        sp = pyworld.cheaptrick(wav, f0, timeaxis, self.fs, fft_size=self.fft_size)
        ap = pyworld.d4c(wav, f0, timeaxis, self.fs, fft_size=self.fft_size)
        coded_sp = pyworld.code_spectral_envelope(sp, self.fs, self.sp_dim)

        # Convert every feature in the same order the original code did.
        f0, sp, ap, coded_sp = [
            tf.convert_to_tensor(feature, dtype=tf.float32)
            for feature in (f0, sp, ap, coded_sp)
        ]
        return f0, sp, ap, coded_sp
Example #19
0
def worldEncodeSpectralEnvelop(sp: np.ndarray,
                               fs: int = SAMPLE_RATE,
                               dim: int = 36) -> np.ndarray:
    '''
    Build MCEPs from a spectral envelope.

    Parameters
    ----------
    sp: np.ndarray
        Spectral envelope data
    fs: int, default SAMPLE_RATE
        Sampling frequency
    dim: int, default 36
        Number of dimensions of the coded envelope

    Returns
    -------
    coded_sp: np.ndarray
        MCEPs of the spectral envelope
    '''
    return pyworld.code_spectral_envelope(sp, fs, dim)
Example #20
0
def process_wav(wav_path):
    """Analyse a raw 48 kHz PCM file with WORLD and write a pitch-shifted copy.

    The file is read as headerless 16-bit little-endian mono audio, resampled
    to 32 kHz and analysed with DIO, CheapTrick and D4C. F0 and the coded
    aperiodicity are saved under ``data/prepared_data``; a resynthesis with
    F0 lowered by 200 is written to ``./data/gen_wav/test-200.wav``.

    Args:
        wav_path: path of the raw audio file to read.
    """
    y, osr = sf.read(wav_path,
                     subtype='PCM_16',
                     channels=1,
                     samplerate=48000,
                     endian='LITTLE')  # , start=56640, stop=262560)

    sr = 32000
    # Keyword arguments: newer librosa releases no longer accept the rates
    # positionally, and the keyword names are valid in older releases too.
    y = librosa.resample(y, orig_sr=osr, target_sr=sr)

    # Estimate F0 with the DIO algorithm
    _f0, t = pw.dio(y,
                    sr,
                    f0_floor=50.0,
                    f0_ceil=800.0,
                    channels_in_octave=2,
                    frame_period=pw.default_frame_period)
    print(_f0.shape)

    # Estimate the spectral envelope with CheapTrick
    _sp = pw.cheaptrick(y, _f0, t, sr)

    code_sp = pw.code_spectral_envelope(_sp, sr, 60)
    print(_sp.shape, code_sp.shape)
    # Estimate the aperiodicity
    _ap = pw.d4c(y, _f0, t, sr)

    code_ap = pw.code_aperiodicity(_ap, sr)
    print(_ap.shape, code_ap.shape)

    np.save('data/prepared_data/f0', _f0)
    np.save('data/prepared_data/ap', code_ap)

    # Resynthesise with F0 shifted down by 200.
    # NOTE(review): the shift is applied to every frame, including frames
    # whose F0 is 0 - confirm that negative values are intended here.
    synthesized = pw.synthesize(_f0 - 200, _sp, _ap, sr,
                                pw.default_frame_period)
    # Use the analysis rate for synthesis and output instead of repeating
    # the literal 32000.
    sf.write('./data/gen_wav/test-200.wav', synthesized, sr)
def main(args):
    """Run the WORLD analysis/synthesis demo and write results into 'test'.

    Reads 'utterance/p226_002.wav', resynthesises it in several ways (one-call
    analysis, DIO with and without StoneMask refinement, Harvest, and two
    code/restore round trips) and saves the wav files plus comparison plots.

    Args:
        args: object providing `frame_period` and `speed`.
    """
    # Start from an empty output directory.
    if os.path.isdir('test'):
        rmtree('test')
    os.mkdir('test')

    x, fs = sf.read('utterance/p226_002.wav')

    # 1. The convenient way: one call with default options.
    f0, sp, ap = pw.wav2world(x, fs)
    y = pw.synthesize(f0, sp, ap, fs, pw.default_frame_period)

    # 2. Step by step.
    # Options shared by both DIO calls below.
    dio_options = dict(f0_floor=50.0,
                       f0_ceil=600.0,
                       channels_in_octave=2,
                       speed=args.speed)

    # 2-1 DIO without F0 refinement
    _f0, t = pw.dio(x, fs, frame_period=args.frame_period, **dio_options)
    _sp = pw.cheaptrick(x, _f0, t, fs)
    _ap = pw.d4c(x, _f0, t, fs)
    _y = pw.synthesize(_f0, _sp, _ap, fs, args.frame_period)
    sf.write('test/y_without_f0_refinement.wav', _y, fs)

    # 2-2 DIO with F0 refinement (using Stonemask)
    f0 = pw.stonemask(x, _f0, t, fs)
    sp = pw.cheaptrick(x, f0, t, fs)
    ap = pw.d4c(x, f0, t, fs)
    y = pw.synthesize(f0, sp, ap, fs, args.frame_period)
    sf.write('test/y_with_f0_refinement.wav', y, fs)

    # 2-3 Harvest with F0 refinement (using Stonemask)
    _f0_h, t_h = pw.harvest(x, fs)
    f0_h = pw.stonemask(x, _f0_h, t_h, fs)
    sp_h = pw.cheaptrick(x, f0_h, t_h, fs)
    ap_h = pw.d4c(x, f0_h, t_h, fs)
    y_h = pw.synthesize(f0_h, sp_h, ap_h, fs, pw.default_frame_period)
    sf.write('test/y_harvest_with_f0_refinement.wav', y_h, fs)

    # 2-4 Code and restore sp/ap from step 2-2.
    code_sp = pw.code_spectral_envelope(sp, fs, 80)
    code_ap = pw.code_aperiodicity(ap, fs)
    # Recover the FFT size from the width of the envelope.
    fft_size = (sp.shape[1] - 1) * 2
    rest_sp = pw.decode_spectral_envelope(code_sp, fs, fft_size)
    rest_ap = pw.decode_aperiodicity(code_ap, fs, fft_size)
    y_r = pw.synthesize(f0, rest_sp, rest_ap, fs, args.frame_period)
    sf.write('test/y_with_f0_refinement_code_and_restore.wav', y_r, fs)
    print(f"fft size: {fft_size:d}")
    print(f"coded sp shape: ({code_sp.shape[0]:d}, {code_sp.shape[1]:d})")
    print(f"coded ap shape: ({code_ap.shape[0]:d}, {code_ap.shape[1]:d})")

    # 2-5 Same round trip as 2-4, but analysed with a 12.5 ms frame period.
    f0_xx, t_xx = pw.dio(x, fs, frame_period=12.5, **dio_options)
    f0_xx = pw.stonemask(x, f0_xx, t_xx, fs)
    sp_xx = pw.cheaptrick(x, f0_xx, t_xx, fs)
    ap_xx = pw.d4c(x, f0_xx, t_xx, fs)
    code_sp_xx = pw.code_spectral_envelope(sp_xx, fs, 80)
    code_ap_xx = pw.code_aperiodicity(ap_xx, fs)
    fft_size = (sp_xx.shape[1] - 1) * 2
    rest_sp_xx = pw.decode_spectral_envelope(code_sp_xx, fs, fft_size)
    rest_ap_xx = pw.decode_aperiodicity(code_ap_xx, fs, fft_size)
    y_r_xx = pw.synthesize(f0_xx, rest_sp_xx, rest_ap_xx, fs, 12.5)
    sf.write(
        'test/y_with_f0_refinement_code_and_restore_frame_period_12.5.wav',
        y_r_xx, fs)
    print(f"coded sp_xx shape: ({code_sp_xx.shape[0]:d}, {code_sp_xx.shape[1]:d})")
    print(f"coded ap_xx shape: ({code_ap_xx.shape[0]:d}, {code_ap_xx.shape[1]:d})")

    # Comparison plots
    savefig('test/wavform.png', [x, _y, y, y_h, y_r, y_r_xx])
    savefig('test/sp.png', [_sp, sp, sp_h, rest_sp, rest_sp_xx])
    savefig('test/ap.png', [_ap, ap, ap_h, rest_ap, rest_ap_xx], log=False)
    savefig('test/f0.png', [_f0, f0, f0_h, f0_xx])

    print('Please check "test" directory for output files')
def encode_spectral_envelop(spect, sampling_rate, dim=24):
    """Code the spectral envelope `spect` into `dim` dimensions via pyworld."""
    return pyworld.code_spectral_envelope(spect, sampling_rate, dim)
Example #23
0
# Script excerpt: WORLD analysis of `x` (sampled at `fs`), an edited copy of
# the F0 contour, a code/decode round trip of sp and ap, and two resyntheses.
# `x`, `fs`, `start` and `timer` are defined above this excerpt.
# Alternative kept for reference (pitch refinement of a DIO estimate):
#f0 = pw.stonemask(x, _f0, t, fs)  # pitch refinement
f0, t = pw.harvest(x, fs)
sp = pw.cheaptrick(x, f0, t, fs)  # extract smoothed spectrogram
ap = pw.d4c(x, f0, t, fs)  # extract aperiodicity
end = timer()
print('Feature Extraction:', end - start, 'seconds')

# f0_new: an edited copy of the F0 contour.
# NOTE(review): f0_new is not used by any statement visible in this excerpt.
from copy import deepcopy  # copy so the edits below do not modify f0 itself
f0_new = deepcopy(f0)  # frame ranges of interest: 1-58 59-138 139-198 // 269-360 // 429-522
f0_new[1:198] = np.flip(f0_new[1:198], 0)  # reverse pitch
f0_new[269:360] = f0_new[269:360] + 62  # E (330 Hz) -> G (392 Hz)
f0_new[429:522] = f0_new[429:522] + 193  # 330 Hz + 193 Hz = 523 Hz

#%% reduce dimension of spectral envelope and aperiodicity.
enc_sp = pw.code_spectral_envelope(sp, fs, number_of_dimensions=32)
dec_sp = pw.decode_spectral_envelope(enc_sp,
                                     fs,
                                     fft_size=(sp.shape[1] - 1) * 2)

enc_ap = pw.code_aperiodicity(ap, fs)
# NOTE(review): dec_ap is computed but the synthesis calls below use `ap`.
dec_ap = pw.decode_aperiodicity(enc_ap, fs, fft_size=(ap.shape[1] - 1) * 2)

#%% resynthesis from the unmodified features
y = pw.synthesize(f0, sp, ap, fs)
# NOTE(review): presumably needs a librosa version that still ships
# librosa.output - confirm against the installed release.
librosa.output.write_wav('y_EyesNose_short_resynthesis.wav', y, fs)
#%% resynthesis from the decoded (32-dimensional) spectral envelope
y = pw.synthesize(f0, dec_sp, ap, fs)
librosa.output.write_wav('y_EyesNose_short_resynthesis_sp_decode_32.wav', y,
                         fs)
def world_encode_spectral_envelop(sp, fs, dim=24):
    """Reduce the dimensionality of a spectral envelope to `dim` (default 24)."""
    return pyworld.code_spectral_envelope(sp, fs, dim)
Example #25
0
# For every matching audio file: load, resample, mix down, run the WORLD
# analysis and dump the requested feature types next to the source file.
for file in source_dir.glob(f"**/*.{extension}"):
    print(f"{file}")
    cnt += 1
    N = None
    fs, signal = load_function(str(file))
    # np.float was only an alias of the builtin float and has been removed
    # from NumPy; np.float64 is the same dtype and works on every release.
    signal = signal.astype(np.float64)
    samplerate = args.samplerate or fs # use the requested rate if given, otherwise the file's own rate
    signal = downsampling(signal, fs, samplerate) # fs -> samplerate
    signal = convert2mono(signal)

    _f0, t = pw.dio(signal, samplerate, frame_period=args.frame_period*1000) # extract the fundamental frequency
    # _f0, t = pw.harvest(signal, samplerate, frame_period=args.frame_period*1000) # extract the fundamental frequency
    f0 = pw.stonemask(signal, _f0, t, samplerate) # refine the fundamental frequency
    sp = pw.cheaptrick(signal, f0, t, samplerate, fft_size=args.fftsize)  # extract the spectral envelope
    ap = pw.d4c(signal, f0, t, samplerate, fft_size=args.fftsize)  # extract the aperiodicity
    mcep = pw.code_spectral_envelope(sp, samplerate, args.nfilt) # extract the mel-cepstrum

    N = mcep.shape[0]

    if "spenv" in feature_type:
        np.savetxt(file.with_suffix(".spenv"), sp)

    if "mcep" in feature_type:
        np.savetxt(file.with_suffix(".mcep"), mcep)

    if "f0" in feature_type:
        np.savetxt(file.with_suffix(".f0"), f0)

    if "ap" in feature_type:
        np.savetxt(file.with_suffix(".ap"), ap)
Example #26
0
def world_encode_spectral_envelop(sp, fs, dim=24):
    """Return the Mel-cepstral coefficients (MCEPs) coded from `sp`."""
    return pyworld.code_spectral_envelope(sp, fs, dim)
def get_sp(fpath: str):
    """Return the coded spectral envelope of the audio file at `fpath`.

    The file is loaded as mono float64 at hp.SR, analysed with Harvest
    (frame_period=10) and CheapTrick (fft_size=hp.N_FFT) and coded down to
    hp.CODED_DIM dimensions.
    """
    signal, _ = librosa.load(fpath, sr=hp.SR, mono=True, dtype=np.float64)
    f0, timeaxis = pw.harvest(signal, hp.SR, frame_period=10)
    envelope = pw.cheaptrick(signal, f0, timeaxis, hp.SR, fft_size=hp.N_FFT)
    coded = pw.code_spectral_envelope(envelope, hp.SR, hp.CODED_DIM)
    return np.array(coded)
def world_encode_spectral_env(spectral_env, settings):
    """Code a spectral envelope and return its scaled inverse DCT.

    `settings` must provide 'sample_rate' and 'coded_dim'. The coded envelope
    is passed through idct and divided by sqrt(2 * coded_dim).
    """
    sample_rate = settings['sample_rate']
    coded_dim = settings['coded_dim']
    mfcc = pyworld.code_spectral_envelope(spectral_env, sample_rate, coded_dim)
    scale = np.sqrt(coded_dim * 2)
    return idct(mfcc) / scale
import pyworld

# Analyse a wav file with WORLD, code/decode its spectral envelope and
# aperiodicity, resynthesise it and write the result as 16-bit audio.
IN_WAVE_FILE = "in.wav"  # input speech
OUT_WAVE_FILE = "out.wav"  # analysed and re-synthesised speech

SP_DIM = 50  # dimension of the spectral envelope after coding

# Load the audio
fs, x = wavfile.read(IN_WAVE_FILE)
x = x.astype(np.float64)

# Analyse the audio (fundamental frequency, spectral envelope, aperiodicity)
f0, sp, ap = pyworld.wav2world(x, fs)
fft_size = pyworld.get_cheaptrick_fft_size(fs)

# Encode / decode the spectral envelope
# https://www.isca-speech.org/archive/Interspeech_2017/abstracts/0067.html
code_sp = pyworld.code_spectral_envelope(sp, fs, SP_DIM)
decode_sp = pyworld.decode_spectral_envelope(code_sp, fs, fft_size)

# Encode / decode the aperiodicity
code_ap = pyworld.code_aperiodicity(ap, fs)
decode_ap = pyworld.decode_aperiodicity(code_ap, fs, fft_size)

# Resynthesise the audio
y = pyworld.synthesize(f0, decode_sp, decode_ap, fs)
# Clip to the int16 range before casting: samples that overshoot the range
# would otherwise wrap around and produce loud clicks.
int16_range = np.iinfo(np.int16)
y = np.clip(y, int16_range.min, int16_range.max).astype(np.int16)

# Write the audio
wavfile.write(OUT_WAVE_FILE, fs, y)
Example #30
0
def world_encode_spectral_envelop(sp, fs, dim=36):
    """Code the spectral envelope `sp` into `dim` dimensions (default 36)."""
    return pyworld.code_spectral_envelope(sp, fs, dim)
Example #31
0
def gen_waveform(labels,
                 acoustic_features,
                 binary_dict,
                 continuous_dict,
                 stream_sizes,
                 has_dynamic_features,
                 subphone_features="coarse_coding",
                 log_f0_conditioning=True,
                 pitch_idx=None,
                 num_windows=3,
                 post_filter=True,
                 sample_rate=48000,
                 frame_period=5,
                 relative_f0=True):
    """Synthesise a waveform from predicted acoustic features with WORLD.

    The acoustic features are split into mgc / F0 / voicing / band
    aperiodicity streams, converted to WORLD parameters and synthesised.

    Args:
        labels: labels used to derive the score pitch when ``relative_f0``.
        acoustic_features: multi-stream acoustic feature array.
        binary_dict, continuous_dict: question dictionaries forwarded to
            ``fe.linguistic_features``.
        stream_sizes: size of every stream in ``acoustic_features``.
        has_dynamic_features: per-stream flags for dynamic features.
        subphone_features: forwarded to ``fe.linguistic_features``.
        log_f0_conditioning: accepted for interface compatibility; not read
            in this function.
        pitch_idx: forwarded to ``_midi_to_hz`` when ``relative_f0``.
        num_windows: number of windows requested from ``get_windows``.
        post_filter: apply ``merlin_post_filter`` to the mgc stream.
        sample_rate: sampling rate used for synthesis.
        frame_period: frame period passed to ``pyworld.synthesize``.
        relative_f0: when true, the F0 stream is a log-F0 offset relative to
            the score pitch; otherwise it is used as log-F0 directly.

    Returns:
        tuple: ``(f0, sp, bap, generated_waveform)`` where ``sp`` is the
        coded (60-dimensional) envelope of the down-scaled spectrogram.
    """
    windows = get_windows(num_windows)

    # Work out the sizes of the static streams.
    # NOTE(review): no MLPG is applied in this function even though dynamic
    # features may be present - confirm the caller handles it.
    if np.any(has_dynamic_features):
        static_stream_sizes = get_static_stream_sizes(stream_sizes,
                                                      has_dynamic_features,
                                                      len(windows))
    else:
        static_stream_sizes = stream_sizes

    # Split multi-stream features
    mgc, target_f0, vuv, bap = split_streams(acoustic_features,
                                             static_stream_sizes)

    # Parameters for the WORLD vocoder
    fftlen = pyworld.get_cheaptrick_fft_size(sample_rate)
    alpha = pysptk.util.mcepalpha(sample_rate)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64),
                                               sample_rate, fftlen)

    # fill aperiodicity with ones for unvoiced regions
    aperiodicity[vuv.reshape(-1) < 0.5, :] = 1.0
    # WORLD fails catastrophically for out of range aperiodicity
    aperiodicity = np.clip(aperiodicity, 0.0, 1.0)

    ### F0 ###
    if relative_f0:
        diff_lf0 = target_f0
        # need to extract pitch sequence from the musical score
        linguistic_features = fe.linguistic_features(
            labels,
            binary_dict,
            continuous_dict,
            add_frame_features=True,
            subphone_features=subphone_features)
        f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None]
        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(lf0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")

        f0 = diff_lf0 + lf0_score
    else:
        # Copy so the in-place masking below cannot write into the caller's
        # acoustic_features through the stream returned by split_streams.
        f0 = target_f0.copy()

    # Zero the unvoiced frames, then convert the remaining log-F0 to Hz.
    f0[vuv < 0.5] = 0
    voiced = np.nonzero(f0)
    f0[voiced] = np.exp(f0[voiced])

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            sample_rate, frame_period)

    # Scale the spectrogram down (to prevent clipping) before coding it.
    # TODO: choose a better scaling constant
    spectrogram *= 0.000000001
    sp = pyworld.code_spectral_envelope(spectrogram, sample_rate, 60)

    return f0, sp, bap, generated_waveform