Example 1
File: test.py Project: mayite/aukit
def test_audio_io():
    from aukit.audio_io import load_wav, save_wav, anything2bytesio, anything2wav, anything2bytes, Dict2Obj, _sr

    # _wav and _wav_bytes are module-level fixtures defined elsewhere in test.py (_sr comes from the import above)
    out = anything2bytes(_wav, sr=_sr)
    assert len(out) == len(_wav_bytes)

    out = anything2wav(_wav_bytes, sr=_sr)
    assert len(out) == len(_wav)

    my_obj = Dict2Obj({"my_key": "my_value"})
    assert my_obj.my_key == "my_value"
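
To make the round trip concrete, here is a small sketch that follows the same calls as the test above; the 440 Hz tone and the 16 kHz rate are illustrative stand-ins for the module fixtures _wav, _wav_bytes and _sr, not aukit defaults.

import numpy as np
from aukit.audio_io import anything2bytes, anything2wav, Dict2Obj

sr = 16000  # stand-in for _sr
wav = np.sin(2 * np.pi * 440 * np.arange(sr) / sr).astype(np.float32)  # 1 s test tone

data = anything2bytes(wav, sr=sr)  # waveform -> wav-encoded bytes
back = anything2wav(data, sr=sr)   # bytes -> waveform again
print(len(wav), len(back))         # expected to match, mirroring the test's assertions

obj = Dict2Obj({"my_key": "my_value"})
print(obj.my_key)                  # "my_value"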
Example 2
my_hp = {
    "n_fft": 1024, "hop_size": 256, "win_size": 1024,
    "sample_rate": _sr,
    "fmin": 0, "fmax": _sr // 2,
    "preemphasize": False,
    "symmetric_mels": True,
    "signal_normalization": False,
    "allow_clipping_in_normalization": False,
    "ref_level_db": 0,
    "__file__": __file__
}

melgan_hparams = {}
melgan_hparams.update(default_hparams)
melgan_hparams.update(my_hp)
melgan_hparams = Dict2Obj(melgan_hparams)

_pad_len = (default_hparams.n_fft - default_hparams.hop_size) // 2


def wav2mel(wav, hparams=None):
    # mel = Audio2Mel().cuda()(src)
    # return mel
    hparams = hparams or melgan_hparams
    wav = np.pad(wav.flatten(), (_pad_len, _pad_len), mode="reflect")
    mel = mel_spectrogram(wav, hparams)
    mel = mel / 20
    return mel


def load_vocoder_melgan(load_path):
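
For context, the reflect padding done at the top of wav2mel can be checked in isolation. A minimal sketch, using the my_hp values above (n_fft=1024, hop_size=256) as stand-ins for the default_hparams values that _pad_len actually reads, and a synthetic signal purely for illustration:

import numpy as np

n_fft, hop_size = 1024, 256           # my_hp values; default_hparams may differ
pad_len = (n_fft - hop_size) // 2     # 384, mirrors _pad_len above
wav = np.random.uniform(-1.0, 1.0, 16000).astype(np.float32)
padded = np.pad(wav.flatten(), (pad_len, pad_len), mode="reflect")
print(padded.shape)                   # (16000 + 2 * 384,) == (16768,)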
Example 3
    "win_size": 1024,  # 800
    "sample_rate": _sr,  # 16000
    "fmin": 0,  # 55
    "fmax": _sr // 2,  # 7600
    "preemphasize": False,  # True
    'symmetric_mels': True,  # True
    'signal_normalization': False,  # True
    'allow_clipping_in_normalization': False,  # True
    'ref_level_db': 0,  # 20
    'center': False,  # True
    '__file__': __file__
}

synthesizer_hparams = {k: v for k, v in default_hparams.items()}
synthesizer_hparams = {**synthesizer_hparams, **my_hp}
synthesizer_hparams = Dict2Obj(synthesizer_hparams)


def melspectrogram_torch(wav, hparams=None):
    """mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)"""
    mel = melspectrogram(wav, hparams)
    mel_output = torch.from_numpy(mel).type(torch.FloatTensor)
    return mel_output


def linearspectrogram_torch(wav, hparams=None):
    """spec_output: torch.FloatTensor of shape (B, n_spec_channels, T)"""
    spec = linearspectrogram(wav, hparams)
    spec_output = torch.from_numpy(spec).type(torch.FloatTensor)
    return spec_output
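
The two torch wrappers above only convert a NumPy spectrogram into a FloatTensor. A minimal standalone sketch of that conversion, with a random array standing in for the melspectrogram(wav, hparams) output:

import numpy as np
import torch

mel = np.random.randn(80, 120).astype(np.float32)           # (n_mel_channels, T) placeholder
mel_output = torch.from_numpy(mel).type(torch.FloatTensor)  # same conversion as above
print(mel_output.dtype, tuple(mel_output.shape))            # torch.float32 (80, 120)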
Example 4
hparams = Dict2Obj(
    dict(
        encoder_path=r"../models/encoder/saved_models/ge2e_pretrained.pt",
        # Comma-separated list of cleaners to run on text prior to training and eval. For non-English
        # text, you may want to use "basic_cleaners" or "transliteration_cleaners".
        cleaners="chinese_cleaners",
        center=True,
        # If you only have 1 GPU or want to use only one GPU, please set num_gpus=0 and specify the
        # GPU idx on run. example:
        # example 1 GPU of index 2 (train on "/gpu2" only): CUDA_VISIBLE_DEVICES=2 python train.py
        # --model="Tacotron" --hparams="tacotron_gpu_start_idx=2"
        # If you want to train on multiple GPUs, simply specify the number of GPUs available,
        # and the idx of the first GPU to use. example:
        # example 4 GPUs starting from index 0 (train on "/gpu0"->"/gpu3"): python train.py
        # --model="Tacotron" --hparams="tacotron_num_gpus=4, tacotron_gpu_start_idx=0"
        # The hparams arguments can be directly modified on this hparams.py file instead of being
        # specified on run if preferred!

        # If one wants to train both Tacotron and WaveNet in parallel (provided WaveNet will be
        # trained on ground-truth mel spectrograms), one needs to specify different GPU idxes.
        # example Tacotron+WaveNet on a machine with 4 or plus GPUs. Two GPUs for each model:
        # CUDA_VISIBLE_DEVICES=0,1 python train.py --model="Tacotron"
        # --hparams="tacotron_gpu_start_idx=0, tacotron_num_gpus=2"
        # CUDA_VISIBLE_DEVICES=2,3 python train.py --model="WaveNet"
        # --hparams="wavenet_gpu_start_idx=2, wavenet_num_gpus=2"

        # IMPORTANT NOTE: If using N GPUs, please multiply the tacotron_batch_size by N below in the
        # hparams! (tacotron_batch_size = 32 * N)
        # Never use lower batch size than 32 on a single GPU!
        # Same applies for Wavenet: wavenet_batch_size = 8 * N (wavenet_batch_size can be smaller than
        #  8 if GPU is having OOM, minimum 2)
        # Please also apply the synthesis batch size modification likewise. (if N GPUs are used for
        # synthesis, minimal batch size must be N, minimum of 1 sample per GPU)
        # We did not add an automatic multi-GPU batch size computation to avoid confusion in the
        # user's mind and to provide more control to the user for resource-related decisions.
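        # Worked example of the rule above (illustrative only): with N = 2 GPUs,
        # tacotron_batch_size = 32 * 2 = 64, wavenet_batch_size = 8 * 2 = 16, and the synthesis
        # batch size must be at least 2 (one sample per GPU).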

        # Acknowledgement:
        # Many thanks to @MlWoo for his awesome work on multi-GPU Tacotron, which turned out to run
        # a little faster than the original single-GPU pipeline as well. Great work!

        # Hardware setup: Default supposes user has only one GPU: "/gpu:0" (Tacotron only for now!
        # WaveNet does not support multi GPU yet, WIP)
        # Synthesis also uses the following hardware parameters for multi-GPU parallel synthesis.
        tacotron_gpu_start_idx=0,  # idx of the first GPU to be used for Tacotron training.
        tacotron_num_gpus=1,  # Determines the number of gpus in use for Tacotron training.
        split_on_cpu=True,
        # Determines whether to split data on CPU or on first GPU. This is automatically True when
        # more than 1 GPU is used.
        ###########################################################################################################################################

        # Audio
        # Audio parameters are the most important parameters to tune when using this work on your
        # personal data. Below are the beginner steps to adapt this work to your personal data:
        #   1- Determine your data sample rate: first you need to determine your audio sample_rate (how
        #      many samples are in a second of audio). This can be done using sox: "sox --i <filename>"
        #      (For this small tutorial, I will consider 24kHz (24000 Hz); defaults are 22050 Hz, so
        #      there are plenty of examples to refer to.)
        #   2- Set the sample_rate parameter to your data's correct sample rate.
        #   3- Fix win_size and hop_size accordingly (supposing you follow our advice: 50 ms window_size
        #      and 12.5 ms frame_shift (hop_size)):
        #      a- win_size = 0.05 * sample_rate. In the tutorial example, 0.05 * 24000 = 1200.
        #      b- hop_size = 0.25 * win_size, also equal to 0.0125 * sample_rate. In the tutorial
        #         example, 0.25 * 1200 = 0.0125 * 24000 = 300 (can set frame_shift_ms=12.5 instead).
        #   4- Fix the n_fft, num_freq and upsample_scales parameters accordingly:
        #      a- n_fft can be either equal to win_size or the first power of 2 that comes after
        #         win_size. I usually recommend the latter, to be more consistent with signal
        #         processing friends; no big difference to be seen however. For the tutorial example:
        #         n_fft = 2048 = 2**11.
        #      b- num_freq = (n_fft / 2) + 1. For the tutorial example: num_freq = 2048 / 2 + 1 = 1025.
        #      c- For WaveNet, the product of upsample_scales must equal hop_size. For the tutorial
        #         example: upsample_scales=[15, 20], where 15 * 20 = 300. It is also possible to use
        #         upsample_scales=[3, 4, 5, 5] instead. One must only keep in mind that
        #         upsample_kernel_size[0] = 2 * upsample_scales[0], so the training segments should be
        #         long enough (2.8~3x upsample_scales[0] * hop_size or longer) so that the first kernel
        #         size can see the middle of the samples efficiently. The length of WaveNet training
        #         segments is set by the parameter "max_time_steps".
        #   5- Finally comes the silence trimming. This is very much data dependent, so I suggest running
        #      preprocessing (or part of it, Ctrl-C to stop), then using the .ipynb provided in the repo
        #      to listen to some inverted mel/linear spectrograms. That will first give you some idea
        #      about your above parameters, and it will also give you an idea about trimming. If silences
        #      persist, try reducing trim_top_db slowly. If samples are trimmed mid-word, try increasing it.
        #   6- If audio quality is too metallic or fragmented (or if linear spectrogram plots show black
        #      silent regions on top), then restart from step 2. (A worked 16 kHz recap of steps 1-4
        #      follows below.)
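        # Worked numbers for the 16 kHz defaults set below (a recap of steps 1-4, not a new rule):
        #   win_size = 0.05 * 16000 = 800
        #   hop_size = 0.0125 * 16000 = 200  (= 0.25 * win_size, i.e. a 12.5 ms frame shift)
        #   n_fft    = 800 here (kept equal to win_size rather than the next power of 2, 1024)
        #   num_freq = n_fft // 2 + 1 = 401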
        inv_mel_basis=None,
        mel_basis=None,
        num_mels=80,  # Number of mel-spectrogram channels and local conditioning dimensionality of the network
        rescale=True,  # Whether to rescale audio prior to preprocessing
        rescaling_max=0.9,  # Rescaling value
        # Whether to clip silence in Audio (at beginning and end of audio only, not the middle)
        # train samples of lengths between 3sec and 14sec are more than enough to make a model capable
        # of good parallelization.
        clip_mels_length=True,
        # For cases of OOM (Not really recommended, only use if facing unsolvable OOM errors,
        # also consider clipping your samples to smaller chunks)
        max_mel_frames=900,
        # Only relevant when clip_mels_length = True, please only use after trying output_per_steps=3
        #  and still getting OOM errors.

        # Use LWS (https://github.com/Jonathan-LeRoux/lws) for STFT and phase reconstruction
        # It's preferred to set True to use with https://github.com/r9y9/wavenet_vocoder
        # Does not work if n_fft is not a multiple of hop_size!!
        use_lws=False,
        # Only used to set as True if using WaveNet, no difference in performance is observed in
        # either cases.
        silence_threshold=2,  # silence threshold used for sound trimming for wavenet preprocessing

        # Mel spectrogram
        n_fft=800,  # Extra window size is filled with 0 paddings to match this parameter
        hop_size=200,  # For 16000Hz, 200 = 12.5 ms (0.0125 * sample_rate)
        win_size=800,  # For 16000Hz, 800 = 50 ms (If None, win_size = n_fft) (0.05 * sample_rate)
        sample_rate=16000,  # 16000Hz (corresponding to librispeech) (sox --i <filename>)
        frame_shift_ms=None,  # Can replace hop_size parameter. (Recommended: 12.5)

        # M-AILABS (and other datasets) trim params (these parameters are usually correct for any
        # data, but definitely must be tuned for specific speakers)
        trim_fft_size=512,
        trim_hop_size=128,
        trim_top_db=23,

        # Mel and Linear spectrograms normalization/scaling and clipping
        signal_normalization=True,
        # Whether to normalize mel spectrograms to some predefined range (following below parameters)
        allow_clipping_in_normalization=True,  # Only relevant if mel_normalization = True
        symmetric_mels=True,
        # Whether to scale the data to be symmetric around 0. (Also multiplies the output range by 2,
        # faster and cleaner convergence)
        max_abs_value=4.,
        # max absolute value of data. If symmetric, data will be [-max, max] else [0, max] (Must not
        # be too big to avoid gradient explosion,
        # not too small for fast convergence)
        normalize_for_wavenet=True,
        # whether to rescale to [0, 1] for wavenet. (better audio quality)
        clip_for_wavenet=True,
        # whether to clip [-max, max] before training/synthesizing with wavenet (better audio quality)
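        # For intuition, symmetric normalization of a dB-scale spectrogram S is commonly implemented
        # in Tacotron-2-style code roughly as (a sketch, not necessarily this module's exact code):
        #   S_norm = clip((2 * max_abs_value) * ((S - min_level_db) / -min_level_db) - max_abs_value,
        #                 -max_abs_value, max_abs_value)
        # which maps [min_level_db, 0] dB (see the Limits section below) onto [-max_abs_value, max_abs_value].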

        # Contribution by @begeekmyfriend
        # Spectrogram Pre-Emphasis (Lfilter: Reduce spectrogram noise and helps model certitude
        # levels. Also allows for better G&L phase reconstruction)
        preemphasize=True,  # whether to apply filter
        preemphasis=0.97,  # filter coefficient.
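        # i.e. y[n] = x[n] - 0.97 * x[n-1], the standard first-order high-pass pre-emphasis filter
        # (typically applied as scipy.signal.lfilter([1, -preemphasis], [1], wav)); this restates the
        # usual formulation and is not a claim about this module's exact implementation.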

        # Limits
        min_level_db=-100,
        ref_level_db=20,
        fmin=55,
        # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To
        # test depending on dataset. Pitch info: male~[65, 260], female~[100, 525])
        fmax=7600,  # To be increased/reduced depending on data.

        # Griffin Lim
        power=1.5,
        # Only used in G&L inversion, usually values between 1.2 and 1.5 are a good choice.
        griffin_lim_iters=30,  # 60,
        # Number of G&L iterations; typically 30 is enough (60 was used originally to ensure convergence).
        ###########################################################################################################################################

        # Tacotron
        outputs_per_step=2,  # Was 1
        # number of frames to generate at each decoding step (increase to speed up computation and
        # allows for higher batch size, decreases G&L audio quality)
        stop_at_any=True,
        # Determines whether the decoder should stop when predicting <stop> to any frame or to all of
        # them (True works pretty well)
        # "one" is a base width defined elsewhere in the module; the inline "# 512"-style comments
        # imply one == 128 (so one * 4 == 512, one * 2 == 256, etc.).
        embedding_dim=one * 4,  # 512,  # dimension of embedding space (these are NOT the speaker embeddings)

        # Encoder parameters
        enc_conv_num_layers=3,  # number of encoder convolutional layers
        enc_conv_kernel_size=(5,),  # size of encoder convolution filters for each layer
        enc_conv_channels=one * 4,  # 512,  # number of encoder convolution filters for each layer
        encoder_lstm_units=one * 2,  # 256,  # number of lstm units for each direction (forward and backward)

        # Attention mechanism
        smoothing=False,  # Whether to smooth the attention normalization function
        attention_dim=one * 1,  # 128,  # dimension of attention space
        attention_filters=32,  # number of attention convolution filters
        attention_kernel=(31, ),  # kernel size of attention convolution
        cumulative_weights=True,
        # Whether to cumulate (sum) all previous attention weights or simply feed previous weights (
        # Recommended: True)

        # Decoder
        prenet_layers=[one * 2, one * 2],  # [256, 256],  # number of layers and number of units of prenet
        decoder_layers=2,  # number of decoder lstm layers
        decoder_lstm_units=one * 8,  # 1024,  # number of decoder lstm units on each layer
        max_iters=2000,
        # Max decoder steps during inference (Just for safety from infinite loop cases)

        # Residual postnet
        postnet_num_layers=5,  # number of postnet convolutional layers
        postnet_kernel_size=(5,),  # size of postnet convolution filters for each layer
        postnet_channels=one * 4,  # 512,  # number of postnet convolution filters for each layer

        # CBHG mel->linear postnet
        cbhg_kernels=8,
        # All kernel sizes from 1 to cbhg_kernels will be used in the convolution bank of CBHG to act
        #  as "K-grams"
        cbhg_conv_channels=one * 1,  # 128,  # Channels of the convolution bank
        cbhg_pool_size=2,  # pooling size of the CBHG
        cbhg_projection=one * 2,  # 256,
        # projection channels of the CBHG (1st projection, 2nd is automatically set to num_mels)
        cbhg_projection_kernel_size=3,  # kernel_size of the CBHG projections
        cbhg_highwaynet_layers=4,  # Number of HighwayNet layers
        cbhg_highway_units=one * 1,  # 128,  # Number of units used in HighwayNet fully connected layers
        cbhg_rnn_units=one * 1,  # 128,
        # Number of GRU units used in bidirectional RNN of CBHG block. CBHG output is 2x rnn_units in
        # shape

        # Loss params
        mask_encoder=True,
        # whether to mask encoder padding while computing attention. Set to True for better prosody
        # but slower convergence.
        mask_decoder=False,
        # Whether to use loss mask for padded sequences (if False, <stop_token> loss function will not
        #  be weighted, else recommended pos_weight = 20)
        cross_entropy_pos_weight=20,
        # Use class weights to reduce the stop token classes imbalance (by adding more penalty on
        # False Negatives (FN)) (1 = disabled)
        predict_linear=False,
        # Whether to add a post-processing network to the Tacotron to predict linear spectrograms (
        # True mode Not tested!!)
        ###########################################################################################################################################

        # Tacotron Training
        # Reproduction seeds
        tacotron_random_seed=5339,
        # Determines initial graph and operations (i.e: model) random state for reproducibility
        tacotron_data_random_state=1234,  # random state for train test split repeatability

        # performance parameters
        tacotron_swap_with_cpu=False,
        # Whether to use cpu as support to gpu for decoder computation (Not recommended: may cause
        # major slowdowns! Only use when critical!)

        # train/test split ratios, mini-batches sizes
        tacotron_batch_size=64,  # number of training samples at each training step (was 32)
        # Tacotron Batch synthesis supports ~16x the training batch size (no gradients during
        # testing).
        # Training Tacotron with unmasked paddings makes it aware of them, which makes synthesis times
        #  different from training. We thus recommend masking the encoder.
        tacotron_synthesis_batch_size=128,
        # DO NOT MAKE THIS BIGGER THAN 1 IF YOU DIDN'T TRAIN TACOTRON WITH "mask_encoder=True"!!
        tacotron_test_size=None,  # 0.05
        # % of data to keep as test data, if None, tacotron_test_batches must be not None. (5% is
        # enough to have a good idea about overfit)
        tacotron_test_batches=2,  # number of test batches.

        # Learning rate schedule
        tacotron_decay_learning_rate=True,
        # boolean, determines if the learning rate will follow an exponential decay
        tacotron_start_decay=10000,  # 50000,  # Step at which learning decay starts
        tacotron_decay_steps=10000,  # 50000,  # Determines the learning rate decay slope (UNDER TEST)
        tacotron_decay_rate=0.5,  # learning rate decay rate (UNDER TEST)
        tacotron_initial_learning_rate=1e-3,  # starting learning rate
        tacotron_final_learning_rate=1e-5,  # minimal learning rate

        # Optimization parameters
        tacotron_adam_beta1=0.9,  # AdamOptimizer beta1 parameter
        tacotron_adam_beta2=0.999,  # AdamOptimizer beta2 parameter
        tacotron_adam_epsilon=1e-6,  # AdamOptimizer Epsilon parameter

        # Regularization parameters
        tacotron_reg_weight=1e-7,  # regularization weight (for L2 regularization)
        tacotron_scale_regularization=False,
        # Whether to rescale regularization weight to adapt for outputs range (used when reg_weight is
        #  high and biasing the model)
        tacotron_zoneout_rate=0.1,  # zoneout rate for all LSTM cells in the network
        tacotron_dropout_rate=0.5,  # dropout rate for all convolutional layers + prenet
        tacotron_clip_gradients=True,  # whether to clip gradients

        # Evaluation parameters
        natural_eval=False,
        # Whether to use 100% natural eval (to evaluate Curriculum Learning performance) or with same
        #  teacher-forcing ratio as in training (just for overfit)

        # Decoder RNN learning can be done in one of two ways:
        #	Teacher Forcing: vanilla teacher forcing (usually with ratio = 1). mode="constant"
        #	Curriculum Learning Scheme: From Teacher-Forcing to sampling from previous outputs is
        # function of global step. (teacher forcing ratio decay) mode="scheduled"
        # The second approach is inspired by:
        # Bengio et al. 2015: Scheduled Sampling for Sequence Prediction with Recurrent Neural Networks.
        # Can be found under: https://arxiv.org/pdf/1506.03099.pdf
        tacotron_teacher_forcing_mode="constant",
        # Can be ("constant" or "scheduled"). "scheduled" mode applies a cosine teacher forcing ratio
        # decay. (Preference: scheduled)
        tacotron_teacher_forcing_ratio=1.,
        # Value from [0., 1.], 0.=0%, 1.=100%, determines the % of times we force next decoder
        # inputs, Only relevant if mode="constant"
        tacotron_teacher_forcing_init_ratio=1.,
        # initial teacher forcing ratio. Relevant if mode="scheduled"
        tacotron_teacher_forcing_final_ratio=0.,
        # final teacher forcing ratio. Relevant if mode="scheduled"
        tacotron_teacher_forcing_start_decay=10000,
        # starting point of teacher forcing ratio decay. Relevant if mode="scheduled"
        tacotron_teacher_forcing_decay_steps=280000,
        # Determines the teacher forcing ratio decay slope. Relevant if mode="scheduled"
        tacotron_teacher_forcing_decay_alpha=0.,
        # teacher forcing ratio decay rate. Relevant if mode="scheduled"
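        # For intuition, a standard cosine decay of the ratio over the settings above would be
        # (an illustrative sketch, not necessarily this repo's exact schedule):
        #   t     = clip(step - tacotron_teacher_forcing_start_decay, 0, decay_steps)
        #   ratio = final + (init - final) * 0.5 * (1 + cos(pi * t / decay_steps))
        # with init = 1., final = 0. and decay_steps = 280000 as set above.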
        ###########################################################################################################################################

        # Tacotron-2 integration parameters
        train_with_GTA=False,
        # Whether to use GTA mels to train WaveNet instead of ground truth mels.
        ###########################################################################################################################################

        # Eval sentences (if no eval text file was specified during synthesis, these sentences are
        # used for eval)
        sentences=["你好语音克隆模型。"],

        ### SV2TTS ###
        speaker_embedding_size=256,
        silence_min_duration_split=0.4,  # Duration in seconds of a silence for an utterance to be split
        utterance_min_duration=1.,  # Duration in seconds below which utterances are discarded
    ))
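
Downstream code reads these settings back as attributes, since Dict2Obj (imported from aukit.audio_io in Example 1) exposes dict keys as attributes. A small sketch that only restates values already set above; the derived numbers match the inline comments (12.5 ms hop, 401 frequency bins):

from aukit.audio_io import Dict2Obj

hp = Dict2Obj(dict(sample_rate=16000, n_fft=800, hop_size=200, win_size=800))
frame_shift_ms = hp.hop_size / hp.sample_rate * 1000  # 12.5 ms, as noted in the hop_size comment
num_freq = hp.n_fft // 2 + 1                          # 401 frequency bins (n_fft / 2 + 1)
print(frame_shift_ms, num_freq)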