Beispiel #1
0
# Hyperparameter set covering model dimensions, segment/sequence length
# limits and data-loader options. The mapping is unpacked into keyword
# arguments, so HParams receives exactly the same names, values and order
# as an explicit keyword call would give it.
hparams = HParams(**{
    # model
    'freq': 8,
    'dim_neck': 8,
    'freq_2': 8,
    'dim_neck_2': 1,
    'freq_3': 8,
    'dim_neck_3': 32,
    'dim_enc': 512,
    'dim_enc_2': 128,
    'dim_enc_3': 256,
    'dim_freq': 80,
    'dim_spk_emb': 512,  # 82 is the alternative noted in the original comment
    'dim_f0': 257,
    'dim_dec': 512,
    'len_raw': 128,
    'chs_grp': 16,

    # interp
    'min_len_seg': 19,
    'max_len_seg': 32,
    'min_len_seq': 64,
    'max_len_seq': 128,
    'max_len_pad': 192,

    # data loader
    'root_dir': 'assets/spmel',
    'feat_dir': 'assets/raptf0',
    'batch_size': 16,
    'mode': 'train',
    'shuffle': True,
    'num_workers': 0,
    # NOTE(review): the spelling "samplier" is the key name itself; renaming
    # it here would presumably break readers of this field -- confirm first.
    'samplier': 8,
})
Beispiel #2
0
# Combined hyperparameter set: the first group carries the settings listed
# under name='wavenet_vocoder' (the original has no section comments there),
# followed by the model, interp and data-loader groups. The mapping is
# unpacked into keyword arguments in the order written.
hparams = HParams(**{
    # settings listed under name='wavenet_vocoder'
    'name': 'wavenet_vocoder',
    'builder': 'wavenet',
    'input_type': 'raw',
    'quantize_channels': 65536,
    'sample_rate': 16000,
    'silence_threshold': 2,
    'num_mels': 80,
    'fmin': 125,
    'fmax': 7600,
    'fft_size': 1024,
    'hop_size': 256,
    'frame_shift_ms': None,
    'min_level_db': -100,
    'ref_level_db': 20,
    'rescaling': True,
    'rescaling_max': 0.999,
    'allow_clipping_in_normalization': True,
    'log_scale_min': -32.23619130191664,
    'out_channels': 30,
    'layers': 24,
    'stacks': 4,
    'residual_channels': 512,
    'gate_channels': 512,
    'skip_out_channels': 256,
    # Kept bit-exact: this literal is what `1 - 0.95` evaluates to in
    # double precision, so it is deliberately not rounded to 0.05.
    'dropout': 0.050000000000000044,
    'kernel_size': 3,
    'weight_normalization': True,
    'legacy': True,
    'cin_channels': 80,
    'upsample_conditional_features': True,
    'upsample_scales': [4, 4, 4, 4],
    'freq_axis_kernel_size': 3,
    'gin_channels': -1,
    'n_speakers': -1,
    'pin_memory': True,
    'test_size': 0.0441,
    'test_num_samples': None,
    'random_state': 1234,
    'adam_beta1': 0.9,
    'adam_beta2': 0.999,
    'adam_eps': 1e-08,
    'amsgrad': False,
    'initial_learning_rate': 0.001,
    'lr_schedule': 'noam_learning_rate_decay',
    'lr_schedule_kwargs': {},
    'nepochs': 2000,
    'weight_decay': 0.0,
    'clip_thresh': -1,
    'max_time_sec': None,
    'max_time_steps': 8000,
    'exponential_moving_average': True,
    'ema_decay': 0.9999,
    'checkpoint_interval': 10000,
    'train_eval_interval': 10000,
    'test_eval_epoch_interval': 5,
    'save_optimizer_state': True,

    # model
    'freq': 8,
    'dim_neck': 8,
    'freq_2': 8,
    'dim_neck_2': 1,
    'freq_3': 8,
    'dim_neck_3': 32,
    'dim_enc': 512,
    'dim_enc_2': 128,
    'dim_enc_3': 256,
    'dim_freq': 80,
    'dim_spk_emb': 82,
    'dim_f0': 257,
    'dim_dec': 512,
    'len_raw': 128,
    'chs_grp': 16,

    # interp
    'min_len_seg': 19,
    'max_len_seg': 32,
    'min_len_seq': 64,
    'max_len_seq': 128,
    'max_len_pad': 192,

    # data loader
    'root_dir': 'assets/spmel',
    'feat_dir': 'assets/raptf0',
    'batch_size': 16,
    'mode': 'train',
    'shuffle': True,
    'num_workers': 0,
    'samplier': 8,  # key spelling kept as-is; it is part of the interface
})
Beispiel #3
0
# Hyperparameter set with loss weights, the three encoder configurations,
# decoder/attention/postnet settings, segment resampling limits and
# data-loader options. The mapping is unpacked into keyword arguments in
# the order written.
hparams = HParams(**{
    'loss_reconstruction_w': 9,
    'loss_disentanglement_w': 1,
    # threshold=4e-4 is left disabled (it was commented out).

    # model
    'freq': 8,  # downsampling of the content codes
    'dim_neck': 8,  # (blstm dim) content_encoder
    'freq_2': 8,  # downsampling of the rhythm codes; rhythm_encoder (encoder_2, encoder_t)
    'dim_neck_2': 1,  # (blstm dim) rhythm_encoder (encoder_2, encoder_t)
    'freq_3': 8,  # downsampling of the pitch codes
    'dim_neck_3': 32,  # (blstm dim) pitch_encoder
    'dim_enc': 512,  # (conv dim) content
    'dim_enc_2': 128,  # (conv dim) rhythm (encoder_2, encoder_t)
    'dim_enc_3': 256,  # (conv dim) pitch

    # Decoder parameters
    'n_frames_per_step': 1,  # currently only 1 is supported
    'decoder_rnn_dim': 1024,
    'prenet_dim': 256,
    'max_decoder_steps': 1000,
    'gate_threshold': 0.5,
    'p_attention_dropout': 0.1,
    'p_decoder_dropout': 0.1,

    # Attention parameters
    'attention_rnn_dim': 1024,
    'attention_dim': 128,

    # Location Layer parameters
    'attention_location_n_filters': 32,
    'attention_location_kernel_size': 31,

    # Mel-post processing network parameters
    'postnet_embedding_dim': 512,
    'postnet_kernel_size': 5,
    'postnet_n_convolutions': 5,
    'n_mel_channels': 80,

    # x (mel): 80-dimensional
    'dim_freq': 80,
    'dim_spk_emb': 107,  # 82 at first
    'embedding_spk': 64,
    # f0: 257-dimensional
    'dim_f0': 257,
    'dim_dec': 512,
    'dim_ortho': 1024,
    'layer_norm_eps': 1e-12,
    'len_raw': 128,
    'chs_grp': 16,

    # interp
    # For random resampling the input is first split into segments;
    # each segment is 19 to 32 frames long.
    'min_len_seg': 19,
    'max_len_seg': 32,
    'min_len_seq': 32,  # 64 is the commented-out alternative
    'max_len_seq': 48,  # 128 is the commented-out alternative
    'max_len_pad': 408,  # 192 at first

    # data loader
    'root_dir': 'assets/spmel',
    'feat_dir': 'assets/raptf0',
    'batch_size': 16,
    'mode': 'train',
    'shuffle': True,
    'num_workers': 0,
    'samplier': 8,  # key spelling kept as-is; it is part of the interface

    # MBV
    'enc_mbv_size': 7,
})
Beispiel #4
0
# Hyperparameter set mixing the encoder/decoder dimensions with the
# settings consumed by the "wavenet" builder, plus segment limits and
# data-loader options. The mapping is unpacked into keyword arguments in
# the order written.
hparams = HParams(**{
    # model
    'freq': 8,
    'dim_neck': 8,
    'freq_2': 8,
    'dim_neck_2': 1,
    'freq_3': 8,
    'dim_neck_3': 32,
    'out_channels': 30,  # written as 10 * 3 originally
    'layers': 24,
    'stacks': 4,
    'residual_channels': 512,
    'gate_channels': 512,  # split into 2 groups internally for gated activation
    'skip_out_channels': 256,
    'cin_channels': 80,
    'gin_channels': -1,  # i.e., speaker embedding dim
    'weight_normalization': True,
    'n_speakers': -1,
    # Left as an expression on purpose: in double precision it evaluates to
    # 0.050000000000000044, which is not the same float as the literal 0.05.
    'dropout': 1 - 0.95,
    'kernel_size': 3,
    'upsample_conditional_features': True,
    'upsample_scales': [4, 4, 4, 4],
    'freq_axis_kernel_size': 3,
    'legacy': True,
    'dim_enc': 512,
    'dim_enc_2': 128,
    'dim_enc_3': 256,
    'dim_freq': 80,
    'dim_spk_emb': 82,
    'dim_f0': 257,
    'dim_dec': 512,
    'len_raw': 128,
    'chs_grp': 16,

    # interp
    'min_len_seg': 19,
    'max_len_seg': 32,
    'min_len_seq': 64,
    'max_len_seq': 128,
    'max_len_pad': 192,

    # data loader
    'root_dir': 'assets/spmel',
    'feat_dir': 'assets/raptf0',
    'batch_size': 16,
    'mode': 'train',
    'shuffle': True,
    'num_workers': 0,
    'samplier': 8,  # key spelling kept as-is; it is part of the interface

    # Convenient model builder
    'builder': 'wavenet',
    'hop_size': 256,
    'log_scale_min': -32.23619130191664,  # already a float; no float() wrapper needed
})
Beispiel #5
0
# Hyperparameter set covering model dimensions, segment/sequence length
# limits and data-loader options. The mapping is unpacked into keyword
# arguments in the order written.
hparams = HParams(**{
    # model
    'freq': 8,
    'dim_neck': 8,
    'freq_2': 8,
    'dim_neck_2': 1,
    'freq_3': 8,
    'dim_neck_3': 32,
    'dim_enc': 512,  # content encoder
    'dim_enc_2': 128,  # rhythm encoder
    'dim_enc_3': 256,  # pitch encoder
    'dim_freq': 80,
    'dim_spk_emb': 20,
    'dim_f0': 257,
    'dim_dec': 512,
    'len_raw': 128,
    'chs_grp': 16,

    # interp
    'min_len_seg': 19,
    'max_len_seg': 32,
    'min_len_seq': 64,
    'max_len_seq': 128,
    'max_len_pad': 192,

    # data loader
    # NOTE(review): both directories are absolute paths under /hd0, so this
    # configuration presumably only works on the machine it was written
    # for -- confirm before reusing it elsewhere.
    'root_dir': '/hd0/speechsplit/preprocessed/spmel',
    'feat_dir': '/hd0/speechsplit/preprocessed/raptf0',
    'batch_size': 16,
    'mode': 'train',
    'shuffle': True,
    'num_workers': 0,
    'samplier': 8,  # key spelling kept as-is; it is part of the interface
})
Beispiel #6
0
# Hyperparameter set named "olr": analysis-window sizes expressed in
# samples, followed by training/evaluation options and the list of
# language labels. The mapping is unpacked into keyword arguments in the
# order written.
hparams = HParams(**{
    'name': 'olr',
    'sample_rate': 16000,
    # num_mels=80 is left disabled (it was commented out).
    'n_fft': 640,  # int(0.04 * 16000); 2048 is the commented-out alternative
    'hop_length': 320,  # int(0.02 * 16000)
    'win_length': 640,  # int(0.04 * 16000)
    'deltas': False,

    # training testing evaluating
    'model_type': 'Cnn_9layers_AvgPooling',
    'use_cuda': True,
    'max_epoch': 100,
    'batch_size': 32,
    'lang': [
        'Kazak', 'Tibet', 'Uyghu', 'ct_cn', 'id_id', 'ja_jp', 'ko_kr', 'ru_ru',
        'vi_vn', 'zh_cn',
    ],
    # A longer, fully commented-out label list used to follow here. Besides
    # the ten labels above it also named: TE_IN, ca_es, el_gr, shanghai,
    # sichuan, minnan.
})