# Hyperparameter set passed to HParams as keyword arguments.
# Written one argument per line: a `#` comment runs to the end of the
# physical line, so on a single line it would comment out every argument
# after it together with the closing parenthesis.
hparams = HParams(
    # model
    freq=8,
    dim_neck=8,
    freq_2=8,
    dim_neck_2=1,
    freq_3=8,
    dim_neck_3=32,
    dim_enc=512,
    dim_enc_2=128,
    dim_enc_3=256,
    dim_freq=80,
    dim_spk_emb=512,  # the original comment notes the value 82 here
    dim_f0=257,
    dim_dec=512,
    len_raw=128,
    chs_grp=16,
    # interp
    min_len_seg=19,
    max_len_seg=32,
    min_len_seq=64,
    max_len_seq=128,
    max_len_pad=192,
    # data loader
    root_dir='assets/spmel',
    feat_dir='assets/raptf0',
    batch_size=16,
    mode='train',
    shuffle=True,
    num_workers=0,
    samplier=8,
)
# Hyperparameter set passed to HParams as keyword arguments.
# Written one argument per line: a `#` comment runs to the end of the
# physical line, so on a single line the `# model` marker would comment
# out every argument after it together with the closing parenthesis.
hparams = HParams(
    name='wavenet_vocoder',
    builder='wavenet',
    input_type='raw',
    quantize_channels=65536,
    sample_rate=16000,
    silence_threshold=2,
    num_mels=80,
    fmin=125,
    fmax=7600,
    fft_size=1024,
    hop_size=256,
    frame_shift_ms=None,
    min_level_db=-100,
    ref_level_db=20,
    rescaling=True,
    rescaling_max=0.999,
    allow_clipping_in_normalization=True,
    log_scale_min=-32.23619130191664,
    out_channels=30,
    layers=24,
    stacks=4,
    residual_channels=512,
    gate_channels=512,
    skip_out_channels=256,
    dropout=0.050000000000000044,
    kernel_size=3,
    weight_normalization=True,
    legacy=True,
    cin_channels=80,
    upsample_conditional_features=True,
    upsample_scales=[4, 4, 4, 4],
    freq_axis_kernel_size=3,
    gin_channels=-1,
    n_speakers=-1,
    pin_memory=True,
    test_size=0.0441,
    test_num_samples=None,
    random_state=1234,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_eps=1e-08,
    amsgrad=False,
    initial_learning_rate=0.001,
    lr_schedule='noam_learning_rate_decay',
    lr_schedule_kwargs={},
    nepochs=2000,
    weight_decay=0.0,
    clip_thresh=-1,
    max_time_sec=None,
    max_time_steps=8000,
    exponential_moving_average=True,
    ema_decay=0.9999,
    checkpoint_interval=10000,
    train_eval_interval=10000,
    test_eval_epoch_interval=5,
    save_optimizer_state=True,
    # model
    freq=8,
    dim_neck=8,
    freq_2=8,
    dim_neck_2=1,
    freq_3=8,
    dim_neck_3=32,
    dim_enc=512,
    dim_enc_2=128,
    dim_enc_3=256,
    dim_freq=80,
    dim_spk_emb=82,
    dim_f0=257,
    dim_dec=512,
    len_raw=128,
    chs_grp=16,
    # interp
    min_len_seg=19,
    max_len_seg=32,
    min_len_seq=64,
    max_len_seq=128,
    max_len_pad=192,
    # data loader
    root_dir='assets/spmel',
    feat_dir='assets/raptf0',
    batch_size=16,
    mode='train',
    shuffle=True,
    num_workers=0,
    samplier=8,
)
# Hyperparameter set passed to HParams as keyword arguments.
# Written one argument per line: a `#` comment runs to the end of the
# physical line, so on a single line the first comment would swallow
# every argument after it together with the closing parenthesis.
# Comments originally written in Chinese are translated to English.
hparams = HParams(
    loss_reconstruction_w=9,
    loss_disentanglement_w=1,
    # NOTE: `threshold=4e-4` is commented out in the original and is not passed.
    # model
    freq=8,  # content codes downsampling
    dim_neck=8,  # (blstm dim) content_encoder
    freq_2=8,  # rhythm codes downsampling, rhythm_encoder (encoder_2, encoder_t)
    dim_neck_2=1,  # (blstm dim) rhythm_encoder (encoder_2, encoder_t)
    freq_3=8,  # pitch codes downsampling
    dim_neck_3=32,  # (blstm dim) pitch_encoder
    dim_enc=512,  # (conv dim) content
    dim_enc_2=128,  # (conv dim) rhythm (encoder_2, encoder_t)
    dim_enc_3=256,  # (conv dim) pitch
    # Decoder parameters
    n_frames_per_step=1,  # currently only 1 is supported
    decoder_rnn_dim=1024,
    prenet_dim=256,
    max_decoder_steps=1000,
    gate_threshold=0.5,
    p_attention_dropout=0.1,
    p_decoder_dropout=0.1,
    # Attention parameters
    attention_rnn_dim=1024,
    attention_dim=128,
    # Location Layer parameters
    attention_location_n_filters=32,
    attention_location_kernel_size=31,
    # Mel-post processing network parameters
    postnet_embedding_dim=512,
    postnet_kernel_size=5,
    postnet_n_convolutions=5,
    n_mel_channels=80,
    # x (mel): 80 dimensions
    dim_freq=80,
    dim_spk_emb=107,  # 82 at first
    embedding_spk=64,
    # f0: 257 dimensions
    dim_f0=257,
    dim_dec=512,
    dim_ortho=1024,
    layer_norm_eps=1e-12,
    len_raw=128,
    chs_grp=16,
    # interp
    # For random resampling, the input is first split into segments.
    # Length of each segment: 19 to 32 frames.
    min_len_seg=19,
    max_len_seg=32,
    min_len_seq=32,  # original comment records a previous value of 64
    max_len_seq=48,  # original comment records a previous value of 128
    max_len_pad=408,  # 192 at first
    # data loader
    root_dir='assets/spmel',
    feat_dir='assets/raptf0',
    batch_size=16,
    mode='train',
    shuffle=True,
    num_workers=0,
    samplier=8,
    # MBV
    enc_mbv_size=7,
)
# Hyperparameter set passed to HParams as keyword arguments.
# Written one argument per line: a `#` comment runs to the end of the
# physical line, so on a single line the leading `# model` marker would
# comment out the entire argument list and the closing parenthesis.
hparams = HParams(
    # model
    freq=8,
    dim_neck=8,
    freq_2=8,
    dim_neck_2=1,
    freq_3=8,
    dim_neck_3=32,
    out_channels=10 * 3,
    layers=24,
    stacks=4,
    residual_channels=512,
    gate_channels=512,  # split into 2 groups internally for gated activation
    skip_out_channels=256,
    cin_channels=80,
    gin_channels=-1,  # i.e., speaker embedding dim
    weight_normalization=True,
    n_speakers=-1,
    # Kept as an expression on purpose: it evaluates to
    # 0.050000000000000044, not the literal 0.05.
    dropout=1 - 0.95,
    kernel_size=3,
    upsample_conditional_features=True,
    upsample_scales=[4, 4, 4, 4],
    freq_axis_kernel_size=3,
    legacy=True,
    dim_enc=512,
    dim_enc_2=128,
    dim_enc_3=256,
    dim_freq=80,
    dim_spk_emb=82,
    dim_f0=257,
    dim_dec=512,
    len_raw=128,
    chs_grp=16,
    # interp
    min_len_seg=19,
    max_len_seg=32,
    min_len_seq=64,
    max_len_seq=128,
    max_len_pad=192,
    # data loader
    root_dir='assets/spmel',
    feat_dir='assets/raptf0',
    batch_size=16,
    mode='train',
    shuffle=True,
    num_workers=0,
    samplier=8,
    # Convenient model builder
    builder="wavenet",
    hop_size=256,
    log_scale_min=float(-32.23619130191664),
)
# Hyperparameter set passed to HParams as keyword arguments.
# Written one argument per line: a `#` comment runs to the end of the
# physical line, so on a single line the leading `# model` marker would
# comment out the entire argument list and the closing parenthesis.
hparams = HParams(
    # model
    freq=8,
    dim_neck=8,
    freq_2=8,
    dim_neck_2=1,
    freq_3=8,
    dim_neck_3=32,
    dim_enc=512,  # content encoder
    dim_enc_2=128,  # rhythm encoder
    dim_enc_3=256,  # pitch encoder
    dim_freq=80,
    dim_spk_emb=20,
    dim_f0=257,
    dim_dec=512,
    len_raw=128,
    chs_grp=16,
    # interp
    min_len_seg=19,
    max_len_seg=32,
    min_len_seq=64,
    max_len_seq=128,
    max_len_pad=192,
    # data loader
    root_dir='/hd0/speechsplit/preprocessed/spmel',
    feat_dir='/hd0/speechsplit/preprocessed/raptf0',
    batch_size=16,
    mode='train',
    shuffle=True,
    num_workers=0,
    samplier=8,
)
# Hyperparameter set passed to HParams as keyword arguments.
# Written one argument per line: a `#` comment runs to the end of the
# physical line, so on a single line the first comment would swallow
# every argument after it together with the closing parenthesis.
hparams = HParams(
    name="olr",
    sample_rate=16000,
    # NOTE: `num_mels=80` is commented out in the original and is not passed.
    n_fft=int(0.04 * 16000),  # original comment records an alternative: n_fft=2048
    hop_length=int(0.02 * 16000),
    win_length=int(0.04 * 16000),
    deltas=False,
    # training testing evaluating
    model_type='Cnn_9layers_AvgPooling',
    use_cuda=True,
    max_epoch=100,
    batch_size=32,
    lang=[
        "Kazak",
        "Tibet",
        "Uyghu",
        "ct_cn",
        "id_id",
        "ja_jp",
        "ko_kr",
        "ru_ru",
        "vi_vn",
        "zh_cn",
    ],
    # Entries that appear only as comments in the original (a disabled
    # alternative list; none of these lines are passed to HParams):
    #   'Kazak', 'TE_IN', 'Tibet', 'Uyghu', 'ca_es', 'ct_cn', 'el_gr',
    #   'id_id', 'ja_jp', 'ko_kr', 'ru_ru', 'shanghai', 'sichuan',
    #   'minnan', 'vi_vn', 'zh_cn'
)