train_batch_fn, batch_size, sampler=RandomSampler(ljspeech_train)) # only batch=1 for validation is enabled valid_cargo = DataCargo( ljspeech_valid, valid_batch_fn, batch_size=1, sampler=SequentialSampler(ljspeech_valid)) # conditioner(upsampling net) conditioner_config = config["conditioner"] upsampling_factors = conditioner_config["upsampling_factors"] upsample_net = UpsampleNet(upscale_factors=upsampling_factors) freeze(upsample_net) residual_channels = teacher_config["residual_channels"] loss_type = teacher_config["loss_type"] output_dim = teacher_config["output_dim"] log_scale_min = teacher_config["log_scale_min"] assert loss_type == "mog" and output_dim == 3, \ "the teacher wavenet should be a wavenet with single gaussian output" teacher = WaveNet(n_loop, n_layer, residual_channels, output_dim, n_mels, filter_size, loss_type, log_scale_min) # load & freeze upsample_net & teacher freeze(teacher) student_config = config["student"] n_loops = student_config["n_loops"]
def synthesis_with_clarinet(config_path, checkpoint, mel_spectrogram, place):
    """Synthesize a waveform from a mel spectrogram with a ClariNet model.

    Builds the upsampling conditioner, the frozen WaveNet teacher and the
    ParallelWaveNet student from the training config, loads parameters from
    ``checkpoint``, rescales ``mel_spectrogram`` into the normalized dB
    range the model was trained on, and runs synthesis.

    Args:
        config_path (str): path of the yaml config the model was trained with.
        checkpoint (str): path of the checkpoint to load parameters from.
        mel_spectrogram (np.ndarray): raw-scale mel spectrogram; assumed
            layout (batch, time, n_mels) before the transpose below — TODO
            confirm against the caller.
        place: fluid place (CPUPlace/CUDAPlace) to run synthesis on.

    Returns:
        np.ndarray: the synthesized waveform of the first batch item.
    """
    with open(config_path, 'rt') as f:
        config = yaml.safe_load(f)

    data_config = config["data"]
    n_mels = data_config["n_mels"]

    teacher_config = config["teacher"]
    n_loop = teacher_config["n_loop"]
    n_layer = teacher_config["n_layer"]
    filter_size = teacher_config["filter_size"]

    # only batch=1 for validation is enabled
    with dg.guard(place):
        # conditioner(upsampling net)
        conditioner_config = config["conditioner"]
        upsampling_factors = conditioner_config["upsampling_factors"]
        upsample_net = UpsampleNet(upscale_factors=upsampling_factors)
        freeze(upsample_net)

        residual_channels = teacher_config["residual_channels"]
        loss_type = teacher_config["loss_type"]
        output_dim = teacher_config["output_dim"]
        log_scale_min = teacher_config["log_scale_min"]
        assert loss_type == "mog" and output_dim == 3, \
            "the teacher wavenet should be a wavenet with single gaussian output"
        teacher = WaveNet(n_loop, n_layer, residual_channels, output_dim,
                          n_mels, filter_size, loss_type, log_scale_min)
        # load & freeze upsample_net & teacher
        freeze(teacher)

        student_config = config["student"]
        n_loops = student_config["n_loops"]
        n_layers = student_config["n_layers"]
        student_residual_channels = student_config["residual_channels"]
        student_filter_size = student_config["filter_size"]
        student_log_scale_min = student_config["log_scale_min"]
        student = ParallelWaveNet(n_loops, n_layers,
                                  student_residual_channels, n_mels,
                                  student_filter_size)

        stft_config = config["stft"]
        stft = STFT(n_fft=stft_config["n_fft"],
                    hop_length=stft_config["hop_length"],
                    win_length=stft_config["win_length"])

        lmd = config["loss"]["lmd"]
        model = Clarinet(upsample_net, teacher, student, stft,
                         student_log_scale_min, lmd)
        io.load_parameters(model=model, checkpoint_path=checkpoint)

        # NOTE(review): `args` is a module-level global, not a parameter of
        # this function — consider passing the output directory explicitly.
        # exist_ok=True replaces the original exists()/makedirs() pair,
        # which raced if the directory appeared between the two calls.
        os.makedirs(args.output, exist_ok=True)
        model.eval()

        # Rescale mel_spectrogram: amplitude -> dB, shift by the reference
        # level, then squash into [0, 1] (assumes a 100 dB dynamic range).
        min_level, ref_level = 1e-5, 20  # hard code it
        mel_spectrogram = 20 * np.log10(np.maximum(min_level, mel_spectrogram))
        mel_spectrogram = mel_spectrogram - ref_level
        mel_spectrogram = np.clip((mel_spectrogram + 100) / 100, 0, 1)

        mel_spectrogram = dg.to_variable(mel_spectrogram)
        # model.synthesis expects (batch, n_mels, time) — TODO confirm
        mel_spectrogram = fluid.layers.transpose(mel_spectrogram, [0, 2, 1])

        wav_var = model.synthesis(mel_spectrogram)
        wav_np = wav_var.numpy()[0]
        return wav_np
def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
               padding_idx, embedding_std, max_positions, n_vocab,
               freeze_embedding, filter_size, encoder_channels, mel_dim,
               decoder_channels, r, trainable_positional_encodings,
               use_memory_mask, query_position_rate, key_position_rate,
               window_behind, window_ahead, key_projection, value_projection,
               downsample_factor, linear_dim, use_decoder_states,
               converter_channels, dropout):
    """Assemble a DeepVoice3 model from explicit hyperparameters."""
    # speaker embedding is only created for multi-speaker setups
    if n_speakers > 1:
        speaker_embed = dg.Embedding(
            (n_speakers, speaker_dim),
            param_attr=I.Normal(scale=speaker_embed_std))
    else:
        speaker_embed = None

    k = filter_size

    # encoder: dilation schedule 1/3/9/27 repeated, then a final 1/3 pair
    c = encoder_channels
    encoder_convolutions = tuple(
        ConvSpec(c, k, dilation)
        for dilation in (1, 3, 9, 27, 1, 3, 9, 27, 1, 3))
    # NOTE(review): the padding_idx argument is not forwarded — the call
    # hard-codes padding_idx=None; confirm whether this is intentional.
    encoder = Encoder(n_vocab, embed_dim, n_speakers, speaker_dim,
                      padding_idx=None,
                      embedding_weight_std=embedding_std,
                      convolutions=encoder_convolutions,
                      dropout=dropout)
    if freeze_embedding:
        freeze(encoder.embed)

    # decoder: short prenet, five attentive layers, attention only at the
    # first and last of them
    c = decoder_channels
    prenet_convolutions = tuple(
        ConvSpec(c, k, dilation) for dilation in (1, 3))
    attentive_convolutions = tuple(
        ConvSpec(c, k, dilation) for dilation in (1, 3, 9, 27, 1))
    attention = [True, False, False, False, True]
    force_monotonic_attention = [True, False, False, False, True]
    decoder = Decoder(n_speakers, speaker_dim, embed_dim, mel_dim,
                      r=r,
                      max_positions=max_positions,
                      preattention=prenet_convolutions,
                      convolutions=attentive_convolutions,
                      attention=attention,
                      dropout=dropout,
                      use_memory_mask=use_memory_mask,
                      force_monotonic_attention=force_monotonic_attention,
                      query_position_rate=query_position_rate,
                      key_position_rate=key_position_rate,
                      window_range=WindowRange(window_behind, window_ahead),
                      key_projection=key_projection,
                      value_projection=value_projection)
    if not trainable_positional_encodings:
        freeze(decoder.embed_keys_positions)
        freeze(decoder.embed_query_positions)

    # converter (postnet): channel count doubles halfway through
    c = converter_channels
    postnet_convolutions = (
        ConvSpec(c, k, 1),
        ConvSpec(c, k, 3),
        ConvSpec(2 * c, k, 1),
        ConvSpec(2 * c, k, 3),
    )
    converter = Converter(n_speakers, speaker_dim,
                          decoder.state_dim if use_decoder_states else mel_dim,
                          linear_dim,
                          time_upsampling=downsample_factor,
                          convolutions=postnet_convolutions,
                          dropout=dropout)

    return DeepVoice3(encoder, decoder, converter, speaker_embed,
                      use_decoder_states)
def make_model(config):
    """Assemble a DeepVoice3 model from the ``model`` / ``transform``
    sections of a parsed configuration dict."""
    m = config["model"]
    n_mels = config["transform"]["n_mels"]

    # speaker embedding (multi-speaker setups only)
    n_speakers = m["n_speakers"]
    speaker_dim = m["speaker_embed_dim"]
    if n_speakers > 1:
        speaker_embed = dg.Embedding(
            (n_speakers, speaker_dim),
            param_attr=I.Normal(scale=m["speaker_embedding_weight_std"]))
    else:
        speaker_embed = None

    k = m["kernel_size"]

    # encoder: dilation schedule 1/3/9/27 repeated, then a final 1/3 pair
    h = m["encoder_channels"]
    encoder_convolutions = tuple(
        ConvSpec(h, k, dilation)
        for dilation in (1, 3, 9, 27, 1, 3, 9, 27, 1, 3))
    encoder = Encoder(n_vocab=en.n_vocab,
                      embed_dim=m["text_embed_dim"],
                      n_speakers=n_speakers,
                      speaker_dim=speaker_dim,
                      embedding_weight_std=m["embedding_weight_std"],
                      convolutions=encoder_convolutions,
                      dropout=m["dropout"])
    if m["freeze_embedding"]:
        freeze(encoder.embed)

    # decoder: short prenet, five attentive layers, attention only at the
    # first and last of them
    h = m["decoder_channels"]
    prenet_convolutions = tuple(
        ConvSpec(h, k, dilation) for dilation in (1, 3))
    attentive_convolutions = tuple(
        ConvSpec(h, k, dilation) for dilation in (1, 3, 9, 27, 1))
    attention = [True, False, False, False, True]
    force_monotonic_attention = [True, False, False, False, True]
    window = WindowRange(m["window_backward"], m["window_ahead"])
    decoder = Decoder(n_speakers, speaker_dim,
                      embed_dim=m["text_embed_dim"],
                      mel_dim=n_mels,
                      r=m["outputs_per_step"],
                      max_positions=m["max_positions"],
                      preattention=prenet_convolutions,
                      convolutions=attentive_convolutions,
                      attention=attention,
                      dropout=m["dropout"],
                      use_memory_mask=m["use_memory_mask"],
                      force_monotonic_attention=force_monotonic_attention,
                      query_position_rate=m["query_position_rate"],
                      key_position_rate=m["key_position_rate"],
                      window_range=window,
                      key_projection=m["key_projection"],
                      value_projection=m["value_projection"])
    if not m["trainable_positional_encodings"]:
        freeze(decoder.embed_keys_positions)
        freeze(decoder.embed_query_positions)

    # converter (postnet); linear_dim is the number of linear-spectrogram
    # bins for an n_fft-point STFT
    linear_dim = 1 + config["transform"]["n_fft"] // 2
    h = m["converter_channels"]
    postnet_convolutions = (
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(2 * h, k, 1),
        ConvSpec(2 * h, k, 3),
    )
    use_decoder_states = m["use_decoder_state_for_postnet_input"]
    converter = Converter(
        n_speakers, speaker_dim,
        in_channels=decoder.state_dim if use_decoder_states else n_mels,
        linear_dim=linear_dim,
        time_upsampling=m["downsample_factor"],
        convolutions=postnet_convolutions,
        dropout=m["dropout"])

    return DeepVoice3(encoder, decoder, converter, speaker_embed,
                      use_decoder_states=use_decoder_states)