Example #1
    # shuffled batches for training
    train_cargo = DataCargo(
        ljspeech_train,
        train_batch_fn,
        batch_size,
        sampler=RandomSampler(ljspeech_train))

    # only batch_size=1 is supported for validation
    valid_cargo = DataCargo(
        ljspeech_valid,
        valid_batch_fn,
        batch_size=1,
        sampler=SequentialSampler(ljspeech_valid))

    # conditioner (upsampling net)
    conditioner_config = config["conditioner"]
    upsampling_factors = conditioner_config["upsampling_factors"]
    upsample_net = UpsampleNet(upscale_factors=upsampling_factors)
    freeze(upsample_net)

    residual_channels = teacher_config["residual_channels"]
    loss_type = teacher_config["loss_type"]
    output_dim = teacher_config["output_dim"]
    log_scale_min = teacher_config["log_scale_min"]
    assert loss_type == "mog" and output_dim == 3, \
        "the teacher WaveNet must use a single-Gaussian output (mog, output_dim=3)"

    teacher = WaveNet(n_loop, n_layer, residual_channels, output_dim, n_mels,
                      filter_size, loss_type, log_scale_min)
    # freeze the teacher as well (its weights are restored from a checkpoint elsewhere)
    freeze(teacher)

    student_config = config["student"]
    n_loops = student_config["n_loops"]
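
For orientation, a minimal sketch of how the two cargo objects above are typically consumed. It assumes DataCargo is a plain Python iterable that yields collated batches (the exact field layout depends on train_batch_fn and valid_batch_fn), and max_epochs is a hypothetical name.

def run_epochs(train_cargo, valid_cargo, max_epochs):
    for epoch in range(max_epochs):
        # one pass over the shuffled training batches
        for batch in train_cargo:
            ...  # forward/backward pass on the batch
        # validation runs with batch_size=1, in dataset order
        for batch in valid_cargo:
            ...  # compute validation metrics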
Example #2
import numpy as np
import yaml
from paddle import fluid
import paddle.fluid.dygraph as dg

# NOTE: the parakeet import paths below are assumptions based on the
# Parakeet repository layout; verify them against your installed version.
from parakeet.models.wavenet import WaveNet, UpsampleNet
from parakeet.models.clarinet import STFT, Clarinet, ParallelWaveNet
from parakeet.utils import io
from parakeet.utils.layer_tools import freeze


def synthesis_with_clarinet(config_path, checkpoint, mel_spectrogram, place):
    with open(config_path, 'rt') as f:
        config = yaml.safe_load(f)

    data_config = config["data"]
    n_mels = data_config["n_mels"]

    teacher_config = config["teacher"]
    n_loop = teacher_config["n_loop"]
    n_layer = teacher_config["n_layer"]
    filter_size = teacher_config["filter_size"]

    with dg.guard(place):
        # conditioner (upsampling net)
        conditioner_config = config["conditioner"]
        upsampling_factors = conditioner_config["upsampling_factors"]
        upsample_net = UpsampleNet(upscale_factors=upsampling_factors)
        freeze(upsample_net)

        residual_channels = teacher_config["residual_channels"]
        loss_type = teacher_config["loss_type"]
        output_dim = teacher_config["output_dim"]
        log_scale_min = teacher_config["log_scale_min"]
        assert loss_type == "mog" and output_dim == 3, \
            "the teacher WaveNet must use a single-Gaussian output (mog, output_dim=3)"

        teacher = WaveNet(n_loop, n_layer, residual_channels, output_dim,
                          n_mels, filter_size, loss_type, log_scale_min)
        # freeze the teacher as well; weights are restored below via io.load_parameters
        freeze(teacher)

        student_config = config["student"]
        n_loops = student_config["n_loops"]
        n_layers = student_config["n_layers"]
        student_residual_channels = student_config["residual_channels"]
        student_filter_size = student_config["filter_size"]
        student_log_scale_min = student_config["log_scale_min"]
        student = ParallelWaveNet(n_loops, n_layers, student_residual_channels,
                                  n_mels, student_filter_size)

        stft_config = config["stft"]
        stft = STFT(n_fft=stft_config["n_fft"],
                    hop_length=stft_config["hop_length"],
                    win_length=stft_config["win_length"])

        lmd = config["loss"]["lmd"]
        model = Clarinet(upsample_net, teacher, student, stft,
                         student_log_scale_min, lmd)
        io.load_parameters(model=model, checkpoint_path=checkpoint)

        # switch to inference mode; writing output files is left to the caller
        model.eval()

        # Rescale mel_spectrogram to match the training-time normalization.
        min_level, ref_level = 1e-5, 20  # hard-coded normalization constants
        mel_spectrogram = 20 * np.log10(np.maximum(min_level, mel_spectrogram))
        mel_spectrogram = mel_spectrogram - ref_level
        mel_spectrogram = np.clip((mel_spectrogram + 100) / 100, 0, 1)

        mel_spectrogram = dg.to_variable(mel_spectrogram)
        mel_spectrogram = fluid.layers.transpose(mel_spectrogram, [0, 2, 1])

        wav_var = model.synthesis(mel_spectrogram)
        wav_np = wav_var.numpy()[0]

        return wav_np
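
A minimal sketch of calling the function above. The paths, the checkpoint name, the mel shape, the 22050 Hz LJSpeech sample rate, and the use of the soundfile package are assumptions for illustration, not part of the original script.

import numpy as np
import soundfile as sf
from paddle import fluid

place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
# shape (1, n_mels, T); the function transposes it to (1, T, n_mels) internally
mel = np.load("mel.npy")[np.newaxis, :, :].astype("float32")
wav = synthesis_with_clarinet("config.yaml", "checkpoints/step-500000", mel, place)
sf.write("synthesized.wav", wav, samplerate=22050)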
Example #3
import paddle.fluid.dygraph as dg
import paddle.fluid.initializer as I

# NOTE: assumed import paths per the Parakeet repository; verify locally.
from parakeet.models.deepvoice3 import (Encoder, Decoder, Converter,
                                        DeepVoice3, ConvSpec, WindowRange)
from parakeet.utils.layer_tools import freeze


def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
               padding_idx, embedding_std, max_positions, n_vocab,
               freeze_embedding, filter_size, encoder_channels, mel_dim,
               decoder_channels, r, trainable_positional_encodings,
               use_memory_mask, query_position_rate, key_position_rate,
               window_behind, window_ahead, key_projection, value_projection,
               downsample_factor, linear_dim, use_decoder_states,
               converter_channels, dropout):
    """just a simple function to create a deepvoice 3 model"""
    if n_speakers > 1:
        spe = dg.Embedding((n_speakers, speaker_dim),
                           param_attr=I.Normal(scale=speaker_embed_std))
    else:
        spe = None

    h = encoder_channels
    k = filter_size
    encoder_convolutions = (
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
    )
    enc = Encoder(n_vocab,
                  embed_dim,
                  n_speakers,
                  speaker_dim,
                  padding_idx=None,  # NOTE: padding_idx from the signature is not forwarded
                  embedding_weight_std=embedding_std,
                  convolutions=encoder_convolutions,
                  dropout=dropout)
    if freeze_embedding:
        freeze(enc.embed)

    h = decoder_channels
    prenet_convolutions = (ConvSpec(h, k, 1), ConvSpec(h, k, 3))
    attentive_convolutions = (
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
        ConvSpec(h, k, 1),
    )
    attention = [True, False, False, False, True]
    force_monotonic_attention = [True, False, False, False, True]
    dec = Decoder(n_speakers,
                  speaker_dim,
                  embed_dim,
                  mel_dim,
                  r=r,
                  max_positions=max_positions,
                  preattention=prenet_convolutions,
                  convolutions=attentive_convolutions,
                  attention=attention,
                  dropout=dropout,
                  use_memory_mask=use_memory_mask,
                  force_monotonic_attention=force_monotonic_attention,
                  query_position_rate=query_position_rate,
                  key_position_rate=key_position_rate,
                  window_range=WindowRange(window_behind, window_ahead),
                  key_projection=key_projection,
                  value_projection=value_projection)
    if not trainable_positional_encodings:
        freeze(dec.embed_keys_positions)
        freeze(dec.embed_query_positions)

    h = converter_channels
    postnet_convolutions = (
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(2 * h, k, 1),
        ConvSpec(2 * h, k, 3),
    )
    cvt = Converter(n_speakers,
                    speaker_dim,
                    dec.state_dim if use_decoder_states else mel_dim,
                    linear_dim,
                    time_upsampling=downsample_factor,
                    convolutions=postnet_convolutions,
                    dropout=dropout)
    dv3 = DeepVoice3(enc, dec, cvt, spe, use_decoder_states)
    return dv3
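
A quick property to check when adjusting the (1, 3, 9, 27) dilation cycles above is the receptive field: with stride-1 convolutions, each layer extends it by (kernel_size - 1) * dilation time steps. A small helper (kernel size 5 is an arbitrary illustration):

def receptive_field(kernel_size, dilations):
    # total receptive field of a stack of stride-1 dilated convolutions
    return 1 + sum((kernel_size - 1) * d for d in dilations)

encoder_dilations = [1, 3, 9, 27, 1, 3, 9, 27, 1, 3]
print(receptive_field(5, encoder_dilations))  # -> 337 time steps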
Example #4
import paddle.fluid.dygraph as dg
import paddle.fluid.initializer as I

# NOTE: assumed import paths per the Parakeet repository; verify locally.
from parakeet.g2p import en
from parakeet.models.deepvoice3 import (Encoder, Decoder, Converter,
                                        DeepVoice3, ConvSpec, WindowRange)
from parakeet.utils.layer_tools import freeze


def make_model(config):
    c = config["model"]
    # speaker embedding
    n_speakers = c["n_speakers"]
    speaker_dim = c["speaker_embed_dim"]
    if n_speakers > 1:
        speaker_embed = dg.Embedding(
            (n_speakers, speaker_dim),
            param_attr=I.Normal(scale=c["speaker_embedding_weight_std"]))
    else:
        speaker_embed = None

    # encoder
    h = c["encoder_channels"]
    k = c["kernel_size"]
    encoder_convolutions = (
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
    )
    encoder = Encoder(n_vocab=en.n_vocab,
                      embed_dim=c["text_embed_dim"],
                      n_speakers=n_speakers,
                      speaker_dim=speaker_dim,
                      embedding_weight_std=c["embedding_weight_std"],
                      convolutions=encoder_convolutions,
                      dropout=c["dropout"])
    if c["freeze_embedding"]:
        freeze(encoder.embed)

    # decoder
    h = c["decoder_channels"]
    k = c["kernel_size"]
    prenet_convolutions = (ConvSpec(h, k, 1), ConvSpec(h, k, 3))
    attentive_convolutions = (
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
        ConvSpec(h, k, 1),
    )
    attention = [True, False, False, False, True]
    force_monotonic_attention = [True, False, False, False, True]
    window = WindowRange(c["window_backward"], c["window_ahead"])
    decoder = Decoder(n_speakers,
                      speaker_dim,
                      embed_dim=c["text_embed_dim"],
                      mel_dim=config["transform"]["n_mels"],
                      r=c["outputs_per_step"],
                      max_positions=c["max_positions"],
                      preattention=prenet_convolutions,
                      convolutions=attentive_convolutions,
                      attention=attention,
                      dropout=c["dropout"],
                      use_memory_mask=c["use_memory_mask"],
                      force_monotonic_attention=force_monotonic_attention,
                      query_position_rate=c["query_position_rate"],
                      key_position_rate=c["key_position_rate"],
                      window_range=window,
                      key_projection=c["key_projection"],
                      value_projection=c["value_projection"])
    if not c["trainable_positional_encodings"]:
        freeze(decoder.embed_keys_positions)
        freeze(decoder.embed_query_positions)

    # converter (postnet)
    linear_dim = 1 + config["transform"]["n_fft"] // 2
    h = c["converter_channels"]
    k = c["kernel_size"]
    postnet_convolutions = (
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(2 * h, k, 1),
        ConvSpec(2 * h, k, 3),
    )
    use_decoder_states = c["use_decoder_state_for_postnet_input"]
    converter = Converter(n_speakers,
                          speaker_dim,
                          in_channels=decoder.state_dim if use_decoder_states
                          else config["transform"]["n_mels"],
                          linear_dim=linear_dim,
                          time_upsampling=c["downsample_factor"],
                          convolutions=postnet_convolutions,
                          dropout=c["dropout"])

    model = DeepVoice3(encoder,
                       decoder,
                       converter,
                       speaker_embed,
                       use_decoder_states=use_decoder_states)
    return model
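
A minimal usage sketch for this config-driven variant; "config.yaml" is a placeholder path, and constructing the model inside a dygraph guard mirrors how the surrounding examples set up the device context.

import yaml
import paddle.fluid.dygraph as dg

with open("config.yaml", "rt") as f:
    config = yaml.safe_load(f)

with dg.guard():  # defaults to the current place; pass one explicitly to override
    model = make_model(config)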