def test_duration_predictor():
    """Check the output shape of ``DurationPredictor`` on a random masked batch.

    A random ``[8, 128, 27]`` input and a mask built from random lengths are
    fed through the layer; the result is expected to have shape ``[8, 1, 27]``.
    """
    batch_size, channels, max_len = 8, 128, 27

    features = torch.rand(batch_size, channels, max_len).to(device)
    lengths = torch.randint(20, max_len, (batch_size, )).long().to(device)
    # Pin the last length to the size of the time axis of ``features``.
    lengths[-1] = max_len

    mask = sequence_mask(lengths, features.size(2))
    mask = torch.unsqueeze(mask, 1).to(device)

    predictor = DurationPredictor(hidden_channels=channels).to(device)
    prediction = predictor(features, mask)

    assert list(prediction.shape) == [batch_size, 1, max_len]
Example #2
0
    def __init__(
        self,
        num_chars,
        out_channels,
        hidden_channels=256,
        hidden_channels_dp=256,
        encoder_type="fftransformer",
        encoder_params=None,
        decoder_type="fftransformer",
        decoder_params=None,
        length_scale=1,
        num_speakers=0,
        external_c=False,
        c_in_channels=0,
    ):
        """Build the embedding, encoder, decoder, duration and MDN sub-modules.

        Args:
            num_chars: Number of rows of the input embedding table.
            out_channels: First argument of ``Decoder``; ``MDNBlock`` is built
                with ``2 * out_channels`` as its second argument.
            hidden_channels: Embedding width, also passed to the positional
                encoder, encoder, decoder, ``mod_layer`` and ``MDNBlock``.
            hidden_channels_dp: Argument passed to ``DurationPredictor``.
            encoder_type: Encoder type identifier forwarded to ``Encoder``.
            encoder_params: Parameter dict forwarded to ``Encoder``. When
                ``None``, a fresh dict with ``hidden_channels_ffn=1024``,
                ``num_heads=2``, ``num_layers=6``, ``dropout_p=0.1`` is used.
            decoder_type: Decoder type identifier forwarded to ``Decoder``.
            decoder_params: Parameter dict forwarded to ``Decoder``. When
                ``None``, the same default values as ``encoder_params`` are
                used (in a separate dict).
            length_scale: Stored on ``self.length_scale``; an ``int`` value is
                converted to ``float`` first.
            num_speakers: When greater than 1 (and ``external_c`` is false) a
                speaker embedding table ``emb_g`` is created.
            external_c: When true, no internal speaker embedding is created.
            c_in_channels: Width of the speaker embedding and last argument of
                ``Encoder``. When positive and different from
                ``hidden_channels``, a 1x1 convolution ``proj_g`` mapping
                ``c_in_channels`` to ``hidden_channels`` is created.
        """
        super().__init__()

        # Default parameter dicts are created per call instead of living in the
        # signature, so instances never share (and cannot accidentally mutate)
        # one module-level default object.
        if encoder_params is None:
            encoder_params = {
                "hidden_channels_ffn": 1024,
                "num_heads": 2,
                "num_layers": 6,
                "dropout_p": 0.1,
            }
        if decoder_params is None:
            decoder_params = {
                "hidden_channels_ffn": 1024,
                "num_heads": 2,
                "num_layers": 6,
                "dropout_p": 0.1,
            }

        self.length_scale = float(length_scale) if isinstance(
            length_scale, int) else length_scale
        self.emb = nn.Embedding(num_chars, hidden_channels)
        self.pos_encoder = PositionalEncoding(hidden_channels)
        self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type,
                               encoder_params, c_in_channels)
        self.decoder = Decoder(out_channels, hidden_channels, decoder_type,
                               decoder_params)
        self.duration_predictor = DurationPredictor(hidden_channels_dp)

        self.mod_layer = nn.Conv1d(hidden_channels, hidden_channels, 1)
        self.mdn_block = MDNBlock(hidden_channels, 2 * out_channels)

        if num_speakers > 1 and not external_c:
            # speaker embedding layer
            self.emb_g = nn.Embedding(num_speakers, c_in_channels)
            nn.init.uniform_(self.emb_g.weight, -0.1, 0.1)

        if c_in_channels > 0 and c_in_channels != hidden_channels:
            # 1x1 convolution mapping c_in_channels to hidden_channels.
            self.proj_g = nn.Conv1d(c_in_channels, hidden_channels, 1)
Example #3
0
    def __init__(
        self,
        num_chars,
        out_channels,
        hidden_channels,
        positional_encoding=True,
        length_scale=1,
        encoder_type="residual_conv_bn",
        encoder_params=None,
        decoder_type="residual_conv_bn",
        decoder_params=None,
        num_speakers=0,
        external_c=False,
        c_in_channels=0,
    ):
        """Build the embedding, encoder, decoder and duration sub-modules.

        Args:
            num_chars: Number of rows of the input embedding table.
            out_channels: First argument of ``Decoder``.
            hidden_channels: Embedding width, also passed to the encoder,
                decoder and (optionally) the positional encoder.
            positional_encoding: When true, ``self.pos_encoder`` is created;
                otherwise the attribute is not set at all.
            length_scale: Stored on ``self.length_scale``; an ``int`` value is
                converted to ``float`` first.
            encoder_type: Encoder type identifier forwarded to ``Encoder``.
            encoder_params: Parameter dict forwarded to ``Encoder``. When
                ``None``, a fresh dict with ``kernel_size=4``,
                ``dilations=4 * [1, 2, 4] + [1]``, ``num_conv_blocks=2``,
                ``num_res_blocks=13`` is used.
            decoder_type: Decoder type identifier forwarded to ``Decoder``.
            decoder_params: Parameter dict forwarded to ``Decoder``. When
                ``None``, a fresh dict with ``kernel_size=4``,
                ``dilations=4 * [1, 2, 4, 8] + [1]``, ``num_conv_blocks=2``,
                ``num_res_blocks=17`` is used.
            num_speakers: When greater than 1 (and ``external_c`` is false) a
                speaker embedding table ``emb_g`` is created.
            external_c: When true, no internal speaker embedding is created.
            c_in_channels: Width of the speaker embedding and last argument of
                ``Encoder``; also added to ``hidden_channels`` for the
                ``DurationPredictor`` argument. When positive and different
                from ``hidden_channels``, a 1x1 convolution ``proj_g`` mapping
                ``c_in_channels`` to ``hidden_channels`` is created.
        """
        super().__init__()

        # Default parameter dicts (and their nested ``dilations`` lists) are
        # created per call instead of living in the signature, so instances
        # never share one mutable default object.
        if encoder_params is None:
            encoder_params = {
                "kernel_size": 4,
                "dilations": 4 * [1, 2, 4] + [1],
                "num_conv_blocks": 2,
                "num_res_blocks": 13,
            }
        if decoder_params is None:
            decoder_params = {
                "kernel_size": 4,
                "dilations": 4 * [1, 2, 4, 8] + [1],
                "num_conv_blocks": 2,
                "num_res_blocks": 17,
            }

        self.length_scale = float(length_scale) if isinstance(
            length_scale, int) else length_scale
        self.emb = nn.Embedding(num_chars, hidden_channels)
        self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type,
                               encoder_params, c_in_channels)
        if positional_encoding:
            self.pos_encoder = PositionalEncoding(hidden_channels)
        self.decoder = Decoder(out_channels, hidden_channels, decoder_type,
                               decoder_params)
        self.duration_predictor = DurationPredictor(hidden_channels +
                                                    c_in_channels)

        if num_speakers > 1 and not external_c:
            # speaker embedding layer
            self.emb_g = nn.Embedding(num_speakers, c_in_channels)
            nn.init.uniform_(self.emb_g.weight, -0.1, 0.1)

        if c_in_channels > 0 and c_in_channels != hidden_channels:
            # 1x1 convolution mapping c_in_channels to hidden_channels.
            self.proj_g = nn.Conv1d(c_in_channels, hidden_channels, 1)
Example #4
0
    def __init__(self, config: Coqpit):
        """Construct all sub-modules from the ``model_args`` section of ``config``.

        Args:
            config: Configuration object. Its ``model_args`` supplies
                ``length_scale``, ``num_chars``, ``hidden_channels``,
                ``hidden_channels_dp``, ``out_channels`` and the encoder /
                decoder type and parameter settings read below.
        """
        super().__init__()
        self.config = config
        self.phase = -1

        # Integer length scales are stored as floats; other values are kept.
        length_scale = config.model_args.length_scale
        if isinstance(length_scale, int):
            length_scale = float(length_scale)
        self.length_scale = length_scale

        # No character count configured: obtain it (and the config to keep)
        # from ``get_characters`` and record it on the stored config.
        if not self.config.model_args.num_chars:
            _, self.config, num_chars = self.get_characters(config)
            self.config.model_args.num_chars = num_chars

        self.emb = nn.Embedding(
            self.config.model_args.num_chars,
            self.config.model_args.hidden_channels,
        )

        # Start from 0; ``init_multispeaker`` presumably updates this value
        # when speaker conditioning is configured (defined outside this view).
        self.embedded_speaker_dim = 0
        self.init_multispeaker(config)

        args = config.model_args
        hidden = args.hidden_channels
        speaker_dim = self.embedded_speaker_dim

        self.pos_encoder = PositionalEncoding(hidden)
        self.encoder = Encoder(hidden, hidden, args.encoder_type,
                               args.encoder_params, speaker_dim)
        self.decoder = Decoder(args.out_channels, hidden, args.decoder_type,
                               args.decoder_params)
        self.duration_predictor = DurationPredictor(args.hidden_channels_dp)

        self.mod_layer = nn.Conv1d(hidden, hidden, 1)

        self.mdn_block = MDNBlock(hidden, 2 * args.out_channels)

        # A 1x1 convolution maps the speaker dimension to the hidden width
        # only when the two differ and a speaker dimension is in use.
        needs_projection = speaker_dim > 0 and speaker_dim != hidden
        if needs_projection:
            self.proj_g = nn.Conv1d(speaker_dim, hidden, 1)