Example #1
    def __init__(self, residual_channels: int, condition_dim: int,
                 filter_size: Union[int, Sequence[int]], dilation: int):

        super(ResidualBlock, self).__init__()
        dilated_channels = 2 * residual_channels
        # following clarinet's implementation, we do not have parametric residual
        # & skip connection.

        _filter_size = filter_size[0] if isinstance(
            filter_size, (list, tuple)) else filter_size
        std = math.sqrt(1 / (_filter_size * residual_channels))
        conv = Conv1dCell(residual_channels,
                          dilated_channels,
                          filter_size,
                          dilation=dilation,
                          weight_attr=I.Normal(scale=std))
        self.conv = nn.utils.weight_norm(conv)

        std = math.sqrt(1 / condition_dim)
        condition_proj = Conv1dCell(condition_dim,
                                    dilated_channels, (1, ),
                                    weight_attr=I.Normal(scale=std))
        self.condition_proj = nn.utils.weight_norm(condition_proj)

        self.filter_size = filter_size
        self.dilation = dilation
        self.dilated_channels = dilated_channels
        self.residual_channels = residual_channels
        self.condition_dim = condition_dim
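The std used here follows a fan-in rule, std = sqrt(1 / (filter_size * residual_channels)). A minimal NumPy sketch (independent of Paddle; sizes are illustrative) checking that this keeps the pre-activation variance of one conv output position near 1 for unit-variance inputs:

import numpy as np

rng = np.random.default_rng(0)
filter_size, residual_channels, dilated_channels = 2, 128, 256
fan_in = filter_size * residual_channels
std = np.sqrt(1 / fan_in)
w = rng.normal(0.0, std, size=(dilated_channels, fan_in))  # conv weights, taps flattened
x = rng.normal(0.0, 1.0, size=(fan_in, 10000))             # unit-variance inputs
y = w @ x                                                  # one output position
print(y.var())                                             # ~1.0: variance preserved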
Example #2
def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout):
    """Return a list of Layers that upsamples the input by 4 times in time dimension.

    Args:
        n_speakers (int): number of speakers of the Conv1DGLU layers used.
        speaker_dim (int): speaker embedding size of the Conv1DGLU layers used.
        target_channels (int): channels of the input and the output (the list of layers does not change the number of channels).
        dropout (float): dropout probability.

    Returns:
        List[Layer]: upsampling layers.
    """
    # upsampling convolutions
    upsampling_convolutions = [
        Conv1DTranspose(
            target_channels,
            target_channels,
            2,
            stride=2,
            param_attr=I.Normal(scale=np.sqrt(1 / (2 * target_channels)))),
        Conv1DGLU(n_speakers,
                  speaker_dim,
                  target_channels,
                  target_channels,
                  3,
                  dilation=1,
                  std_mul=1.,
                  dropout=dropout),
        Conv1DGLU(n_speakers,
                  speaker_dim,
                  target_channels,
                  target_channels,
                  3,
                  dilation=3,
                  std_mul=4.,
                  dropout=dropout),
        Conv1DTranspose(
            target_channels,
            target_channels,
            2,
            stride=2,
            param_attr=I.Normal(scale=np.sqrt(4. / (2 * target_channels)))),
        Conv1DGLU(n_speakers,
                  speaker_dim,
                  target_channels,
                  target_channels,
                  3,
                  dilation=1,
                  std_mul=1.,
                  dropout=dropout),
        Conv1DGLU(n_speakers,
                  speaker_dim,
                  target_channels,
                  target_channels,
                  3,
                  dilation=3,
                  std_mul=4.,
                  dropout=dropout),
    ]
    return upsampling_convolutions
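Why this list upsamples by exactly 4x: the two Conv1DTranspose layers (kernel 2, stride 2) each double the time length, and the Conv1DGLU layers preserve it. A quick sketch of the standard transposed-convolution length formula (assuming no padding, as above):

def transposed_conv1d_out_len(l_in, kernel, stride, padding=0):
    # standard transposed-convolution output-length formula
    return (l_in - 1) * stride - 2 * padding + kernel

l = 100
for _ in range(2):  # the two stride-2 Conv1DTranspose layers above
    l = transposed_conv1d_out_len(l, kernel=2, stride=2)
print(l)  # 400: each layer doubles the length, 4x overall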
Example #3
    def __init__(self, residual_channels, condition_dim, filter_size,
                 dilation):
        """A Residual block in wavenet. It does not have parametric residual or skip connection. It consists of a Conv1DCell and an Conv1D(filter_size = 1) to integrate the condition.

        Args:
            residual_channels (int): the channels of the input, residual and skip.
            condition_dim (int): the channels of the condition.
            filter_size (int): filter size of the internal convolution cell.
            dilation (int): dilation of the internal convolution cell.
        """
        super(ResidualBlock, self).__init__()
        dilated_channels = 2 * residual_channels
        # following clarinet's implementation, we do not have parametric residual
        # & skip connection.

        std = np.sqrt(1 / (filter_size * residual_channels))
        self.conv = Conv1DCell(residual_channels,
                               dilated_channels,
                               filter_size,
                               dilation=dilation,
                               causal=True,
                               param_attr=I.Normal(scale=std))

        std = np.sqrt(1 / condition_dim)
        self.condition_proj = Conv1D(condition_dim,
                                     dilated_channels,
                                     1,
                                     param_attr=I.Normal(scale=std))

        self.filter_size = filter_size
        self.dilation = dilation
        self.dilated_channels = dilated_channels
        self.residual_channels = residual_channels
        self.condition_dim = condition_dim
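dilated_channels is 2 * residual_channels because the cell output is split into a content half and a gate half. The forward pass is not shown in this example; a NumPy sketch of the conventional WaveNet gate it corresponds to (an assumption, not the example's own code):

import numpy as np

def gated_activation(conv_out, condition_out):
    # assumed WaveNet-style gate: split channels, then tanh * sigmoid
    content, gate = np.split(conv_out + condition_out, 2, axis=0)
    return np.tanh(content) * (1.0 / (1.0 + np.exp(-gate)))

residual_channels, time_steps = 4, 8
z = gated_activation(np.random.randn(2 * residual_channels, time_steps),
                     np.random.randn(2 * residual_channels, time_steps))
print(z.shape)  # (4, 8): back to residual_channels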
Example #4
    def __init__(self, in_channel, out_channel, has_bias=False, bias_dim=0):
        super(AffineBlock1, self).__init__()
        std = np.sqrt(1.0 / in_channel)
        affine = dg.Linear(in_channel, out_channel, param_attr=I.Normal(scale=std))
        self.affine = weight_norm(affine, dim=-1)
        if has_bias:
            std = np.sqrt(1 / bias_dim)
            self.bias_affine = dg.Linear(bias_dim, out_channel, param_attr=I.Normal(scale=std))

        self.has_bias = has_bias
        self.bias_dim = bias_dim
Example #5
    def __init__(self, in_channel, out_channel,
                 has_bias=False, bias_dim=0, dropout=False, keep_prob=1.):
        super(AffineBlock2, self).__init__()
        if has_bias:
            std = np.sqrt(1 / bias_dim)
            self.bias_affine = dg.Linear(bias_dim, in_channel, param_attr=I.Normal(scale=std))
        std = np.sqrt(1.0 / in_channel)
        affine = dg.Linear(in_channel, out_channel, param_attr=I.Normal(scale=std))
        self.affine = weight_norm(affine, dim=-1)

        self.has_bias = has_bias
        self.bias_dim = bias_dim
        self.dropout = dropout
        self.keep_prob = keep_prob
Example #6
    def test_set_global_bias_initilizer(self):
        """Test Set Global Bias initilizer with NormalInitializer
        """
        main_prog = framework.Program()
        startup_prog = framework.Program()
        fluid.set_global_initializer(initializer.Uniform(low=-0.5, high=0.5),
                                     bias_init=initializer.Normal(loc=0.0,
                                                                  scale=2.0))
        with fluid.program_guard(main_prog, startup_prog):
            x = fluid.data(name="x", shape=[1, 3, 32, 32])
            # the default bias initializer in layers.conv2d is ConstantInitializer
            conv = fluid.layers.conv2d(x, 5, 3)

        block = startup_prog.global_block()
        self.assertEqual(len(block.ops), 2)

        # init bias is the first op, and weight is the second
        bias_init_op = block.ops[0]
        self.assertEqual(bias_init_op.type, 'gaussian_random')
        self.assertAlmostEqual(bias_init_op.attr('mean'), 0.0, delta=DELTA)
        self.assertAlmostEqual(bias_init_op.attr('std'), 2.0, delta=DELTA)
        self.assertEqual(bias_init_op.attr('seed'), 0)

        param_init_op = block.ops[1]
        self.assertEqual(param_init_op.type, 'uniform_random')
        self.assertAlmostEqual(param_init_op.attr('min'), -0.5, delta=DELTA)
        self.assertAlmostEqual(param_init_op.attr('max'), 0.5, delta=DELTA)
        self.assertEqual(param_init_op.attr('seed'), 0)
        fluid.set_global_initializer(None)
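For comparison, the same initializers can be attached per layer instead of globally. A minimal sketch using only the fluid 1.x APIs already imported by the test (not part of the test itself):

# per-layer equivalent of the global setting above (sketch)
main_prog, startup_prog = framework.Program(), framework.Program()
with fluid.program_guard(main_prog, startup_prog):
    x = fluid.data(name="x", shape=[1, 3, 32, 32])
    conv = fluid.layers.conv2d(
        x, 5, 3,
        param_attr=fluid.ParamAttr(
            initializer=initializer.Uniform(low=-0.5, high=0.5)),
        bias_attr=fluid.ParamAttr(
            initializer=initializer.Normal(loc=0.0, scale=2.0)))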
Example #7
    def __init__(self, in_channel, kernel_size, causal=False, has_bias=False, 
                 bias_dim=None, keep_prob=1.):
        super(ConvBlock, self).__init__()
        self.causal = causal
        self.keep_prob = keep_prob
        self.in_channel = in_channel
        self.has_bias = has_bias

        std = np.sqrt(4 * keep_prob / (kernel_size * in_channel))
        padding = "valid" if causal else "same"
        conv = Conv1D(in_channel, 2 * in_channel, (kernel_size, ),
                      padding=padding,
                      data_format="NTC",
                      param_attr=I.Normal(scale=std))
        self.conv = weight_norm(conv)
        if has_bias:
            std = np.sqrt(1 / bias_dim)
            self.bias_affine = dg.Linear(bias_dim, 2 * in_channel, param_attr=I.Normal(scale=std))
Example #8
    def __init__(self, layers, in_channels, postnet_dim, kernel_size,
                 out_channels, upsample_factor, has_bias=False, bias_dim=0,
                 keep_prob=1.):
        super(PostNet, self).__init__()
        self.pre_affine = AffineBlock1(in_channels, postnet_dim, has_bias,
                                       bias_dim)
        self.convs = dg.LayerList([
            ConvBlock(postnet_dim, kernel_size, False, has_bias, bias_dim,
                      keep_prob) for _ in range(layers)
        ])
        std = np.sqrt(1.0 / postnet_dim)
        post_affine = dg.Linear(postnet_dim, out_channels,
                                param_attr=I.Normal(scale=std))
        self.post_affine = weight_norm(post_affine, dim=-1)
        self.upsample_factor = upsample_factor
Example #9
    def __init__(self, attention_dim, input_dim, position_encoding_weight=1., 
                 position_rate=1., reduction_factor=1, has_bias=False, bias_dim=0, 
                 keep_prob=1.):
        super(AttentionBlock, self).__init__()
        # positional encoding
        omega_default = position_rate / reduction_factor
        self.omega_default = omega_default
        # multispeaker case
        if has_bias:
            std = np.sqrt(1.0 / bias_dim)
            self.q_pos_affine = dg.Linear(bias_dim, 1, param_attr=I.Normal(scale=std))
            self.k_pos_affine = dg.Linear(bias_dim, 1, param_attr=I.Normal(scale=std))
            self.omega_initial = self.create_parameter(shape=[1], 
                attr=I.ConstantInitializer(value=omega_default))
        
        # q, k and v have the same feature dimension, so we can initialize
        # k_affine and q_affine with the same weight matrix to get a better
        # initial attention
        init_weight = np.random.normal(size=(input_dim, attention_dim),
                                       scale=np.sqrt(1. / input_dim))
        initializer = I.NumpyArrayInitializer(init_weight.astype(np.float32))
        # 3 affine transformation to project q, k, v into attention_dim
        q_affine = dg.Linear(input_dim, attention_dim, param_attr=initializer)
        self.q_affine = weight_norm(q_affine, dim=-1)
        k_affine = dg.Linear(input_dim, attention_dim, param_attr=initializer)
        self.k_affine = weight_norm(k_affine, dim=-1)

        std = np.sqrt(1.0 / input_dim)
        v_affine = dg.Linear(input_dim, attention_dim, param_attr=I.Normal(scale=std))
        self.v_affine = weight_norm(v_affine, dim=-1)

        std = np.sqrt(1.0 / attention_dim)
        out_affine = dg.Linear(attention_dim, input_dim, param_attr=I.Normal(scale=std))
        self.out_affine = weight_norm(out_affine, dim=-1)

        self.keep_prob = keep_prob
        self.has_bias = has_bias
        self.bias_dim = bias_dim
        self.attention_dim = attention_dim
        self.position_encoding_weight = position_encoding_weight
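A NumPy sketch of why sharing the q/k weight matrix gives a better initial attention: the initial score matrix q @ k.T is then a Gram matrix, so on average each position scores highest against itself, a sensible (roughly diagonal) starting alignment. Illustrative only:

import numpy as np

rng = np.random.default_rng(0)
T, input_dim, attention_dim = 6, 16, 8
x = rng.normal(size=(T, input_dim))
w = rng.normal(scale=np.sqrt(1.0 / input_dim), size=(input_dim, attention_dim))

q, k = x @ w, x @ w                          # shared weight, as in the init above
scores = q @ k.T                             # Gram matrix: symmetric, diagonal-heavy
diag = np.diag(scores).mean()
off = scores[~np.eye(T, dtype=bool)].mean()
print(diag, off)                             # diagonal scores are much larger on average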
Example #10
    def __init__(self, in_channels, reduction_factor, prenet_sizes, 
                layers, kernel_size, attention_dim,
                position_encoding_weight=1., omega=1., 
                has_bias=False, bias_dim=0, keep_prob=1.):
        super(Decoder, self).__init__()
        # prenet: mind the difference between AffineBlock2 and AffineBlock1
        c_in = in_channels
        self.prenet = dg.LayerList()
        for i, c_out in enumerate(prenet_sizes):
            affine = AffineBlock2(c_in, c_out, has_bias, bias_dim, dropout=(i!=0), keep_prob=keep_prob)
            self.prenet.append(affine)
            c_in = c_out
        
        # causal convolutions + multihop attention
        decoder_dim = prenet_sizes[-1]
        self.causal_convs = dg.LayerList()
        self.attention_blocks = dg.LayerList()
        for i in range(layers):
            conv = ConvBlock(decoder_dim, kernel_size, True, has_bias, bias_dim, keep_prob)
            attn = AttentionBlock(attention_dim, decoder_dim, position_encoding_weight, omega, reduction_factor, has_bias, bias_dim, keep_prob)
            self.causal_convs.append(conv)
            self.attention_blocks.append(attn)

        # output mel spectrogram
        output_dim = reduction_factor * in_channels # r * mel_dim
        std = np.sqrt(1.0 / decoder_dim)
        out_affine = dg.Linear(decoder_dim, output_dim, param_attr=I.Normal(scale=std))
        self.out_affine = weight_norm(out_affine, dim=-1)
        if has_bias:
            std = np.sqrt(1 / bias_dim)
            self.out_sp_affine = dg.Linear(bias_dim, output_dim, param_attr=I.Normal(scale=std))

        self.has_bias = has_bias
        self.kernel_size = kernel_size

        self.in_channels = in_channels
        self.decoder_dim = decoder_dim
        self.reduction_factor = reduction_factor
        self.out_channels = output_dim
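The decoder emits reduction_factor frames per step, which is why output_dim = r * mel_dim. A small NumPy sketch (shapes are illustrative) of unfolding the grouped output back onto the frame axis:

import numpy as np

batch, steps, mel_dim, r = 2, 10, 80, 4
grouped = np.zeros((batch, steps, r * mel_dim))      # decoder output: r frames per step
frames = grouped.reshape(batch, steps * r, mel_dim)  # unfold onto the frame axis
print(frames.shape)  # (2, 40, 80): reduction_factor=4 recovers 40 mel frames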
Example #11
    def __init__(self,
                 query_dim,
                 embed_dim,
                 dropout=0.0,
                 window_range=WindowRange(-1, 3),
                 key_projection=True,
                 value_projection=True):
        """Attention Layer for Deep Voice 3.

        Args:
            query_dim (int): the dimension of query vectors. (The size of a single vector of query.)
            embed_dim (int): the dimension of keys and values.
            dropout (float, optional): dropout probability of attention. Defaults to 0.0.
            window_range (WindowRange, optional): range of attention; only used at inference. Defaults to WindowRange(-1, 3).
            key_projection (bool, optional): whether the `Attention` Layer has a Linear Layer for the keys to pass through before computing attention. Defaults to True.
            value_projection (bool, optional): whether the `Attention` Layer has a Linear Layer for the values to pass through before computing attention. Defaults to True.
        """
        super(Attention, self).__init__()
        std = np.sqrt(1 / query_dim)
        self.query_proj = Linear(
            query_dim, embed_dim, param_attr=I.Normal(scale=std))
        if key_projection:
            std = np.sqrt(1 / embed_dim)
            self.key_proj = Linear(
                embed_dim, embed_dim, param_attr=I.Normal(scale=std))
        if value_projection:
            std = np.sqrt(1 / embed_dim)
            self.value_proj = Linear(
                embed_dim, embed_dim, param_attr=I.Normal(scale=std))
        std = np.sqrt(1 / embed_dim)
        self.out_proj = Linear(
            embed_dim, query_dim, param_attr=I.Normal(scale=std))

        self.key_projection = key_projection
        self.value_projection = value_projection
        self.dropout = dropout
        self.window_range = window_range
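A sketch of what window_range does at inference: only keys inside a window around the previously attended position remain attendable. The forward pass is not shown in this example, and the exact interval convention below is an assumption:

import numpy as np

def window_mask(num_keys, last_pos, backward, ahead):
    # keys inside [last_pos + backward, last_pos + ahead) stay attendable
    idx = np.arange(num_keys)
    return (idx >= last_pos + backward) & (idx < last_pos + ahead)

print(window_mask(8, last_pos=3, backward=-1, ahead=3).astype(int))
# [0 0 1 1 1 1 0 0]: keys 2..5 for WindowRange(-1, 3) around position 3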
Example #12
def create_model(config):
    char_embedding = dg.Embedding((en.n_vocab, config["char_dim"]), param_attr=I.Normal(scale=0.1))
    multi_speaker = config["n_speakers"] > 1
    speaker_embedding = dg.Embedding((config["n_speakers"], config["speaker_dim"]), param_attr=I.Normal(scale=0.1)) \
        if multi_speaker else None
    encoder = Encoder(config["encoder_layers"], config["char_dim"], 
                      config["encoder_dim"], config["kernel_size"], 
                      has_bias=multi_speaker, bias_dim=config["speaker_dim"], 
                      keep_prob=1.0 - config["dropout"])
    decoder = Decoder(config["n_mels"], config["reduction_factor"], 
                      list(config["prenet_sizes"]) + [config["char_dim"]], 
                      config["decoder_layers"], config["kernel_size"], 
                      config["attention_dim"],
                      position_encoding_weight=config["position_weight"], 
                      omega=config["position_rate"], 
                      has_bias=multi_speaker, bias_dim=config["speaker_dim"], 
                      keep_prob=1.0 - config["dropout"])
    postnet = PostNet(config["postnet_layers"], config["char_dim"], 
                      config["postnet_dim"], config["kernel_size"], 
                      config["n_mels"], config["reduction_factor"], 
                      has_bias=multi_speaker, bias_dim=config["speaker_dim"], 
                      keep_prob=1.0 - config["dropout"])
    spectranet = SpectraNet(char_embedding, speaker_embedding, encoder, decoder, postnet)
    return spectranet
Example #13
def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
               padding_idx, embedding_std, max_positions, n_vocab,
               freeze_embedding, filter_size, encoder_channels, mel_dim,
               decoder_channels, r, trainable_positional_encodings,
               use_memory_mask, query_position_rate, key_position_rate,
               window_behind, window_ahead, key_projection, value_projection,
               downsample_factor, linear_dim, use_decoder_states,
               converter_channels, dropout):
    """just a simple function to create a deepvoice 3 model"""
    if n_speakers > 1:
        spe = dg.Embedding((n_speakers, speaker_dim),
                           param_attr=I.Normal(scale=speaker_embed_std))
    else:
        spe = None

    h = encoder_channels
    k = filter_size
    encoder_convolutions = (
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
    )
    enc = Encoder(n_vocab,
                  embed_dim,
                  n_speakers,
                  speaker_dim,
                  padding_idx=None,
                  embedding_weight_std=embedding_std,
                  convolutions=encoder_convolutions,
                  dropout=dropout)
    if freeze_embedding:
        freeze(enc.embed)

    h = decoder_channels
    prenet_convolutions = (ConvSpec(h, k, 1), ConvSpec(h, k, 3))
    attentive_convolutions = (
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
        ConvSpec(h, k, 1),
    )
    attention = [True, False, False, False, True]
    force_monotonic_attention = [True, False, False, False, True]
    dec = Decoder(n_speakers,
                  speaker_dim,
                  embed_dim,
                  mel_dim,
                  r=r,
                  max_positions=max_positions,
                  preattention=prenet_convolutions,
                  convolutions=attentive_convolutions,
                  attention=attention,
                  dropout=dropout,
                  use_memory_mask=use_memory_mask,
                  force_monotonic_attention=force_monotonic_attention,
                  query_position_rate=query_position_rate,
                  key_position_rate=key_position_rate,
                  window_range=WindowRange(window_behind, window_ahead),
                  key_projection=key_projection,
                  value_projection=value_projection)
    if not trainable_positional_encodings:
        freeze(dec.embed_keys_positions)
        freeze(dec.embed_query_positions)

    h = converter_channels
    postnet_convolutions = (
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(2 * h, k, 1),
        ConvSpec(2 * h, k, 3),
    )
    cvt = Converter(n_speakers,
                    speaker_dim,
                    dec.state_dim if use_decoder_states else mel_dim,
                    linear_dim,
                    time_upsampling=downsample_factor,
                    convolutions=postnet_convolutions,
                    dropout=dropout)
    dv3 = DeepVoice3(enc, dec, cvt, spe, use_decoder_states)
    return dv3
Example #14
    def __init__(self,
                 n_vocab,
                 embed_dim,
                 n_speakers,
                 speaker_dim,
                 padding_idx=None,
                 embedding_weight_std=0.1,
                 convolutions=(ConvSpec(64, 5, 1), ) * 7,
                 dropout=0.):
        """Encoder of Deep Voice 3.

        Args:
            n_vocab (int): vocabulary size of the text embedding.
            embed_dim (int): embedding size of the text embedding.
            n_speakers (int): number of speakers.
            speaker_dim (int): speaker embedding size.
            padding_idx (int, optional): padding index of text embedding. Defaults to None.
            embedding_weight_std (float, optional): standard deviation of the embedding weights when initialized. Defaults to 0.1.
            convolutions (Iterable[ConvSpec], optional): specifications of the convolutional layers. ConvSpec is a namedtuple of output channels, filter_size and dilation. Defaults to (ConvSpec(64, 5, 1), )*7.
            dropout (float, optional): dropout probability. Defaults to 0.0.
        """
        super(Encoder, self).__init__()
        self.embedding_weight_std = embedding_weight_std
        self.embed = dg.Embedding(
            (n_vocab, embed_dim),
            padding_idx=padding_idx,
            param_attr=I.Normal(scale=embedding_weight_std))

        self.dropout = dropout
        if n_speakers > 1:
            std = np.sqrt((1 - dropout) / speaker_dim)
            self.sp_proj1 = Linear(speaker_dim,
                                   embed_dim,
                                   act="softsign",
                                   param_attr=I.Normal(scale=std))
            self.sp_proj2 = Linear(speaker_dim,
                                   embed_dim,
                                   act="softsign",
                                   param_attr=I.Normal(scale=std))
        self.n_speakers = n_speakers

        self.convolutions = dg.LayerList()
        in_channels = embed_dim
        std_mul = 1.0
        for (out_channels, filter_size, dilation) in convolutions:
            # 1 * 1 convolution & relu
            if in_channels != out_channels:
                std = np.sqrt(std_mul / in_channels)
                self.convolutions.append(
                    Conv1D(in_channels,
                           out_channels,
                           1,
                           act="relu",
                           param_attr=I.Normal(scale=std)))
                in_channels = out_channels
                std_mul = 2.0

            self.convolutions.append(
                Conv1DGLU(n_speakers,
                          speaker_dim,
                          in_channels,
                          out_channels,
                          filter_size,
                          dilation,
                          std_mul,
                          dropout,
                          causal=False,
                          residual=True))
            in_channels = out_channels
            std_mul = 4.0

        std = np.sqrt(std_mul * (1 - dropout) / in_channels)
        self.convolutions.append(
            Conv1D(in_channels, embed_dim, 1, param_attr=I.Normal(scale=std)))
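The std_mul bookkeeping above (1.0 at the input, 2.0 after a ReLU 1x1 conv, 4.0 after every Conv1DGLU) determines each layer's init std. A standalone sketch replaying that bookkeeping (parameter values are illustrative; the Conv1DGLU formula is the one shown in Example #16):

import numpy as np

embed_dim, dropout = 256, 0.1
convolutions = [(64, 5, 1)] * 3            # (out_channels, filter_size, dilation)

in_channels, std_mul = embed_dim, 1.0
for out_channels, filter_size, _ in convolutions:
    if in_channels != out_channels:
        print("1x1 relu conv std:", np.sqrt(std_mul / in_channels))
        in_channels, std_mul = out_channels, 2.0
    print("Conv1DGLU std:",
          np.sqrt(std_mul * (1 - dropout) / (filter_size * in_channels)))
    in_channels, std_mul = out_channels, 4.0
print("final 1x1 conv std:", np.sqrt(std_mul * (1 - dropout) / in_channels))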
Example #15
    def __init__(self,
                 n_speakers,
                 speaker_dim,
                 embed_dim,
                 mel_dim,
                 r=1,
                 max_positions=512,
                 preattention=(ConvSpec(128, 5, 1), ) * 4,
                 convolutions=(ConvSpec(128, 5, 1), ) * 4,
                 attention=True,
                 dropout=0.0,
                 use_memory_mask=False,
                 force_monotonic_attention=False,
                 query_position_rate=1.0,
                 key_position_rate=1.0,
                 window_range=WindowRange(-1, 3),
                 key_projection=True,
                 value_projection=True):
        """Decoder of the Deep Voice 3 model.

        Args:
            n_speakers (int): number of speakers.
            speaker_dim (int): speaker embedding size.
            embed_dim (int): text embedding size.
            mel_dim (int): channels of the mel input (mel bands).
            r (int, optional): number of frames generated per decoder step. Defaults to 1.
            max_positions (int, optional): max position for text and decoder steps. Defaults to 512.
            convolutions (Iterable[ConvSpec], optional): specification of causal convolutional layers inside the decoder. ConvSpec is a namedtuple of output_channels, filter_size and dilation. Defaults to (ConvSpec(128, 5, 1), )*4.
            attention (bool or List[bool], optional): whether to use attention. If it is a list of bool, it should have the same length as `convolutions`, indicating whether to couple an Attention layer with the corresponding convolutional layer. If it is a bool, it is repeated len(convolutions) times internally. Defaults to True.
            dropout (float, optional): dropout probability. Defaults to 0.0.
            use_memory_mask (bool, optional): whether to use a memory mask at the Attention layer. If it is a list of bool, it should have the same length as `attention`, indicating whether to use a memory mask at the corresponding Attention layer. If it is a bool, it is repeated len(attention) times internally. Defaults to False.
            force_monotonic_attention (bool, optional): whether to use monotonic attention at the Attention layer when inferencing. If it is a list of bool, it should have the same length as `attention`, indicating whether to use monotonic attention at the corresponding Attention layer. If it is a bool, it is repeated len(attention) times internally. Defaults to False.
            query_position_rate (float, optional): position_rate of the PositionEmbedding for query. Defaults to 1.0.
            key_position_rate (float, optional): position_rate of the PositionEmbedding for key. Defaults to 1.0.
            window_range (WindowRange, optional): window range of monotonic attention. Defaults to WindowRange(-1, 3).
            key_projection (bool, optional): `key_projection` of Attention layers. Defaults to True.
            value_projection (bool, optional): `value_projection` of Attention layers. Defaults to True.
        """
        super(Decoder, self).__init__()

        self.dropout = dropout
        self.mel_dim = mel_dim
        self.r = r
        self.query_position_rate = query_position_rate
        self.key_position_rate = key_position_rate
        self.window_range = window_range
        self.n_speakers = n_speakers

        conv_channels = convolutions[0].out_channels
        # only when the padding idx is 0 can we easily handle it
        self.embed_keys_positions = PositionEmbedding(max_positions, embed_dim)
        self.embed_query_positions = PositionEmbedding(max_positions,
                                                       conv_channels)

        if n_speakers > 1:
            std = np.sqrt((1 - dropout) / speaker_dim)
            self.speaker_proj1 = Linear(speaker_dim,
                                        1,
                                        act="sigmoid",
                                        param_attr=I.Normal(scale=std))
            self.speaker_proj2 = Linear(speaker_dim,
                                        1,
                                        act="sigmoid",
                                        param_attr=I.Normal(scale=std))

        # prenet
        self.prenet = dg.LayerList()
        in_channels = mel_dim * r  # multiframe
        std_mul = 1.0
        for (out_channels, filter_size, dilation) in preattention:
            if in_channels != out_channels:
                # conv1d & relu
                std = np.sqrt(std_mul / in_channels)
                self.prenet.append(
                    Conv1D(in_channels,
                           out_channels,
                           1,
                           act="relu",
                           param_attr=I.Normal(scale=std)))
                in_channels = out_channels
                std_mul = 2.0
            self.prenet.append(
                Conv1DGLU(n_speakers,
                          speaker_dim,
                          in_channels,
                          out_channels,
                          filter_size,
                          dilation,
                          std_mul,
                          dropout,
                          causal=True,
                          residual=True))
            in_channels = out_channels
            std_mul = 4.0

        # attention
        self.use_memory_mask = use_memory_mask
        if isinstance(attention, bool):
            self.attention = [attention] * len(convolutions)
        else:
            self.attention = attention

        if isinstance(force_monotonic_attention, bool):
            self.force_monotonic_attention = [force_monotonic_attention
                                              ] * len(convolutions)
        else:
            self.force_monotonic_attention = force_monotonic_attention

        for x, y in zip(self.force_monotonic_attention, self.attention):
            if x is True and y is False:
                raise ValueError("When not using attention, there is no "
                                 "monotonic attention at all")

        # causal convolution & attention
        self.conv_attn = []
        for use_attention, (out_channels, filter_size,
                            dilation) in zip(self.attention, convolutions):
            assert (
                in_channels == out_channels
            ), "the stack of convolution & attention does not change channels"
            conv_layer = Conv1DGLU(n_speakers,
                                   speaker_dim,
                                   in_channels,
                                   out_channels,
                                   filter_size,
                                   dilation,
                                   std_mul,
                                   dropout,
                                   causal=True,
                                   residual=False)
            attn_layer = Attention(
                out_channels,
                embed_dim,
                dropout,
                window_range,
                key_projection=key_projection,
                value_projection=value_projection) if use_attention else None
            in_channels = out_channels
            std_mul = 4.0
            self.conv_attn.append((conv_layer, attn_layer))
        for i, (conv_layer, attn_layer) in enumerate(self.conv_attn):
            self.add_sublayer("conv_{}".format(i), conv_layer)
            if attn_layer is not None:
                self.add_sublayer("attn_{}".format(i), attn_layer)

        # 1 * 1 conv to transform channels
        std = np.sqrt(std_mul * (1 - dropout) / in_channels)
        self.last_conv = Conv1D(in_channels,
                                mel_dim * r,
                                1,
                                param_attr=I.Normal(scale=std))

        # mel (before sigmoid) to done hat
        std = np.sqrt(1 / in_channels)
        self.fc = Conv1D(mel_dim * r, 1, 1, param_attr=I.Normal(scale=std))

        # decoding configs
        self.max_decoder_steps = 200
        self.min_decoder_steps = 10

        assert convolutions[-1].out_channels % r == 0, \
                "decoder_state dim must be divisible by r"
        self.state_dim = convolutions[-1].out_channels // self.r
Example #16
    def __init__(self,
                 n_speakers,
                 speaker_dim,
                 in_channels,
                 num_filters,
                 filter_size=1,
                 dilation=1,
                 std_mul=4.0,
                 dropout=0.0,
                 causal=False,
                 residual=True):
        """[summary]

        Args:
            n_speakers (int): number of speakers.
            speaker_dim (int): speaker embedding's size.
            in_channels (int): channels of the input.
            num_filters (int): channels of the output.
            filter_size (int, optional): filter size of the internal Conv1DCell. Defaults to 1.
            dilation (int, optional): dilation of the internal Conv1DCell. Defaults to 1.
            std_mul (float, optional): scale of the weight initialization, std = sqrt(std_mul * (1 - dropout) / (filter_size * in_channels)). Defaults to 4.0.
            dropout (float, optional): dropout probability. Defaults to 0.0.
            causal (bool, optional): padding of the Conv1DCell. It should be True if the `add_input` method of `Conv1DCell` is ever used. Defaults to False.
            residual (bool, optional): whether to use a residual connection. If True, in_channels should equal num_filters. Defaults to True.
        """
        super(Conv1DGLU, self).__init__()
        # conv spec
        self.in_channels = in_channels
        self.n_speakers = n_speakers
        self.speaker_dim = speaker_dim
        self.num_filters = num_filters
        self.filter_size = filter_size
        self.dilation = dilation

        # padding
        self.causal = causal

        # weight init and dropout
        self.std_mul = std_mul
        self.dropout = dropout

        self.residual = residual
        if residual:
            assert (
                in_channels == num_filters
            ), "this block uses a residual connection, "\
                "so in_channels should equal num_filters"
        std = np.sqrt(std_mul * (1 - dropout) / (filter_size * in_channels))
        self.conv = Conv1DCell(in_channels,
                               2 * num_filters,
                               filter_size,
                               dilation,
                               causal,
                               param_attr=I.Normal(scale=std))

        if n_speakers > 1:
            assert (speaker_dim is not None
                    ), "speaker embed should not be null in multi-speaker case"
            std = np.sqrt(1 / speaker_dim)
            self.fc = Linear(speaker_dim,
                             num_filters,
                             param_attr=I.Normal(scale=std))
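A NumPy sketch of why downstream layers default to std_mul=4.0: the GLU output a * sigmoid(b) carries roughly a quarter to a third of the input variance, and std_mul compensates for that shrinkage. Illustrative, independent of Paddle:

import numpy as np

rng = np.random.default_rng(0)
a, b = rng.normal(size=(2, 1_000_000))  # content half and gate half
glu = a * (1.0 / (1.0 + np.exp(-b)))    # gated linear unit
print(glu.var())  # ~0.3: the gate shrinks unit variance by roughly 3-4x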
Example #17
def make_model(config):
    c = config["model"]
    # speaker embedding
    n_speakers = c["n_speakers"]
    speaker_dim = c["speaker_embed_dim"]
    if n_speakers > 1:
        speaker_embed = dg.Embedding(
            (n_speakers, speaker_dim),
            param_attr=I.Normal(scale=c["speaker_embedding_weight_std"]))
    else:
        speaker_embed = None

    # encoder
    h = c["encoder_channels"]
    k = c["kernel_size"]
    encoder_convolutions = (
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
    )
    encoder = Encoder(n_vocab=en.n_vocab,
                      embed_dim=c["text_embed_dim"],
                      n_speakers=n_speakers,
                      speaker_dim=speaker_dim,
                      embedding_weight_std=c["embedding_weight_std"],
                      convolutions=encoder_convolutions,
                      dropout=c["dropout"])
    if c["freeze_embedding"]:
        freeze(encoder.embed)

    # decoder
    h = c["decoder_channels"]
    k = c["kernel_size"]
    prenet_convolutions = (ConvSpec(h, k, 1), ConvSpec(h, k, 3))
    attentive_convolutions = (
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
        ConvSpec(h, k, 1),
    )
    attention = [True, False, False, False, True]
    force_monotonic_attention = [True, False, False, False, True]
    window = WindowRange(c["window_backward"], c["window_ahead"])
    decoder = Decoder(n_speakers,
                      speaker_dim,
                      embed_dim=c["text_embed_dim"],
                      mel_dim=config["transform"]["n_mels"],
                      r=c["outputs_per_step"],
                      max_positions=c["max_positions"],
                      preattention=prenet_convolutions,
                      convolutions=attentive_convolutions,
                      attention=attention,
                      dropout=c["dropout"],
                      use_memory_mask=c["use_memory_mask"],
                      force_monotonic_attention=force_monotonic_attention,
                      query_position_rate=c["query_position_rate"],
                      key_position_rate=c["key_position_rate"],
                      window_range=window,
                      key_projection=c["key_projection"],
                      value_projection=c["value_projection"])
    if not c["trainable_positional_encodings"]:
        freeze(decoder.embed_keys_positions)
        freeze(decoder.embed_query_positions)

    # converter(postnet)
    linear_dim = 1 + config["transform"]["n_fft"] // 2
    h = c["converter_channels"]
    k = c["kernel_size"]
    postnet_convolutions = (
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(2 * h, k, 1),
        ConvSpec(2 * h, k, 3),
    )
    use_decoder_states = c["use_decoder_state_for_postnet_input"]
    converter = Converter(n_speakers,
                          speaker_dim,
                          in_channels=decoder.state_dim if use_decoder_states
                          else config["transform"]["n_mels"],
                          linear_dim=linear_dim,
                          time_upsampling=c["downsample_factor"],
                          convolutions=postnet_convolutions,
                          dropout=c["dropout"])

    model = DeepVoice3(encoder,
                       decoder,
                       converter,
                       speaker_embed,
                       use_decoder_states=use_decoder_states)
    return model
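make_model expects a nested config dict. A hypothetical minimal config covering exactly the keys read above; the values are illustrative placeholders, not the project's defaults:

config = {
    "model": {
        "n_speakers": 1,                             # illustrative values throughout
        "speaker_embed_dim": 16,
        "speaker_embedding_weight_std": 0.01,
        "text_embed_dim": 256,
        "embedding_weight_std": 0.1,
        "freeze_embedding": False,
        "encoder_channels": 256,
        "decoder_channels": 256,
        "converter_channels": 256,
        "kernel_size": 5,
        "outputs_per_step": 4,
        "max_positions": 512,
        "dropout": 0.05,
        "use_memory_mask": True,
        "query_position_rate": 1.0,
        "key_position_rate": 1.3,
        "window_backward": -1,
        "window_ahead": 3,
        "key_projection": True,
        "value_projection": True,
        "trainable_positional_encodings": False,
        "use_decoder_state_for_postnet_input": True,
        "downsample_factor": 4,
    },
    "transform": {"n_mels": 80, "n_fft": 1024},
}
model = make_model(config)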
Example #18
    def __init__(self,
                 n_speakers,
                 speaker_dim,
                 in_channels,
                 linear_dim,
                 convolutions=(ConvSpec(256, 5, 1), ) * 4,
                 time_upsampling=1,
                 dropout=0.0):
        """Vocoder that transforms mel spectrogram (or ecoder hidden states) to waveform.

        Args:
            n_speakers (int): number of speakers.
            speaker_dim (int): speaker embedding size.
            in_channels (int): channels of the input.
            linear_dim (int): channels of the linear spectrogram.
            convolutions (Iterable[ConvSpec], optional): specifications of the internal convolutional layers. ConvSpec is a namedtuple of (output_channels, filter_size, dilation). Defaults to (ConvSpec(256, 5, 1), ) * 4.
            time_upsampling (int, optional): time upsampling factor of the converter; possible options are {1, 2, 4}. Note that this should equal the downsampling factor of the mel spectrogram. Defaults to 1.
            dropout (float, optional): dropout probability. Defaults to 0.0.
        """
        super(Converter, self).__init__()

        self.n_speakers = n_speakers
        self.speaker_dim = speaker_dim
        self.in_channels = in_channels
        self.linear_dim = linear_dim
        # CAUTION: this should equal the mel downsampling factor
        self.time_upsampling = time_upsampling
        self.dropout = dropout

        target_channels = convolutions[0].out_channels

        # conv proj to target channels
        self.first_conv_proj = Conv1D(
            in_channels,
            target_channels,
            1,
            param_attr=I.Normal(scale=np.sqrt(1 / in_channels)))

        # Idea from nyanko
        if time_upsampling == 4:
            self.upsampling_convolutions = dg.LayerList(
                upsampling_4x_blocks(n_speakers, speaker_dim, target_channels,
                                     dropout))
        elif time_upsampling == 2:
            self.upsampling_convolutions = dg.LayerList(
                upsampling_2x_blocks(n_speakers, speaker_dim, target_channels,
                                     dropout))
        elif time_upsampling == 1:
            self.upsampling_convolutions = dg.LayerList(
                upsampling_1x_blocks(n_speakers, speaker_dim, target_channels,
                                     dropout))
        else:
            raise ValueError(
                "Upsampling factors other than {1, 2, 4} are Not supported.")

        # post conv layers
        std_mul = 4.0
        in_channels = target_channels
        self.convolutions = dg.LayerList()
        for (out_channels, filter_size, dilation) in convolutions:
            if in_channels != out_channels:
                std = np.sqrt(std_mul / in_channels)
                # CAUTION: relu
                self.convolutions.append(
                    Conv1D(in_channels,
                           out_channels,
                           1,
                           act="relu",
                           param_attr=I.Normal(scale=std)))
                in_channels = out_channels
                std_mul = 2.0
            self.convolutions.append(
                Conv1DGLU(n_speakers,
                          speaker_dim,
                          in_channels,
                          out_channels,
                          filter_size,
                          dilation=dilation,
                          std_mul=std_mul,
                          dropout=dropout))
            in_channels = out_channels
            std_mul = 4.0

        # final conv proj, channel transformed to linear dim
        std = np.sqrt(std_mul * (1 - dropout) / in_channels)
        # CAUTION: sigmoid
        self.last_conv_proj = Conv1D(in_channels,
                                     linear_dim,
                                     1,
                                     act="sigmoid",
                                     param_attr=I.Normal(scale=std))