def __init__(self,
             residual_channels: int,
             condition_dim: int,
             filter_size: Union[int, Sequence[int]],
             dilation: int):
    """Build the weight-normalized convolution and condition projection.

    Following clarinet's implementation, this block has no parametric
    residual or skip connection.

    Args:
        residual_channels (int): channels of the input; the internal
            convolution produces twice as many channels.
        condition_dim (int): channels of the condition.
        filter_size (Union[int, Sequence[int]]): filter size of the
            internal convolution cell. When a list or tuple is given, its
            first element is used to compute the initialization scale.
        dilation (int): dilation of the internal convolution cell.
    """
    super(ResidualBlock, self).__init__()

    # plain configuration attributes
    self.residual_channels = residual_channels
    self.condition_dim = condition_dim
    self.filter_size = filter_size
    self.dilation = dilation
    self.dilated_channels = dilated_channels = 2 * residual_channels

    # the init scale only needs a scalar kernel width
    if isinstance(filter_size, (list, tuple)):
        kernel_width = filter_size[0]
    else:
        kernel_width = filter_size

    conv_std = math.sqrt(1 / (kernel_width * residual_channels))
    self.conv = nn.utils.weight_norm(
        Conv1dCell(
            residual_channels,
            dilated_channels,
            filter_size,
            dilation=dilation,
            weight_attr=I.Normal(scale=conv_std)))

    proj_std = math.sqrt(1 / condition_dim)
    self.condition_proj = nn.utils.weight_norm(
        Conv1dCell(
            condition_dim,
            dilated_channels, (1, ),
            weight_attr=I.Normal(scale=proj_std)))
def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout):
    """Build the layers that upsample the input 4 times along time.

    The result is two consecutive stages; each stage is a stride-2
    Conv1DTranspose followed by two Conv1DGLU layers (dilation 1, then 3).

    Args:
        n_speakers (int): number of speakers of the Conv1DGLU layers used.
        speaker_dim (int): speaker embedding size of the Conv1DGLU layers
            used.
        target_channels (int): channels of the input and the output (the
            layers do not change the number of channels).
        dropout (float): dropout probability.

    Returns:
        List[Layer]: upsampling layers.
    """
    channels = target_channels
    layers = []
    # the two stages differ only in the init scale of the transposed conv
    for transpose_std_mul in (1, 4.):
        transpose_std = np.sqrt(transpose_std_mul / (2 * channels))
        layers.append(
            Conv1DTranspose(
                channels,
                channels,
                2,
                stride=2,
                param_attr=I.Normal(scale=transpose_std)))
        layers.append(
            Conv1DGLU(
                n_speakers,
                speaker_dim,
                channels,
                channels,
                3,
                dilation=1,
                std_mul=1.,
                dropout=dropout))
        layers.append(
            Conv1DGLU(
                n_speakers,
                speaker_dim,
                channels,
                channels,
                3,
                dilation=3,
                std_mul=4.,
                dropout=dropout))
    return layers
def __init__(self, residual_channels, condition_dim, filter_size, dilation):
    """A residual block in wavenet without parametric residual or skip.

    The block holds a causal Conv1DCell and a Conv1D with filter size 1
    that integrates the condition. Both produce
    ``2 * residual_channels`` channels.

    Args:
        residual_channels (int): the channels of the input, residual and
            skip.
        condition_dim (int): the channels of the condition.
        filter_size (int): filter size of the internal convolution cell.
        dilation (int): dilation of the internal convolution cell.
    """
    super(ResidualBlock, self).__init__()

    self.residual_channels = residual_channels
    self.condition_dim = condition_dim
    self.filter_size = filter_size
    self.dilation = dilation
    # following clarinet's implementation, we do not have parametric
    # residual & skip connection.
    self.dilated_channels = dilated_channels = 2 * residual_channels

    conv_init = I.Normal(
        scale=np.sqrt(1 / (filter_size * residual_channels)))
    self.conv = Conv1DCell(
        residual_channels,
        dilated_channels,
        filter_size,
        dilation=dilation,
        causal=True,
        param_attr=conv_init)

    proj_init = I.Normal(scale=np.sqrt(1 / condition_dim))
    self.condition_proj = Conv1D(
        condition_dim, dilated_channels, 1, param_attr=proj_init)
def __init__(self, in_channel, out_channel, has_bias=False, bias_dim=0):
    """Create a weight-normalized Linear and an optional bias projection.

    Args:
        in_channel (int): input feature size of the affine layer.
        out_channel (int): output feature size of the affine layer.
        has_bias (bool, optional): whether to also create ``bias_affine``,
            a Linear from ``bias_dim`` to ``out_channel``. Defaults to
            False.
        bias_dim (int, optional): input size of ``bias_affine``; only read
            when ``has_bias`` is True. Defaults to 0.
    """
    super(AffineBlock1, self).__init__()
    self.has_bias = has_bias
    self.bias_dim = bias_dim

    affine_init = I.Normal(scale=np.sqrt(1.0 / in_channel))
    self.affine = weight_norm(
        dg.Linear(
            in_channel, out_channel, param_attr=affine_init), dim=-1)

    if has_bias:
        bias_init = I.Normal(scale=np.sqrt(1 / bias_dim))
        self.bias_affine = dg.Linear(
            bias_dim, out_channel, param_attr=bias_init)
def __init__(self,
             in_channel,
             out_channel,
             has_bias=False,
             bias_dim=0,
             dropout=False,
             keep_prob=1.):
    """Create an optional bias projection and a weight-normalized Linear.

    Unlike AffineBlock1, the bias projection here maps ``bias_dim`` to
    ``in_channel`` (the input side of the affine layer).

    Args:
        in_channel (int): input feature size of the affine layer.
        out_channel (int): output feature size of the affine layer.
        has_bias (bool, optional): whether to create ``bias_affine``.
            Defaults to False.
        bias_dim (int, optional): input size of ``bias_affine``; only read
            when ``has_bias`` is True. Defaults to 0.
        dropout (bool, optional): stored as ``self.dropout``. Defaults to
            False.
        keep_prob (float, optional): stored as ``self.keep_prob``.
            Defaults to 1..
    """
    super(AffineBlock2, self).__init__()
    self.has_bias = has_bias
    self.bias_dim = bias_dim
    self.dropout = dropout
    self.keep_prob = keep_prob

    if has_bias:
        bias_init = I.Normal(scale=np.sqrt(1 / bias_dim))
        self.bias_affine = dg.Linear(
            bias_dim, in_channel, param_attr=bias_init)

    affine_init = I.Normal(scale=np.sqrt(1.0 / in_channel))
    self.affine = weight_norm(
        dg.Linear(
            in_channel, out_channel, param_attr=affine_init), dim=-1)
def test_set_global_bias_initilizer(self):
    """Test setting the global bias initializer with NormalInitializer.

    A Uniform global weight initializer and a Normal global bias
    initializer are installed, a conv2d layer is built, and the two init
    ops recorded in the startup program are checked against them.
    """
    main_prog = framework.Program()
    startup_prog = framework.Program()
    fluid.set_global_initializer(
        initializer.Uniform(
            low=-0.5, high=0.5),
        bias_init=initializer.Normal(
            loc=0.0, scale=2.0))
    # Reset the global initializer even when an assertion fails, so that
    # the global state does not leak into other tests.
    try:
        with fluid.program_guard(main_prog, startup_prog):
            x = fluid.data(name="x", shape=[1, 3, 32, 32])
            # default initializer of bias in layers.conv2d is
            # ConstantInitializer
            fluid.layers.conv2d(x, 5, 3)

        block = startup_prog.global_block()
        self.assertEqual(len(block.ops), 2)

        # init bias is the first op, and weight is the second
        bias_init_op = block.ops[0]
        self.assertEqual(bias_init_op.type, 'gaussian_random')
        self.assertAlmostEqual(bias_init_op.attr('mean'), 0.0, delta=DELTA)
        self.assertAlmostEqual(bias_init_op.attr('std'), 2.0, delta=DELTA)
        self.assertEqual(bias_init_op.attr('seed'), 0)

        param_init_op = block.ops[1]
        self.assertEqual(param_init_op.type, 'uniform_random')
        self.assertAlmostEqual(param_init_op.attr('min'), -0.5, delta=DELTA)
        self.assertAlmostEqual(param_init_op.attr('max'), 0.5, delta=DELTA)
        self.assertEqual(param_init_op.attr('seed'), 0)
    finally:
        fluid.set_global_initializer(None)
def __init__(self,
             in_channel,
             kernel_size,
             causal=False,
             has_bias=False,
             bias_dim=None,
             keep_prob=1.):
    """Create a weight-normalized Conv1D that doubles the channels.

    Args:
        in_channel (int): input channels; the convolution outputs
            ``2 * in_channel`` channels.
        kernel_size (int): kernel size of the convolution.
        causal (bool, optional): when True the convolution uses "valid"
            padding, otherwise "same". Defaults to False.
        has_bias (bool, optional): whether to create ``bias_affine``, a
            Linear from ``bias_dim`` to ``2 * in_channel``. Defaults to
            False.
        bias_dim (int, optional): input size of ``bias_affine``; only read
            when ``has_bias`` is True. Defaults to None.
        keep_prob (float, optional): stored as ``self.keep_prob`` and used
            to scale the initialization std of the convolution. Defaults
            to 1..
    """
    super(ConvBlock, self).__init__()
    self.causal = causal
    self.keep_prob = keep_prob
    self.in_channel = in_channel
    self.has_bias = has_bias

    out_channel = 2 * in_channel
    if causal:
        padding = "valid"
    else:
        padding = "same"
    conv_init = I.Normal(
        scale=np.sqrt(4 * keep_prob / (kernel_size * in_channel)))
    self.conv = weight_norm(
        Conv1D(
            in_channel,
            out_channel, (kernel_size, ),
            padding=padding,
            data_format="NTC",
            param_attr=conv_init))

    if has_bias:
        bias_init = I.Normal(scale=np.sqrt(1 / bias_dim))
        self.bias_affine = dg.Linear(
            bias_dim, out_channel, param_attr=bias_init)
def __init__(self,
             layers,
             in_channels,
             postnet_dim,
             kernel_size,
             out_channels,
             upsample_factor,
             has_bias=False,
             bias_dim=0,
             keep_prob=1.):
    """Assemble the postnet: AffineBlock1, ConvBlocks and an output Linear.

    Args:
        layers (int): number of ConvBlock layers.
        in_channels (int): input size of the leading AffineBlock1.
        postnet_dim (int): channels used by the ConvBlocks.
        kernel_size (int): kernel size of every ConvBlock.
        out_channels (int): output size of the final Linear.
        upsample_factor (int): stored as ``self.upsample_factor``.
        has_bias (bool, optional): forwarded to AffineBlock1 and every
            ConvBlock. Defaults to False.
        bias_dim (int, optional): forwarded to AffineBlock1 and every
            ConvBlock. Defaults to 0.
        keep_prob (float, optional): forwarded to every ConvBlock.
            Defaults to 1..
    """
    super(PostNet, self).__init__()
    self.upsample_factor = upsample_factor

    self.pre_affine = AffineBlock1(in_channels, postnet_dim, has_bias,
                                   bias_dim)

    # the ConvBlocks are non-causal (third positional argument is False)
    conv_blocks = []
    for _ in range(layers):
        conv_blocks.append(
            ConvBlock(postnet_dim, kernel_size, False, has_bias, bias_dim,
                      keep_prob))
    self.convs = dg.LayerList(conv_blocks)

    out_init = I.Normal(scale=np.sqrt(1.0 / postnet_dim))
    self.post_affine = weight_norm(
        dg.Linear(
            postnet_dim, out_channels, param_attr=out_init), dim=-1)
def __init__(self,
             attention_dim,
             input_dim,
             position_encoding_weight=1.,
             position_rate=1.,
             reduction_factor=1,
             has_bias=False,
             bias_dim=0,
             keep_prob=1.):
    """Create the projections used by the attention block.

    Args:
        attention_dim (int): feature size that q, k and v are projected to.
        input_dim (int): feature size of the inputs and of the output.
        position_encoding_weight (float, optional): stored as
            ``self.position_encoding_weight``. Defaults to 1..
        position_rate (float, optional): numerator of the default omega.
            Defaults to 1..
        reduction_factor (int, optional): denominator of the default
            omega. Defaults to 1.
        has_bias (bool, optional): multispeaker case; creates
            ``q_pos_affine``, ``k_pos_affine`` and ``omega_initial``.
            Defaults to False.
        bias_dim (int, optional): input size of the two position affine
            layers; only read when ``has_bias`` is True. Defaults to 0.
        keep_prob (float, optional): stored as ``self.keep_prob``.
            Defaults to 1..
    """
    super(AttentionBlock, self).__init__()
    self.keep_prob = keep_prob
    self.has_bias = has_bias
    self.bias_dim = bias_dim
    self.attention_dim = attention_dim
    self.position_encoding_weight = position_encoding_weight

    # positional encoding
    self.omega_default = omega_default = position_rate / reduction_factor

    # multispeaker case
    if has_bias:
        pos_std = np.sqrt(1.0 / bias_dim)
        self.q_pos_affine = dg.Linear(
            bias_dim, 1, param_attr=I.Normal(scale=pos_std))
        self.k_pos_affine = dg.Linear(
            bias_dim, 1, param_attr=I.Normal(scale=pos_std))
        self.omega_initial = self.create_parameter(
            shape=[1], attr=I.ConstantInitializer(value=omega_default))

    # q, k, v have the same feature dimension, so q_affine and k_affine
    # are initialized from the same matrix to get a better initial
    # attention
    shared_weight = np.random.normal(
        size=(input_dim, attention_dim), scale=np.sqrt(1. / input_dim))
    shared_init = I.NumpyArrayInitializer(shared_weight.astype(np.float32))

    # 3 affine transformations projecting q, k, v into attention_dim
    self.q_affine = weight_norm(
        dg.Linear(
            input_dim, attention_dim, param_attr=shared_init), dim=-1)
    self.k_affine = weight_norm(
        dg.Linear(
            input_dim, attention_dim, param_attr=shared_init), dim=-1)
    value_init = I.Normal(scale=np.sqrt(1.0 / input_dim))
    self.v_affine = weight_norm(
        dg.Linear(
            input_dim, attention_dim, param_attr=value_init), dim=-1)

    # projection back to input_dim
    out_init = I.Normal(scale=np.sqrt(1.0 / attention_dim))
    self.out_affine = weight_norm(
        dg.Linear(
            attention_dim, input_dim, param_attr=out_init), dim=-1)
def __init__(self,
             in_channels,
             reduction_factor,
             prenet_sizes,
             layers,
             kernel_size,
             attention_dim,
             position_encoding_weight=1.,
             omega=1.,
             has_bias=False,
             bias_dim=0,
             keep_prob=1.):
    """Assemble prenet, causal convolutions with attention, and outputs.

    Args:
        in_channels (int): input feature size (mel_dim, per the output
            comment below).
        reduction_factor (int): the output size is
            ``reduction_factor * in_channels``.
        prenet_sizes (Sequence[int]): output sizes of the prenet affine
            blocks; the last one is the decoder dimension.
        layers (int): number of (ConvBlock, AttentionBlock) pairs.
        kernel_size (int): kernel size of the causal ConvBlocks.
        attention_dim (int): attention size of each AttentionBlock.
        position_encoding_weight (float, optional): forwarded to each
            AttentionBlock. Defaults to 1..
        omega (float, optional): forwarded to each AttentionBlock as its
            position rate. Defaults to 1..
        has_bias (bool, optional): forwarded to sub-blocks; also creates
            ``out_sp_affine``. Defaults to False.
        bias_dim (int, optional): forwarded to sub-blocks; input size of
            ``out_sp_affine``. Defaults to 0.
        keep_prob (float, optional): forwarded to sub-blocks. Defaults to
            1..
    """
    super(Decoder, self).__init__()
    self.has_bias = has_bias
    self.kernel_size = kernel_size
    self.in_channels = in_channels
    self.reduction_factor = reduction_factor

    # prenet - mind the difference of AffineBlock2 and AffineBlock1;
    # every block except the first one is built with dropout=True
    self.prenet = dg.LayerList()
    prev_size = in_channels
    for idx, size in enumerate(prenet_sizes):
        self.prenet.append(
            AffineBlock2(
                prev_size,
                size,
                has_bias,
                bias_dim,
                dropout=(idx != 0),
                keep_prob=keep_prob))
        prev_size = size

    # causal convolutions + multihop attention
    self.decoder_dim = decoder_dim = prenet_sizes[-1]
    self.causal_convs = dg.LayerList()
    self.attention_blocks = dg.LayerList()
    for _ in range(layers):
        self.causal_convs.append(
            ConvBlock(decoder_dim, kernel_size, True, has_bias, bias_dim,
                      keep_prob))
        self.attention_blocks.append(
            AttentionBlock(attention_dim, decoder_dim,
                           position_encoding_weight, omega,
                           reduction_factor, has_bias, bias_dim,
                           keep_prob))

    # output mel spectrogram
    self.out_channels = output_dim = reduction_factor * in_channels  # r * mel_dim
    out_init = I.Normal(scale=np.sqrt(1.0 / decoder_dim))
    self.out_affine = weight_norm(
        dg.Linear(
            decoder_dim, output_dim, param_attr=out_init), dim=-1)
    if has_bias:
        sp_init = I.Normal(scale=np.sqrt(1 / bias_dim))
        self.out_sp_affine = dg.Linear(
            bias_dim, output_dim, param_attr=sp_init)
def __init__(self,
             query_dim,
             embed_dim,
             dropout=0.0,
             window_range=WindowRange(-1, 3),
             key_projection=True,
             value_projection=True):
    """Attention Layer for Deep Voice 3.

    Args:
        query_dim (int): the dimension of query vectors (the size of a
            single query vector).
        embed_dim (int): the dimension of keys and values.
        dropout (float, optional): dropout probability of attention.
            Defaults to 0.0.
        window_range (WindowRange, optional): range of attention, this is
            only used at inference. Defaults to WindowRange(-1, 3).
        key_projection (bool, optional): whether the `Attention` Layer has
            a Linear Layer for the keys to pass through before computing
            attention. Defaults to True.
        value_projection (bool, optional): whether the `Attention` Layer
            has a Linear Layer for the values to pass through before
            computing attention. Defaults to True.
    """
    super(Attention, self).__init__()
    self.key_projection = key_projection
    self.value_projection = value_projection
    self.dropout = dropout
    self.window_range = window_range

    # every projection is initialized with std = sqrt(1 / fan_in)
    query_std = np.sqrt(1 / query_dim)
    embed_std = np.sqrt(1 / embed_dim)

    self.query_proj = Linear(
        query_dim, embed_dim, param_attr=I.Normal(scale=query_std))
    if key_projection:
        self.key_proj = Linear(
            embed_dim, embed_dim, param_attr=I.Normal(scale=embed_std))
    if value_projection:
        self.value_proj = Linear(
            embed_dim, embed_dim, param_attr=I.Normal(scale=embed_std))
    self.out_proj = Linear(
        embed_dim, query_dim, param_attr=I.Normal(scale=embed_std))
def create_model(config):
    """Build a SpectraNet from a configuration mapping.

    Args:
        config: mapping read with the keys "char_dim", "n_speakers",
            "speaker_dim", "encoder_layers", "encoder_dim", "kernel_size",
            "dropout", "n_mels", "reduction_factor", "prenet_sizes",
            "decoder_layers", "attention_dim", "position_weight",
            "position_rate", "postnet_layers" and "postnet_dim".

    Returns:
        SpectraNet: the assembled model. The speaker embedding is None
        unless ``config["n_speakers"] > 1``.
    """
    char_dim = config["char_dim"]
    speaker_dim = config["speaker_dim"]
    kernel_size = config["kernel_size"]
    keep_prob = 1.0 - config["dropout"]
    multi_speaker = config["n_speakers"] > 1

    char_embedding = dg.Embedding(
        (en.n_vocab, char_dim), param_attr=I.Normal(scale=0.1))
    if multi_speaker:
        speaker_embedding = dg.Embedding(
            (config["n_speakers"], speaker_dim),
            param_attr=I.Normal(scale=0.1))
    else:
        speaker_embedding = None

    encoder = Encoder(
        config["encoder_layers"],
        char_dim,
        config["encoder_dim"],
        kernel_size,
        has_bias=multi_speaker,
        bias_dim=speaker_dim,
        keep_prob=keep_prob)

    # the decoder prenet ends with an extra layer of size char_dim
    prenet_sizes = list(config["prenet_sizes"]) + [char_dim]
    decoder = Decoder(
        config["n_mels"],
        config["reduction_factor"],
        prenet_sizes,
        config["decoder_layers"],
        kernel_size,
        config["attention_dim"],
        position_encoding_weight=config["position_weight"],
        omega=config["position_rate"],
        has_bias=multi_speaker,
        bias_dim=speaker_dim,
        keep_prob=keep_prob)

    postnet = PostNet(
        config["postnet_layers"],
        char_dim,
        config["postnet_dim"],
        kernel_size,
        config["n_mels"],
        config["reduction_factor"],
        has_bias=multi_speaker,
        bias_dim=speaker_dim,
        keep_prob=keep_prob)

    return SpectraNet(char_embedding, speaker_embedding, encoder, decoder,
                      postnet)
def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
               padding_idx, embedding_std, max_positions, n_vocab,
               freeze_embedding, filter_size, encoder_channels, mel_dim,
               decoder_channels, r, trainable_positional_encodings,
               use_memory_mask, query_position_rate, key_position_rate,
               window_behind, window_ahead, key_projection,
               value_projection, downsample_factor, linear_dim,
               use_decoder_states, converter_channels, dropout):
    """Create a Deep Voice 3 model from flat hyper-parameters.

    The speaker embedding is only created when ``n_speakers > 1``. The
    text embedding is frozen when ``freeze_embedding`` is set, and both
    positional embeddings of the decoder are frozen unless
    ``trainable_positional_encodings`` is set.

    Returns:
        DeepVoice3: the model built from an Encoder, a Decoder, a
        Converter and the (possibly None) speaker embedding.
    """
    # speaker embedding
    speaker_embed = None
    if n_speakers > 1:
        speaker_embed = dg.Embedding(
            (n_speakers, speaker_dim),
            param_attr=I.Normal(scale=speaker_embed_std))

    # encoder
    encoder_convolutions = tuple(
        ConvSpec(encoder_channels, filter_size, dilation)
        for dilation in (1, 3, 9, 27, 1, 3, 9, 27, 1, 3))
    # NOTE(review): the `padding_idx` argument is accepted but not
    # forwarded; the Encoder is always built with padding_idx=None.
    # Confirm whether this is intentional.
    encoder = Encoder(
        n_vocab,
        embed_dim,
        n_speakers,
        speaker_dim,
        padding_idx=None,
        embedding_weight_std=embedding_std,
        convolutions=encoder_convolutions,
        dropout=dropout)
    if freeze_embedding:
        freeze(encoder.embed)

    # decoder
    prenet_convolutions = tuple(
        ConvSpec(decoder_channels, filter_size, dilation)
        for dilation in (1, 3))
    attentive_convolutions = tuple(
        ConvSpec(decoder_channels, filter_size, dilation)
        for dilation in (1, 3, 9, 27, 1))
    # only the first and the last attentive convolution have attention
    attention = [True, False, False, False, True]
    force_monotonic_attention = list(attention)
    decoder = Decoder(
        n_speakers,
        speaker_dim,
        embed_dim,
        mel_dim,
        r=r,
        max_positions=max_positions,
        preattention=prenet_convolutions,
        convolutions=attentive_convolutions,
        attention=attention,
        dropout=dropout,
        use_memory_mask=use_memory_mask,
        force_monotonic_attention=force_monotonic_attention,
        query_position_rate=query_position_rate,
        key_position_rate=key_position_rate,
        window_range=WindowRange(window_behind, window_ahead),
        key_projection=key_projection,
        value_projection=value_projection)
    if not trainable_positional_encodings:
        freeze(decoder.embed_keys_positions)
        freeze(decoder.embed_query_positions)

    # converter
    postnet_convolutions = tuple(
        ConvSpec(channels, filter_size, dilation)
        for channels in (converter_channels, 2 * converter_channels)
        for dilation in (1, 3))
    if use_decoder_states:
        converter_in_channels = decoder.state_dim
    else:
        converter_in_channels = mel_dim
    converter = Converter(
        n_speakers,
        speaker_dim,
        converter_in_channels,
        linear_dim,
        time_upsampling=downsample_factor,
        convolutions=postnet_convolutions,
        dropout=dropout)

    return DeepVoice3(encoder, decoder, converter, speaker_embed,
                      use_decoder_states)
def __init__(self,
             n_vocab,
             embed_dim,
             n_speakers,
             speaker_dim,
             padding_idx=None,
             embedding_weight_std=0.1,
             convolutions=(ConvSpec(64, 5, 1), ) * 7,
             dropout=0.):
    """Encoder of Deep Voice 3.

    Args:
        n_vocab (int): vocabulary size of the text embedding.
        embed_dim (int): embedding size of the text embedding.
        n_speakers (int): number of speakers.
        speaker_dim (int): speaker embedding size.
        padding_idx (int, optional): padding index of text embedding.
            Defaults to None.
        embedding_weight_std (float, optional): standard deviation of the
            embedding weights when initialized. Defaults to 0.1.
        convolutions (Iterable[ConvSpec], optional): specifications of the
            convolutional layers. ConvSpec is a namedtuple of output
            channels, filter_size and dilation. Defaults to
            (ConvSpec(64, 5, 1), )*7.
        dropout (float, optional): dropout probability. Defaults to 0..
    """
    super(Encoder, self).__init__()
    self.embedding_weight_std = embedding_weight_std
    self.dropout = dropout
    self.n_speakers = n_speakers

    self.embed = dg.Embedding(
        (n_vocab, embed_dim),
        padding_idx=padding_idx,
        param_attr=I.Normal(scale=embedding_weight_std))

    # two speaker projections, only in the multi-speaker case
    if n_speakers > 1:
        speaker_std = np.sqrt((1 - dropout) / speaker_dim)
        self.sp_proj1 = Linear(
            speaker_dim,
            embed_dim,
            act="softsign",
            param_attr=I.Normal(scale=speaker_std))
        self.sp_proj2 = Linear(
            speaker_dim,
            embed_dim,
            act="softsign",
            param_attr=I.Normal(scale=speaker_std))

    self.convolutions = dg.LayerList()
    channels = embed_dim
    std_mul = 1.0
    for spec_channels, filter_size, dilation in convolutions:
        if channels != spec_channels:
            # 1 * 1 convolution & relu to change the number of channels
            proj_std = np.sqrt(std_mul / channels)
            self.convolutions.append(
                Conv1D(
                    channels,
                    spec_channels,
                    1,
                    act="relu",
                    param_attr=I.Normal(scale=proj_std)))
            channels = spec_channels
            std_mul = 2.0

        self.convolutions.append(
            Conv1DGLU(
                n_speakers,
                speaker_dim,
                channels,
                spec_channels,
                filter_size,
                dilation,
                std_mul,
                dropout,
                causal=False,
                residual=True))
        channels = spec_channels
        std_mul = 4.0

    # final 1 * 1 convolution back to embed_dim
    final_std = np.sqrt(std_mul * (1 - dropout) / channels)
    self.convolutions.append(
        Conv1D(
            channels, embed_dim, 1, param_attr=I.Normal(scale=final_std)))
def __init__(self,
             n_speakers,
             speaker_dim,
             embed_dim,
             mel_dim,
             r=1,
             max_positions=512,
             preattention=(ConvSpec(128, 5, 1), ) * 4,
             convolutions=(ConvSpec(128, 5, 1), ) * 4,
             attention=True,
             dropout=0.0,
             use_memory_mask=False,
             force_monotonic_attention=False,
             query_position_rate=1.0,
             key_position_rate=1.0,
             window_range=WindowRange(-1, 3),
             key_projection=True,
             value_projection=True):
    """Decoder of the Deep Voice 3 model.

    Args:
        n_speakers (int): number of speakers.
        speaker_dim (int): speaker embedding size.
        embed_dim (int): text embedding size.
        mel_dim (int): channel of mel input (mel bands).
        r (int, optional): number of frames generated per decoder step.
            Defaults to 1.
        max_positions (int, optional): max position for text and decoder
            steps. Defaults to 512.
        preattention (Iterable[ConvSpec], optional): specification of the
            causal convolutional layers of the prenet. Defaults to
            (ConvSpec(128, 5, 1), )*4.
        convolutions (Iterable[ConvSpec], optional): specification of
            causal convolutional layers inside the decoder. ConvSpec is a
            namedtuple of output_channels, filter_size and dilation.
            Defaults to (ConvSpec(128, 5, 1), )*4.
        attention (bool or List[bool], optional): whether to use
            attention. As a list it should have the same length as
            `convolutions`, indicating whether an Attention layer is
            coupled with the corresponding convolutional layer. A bool is
            repeated len(convolutions) times internally. Defaults to True.
        dropout (float, optional): dropout probability. Defaults to 0.0.
        use_memory_mask (bool, optional): whether to use memory mask at
            the Attention layer; stored as given. Defaults to False.
        force_monotonic_attention (bool or List[bool], optional): whether
            to use monotonic attention at the Attention layer when
            inferencing. As a list it should have the same length as
            `attention`. A bool is repeated len(convolutions) times
            internally. Defaults to False.
        query_position_rate (float, optional): position_rate of the
            PositionEmbedding for query. Defaults to 1.0.
        key_position_rate (float, optional): position_rate of the
            PositionEmbedding for key. Defaults to 1.0.
        window_range (WindowRange, optional): window range of monotonic
            attention. Defaults to WindowRange(-1, 3).
        key_projection (bool, optional): `key_projection` of Attention
            layers. Defaults to True.
        value_projection (bool, optional): `value_projection` of Attention
            layers. Defaults to True.

    Raises:
        ValueError: if monotonic attention is forced for a layer that has
            no attention.
    """
    super(Decoder, self).__init__()

    self.dropout = dropout
    self.mel_dim = mel_dim
    self.r = r
    self.query_position_rate = query_position_rate
    self.key_position_rate = key_position_rate
    self.window_range = window_range
    self.n_speakers = n_speakers

    # only when padding idx is 0 can we easily handle it
    query_channels = convolutions[0].out_channels
    self.embed_keys_positions = PositionEmbedding(max_positions, embed_dim)
    self.embed_query_positions = PositionEmbedding(max_positions,
                                                   query_channels)

    if n_speakers > 1:
        speaker_std = np.sqrt((1 - dropout) / speaker_dim)
        self.speaker_proj1 = Linear(
            speaker_dim,
            1,
            act="sigmoid",
            param_attr=I.Normal(scale=speaker_std))
        self.speaker_proj2 = Linear(
            speaker_dim,
            1,
            act="sigmoid",
            param_attr=I.Normal(scale=speaker_std))

    # prenet
    self.prenet = dg.LayerList()
    channels = mel_dim * r  # multiframe
    std_mul = 1.0
    for spec_channels, filter_size, dilation in preattention:
        if channels != spec_channels:
            # conv1d & relu
            proj_std = np.sqrt(std_mul / channels)
            self.prenet.append(
                Conv1D(
                    channels,
                    spec_channels,
                    1,
                    act="relu",
                    param_attr=I.Normal(scale=proj_std)))
            channels = spec_channels
            std_mul = 2.0
        self.prenet.append(
            Conv1DGLU(
                n_speakers,
                speaker_dim,
                channels,
                spec_channels,
                filter_size,
                dilation,
                std_mul,
                dropout,
                causal=True,
                residual=True))
        channels = spec_channels
        std_mul = 4.0

    # attention
    self.use_memory_mask = use_memory_mask
    n_layers = len(convolutions)
    self.attention = ([attention] * n_layers
                      if isinstance(attention, bool) else attention)
    self.force_monotonic_attention = (
        [force_monotonic_attention] * n_layers
        if isinstance(force_monotonic_attention, bool) else
        force_monotonic_attention)

    inconsistent = any(
        monotonic is True and attends is False
        for monotonic, attends in zip(self.force_monotonic_attention,
                                      self.attention))
    if inconsistent:
        raise ValueError("When not using attention, there is no "
                         "monotonic attention at all")

    # causal convolution & attention; each layer is registered as a
    # sublayer right after it is created
    self.conv_attn = []
    layer_specs = zip(self.attention, convolutions)
    for idx, (use_attention, spec) in enumerate(layer_specs):
        spec_channels, filter_size, dilation = spec
        assert (
            channels == spec_channels
        ), "the stack of convolution & attention does not change channels"
        conv_layer = Conv1DGLU(
            n_speakers,
            speaker_dim,
            channels,
            spec_channels,
            filter_size,
            dilation,
            std_mul,
            dropout,
            causal=True,
            residual=False)
        if use_attention:
            attn_layer = Attention(
                spec_channels,
                embed_dim,
                dropout,
                window_range,
                key_projection=key_projection,
                value_projection=value_projection)
        else:
            attn_layer = None
        channels = spec_channels
        std_mul = 4.0

        self.conv_attn.append((conv_layer, attn_layer))
        self.add_sublayer("conv_{}".format(idx), conv_layer)
        if attn_layer is not None:
            self.add_sublayer("attn_{}".format(idx), attn_layer)

    # 1 * 1 conv to transform channels
    last_std = np.sqrt(std_mul * (1 - dropout) / channels)
    self.last_conv = Conv1D(
        channels, mel_dim * r, 1, param_attr=I.Normal(scale=last_std))

    # mel (before sigmoid) to done hat
    done_std = np.sqrt(1 / channels)
    self.fc = Conv1D(
        mel_dim * r, 1, 1, param_attr=I.Normal(scale=done_std))

    # decoding configs
    self.max_decoder_steps = 200
    self.min_decoder_steps = 10

    state_channels = convolutions[-1].out_channels
    assert state_channels % r == 0, \
        "decoder_state dim must be divided by r"
    self.state_dim = state_channels // self.r
def __init__(self,
             n_speakers,
             speaker_dim,
             in_channels,
             num_filters,
             filter_size=1,
             dilation=1,
             std_mul=4.0,
             dropout=0.0,
             causal=False,
             residual=True):
    """Create a Conv1DCell with doubled output channels and, in the
    multi-speaker case, a Linear projection of the speaker embedding.

    Args:
        n_speakers (int): number of speakers.
        speaker_dim (int): speaker embedding's size.
        in_channels (int): channels of the input.
        num_filters (int): channels of the output; the internal
            Conv1DCell is created with ``2 * num_filters`` filters.
        filter_size (int, optional): filter size of the internal
            Conv1DCell. Defaults to 1.
        dilation (int, optional): dilation of the internal Conv1DCell.
            Defaults to 1.
        std_mul (float, optional): multiplier used when computing the
            initialization std of the convolution weights, which is
            ``sqrt(std_mul * (1 - dropout) / (filter_size * in_channels))``.
            Defaults to 4.0.
        dropout (float, optional): dropout probability. Defaults to 0.0.
        causal (bool, optional): padding of the Conv1DCell. It should be
            True if `add_input` method of `Conv1DCell` is ever used.
            Defaults to False.
        residual (bool, optional): whether to use residual connection. If
            True, in_channels should equal num_filters. Defaults to True.

    Raises:
        AssertionError: if ``residual`` is True and ``in_channels`` differs
            from ``num_filters``, or if ``n_speakers > 1`` and
            ``speaker_dim`` is None.
    """
    super(Conv1DGLU, self).__init__()
    # conv spec
    self.in_channels = in_channels
    self.n_speakers = n_speakers
    self.speaker_dim = speaker_dim
    self.num_filters = num_filters
    self.filter_size = filter_size
    self.dilation = dilation

    # padding
    self.causal = causal

    # weight init and dropout
    self.std_mul = std_mul
    self.dropout = dropout

    self.residual = residual
    if residual:
        # The original message joined two literals without a separator
        # ("...connectionthe input_channes..."); it is now readable.
        assert (
            in_channels == num_filters
        ), ("this block uses residual connection, "
            "the input_channels should equal num_filters")

    std = np.sqrt(std_mul * (1 - dropout) / (filter_size * in_channels))
    self.conv = Conv1DCell(
        in_channels,
        2 * num_filters,
        filter_size,
        dilation,
        causal,
        param_attr=I.Normal(scale=std))

    if n_speakers > 1:
        assert (speaker_dim is not None
                ), "speaker embed should not be null in multi-speaker case"
        std = np.sqrt(1 / speaker_dim)
        self.fc = Linear(
            speaker_dim, num_filters, param_attr=I.Normal(scale=std))
def make_model(config):
    """Create a Deep Voice 3 model from a nested configuration mapping.

    Model hyper-parameters are read from ``config["model"]``; the number
    of mel bands and the FFT size are read from ``config["transform"]``.

    Args:
        config: mapping with at least the "model" and "transform" entries.

    Returns:
        DeepVoice3: the model built from an Encoder, a Decoder, a
        Converter and the speaker embedding (None unless
        ``n_speakers > 1``).
    """
    model_cfg = config["model"]
    transform_cfg = config["transform"]

    # speaker embedding
    n_speakers = model_cfg["n_speakers"]
    speaker_dim = model_cfg["speaker_embed_dim"]
    speaker_embed = None
    if n_speakers > 1:
        speaker_embed = dg.Embedding(
            (n_speakers, speaker_dim),
            param_attr=I.Normal(
                scale=model_cfg["speaker_embedding_weight_std"]))

    # encoder
    channels = model_cfg["encoder_channels"]
    kernel = model_cfg["kernel_size"]
    encoder_convolutions = tuple(
        ConvSpec(channels, kernel, dilation)
        for dilation in (1, 3, 9, 27, 1, 3, 9, 27, 1, 3))
    encoder = Encoder(
        n_vocab=en.n_vocab,
        embed_dim=model_cfg["text_embed_dim"],
        n_speakers=n_speakers,
        speaker_dim=speaker_dim,
        embedding_weight_std=model_cfg["embedding_weight_std"],
        convolutions=encoder_convolutions,
        dropout=model_cfg["dropout"])
    if model_cfg["freeze_embedding"]:
        freeze(encoder.embed)

    # decoder
    channels = model_cfg["decoder_channels"]
    kernel = model_cfg["kernel_size"]
    prenet_convolutions = tuple(
        ConvSpec(channels, kernel, dilation) for dilation in (1, 3))
    attentive_convolutions = tuple(
        ConvSpec(channels, kernel, dilation)
        for dilation in (1, 3, 9, 27, 1))
    # only the first and the last attentive convolution have attention
    attention = [True, False, False, False, True]
    force_monotonic_attention = list(attention)
    window = WindowRange(model_cfg["window_backward"],
                         model_cfg["window_ahead"])
    decoder = Decoder(
        n_speakers,
        speaker_dim,
        embed_dim=model_cfg["text_embed_dim"],
        mel_dim=transform_cfg["n_mels"],
        r=model_cfg["outputs_per_step"],
        max_positions=model_cfg["max_positions"],
        preattention=prenet_convolutions,
        convolutions=attentive_convolutions,
        attention=attention,
        dropout=model_cfg["dropout"],
        use_memory_mask=model_cfg["use_memory_mask"],
        force_monotonic_attention=force_monotonic_attention,
        query_position_rate=model_cfg["query_position_rate"],
        key_position_rate=model_cfg["key_position_rate"],
        window_range=window,
        key_projection=model_cfg["key_projection"],
        value_projection=model_cfg["value_projection"])
    if not model_cfg["trainable_positional_encodings"]:
        freeze(decoder.embed_keys_positions)
        freeze(decoder.embed_query_positions)

    # converter (postnet)
    linear_dim = 1 + transform_cfg["n_fft"] // 2
    channels = model_cfg["converter_channels"]
    kernel = model_cfg["kernel_size"]
    postnet_convolutions = tuple(
        ConvSpec(width, kernel, dilation)
        for width in (channels, 2 * channels) for dilation in (1, 3))
    use_decoder_states = model_cfg["use_decoder_state_for_postnet_input"]
    if use_decoder_states:
        converter_in_channels = decoder.state_dim
    else:
        converter_in_channels = transform_cfg["n_mels"]
    converter = Converter(
        n_speakers,
        speaker_dim,
        in_channels=converter_in_channels,
        linear_dim=linear_dim,
        time_upsampling=model_cfg["downsample_factor"],
        convolutions=postnet_convolutions,
        dropout=model_cfg["dropout"])

    return DeepVoice3(
        encoder,
        decoder,
        converter,
        speaker_embed,
        use_decoder_states=use_decoder_states)
def __init__(self,
             n_speakers,
             speaker_dim,
             in_channels,
             linear_dim,
             convolutions=(ConvSpec(256, 5, 1), ) * 4,
             time_upsampling=1,
             dropout=0.0):
    """Converter that maps its input (mel spectrogram or decoder hidden
    states) to ``linear_dim`` channels.

    The layers are: a 1 * 1 projection to the channels of the first
    convolution spec, the time-upsampling blocks, the post convolutions,
    and a final sigmoid-activated 1 * 1 projection to ``linear_dim``.

    Args:
        n_speakers (int): number of speakers.
        speaker_dim (int): speaker embedding size.
        in_channels (int): channels of the input.
        linear_dim (int): channels of the linear spectrogram.
        convolutions (Iterable[ConvSpec], optional): specifications of the
            internal convolutional layers. ConvSpec is a namedtuple of
            (output_channels, filter_size, dilation). Defaults to
            (ConvSpec(256, 5, 1), )*4.
        time_upsampling (int, optional): time upsampling factor of the
            converter, possible options are {1, 2, 4}. Note that this
            should equal the downsample factor of the mel spectrogram.
            Defaults to 1.
        dropout (float, optional): dropout probability. Defaults to 0.0.

    Raises:
        ValueError: if ``time_upsampling`` is not one of {1, 2, 4}.
    """
    super(Converter, self).__init__()

    self.n_speakers = n_speakers
    self.speaker_dim = speaker_dim
    self.in_channels = in_channels
    self.linear_dim = linear_dim
    # CAUTION: this should equal the downsampling steps coefficient
    self.time_upsampling = time_upsampling
    self.dropout = dropout

    target_channels = convolutions[0].out_channels

    # conv proj to target channels
    first_init = I.Normal(scale=np.sqrt(1 / in_channels))
    self.first_conv_proj = Conv1D(
        in_channels, target_channels, 1, param_attr=first_init)

    # Idea from nyanko: pick the block builder matching the factor
    if time_upsampling == 4:
        build_upsampling = upsampling_4x_blocks
    elif time_upsampling == 2:
        build_upsampling = upsampling_2x_blocks
    elif time_upsampling == 1:
        build_upsampling = upsampling_1x_blocks
    else:
        raise ValueError(
            "Upsampling factors other than {1, 2, 4} are Not supported.")
    self.upsampling_convolutions = dg.LayerList(
        build_upsampling(n_speakers, speaker_dim, target_channels,
                         dropout))

    # post conv layers
    self.convolutions = dg.LayerList()
    channels = target_channels
    std_mul = 4.0
    for spec_channels, filter_size, dilation in convolutions:
        if channels != spec_channels:
            proj_std = np.sqrt(std_mul / channels)
            # CAUTION: relu
            self.convolutions.append(
                Conv1D(
                    channels,
                    spec_channels,
                    1,
                    act="relu",
                    param_attr=I.Normal(scale=proj_std)))
            channels = spec_channels
            std_mul = 2.0
        self.convolutions.append(
            Conv1DGLU(
                n_speakers,
                speaker_dim,
                channels,
                spec_channels,
                filter_size,
                dilation=dilation,
                std_mul=std_mul,
                dropout=dropout))
        channels = spec_channels
        std_mul = 4.0

    # final conv proj, channel transformed to linear dim
    last_std = np.sqrt(std_mul * (1 - dropout) / channels)
    # CAUTION: sigmoid
    self.last_conv_proj = Conv1D(
        channels,
        linear_dim,
        1,
        act="sigmoid",
        param_attr=I.Normal(scale=last_std))