def __init__(self,
             encoder_model_dimension: int,
             decoder_model_dimension: int,
             encoder_num_heads: list,
             decoder_num_heads: list,
             encoder_maximum_position_encoding: int,
             decoder_maximum_position_encoding: int,
             encoder_dense_blocks: int,
             decoder_dense_blocks: int,
             encoder_prenet_dimension: int,
             decoder_prenet_dimension: int,
             postnet_conv_filters: int,
             postnet_conv_layers: int,
             postnet_kernel_size: int,
             dropout_rate: float,
             mel_start_value: int,
             mel_end_value: int,
             mel_channels: int,
             xvec_channels: int,
             phoneme_language: str,
             with_stress: bool,
             encoder_attention_conv_filters: int = None,
             decoder_attention_conv_filters: int = None,
             encoder_attention_conv_kernel: int = None,
             decoder_attention_conv_kernel: int = None,
             encoder_feed_forward_dimension: int = None,
             decoder_feed_forward_dimension: int = None,
             decoder_prenet_dropout=0.5,
             max_r: int = 10,
             debug=False,
             **kwargs):
    """Create the sub-layers and input signatures of the autoregressive model.

    The constructor only wires things together: it stores a few scalars,
    instantiates the text pipeline, the embedding, the two speaker modules,
    the encoder/decoder stacks, the output projection and the postnet, then
    declares the ``tf.TensorSpec`` lists and calls
    ``self._apply_all_signatures()``.

    Args:
        encoder_model_dimension: ``model_dim`` of the ``Encoder`` blocks; also
            the last dimension of the first ``decoder_signature`` spec.
        decoder_model_dimension: ``model_dim`` of ``DecoderPrenet`` and of the
            ``Decoder`` blocks.
        encoder_num_heads: ``num_heads`` forwarded to the encoder.
        decoder_num_heads: ``num_heads`` forwarded to the decoder.
        encoder_maximum_position_encoding: forwarded to the encoder.
        decoder_maximum_position_encoding: forwarded to the decoder.
        encoder_dense_blocks: ``dense_blocks`` forwarded to the encoder.
        decoder_dense_blocks: ``dense_blocks`` forwarded to the decoder.
        encoder_prenet_dimension: output size of the token embedding.
        decoder_prenet_dimension: ``dense_hidden_units`` of ``DecoderPrenet``.
        postnet_conv_filters: ``conv_filters`` of the postnet.
        postnet_conv_layers: ``conv_layers`` of the postnet.
        postnet_kernel_size: ``kernel_size`` of the postnet.
        dropout_rate: ``dropout_rate`` of the encoder and decoder blocks.
        mel_start_value: value filling ``self.start_vec``.
        mel_end_value: value filling ``self.end_vec``.
        mel_channels: width of the start/end vectors and of the mel specs;
            multiplied by ``max_r`` for the final projection size.
        xvec_channels: last dimension of the extra float32 spec appended to
            every signature (presumably the speaker x-vector; confirm).
        phoneme_language: first argument of ``Pipeline.default_pipeline``.
        with_stress: forwarded to ``Pipeline.default_pipeline``.
        encoder_attention_conv_filters: ``conv_filters`` of the encoder.
        decoder_attention_conv_filters: ``conv_filters`` of the decoder.
        encoder_attention_conv_kernel: ``kernel_size`` of the encoder.
        decoder_attention_conv_kernel: ``conv_kernel`` of the decoder.
        encoder_feed_forward_dimension: forwarded to the encoder.
        decoder_feed_forward_dimension: forwarded to the decoder.
        decoder_prenet_dropout: ``dropout_rate`` of ``DecoderPrenet``.
        max_r: stored as both ``self.max_r`` and ``self.r``.
        debug: stored as ``self.debug``.
        **kwargs: passed through to the parent constructor.
    """
    super(AutoregressiveTransformer, self).__init__(**kwargs)

    # Fresh TensorSpec per call so no spec object is shared between lists.
    def _int_matrix_spec():
        return tf.TensorSpec(shape=(None, None), dtype=tf.int32)

    def _float_sequence_spec(last_dim):
        return tf.TensorSpec(shape=(None, None, last_dim), dtype=tf.float32)

    # Start / end marker frames: a single row of `mel_channels` values.
    unit_frame = tf.ones((1, mel_channels), dtype=tf.float32)
    self.start_vec = unit_frame * mel_start_value
    self.end_vec = unit_frame * mel_end_value

    self.stop_prob_index = 2
    self.max_r = max_r
    self.r = max_r  # starts out equal to max_r
    self.mel_channels = mel_channels
    self.drop_n_heads = 0

    # Text front-end and token embedding.
    self.text_pipeline = Pipeline.default_pipeline(
        phoneme_language,
        add_start_end=True,
        with_stress=with_stress,
    )
    vocab_size = self.text_pipeline.tokenizer.vocab_size
    self.encoder_prenet = tf.keras.layers.Embedding(
        vocab_size,
        encoder_prenet_dimension,
        name='Embedding',
    )

    # NOTE(review): speaker module widths are hard-coded; presumably they
    # have to agree with the model dimensions - confirm against the modules.
    self.enc_speaker_mod = enc_Speaker_module(dim=512)
    self.dec_speaker_mod = dec_Speaker_module(dim=256)

    encoder_config = dict(
        model_dim=encoder_model_dimension,
        dropout_rate=dropout_rate,
        num_heads=encoder_num_heads,
        feed_forward_dimension=encoder_feed_forward_dimension,
        maximum_position_encoding=encoder_maximum_position_encoding,
        dense_blocks=encoder_dense_blocks,
        conv_filters=encoder_attention_conv_filters,
        kernel_size=encoder_attention_conv_kernel,
        conv_activation='relu',
        name='Encoder',
    )
    self.encoder = SelfAttentionBlocks(**encoder_config)

    self.decoder_prenet = DecoderPrenet(
        model_dim=decoder_model_dimension,
        dense_hidden_units=decoder_prenet_dimension,
        dropout_rate=decoder_prenet_dropout,
        name='DecoderPrenet',
    )

    # The decoder takes the kernel as `conv_kernel` (the encoder uses
    # `kernel_size`); keyword names are kept exactly as the callee expects.
    decoder_config = dict(
        model_dim=decoder_model_dimension,
        dropout_rate=dropout_rate,
        num_heads=decoder_num_heads,
        feed_forward_dimension=decoder_feed_forward_dimension,
        maximum_position_encoding=decoder_maximum_position_encoding,
        dense_blocks=decoder_dense_blocks,
        conv_filters=decoder_attention_conv_filters,
        conv_kernel=decoder_attention_conv_kernel,
        conv_activation='relu',
        conv_padding='causal',
        name='Decoder',
    )
    self.decoder = CrossAttentionBlocks(**decoder_config)

    # One dense layer emits `max_r` mel frames' worth of channels at once.
    projection_units = mel_channels * max_r
    self.final_proj_mel = tf.keras.layers.Dense(projection_units,
                                                name='FinalProj')
    self.decoder_postnet = Postnet(
        mel_channels=mel_channels,
        conv_filters=postnet_conv_filters,
        conv_layers=postnet_conv_layers,
        kernel_size=postnet_kernel_size,
        name='Postnet',
    )

    # Input signatures consumed by `_apply_all_signatures`.
    self.training_input_signature = [
        _int_matrix_spec(),
        _float_sequence_spec(mel_channels),
        _int_matrix_spec(),
        _float_sequence_spec(xvec_channels),
    ]
    self.forward_input_signature = [
        _int_matrix_spec(),
        _float_sequence_spec(mel_channels),
        _float_sequence_spec(xvec_channels),
    ]
    self.encoder_signature = [
        _int_matrix_spec(),
        _float_sequence_spec(xvec_channels),
    ]
    self.decoder_signature = [
        _float_sequence_spec(encoder_model_dimension),
        _float_sequence_spec(mel_channels),
        tf.TensorSpec(shape=(None, None, None, None), dtype=tf.float32),
        _float_sequence_spec(xvec_channels),
    ]

    self.debug = debug
    self._apply_all_signatures()
def __init__(self,
             encoder_model_dimension: int,
             decoder_model_dimension: int,
             dropout_rate: float,
             decoder_num_heads: list,
             encoder_num_heads: list,
             encoder_maximum_position_encoding: int,
             decoder_maximum_position_encoding: int,
             postnet_conv_filters: int,
             postnet_conv_layers: int,
             postnet_kernel_size: int,
             encoder_dense_blocks: int,
             decoder_dense_blocks: int,
             mel_channels: int,
             phoneme_language: str,
             with_stress: bool,
             encoder_attention_conv_filters: int = None,
             decoder_attention_conv_filters: int = None,
             encoder_attention_conv_kernel: int = None,
             decoder_attention_conv_kernel: int = None,
             encoder_feed_forward_dimension: int = None,
             decoder_feed_forward_dimension: int = None,
             debug=False,
             decoder_prenet_dropout=0.,
             **kwargs):
    """Create the sub-layers and input signatures of the forward model.

    Builds, in this order: the text pipeline, the token embedding, the
    encoder stack, the duration predictor, the expand layer, the decoder
    prenet, the decoder stack, the output dense layer and the postnet. It
    then declares the ``tf.TensorSpec`` lists and calls
    ``self._apply_all_signatures()``.

    Args:
        encoder_model_dimension: embedding output size and ``model_dim`` of
            the encoder, the duration predictor and the expand layer.
        decoder_model_dimension: ``model_dim`` of ``DecoderPrenet`` and of the
            decoder stack.
        dropout_rate: ``dropout_rate`` of the encoder and decoder stacks.
        decoder_num_heads: ``num_heads`` forwarded to the decoder.
        encoder_num_heads: ``num_heads`` forwarded to the encoder.
        encoder_maximum_position_encoding: forwarded to the encoder.
        decoder_maximum_position_encoding: forwarded to the decoder.
        postnet_conv_filters: ``hidden_size`` of the postnet.
        postnet_conv_layers: ``n_layers`` of the postnet.
        postnet_kernel_size: ``kernel_size`` of the postnet.
        encoder_dense_blocks: ``dense_blocks`` forwarded to the encoder.
        decoder_dense_blocks: ``dense_blocks`` forwarded to the decoder.
        mel_channels: units of the output dense layer, ``out_size`` of the
            postnet and last dimension of the mel training spec.
        phoneme_language: first argument of ``Pipeline.default_pipeline``.
        with_stress: forwarded to ``Pipeline.default_pipeline``.
        encoder_attention_conv_filters: ``conv_filters`` of the encoder.
        decoder_attention_conv_filters: ``conv_filters`` of the decoder.
        encoder_attention_conv_kernel: ``kernel_size`` of the encoder.
        decoder_attention_conv_kernel: ``kernel_size`` of the decoder.
        encoder_feed_forward_dimension: forwarded to the encoder.
        decoder_feed_forward_dimension: forwarded to the decoder and also used
            as ``dense_hidden_units`` of ``DecoderPrenet``. NOTE(review): the
            default is ``None``; confirm the prenet accepts that.
        debug: stored as ``self.debug``.
        decoder_prenet_dropout: ``dropout_rate`` of ``DecoderPrenet``.
        **kwargs: passed through to the parent constructor.
    """
    super(ForwardTransformer, self).__init__(**kwargs)

    # Fresh TensorSpec per call so no spec object is shared between entries.
    def _int_matrix_spec():
        return tf.TensorSpec(shape=(None, None), dtype=tf.int32)

    # Unlike the autoregressive variants, no start/end tokens are added here.
    self.text_pipeline = Pipeline.default_pipeline(
        phoneme_language,
        add_start_end=False,
        with_stress=with_stress,
    )
    self.drop_n_heads = 0
    self.mel_channels = mel_channels

    vocab_size = self.text_pipeline.tokenizer.vocab_size
    self.encoder_prenet = tf.keras.layers.Embedding(
        vocab_size,
        encoder_model_dimension,
        name='Embedding',
    )

    encoder_config = dict(
        model_dim=encoder_model_dimension,
        dropout_rate=dropout_rate,
        num_heads=encoder_num_heads,
        feed_forward_dimension=encoder_feed_forward_dimension,
        maximum_position_encoding=encoder_maximum_position_encoding,
        dense_blocks=encoder_dense_blocks,
        conv_filters=encoder_attention_conv_filters,
        kernel_size=encoder_attention_conv_kernel,
        conv_activation='relu',
        name='Encoder',
    )
    self.encoder = SelfAttentionBlocks(**encoder_config)

    # Duration prediction and expansion both work at the encoder width.
    self.dur_pred = DurationPredictor(
        model_dim=encoder_model_dimension,
        kernel_size=3,
        conv_padding='same',
        conv_activation='relu',
        conv_block_n=2,
        dense_activation='relu',
        name='dur_pred',
    )
    self.expand = Expand(name='expand', model_dim=encoder_model_dimension)

    self.decoder_prenet = DecoderPrenet(
        model_dim=decoder_model_dimension,
        dense_hidden_units=decoder_feed_forward_dimension,
        dropout_rate=decoder_prenet_dropout,
        name='DecoderPrenet',
    )

    decoder_config = dict(
        model_dim=decoder_model_dimension,
        dropout_rate=dropout_rate,
        num_heads=decoder_num_heads,
        feed_forward_dimension=decoder_feed_forward_dimension,
        maximum_position_encoding=decoder_maximum_position_encoding,
        dense_blocks=decoder_dense_blocks,
        conv_filters=decoder_attention_conv_filters,
        kernel_size=decoder_attention_conv_kernel,
        conv_activation='relu',
        name='Decoder',
    )
    self.decoder = SelfAttentionBlocks(**decoder_config)

    self.out = tf.keras.layers.Dense(mel_channels)
    self.decoder_postnet = CNNResNorm(
        out_size=mel_channels,
        kernel_size=postnet_kernel_size,
        padding='same',
        inner_activation='tanh',
        last_activation='linear',
        hidden_size=postnet_conv_filters,
        n_layers=postnet_conv_layers,
        normalization='batch',
        name='Postnet',
    )

    # Input signatures consumed by `_apply_all_signatures`.
    self.training_input_signature = [
        _int_matrix_spec(),
        tf.TensorSpec(shape=(None, None, mel_channels), dtype=tf.float32),
        _int_matrix_spec(),
    ]
    # Second forward input is a scalar float32 (shape `()`).
    self.forward_input_signature = [
        _int_matrix_spec(),
        tf.TensorSpec(shape=(), dtype=tf.float32),
    ]

    self.debug = debug
    self._apply_all_signatures()
def __init__(self,
             mel_channels: int,
             encoder_model_dimension: int,
             decoder_model_dimension: int,
             encoder_num_heads: list,
             decoder_num_heads: list,
             encoder_feed_forward_dimension: int,
             decoder_feed_forward_dimension: int,
             encoder_maximum_position_encoding: int,
             decoder_maximum_position_encoding: int,
             encoder_dense_blocks: int,
             decoder_dense_blocks: int,
             encoder_prenet_dimension: int,
             decoder_prenet_dimension: int,
             postnet_conv_filters: int,
             postnet_conv_layers: int,
             postnet_kernel_size: int,
             dropout_rate: float,
             mel_start_value: int,
             mel_end_value: int,
             max_r: int = 10,
             phoneme_language: str = 'en',
             decoder_prenet_dropout=0.,
             debug=False,
             **kwargs):
    """Create the sub-layers and input signatures of the autoregressive model.

    Stores a few scalars, builds the tokenizer and phonemizer, instantiates
    the embedding, encoder, decoder prenet, decoder, final projection and
    postnet, declares the ``tf.TensorSpec`` lists and finally calls
    ``self.__apply_all_signatures()``.

    Args:
        mel_channels: width of the start/end vectors and of the mel specs;
            multiplied by ``max_r`` for the final projection size.
        encoder_model_dimension: ``model_dim`` of the encoder; also the last
            dimension of the first ``decoder_signature`` spec.
        decoder_model_dimension: ``model_dim`` of ``DecoderPrenet`` and of the
            decoder.
        encoder_num_heads: ``num_heads`` forwarded to the encoder.
        decoder_num_heads: ``num_heads`` forwarded to the decoder.
        encoder_feed_forward_dimension: forwarded to the encoder.
        decoder_feed_forward_dimension: forwarded to the decoder.
        encoder_maximum_position_encoding: forwarded to the encoder.
        decoder_maximum_position_encoding: forwarded to the decoder.
        encoder_dense_blocks: ``dense_blocks`` forwarded to the encoder.
        decoder_dense_blocks: ``dense_blocks`` forwarded to the decoder.
        encoder_prenet_dimension: output size of the token embedding.
        decoder_prenet_dimension: ``dense_hidden_units`` of ``DecoderPrenet``.
        postnet_conv_filters: ``conv_filters`` of the postnet.
        postnet_conv_layers: ``conv_layers`` of the postnet.
        postnet_kernel_size: ``kernel_size`` of the postnet.
        dropout_rate: ``dropout_rate`` of the encoder and decoder.
        mel_start_value: value filling ``self.start_vec``.
        mel_end_value: value filling ``self.end_vec``.
        max_r: stored as both ``self.max_r`` and ``self.r``.
        phoneme_language: ``language`` given to ``Phonemizer``.
        decoder_prenet_dropout: only stored on the instance here; it is not
            passed to ``DecoderPrenet`` by this constructor.
        debug: stored as ``self.debug``.
        **kwargs: passed through to the parent constructor.
    """
    super(AutoregressiveTransformer, self).__init__(**kwargs)

    # Fresh TensorSpec per call so no spec object is shared between lists.
    def _int_matrix_spec():
        return tf.TensorSpec(shape=(None, None), dtype=tf.int32)

    def _float_sequence_spec(last_dim):
        return tf.TensorSpec(shape=(None, None, last_dim), dtype=tf.float32)

    # Start / end marker frames: a single row of `mel_channels` values.
    unit_frame = tf.ones((1, mel_channels), dtype=tf.float32)
    self.start_vec = unit_frame * mel_start_value
    self.end_vec = unit_frame * mel_end_value

    self.stop_prob_index = 2
    self.max_r = max_r
    self.r = max_r  # starts out equal to max_r
    self.mel_channels = mel_channels
    self.decoder_prenet_dropout = decoder_prenet_dropout
    self.drop_n_heads = 0

    # Vocabulary is the sorted union of the phoneme and punctuation symbols.
    symbols = sorted(list(_phonemes) + list(_punctuations))
    self.tokenizer = Tokenizer(symbols, add_start_end=True)
    self.phonemizer = Phonemizer(language=phoneme_language)

    self.encoder_prenet = tf.keras.layers.Embedding(
        self.tokenizer.vocab_size,
        encoder_prenet_dimension,
        name='Embedding',
    )

    encoder_config = dict(
        model_dim=encoder_model_dimension,
        dropout_rate=dropout_rate,
        num_heads=encoder_num_heads,
        feed_forward_dimension=encoder_feed_forward_dimension,
        maximum_position_encoding=encoder_maximum_position_encoding,
        dense_blocks=encoder_dense_blocks,
        name='Encoder',
    )
    self.encoder = SelfAttentionBlocks(**encoder_config)

    self.decoder_prenet = DecoderPrenet(
        model_dim=decoder_model_dimension,
        dense_hidden_units=decoder_prenet_dimension,
        name='DecoderPrenet',
    )

    decoder_config = dict(
        model_dim=decoder_model_dimension,
        dropout_rate=dropout_rate,
        num_heads=decoder_num_heads,
        feed_forward_dimension=decoder_feed_forward_dimension,
        maximum_position_encoding=decoder_maximum_position_encoding,
        dense_blocks=decoder_dense_blocks,
        name='Decoder',
    )
    self.decoder = CrossAttentionBlocks(**decoder_config)

    # One dense layer emits `max_r` mel frames' worth of channels at once.
    projection_units = mel_channels * max_r
    self.final_proj_mel = tf.keras.layers.Dense(projection_units,
                                                name='FinalProj')
    self.decoder_postnet = Postnet(
        mel_channels=mel_channels,
        conv_filters=postnet_conv_filters,
        conv_layers=postnet_conv_layers,
        kernel_size=postnet_kernel_size,
        name='Postnet',
    )

    # Input signatures consumed by the signature-application helper.
    self.training_input_signature = [
        _int_matrix_spec(),
        _float_sequence_spec(mel_channels),
        _int_matrix_spec(),
    ]
    self.forward_input_signature = [
        _int_matrix_spec(),
        _float_sequence_spec(mel_channels),
    ]
    self.encoder_signature = [
        _int_matrix_spec(),
    ]
    self.decoder_signature = [
        _float_sequence_spec(encoder_model_dimension),
        _float_sequence_spec(mel_channels),
        tf.TensorSpec(shape=(None, None, None, None), dtype=tf.float32),
    ]

    self.debug = debug
    # Double-underscore name: subject to Python name mangling when this
    # method is compiled inside the class body, so it must stay spelled
    # exactly like the helper's definition.
    self.__apply_all_signatures()