Example #1
0
    def __init__(self,
                 encoder_model_dimension: int,
                 decoder_model_dimension: int,
                 encoder_num_heads: list,
                 decoder_num_heads: list,
                 encoder_maximum_position_encoding: int,
                 decoder_maximum_position_encoding: int,
                 encoder_dense_blocks: int,
                 decoder_dense_blocks: int,
                 encoder_prenet_dimension: int,
                 decoder_prenet_dimension: int,
                 postnet_conv_filters: int,
                 postnet_conv_layers: int,
                 postnet_kernel_size: int,
                 dropout_rate: float,
                 mel_start_value: int,
                 mel_end_value: int,
                 mel_channels: int,
                 xvec_channels: int,
                 phoneme_language: str,
                 with_stress: bool,
                 encoder_attention_conv_filters: int = None,
                 decoder_attention_conv_filters: int = None,
                 encoder_attention_conv_kernel: int = None,
                 decoder_attention_conv_kernel: int = None,
                 encoder_feed_forward_dimension: int = None,
                 decoder_feed_forward_dimension: int = None,
                 decoder_prenet_dropout=0.5,
                 max_r: int = 10,
                 debug=False,
                 **kwargs):
        """Assemble the autoregressive text-to-mel transformer.

        Builds the text pipeline, speaker-conditioning modules, encoder and
        decoder attention stacks, the mel projection and postnet heads, and
        the TensorSpec signatures consumed by _apply_all_signatures().
        """
        super().__init__(**kwargs)

        # Sentinel mel frames that delimit a spectrogram sequence.
        self.start_vec = tf.ones((1, mel_channels), dtype=tf.float32) * mel_start_value
        self.end_vec = tf.ones((1, mel_channels), dtype=tf.float32) * mel_end_value

        # Plain scalar state. r is the active reduction factor, initialised
        # at its maximum; drop_n_heads starts disabled (0).
        self.stop_prob_index = 2
        self.max_r = max_r
        self.r = max_r
        self.mel_channels = mel_channels
        self.drop_n_heads = 0
        self.debug = debug

        # Text front-end: phonemization + tokenization with start/end tokens.
        self.text_pipeline = Pipeline.default_pipeline(
            phoneme_language, add_start_end=True, with_stress=with_stress)

        self.encoder_prenet = tf.keras.layers.Embedding(
            self.text_pipeline.tokenizer.vocab_size,
            encoder_prenet_dimension,
            name='Embedding')

        # Speaker (x-vector) conditioning modules for each side of the model.
        self.enc_speaker_mod = enc_Speaker_module(dim=512)
        self.dec_speaker_mod = dec_Speaker_module(dim=256)

        self.encoder = SelfAttentionBlocks(
            model_dim=encoder_model_dimension,
            dropout_rate=dropout_rate,
            num_heads=encoder_num_heads,
            feed_forward_dimension=encoder_feed_forward_dimension,
            maximum_position_encoding=encoder_maximum_position_encoding,
            dense_blocks=encoder_dense_blocks,
            conv_filters=encoder_attention_conv_filters,
            kernel_size=encoder_attention_conv_kernel,
            conv_activation='relu',
            name='Encoder')

        self.decoder_prenet = DecoderPrenet(
            model_dim=decoder_model_dimension,
            dense_hidden_units=decoder_prenet_dimension,
            dropout_rate=decoder_prenet_dropout,
            name='DecoderPrenet')

        # Causal conv padding keeps the decoder autoregressive.
        self.decoder = CrossAttentionBlocks(
            model_dim=decoder_model_dimension,
            dropout_rate=dropout_rate,
            num_heads=decoder_num_heads,
            feed_forward_dimension=decoder_feed_forward_dimension,
            maximum_position_encoding=decoder_maximum_position_encoding,
            dense_blocks=decoder_dense_blocks,
            conv_filters=decoder_attention_conv_filters,
            conv_kernel=decoder_attention_conv_kernel,
            conv_activation='relu',
            conv_padding='causal',
            name='Decoder')

        # Projects each decoder step to up to max_r mel frames at once.
        self.final_proj_mel = tf.keras.layers.Dense(
            self.mel_channels * self.max_r, name='FinalProj')

        self.decoder_postnet = Postnet(
            mel_channels=mel_channels,
            conv_filters=postnet_conv_filters,
            conv_layers=postnet_conv_layers,
            kernel_size=postnet_kernel_size,
            name='Postnet')

        # tf.function input signatures. TensorSpec is immutable, so sharing
        # one instance across signatures is equivalent to rebuilding it.
        text_spec = tf.TensorSpec(shape=(None, None), dtype=tf.int32)
        mel_spec = tf.TensorSpec(shape=(None, None, mel_channels), dtype=tf.float32)
        xvec_spec = tf.TensorSpec(shape=(None, None, xvec_channels), dtype=tf.float32)
        self.training_input_signature = [text_spec, mel_spec, text_spec, xvec_spec]
        self.forward_input_signature = [text_spec, mel_spec, xvec_spec]
        self.encoder_signature = [text_spec, xvec_spec]
        self.decoder_signature = [
            tf.TensorSpec(shape=(None, None, encoder_model_dimension),
                          dtype=tf.float32),
            mel_spec,
            tf.TensorSpec(shape=(None, None, None, None), dtype=tf.float32),
            xvec_spec,
        ]
        self._apply_all_signatures()
Example #2
0
 def __init__(self,
              encoder_model_dimension: int,
              decoder_model_dimension: int,
              dropout_rate: float,
              decoder_num_heads: list,
              encoder_num_heads: list,
              encoder_maximum_position_encoding: int,
              decoder_maximum_position_encoding: int,
              postnet_conv_filters: int,
              postnet_conv_layers: int,
              postnet_kernel_size: int,
              encoder_dense_blocks: int,
              decoder_dense_blocks: int,
              mel_channels: int,
              phoneme_language: str,
              with_stress: bool,
              encoder_attention_conv_filters: int = None,
              decoder_attention_conv_filters: int = None,
              encoder_attention_conv_kernel: int = None,
              decoder_attention_conv_kernel: int = None,
              encoder_feed_forward_dimension: int = None,
              decoder_feed_forward_dimension: int = None,
              debug=False,
              decoder_prenet_dropout=0.,
              **kwargs):
     """Assemble the duration-based (non-autoregressive) text-to-mel model.

     Wires the text pipeline, encoder stack, duration predictor, length
     regulator (Expand), decoder stack and postnet, and records the
     TensorSpec signatures consumed by _apply_all_signatures().
     """
     super().__init__(**kwargs)

     self.mel_channels = mel_channels
     self.drop_n_heads = 0
     self.debug = debug

     # Text front-end: phonemization + tokenization (no start/end tokens
     # in the forward model).
     self.text_pipeline = Pipeline.default_pipeline(phoneme_language,
                                                    add_start_end=False,
                                                    with_stress=with_stress)

     self.encoder_prenet = tf.keras.layers.Embedding(self.text_pipeline.tokenizer.vocab_size,
                                                     encoder_model_dimension,
                                                     name='Embedding')
     self.encoder = SelfAttentionBlocks(model_dim=encoder_model_dimension,
                                        dropout_rate=dropout_rate,
                                        num_heads=encoder_num_heads,
                                        feed_forward_dimension=encoder_feed_forward_dimension,
                                        maximum_position_encoding=encoder_maximum_position_encoding,
                                        dense_blocks=encoder_dense_blocks,
                                        conv_filters=encoder_attention_conv_filters,
                                        kernel_size=encoder_attention_conv_kernel,
                                        conv_activation='relu',
                                        name='Encoder')

     # Per-phoneme duration prediction, then expansion to frame resolution.
     self.dur_pred = DurationPredictor(model_dim=encoder_model_dimension,
                                       kernel_size=3,
                                       conv_padding='same',
                                       conv_activation='relu',
                                       conv_block_n=2,
                                       dense_activation='relu',
                                       name='dur_pred')
     self.expand = Expand(name='expand', model_dim=encoder_model_dimension)

     self.decoder_prenet = DecoderPrenet(model_dim=decoder_model_dimension,
                                         dense_hidden_units=decoder_feed_forward_dimension,
                                         dropout_rate=decoder_prenet_dropout,
                                         name='DecoderPrenet')
     # The decoder is self-attention only (no cross attention) in this model.
     self.decoder = SelfAttentionBlocks(model_dim=decoder_model_dimension,
                                        dropout_rate=dropout_rate,
                                        num_heads=decoder_num_heads,
                                        feed_forward_dimension=decoder_feed_forward_dimension,
                                        maximum_position_encoding=decoder_maximum_position_encoding,
                                        dense_blocks=decoder_dense_blocks,
                                        conv_filters=decoder_attention_conv_filters,
                                        kernel_size=decoder_attention_conv_kernel,
                                        conv_activation='relu',
                                        name='Decoder')
     self.out = tf.keras.layers.Dense(mel_channels)
     self.decoder_postnet = CNNResNorm(out_size=mel_channels,
                                       kernel_size=postnet_kernel_size,
                                       padding='same',
                                       inner_activation='tanh',
                                       last_activation='linear',
                                       hidden_size=postnet_conv_filters,
                                       n_layers=postnet_conv_layers,
                                       normalization='batch',
                                       name='Postnet')

     # tf.function input signatures. TensorSpec is immutable, so the text
     # spec can be shared between the token and duration slots.
     text_spec = tf.TensorSpec(shape=(None, None), dtype=tf.int32)
     self.training_input_signature = [
         text_spec,
         tf.TensorSpec(shape=(None, None, mel_channels), dtype=tf.float32),
         text_spec,
     ]
     self.forward_input_signature = [
         text_spec,
         tf.TensorSpec(shape=(), dtype=tf.float32),
     ]
     self._apply_all_signatures()
Example #3
0
 def __init__(self,
              mel_channels: int,
              encoder_model_dimension: int,
              decoder_model_dimension: int,
              encoder_num_heads: list,
              decoder_num_heads: list,
              encoder_feed_forward_dimension: int,
              decoder_feed_forward_dimension: int,
              encoder_maximum_position_encoding: int,
              decoder_maximum_position_encoding: int,
              encoder_dense_blocks: int,
              decoder_dense_blocks: int,
              encoder_prenet_dimension: int,
              decoder_prenet_dimension: int,
              postnet_conv_filters: int,
              postnet_conv_layers: int,
              postnet_kernel_size: int,
              dropout_rate: float,
              mel_start_value: int,
              mel_end_value: int,
              max_r: int = 10,
              phoneme_language: str = 'en',
              decoder_prenet_dropout=0.,
              debug=False,
              **kwargs):
     """Assemble the autoregressive text-to-mel transformer.

     Builds the tokenizer/phonemizer front-end, encoder and decoder
     attention stacks, the mel projection and postnet heads, and the
     TensorSpec signatures consumed by _apply_all_signatures().
     """
     super(AutoregressiveTransformer, self).__init__(**kwargs)
     # Sentinel mel frames that delimit a spectrogram sequence.
     self.start_vec = tf.ones((1, mel_channels), dtype=tf.float32) * mel_start_value
     self.end_vec = tf.ones((1, mel_channels), dtype=tf.float32) * mel_end_value
     self.stop_prob_index = 2
     # r is the active reduction factor, initialised at its maximum.
     self.max_r = max_r
     self.r = max_r
     self.mel_channels = mel_channels
     # NOTE(review): decoder_prenet_dropout is stored but not passed to
     # DecoderPrenet below — presumably applied later via this attribute;
     # confirm against DecoderPrenet/call sites.
     self.decoder_prenet_dropout = decoder_prenet_dropout
     self.drop_n_heads = 0

     self.tokenizer = Tokenizer(sorted(list(_phonemes) + list(_punctuations)), add_start_end=True)
     self.phonemizer = Phonemizer(language=phoneme_language)
     self.encoder_prenet = tf.keras.layers.Embedding(self.tokenizer.vocab_size, encoder_prenet_dimension,
                                                     name='Embedding')
     self.encoder = SelfAttentionBlocks(model_dim=encoder_model_dimension,
                                        dropout_rate=dropout_rate,
                                        num_heads=encoder_num_heads,
                                        feed_forward_dimension=encoder_feed_forward_dimension,
                                        maximum_position_encoding=encoder_maximum_position_encoding,
                                        dense_blocks=encoder_dense_blocks,
                                        name='Encoder')
     self.decoder_prenet = DecoderPrenet(model_dim=decoder_model_dimension,
                                         dense_hidden_units=decoder_prenet_dimension,
                                         name='DecoderPrenet')
     self.decoder = CrossAttentionBlocks(model_dim=decoder_model_dimension,
                                         dropout_rate=dropout_rate,
                                         num_heads=decoder_num_heads,
                                         feed_forward_dimension=decoder_feed_forward_dimension,
                                         maximum_position_encoding=decoder_maximum_position_encoding,
                                         dense_blocks=decoder_dense_blocks,
                                         name='Decoder')
     # Projects each decoder step to up to max_r mel frames at once.
     self.final_proj_mel = tf.keras.layers.Dense(self.mel_channels * self.max_r, name='FinalProj')
     self.decoder_postnet = Postnet(mel_channels=mel_channels,
                                    conv_filters=postnet_conv_filters,
                                    conv_layers=postnet_conv_layers,
                                    kernel_size=postnet_kernel_size,
                                    name='Postnet')

     # tf.function input signatures.
     self.training_input_signature = [
         tf.TensorSpec(shape=(None, None), dtype=tf.int32),
         tf.TensorSpec(shape=(None, None, mel_channels), dtype=tf.float32),
         tf.TensorSpec(shape=(None, None), dtype=tf.int32)
     ]
     self.forward_input_signature = [
         tf.TensorSpec(shape=(None, None), dtype=tf.int32),
         tf.TensorSpec(shape=(None, None, mel_channels), dtype=tf.float32)
     ]
     self.encoder_signature = [
         tf.TensorSpec(shape=(None, None), dtype=tf.int32)
     ]
     self.decoder_signature = [
         tf.TensorSpec(shape=(None, None, encoder_model_dimension), dtype=tf.float32),
         tf.TensorSpec(shape=(None, None, mel_channels), dtype=tf.float32),
         tf.TensorSpec(shape=(None, None, None, None), dtype=tf.float32),
     ]
     self.debug = debug
     # BUG FIX: was `self.__apply_all_signatures()`. The double leading
     # underscore name-mangles to
     # `_AutoregressiveTransformer__apply_all_signatures`, which raises
     # AttributeError when the method is defined as `_apply_all_signatures`
     # (the spelling used by the sibling constructor in this codebase).
     self._apply_all_signatures()