Example #1
    def build_model(self):
        with tf.variable_scope(
                "Text2Mel"):  ## keep scope names consistent with full Text2Mel
            ## to allow parameters to be reused more easily later
            # Get S or decoder inputs. (B, T//r, n_mels). This is audio shifted 1 frame to the right.
            self.S = tf.concat(
                (tf.zeros_like(self.mels[:, :1, :]), self.mels[:, :-1, :]), 1)

            ## Babbler has no TextEnc

            with tf.variable_scope("AudioEnc"):
                self.Q = AudioEnc(self.hp,
                                  self.S,
                                  training=self.training,
                                  reuse=self.reuse)

            with tf.variable_scope("Attention"):
                ## Babbler has no real attention. Dummy (all 0) text encoder outputs are supplied instead.
                # R: concat Q with zero vector (dummy text encoder outputs)
                dummy_R_prime = tf.zeros_like(
                    self.Q)  ## R_prime shares shape of audio encoder output
                self.R = tf.concat((dummy_R_prime, self.Q), -1)

            with tf.variable_scope("AudioDec"):
                self.Y_logits, self.Y = AudioDec(
                    self.hp,
                    self.R,
                    training=self.training,
                    speaker_codes=self.speakers,
                    reuse=self.reuse)  # (B, T/r, n_mels)
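As a quick illustration of the `self.S` line used throughout these examples, here is a hedged, standalone sketch with made-up toy shapes: the target mels are shifted one reduced frame to the right, with a zero frame prepended, so that frame t of the decoder input only exposes frames earlier than t.

import tensorflow as tf

# Toy shapes (hypothetical): B=2, T//r=4, n_mels=3.
mels = tf.reshape(tf.range(2 * 4 * 3, dtype=tf.float32), (2, 4, 3))

# Same op as self.S above: prepend one zero frame and drop the last frame.
S = tf.concat((tf.zeros_like(mels[:, :1, :]), mels[:, :-1, :]), 1)

with tf.Session() as sess:
    m, s = sess.run([mels, S])
    # s[:, 0, :] is all zeros; s[:, t, :] equals m[:, t - 1, :] for t >= 1.
    assert (s[:, 0, :] == 0).all() and (s[:, 1:, :] == m[:, :-1, :]).all()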
Example #2
    def __init__(self, num=1, mode="train"):
        '''
        Args:
          num: Either 1 or 2. 1 for Text2Mel, 2 for SSRN.
          mode: Either "train" or "synthesize".
        '''
        # Load vocabulary
        self.char2idx, self.idx2char = load_vocab()

        # Set flag
        training = (mode == "train")

        # Graph
        # Data Feeding
        ## L: Text. (B, N), int32
        ## mels: Reduced melspectrogram. (B, T/r, n_mels) float32
        ## mags: Magnitude. (B, T, n_fft//2+1) float32

        self.L = tf.placeholder(tf.int32, shape=(None, None))
        self.mels = tf.placeholder(tf.float32, shape=(None, None, hp.n_mels))
        self.prev_max_attentions = tf.placeholder(tf.int32, shape=(None, ))

        with tf.variable_scope("Text2Mel"):
            # Get S or decoder inputs. (B, T//r, n_mels)
            self.S = tf.concat(
                (tf.zeros_like(self.mels[:, :1, :]), self.mels[:, :-1, :]), 1)

            # Networks
            with tf.variable_scope("TextEnc"):
                self.K, self.V = TextEnc(self.L,
                                         training=training)  # (N, Tx, e)

            with tf.variable_scope("AudioEnc"):
                self.Q = AudioEnc(self.S, training=training)

            with tf.variable_scope("Attention"):
                # R: (B, T/r, 2d)
                # alignments: (B, N, T/r)
                # max_attentions: (B,)
                self.R, self.alignments, self.max_attentions = Attention(
                    self.Q,
                    self.K,
                    self.V,
                    mononotic_attention=(not training),
                    prev_max_attentions=self.prev_max_attentions)
            with tf.variable_scope("AudioDec"):
                self.Y_logits, self.Y = AudioDec(
                    self.R, training=training)  # (B, T/r, n_mels)

        # During inference, the predicted melspectrogram values are fed.
        with tf.variable_scope("SSRN"):
            self.Z_logits, self.Z = SSRN(self.Y, training=training)

        with tf.variable_scope("gs"):
            self.global_step = tf.Variable(0,
                                           name='global_step',
                                           trainable=False)
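The placeholders in this graph are only defined here; at synthesis time a driver loop has to feed them one reduced frame at a time. The following is a hypothetical sketch (not part of the original listing), assuming `g` is an instance of the class above built with mode="synthesize", `sess` is a `tf.Session` with restored weights, and `L` is an int32 array of character ids of shape (1, N).

import numpy as np

def synthesize_mels(sess, g, L, max_T, n_mels):
    """Hypothetical autoregressive driver for the graph defined above."""
    Y = np.zeros((1, max_T, n_mels), np.float32)
    prev_max_attentions = np.zeros((1,), np.int32)
    for t in range(max_T):
        _Y, _max_attentions = sess.run(
            [g.Y, g.max_attentions],
            {g.L: L, g.mels: Y, g.prev_max_attentions: prev_max_attentions})
        Y[:, t, :] = _Y[:, t, :]               # keep only the newly predicted frame
        prev_max_attentions = _max_attentions  # lets attention resume monotonically
    # One last pass through SSRN maps the predicted mels to magnitudes.
    Z = sess.run(g.Z, {g.L: L, g.mels: Y,
                       g.prev_max_attentions: prev_max_attentions})
    return Y, Z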
Example #3
    def build_model(self):
        self.load_data_in_memory()
        self.add_data(reuse=self.reuse)

        with tf.variable_scope(
                "Text2Mel"):  ## keep scope names consistent with full Text2Mel
            ## to allow parameters to be reused more easily later
            # Get S or decoder inputs. (B, T//r, n_mels). This is audio shifted 1 frame to the right.
            self.S = tf.concat(
                (tf.zeros_like(self.mels[:, :1, :]), self.mels[:, :-1, :]), 1)

            # Build a latent representation of expressiveness if uee (unsupervised expressiveness embedding) is set in the config file

            if self.hp.uee != 0:
                with tf.variable_scope("Audio2Emo"):
                    with tf.variable_scope("AudioEnc"):
                        self.emos = Audio2Emo(
                            self.hp,
                            self.S,
                            training=self.training,
                            speaker_codes=self.speakers,
                            reuse=self.reuse)  # (B, T/r, d=8)
                        self.emo_mean = tf.reduce_mean(self.emos, 1)
                        print(self.emo_mean.get_shape())
                        self.emo_mean = tf.expand_dims(self.emo_mean, axis=1)
                        print(self.emo_mean.get_shape())
                        #pdb.set_trace()
            else:
                print('No unsupervised expressive embedding')
                self.emo_mean = None
                #pdb.set_trace()

            ## Babbler has no TextEnc

            with tf.variable_scope("AudioEnc"):
                self.Q = AudioEnc(self.hp,
                                  self.S,
                                  training=self.training,
                                  reuse=self.reuse)

            with tf.variable_scope("Attention"):
                ## Babbler has no real attention. Dummy (all 0) text encoder outputs are supplied instead.
                # R: concat Q with zero vector (dummy text encoder outputs)
                dummy_R_prime = tf.zeros_like(
                    self.Q)  ## R_prime shares shape of audio encoder output
                self.R = tf.concat((dummy_R_prime, self.Q), -1)

            with tf.variable_scope("AudioDec"):
                self.Y_logits, self.Y = AudioDec(
                    self.hp,
                    self.R,
                    emos=self.emo_mean,
                    training=self.training,
                    speaker_codes=self.speakers,
                    reuse=self.reuse)  # (B, T/r, n_mels)
Example #4
    def __init__(self, num=1):

        # Load vocabulary
        self.char2idx, self.idx2char = self.load_vocab()

        # Set flag
        training = False

        # Graph
        # Data Feeding

        # Synthesize
        self.L = tf.placeholder(tf.int32, shape=(None, None))
        self.mels = tf.placeholder(tf.float32, shape=(None, None, hp.n_mels))
        self.prev_max_attentions = tf.placeholder(tf.int32, shape=(None, ))

        with tf.variable_scope("Text2Mel"):
            # Get S or decoder inputs. (B, T//r, n_mels)
            self.S = tf.concat(
                (tf.zeros_like(self.mels[:, :1, :]), self.mels[:, :-1, :]), 1)

            # Networks
            with tf.variable_scope("TextEnc"):
                self.K, self.V = TextEnc(self.L,
                                         training=training)  # (N, Tx, e)
            with tf.variable_scope("AudioEnc"):
                self.Q = AudioEnc(self.S, training=training)

            with tf.variable_scope("Attention"):
                # R: (B, T/r, 2d)
                # alignments: (B, N, T/r)
                # max_attentions: (B,)
                self.R, self.alignments, self.max_attentions = Attention(
                    self.Q,
                    self.K,
                    self.V,
                    mononotic_attention=(not training),
                    prev_max_attentions=self.prev_max_attentions)
            with tf.variable_scope("AudioDec"):
                self.Y_logits, self.Y = AudioDec(
                    self.R, training=training)  # (B, T/r, n_mels)

        # During inference, the predicted melspectrogram values are fed.
        with tf.variable_scope("SSRN"):
            self.Z_logits, self.Z = SSRN(self.Y, training=training)

        with tf.variable_scope("gs"):
            self.global_step = tf.Variable(0,
                                           name='global_step',
                                           trainable=False)
Example #5
    def __init__(self, num=1, mode="train"):
        '''
        Args:
          num: Either 1 or 2. 1 for Text2World, 2 for WSRN.
          mode: Either "train" or "synthesize".
        '''
        # Load vocabulary
        self.char2idx, self.idx2char = load_vocab()

        # Set flag
        training = (mode == "train")

        # Graph
        # Data Feeding
        ## L: Text. (B, N), int32
        ## worlds: Concatenated WORLD vocoder features. (B, 8*T/r, num_lf0+num_mgc+num_bap) float32
        if mode == "train":
            self.L, self.worlds, self.worlds_WSR, self.fnames, self.num_batch = get_batch(
            )
            self.prev_max_attentions = tf.ones(shape=(hp.B, ), dtype=tf.int32)
            self.gts = tf.convert_to_tensor(guided_attention())
        else:  # Synthesize
            self.L = tf.placeholder(tf.int32, shape=(None, None))
            self.worlds = tf.placeholder(
                tf.float32,
                shape=(None, None,
                       hp.num_bap + hp.num_lf0 + hp.num_mgc + hp.num_vuv))
            self.prev_max_attentions = tf.placeholder(tf.int32, shape=(None, ))
            self.gts = tf.convert_to_tensor(guided_attention())

        if num == 1 or (not training):
            with tf.variable_scope("Text2World"):
                # Get S or decoder inputs. (B, 8*T/r, num_lf0+num_mgc+num_bap)
                self.S = tf.concat((tf.zeros_like(
                    self.worlds[:, :1, :]), self.worlds[:, :-1, :]), 1)

                # Networks
                with tf.variable_scope("TextEnc"):
                    self.K, self.V = TextEnc(self.L,
                                             training=training)  # (N, Tx, e)

                with tf.variable_scope("AudioEnc"):
                    self.Q = AudioEnc(self.S, training=training)

                with tf.variable_scope("Attention"):
                    # R: (B, T/r, 2d)
                    # alignments: (B, N, T/r)
                    # max_attentions: (B,)
                    self.R, self.alignments, self.max_attentions = Attention(
                        self.Q,
                        self.K,
                        self.V,
                        mononotic_attention=(not training),
                        prev_max_attentions=self.prev_max_attentions)
                with tf.variable_scope("AudioDec"):
                    self.Y_logits, self.Y = AudioDec(
                        self.R,
                        training=training)  # (B, T/r, num_lf0+num_mgc+num_bap)
        else:  # num==2 & training. Note that during training, the ground truth world features are fed.
            with tf.variable_scope("WSRN"):
                self.Z_logits, self.Z = WSRN(self.worlds, training=training)

        if not training:
            # During inference, the predicted world values are fed.
            with tf.variable_scope("WSRN"):
                self.Z_logits, self.Z = WSRN(self.Y, training=training)

        with tf.variable_scope("gs"):
            self.global_step = tf.Variable(0,
                                           name='global_step',
                                           trainable=False)

        if training:
            if num == 1:  # Text2World
                # world reconstruction loss (MSE; an L1 variant is kept commented out below)
                self.loss_worlds = tf.losses.mean_squared_error(
                    self.worlds, self.Y)
                #self.loss_worlds = tf.reduce_mean(tf.abs(self.Y - self.worlds))

                # world binary divergence loss
                #self.loss_bd1 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.Y_logits, labels=self.worlds))

                # guided_attention loss
                self.A = tf.pad(self.alignments, [(0, 0), (0, hp.max_N),
                                                  (0, hp.max_T)],
                                mode="CONSTANT",
                                constant_values=-1.)[:, :hp.max_N, :hp.max_T]
                self.attention_masks = tf.to_float(tf.not_equal(self.A, -1))
                self.loss_att = tf.reduce_sum(
                    tf.abs(self.A * self.gts) * self.attention_masks)
                self.mask_sum = tf.reduce_sum(self.attention_masks)
                self.loss_att /= self.mask_sum

                # total loss
                self.loss = self.loss_worlds + self.loss_att  #self.loss_bd1 +

                tf.summary.scalar('train/loss_worlds', self.loss_worlds)
                #tf.summary.scalar('train/loss_bd1', self.loss_bd1)
                tf.summary.scalar('train/loss_att', self.loss_att)
                tf.summary.image(
                    'train/world_gt',
                    tf.expand_dims(tf.transpose(self.worlds[:1], [0, 2, 1]),
                                   -1))
                tf.summary.image(
                    'train/world_hat',
                    tf.expand_dims(tf.transpose(self.Y[:1], [0, 2, 1]), -1))
            else:  # WSRN
                # world reconstruction loss (MSE)
                self.loss_WSR = tf.losses.mean_squared_error(
                    self.Z, self.worlds_WSR)

                # world binary divergence loss
                #self.loss_bd2 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.Z_logits, labels=self.worlds_WSR))

                # total loss
                self.loss = self.loss_WSR  #+ self.loss_bd2

                tf.summary.scalar('train/loss_world_SSRN', self.loss_WSR)
                #tf.summary.scalar('train/loss_bd2', self.loss_bd2)

            # Training Scheme
            self.lr = learning_rate_decay(hp.lr, self.global_step)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
            tf.summary.scalar("lr", self.lr)

            ## gradient clipping
            self.gvs = self.optimizer.compute_gradients(self.loss)
            self.clipped = []
            for grad, var in self.gvs:
                grad = tf.clip_by_value(grad, -1., 1.)
                self.clipped.append((grad, var))
            # apply once, after every gradient has been clipped
            self.train_op = self.optimizer.apply_gradients(
                self.clipped, global_step=self.global_step)

            # Summary
            self.merged = tf.summary.merge_all()
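The `guided_attention()` helper used above is not shown in this listing; in DCTTS-style implementations it typically returns a fixed (max_N, max_T) penalty matrix that is near zero along the expected text-to-frame diagonal and approaches one away from it. A sketch under that assumption (the width g=0.2 is a common default, not taken from this snippet):

import numpy as np

def guided_attention_sketch(max_N, max_T, g=0.2):
    """Penalty matrix: ~0 near the diagonal n/max_N == t/max_T, ~1 far from it."""
    W = np.zeros((max_N, max_T), np.float32)
    for n in range(max_N):
        for t in range(max_T):
            W[n, t] = 1.0 - np.exp(-(float(n) / max_N - float(t) / max_T) ** 2
                                   / (2 * g * g))
    return W

# Multiplied element-wise with the alignments in loss_att, this penalizes
# attention mass that strays far from a roughly linear alignment.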
Example #6
    def build_model(self):
        with tf.variable_scope("Text2Mel"):
            # Get S or decoder inputs. (B, T//r, n_mels). This is audio shifted 1 frame to the right.
            self.S = tf.concat(
                (tf.zeros_like(self.mels[:, :1, :]), self.mels[:, :-1, :]), 1)

            # Networks
            if self.hp.text_encoder_type == 'none':
                assert self.hp.merlin_label_dir
                self.K = self.V = self.merlin_label
            elif self.hp.text_encoder_type == 'minimal_feedforward':
                assert self.hp.merlin_label_dir
                #sys.exit('Not implemented: hp.text_encoder_type=="minimal_feedforward"')
                self.K = self.V = LinearTransformLabels(self.hp,
                                                        self.merlin_label,
                                                        training=self.training,
                                                        reuse=self.reuse)
            elif self.hp.text_encoder_type == 'MerlinTextEnc':
                assert self.hp.merlin_label_dir
                with tf.variable_scope("MerlinTextEnc"):
                    self.K, self.V = MerlinTextEnc(
                        self.hp,
                        self.L,
                        self.merlin_label,
                        training=self.training,
                        speaker_codes=self.speakers,
                        reuse=self.reuse)  # (N, Tx, e)
            else:  ## default DCTTS text encoder
                with tf.variable_scope("TextEnc"):
                    self.K, self.V = TextEnc(self.hp,
                                             self.L,
                                             training=self.training,
                                             speaker_codes=self.speakers,
                                             reuse=self.reuse)  # (N, Tx, e)

            with tf.variable_scope("AudioEnc"):
                if self.hp.history_type in [
                        'fractional_position_in_phone',
                        'absolute_position_in_phone'
                ]:
                    self.Q = self.position_in_phone
                elif self.hp.history_type == 'minimal_history':
                    sys.exit(
                        'Not implemented: hp.history_type=="minimal_history"')
                else:
                    assert self.hp.history_type == 'DCTTS_standard'
                    self.Q = AudioEnc(self.hp,
                                      self.S,
                                      training=self.training,
                                      speaker_codes=self.speakers,
                                      reuse=self.reuse)

            with tf.variable_scope("Attention"):
                # R: (B, T/r, 2d)
                # alignments: (B, N, T/r)
                # max_attentions: (B,)
                if self.hp.use_external_durations:
                    self.R, self.alignments, self.max_attentions = FixedAttention(
                        self.hp, self.durations, self.Q, self.V)

                elif self.mode == 'synthesize':
                    self.R, self.alignments, self.max_attentions = Attention(
                        self.hp,
                        self.Q,
                        self.K,
                        self.V,
                        monotonic_attention=True,
                        prev_max_attentions=self.prev_max_attentions)
                elif self.mode == 'train':
                    self.R, self.alignments, self.max_attentions = Attention(
                        self.hp,
                        self.Q,
                        self.K,
                        self.V,
                        monotonic_attention=False,
                        prev_max_attentions=self.prev_max_attentions)
                elif self.mode == 'generate_attention':
                    self.R, self.alignments, self.max_attentions = Attention(
                        self.hp,
                        self.Q,
                        self.K,
                        self.V,
                        monotonic_attention=False,
                        prev_max_attentions=None)

            with tf.variable_scope("AudioDec"):
                self.Y_logits, self.Y = AudioDec(
                    self.hp,
                    self.R,
                    training=self.training,
                    speaker_codes=self.speakers,
                    reuse=self.reuse)  # (B, T/r, n_mels)
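For orientation, the branches in this build_model are driven by a handful of switches read from `self.hp`. A hypothetical minimal config object for the default code path (attribute names come from the code above; the values are placeholders, and any `text_encoder_type` other than the three special cases falls through to the standard DCTTS `TextEnc`):

from types import SimpleNamespace

# Hypothetical stand-in for the hyperparameter object consumed by build_model above.
hp = SimpleNamespace(
    text_encoder_type='DCTTS_standard',  # 'none' | 'minimal_feedforward' | 'MerlinTextEnc' | anything else
    merlin_label_dir='',                 # must be non-empty for the three merlin-label encoder types
    history_type='DCTTS_standard',       # or 'fractional_position_in_phone' / 'absolute_position_in_phone'
    use_external_durations=False,        # True selects FixedAttention instead of Attention
)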
Example #7
    def __init__(self, num=1, mode="train"):
        '''
        Args:
          num: Either 1 or 2. 1 for Text2Mel, 2 for SSRN.
          mode: Either "train" or "synthesize".
        '''
        # Load vocabulary
        self.char2idx, self.idx2char = load_vocab()

        # Set flag
        training = (mode == "train")

        # Graph
        # Data Feeding
        ## L: Text. (B, N), int32
        ## mels: Reduced melspectrogram. (B, T/r, n_mels) float32
        ## mags: Magnitude. (B, T, n_fft//2+1) float32
        if mode == "train":
            self.L, self.mels, self.mags, self.fnames, self.num_batch = get_batch(
            )
            self.prev_max_attentions = tf.ones(shape=(hp.B, ), dtype=tf.int32)
            self.gts = tf.convert_to_tensor(guided_attention())
        else:  # Synthesize
            self.L = tf.placeholder(tf.int32, shape=(None, None))
            self.mels = tf.placeholder(tf.float32,
                                       shape=(None, None, hp.n_mels))
            self.prev_max_attentions = tf.placeholder(tf.int32, shape=(None, ))

        # Training the first network (Text2Mel), or synthesizing
        if num == 1 or (not training):
            with tf.variable_scope("Text2Mel"):
                # Get S or decoder inputs. (B, T//r, n_mels)
                self.S = tf.concat(
                    (tf.zeros_like(self.mels[:, :1, :]), self.mels[:, :-1, :]),
                    1)

                # Networks
                with tf.variable_scope("TextEnc"):
                    self.K, self.V = TextEnc(self.L,
                                             training=training)  # (N, Tx, e)

                with tf.variable_scope("AudioEnc"):
                    self.Q = AudioEnc(self.S, training=training)

                with tf.variable_scope("Attention"):
                    # R: (B, T/r, 2d)
                    # alignments: (B, N, T/r)
                    # max_attentions: (B,)
                    self.R, self.alignments, self.max_attentions = Attention(
                        self.Q,
                        self.K,
                        self.V,
                        mononotic_attention=(not training),
                        prev_max_attentions=self.prev_max_attentions)
                with tf.variable_scope("AudioDec"):
                    self.Y_logits, self.Y = AudioDec(
                        self.R, training=training)  # (B, T/r, n_mels)

        # Training second neural net
        else:  # num==2 & training. Note that during training,
            # the ground truth melspectrogram values are fed.
            with tf.variable_scope("SSRN"):
                self.Z_logits, self.Z = SSRN(self.mels, training=training)

        if not training:
            # During inference, the predicted melspectrogram values are fed.
            with tf.variable_scope("SSRN"):
                self.Z_logits, self.Z = SSRN(self.Y, training=training)

        with tf.variable_scope("gs"):
            self.global_step = tf.Variable(0,
                                           name='global_step',
                                           trainable=False)

        if training:
            if num == 1:  # Text2Mel
                # mel L1 loss
                self.loss_mels = tf.reduce_mean(tf.abs(self.Y - self.mels))

                # mel binary divergence loss
                self.loss_bd1 = tf.reduce_mean(
                    tf.nn.sigmoid_cross_entropy_with_logits(
                        logits=self.Y_logits, labels=self.mels))

                # guided_attention loss
                self.A = tf.pad(self.alignments, [(0, 0), (0, hp.max_N),
                                                  (0, hp.max_T)],
                                mode="CONSTANT",
                                constant_values=-1.)[:, :hp.max_N, :hp.max_T]
                self.attention_masks = tf.to_float(tf.not_equal(self.A, -1))
                self.loss_att = tf.reduce_sum(
                    tf.abs(self.A * self.gts) * self.attention_masks)
                self.mask_sum = tf.reduce_sum(self.attention_masks)
                self.loss_att /= self.mask_sum

                # total loss
                self.loss = self.loss_mels + self.loss_bd1 + self.loss_att

                tf.summary.scalar('train/loss_mels', self.loss_mels)
                tf.summary.scalar('train/loss_bd1', self.loss_bd1)
                tf.summary.scalar('train/loss_att', self.loss_att)
                tf.summary.image(
                    'train/mel_gt',
                    tf.expand_dims(tf.transpose(self.mels[:1], [0, 2, 1]), -1))
                tf.summary.image(
                    'train/mel_hat',
                    tf.expand_dims(tf.transpose(self.Y[:1], [0, 2, 1]), -1))
            else:  # SSRN
                # mag L1 loss
                self.loss_mags = tf.reduce_mean(tf.abs(self.Z - self.mags))

                # mag binary divergence loss
                self.loss_bd2 = tf.reduce_mean(
                    tf.nn.sigmoid_cross_entropy_with_logits(
                        logits=self.Z_logits, labels=self.mags))

                # total loss
                self.loss = self.loss_mags + self.loss_bd2

                tf.summary.scalar('train/loss_mags', self.loss_mags)
                tf.summary.scalar('train/loss_bd2', self.loss_bd2)
                tf.summary.image(
                    'train/mag_gt',
                    tf.expand_dims(tf.transpose(self.mags[:1], [0, 2, 1]), -1))
                tf.summary.image(
                    'train/mag_hat',
                    tf.expand_dims(tf.transpose(self.Z[:1], [0, 2, 1]), -1))

            # Training Scheme
            self.lr = learning_rate_decay(hp.lr, self.global_step)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
            tf.summary.scalar("lr", self.lr)

            ## gradient clipping
            self.gvs = self.optimizer.compute_gradients(self.loss)
            self.clipped = []
            for grad, var in self.gvs:
                grad = tf.clip_by_value(grad, -1., 1.)
                self.clipped.append((grad, var))
            # apply once, after every gradient has been clipped
            self.train_op = self.optimizer.apply_gradients(
                self.clipped, global_step=self.global_step)

            # Summary
            self.merged = tf.summary.merge_all()
Example #8
    def build_model(self):
        self.load_data_in_memory()
        self.add_data(reuse=self.reuse)

        # Get S or decoder inputs. (B, T//r, n_mels). This is audio shifted 1 frame to the right.
        self.S = tf.concat(
            (tf.zeros_like(self.mels[:, :1, :]), self.mels[:, :-1, :]), 1)

        # Build a latent representation of expressiveness if uee (unsupervised expressiveness embedding) is set in the config file
        # Using getattr so that configs without a uee entry fall through to the no-embedding case.
        if getattr(self.hp, 'uee', 0) != 0:
            with tf.variable_scope("Audio2Emo"):
                with tf.variable_scope("AudioEnc"):
                    self.emos = Audio2Emo(
                        self.hp,
                        self.S,
                        training=self.training,
                        speaker_codes=self.speakers,
                        reuse=self.reuse)  # (B, T/r, d=8)
                    self.emo_mean = tf.reduce_mean(self.emos, 1)
                    print(self.emo_mean.get_shape())
                    self.emo_mean = tf.expand_dims(self.emo_mean, axis=1)
                    print(self.emo_mean.get_shape())
        else:
            print('No unsupervised expressive embedding')
            self.emo_mean = None

        with tf.variable_scope("Text2Mel"):
            # Networks
            if self.hp.text_encoder_type == 'none':
                assert self.hp.merlin_label_dir
                self.K = self.V = self.merlin_label
            elif self.hp.text_encoder_type == 'minimal_feedforward':
                assert self.hp.merlin_label_dir
                #sys.exit('Not implemented: hp.text_encoder_type=="minimal_feedforward"')
                self.K = self.V = LinearTransformLabels(self.hp,
                                                        self.merlin_label,
                                                        training=self.training,
                                                        reuse=self.reuse)
            else:  ## default DCTTS text encoder
                with tf.variable_scope("TextEnc_emotional"):
                    self.K, self.V = TextEnc(self.hp,
                                             self.L,
                                             training=self.training,
                                             emos=self.emo_mean,
                                             speaker_codes=self.speakers,
                                             reuse=self.reuse)  # (N, Tx, e)

            with tf.variable_scope("AudioEnc"):
                if self.hp.history_type in [
                        'fractional_position_in_phone',
                        'absolute_position_in_phone'
                ]:
                    self.Q = self.position_in_phone
                elif self.hp.history_type == 'minimal_history':
                    sys.exit(
                        'Not implemented: hp.history_type=="minimal_history"')
                else:
                    assert self.hp.history_type == 'DCTTS_standard'
                    self.Q = AudioEnc(self.hp,
                                      self.S,
                                      training=self.training,
                                      speaker_codes=self.speakers,
                                      reuse=self.reuse)

            with tf.variable_scope("Attention"):
                # R: (B, T/r, 2d)
                # alignments: (B, N, T/r)
                # max_attentions: (B,)
                if self.hp.use_external_durations:
                    self.R, self.alignments, self.max_attentions = FixedAttention(
                        self.hp, self.durations, self.Q, self.V)

                elif self.mode == 'synthesize':
                    self.R, self.alignments, self.max_attentions = Attention(
                        self.hp,
                        self.Q,
                        self.K,
                        self.V,
                        monotonic_attention=True,
                        prev_max_attentions=self.prev_max_attentions)
                elif self.mode == 'train':
                    self.R, self.alignments, self.max_attentions = Attention(
                        self.hp,
                        self.Q,
                        self.K,
                        self.V,
                        monotonic_attention=False,
                        prev_max_attentions=self.prev_max_attentions)
                elif self.mode == 'generate_attention':
                    self.R, self.alignments, self.max_attentions = Attention(
                        self.hp,
                        self.Q,
                        self.K,
                        self.V,
                        monotonic_attention=False,
                        prev_max_attentions=None)

            with tf.variable_scope("AudioDec"):
                self.Y_logits, self.Y = AudioDec(
                    self.hp,
                    self.R,
                    training=self.training,
                    speaker_codes=self.speakers,
                    reuse=self.reuse)  # (B, T/r, n_mels)