def __init__(self, inp, inp_mask, decode_time_steps, ctr_flag, ctr_attention, hyper_params=None, name='Tacotron'):
        """
        Build the computational graph.
        :param inp:
        :param inp_mask:
        :param decode_time_steps:
        :param hyper_params:
        :param name:
        """
        super(Tacotron, self).__init__(name)
        self.hyper_params = HyperParams() if hyper_params is None else hyper_params

        with tf.variable_scope(name):
            self.global_step = tf.Variable(0, name='global_step', trainable=False)

            batch_size = tf.shape(inp)[0]
            input_time_steps = tf.shape(inp)[1]
            reduc = self.hyper_params.reduction_rate
            output_time_steps = decode_time_steps * reduc

            ### Encoder [begin]
            with tf.variable_scope('character_embedding'):
                embed_inp = EmbeddingLayer(self.hyper_params.embed_class, self.hyper_params.embed_dim)(inp)
            with tf.variable_scope("changeToVarible"):
                self.single_style_token = tf.get_variable('style_token', (1, self.hyper_params.styles_kind, self.hyper_params.style_dim), dtype=tf.float32)
                self.style_token = tf.tile(self.single_style_token, (batch_size, 1, 1))
            with tf.variable_scope('encoder_pre_net'):
                pre_ed_inp = tf.layers.dropout(tf.layers.dense(embed_inp, 256, tf.nn.relu), training=False)
                pre_ed_inp = tf.layers.dropout(tf.layers.dense(pre_ed_inp, 128, tf.nn.relu), training=False)
            encoder_output = modules.cbhg(pre_ed_inp, training=False, k=16, bank_filters=128,
                                          projection_filters=(128, 128), highway_layers=4, highway_units=128,
                                          bi_gru_units=128, sequence_length=inp_mask,
                                          name='encoder_cbhg', reuse=False)

            with tf.variable_scope('post_text'):
                all_outputs, _ = tf.nn.dynamic_rnn(cell=GRUCell(256), inputs=encoder_output, sequence_length=inp_mask,
                                                   dtype=encoder_output.dtype)
                all_outputs = tf.transpose(all_outputs, [1, 0, 2])
                static_encoder_output = all_outputs[-1]
            ### Encoder [end]

            ### Attention Module
            with tf.variable_scope('attention'):
                att_module = AttentionModule(256, encoder_output, sequence_length=inp_mask, time_major=False)
            with tf.variable_scope("attention_style"):
                att_module_style = AttentionModule(256, self.style_token, time_major=False)

            ### Decoder [begin]
            att_cell = GRUCell(256)
            dec_cell = MultiRNNCell([ResidualWrapper(GRUCell(256)) for _ in range(2)])
            with tf.variable_scope('prepare_decode'):
                # prepare output alpha TensorArray
                reduced_time_steps = tf.div(output_time_steps, reduc)
                init_att_cell_state = att_cell.zero_state(batch_size, tf.float32)
                init_dec_cell_state = dec_cell.zero_state(batch_size, tf.float32)
                init_state_tup = tuple([init_att_cell_state, init_dec_cell_state])
                init_output_ta = tf.TensorArray(size=reduced_time_steps, dtype=tf.float32)
                init_alpha_ta = tf.TensorArray(size=reduced_time_steps, dtype=tf.float32)
                init_weight_ta = tf.TensorArray(size=reduced_time_steps, dtype=tf.float32)
                init_weight_per_ta = tf.TensorArray(size=reduced_time_steps, dtype=tf.float32)
                init_alpha_style_ta = tf.TensorArray(size=reduced_time_steps, dtype=tf.float32)
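                # all-zero <GO> frame: the decoder's input at the very first step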
                go_array = tf.zeros([batch_size, self.hyper_params.seq2seq_dim], dtype=tf.float32)
                init_context = tf.zeros([batch_size, 256], dtype=tf.float32)
                init_context_style = tf.zeros([batch_size, 256], dtype=tf.float32)
                init_time = tf.constant(0, dtype=tf.int32)
            cond = lambda x, *_: tf.less(x, reduced_time_steps)
            def body(this_time, old_output_ta, old_alpha_ta, old_alpha_style_ta, old_weight_ta, old_weight_per_ta,
                     old_state_tup, last_context, last_context_style, last_output):
                with tf.variable_scope('decoder_pre_net'):
                    dec_pre_ed_inp = last_output
                    dec_pre_ed_inp = tf.layers.dropout(tf.layers.dense(dec_pre_ed_inp, 256, tf.nn.relu), training=False)
                    dec_pre_ed_inp = tf.layers.dropout(tf.layers.dense(dec_pre_ed_inp, 128, tf.nn.relu), training=False)
                with tf.variable_scope('attention_rnn'):
                    # dec_pre_ed_inp = tf.Print(dec_pre_ed_inp, [dec_pre_ed_inp[0]], message='dec', summarize=10)
                    att_cell_inp = tf.concat([last_context, dec_pre_ed_inp], axis=-1)
                    att_cell_out, att_cell_state = att_cell(att_cell_inp, old_state_tup[0])
                with tf.variable_scope('attention'):
                    query = att_cell_state[0]
                    context, alpha = att_module(query)
                    new_alpha_ta = old_alpha_ta.write(this_time, alpha)
                with tf.variable_scope("attention_style"):
                    query_style = att_cell_state[0]
                    context_style, alpha_style = att_module_style(query_style)
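                    # when ctr_flag == 1, replace the learned style attention with the
                    # externally supplied ctr_attention weights and recompute the context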
                    alpha_style = tf.cond(tf.equal(ctr_flag, 1), lambda: ctr_attention, lambda: alpha_style)
                    alpha_style = tf.Print(alpha_style, [alpha_style], message='alpha:', summarize=10)
                    context_style = tf.cond(tf.equal(ctr_flag, 1),
                                            lambda: tf.reduce_sum(tf.expand_dims(alpha_style, axis=-1) * self.style_token, axis=1),
                                            lambda: context_style)
                    context_style = tf.Print(context_style, [context_style], message='style:', summarize=10)
                    # alpha_style = ctr_attention
                    # alpha_style = tf.Print(alpha_style, [alpha_style], message='alpha', summarize=20)
                    # context_style = tf.reduce_sum(tf.expand_dims(alpha_style, axis=-1) * self.style_token, axis=1)
                    # context_style = tf.Print(context_style, [context_style], message='ctxt_style', summarize=20)
                    new_alpha_style_ta = old_alpha_style_ta.write(this_time, alpha_style)
                with tf.variable_scope("weighting"):
                    weight_input = tf.concat([static_encoder_output, dec_pre_ed_inp], axis=-1)
                    weighting = tf.layers.dense(weight_input, 2, tf.nn.sigmoid)
                    # weighting = tf.Print(weighting, [weighting[1]], message='weighting')
                    weighting = tf.nn.softmax(weighting)
                    weight_text, weight_style = tf.split(weighting, [1, 1], -1)
                    # weight_text = tf.Print(weight_text, [weight_text], message='weight_text:', summarize=20)
                    weight_style = tf.Print(weight_style, [weight_style], message='weight_style:')
                    new_weight_ta = old_weight_ta.write(this_time, weight_text)
                with tf.variable_scope('decoder_rnn'):
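                    # blend the text and style contexts with the learned scalar weights;
                    # weight_per tracks the style context's share of the mixture magnitude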
                    weighting_context = weight_text * context + weight_style * context_style
                    weight_per = tf.reduce_mean(tf.abs(weight_style * context_style) / (
                            tf.abs(weight_text * context) + tf.abs(weight_style * context_style)))
                    new_weight_per_ta = old_weight_per_ta.write(this_time, weight_per)
                    dec_input = tf.layers.dense(tf.concat([att_cell_out, weighting_context], axis=-1), 256)
                    # dec_input = tf.layers.dense(tf.concat([att_cell_out, context], axis=-1), 256)
                    dec_cell_out, dec_cell_state = dec_cell(dec_input, old_state_tup[1])
                    dense_out = tf.layers.dense(dec_cell_out, self.hyper_params.seq2seq_dim * reduc)
                    new_output_ta = old_output_ta.write(this_time, dense_out)
                    new_output = dense_out[:, -self.hyper_params.seq2seq_dim:]
                new_state_tup = tuple([att_cell_state, dec_cell_state])
                return tf.add(this_time, 1), new_output_ta, new_alpha_ta, new_alpha_style_ta, new_weight_ta,\
                       new_weight_per_ta, new_state_tup, context, context_style, new_output


            # run loop
            _, seq2seq_output_ta, alpha_ta, alpha_style_ta, weight_ta, weight_per_ta, *_ = tf.while_loop(cond, body, [init_time,
                                                                                                                      init_output_ta,
                                                                                                                      init_alpha_ta,
                                                                                                                      init_alpha_style_ta,
                                                                                                                      init_weight_ta,
                                                                                                                      init_weight_per_ta,
                                                                                                                      init_state_tup,
                                                                                                                      init_context,
                                                                                                                      init_context_style,
                                                                                                                      go_array
                                                                                                                      ])
            with tf.variable_scope('reshape_decode'):
                seq2seq_output = tf.reshape(seq2seq_output_ta.stack(),
                                            shape=(reduced_time_steps, batch_size, self.hyper_params.seq2seq_dim * reduc))
                seq2seq_output = tf.reshape(tf.transpose(seq2seq_output, perm=(1, 0, 2)),
                                            shape=(batch_size, output_time_steps, self.hyper_params.seq2seq_dim))
                self.seq2seq_output = seq2seq_output

                alpha_output = tf.reshape(alpha_ta.stack(),
                                          shape=(reduced_time_steps, batch_size, input_time_steps))
                alpha_output = tf.expand_dims(tf.transpose(alpha_output, perm=(1, 0, 2)), -1)
                self.alpha_output = alpha_output

                alpha_output_style = tf.reshape(alpha_style_ta.stack(),
                                                shape=(reduced_time_steps, batch_size, self.hyper_params.styles_kind))
                alpha_output_style = tf.expand_dims(tf.transpose(alpha_output_style, perm=(1, 0, 2)), -1)  # batch major
                self.alpha_output_style = alpha_output_style

                weight_ta = tf.reshape(weight_ta.stack(), shape=(reduced_time_steps, batch_size, 1))
                weight_ta = tf.transpose(weight_ta, perm=(1, 0, 2))
                self.weight_ta = weight_ta

                weight_per_ta = tf.reshape(weight_per_ta.stack(), shape=(reduced_time_steps, 1))
                self.weight_per_ta = weight_per_ta
            ### Decoder [end]

            ### PostNet [begin]
            post_output = modules.cbhg(seq2seq_output, training=False, k=8, bank_filters=128,
                                       projection_filters=(256, self.hyper_params.seq2seq_dim),
                                       highway_layers=4, highway_units=128,
                                       bi_gru_units=128, sequence_length=None,
                                       name='decoder_cbhg', reuse=False)
            post_output = tf.layers.dense(post_output, self.hyper_params.post_dim, name='post_linear_transform')
            self.post_output = post_output
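
# Usage sketch (an assumption, not part of the original example; it relies on
# the module's TensorFlow 1.x imports): driving the inference constructor above
# with manual style control. Placeholder shapes and decode_time_steps=200 are
# hypothetical choices.
inp_ph = tf.placeholder(tf.int32, shape=(None, None), name='inp')              # character ids
mask_ph = tf.placeholder(tf.int32, shape=(None,), name='inp_mask')             # true input lengths
flag_ph = tf.placeholder(tf.int32, shape=(), name='ctr_flag')                  # 1 => use manual weights
att_ph = tf.placeholder(tf.float32, shape=(None, None), name='ctr_attention')  # (batch, styles_kind)

model = Tacotron(inp_ph, mask_ph, decode_time_steps=200,
                 ctr_flag=flag_ph, ctr_attention=att_ph)
# model.post_output: (batch, 200 * reduction_rate, post_dim) predicted frames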

# Example 2

    def __init__(self,
                 inp,
                 inp_mask,
                 seq2seq_gtruth,
                 post_gtruth,
                 hyper_params=None,
                 training=True,
                 name='Tacotron',
                 reuse=False):
        """
        Build the computational graph.
        :param inp:
        :param inp_mask:
        :param seq2seq_gtruth:
        :param post_gtruth:
        :param hyper_params:
        :param training:
        :param name:
        """
        super(Tacotron, self).__init__(name)
        self.hyper_params = HyperParams(
        ) if hyper_params is None else hyper_params
        with tf.variable_scope(name, reuse=reuse):
            self.global_step = tf.Variable(0,
                                           name='global_step',
                                           trainable=False)
            self.learning_rate = tf.Variable(
                self.hyper_params.learning_rate[0],
                name='learning_rate',
                trainable=False,
                dtype=tf.float32)

            batch_size = tf.shape(inp)[0]
            input_time_steps = tf.shape(inp)[1]
            output_time_steps = tf.shape(seq2seq_gtruth)[1]

            ### Encoder [begin]
            with tf.variable_scope('character_embedding'):
                embed_inp = EmbeddingLayer(self.hyper_params.embed_class,
                                           self.hyper_params.embed_dim)(inp)
            with tf.variable_scope("changeToVarible"):
                self.single_style_token = tf.get_variable(
                    'style_token', (1, self.hyper_params.styles_kind,
                                    self.hyper_params.style_dim),
                    dtype=tf.float32)
                self.style_token = tf.tile(self.single_style_token,
                                           (batch_size, 1, 1))
            with tf.variable_scope('encoder_pre_net'):
                pre_ed_inp = tf.layers.dropout(tf.layers.dense(
                    embed_inp, 256, tf.nn.relu),
                                               training=training)
                pre_ed_inp = tf.layers.dropout(tf.layers.dense(
                    pre_ed_inp, 128, tf.nn.relu),
                                               training=training)
            encoder_output = modules.cbhg(pre_ed_inp,
                                          training=training,
                                          k=16,
                                          bank_filters=128,
                                          projection_filters=(128, 128),
                                          highway_layers=4,
                                          highway_units=128,
                                          bi_gru_units=128,
                                          sequence_length=inp_mask,
                                          name='encoder_cbhg',
                                          reuse=False)
            ### Encoder [end]

            ### Attention Module
            with tf.variable_scope('attention'):
                att_module = AttentionModule(256,
                                             encoder_output,
                                             sequence_length=inp_mask,
                                             time_major=False)
            with tf.variable_scope("attention_style"):
                att_module_style = AttentionModule(256,
                                                   self.style_token,
                                                   time_major=False)

            ### Decoder [begin]
            att_cell = GRUCell(256)
            dec_cell = MultiRNNCell(
                [ResidualWrapper(GRUCell(256)) for _ in range(2)])
            # prepare output alpha TensorArray
            with tf.variable_scope('prepare_decode'):
                reduc = self.hyper_params.reduction_rate
                reduced_time_steps = tf.div(output_time_steps, reduc)
                init_att_cell_state = att_cell.zero_state(
                    batch_size, tf.float32)
                init_dec_cell_state = dec_cell.zero_state(
                    batch_size, tf.float32)
                init_state_tup = tuple(
                    [init_att_cell_state, init_dec_cell_state])
                init_output_ta = tf.TensorArray(size=reduced_time_steps,
                                                dtype=tf.float32)
                init_alpha_ta = tf.TensorArray(size=reduced_time_steps,
                                               dtype=tf.float32)
                init_weight_ta = tf.TensorArray(size=reduced_time_steps,
                                                dtype=tf.float32)
                init_weight_per_ta = tf.TensorArray(size=reduced_time_steps,
                                                    dtype=tf.float32)
                init_alpha_style_ta = tf.TensorArray(size=reduced_time_steps,
                                                     dtype=tf.float32)
                time_major_seq2seq_gtruth = tf.transpose(seq2seq_gtruth,
                                                         perm=(1, 0, 2))
                indic_array = tf.concat([
                    tf.zeros([
                        reduc, batch_size, self.hyper_params.seq2seq_dim
                    ]), time_major_seq2seq_gtruth
                ],
                                        axis=0)
                init_context = tf.zeros([batch_size, 256], dtype=tf.float32)
                init_context_style = tf.zeros([batch_size, 256],
                                              dtype=tf.float32)
                init_time = tf.constant(0, dtype=tf.int32)
            cond = lambda x, *_: tf.less(x, reduced_time_steps)

            def body(this_time, old_context, old_context_style, old_output_ta,
                     old_alpha_ta, old_alpha_style_ta, old_weight_ta,
                     old_weight_per_ta, old_state_tup):
                with tf.variable_scope('decoder_pre_net'):
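                    # teacher forcing: feed the last ground-truth frame of the previous
                    # reduced group (reduc frames are emitted per decoder step)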
                    dec_pre_ed_inp = indic_array[reduc * this_time + reduc - 1]
                    dec_pre_ed_inp = tf.layers.dropout(tf.layers.dense(
                        dec_pre_ed_inp, 256, tf.nn.relu),
                                                       training=training)
                    dec_pre_ed_inp = tf.layers.dropout(tf.layers.dense(
                        dec_pre_ed_inp, 128, tf.nn.relu),
                                                       training=training)
                with tf.variable_scope('attention_rnn'):
                    att_cell_inp = tf.concat(
                        [old_context, old_context_style, dec_pre_ed_inp],
                        axis=-1)
                    att_cell_out, att_cell_state = att_cell(
                        att_cell_inp, old_state_tup[0])
                with tf.variable_scope('attention'):
                    query = att_cell_state[0]
                    context, alpha = att_module(query)
                    new_alpha_ta = old_alpha_ta.write(this_time, alpha)
                with tf.variable_scope("attention_style"):
                    query_style = att_cell_state[0]
                    context_style, alpha_style = att_module_style(query_style)
                    new_alpha_style_ta = old_alpha_style_ta.write(
                        this_time, alpha_style)
                with tf.variable_scope("weighting"):
                    weighting = tf.layers.dense(dec_pre_ed_inp, 1,
                                                tf.nn.sigmoid)
                    # weighting = tf.nn.softmax(weighting)
                    new_weight_ta = old_weight_ta.write(this_time, weighting)
                with tf.variable_scope('decoder_rnn'):
                    weighting_context = weighting * context + (
                        1 - weighting) * context_style
                    weight_per = tf.reduce_mean(
                        tf.abs((1 - weighting) * context_style) /
                        (tf.abs(weighting * context) + tf.abs(
                            (1 - weighting) * context_style)))
                    new_weight_per_ta = old_weight_per_ta.write(
                        this_time, weight_per)
                    dec_input = tf.layers.dense(
                        tf.concat([att_cell_out, weighting_context], axis=-1),
                        256)
                    dec_cell_out, dec_cell_state = dec_cell(
                        dec_input, old_state_tup[1])
                    dense_out = tf.layers.dense(
                        dec_cell_out, self.hyper_params.seq2seq_dim * reduc)
                    new_output_ta = old_output_ta.write(this_time, dense_out)
                new_state_tup = tuple([att_cell_state, dec_cell_state])
                return tf.add(
                    this_time, 1
                ), context, context_style, new_output_ta, new_alpha_ta, new_alpha_style_ta, new_weight_ta, new_weight_per_ta, new_state_tup

            # run loop
            _, _, _, seq2seq_output_ta, alpha_ta, alpha_style_ta, weight_ta, weight_per_ta, *_ = tf.while_loop(
                cond,
                body, [
                    init_time, init_context, init_context_style,
                    init_output_ta, init_alpha_ta, init_alpha_style_ta,
                    init_weight_ta, init_weight_per_ta, init_state_tup
                ],
                parallel_iterations=32)

            with tf.variable_scope('reshape_decode'):
                seq2seq_output = tf.reshape(
                    seq2seq_output_ta.stack(),
                    shape=(reduced_time_steps, batch_size,
                           self.hyper_params.seq2seq_dim * reduc))
                seq2seq_output = tf.reshape(
                    tf.transpose(seq2seq_output, perm=(1, 0, 2)),
                    shape=(batch_size, output_time_steps,
                           self.hyper_params.seq2seq_dim))
                self.seq2seq_output = seq2seq_output

                alpha_output = tf.reshape(alpha_ta.stack(),
                                          shape=(reduced_time_steps,
                                                 batch_size, input_time_steps))
                alpha_output = tf.expand_dims(
                    tf.transpose(alpha_output, perm=(1, 0, 2)), -1)
                self.alpha_output = alpha_output

                alpha_output_style = tf.reshape(
                    alpha_style_ta.stack(),
                    shape=(reduced_time_steps, batch_size,
                           self.hyper_params.styles_kind))
                alpha_output_style = tf.expand_dims(
                    tf.transpose(alpha_output_style, perm=(1, 0, 2)),
                    -1)  # batch major
                self.alpha_output_style = alpha_output_style

                weight_ta = tf.reshape(weight_ta.stack(),
                                       shape=(reduced_time_steps, batch_size,
                                              1))
                weight_ta = tf.transpose(weight_ta, perm=(1, 0, 2))
                self.weight_ta = weight_ta

                weight_per_ta = tf.reshape(weight_per_ta.stack(),
                                           shape=(reduced_time_steps, 1))
                self.weight_per_ta = weight_per_ta
            ### Decoder [end]

            ### PostNet [begin]
            post_output = modules.cbhg(
                seq2seq_output,
                training=training,
                k=8,
                bank_filters=128,
                projection_filters=(256, self.hyper_params.seq2seq_dim),
                highway_layers=4,
                highway_units=128,
                bi_gru_units=128,
                sequence_length=None,
                name='decoder_cbhg',
                reuse=False)
            post_output = tf.layers.dense(post_output,
                                          self.hyper_params.post_dim,
                                          name='post_linear_transform')
            self.post_output = post_output
            ### PostNet [end]

        ### Loss
        with tf.variable_scope('loss'):
            self.seq2seq_loss = l1_loss(seq2seq_gtruth, seq2seq_output)
            self.post_loss = l1_loss(post_gtruth, post_output)
            self.loss = self.seq2seq_loss + self.post_loss
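
# Usage sketch (an assumption, not part of the original example): attaching an
# optimizer to the training graph above. Adam and the feature dimensions
# (80 mel bins, 1025 linear bins) are hypothetical choices.
inp_ph = tf.placeholder(tf.int32, (None, None))
mask_ph = tf.placeholder(tf.int32, (None,))
mel_gt_ph = tf.placeholder(tf.float32, (None, None, 80))     # seq2seq ground truth
lin_gt_ph = tf.placeholder(tf.float32, (None, None, 1025))   # post-net ground truth

model = Tacotron(inp_ph, mask_ph, mel_gt_ph, lin_gt_ph, training=True)
train_op = tf.train.AdamOptimizer(model.learning_rate).minimize(
    model.loss, global_step=model.global_step)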

# Example 3

    def build(self, inp, inp_mask, mel_gtruth, spec_gtruth):
        batch_size = tf.shape(inp)[0]
        input_time_steps = tf.shape(inp)[1]
        output_time_steps = tf.shape(mel_gtruth)[1]

        ### Encoder [ begin ]
        with tf.variable_scope("encoder"):

            with tf.variable_scope("embedding"):
                embed_inp = EmbeddingLayer(EMBED_CLASS, EMBED_DIM)(inp)

            with tf.variable_scope("changeToVarible"):

                self.single_style_token = tf.get_variable(
                    'style_token', (1, styles_kind, style_dim),
                    dtype=tf.float32)
                self.style_token = tf.tile(self.single_style_token,
                                           (batch_size, 1, 1))

            with tf.variable_scope("pre-net"):
                pre_ed_inp = tf.layers.dropout(tf.layers.dense(
                    embed_inp, 256, tf.nn.relu),
                                               training=self.training)
                pre_ed_inp = tf.layers.dropout(tf.layers.dense(
                    pre_ed_inp, 128, tf.nn.relu),
                                               training=self.training)

            with tf.variable_scope("CBHG"):
                # batch major
                encoder_output = CBHG(16,
                                      (128, 128))(pre_ed_inp,
                                                  sequence_length=inp_mask,
                                                  is_training=self.training,
                                                  time_major=False)

        with tf.variable_scope("attention"):
            att_module = AttentionModule(ATT_RNN_SIZE,
                                         encoder_output,
                                         sequence_length=inp_mask,
                                         time_major=False)
        with tf.variable_scope("attention_style"):
            att_module_style = AttentionModule(STYLE_ATT_RNN_SIZE,
                                               self.style_token,
                                               time_major=False)

        with tf.variable_scope("decoder"):
            with tf.variable_scope("attentionRnn"):
                att_cell = GRUCell(ATT_RNN_SIZE)
            with tf.variable_scope("acoustic_module"):
                aco_cell = MultiRNNCell(
                    [ResidualWrapper(GRUCell(DEC_RNN_SIZE)) for _ in range(2)])

            ### prepare output alpha TensorArray
            reduced_time_steps = tf.div(output_time_steps, self.r)
            att_cell_state = att_cell.zero_state(batch_size, tf.float32)
            aco_cell_state = aco_cell.zero_state(batch_size, tf.float32)
            state_tup = tuple([att_cell_state, aco_cell_state])
            output_ta = tf.TensorArray(size=reduced_time_steps,
                                       dtype=tf.float32)
            alpha_ta = tf.TensorArray(size=reduced_time_steps,
                                      dtype=tf.float32)
            weight_ta = tf.TensorArray(size=reduced_time_steps,
                                       dtype=tf.float32)
            alpha_style_ta = tf.TensorArray(size=reduced_time_steps,
                                            dtype=tf.float32)
            indic_ta = tf.TensorArray(size=self.r + output_time_steps,
                                      dtype=tf.float32)
            time_major_mel_gtruth = tf.transpose(mel_gtruth, perm=(1, 0, 2))
            indic_array = tf.concat([
                tf.zeros([self.r, batch_size, OUTPUT_MEL_DIM]),
                time_major_mel_gtruth
            ],
                                    axis=0)
            indic_ta = indic_ta.unstack(indic_array)
            #init_context = tf.zeros((batch_size, 256))

            time = tf.constant(0, dtype=tf.int32)
            cond = lambda time, *_: tf.less(time, reduced_time_steps)

            def body(time, output_ta, alpha_ta, alpha_style_ta, weight_ta,
                     state_tup):
                with tf.variable_scope("att-rnn"):
                    pre_ed_indic = tf.layers.dropout(tf.layers.dense(
                        indic_ta.read(self.r * time + self.r - 1), 256,
                        tf.nn.relu),
                                                     training=self.training)
                    pre_ed_indic = tf.layers.dropout(tf.layers.dense(
                        pre_ed_indic, 128, tf.nn.relu),
                                                     training=self.training)
                    att_cell_out, att_cell_state = att_cell(
                        tf.concat([pre_ed_indic], axis=-1), state_tup[0])
                with tf.variable_scope("attention"):
                    query = att_cell_state[0]  # att_cell_out
                    context, alpha = att_module(query)
                    alpha_ta = alpha_ta.write(time, alpha)
                with tf.variable_scope("attention_style"):
                    context_style, alpha_style = att_module_style(query)
                    alpha_style_ta = alpha_style_ta.write(time, alpha_style)
                with tf.variable_scope("weighting"):
                    weighting = add_layer(query,
                                          query.shape[-1],
                                          1,
                                          'weighting_w',
                                          'weighting_b',
                                          activation_function=tf.nn.sigmoid)
                    # weighting = tf.nn.softmax(weighting)
                    weight_ta = weight_ta.write(time, weighting)

                with tf.variable_scope("acoustic_module"):
                    # weighting0 = tf.reshape(weighting[:, 0], (BATCH_SIZE, 1))
                    # weighting1 = tf.reshape(weighting[:, 1], (BATCH_SIZE, 1))
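                    # residual combination: add a gated, tanh-squashed style context
                    # on top of the text context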
                    weighting_context = context + weighting * tf.nn.tanh(
                        context_style)
                    # print(weighting_context)
                    aco_input = tf.layers.dense(
                        tf.concat([att_cell_out, weighting_context], axis=-1),
                        DEC_RNN_SIZE)
                    aco_cell_out, aco_cell_state = aco_cell(
                        aco_input, state_tup[1])
                    dense_out = tf.layers.dense(aco_cell_out,
                                                OUTPUT_MEL_DIM * self.r)
                    output_ta = output_ta.write(time, dense_out)
                state_tup = tuple([att_cell_state, aco_cell_state])

                return tf.add(
                    time, 1
                ), output_ta, alpha_ta, alpha_style_ta, weight_ta, state_tup

            ### run loop
            _, output_mel_ta, final_alpha_ta, final_alpha_style_ta, final_weight_ta, *_ = tf.while_loop(
                cond, body, [
                    time, output_ta, alpha_ta, alpha_style_ta, weight_ta,
                    state_tup
                ])
        ### time major
        with tf.variable_scope("output"):
            output_mel = tf.reshape(output_mel_ta.stack(),
                                    shape=(reduced_time_steps, batch_size,
                                           OUTPUT_MEL_DIM * self.r))
            output_mel = tf.reshape(tf.transpose(output_mel, perm=(1, 0, 2)),
                                    shape=(batch_size, output_time_steps,
                                           OUTPUT_MEL_DIM))
            self.out_mel = output_mel

            with tf.variable_scope("post-net"):
                output_post = CBHG(8, (256, OUTPUT_MEL_DIM))(
                    output_mel,
                    sequence_length=None,
                    is_training=self.training,
                    time_major=False)
                output_spec = tf.layers.dense(output_post, OUTPUT_SPEC_DIM)
                self.out_stftm = output_spec

            final_alpha = tf.reshape(final_alpha_ta.stack(),
                                     shape=(reduced_time_steps, batch_size,
                                            input_time_steps))
            final_alpha = tf.transpose(final_alpha,
                                       perm=(1, 0, 2))  # batch major

            final_alpha_style = tf.reshape(final_alpha_style_ta.stack(),
                                           shape=(reduced_time_steps,
                                                  batch_size, styles_kind))
            final_alpha_style = tf.transpose(final_alpha_style,
                                             perm=(1, 0, 2))  # batch major

            final_weight_ta = tf.reshape(final_weight_ta.stack(),
                                         shape=(reduced_time_steps, batch_size,
                                                1))
            final_weight_ta = tf.transpose(final_weight_ta,
                                           perm=(1, 0, 2))  # batch major
            self.weighting = final_weight_ta

            # self.alpha_style_hjk_img = tf.reshape(final_alpha_style, shape=(batch_size, reduced_time_steps, styles_kind))

        with tf.variable_scope("loss_and_metric"):
            self.loss_mel = tf.reduce_mean(tf.abs(mel_gtruth - output_mel))
            self.loss_spec = tf.reduce_mean(tf.abs(spec_gtruth - output_spec))
            self.loss = self.loss_mel + self.loss_spec
            self.alpha_img = tf.expand_dims(final_alpha, -1)
            self.alpha_style_img = tf.expand_dims(final_alpha_style, -1)
            self.weight_img = tf.expand_dims(final_weight_ta, -1)

            self.sums = []
            self.sums.append(
                tf.summary.image("train/alpha", self.alpha_img[:2]))
            self.sums.append(
                tf.summary.image("train/alpha_style",
                                 self.alpha_style_img[:2]))
            self.sums.append(
                tf.summary.image("train/weight", self.weight_img[:2]))
            self.sums.append(tf.summary.scalar("train/loss", self.loss))
            self.sums.append(
                tf.summary.scalar("train/style_0_0",
                                  self.single_style_token[0][0][0]))
            self.sums.append(
                tf.summary.scalar("train/style_0_100",
                                  self.single_style_token[0][0][100]))
            self.sums.append(
                tf.summary.scalar("train/style_5_100",
                                  self.single_style_token[0][5][100]))
            self.sums.append(
                tf.summary.histogram("train/style_vec",
                                     self.single_style_token))

            self.pred_audio_holder = tf.placeholder(shape=(None, None),
                                                    dtype=tf.float32,
                                                    name='pred_audio')
            self.pred_audio_summary = tf.summary.audio('pred_audio_summary',
                                                       self.pred_audio_holder,
                                                       sample_rate=sr,
                                                       max_outputs=12)
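
# Usage sketch (an assumption, not part of the original example): calling
# build() on an instance of the surrounding model class and merging the
# summaries it collects. `MyTacotron` and the placeholders are hypothetical;
# OUTPUT_MEL_DIM and OUTPUT_SPEC_DIM are the module constants used above.
inp_ph = tf.placeholder(tf.int32, (None, None))
mask_ph = tf.placeholder(tf.int32, (None,))
mel_ph = tf.placeholder(tf.float32, (None, None, OUTPUT_MEL_DIM))
spec_ph = tf.placeholder(tf.float32, (None, None, OUTPUT_SPEC_DIM))

model = MyTacotron()                        # hypothetical: the class defining build()
model.build(inp_ph, mask_ph, mel_ph, spec_ph)
summary_op = tf.summary.merge(model.sums)   # images, scalars and histograms from build()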

# Example 4

    def build(self, inp, inp_mask):

        batch_size = tf.shape(inp)[0]
        input_time_steps = tf.shape(inp)[1]

        ### Encoder [ begin ]
        with tf.variable_scope("encoder"):
            with tf.variable_scope("embedding"):
                embed_inp = EmbeddingLayer(EMBED_CLASS, EMBED_DIM)(inp)

            with tf.variable_scope("changeToVarible"):
                self.single_style_token = tf.get_variable(
                    'style_token', (1, styles_kind, style_dim),
                    dtype=tf.float32)
                self.style_token = tf.tile(self.single_style_token,
                                           (batch_size, 1, 1))

            with tf.variable_scope("pre-net"):
                pre_ed_inp = tf.layers.dropout(tf.layers.dense(
                    embed_inp, 256, tf.nn.relu),
                                               training=self.training)
                pre_ed_inp = tf.layers.dropout(tf.layers.dense(
                    pre_ed_inp, 128, tf.nn.relu),
                                               training=self.training)

            with tf.variable_scope("CBHG"):
                # batch major
                encoder_output = CBHG(16,
                                      (128, 128))(pre_ed_inp,
                                                  sequence_length=inp_mask,
                                                  is_training=self.training,
                                                  time_major=False)

        with tf.variable_scope("attention"):
            att_module = AttentionModule(ATT_RNN_SIZE,
                                         encoder_output,
                                         sequence_length=inp_mask,
                                         time_major=False)
        with tf.variable_scope("attention_style"):
            att_module_style = AttentionModule(STYLE_ATT_RNN_SIZE,
                                               self.style_token,
                                               time_major=False)

        with tf.variable_scope("decoder"):
            with tf.variable_scope("attentionRnn"):
                att_cell = GRUCell(ATT_RNN_SIZE)
            with tf.variable_scope("acoustic_module"):
                aco_cell = MultiRNNCell(
                    [ResidualWrapper(GRUCell(DEC_RNN_SIZE)) for _ in range(2)])

            ### prepare output alpha TensorArray
            reduced_time_steps = tf.div(MAX_OUT_STEPS, self.r)
            att_cell_state = att_cell.zero_state(batch_size, tf.float32)
            aco_cell_state = aco_cell.zero_state(batch_size, tf.float32)
            state_tup = tuple([att_cell_state, aco_cell_state])
            output_ta = tf.TensorArray(size=reduced_time_steps,
                                       dtype=tf.float32)
            alpha_ta = tf.TensorArray(size=reduced_time_steps,
                                      dtype=tf.float32)
            weight_ta = tf.TensorArray(size=reduced_time_steps,
                                       dtype=tf.float32)
            alpha_style_ta = tf.TensorArray(size=reduced_time_steps,
                                            dtype=tf.float32)
            init_indic = tf.zeros([batch_size, OUTPUT_MEL_DIM])
            # init_context = tf.zeros((batch_size, 256))

            time = tf.constant(0, dtype=tf.int32)
            cond = lambda time, *_: tf.less(time, reduced_time_steps)

            def body(time, indic, output_ta, alpha_ta, alpha_style_ta,
                     weight_ta, state_tup):
                with tf.variable_scope("att-rnn"):
                    pre_ed_indic = tf.layers.dropout(tf.layers.dense(
                        indic, 256, tf.nn.relu),
                                                     training=self.training)
                    pre_ed_indic = tf.layers.dropout(tf.layers.dense(
                        pre_ed_indic, 128, tf.nn.relu),
                                                     training=self.training)
                    att_cell_out, att_cell_state = att_cell(
                        tf.concat([pre_ed_indic], axis=-1), state_tup[0])
                with tf.variable_scope("attention"):
                    query = att_cell_state[0]  # att_cell_out
                    context, alpha = att_module(query)
                    alpha_ta = alpha_ta.write(time, alpha)
                with tf.variable_scope("attention_style"):
                    context_style, alpha_style = att_module_style(query)
                    alpha_style_ta = alpha_style_ta.write(time, alpha_style)
                with tf.variable_scope("weighting"):
                    weighting = add_layer(query,
                                          query.shape[-1],
                                          1,
                                          'weighting_w',
                                          'weighting_b',
                                          activation_function=tf.nn.sigmoid)

                    # weighting = tf.nn.softmax(weighting)
                    weight_ta = weight_ta.write(time, weighting)
                with tf.variable_scope("acoustic_module"):
                    # weighting0 = tf.reshape(weighting[:, 0], (BATCH_SIZE, 1))
                    # weighting1 = tf.reshape(weighting[:, 1], (BATCH_SIZE, 1))
                    # weighting_context = weighting0 * context + weighting1 * context_style
                    # print('context:', context)
                    weighting = tf.Print(weighting, [weighting],
                                         message='weight',
                                         summarize=100)
                    context_style = tf.Print(context_style,
                                             [context_style[0][0:5]],
                                             message='origal_style',
                                             summarize=100)
                    context_style = tf.Print(
                        context_style, [tf.nn.tanh(context_style)[0][0:5]],
                        message='tanh_style',
                        summarize=100)
                    context = tf.Print(context, [context[0][0:5]],
                                       message='context',
                                       summarize=100)

                    weighting_context = context + weighting * tf.nn.tanh(
                        context_style)

                    aco_input = tf.layers.dense(
                        tf.concat([att_cell_out, weighting_context], axis=-1),
                        DEC_RNN_SIZE)
                    aco_cell_out, aco_cell_state = aco_cell(
                        aco_input, state_tup[1])
                    dense_out = tf.reshape(
                        tf.layers.dense(aco_cell_out, OUTPUT_MEL_DIM * self.r),
                        shape=(batch_size, self.r, OUTPUT_MEL_DIM))
                    output_ta = output_ta.write(time, dense_out)
                    new_indic = dense_out[:, -1]
                state_tup = tuple([att_cell_state, aco_cell_state])

                return tf.add(
                    time, 1
                ), new_indic, output_ta, alpha_ta, alpha_style_ta, weight_ta, state_tup

            ### run loop
            _, _, output_mel_ta, final_alpha_ta, final_alpha_style_ta, final_weight_ta, *_ = tf.while_loop(
                cond, body, [
                    time, init_indic, output_ta, alpha_ta, alpha_style_ta,
                    weight_ta, state_tup
                ])

        ### time major
        with tf.variable_scope("output"):
            output_mel = tf.reshape(output_mel_ta.stack(),
                                    shape=(reduced_time_steps, batch_size,
                                           OUTPUT_MEL_DIM * self.r))
            output_mel = tf.reshape(tf.transpose(output_mel, perm=(1, 0, 2)),
                                    shape=(batch_size, MAX_OUT_STEPS,
                                           OUTPUT_MEL_DIM))
            self.out_mel = output_mel

            with tf.variable_scope("post-net"):
                output_post = CBHG(8, (256, OUTPUT_MEL_DIM))(
                    output_mel,
                    sequence_length=None,
                    is_training=self.training,
                    time_major=False)
                output_spec = tf.layers.dense(output_post, OUTPUT_SPEC_DIM)
                self.out_stftm = output_spec

            final_alpha = tf.reshape(final_alpha_ta.stack(),
                                     shape=(reduced_time_steps, batch_size,
                                            input_time_steps))
            self.final_alpha = tf.transpose(final_alpha,
                                            perm=(1, 0, 2))  # batch major

            final_alpha_style = tf.reshape(final_alpha_style_ta.stack(),
                                           shape=(reduced_time_steps,
                                                  batch_size, styles_kind))
            self.final_alpha_style = tf.transpose(final_alpha_style,
                                                  perm=(1, 0,
                                                        2))  # batch major

            final_weight_ta = tf.reshape(final_weight_ta.stack(),
                                         shape=(reduced_time_steps, batch_size,
                                                1))
            self.final_weight_ta = tf.transpose(final_weight_ta,
                                                perm=(1, 0, 2))  # batch major
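
# Usage sketch (an assumption, not part of the original example): free-running
# synthesis with the build() above. Decoding always emits MAX_OUT_STEPS frames
# because no stop token is predicted; `model` is a hypothetical instance of the
# surrounding class.
inp_ph = tf.placeholder(tf.int32, (None, None))
mask_ph = tf.placeholder(tf.int32, (None,))
model.build(inp_ph, mask_ph)
# model.out_mel:   (batch, MAX_OUT_STEPS, OUTPUT_MEL_DIM) mel frames
# model.out_stftm: linear-spectrogram magnitudes, e.g. for Griffin-Lim inversion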

# Example 5

    def __init__(self,
                 inp,
                 inp_mask,
                 decode_time_steps,
                 hyper_params=None,
                 name='Tacotron'):
        """
        Build the computational graph.
        :param inp:
        :param inp_mask:
        :param decode_time_steps:
        :param hyper_params:
        :param name:
        """
        super(Tacotron, self).__init__(name)
        self.hyper_params = HyperParams(
        ) if hyper_params is None else hyper_params

        with tf.variable_scope(name):
            self.global_step = tf.Variable(0,
                                           name='global_step',
                                           trainable=False)

            batch_size = tf.shape(inp)[0]
            input_time_steps = tf.shape(inp)[1]
            reduc = self.hyper_params.reduction_rate
            output_time_steps = decode_time_steps * reduc

            ### Encoder [begin]
            with tf.variable_scope('character_embedding'):
                embed_inp = EmbeddingLayer(self.hyper_params.embed_class,
                                           self.hyper_params.embed_dim)(inp)
            with tf.variable_scope('encoder_pre_net'):
                pre_ed_inp = tf.layers.dropout(tf.layers.dense(
                    embed_inp, 256, tf.nn.relu),
                                               training=False)
                pre_ed_inp = tf.layers.dropout(tf.layers.dense(
                    pre_ed_inp, 128, tf.nn.relu),
                                               training=False)
            encoder_output = modules.cbhg(pre_ed_inp,
                                          training=False,
                                          k=16,
                                          bank_filters=128,
                                          projection_filters=(128, 128),
                                          highway_layers=4,
                                          highway_units=128,
                                          bi_gru_units=128,
                                          sequence_length=inp_mask,
                                          name='encoder_cbhg',
                                          reuse=False)
            ### Encoder [end]

            ### Attention Module
            with tf.variable_scope('attention'):
                att_module = AttentionModule(256,
                                             encoder_output,
                                             sequence_length=inp_mask,
                                             time_major=False)

            ### Decoder [begin]
            att_cell = GRUCell(256)
            dec_cell = MultiRNNCell(
                [ResidualWrapper(GRUCell(256)) for _ in range(2)])
            with tf.variable_scope('prepare_decode'):
                # prepare output alpha TensorArray
                reduced_time_steps = tf.div(output_time_steps, reduc)
                init_att_cell_state = att_cell.zero_state(
                    batch_size, tf.float32)
                init_dec_cell_state = dec_cell.zero_state(
                    batch_size, tf.float32)
                init_state_tup = tuple(
                    [init_att_cell_state, init_dec_cell_state])
                init_output_ta = tf.TensorArray(size=reduced_time_steps,
                                                dtype=tf.float32)
                init_alpha_ta = tf.TensorArray(size=reduced_time_steps,
                                               dtype=tf.float32)
                go_array = tf.zeros(
                    [batch_size, self.hyper_params.seq2seq_dim],
                    dtype=tf.float32)
                init_context = tf.zeros([batch_size, 256], dtype=tf.float32)
                init_time = tf.constant(0, dtype=tf.int32)
            cond = lambda x, *_: tf.less(x, reduced_time_steps)

            def body(this_time, old_output_ta, old_alpha_ta, old_state_tup,
                     last_context, last_output):
                with tf.variable_scope('decoder_pre_net'):
                    dec_pre_ed_inp = last_output
                    dec_pre_ed_inp = tf.layers.dropout(tf.layers.dense(
                        dec_pre_ed_inp, 256, tf.nn.relu),
                                                       training=False)
                    dec_pre_ed_inp = tf.layers.dropout(tf.layers.dense(
                        dec_pre_ed_inp, 128, tf.nn.relu),
                                                       training=False)
                with tf.variable_scope('attention_rnn'):
                    att_cell_inp = tf.concat([last_context, dec_pre_ed_inp],
                                             axis=-1)
                    att_cell_out, att_cell_state = att_cell(
                        att_cell_inp, old_state_tup[0])
                with tf.variable_scope('attention'):
                    query = att_cell_state[0]
                    context, alpha = att_module(query)
                    new_alpha_ta = old_alpha_ta.write(this_time, alpha)
                with tf.variable_scope('decoder_rnn'):
                    dec_input = tf.layers.dense(
                        tf.concat([att_cell_out, context], axis=-1), 256)
                    dec_cell_out, dec_cell_state = dec_cell(
                        dec_input, old_state_tup[1])
                    dense_out = tf.layers.dense(
                        dec_cell_out, self.hyper_params.seq2seq_dim * reduc)
                    new_output_ta = old_output_ta.write(this_time, dense_out)
                    new_output = dense_out[:, -self.hyper_params.seq2seq_dim:]
                new_state_tup = tuple([att_cell_state, dec_cell_state])
                return tf.add(
                    this_time, 1
                ), new_output_ta, new_alpha_ta, new_state_tup, context, new_output

            # run loop
            _, seq2seq_output_ta, alpha_ta, *_ = tf.while_loop(
                cond, body, [
                    init_time, init_output_ta, init_alpha_ta, init_state_tup,
                    init_context, go_array
                ])
            with tf.variable_scope('reshape_decode'):
                seq2seq_output = tf.reshape(
                    seq2seq_output_ta.stack(),
                    shape=(reduced_time_steps, batch_size,
                           self.hyper_params.seq2seq_dim * reduc))
                seq2seq_output = tf.reshape(
                    tf.transpose(seq2seq_output, perm=(1, 0, 2)),
                    shape=(batch_size, output_time_steps,
                           self.hyper_params.seq2seq_dim))
                self.seq2seq_output = seq2seq_output

                alpha_output = tf.reshape(alpha_ta.stack(),
                                          shape=(reduced_time_steps,
                                                 batch_size, input_time_steps))
                alpha_output = tf.expand_dims(
                    tf.transpose(alpha_output, perm=(1, 0, 2)), -1)
                self.alpha_output = alpha_output
            ### Decoder [end]

            ### PostNet [begin]
            post_output = modules.cbhg(
                seq2seq_output,
                training=False,
                k=8,
                bank_filters=128,
                projection_filters=(256, self.hyper_params.seq2seq_dim),
                highway_layers=4,
                highway_units=128,
                bi_gru_units=128,
                sequence_length=None,
                name='decoder_cbhg',
                reuse=False)
            post_output = tf.layers.dense(post_output,
                                          self.hyper_params.post_dim,
                                          name='post_linear_transform')
            self.post_output = post_output
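
# Usage sketch (an assumption, not part of the original example): restoring
# trained weights and synthesizing with this plain inference constructor. The
# checkpoint path and the numpy feeds are hypothetical.
inp_ph = tf.placeholder(tf.int32, (None, None))
mask_ph = tf.placeholder(tf.int32, (None,))
model = Tacotron(inp_ph, mask_ph, decode_time_steps=200)

saver = tf.train.Saver()
with tf.Session() as sess:
    saver.restore(sess, 'path/to/checkpoint')   # hypothetical checkpoint path
    frames = sess.run(model.post_output,
                      feed_dict={inp_ph: char_ids, mask_ph: lengths})  # char_ids, lengths: prepared numpy arrays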