    def __init__(self,
                 out_units,
                 attention_cell: AttentionRNN,
                 is_training,
                 zoneout_factor_cell=0.0,
                 zoneout_factor_output=0.0,
                 lstm_impl=LSTMImpl.LSTMCell,
                 trainable=True,
                 name=None,
                 **kwargs):
        super(DecoderRNNV2, self).__init__(name=name,
                                           trainable=trainable,
                                           **kwargs)

        self._cell = MultiRNNCell([
            OutputProjectionWrapper(attention_cell, out_units),
            ZoneoutLSTMCell(out_units,
                            is_training,
                            zoneout_factor_cell,
                            zoneout_factor_output,
                            lstm_impl=lstm_impl),
            ZoneoutLSTMCell(out_units,
                            is_training,
                            zoneout_factor_cell,
                            zoneout_factor_output,
                            lstm_impl=lstm_impl),
        ],
                                  state_is_tuple=True)
def inference_decode(enc_outputs, seq_len, embeddings, out_dim):
    tiled_enc_outputs = tf.contrib.seq2seq.tile_batch(enc_outputs,
                                                      hp.beam_width)
    tiled_seq_len = tf.contrib.seq2seq.tile_batch(seq_len, hp.beam_width)

    beam_batch_size = tf.shape(tiled_enc_outputs)[0]
    # start tokens, end token
    start_tokens = tf.tile([hp.START_TOKEN],
                           [beam_batch_size // hp.beam_width])
    end_token = hp.END_TOKEN

    dec_prenet_outputs = DecoderPrenetWrapper(GRUCell(hp.embed_size),
                                              is_training=False,
                                              prenet_sizes=hp.embed_size,
                                              dropout_prob=hp.dropout)
    attention_mechanism = BahdanauAttention(
        hp.embed_size,
        tiled_enc_outputs,
        normalize=True,
        memory_sequence_length=tiled_seq_len,
        probability_fn=tf.nn.softmax)
    attn_cell = AttentionWrapper(dec_prenet_outputs,
                                 attention_mechanism,
                                 alignment_history=True,
                                 output_attention=False)
    concat_cell = ConcatOutputAndAttentionWrapper(attn_cell)
    decoder_cell = MultiRNNCell([
        OutputProjectionWrapper(concat_cell, hp.embed_size),
        ResidualWrapper(GRUCell(hp.embed_size)),
        ResidualWrapper(GRUCell(hp.embed_size))
    ],
                                state_is_tuple=True)

    output_cell = OutputProjectionWrapper(decoder_cell, out_dim)
    initial_state = output_cell.zero_state(batch_size=beam_batch_size,
                                           dtype=tf.float32)

    decoder = BeamSearchDecoder(cell=output_cell,
                                embedding=embeddings,
                                start_tokens=start_tokens,
                                end_token=end_token,
                                initial_state=initial_state,
                                beam_width=hp.beam_width)
    outputs, t1, t2 = tf.contrib.seq2seq.dynamic_decode(
        decoder, maximum_iterations=hp.max_len)
    return outputs
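For reference, the value returned above is a FinalBeamSearchDecoderOutput. A minimal, illustrative sketch of how a caller might pull out the best hypothesis (assuming TF 1.x and the same hp module; the variable names below are hypothetical, not from the original code):

# Hypothetical usage sketch, not part of the original example.
# predicted_ids has shape [batch_size, T, beam_width] when output_time_major=False,
# and beam 0 holds the highest-scoring hypothesis.
decoded = inference_decode(enc_outputs, seq_len, embeddings, out_dim=vocab_size)
best_token_ids = decoded.predicted_ids[:, :, 0]  # [batch_size, T]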
Example #3
    def __init__(self, out_units, attention_cell: AttentionRNN,
                 trainable=True, name=None, **kwargs):
        super(DecoderRNNV1, self).__init__(name=name, trainable=trainable, **kwargs)

        self._cell = MultiRNNCell([
            OutputProjectionWrapper(attention_cell, out_units),
            ResidualWrapper(GRUCell(out_units)),
            ResidualWrapper(GRUCell(out_units)),
        ], state_is_tuple=True)
    def __init__(self,
                 num_units,
                 state_cell,
                 vocab_size,
                 max_utt_len,
                 config,
                 num_zt=10,
                 use_peepholes=False,
                 cell_clip=None,
                 initializer=None,
                 num_proj=None,
                 proj_clip=None,
                 num_unit_shards=None,
                 num_proj_shards=None,
                 forget_bias=1.0,
                 state_is_tuple=True,
                 activation=None,
                 reuse=None,
                 name=None,
                 dtype=None):

        self._state_is_tuple = state_is_tuple
        self.num_zt = num_zt
        self.tau = tf.Variable(5.0, name="temperature")
        self.vocab_size = vocab_size
        self.max_utt_len = max_utt_len
        self.config = config
        if self.config.word_weights:
            self.weights = tf.constant(self.config.word_weights)
        else:
            self.weights = self.config.word_weights
        self.decoder_cell_1 = self.get_rnncell('lstm',
                                               200 + num_zt,
                                               keep_prob=self.config.keep_prob)
        self.decoder_cell_1 = OutputProjectionWrapper(self.decoder_cell_1,
                                                      self.vocab_size)
        self.decoder_cell_2 = self.get_rnncell('lstm',
                                               2 * (200 + num_zt),
                                               keep_prob=self.config.keep_prob)
        self.decoder_cell_2 = OutputProjectionWrapper(self.decoder_cell_2,
                                                      self.vocab_size)
        self.state_cell = state_cell
def training_decode(enc_outputs, seq_len, helper, out_dim):
    dec_prenet_outputs = DecoderPrenetWrapper(GRUCell(hp.embed_size),
                                              is_training=True,
                                              prenet_sizes=hp.embed_size,
                                              dropout_prob=hp.dropout)
    attention_mechanism = BahdanauAttention(hp.embed_size,
                                            enc_outputs,
                                            normalize=True,
                                            memory_sequence_length=seq_len,
                                            probability_fn=tf.nn.softmax)
    attn_cell = AttentionWrapper(dec_prenet_outputs,
                                 attention_mechanism,
                                 alignment_history=True,
                                 output_attention=False)
    concat_cell = ConcatOutputAndAttentionWrapper(attn_cell)
    decoder_cell = MultiRNNCell([
        OutputProjectionWrapper(concat_cell, hp.embed_size),
        ResidualWrapper(GRUCell(hp.embed_size)),
        ResidualWrapper(GRUCell(hp.embed_size))
    ],
                                state_is_tuple=True)

    output_cell = OutputProjectionWrapper(decoder_cell, out_dim)
    initial_state = output_cell.zero_state(batch_size=tf.shape(enc_outputs)[0],
                                           dtype=tf.float32)

    decoder = BasicDecoder(cell=output_cell,
                           helper=helper,
                           initial_state=initial_state)

    (outputs, _), last_state, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder=decoder, maximum_iterations=hp.max_len)
    # for attention plot
    alignments = tf.transpose(last_state[0].alignment_history.stack(),
                              [1, 2, 0])
    return outputs, alignments
    def __init__(self,
                 out_units,
                 attention_cell: AttentionRNN,
                 gru_impl=GRUImpl.GRUCell,
                 trainable=True,
                 name=None,
                 **kwargs):
        super(DecoderRNNV1, self).__init__(name=name,
                                           trainable=trainable,
                                           **kwargs)

        self._cell = tf.nn.rnn_cell.MultiRNNCell([
            OutputProjectionWrapper(attention_cell, out_units),
            tf.nn.rnn_cell.ResidualWrapper(
                gru_cell_factory(gru_impl, out_units)),
            tf.nn.rnn_cell.ResidualWrapper(
                gru_cell_factory(gru_impl, out_units)),
        ],
                                                 state_is_tuple=True)
Example #7
    def _rnn(self,
             conv_inputs,
             output_size,
             training_placeholder,
             sequence_lengths,
             hidden_size,
             num_layers=4):
        """
        Takes output of convolutional tower and passes it through a multilayer LSTM
        :param sequence_lengths: vector containing lengths of sequences in batch
        :param num_layers: number of layers in multilayer RNN (n-1 normal layers and one output layer with projection function)
        :param conv_inputs: NOT reshaped output of convolutional tower (shaped [batch, l, h, w, channels])
        :param output_size: number of neurons in last layer, associated with number of possible class_ids
        :param training_placeholder: boolean placeholder indicating whether the model is in training mode (used to set the dropout keep probability)
        :return: a tensor [batch_size, timesteps, num_classes] as an output of each timestep of LSTM
        """
        keep_prob = tf.maximum(
            1 - tf.to_float(training_placeholder), 0.85
        )  # during training, keep_prob=0.85, during validation keep_prob=1.0
        with tf.variable_scope('sequential', initializer=xavier_initializer()):
            # flatten h, w, c dimensions
            flattened = tf.squeeze(tf.squeeze(conv_inputs, -2), -2)

            # only forward LSTMs are used
            fw_cells = []
            for i in range(num_layers):
                fw_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
                if i < num_layers - 1:
                    fw_cell = tf.nn.rnn_cell.DropoutWrapper(
                        fw_cell, output_keep_prob=keep_prob)
                if i == num_layers - 1:
                    fw_cell = OutputProjectionWrapper(fw_cell, output_size,
                                                      tf.nn.tanh)

                fw_cells.append(fw_cell)

            fw_cells = tf.nn.rnn_cell.MultiRNNCell(fw_cells)
            outputs, _ = tf.nn.dynamic_rnn(cell=fw_cells,
                                           inputs=flattened,
                                           sequence_length=sequence_lengths,
                                           dtype=tf.float32)

            return outputs
Example #8
def single_cell(num_units,
                is_train,
                cell_type,
                dropout=0.0,
                forget_bias=0.0,
                dim_project=None):
    """Create an instance of a single RNN cell."""
    # dropout (= 1 - keep_prob) is set to 0 during eval and infer
    dropout = dropout if is_train else 0.0

    # Cell Type
    if cell_type == "lstm":
        single_cell = tf.contrib.rnn.LSTMCell(num_units,
                                              use_peepholes=True,
                                              num_proj=dim_project,
                                              cell_clip=50.0,
                                              forget_bias=1.0)
    elif cell_type == "cudnn_lstm":
        single_cell = tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell(num_units)
    elif cell_type == "gru":
        single_cell = GRUCell(num_units)
    elif cell_type == "LSTMBlockCell":
        single_cell = tf.contrib.rnn.LSTMBlockCell(num_units,
                                                   forget_bias=forget_bias)
    elif cell_type == "layer_norm_lstm":
        single_cell = LayerNormBasicLSTMCell(num_units,
                                             forget_bias=forget_bias,
                                             layer_norm=True)
    else:
        raise ValueError("Unknown unit type %s!" % cell_type)

    if dim_project:
        single_cell = OutputProjectionWrapper(cell=single_cell,
                                              output_size=dim_project)

    if dropout > 0.0:
        single_cell = DropoutWrapper(cell=single_cell,
                                     input_keep_prob=(1.0 - dropout))

    return single_cell
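A brief, hedged usage sketch for the factory above (assuming the imports already used in this example are in scope; build_stacked_cell is a hypothetical helper, not part of the original code):

# Hypothetical helper: stack several single_cell() instances into one multi-layer cell.
def build_stacked_cell(num_layers, num_units, is_train, cell_type="lstm", dropout=0.2):
    cells = [single_cell(num_units, is_train, cell_type, dropout=dropout)
             for _ in range(num_layers)]
    return tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)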
Example #9
        def decode(helper, scope, reuse=None):
            with tf.variable_scope(scope, reuse=reuse):
                rnn_layers = []
                for i in range(n_decoder_layers):
                    # Create GRUCell with dropout. Do not forget to set the reuse flag properly.
                    cell = tf.nn.rnn_cell.GRUCell(hidden_size, reuse=reuse)
                    cell = tf.nn.rnn_cell.DropoutWrapper(
                        cell, input_keep_prob=self.dropout_ph)
                    rnn_layers.append(cell)

                decoder_cell = MultiRNNCell(rnn_layers)

                # Create a projection wrapper
                decoder_cell = OutputProjectionWrapper(decoder_cell,
                                                       vocab_size,
                                                       reuse=reuse)

                # Create BasicDecoder, pass the defined cell, a helper, and initial state
                # The initial state should be equal to the final state of the encoder!
                initial_state = decoder_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)
                decoder = BasicDecoder(decoder_cell,
                                       helper,
                                       initial_state=initial_state)

                # The first returning argument of dynamic_decode contains two fields:
                #   * rnn_output (predicted logits)
                #   * sample_id (predictions)
                max_iters = tf.reduce_max(self.ground_truth_lengths)
                # max_iters = max_iter
                outputs, _, _ = dynamic_decode(decoder=decoder,
                                               maximum_iterations=max_iters,
                                               output_time_major=False,
                                               impute_finished=True)

                return outputs
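For context, the BasicDecoderOutput returned above exposes the two fields mentioned in the comment. A hedged sketch of how they might be consumed (train_helper and the call below are illustrative, not taken from the original code):

# Illustrative only: rnn_output holds the per-step logits, sample_id the argmax token ids.
train_outputs = decode(train_helper, 'decode')
train_logits = train_outputs.rnn_output      # [batch_size, T, vocab_size]
train_predictions = train_outputs.sample_id  # [batch_size, T]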
Example #10
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None):
        '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            # embedding_table = tf.get_variable(
            #   'embedding', [len(symbols), 256], dtype=tf.float32,
            #   initializer=tf.truncated_normal_initializer(stddev=0.5))
            # embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)           # [N, T_in, 256]
            # embedded_inputs = inputs

            # Encoder
            # n_fft = (self._hparams.num_src_freq - 1) * 2
            # in_layer_size = n_fft
            in_layer_size = self._hparams.num_src_freq
            prenet_outputs = prenet(inputs,
                                    is_training,
                                    layer_sizes=[in_layer_size,
                                                 128])  # [N, T_in, 128]
            encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths,
                                           is_training)  # [N, T_in, 256]

            # Attention
            attention_cell = AttentionWrapper(
                DecoderPrenetWrapper(GRUCell(256), is_training),
                BahdanauAttention(256, encoder_outputs),
                alignment_history=True,
                output_attention=False)  # [N, T_in, 256]

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                OutputProjectionWrapper(concat_cell, 256),
                ResidualWrapper(GRUCell(256)),
                ResidualWrapper(GRUCell(256))
            ],
                                        state_is_tuple=True)  # [N, T_in, 256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.outputs_per_step)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(mel_outputs, hp.num_mels,
                                     is_training)  # [N, T_out, 256]
            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  input:                   %d' % inputs.shape[-1])
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
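The alignments tensor above (shape [N, T_in, T_out] after the transpose) is typically visualized as an attention heat map. A minimal, illustrative matplotlib sketch (plot_alignment is a hypothetical helper, not part of the model code):

import matplotlib.pyplot as plt

def plot_alignment(alignment, path):
    # alignment: numpy array of shape [T_in (encoder steps), T_out (decoder steps)],
    # i.e. one example taken from the model's `alignments` tensor.
    fig, ax = plt.subplots()
    im = ax.imshow(alignment, aspect='auto', origin='lower', interpolation='none')
    fig.colorbar(im, ax=ax)
    ax.set_xlabel('Decoder timestep')
    ax.set_ylabel('Encoder timestep')
    fig.savefig(path)
    plt.close(fig)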
Example #11
  def initialize(self, inputs, input_lengths, inputs_jp=None, mel_targets=None, linear_targets=None):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
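      inputs_jp: float32 reference spectrogram fed to the reference encoder to derive a style
        embedding; if None, a random combination of the GST tokens is used instead.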
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
      is_training = linear_targets is not None
      is_teacher_force_generating = mel_targets is not None
      batch_size = tf.shape(inputs)[0]
      hp = self._hparams

      # Embeddings
      # embedding_table = tf.get_variable(
      #   'text_embedding', [len(symbols), hp.embed_depth], dtype=tf.float32,
      #   initializer=tf.truncated_normal_initializer(stddev=0.5))
      # embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)           # [N, T_in, 256]
      
      if hp.use_gst:
        #Global style tokens (GST)
        gst_tokens = tf.get_variable(
          'style_tokens', [hp.num_gst, hp.style_embed_depth // hp.num_heads], dtype=tf.float32,
          initializer=tf.truncated_normal_initializer(stddev=0.5))
        self.gst_tokens = gst_tokens
 
      # Encoder
      # prenet_outputs = prenet(embedded_inputs, is_training)
      prenet_outputs = prenet(inputs, is_training)
      # [N, T_in, 128]
      encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training)  # [N, T_in, 256]
      


      if inputs_jp  is not None:
        # Reference encoder
        refnet_outputs = reference_encoder(
          inputs_jp,
          filters=hp.reference_filters, 
          kernel_size=(3,3),
          strides=(2,2),
          encoder_cell=GRUCell(hp.reference_depth),
          is_training=is_training)                                                 # [N, 128]
        self.refnet_outputs = refnet_outputs                                       

        if hp.use_gst:
          # Style attention
          style_attention = MultiheadAttention(
            tf.expand_dims(refnet_outputs, axis=1),                                   # [N, 1, 128]
            tf.tanh(tf.tile(tf.expand_dims(gst_tokens, axis=0), [batch_size,1,1])),            # [N, hp.num_gst, 256/hp.num_heads]   
            num_heads=hp.num_heads,
            num_units=hp.style_att_dim,
            attention_type=hp.style_att_type)

          style_embeddings = style_attention.multi_head_attention()                   # [N, 1, 256]
        else:
          style_embeddings = tf.expand_dims(refnet_outputs, axis=1)                   # [N, 1, 128]
      else:
        print("Use random weight for GST.")
        random_weights = tf.random_uniform([hp.num_heads, hp.num_gst], maxval=1.0, dtype=tf.float32)
        random_weights = tf.nn.softmax(random_weights, name="random_weights")
        style_embeddings = tf.matmul(random_weights, tf.nn.tanh(gst_tokens))
        style_embeddings = tf.reshape(style_embeddings, [1, 1] + [hp.num_heads * gst_tokens.get_shape().as_list()[1]])

      # Add style embedding to every text encoder state
      style_embeddings = tf.tile(style_embeddings, [1, shape_list(encoder_outputs)[1], 1]) # [N, T_in, 128]
      encoder_outputs = tf.concat([encoder_outputs, style_embeddings], axis=-1)

      # Attention
      attention_cell = AttentionWrapper(
        GRUCell(hp.attention_depth),
        BahdanauAttention(hp.attention_depth, encoder_outputs, memory_sequence_length=input_lengths),
        alignment_history=True,
        output_attention=False)                                                  # [N, T_in, 256]

      # Concatenate attention context vector and RNN cell output.
      concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)              

      # Decoder (layers specified bottom to top):
      decoder_cell = MultiRNNCell([
          OutputProjectionWrapper(concat_cell, hp.rnn_depth),
          ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1)),
          ResidualWrapper(ZoneoutWrapper(LSTMCell(hp.rnn_depth), 0.1))
        ], state_is_tuple=True)                                                  # [N, T_in, 256]

      # Project onto r mel spectrograms (predict r outputs at each RNN step):
      output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
      decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

      if is_training or is_teacher_force_generating:
        helper = TacoTrainingHelper(inputs, mel_targets, hp)
      else:
        helper = TacoTestHelper(batch_size, hp)

      (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
        BasicDecoder(output_cell, helper, decoder_init_state),
        maximum_iterations=hp.max_iters)                                        # [N, T_out/r, M*r]

      # Reshape outputs to be one output per entry
      mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels]) # [N, T_out, M]

      # Add post-processing CBHG:
      post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)           # [N, T_out, 256]
      linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)               # [N, T_out, F]

      # Grab alignments from the final decoder state:
      alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

      self.inputs = inputs
      self.input_lengths = input_lengths
      self.mel_outputs = mel_outputs
      self.encoder_outputs = encoder_outputs
      self.style_embeddings = style_embeddings
      self.linear_outputs = linear_outputs
      self.alignments = alignments
      self.mel_targets = mel_targets
      self.linear_targets = linear_targets
      self.inputs_jp = inputs_jp
      log('Initialized Tacotron model. Dimensions: ')
      log('  style embedding:         %d' % style_embeddings.shape[-1])
      log('  prenet out:              %d' % prenet_outputs.shape[-1])
      log('  encoder out:             %d' % encoder_outputs.shape[-1])
      log('  attention out:           %d' % attention_cell.output_size)
      log('  concat attn & out:       %d' % concat_cell.output_size)
      log('  decoder cell out:        %d' % decoder_cell.output_size)
      log('  decoder out (%d frames):  %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
      log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
      log('  postnet out:             %d' % post_outputs.shape[-1])
      log('  linear out:              %d' % linear_outputs.shape[-1])
Example #12
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   reference_mel=None):
        '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
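      reference_mel: float32 Tensor with shape [N, T_ref, M] consumed by the reference encoder to
        compute the style embedding; during training it defaults to mel_targets.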
    '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'text_embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(embedding_table,
                                                     inputs)  # [N, T_in, 256]

            #Global style tokens (GST)
            gst_tokens = tf.get_variable(
                'style_tokens',
                [hp.num_gst, hp.style_embed_depth // hp.num_heads],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            self.gst_tokens = gst_tokens

            # Encoder

            encoder_outputs = encoder(embedded_inputs, input_lengths,
                                      is_training, 512, 5,
                                      256)  # [N, T_in, 256]

            if is_training:
                reference_mel = mel_targets

            if reference_mel is not None:
                # Reference encoder
                refnet_outputs = reference_encoder(
                    reference_mel,
                    filters=hp.ref_filters,
                    kernel_size=(3, 3),
                    strides=(2, 2),
                    encoder_cell=GRUCell(hp.ref_depth),
                    is_training=is_training)  # [N, 128]
                self.refnet_outputs = refnet_outputs

                # Style attention
                style_attention = MultiheadAttention(
                    tf.expand_dims(refnet_outputs, axis=1),  # [N, 1, 128]
                    tf.tanh(
                        tf.tile(tf.expand_dims(gst_tokens, axis=0),
                                [batch_size, 1, 1
                                 ])),  # [N, hp.num_gst, 256/hp.num_heads]   
                    num_heads=hp.num_heads,
                    num_units=hp.style_att_dim,
                    attention_type=hp.style_att_type)

                embedded_tokens = style_attention.multi_head_attention(
                )  # [N, 1, 256]

            else:
                random_weights = tf.constant(
                    hp.num_heads * [[0] * (hp.gst_index - 1) + [1] + [0] *
                                    (hp.num_gst - hp.gst_index)],
                    dtype=tf.float32)
                random_weights = tf.nn.softmax(random_weights,
                                               name="random_weights")
                # gst_tokens = tf.tile(gst_tokens, [1, hp.num_heads])
                embedded_tokens = tf.matmul(random_weights,
                                            tf.nn.tanh(gst_tokens))
                embedded_tokens = hp.gst_scale * embedded_tokens
                embedded_tokens = tf.reshape(
                    embedded_tokens, [1, 1] +
                    [hp.num_heads * gst_tokens.get_shape().as_list()[1]])

            # Add style embedding to every text encoder state
            style_embeddings = tf.tile(
                embedded_tokens,
                [1, shape_list(encoder_outputs)[1], 1])  # [N, T_in, 128]
            encoder_outputs = tf.concat([encoder_outputs, style_embeddings],
                                        axis=-1)

            # Attention
            attention_mechanism = LocationSensitiveAttention(
                128,
                encoder_outputs,
                hparams=hp,
                is_training=is_training,
                mask_encoder=True,
                memory_sequence_length=input_lengths,
                smoothing=False,
                cumulate_weights=True)
            decoder_lstm = [
                ZoneoutLSTMCell(1024,
                                is_training,
                                zoneout_factor_cell=0.1,
                                zoneout_factor_output=0.1,
                                name='decoder_LSTM_{}'.format(i + 1))
                for i in range(2)
            ]

            decoder_lstm = MultiRNNCell(decoder_lstm, state_is_tuple=True)
            decoder_init_state = decoder_lstm.zero_state(
                batch_size=batch_size, dtype=tf.float32)  # not present in the tensorflow1 version

            attention_cell = AttentionWrapper(
                decoder_lstm,
                attention_mechanism,
                initial_cell_state=decoder_init_state,
                alignment_history=True,
                output_attention=False)

            # attention_state_size = 256
            # Decoder input -> prenet -> decoder_lstm -> concat[output, attention]
            # dec_outputs = DecoderPrenetWrapper(attention_cell, is_training, hp.prenet_depths)
            dec_outputs_cell = OutputProjectionWrapper(
                attention_cell, (hp.num_mels) * hp.outputs_per_step)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp)
            else:
                helper = TacoTestHelper(batch_size, hp)

            decoder_init_state = dec_outputs_cell.zero_state(
                batch_size=batch_size, dtype=tf.float32)
            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(dec_outputs_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            decoder_mel_outputs = tf.reshape(
                decoder_outputs[:, :, :hp.num_mels * hp.outputs_per_step],
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            x = decoder_mel_outputs
            for i in range(5):
                activation = tf.nn.tanh if i != (4) else None
                x = tf.layers.conv1d(x,
                                     filters=512,
                                     kernel_size=5,
                                     padding='same',
                                     activation=activation,
                                     name='Postnet_{}'.format(i))
                x = tf.layers.batch_normalization(x, training=is_training)
                x = tf.layers.dropout(x,
                                      rate=0.5,
                                      training=is_training,
                                      name='Postnet_dropout_{}'.format(i))

            residual = tf.layers.dense(x,
                                       hp.num_mels,
                                       name='residual_projection')
            mel_outputs = decoder_mel_outputs + residual

            # Add post-processing CBHG:
            # mel_outputs: (N,T,num_mels)
            post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
            linear_outputs = tf.layers.dense(
                post_outputs,
                hp.num_freq)  # [N, T_out, F(1025)]             # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_mel_outputs = decoder_mel_outputs
            self.mel_outputs = mel_outputs
            self.encoder_outputs = encoder_outputs
            self.style_embeddings = style_embeddings
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.reference_mel = reference_mel
            self.all_vars = tf.trainable_variables()
            log('Initialized Tacotron model. Dimensions: ')
            log('  text embedding:          %d' % embedded_inputs.shape[-1])
            log('  style embedding:         %d' % style_embeddings.shape[-1])
            # log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            # log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % dec_outputs_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
    def initialize(
        self,
        inputs,
        input_lengths,
        num_speakers,
        speaker_id,
        mel_targets=None,
        linear_targets=None,
        loss_coeff=None,
        rnn_decoder_test_mode=False,
        is_randomly_initialized=False,
    ):

        is_training2 = linear_targets is not None  # this also ends up True at test time; is that intended???
        is_training = not rnn_decoder_test_mode

        self.is_randomly_initialized = is_randomly_initialized

        with tf.variable_scope('inference') as scope:
            hp = self._hparams
            batch_size = tf.shape(inputs)[0]

            # Embeddings(256)
            char_embed_table = tf.get_variable(
                'embedding', [len(symbols), hp.embedding_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))

            zero_pad = True
            if zero_pad:  # logic borrowed from the transformer implementation.
                # The embedding for <PAD> (index 0) is pinned to zeros and never trained, i.e. the first
                # row (<PAD>) of the variable created by get_variable above is not used.
                char_embed_table = tf.concat(
                    (tf.zeros(shape=[1, hp.embedding_size]),
                     char_embed_table[1:, :]), 0)

            # [N, T_in, embedding_size]
            char_embedded_inputs = tf.nn.embedding_lookup(
                char_embed_table, inputs)

            self.num_speakers = num_speakers
            if self.num_speakers > 1:
                if hp.speaker_embedding_size != 1:  # speaker_embedding_size = f(16)
                    speaker_embed_table = tf.get_variable(
                        'speaker_embedding',
                        [self.num_speakers, hp.speaker_embedding_size],
                        dtype=tf.float32,
                        initializer=tf.truncated_normal_initializer(
                            stddev=0.5))
                    # [N, T_in, speaker_embedding_size]
                    speaker_embed = tf.nn.embedding_lookup(
                        speaker_embed_table, speaker_id)

                if hp.model_type == 'deepvoice':
                    if hp.speaker_embedding_size == 1:
                        before_highway = get_embed(
                            speaker_id, self.num_speakers,
                            hp.enc_prenet_sizes[-1], "before_highway"
                        )  # 'enc_prenet_sizes': [f(256), f(128)]
                        encoder_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers, hp.enc_rnn_size * 2,
                            "encoder_rnn_init_state")

                        attention_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers,
                            hp.attention_state_size,
                            "attention_rnn_init_state")
                        decoder_rnn_init_states = [
                            get_embed(
                                speaker_id, self.num_speakers, hp.dec_rnn_size,
                                "decoder_rnn_init_states{}".format(idx + 1))
                            for idx in range(hp.dec_layer_num)
                        ]
                    else:
                        deep_dense = lambda x, dim: tf.layers.dense(
                            x, dim, activation=tf.nn.softsign
                        )  # softsign: x / (abs(x) + 1)

                        before_highway = deep_dense(speaker_embed,
                                                    hp.enc_prenet_sizes[-1])
                        encoder_rnn_init_state = deep_dense(
                            speaker_embed, hp.enc_rnn_size * 2)

                        attention_rnn_init_state = deep_dense(
                            speaker_embed, hp.attention_state_size)
                        decoder_rnn_init_states = [
                            deep_dense(speaker_embed, hp.dec_rnn_size)
                            for _ in range(hp.dec_layer_num)
                        ]

                    speaker_embed = None  # deepvoice does not use speaker_embed directly
                elif hp.model_type == 'simple':
                    # The 'simple' model feeds speaker_embed into DecoderPrenetWrapper and
                    # ConcatOutputAndAttentionWrapper and concatenates it there.
                    before_highway = None
                    encoder_rnn_init_state = None
                    attention_rnn_init_state = None
                    decoder_rnn_init_states = None
                else:
                    raise Exception(
                        " [!] Unknown multi-speaker model type: {}".format(
                            hp.model_type))
            else:
                # case where self.num_speakers == 1
                speaker_embed = None
                before_highway = None
                encoder_rnn_init_state = None  # init state of the bidirectional GRU
                attention_rnn_init_state = None
                decoder_rnn_init_states = None

            ##############
            # Encoder
            ##############

            # [N, T_in, enc_prenet_sizes[-1]]
            prenet_outputs = prenet(
                char_embedded_inputs,
                is_training,
                hp.enc_prenet_sizes,
                hp.dropout_prob,
                scope='prenet'
            )  # 'enc_prenet_sizes': [f(256), f(128)],  dropout_prob = 0.5
            # ==> (N, T_in, 128)

            # enc_rnn_size = 128
            encoder_outputs = cbhg(
                prenet_outputs,
                input_lengths,
                is_training,
                hp.enc_bank_size,
                hp.enc_bank_channel_size,
                hp.enc_maxpool_width,
                hp.enc_highway_depth,
                hp.enc_rnn_size,
                hp.enc_proj_sizes,
                hp.enc_proj_width,
                scope="encoder_cbhg",
                before_highway=before_highway,
                encoder_rnn_init_state=encoder_rnn_init_state)

            ##############
            # Attention
            ##############

            # For manual control of attention
            self.is_manual_attention = tf.placeholder(
                tf.bool,
                shape=(),
                name='is_manual_attention',
            )
            self.manual_alignments = tf.placeholder(
                tf.float32,
                shape=[None, None, None],
                name="manual_alignments",
            )

            # single: attention_size = 128
            if hp.attention_type == 'bah_mon':
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=False)
            elif hp.attention_type == 'bah_mon_norm':  # added by hccho
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'loc_sen':  # Location Sensitive Attention
                attention_mechanism = LocationSensitiveAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'gmm':  # GMM Attention
                attention_mechanism = GmmAttention(
                    hp.attention_size,
                    memory=encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'bah_mon_norm_hccho':
                attention_mechanism = BahdanauMonotonicAttention_hccho(
                    hp.attention_size, encoder_outputs, normalize=True)
            elif hp.attention_type == 'bah_norm':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'luong_scaled':
                attention_mechanism = LuongAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    scale=True)
            elif hp.attention_type == 'luong':
                attention_mechanism = LuongAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'bah':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            else:
                raise Exception(" [!] Unkown attention type: {}".format(
                    hp.attention_type))

            # Combine DecoderPrenetWrapper and attention_mechanism into an AttentionWrapper.
            # carpedm20 re-implemented AttentionWrapper from the TensorFlow source code, whereas Keith Ito simply used the stock TensorFlow AttentionWrapper.
            attention_cell = AttentionWrapper(
                GRUCell(hp.attention_state_size),
                attention_mechanism,
                self.is_manual_attention,
                self.manual_alignments,
                initial_cell_state=attention_rnn_init_state,
                alignment_history=True,
                output_attention=False
            )  # note output_attention=False; attention_layer_size is not set, so the attention output equals the context vector.

            # attention_state_size = 256
            dec_prenet_outputs = DecoderPrenetWrapper(
                attention_cell, speaker_embed, is_training,
                hp.dec_prenet_sizes,
                hp.dropout_prob)  # dec_prenet_sizes =  [f(256), f(128)]

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            # [N, T_in, attention_size+attention_state_size]

            # From the AttentionWrapperState members (attention, cell_state, ...) that dec_prenet_outputs
            # passes to the next cell, attention and output are concatenated and emitted as the new output.
            # Since the output equals cell_state, the concatenation is [ output(=cell_state) | attention ].
            concat_cell = ConcatOutputAndAttentionWrapper(
                dec_prenet_outputs, embed_to_concat=speaker_embed
            )  # builds the new output as concat(output, attention, speaker_embed).

            # Decoder (layers specified bottom to top):  dec_rnn_size= 256
            cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)
                     ]  # OutputProjectionWrapper does not seem to be mentioned in the paper...
            for _ in range(hp.dec_layer_num):  # hp.dec_layer_num = 2
                cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size)))

            # [N, T_in, 256]
            decoder_cell = MultiRNNCell(cells, state_is_tuple=True)

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.reduction_factor
            )  # could this be modified to also emit a stop token, e.g. (hp.num_mels + 1) * hp.reduction_factor ???
            decoder_init_state = output_cell.zero_state(
                batch_size=batch_size, dtype=tf.float32
            )  # calling zero_state here also picks up the initial_cell_state already passed to the AttentionWrapper above.

            if hp.model_type == "deepvoice":
                # decoder_init_state[0] : AttentionWrapperState
                # = cell_state + attention + time + alignments + alignment_history
                # decoder_init_state[0][0] = attention_rnn_init_state (already applied via the AttentionWrapper's initial_cell_state)
                decoder_init_state = list(decoder_init_state)

                for idx, cell in enumerate(decoder_rnn_init_states):
                    shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                    shape2 = cell.get_shape().as_list()
                    if shape1 != shape2:
                        raise Exception(
                            " [!] Shape {} and {} should be equal".format(
                                shape1, shape2))
                    decoder_init_state[idx + 1] = cell

                decoder_init_state = tuple(decoder_init_state)

            if is_training2:
                # rnn_decoder_test_mode = True in test mode, False in train mode
                helper = TacoTrainingHelper(
                    inputs, mel_targets, hp.num_mels, hp.reduction_factor,
                    rnn_decoder_test_mode)  # inputs is only used to compute batch_size
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.reduction_factor)

            (decoder_outputs, _), final_decoder_state, _ = \
                    tf.contrib.seq2seq.dynamic_decode(BasicDecoder(output_cell, helper, decoder_init_state),maximum_iterations=hp.max_iters)  # max_iters=200

            # [N, T_out, M]
            mel_outputs = tf.reshape(decoder_outputs,
                                     [batch_size, -1, hp.num_mels])

            # Add post-processing CBHG:
            # [N, T_out, 256]
            #post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)
            post_outputs = cbhg(mel_outputs,
                                None,
                                is_training,
                                hp.post_bank_size,
                                hp.post_bank_channel_size,
                                hp.post_maxpool_width,
                                hp.post_highway_depth,
                                hp.post_rnn_size,
                                hp.post_proj_sizes,
                                hp.post_proj_width,
                                scope='post_cbhg')

            if speaker_embed is not None and hp.model_type == 'simple':
                expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
                tiled_speaker_embedding = tf.tile(
                    expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])

                # [N, T_out, 256 + alpha]
                post_outputs = tf.concat(
                    [tiled_speaker_embedding, post_outputs], axis=-1)

            linear_outputs = tf.layers.dense(
                post_outputs, hp.num_freq)  # [N, T_out, F(1025)]

            # Grab alignments from the final decoder state:
            # Because the MultiRNNCell has 3 layers, final_decoder_state is a tuple of length 3 ==> use final_decoder_state[0]
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(),
                [1, 2, 0
                 ])  # batch_size, text length(encoder), target length(decoder)

            self.inputs = inputs
            self.speaker_id = speaker_id
            self.input_lengths = input_lengths
            self.loss_coeff = loss_coeff
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.final_decoder_state = final_decoder_state

            log('=' * 40)
            log(' model_type: %s' % hp.model_type)
            log('=' * 40)

            log('Initialized Tacotron model. Dimensions: ')
            log('    embedding:                %d' %
                char_embedded_inputs.shape[-1])
            if speaker_embed is not None:
                log('    speaker embedding:        %d' %
                    speaker_embed.shape[-1])
            else:
                log('    speaker embedding:        None')
            log('    prenet out:               %d' % prenet_outputs.shape[-1])
            log('    encoder out:              %d' % encoder_outputs.shape[-1])
            log('    attention out:            %d' %
                attention_cell.output_size)
            log('    concat attn & out:        %d' % concat_cell.output_size)
            log('    decoder cell out:         %d' % decoder_cell.output_size)
            log('    decoder out (%d frames):  %d' %
                (hp.reduction_factor, decoder_outputs.shape[-1]))
            log('    decoder out (1 frame):    %d' % mel_outputs.shape[-1])
            log('    postnet out:              %d' % post_outputs.shape[-1])
            log('    linear out:               %d' % linear_outputs.shape[-1])
Example #14
def get_batch(batch, size=5):
    low = (batch * size) % (40 - size)
    high = low + size
    return t_vals[low:high], series[low:high]


n_steps = 20
n_inputs = 1
n_neurons = 100
n_outputs = 1

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.float32, [None, n_steps, n_outputs])

cell = OutputProjectionWrapper(BasicRNNCell(num_units=n_neurons,
                                            activation=tf.nn.relu),
                               output_size=n_outputs)
outputs, states = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)

loss = tf.reduce_mean(tf.square(outputs - y), name='loss')
loss_summary = tf.summary.scalar('loss', loss)

optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001)
training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()

batch_size = 100
n_iterations = 20000

with tf.Session() as sess:
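    # The body of this session is truncated on the source page. The lines below are an
    # illustrative reconstruction only (not the original author's code); they assume
    # get_batch() yields arrays that can be reshaped to [batch, n_steps, 1].
    init.run()
    for iteration in range(n_iterations):
        t_batch, y_batch = get_batch(iteration)
        feed = {X: t_batch.reshape(-1, n_steps, n_inputs),
                y: y_batch.reshape(-1, n_steps, n_outputs)}
        sess.run(training_op, feed_dict=feed)
        if iteration % 1000 == 0:
            print(iteration, "MSE:", loss.eval(feed_dict=feed))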
Example #15
def embedding_attention_seq2seq_context(encoder_inputs,
                                        decoder_inputs,
                                        cell,
                                        num_encoder_symbols,
                                        num_decoder_symbols,
                                        embedding_size,
                                        num_heads=1,
                                        output_projection=None,
                                        feed_previous=False,
                                        dtype=dtypes.float32,
                                        scope=None):
    """A seq2seq architecture with two encoders, one for context, one for input DA. The decoder
    uses twice the cell size. Code adapted from TensorFlow examples."""

    with vs.variable_scope(scope or "embedding_attention_seq2seq_context"):

        # split context and real inputs into separate vectors
        context_inputs = encoder_inputs[0:len(encoder_inputs) // 2]
        encoder_inputs = encoder_inputs[len(encoder_inputs) // 2:]

        # build separate encoders
        encoder_cell = EmbeddingWrapper(cell, num_encoder_symbols,
                                        embedding_size)
        with vs.variable_scope("context_rnn") as scope:
            context_outputs, context_states = tf06s2s.rnn(encoder_cell,
                                                          context_inputs,
                                                          dtype=dtype,
                                                          scope=scope)
        with vs.variable_scope("input_rnn") as scope:
            encoder_outputs, encoder_states = tf06s2s.rnn(encoder_cell,
                                                          encoder_inputs,
                                                          dtype=dtype,
                                                          scope=scope)

        # concatenate outputs & states
        # adding positional arguments and concatenating output, cell and hidden states
        encoder_outputs = [
            array_ops.concat([co, eo],
                             axis=1,
                             name="context-and-encoder-output")
            for co, eo in zip(context_outputs, encoder_outputs)
        ]
        encoder_states = [
            (array_ops.concat([c1, c2],
                              axis=1), array_ops.concat([h1, h2], axis=1))
            for (c1, h1), (c2, h2) in zip(context_states, encoder_states)
        ]

        # calculate a concatenation of encoder outputs to put attention on.
        top_states = [
            array_ops.reshape(e, [-1, 1, cell.output_size * 2])
            for e in encoder_outputs
        ]
        # added positional arguments because axis was otherwise being interpreted as values
        attention_states = array_ops.concat(axis=1, values=top_states)

        # change the decoder cell to accommodate wider input
        # TODO this will work for BasicLSTMCell and GRUCell, but not for others
        cell = type(cell)(num_units=(cell.output_size * 2))

        # Decoder.
        output_size = None
        if output_projection is None:
            cell = OutputProjectionWrapper(cell, num_decoder_symbols)
            output_size = num_decoder_symbols

        if isinstance(feed_previous, bool):
            return tf06s2s.embedding_attention_decoder(
                decoder_inputs, encoder_states[-1], attention_states, cell,
                num_decoder_symbols, embedding_size, num_heads, output_size,
                output_projection, feed_previous)
        else:  # If feed_previous is a Tensor, we construct 2 graphs and use cond.
            outputs1, states1 = tf06s2s.embedding_attention_decoder(
                decoder_inputs, encoder_states[-1], attention_states, cell,
                num_decoder_symbols, embedding_size, num_heads, output_size,
                output_projection, True)
            vs.get_variable_scope().reuse_variables()
            outputs2, states2 = tf06s2s.embedding_attention_decoder(
                decoder_inputs, encoder_states[-1], attention_states, cell,
                num_decoder_symbols, embedding_size, num_heads, output_size,
                output_projection, False)

            outputs = control_flow_ops.cond(feed_previous, lambda: outputs1,
                                            lambda: outputs2)
            states = control_flow_ops.cond(feed_previous, lambda: states1,
                                           lambda: states2)
            return outputs, states
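# A minimal sketch (not part of the original example, sizes are hypothetical) of
# how the encoder input list above is expected to be laid out: the first half of
# the list carries the context sequence and the second half the input DA
# sequence, each element being a per-timestep int32 tensor of shape [batch_size].
# Both halves must have the same length because the function splits the list in
# the middle.
import tensorflow as tf

context_len = 10  # hypothetical number of context timesteps
input_len = 10    # hypothetical number of input DA timesteps

context_steps = [tf.placeholder(tf.int32, [None], name='ctx_%d' % t)
                 for t in range(context_len)]
da_steps = [tf.placeholder(tf.int32, [None], name='da_%d' % t)
            for t in range(input_len)]

encoder_inputs = context_steps + da_steps  # context half first, then DA half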
Exemple #16
def embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell,
                                num_encoder_symbols, num_decoder_symbols,
                                embedding_size,
                                num_heads=1, output_projection=None,
                                feed_previous=False, dtype=dtypes.float32,
                                scope=None):
  """Embedding sequence-to-sequence model with attention.

  This model first embeds encoder_inputs by a newly created embedding (of shape
  [num_encoder_symbols x cell.input_size]). Then it runs an RNN to encode
  embedded encoder_inputs into a state vector. It keeps the outputs of this
  RNN at every step to use for attention later. Next, it embeds decoder_inputs
  by another newly created embedding (of shape [num_decoder_symbols x
  cell.input_size]). Then it runs attention decoder, initialized with the last
  encoder state, on embedded decoder_inputs and attending to encoder outputs.

  Args:
    encoder_inputs: a list of 1D int32 Tensors of shape [batch_size].
    decoder_inputs: a list of 1D int32 Tensors of shape [batch_size].
    cell: RNNCell defining the cell function and size.
    num_encoder_symbols: integer; number of symbols on the encoder side.
    num_decoder_symbols: integer; number of symbols on the decoder side.
    num_heads: number of attention heads that read from attention_states.
    output_projection: None or a pair (W, B) of output projection weights and
      biases; W has shape [cell.output_size x num_decoder_symbols] and B has
      shape [num_decoder_symbols]; if provided and feed_previous=True, each
      fed previous output will first be multiplied by W and added B.
    feed_previous: Boolean or scalar Boolean Tensor; if True, only the first
      of decoder_inputs will be used (the "GO" symbol), and all other decoder
      inputs will be taken from previous outputs (as in embedding_rnn_decoder).
      If False, decoder_inputs are used as given (the standard decoder case).
    dtype: The dtype of the initial RNN state (default: tf.float32).
    scope: VariableScope for the created subgraph; defaults to
      "embedding_attention_seq2seq".

  Returns:
    outputs: A list of the same length as decoder_inputs of 2D Tensors with
      shape [batch_size x num_decoder_symbols] containing the generated outputs.
    states: The state of each decoder cell in each time-step. This is a list
      with length len(decoder_inputs) -- one item for each time-step.
      Each item is a 2D Tensor of shape [batch_size x cell.state_size].
  """
  with vs.variable_scope(scope or "embedding_attention_seq2seq"):
    # Encoder.
    encoder_cell = EmbeddingWrapper(cell, num_encoder_symbols, embedding_size)
    encoder_outputs, encoder_states = rnn(
        encoder_cell, encoder_inputs, dtype=dtype)

    # First calculate a concatenation of encoder outputs to put attention on.
    top_states = [array_ops.reshape(e, [-1, 1, cell.output_size])
                  for e in encoder_outputs]
    attention_states = array_ops.concat(top_states, 1)

    # Decoder.
    output_size = None
    if output_projection is None:
      cell = OutputProjectionWrapper(cell, num_decoder_symbols)
      output_size = num_decoder_symbols

    if isinstance(feed_previous, bool):
      return embedding_attention_decoder(
          decoder_inputs, encoder_states[-1], attention_states, cell,
          num_decoder_symbols, embedding_size, num_heads, output_size,
          output_projection, feed_previous)
    else:  # If feed_previous is a Tensor, we construct 2 graphs and use cond.
      outputs1, states1 = embedding_attention_decoder(
          decoder_inputs, encoder_states[-1], attention_states, cell,
          num_decoder_symbols, embedding_size, num_heads, output_size,
          output_projection, True)
      vs.get_variable_scope().reuse_variables()
      outputs2, states2 = embedding_attention_decoder(
          decoder_inputs, encoder_states[-1], attention_states, cell,
          num_decoder_symbols, embedding_size, num_heads, output_size,
          output_projection, False)

      outputs = control_flow_ops.cond(feed_previous,
                                      lambda: outputs1, lambda: outputs2)
      states = control_flow_ops.cond(feed_previous,
                                     lambda: states1, lambda: states2)
      return outputs, states
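# A minimal sketch (an assumption, not from the source) of constructing the
# optional output_projection pair described in the docstring above: W maps the
# decoder cell output onto the target vocabulary and B is the per-symbol bias.
# Sizes are hypothetical.
import tensorflow as tf

cell_output_size = 256        # hypothetical cell.output_size
num_decoder_symbols = 10000   # hypothetical target vocabulary size

proj_w = tf.get_variable('proj_w', [cell_output_size, num_decoder_symbols])
proj_b = tf.get_variable('proj_b', [num_decoder_symbols])
output_projection = (proj_w, proj_b)
# When output_projection is supplied, the function above skips wrapping the cell
# in OutputProjectionWrapper and the decoder returns cell-sized outputs instead.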
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected
from tensorflow.contrib.rnn import BasicRNNCell, OutputProjectionWrapper

n_steps = 20
n_inputs = 1
n_neurons = 100
n_outputs = 1

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.int32, [None])

basic_cell = BasicRNNCell(num_units=n_neurons, activation=tf.nn.relu)
cell = OutputProjectionWrapper(basic_cell, output_size=n_outputs)
outputs, states = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)

logits = fully_connected(states, n_outputs, activation_fn=None)
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,
                                                               logits=logits)

learning_rate = 0.001
loss = tf.reduce_mean(cross_entropy)
optimizer = tf.train.AdamOptimizer(learning_rate)

train_op = optimizer.minimize(loss)

correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()
    def initialize(self,
                   inputs,
                   input_lengths,
                   num_speakers,
                   speaker_id=None,
                   mel_targets=None,
                   linear_targets=None,
                   is_training=False,
                   loss_coeff=None,
                   stop_token_targets=None):

        with tf.variable_scope('Embedding') as scope:
            hp = self._hparams
            batch_size = tf.shape(inputs)[0]

            # Embeddings(256)
            char_embed_table = tf.get_variable(
                'inputs_embedding', [len(symbols), hp.embedding_size],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))

            zero_pad = True
            if zero_pad:  # logic borrowed from a Transformer implementation
                # The <PAD> token (index 0) gets an embedding fixed to zero that is never updated during
                # training, i.e. the first row (<PAD>) of the variable created by get_variable above is never used.
                char_embed_table = tf.concat(
                    (tf.zeros(shape=[1, hp.embedding_size]),
                     char_embed_table[1:, :]), 0)

            # [N, T_in, embedding_size]
            char_embedded_inputs = tf.nn.embedding_lookup(
                char_embed_table, inputs)

            self.num_speakers = num_speakers
            if self.num_speakers > 1:
                speaker_embed_table = tf.get_variable(
                    'speaker_embedding',
                    [self.num_speakers, hp.speaker_embedding_size],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer(stddev=0.5))
                # [N, T_in, speaker_embedding_size]
                speaker_embed = tf.nn.embedding_lookup(speaker_embed_table,
                                                       speaker_id)

                deep_dense = lambda x, dim, name: tf.layers.dense(
                    x, dim, activation=tf.nn.softsign, name=name
                )  # softsign: x / (abs(x) + 1)

                encoder_rnn_init_state = deep_dense(
                    speaker_embed, hp.encoder_lstm_units * 4,
                    'encoder_init_dense')  # hp.encoder_lstm_units = 256

                decoder_rnn_init_states = [
                    deep_dense(speaker_embed, hp.decoder_lstm_units * 2,
                               'decoder_init_dense_{}'.format(i))
                    for i in range(hp.decoder_layers)
                ]  # hp.decoder_lstm_units = 1024

                speaker_embed = None
            else:
                # single-speaker case (self.num_speakers == 1)
                speaker_embed = None
                encoder_rnn_init_state = None  # initial state of the bidirectional encoder RNN
                attention_rnn_init_state = None
                decoder_rnn_init_states = None

        with tf.variable_scope('Encoder') as scope:
            ##############
            # Encoder
            ##############
            x = char_embedded_inputs
            for i in range(hp.enc_conv_num_layers):
                x = tf.layers.conv1d(x,
                                     filters=hp.enc_conv_channels,
                                     kernel_size=hp.enc_conv_kernel_size,
                                     padding='same',
                                     activation=tf.nn.relu,
                                     name='Encoder_{}'.format(i))
                x = tf.layers.batch_normalization(x, training=is_training)
                x = tf.layers.dropout(x,
                                      rate=hp.dropout_prob,
                                      training=is_training,
                                      name='dropout_{}'.format(i))

            if encoder_rnn_init_state is not None:
                initial_state_fw_c, initial_state_fw_h, initial_state_bw_c, initial_state_bw_h = tf.split(
                    encoder_rnn_init_state, 4, 1)
                initial_state_fw = LSTMStateTuple(initial_state_fw_c,
                                                  initial_state_fw_h)
                initial_state_bw = LSTMStateTuple(initial_state_bw_c,
                                                  initial_state_bw_h)
            else:  # single mode
                initial_state_fw, initial_state_bw = None, None

            cell_fw = ZoneoutLSTMCell(
                hp.encoder_lstm_units,
                is_training,
                zoneout_factor_cell=hp.tacotron_zoneout_rate,
                zoneout_factor_output=hp.tacotron_zoneout_rate,
                name='encoder_fw_LSTM')
            cell_bw = ZoneoutLSTMCell(
                hp.encoder_lstm_units,
                is_training,
                zoneout_factor_cell=hp.tacotron_zoneout_rate,
                zoneout_factor_output=hp.tacotron_zoneout_rate,
                name='encoder_bw_LSTM')
            encoder_conv_output = x
            outputs, states = tf.nn.bidirectional_dynamic_rnn(
                cell_fw,
                cell_bw,
                encoder_conv_output,
                sequence_length=input_lengths,
                initial_state_fw=initial_state_fw,
                initial_state_bw=initial_state_bw,
                dtype=tf.float32)

            # encoder_outputs = [N, T, 2*encoder_lstm_units] = [N, T, 512]
            encoder_outputs = tf.concat(
                outputs,
                axis=2)  # Concat and return forward + backward outputs

        with tf.variable_scope('Decoder') as scope:

            ##############
            # Attention
            ##############
            if hp.attention_type == 'bah_mon':
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=False)
            elif hp.attention_type == 'bah_mon_norm':  # added by hccho
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'loc_sen':  # Location Sensitive Attention
                attention_mechanism = LocationSensitiveAttention(
                    hp.attention_size,
                    encoder_outputs,
                    hparams=hp,
                    is_training=is_training,
                    mask_encoder=hp.mask_encoder,
                    memory_sequence_length=input_lengths,
                    smoothing=hp.smoothing,
                    cumulate_weights=hp.cumulative_weights)
            elif hp.attention_type == 'gmm':  # GMM Attention
                attention_mechanism = GmmAttention(
                    hp.attention_size,
                    memory=encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'bah_norm':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    normalize=True)
            elif hp.attention_type == 'luong_scaled':
                attention_mechanism = LuongAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths,
                    scale=True)
            elif hp.attention_type == 'luong':
                attention_mechanism = LuongAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            elif hp.attention_type == 'bah':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size,
                    encoder_outputs,
                    memory_sequence_length=input_lengths)
            else:
                raise Exception(" [!] Unkown attention type: {}".format(
                    hp.attention_type))

            decoder_lstm = [
                ZoneoutLSTMCell(hp.decoder_lstm_units,
                                is_training,
                                zoneout_factor_cell=hp.tacotron_zoneout_rate,
                                zoneout_factor_output=hp.tacotron_zoneout_rate,
                                name='decoder_LSTM_{}'.format(i + 1))
                for i in range(hp.decoder_layers)
            ]

            decoder_lstm = tf.contrib.rnn.MultiRNNCell(decoder_lstm,
                                                       state_is_tuple=True)
            decoder_init_state = decoder_lstm.zero_state(
                batch_size=batch_size, dtype=tf.float32
            )  # calling zero_state here also includes the values already provided to the AttentionWrapper

            if hp.model_type == "multi-speaker":

                decoder_init_state = list(decoder_init_state)

                for idx, cell in enumerate(decoder_rnn_init_states):
                    shape1 = decoder_init_state[idx][0].get_shape().as_list()
                    shape2 = cell.get_shape().as_list()
                    if shape1[1] * 2 != shape2[1]:
                        raise Exception(
                            " [!] Shape {} and {} should be equal".format(
                                shape1, shape2))
                    c, h = tf.split(cell, 2, 1)
                    decoder_init_state[idx] = LSTMStateTuple(c, h)

                decoder_init_state = tuple(decoder_init_state)

            attention_cell = AttentionWrapper(
                decoder_lstm,
                attention_mechanism,
                initial_cell_state=decoder_init_state,
                alignment_history=True,
                output_attention=False
            )  # note output_attention=False; attention_layer_size is not set, so attention equals the context vector

            # attention_state_size = 256
            # Decoder input -> prenet -> decoder_lstm -> concat[output, attention]
            dec_prenet_outputs = DecoderWrapper(attention_cell, is_training,
                                                hp.dec_prenet_sizes,
                                                hp.dropout_prob,
                                                hp.inference_prenet_dropout)

            dec_outputs_cell = OutputProjectionWrapper(
                dec_prenet_outputs, (hp.num_mels + 1) * hp.reduction_factor)

            if is_training:
                helper = TacoTrainingHelper(
                    mel_targets, hp.num_mels,
                    hp.reduction_factor)  # inputs is used only to compute batch_size
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.reduction_factor)

            decoder_init_state = dec_outputs_cell.zero_state(
                batch_size=batch_size, dtype=tf.float32)
            (decoder_outputs, _), final_decoder_state, _ = \
                tf.contrib.seq2seq.dynamic_decode(
                    BasicDecoder(dec_outputs_cell, helper, decoder_init_state),
                    maximum_iterations=int(hp.max_n_frame / hp.reduction_factor))  # max_iters=200

            decoder_mel_outputs = tf.reshape(
                decoder_outputs[:, :, :hp.num_mels * hp.reduction_factor],
                [batch_size, -1, hp.num_mels
                 ])  # [N,iters,400] -> [N,5*iters,80]
            stop_token_outputs = tf.reshape(
                decoder_outputs[:, :, hp.num_mels * hp.reduction_factor:],
                [batch_size, -1])  # [N,iters]

            # Postnet
            x = decoder_mel_outputs
            for i in range(hp.postnet_num_layers):
                activation = tf.nn.tanh if i != (hp.postnet_num_layers -
                                                 1) else None
                x = tf.layers.conv1d(x,
                                     filters=hp.postnet_channels,
                                     kernel_size=hp.postnet_kernel_size,
                                     padding='same',
                                     activation=activation,
                                     name='Postnet_{}'.format(i))
                x = tf.layers.batch_normalization(x, training=is_training)
                x = tf.layers.dropout(x,
                                      rate=hp.dropout_prob,
                                      training=is_training,
                                      name='Postnet_dropout_{}'.format(i))

            residual = tf.layers.dense(x,
                                       hp.num_mels,
                                       name='residual_projection')
            mel_outputs = decoder_mel_outputs + residual

            # Add post-processing CBHG:
            # mel_outputs: (N,T,num_mels)
            post_outputs = cbhg(mel_outputs,
                                None,
                                is_training,
                                hp.post_bank_size,
                                hp.post_bank_channel_size,
                                hp.post_maxpool_width,
                                hp.post_highway_depth,
                                hp.post_rnn_size,
                                hp.post_proj_sizes,
                                hp.post_proj_width,
                                scope='post_cbhg')

            linear_outputs = tf.layers.dense(
                post_outputs, hp.num_freq,
                name='linear_spectogram_projection')  # [N, T_out, F(1025)]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(),
                [1, 2, 0
                 ])  # batch_size, text length(encoder), target length(decoder)

            self.inputs = inputs
            self.speaker_id = speaker_id
            self.input_lengths = input_lengths
            self.loss_coeff = loss_coeff
            self.decoder_mel_outputs = decoder_mel_outputs
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.final_decoder_state = final_decoder_state
            self.stop_token_targets = stop_token_targets
            self.stop_token_outputs = stop_token_outputs
            self.all_vars = tf.trainable_variables()
            log('=' * 40)
            log(' model_type: %s' % hp.model_type)
            log('=' * 40)

            log('Initialized Tacotron model. Dimensions: ')
            log('    embedding:                %d' %
                char_embedded_inputs.shape[-1])
            log('    encoder conv out:               %d' %
                encoder_conv_output.shape[-1])
            log('    encoder out:              %d' % encoder_outputs.shape[-1])
            log('    attention out:            %d' %
                attention_cell.output_size)
            log('    decoder prenet lstm concat out :        %d' %
                dec_prenet_outputs.output_size)
            log('    decoder cell out:         %d' %
                dec_outputs_cell.output_size)
            log('    decoder out (%d frames):  %d' %
                (hp.reduction_factor, decoder_outputs.shape[-1]))
            log('    decoder mel out:    %d' % decoder_mel_outputs.shape[-1])
            log('    mel out:    %d' % mel_outputs.shape[-1])
            log('    postnet out:              %d' % post_outputs.shape[-1])
            log('    linear out:               %d' % linear_outputs.shape[-1])
            log('  Tacotron Parameters       {:.3f} Million.'.format(
                np.sum(
                    [np.prod(v.get_shape().as_list())
                     for v in self.all_vars]) / 1000000))
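# A small standalone shape check (illustrative only, with hypothetical sizes) of
# the reduction-factor reshape used above: each decoder step emits
# reduction_factor frames of (num_mels + 1) values, which are split into mel
# frames and per-frame stop tokens.
import numpy as np

N, iters, num_mels, r = 2, 4, 80, 5
decoder_outputs = np.zeros((N, iters, (num_mels + 1) * r), dtype=np.float32)

decoder_mel_outputs = decoder_outputs[:, :, :num_mels * r].reshape(N, -1, num_mels)
stop_token_outputs = decoder_outputs[:, :, num_mels * r:].reshape(N, -1)

assert decoder_mel_outputs.shape == (N, iters * r, num_mels)  # [N, r*iters, 80]
assert stop_token_outputs.shape == (N, iters * r)             # [N, r*iters]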
Exemple #19
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   pml_targets=None,
                   gta=False,
                   locked_alignments=None,
                   logs_enabled=True):
        '''Initializes the model for inference.

        Sets "pml_outputs", and "alignments" fields.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
          pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is number of
            steps in the PML vocoder features trajectories, P is pml_dimension, and values are PML vocoder
            features. Only needed for training.
          gta: boolean flag that is set to True when ground truth alignment is required
          locked_alignments: when explicit attention alignment is required, the locked alignments are passed in this
            parameter and the attention alignments are locked to these values
          logs_enabled: boolean flag that defaults to True, if False no construction logs output
        '''
        with tf.variable_scope('inference') as scope:
            is_training = pml_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            # Encoder
            prenet_outputs = prenet(
                embedded_inputs, is_training,
                hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
            encoder_outputs = encoder_cbhg(
                prenet_outputs,
                input_lengths,
                is_training,  # [N, T_in, encoder_depth=256]
                hp.encoder_depth)

            # Attention
            attention_cell = AttentionWrapper(
                GRUCell(hp.attention_depth),
                BahdanauAttention(hp.attention_depth, encoder_outputs),
                alignment_history=True,
                output_attention=False)  # [N, T_in, attention_depth=256]

            # Apply prenet before concatenation in AttentionWrapper.
            attention_cell = DecoderPrenetWrapper(attention_cell, is_training,
                                                  hp.prenet_depths)

            # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 2*attention_depth=512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell(
                [
                    OutputProjectionWrapper(concat_cell, hp.decoder_depth),
                    ResidualWrapper(GRUCell(hp.decoder_depth)),
                    ResidualWrapper(GRUCell(hp.decoder_depth))
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Project onto r PML feature vectors (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.pml_dimension * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if is_training or gta:
                helper = TacoTrainingHelper(inputs, pml_targets,
                                            hp.pml_dimension,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.pml_dimension,
                                        hp.outputs_per_step)

            (multi_decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

            # Reshape outputs to be one output per entry
            decoder_outputs = tf.reshape(
                multi_decoder_outputs,
                [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

            # Postnet: predicts a residual
            postnet_outputs = postnet(decoder_outputs,
                                      layers=hp.postnet_conv_layers,
                                      conv_width=hp.postnet_conv_width,
                                      channels=hp.postnet_conv_channels,
                                      is_training=is_training)

            pml_outputs = decoder_outputs + postnet_outputs

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.pml_outputs = pml_outputs
            self.alignments = alignments
            self.pml_targets = pml_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.outputs_per_step, multi_decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % pml_outputs.shape[-1])
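# Illustrative shape check (hypothetical sizes, not from the source) for the
# alignment transpose above: AttentionWrapper's alignment_history stacks one
# [N, T_enc] alignment per decoder step, giving [T_dec, N, T_enc]; transposing
# with [1, 2, 0] yields [N, T_enc, T_dec], i.e. batch x input length x output length.
import numpy as np

T_dec, N, T_enc = 7, 2, 13
stacked_history = np.zeros((T_dec, N, T_enc), dtype=np.float32)
alignments = np.transpose(stacked_history, [1, 2, 0])
assert alignments.shape == (N, T_enc, T_dec)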
Exemple #20
    def initialize(self, text_inputs, input_lengths, speaker_ids, mel_targets=None, linear_targets=None):
        '''Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields.

        Args:
          text_inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          speaker_ids: int32 Tensor containing ids of specific speakers
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
        '''
        with tf.variable_scope('inference'):
            is_training = linear_targets is not None
            batch_size = tf.shape(text_inputs)[0]
            hp = self._hparams
            vocab_size = len(symbols)
            embedded_inputs = embedding(text_inputs, vocab_size, hp.embedding_dim)  # [N, T_in, embd_size]

            # extract speaker embedding if multi-speaker
            with tf.variable_scope('speaker'):
                if hp.num_speakers > 1:
                    speaker_embedding = tf.get_variable('speaker_embed',
                                                        shape=(hp.num_speakers, hp.speaker_embed_dim),
                                                        dtype=tf.float32)
                    # TODO: what about special initializer=tf.truncated_normal_initializer(stddev=0.5)?
                    speaker_embd = tf.nn.embedding_lookup(speaker_embedding, speaker_ids)
                else:
                    speaker_embd = None
            # Encoder
            prenet_outputs = prenet(inputs=embedded_inputs,
                                    drop_rate=hp.drop_rate if is_training else 0.0,
                                    is_training=is_training,
                                    layer_sizes=hp.encoder_prenet,
                                    scope="prenet")  # [N, T_in, 128]
            encoder_outputs = cbhg(prenet_outputs, input_lengths,
                                   speaker_embd=speaker_embd,
                                   is_training=is_training,
                                   K=hp.encoder_cbhg_banks,
                                   c=hp.encoder_cbhg_bank_sizes,  # [N, T_in, 256]
                                   scope='encoder_cbhg')

            # Attention Mechanism
            attention_cell = attention_decoder(encoder_outputs, hp.attention_dim, input_lengths, is_training,
                                               speaker_embd=speaker_embd, attention_type=hp.attention_type)

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                OutputProjectionWrapper(attention_cell, hp.decoder_dim),  # 256
                ResidualWrapper(GRUCell(hp.decoder_dim)),  # 256
                ResidualWrapper(GRUCell(hp.decoder_dim))  # 256
            ], state_is_tuple=True)  # [N, T_in, 256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

            if is_training:
                helper = TacoTrainingHelper(text_inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

            (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                    BasicDecoder(output_cell, helper, decoder_init_state),
                    maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing
            post_outputs = cbhg(mel_outputs, None,
                                speaker_embd=None,
                                is_training=is_training,
                                K=hp.post_cbhg_banks,
                                c=hp.post_cbhg_bank_sizes + [hp.num_mels],
                                scope='post_cbhg')  # [N, T_out, 256]
            linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = text_inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.audio = audio.inv_spectrogram_tensorflow(linear_outputs)
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            # TODO: later work around for getting info back?
            # log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % attention_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
Exemple #21
    def initialize(self, inputs, input_lengths, num_speakers, speaker_id,
            mel_targets=None, linear_targets=None, loss_coeff=None,
            rnn_decoder_test_mode=False, is_randomly_initialized=False):
        '''Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
        '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            self.batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)  # [N, T_in, 512]

            # Encoder
            encoder_outputs = conv_and_lstm(
                embedded_inputs,
                input_lengths,
                conv_layers=hp.encoder_conv_layers,
                conv_width=hp.encoder_conv_width,
                conv_channels=hp.encoder_conv_channels,
                lstm_units=hp.encoder_lstm_units,
                is_training=is_training,
                scope='encoder')  # [N, T_in, 512]

            # Attention
            # For manual control of attention
            self.is_manual_attention = tf.placeholder(
                tf.bool, shape=(), name='is_manual_attention',
            )
            self.manual_alignments = tf.placeholder(
                tf.float32, shape=[None, None, None], name="manual_alignments",
            )

            attention_cell = AttentionWrapper(
                DecoderPrenetWrapper(LSTMBlockCell(hp.attention_depth), is_training),
                LocationSensitiveAttention(hp.attention_depth, encoder_outputs),
                alignment_history=True,
                output_attention=False)  # [N, T_in, 128]

            # Concatenate attention context vector and RNN cell output into a 512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)  # [N, T_in, 512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell([
                concat_cell,
                LSTMBlockCell(hp.decoder_lstm_units),
                LSTMBlockCell(hp.decoder_lstm_units)
            ], state_is_tuple=True)  # [N, T_in, 1024]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
            else:
                helper = TacoTestHelper(self.batch_size, hp.num_mels, hp.outputs_per_step)

            decoder_init_state = output_cell.zero_state(batch_size=self.batch_size, dtype=tf.float32)
            (multi_decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                BasicDecoder(output_cell, helper, decoder_init_state),
                maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry                                [N, T_out, M]
            decoder_outputs = tf.reshape(multi_decoder_outputs, [self.batch_size, -1, hp.num_mels])

            # Postnet: predicts a residual
            postnet_outputs = postnet(
                decoder_outputs,
                layers=hp.postnet_conv_layers,
                conv_width=hp.postnet_conv_width,
                channels=hp.postnet_conv_channels,
                is_training=is_training)
            mel_outputs = decoder_outputs + postnet_outputs

            # Convert to linear using a similar architecture as the encoder:
            expand_outputs = conv_and_lstm(
                mel_outputs,
                None,
                conv_layers=hp.expand_conv_layers,
                conv_width=hp.expand_conv_width,
                conv_channels=hp.expand_conv_channels,
                lstm_units=hp.expand_lstm_units,
                is_training=is_training,
                scope='expand')  # [N, T_in, 512]
            linear_outputs = tf.layers.dense(expand_outputs, hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_outputs = decoder_outputs
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  expand out:              %d' % expand_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
Exemple #22
    def __init__(self, sess, config, api, log_dir, forward, scope=None):
        self.vocab = api.vocab
        self.rev_vocab = api.rev_vocab
        self.vocab_size = len(self.vocab)
        self.topic_vocab = api.topic_vocab
        self.topic_vocab_size = len(self.topic_vocab)

        self.sess = sess
        self.scope = scope
        self.max_utt_len = config.max_utt_len
        self.go_id = self.rev_vocab["<s>"]
        self.eos_id = self.rev_vocab["</s>"]
        self.context_cell_size = config.cxt_cell_size
        self.sent_cell_size = config.sent_cell_size
        self.dec_cell_size = config.dec_cell_size
        self.bow_weights = config.bow_weights

        with tf.name_scope("io"):
            # all dialog context and known attributes
            self.input_contexts = tf.placeholder(dtype=tf.int32,
                                                 shape=(None, None,
                                                        self.max_utt_len),
                                                 name="context")
            self.context_lens = tf.placeholder(dtype=tf.int32,
                                               shape=(None, ),
                                               name="context_lens")

            # target response given the dialog context
            self.output_tokens = tf.placeholder(dtype=tf.int32,
                                                shape=(None, None),
                                                name="output_token")
            self.output_lens = tf.placeholder(dtype=tf.int32,
                                              shape=(None, ),
                                              name="output_lens")
            self.output_topics = tf.placeholder(dtype=tf.int32,
                                                shape=(None, ),
                                                name="output_topic")

            # optimization related variables
            self.learning_rate = tf.Variable(float(config.init_lr),
                                             trainable=False,
                                             name="learning_rate")
            self.learning_rate_decay_op = self.learning_rate.assign(
                tf.multiply(self.learning_rate, config.lr_decay))
            self.global_t = tf.placeholder(dtype=tf.int32, name="global_t")
            self.use_prior = tf.placeholder(dtype=tf.bool, name="use_prior")

        max_context_len = array_ops.shape(self.input_contexts)[1]
        max_out_len = array_ops.shape(self.output_tokens)[1]
        batch_size = array_ops.shape(self.input_contexts)[0]

        if config.use_hcf:
            with variable_scope.variable_scope("topicEmbedding"):
                t_embedding = tf.get_variable(
                    "embedding",
                    [self.topic_vocab_size, config.topic_embed_size],
                    dtype=tf.float32)
                topic_embedding = embedding_ops.embedding_lookup(
                    t_embedding, self.output_topics)

        with variable_scope.variable_scope("wordEmbedding"):
            self.embedding = tf.get_variable(
                "embedding", [self.vocab_size, config.embed_size],
                dtype=tf.float32)
            embedding_mask = tf.constant(
                [0 if i == 0 else 1 for i in range(self.vocab_size)],
                dtype=tf.float32,
                shape=[self.vocab_size, 1])
            embedding = self.embedding * embedding_mask

            input_embedding = embedding_ops.embedding_lookup(
                embedding, tf.reshape(self.input_contexts, [-1]))
            input_embedding = tf.reshape(
                input_embedding, [-1, self.max_utt_len, config.embed_size])
            output_embedding = embedding_ops.embedding_lookup(
                embedding, self.output_tokens)

            # context nn
            if config.sent_type == "bow":
                input_embedding, sent_size = get_bow(input_embedding)
                output_embedding, _ = get_bow(output_embedding)

            elif config.sent_type == "rnn":
                sent_cell = self.get_rnncell("gru", self.sent_cell_size,
                                             config.keep_prob, 1)
                input_embedding, sent_size = get_rnn_encode(input_embedding,
                                                            sent_cell,
                                                            scope="sent_rnn")
                output_embedding, _ = get_rnn_encode(output_embedding,
                                                     sent_cell,
                                                     self.output_lens,
                                                     scope="sent_rnn",
                                                     reuse=True)
            elif config.sent_type == "bi_rnn":
                fwd_sent_cell = self.get_rnncell("gru",
                                                 self.sent_cell_size,
                                                 keep_prob=1.0,
                                                 num_layer=1)
                bwd_sent_cell = self.get_rnncell("gru",
                                                 self.sent_cell_size,
                                                 keep_prob=1.0,
                                                 num_layer=1)
                input_embedding, sent_size = get_bi_rnn_encode(
                    input_embedding,
                    fwd_sent_cell,
                    bwd_sent_cell,
                    scope="sent_bi_rnn")
                output_embedding, _ = get_bi_rnn_encode(output_embedding,
                                                        fwd_sent_cell,
                                                        bwd_sent_cell,
                                                        self.output_lens,
                                                        scope="sent_bi_rnn",
                                                        reuse=True)
            else:
                raise ValueError(
                    "Unknown sent_type. Must be one of [bow, rnn, bi_rnn]")

            # reshape input into dialogs
            input_embedding = tf.reshape(input_embedding,
                                         [-1, max_context_len, sent_size])
            if config.keep_prob < 1.0:
                input_embedding = tf.nn.dropout(input_embedding,
                                                config.keep_prob)

        with variable_scope.variable_scope("contextRNN"):
            enc_cell = self.get_rnncell(config.cell_type,
                                        self.context_cell_size,
                                        keep_prob=1.0,
                                        num_layer=config.num_layer)
            # with sequence_length set, enc_last_state is the true last state of each sequence
            _, enc_last_state = tf.nn.dynamic_rnn(
                enc_cell,
                input_embedding,
                dtype=tf.float32,
                sequence_length=self.context_lens)

            if config.num_layer > 1:
                enc_last_state = tf.concat(enc_last_state, 1)

        # combine with other attributes
        if config.use_hcf:
            attribute_embedding = topic_embedding
            attribute_fc1 = layers.fully_connected(attribute_embedding,
                                                   30,
                                                   activation_fn=tf.tanh,
                                                   scope="attribute_fc1")

        cond_embedding = enc_last_state

        with variable_scope.variable_scope("recognitionNetwork"):
            if config.use_hcf:
                recog_input = tf.concat(
                    [cond_embedding, output_embedding, attribute_fc1], 1)
            else:
                recog_input = tf.concat([cond_embedding, output_embedding], 1)
            self.recog_mulogvar = recog_mulogvar = layers.fully_connected(
                recog_input,
                config.latent_size * 2,
                activation_fn=None,
                scope="muvar")
            recog_mu, recog_logvar = tf.split(recog_mulogvar, 2, axis=1)

        with variable_scope.variable_scope("priorNetwork"):
            prior_fc1 = layers.fully_connected(cond_embedding,
                                               np.maximum(
                                                   config.latent_size * 2,
                                                   100),
                                               activation_fn=tf.tanh,
                                               scope="fc1")
            prior_mulogvar = layers.fully_connected(prior_fc1,
                                                    config.latent_size * 2,
                                                    activation_fn=None,
                                                    scope="muvar")
            prior_mu, prior_logvar = tf.split(prior_mulogvar, 2, axis=1)

            # use sampled Z or posterior Z
            latent_sample = tf.cond(
                self.use_prior,
                lambda: sample_gaussian(prior_mu, prior_logvar),
                lambda: sample_gaussian(recog_mu, recog_logvar))

        with variable_scope.variable_scope("generationNetwork"):
            gen_inputs = tf.concat([cond_embedding, latent_sample], 1)

            # BOW loss
            bow_fc1 = layers.fully_connected(gen_inputs,
                                             400,
                                             activation_fn=tf.tanh,
                                             scope="bow_fc1")
            if config.keep_prob < 1.0:
                bow_fc1 = tf.nn.dropout(bow_fc1, config.keep_prob)
            self.bow_logits = layers.fully_connected(bow_fc1,
                                                     self.vocab_size,
                                                     activation_fn=None,
                                                     scope="bow_project")

            # Y loss
            if config.use_hcf:
                meta_fc1 = layers.fully_connected(latent_sample,
                                                  400,
                                                  activation_fn=tf.tanh,
                                                  scope="meta_fc1")
                if config.keep_prob < 1.0:
                    meta_fc1 = tf.nn.dropout(meta_fc1, config.keep_prob)
                self.topic_logits = layers.fully_connected(
                    meta_fc1, self.topic_vocab_size, scope="topic_project")
                topic_prob = tf.nn.softmax(self.topic_logits)
                #pred_attribute_embedding = tf.matmul(topic_prob, t_embedding)
                pred_topic = tf.argmax(topic_prob, 1)
                pred_attribute_embedding = embedding_ops.embedding_lookup(
                    t_embedding, pred_topic)
                if forward:
                    selected_attribute_embedding = pred_attribute_embedding
                else:
                    selected_attribute_embedding = attribute_embedding
                dec_inputs = tf.concat(
                    [gen_inputs, selected_attribute_embedding], 1)
            else:
                self.topic_logits = tf.zeros(
                    (batch_size, self.topic_vocab_size))
                selected_attribute_embedding = None
                dec_inputs = gen_inputs

            # Decoder
            if config.num_layer > 1:
                dec_init_state = [
                    layers.fully_connected(dec_inputs,
                                           self.dec_cell_size,
                                           activation_fn=None,
                                           scope="init_state-%d" % i)
                    for i in range(config.num_layer)
                ]
                dec_init_state = tuple(dec_init_state)
            else:
                dec_init_state = layers.fully_connected(dec_inputs,
                                                        self.dec_cell_size,
                                                        activation_fn=None,
                                                        scope="init_state")

        with variable_scope.variable_scope("decoder"):
            dec_cell = self.get_rnncell(config.cell_type, self.dec_cell_size,
                                        config.keep_prob, config.num_layer)
            dec_cell = OutputProjectionWrapper(dec_cell, self.vocab_size)

            if forward:
                loop_func = decoder_fn_lib.context_decoder_fn_inference(
                    None,
                    dec_init_state,
                    embedding,
                    start_of_sequence_id=self.go_id,
                    end_of_sequence_id=self.eos_id,
                    maximum_length=self.max_utt_len,
                    num_decoder_symbols=self.vocab_size,
                    context_vector=selected_attribute_embedding)
                dec_input_embedding = None
                dec_seq_lens = None
            else:
                loop_func = decoder_fn_lib.context_decoder_fn_train(
                    dec_init_state, selected_attribute_embedding)
                dec_input_embedding = embedding_ops.embedding_lookup(
                    embedding, self.output_tokens)
                dec_input_embedding = dec_input_embedding[:, 0:-1, :]
                dec_seq_lens = self.output_lens - 1

                if config.keep_prob < 1.0:
                    dec_input_embedding = tf.nn.dropout(
                        dec_input_embedding, config.keep_prob)

                # apply word dropping. Set dropped word to 0
                if config.dec_keep_prob < 1.0:
                    keep_mask = tf.less_equal(
                        tf.random_uniform((batch_size, max_out_len - 1),
                                          minval=0.0,
                                          maxval=1.0), config.dec_keep_prob)
                    keep_mask = tf.expand_dims(tf.to_float(keep_mask), 2)
                    dec_input_embedding = dec_input_embedding * keep_mask
                    dec_input_embedding = tf.reshape(
                        dec_input_embedding,
                        [-1, max_out_len - 1, config.embed_size])

            dec_outs, _, final_context_state = dynamic_rnn_decoder(
                dec_cell,
                loop_func,
                inputs=dec_input_embedding,
                sequence_length=dec_seq_lens)
            if final_context_state is not None:
                final_context_state = final_context_state[
                    :, 0:array_ops.shape(dec_outs)[1]]
                mask = tf.to_int32(tf.sign(tf.reduce_max(dec_outs, axis=2)))
                self.dec_out_words = tf.multiply(
                    tf.reverse(final_context_state, axis=[1]), mask)
            else:
                self.dec_out_words = tf.argmax(dec_outs, 2)

        if not forward:
            with variable_scope.variable_scope("loss"):
                labels = self.output_tokens[:, 1:]
                label_mask = tf.to_float(tf.sign(labels))

                rc_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=dec_outs, labels=labels)
                rc_loss = tf.reduce_sum(rc_loss * label_mask,
                                        reduction_indices=1)
                self.avg_rc_loss = tf.reduce_mean(rc_loss)
                # used only for perplexity calculation. Not used for optimization
                self.rc_ppl = tf.exp(
                    tf.reduce_sum(rc_loss) / tf.reduce_sum(label_mask))
                """ as n-trial multimodal distribution. """
                tile_bow_logits = tf.tile(tf.expand_dims(self.bow_logits, 1),
                                          [1, max_out_len - 1, 1])
                bow_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=tile_bow_logits, labels=labels) * label_mask
                bow_loss = tf.reduce_sum(bow_loss, reduction_indices=1)
                self.avg_bow_loss = tf.reduce_mean(bow_loss)
                bow_weights = tf.to_float(self.bow_weights)

                # reconstruct the meta info about X
                if config.use_hcf:
                    topic_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                        logits=self.topic_logits, labels=self.output_topics)
                    self.avg_topic_loss = tf.reduce_mean(topic_loss)
                else:
                    self.avg_topic_loss = 0.0

                kld = gaussian_kld(recog_mu, recog_logvar, prior_mu,
                                   prior_logvar)
                self.avg_kld = tf.reduce_mean(kld)
                if log_dir is not None:
                    kl_weights = tf.minimum(
                        tf.to_float(self.global_t) / config.full_kl_step, 1.0)
                else:
                    kl_weights = tf.constant(1.0)

                self.kl_w = kl_weights
                self.elbo = self.avg_rc_loss + kl_weights * self.avg_kld
                aug_elbo = bow_weights * self.avg_bow_loss + self.avg_topic_loss + self.elbo

                tf.summary.scalar("topic_loss", self.avg_topic_loss)
                tf.summary.scalar("rc_loss", self.avg_rc_loss)
                tf.summary.scalar("elbo", self.elbo)
                tf.summary.scalar("kld", self.avg_kld)
                tf.summary.scalar("bow_loss", self.avg_bow_loss)

                self.summary_op = tf.summary.merge_all()

                self.log_p_z = norm_log_liklihood(latent_sample, prior_mu,
                                                  prior_logvar)
                self.log_q_z_xy = norm_log_liklihood(latent_sample, recog_mu,
                                                     recog_logvar)
                self.est_marginal = tf.reduce_mean(rc_loss + bow_loss -
                                                   self.log_p_z +
                                                   self.log_q_z_xy)

            self.optimize(sess, config, aug_elbo, log_dir)

        self.saver = tf.train.Saver(tf.global_variables(),
                                    write_version=tf.train.SaverDef.V2)
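# The loss block above calls gaussian_kld and norm_log_liklihood, which are not
# part of this snippet. A minimal sketch of what they presumably compute,
# assuming diagonal Gaussians parameterized by (mu, logvar) as in the calls above:
import numpy as np
import tensorflow as tf

def gaussian_kld(recog_mu, recog_logvar, prior_mu, prior_logvar):
    # KL(q || p) between two diagonal Gaussians, summed over latent dimensions.
    return -0.5 * tf.reduce_sum(
        1 + (recog_logvar - prior_logvar)
        - tf.pow(prior_mu - recog_mu, 2) / tf.exp(prior_logvar)
        - tf.exp(recog_logvar) / tf.exp(prior_logvar), axis=1)

def norm_log_liklihood(x, mu, logvar):
    # Log-density of x under a diagonal Gaussian N(mu, exp(logvar)), summed over dims.
    return -0.5 * tf.reduce_sum(
        np.log(2 * np.pi) + logvar + tf.pow(x - mu, 2) / tf.exp(logvar), axis=1)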
Example #23
    def __init__(self, sess, config, api, log_dir, forward, scope=None):
        self.vocab = api.vocab
        self.rev_vocab = api.rev_vocab
        self.vocab_size = len(self.vocab)
        self.sess = sess
        self.scope = scope
        self.max_utt_len = config.max_utt_len
        self.go_id = self.rev_vocab["<s>"]
        self.eos_id = self.rev_vocab["</s>"]
        self.context_cell_size = config.cxt_cell_size
        self.sent_cell_size = config.sent_cell_size
        self.dec_cell_size = config.dec_cell_size
        self.num_topics = config.num_topics

        with tf.name_scope("io"):
            # all dialog context and known attributes
            self.input_contexts = tf.placeholder(dtype=tf.int32,
                                                 shape=(None, None,
                                                        self.max_utt_len),
                                                 name="dialog_context")
            self.floors = tf.placeholder(dtype=tf.float32,
                                         shape=(None, None),
                                         name="floor")  # TODO float
            self.floor_labels = tf.placeholder(dtype=tf.float32,
                                               shape=(None, 1),
                                               name="floor_labels")
            self.context_lens = tf.placeholder(dtype=tf.int32,
                                               shape=(None, ),
                                               name="context_lens")
            self.paragraph_topics = tf.placeholder(dtype=tf.float32,
                                                   shape=(None,
                                                          self.num_topics),
                                                   name="paragraph_topics")

            # target response given the dialog context
            self.output_tokens = tf.placeholder(dtype=tf.int32,
                                                shape=(None, None),
                                                name="output_token")
            self.output_lens = tf.placeholder(dtype=tf.int32,
                                              shape=(None, ),
                                              name="output_lens")
            self.output_das = tf.placeholder(dtype=tf.float32,
                                             shape=(None, self.num_topics),
                                             name="output_dialog_acts")

            # optimization related variables
            self.learning_rate = tf.Variable(float(config.init_lr),
                                             trainable=False,
                                             name="learning_rate")
            self.learning_rate_decay_op = self.learning_rate.assign(
                tf.multiply(self.learning_rate, config.lr_decay))
            self.global_t = tf.placeholder(dtype=tf.int32, name="global_t")

        max_dialog_len = array_ops.shape(self.input_contexts)[1]
        max_out_len = array_ops.shape(self.output_tokens)[1]
        batch_size = array_ops.shape(self.input_contexts)[0]

        with variable_scope.variable_scope("wordEmbedding"):
            self.embedding = tf.get_variable(
                "embedding", [self.vocab_size, config.embed_size],
                dtype=tf.float32)
            embedding_mask = tf.constant(
                [0 if i == 0 else 1 for i in range(self.vocab_size)],
                dtype=tf.float32,
                shape=[self.vocab_size, 1])
            embedding = self.embedding * embedding_mask

            # embed the input
            input_embedding = embedding_ops.embedding_lookup(
                embedding, tf.reshape(self.input_contexts, [-1]))
            input_embedding = tf.reshape(
                input_embedding, [-1, self.max_utt_len, config.embed_size])

            # encode input using RNN w/GRU
            sent_cell = self.get_rnncell("gru", self.sent_cell_size,
                                         config.keep_prob, 1)
            input_embedding, sent_size = get_rnn_encode(input_embedding,
                                                        sent_cell,
                                                        scope="sent_rnn")

            # reshape input
            input_embedding = tf.reshape(input_embedding,
                                         [-1, max_dialog_len, sent_size])
            if config.keep_prob < 1.0:
                input_embedding = tf.nn.dropout(input_embedding,
                                                config.keep_prob)

            # floor = probability that the next sentence is the last
            # TODO do we want this?
            floor = tf.reshape(self.floors, [-1, max_dialog_len, 1])

            joint_embedding = tf.concat([input_embedding, floor], 2,
                                        "joint_embedding")

        with variable_scope.variable_scope("contextRNN"):
            enc_cell = self.get_rnncell(config.cell_type,
                                        self.context_cell_size,
                                        keep_prob=1.0,
                                        num_layer=config.num_layer)
            # dynamic_rnn respects sequence_length, so enc_last_state is the true last state
            _, enc_last_state = tf.nn.dynamic_rnn(
                enc_cell,
                joint_embedding,
                dtype=tf.float32,
                sequence_length=self.context_lens)

            if config.num_layer > 1:
                if config.cell_type == 'lstm':
                    enc_last_state = [temp.h for temp in enc_last_state]

                enc_last_state = tf.concat(enc_last_state, 1)
            else:
                if config.cell_type == 'lstm':
                    enc_last_state = enc_last_state.h

        # Final output from the encoder
        encoded_list = [self.paragraph_topics, enc_last_state]
        encoded_embedding = tf.concat(encoded_list, 1)

        with variable_scope.variable_scope("generationNetwork"):

            # predict whether the next sentence is the last one
            # TODO do we want this?
            self.paragraph_end_logits = layers.fully_connected(
                encoded_embedding,
                1,
                activation_fn=tf.tanh,
                scope="paragraph_end_fc1")

            # Decoder
            if config.num_layer > 1:
                dec_init_state = []
                for i in range(config.num_layer):
                    temp_init = layers.fully_connected(encoded_embedding,
                                                       self.dec_cell_size,
                                                       activation_fn=None,
                                                       scope="init_state-%d" %
                                                       i)
                    if config.cell_type == 'lstm':
                        # LSTM cells expect an LSTMStateTuple (c, h) as the initial state
                        temp_init = rnn_cell.LSTMStateTuple(
                            temp_init, temp_init)

                    dec_init_state.append(temp_init)

                dec_init_state = tuple(dec_init_state)
            else:
                dec_init_state = layers.fully_connected(encoded_embedding,
                                                        self.dec_cell_size,
                                                        activation_fn=None,
                                                        scope="init_state")
                if config.cell_type == 'lstm':
                    dec_init_state = rnn_cell.LSTMStateTuple(
                        dec_init_state, dec_init_state)

        with variable_scope.variable_scope("decoder"):
            dec_cell = self.get_rnncell(config.cell_type, self.dec_cell_size,
                                        config.keep_prob, config.num_layer)
            # project onto vocab-size logits. TODO: no softmax here?
            dec_cell = OutputProjectionWrapper(dec_cell, self.vocab_size)

            if forward:
                loop_func = decoder_fn_lib.context_decoder_fn_inference(
                    None,
                    dec_init_state,
                    embedding,
                    start_of_sequence_id=self.go_id,
                    end_of_sequence_id=self.eos_id,
                    maximum_length=self.max_utt_len,
                    num_decoder_symbols=self.vocab_size,
                    context_vector=None)
                dec_input_embedding = None
                dec_seq_lens = None
            else:
                loop_func = decoder_fn_lib.context_decoder_fn_train(
                    dec_init_state, None)
                dec_input_embedding = embedding_ops.embedding_lookup(
                    embedding, self.output_tokens)
                dec_input_embedding = dec_input_embedding[:, 0:-1, :]
                dec_seq_lens = self.output_lens - 1

                if config.keep_prob < 1.0:
                    dec_input_embedding = tf.nn.dropout(
                        dec_input_embedding, config.keep_prob)

                # apply word dropping. Set dropped word to 0
                if config.dec_keep_prob < 1.0:
                    # get mask of words to keep vs. drop
                    keep_mask = tf.less_equal(
                        tf.random_uniform((batch_size, max_out_len - 1),
                                          minval=0.0,
                                          maxval=1.0), config.dec_keep_prob)
                    keep_mask = tf.expand_dims(tf.to_float(keep_mask), 2)
                    dec_input_embedding = dec_input_embedding * keep_mask
                    dec_input_embedding = tf.reshape(
                        dec_input_embedding,
                        [-1, max_out_len - 1, config.embed_size])

            dec_outs, _, final_context_state = dynamic_rnn_decoder(
                dec_cell,
                loop_func,
                inputs=dec_input_embedding,
                sequence_length=dec_seq_lens,
                name='output_node')

            if final_context_state is not None:
                final_context_state = final_context_state[
                    :, 0:array_ops.shape(dec_outs)[1]]
                mask = tf.to_int32(tf.sign(tf.reduce_max(dec_outs, axis=2)))
                self.dec_out_words = tf.multiply(
                    tf.reverse(final_context_state, axis=[1]), mask)
            else:
                self.dec_out_words = tf.argmax(dec_outs, 2)

        if not forward:
            with variable_scope.variable_scope("loss"):

                labels = self.output_tokens[:, 1:]  # correct word tokens
                label_mask = tf.to_float(tf.sign(labels))

                print("dec outs shape", dec_outs.get_shape())
                print("labels shape", labels.get_shape())

                # Loss between words
                rc_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=dec_outs, labels=labels)
                rc_loss = tf.reduce_sum(rc_loss * label_mask,
                                        reduction_indices=1)
                self.avg_rc_loss = tf.reduce_mean(rc_loss)
                # used only for perplexity calculation. Not used for optimization
                self.rc_ppl = tf.exp(
                    tf.reduce_sum(rc_loss) / tf.reduce_sum(label_mask))

                # Predict 0/1 (1 = last sentence in paragraph)
                end_loss = tf.nn.softmax_cross_entropy_with_logits(
                    labels=self.floor_labels, logits=self.paragraph_end_logits)
                self.avg_end_loss = tf.reduce_mean(end_loss)
                print "size of end loss", self.avg_end_loss.get_shape()

                total_loss = self.avg_rc_loss + self.avg_end_loss

                tf.summary.scalar("rc_loss", self.avg_rc_loss)
                tf.summary.scalar("paragraph_end_loss", self.avg_end_loss)

                self.summary_op = tf.summary.merge_all()

            self.optimize(sess, config, total_loss, log_dir)

        self.saver = tf.train.Saver(tf.global_variables(),
                                    write_version=tf.train.SaverDef.V2)
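# get_rnn_encode, used in the sentence encoder above, is not defined in this
# snippet. A minimal sketch under the assumption that it runs the cell over each
# utterance and returns the final state together with its width:
import tensorflow as tf

def get_rnn_encode(embedding, cell, scope=None):
    # Encode the sequence and keep only the final RNN state.
    with tf.variable_scope(scope or "rnn_encode"):
        _, encoded_input = tf.nn.dynamic_rnn(cell, embedding, dtype=tf.float32)
        return encoded_input, cell.state_size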
Example #24
    def initialize(self,
                   inputs,
                   input_lengths,
                   num_speakers,
                   speaker_id,
                   mel_targets=None,
                   linear_targets=None,
                   loss_coeff=None,
                   rnn_decoder_test_mode=False,
                   is_randomly_initialized=False):

        is_training = linear_targets is not None
        self.is_randomly_initialized = is_randomly_initialized

        with tf.variable_scope('inference') as scope:
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            self.num_speakers = num_speakers
            if self.num_speakers > 1:
                if hp.speaker_embedding_size != 1:
                    speaker_embed_table = tf.get_variable(
                        'speaker_embedding',
                        [self.num_speakers, hp.speaker_embedding_size],
                        dtype=tf.float32,
                        initializer=tf.truncated_normal_initializer(
                            stddev=0.5))
                    # [N, T_in, speaker_embedding_size]
                    speaker_embed = tf.nn.embedding_lookup(
                        speaker_embed_table, speaker_id)

                if hp.model_type == 'deepvoice':
                    if hp.speaker_embedding_size == 1:
                        before_highway = get_embed(speaker_id,
                                                   self.num_speakers,
                                                   hp.enc_prenet_sizes[-1],
                                                   "before_highway")
                        encoder_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers, hp.enc_rnn_size * 2,
                            "encoder_rnn_init_state")

                        attention_rnn_init_state = get_embed(
                            speaker_id, self.num_speakers,
                            hp.attention_state_size,
                            "attention_rnn_init_state")
                        decoder_rnn_init_states = [
                            get_embed(speaker_id, self.num_speakers,
                                      hp.dec_rnn_size,
                                      "decoder_rnn_init_states{}".format(idx + 1))
                            for idx in range(hp.dec_layer_num)
                        ]
                    else:
                        deep_dense = lambda x, dim: \
                                tf.layers.dense(x, dim, activation=tf.nn.softsign)

                        before_highway = deep_dense(speaker_embed,
                                                    hp.enc_prenet_sizes[-1])
                        encoder_rnn_init_state = deep_dense(
                            speaker_embed, hp.enc_rnn_size * 2)

                        attention_rnn_init_state = deep_dense(
                            speaker_embed, hp.attention_state_size)
                        decoder_rnn_init_states = [
                            deep_dense(speaker_embed, hp.dec_rnn_size)
                            for _ in range(hp.dec_layer_num)
                        ]

                    speaker_embed = None  # deepvoice does not use speaker_embed directly

                elif hp.model_type == 'simple':
                    before_highway = None
                    encoder_rnn_init_state = None
                    attention_rnn_init_state = None
                    decoder_rnn_init_states = None

                else:
                    raise Exception(
                        " [!] Unknown multi-speaker model type: {}".format(
                            hp.model_type))

            else:
                speaker_embed = None
                before_highway = None
                encoder_rnn_init_state = None
                attention_rnn_init_state = None
                decoder_rnn_init_states = None

            # Encoder
            prenet_outputs = prenet(
                embedded_inputs,
                is_training,
                hp.enc_prenet_sizes,
                hp.dropout_prob,
                scope='prenet')  # [N, T_in, prenet_depths[-1]=128]
            encoder_outputs = cbhg(
                prenet_outputs,
                input_lengths,
                is_training,  # [N, T_in, encoder_depth=256]
                hp.enc_bank_size,
                hp.enc_bank_channel_size,
                hp.enc_maxpool_width,
                hp.enc_highway_depth,
                hp.enc_rnn_size,
                hp.enc_proj_sizes,
                hp.enc_proj_width,
                scope="encoder_cbhg",
                before_highway=before_highway,
                encoder_rnn_init_state=encoder_rnn_init_state)

            # Attention
            # For manual control of attention
            self.is_manual_attention = tf.placeholder(
                tf.bool,
                shape=(),
                name='is_manual_attention',
            )
            self.manual_alignments = tf.placeholder(
                tf.float32,
                shape=[None, None, None],
                name="manual_alignments",
            )

            dec_prenet_outputs = DecoderPrenetWrapper(
                GRUCell(hp.attention_state_size), speaker_embed, is_training,
                hp.dec_prenet_sizes, hp.dropout_prob)

            if hp.attention_type == 'bah_mon':
                attention_mechanism = BahdanauMonotonicAttention(
                    hp.attention_size, encoder_outputs)
            elif hp.attention_type == 'bah_norm':
                attention_mechanism = BahdanauAttention(hp.attention_size,
                                                        encoder_outputs,
                                                        normalize=True)
            elif hp.attention_type == 'luong_scaled':
                attention_mechanism = LuongAttention(hp.attention_size,
                                                     encoder_outputs,
                                                     scale=True)
            elif hp.attention_type == 'luong':
                attention_mechanism = LuongAttention(hp.attention_size,
                                                     encoder_outputs)
            elif hp.attention_type == 'bah':
                attention_mechanism = BahdanauAttention(
                    hp.attention_size, encoder_outputs)
            elif hp.attention_type.startswith('ntm2'):
                shift_width = int(hp.attention_type.split('-')[-1])
                attention_mechanism = NTMAttention2(hp.attention_size,
                                                    encoder_outputs,
                                                    shift_width=shift_width)
            else:
                raise Exception(" [!] Unkown attention type: {}".format(
                    hp.attention_type))

            attention_cell = AttentionWrapper(
                dec_prenet_outputs,
                attention_mechanism,
                self.is_manual_attention,
                self.manual_alignments,
                initial_cell_state=attention_rnn_init_state,
                alignment_history=True,
                output_attention=False)

            # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
            # [N, T_in, attention_size+attention_state_size]
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell, embed_to_concat=speaker_embed)

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell(
                [
                    OutputProjectionWrapper(concat_cell, hp.dec_rnn_size),
                    ResidualWrapper(GRUCell(hp.dec_rnn_size)),
                    ResidualWrapper(GRUCell(hp.dec_rnn_size)),
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.reduction_factor)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            if hp.model_type == "deepvoice":
                # decoder_init_state[0] : AttentionWrapperState
                # = cell_state + attention + time + alignments + alignment_history
                # decoder_init_state[0][0] = attention_rnn_init_state (already applied)
                decoder_init_state = list(decoder_init_state)

                for idx, cell in enumerate(decoder_rnn_init_states):
                    shape1 = decoder_init_state[idx + 1].get_shape().as_list()
                    shape2 = cell.get_shape().as_list()
                    if shape1 != shape2:
                        raise Exception(" [!] Shape {} and {} should be equal". \
                                format(shape1, shape2))
                    decoder_init_state[idx + 1] = cell

                decoder_init_state = tuple(decoder_init_state)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.reduction_factor,
                                            rnn_decoder_test_mode)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.reduction_factor)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            # [N, T_out, postnet_depth=256]
            post_outputs = cbhg(mel_outputs,
                                None,
                                is_training,
                                hp.post_bank_size,
                                hp.post_bank_channel_size,
                                hp.post_maxpool_width,
                                hp.post_highway_depth,
                                hp.post_rnn_size,
                                hp.post_proj_sizes,
                                hp.post_proj_width,
                                scope='post_cbhg')

            if speaker_embed is not None and hp.model_type == 'simple':
                expanded_speaker_emb = tf.expand_dims(speaker_embed, [1])
                tiled_speaker_embedding = tf.tile(
                    expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1])

                # [N, T_out, 256 + alpha]
                post_outputs = tf.concat(
                    [tiled_speaker_embedding, post_outputs], axis=-1)

            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.speaker_id = speaker_id
            self.input_lengths = input_lengths
            self.loss_coeff = loss_coeff
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            self.final_decoder_state = final_decoder_state

            log('=' * 40)
            log(' model_type: %s' % hp.model_type)
            log('=' * 40)

            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            if speaker_embed is not None:
                log('    speaker embedding:        %d' %
                    speaker_embed.shape[-1])
            else:
                log('    speaker embedding:        None')
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            log('  attention out:           %d' % attention_cell.output_size)
            log('  concat attn & out:       %d' % concat_cell.output_size)
            log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
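# The deepvoice branch above derives the encoder/attention/decoder initial
# states from the speaker id via get_embed, which is not shown here. A plausible
# sketch, assuming one trainable embedding table per name:
import tensorflow as tf

def get_embed(inputs, num_inputs, embed_size, name):
    # Rows of the table are indexed by the speaker ids in `inputs`.
    embed_table = tf.get_variable(
        name, [num_inputs, embed_size], dtype=tf.float32,
        initializer=tf.truncated_normal_initializer(stddev=0.1))
    return tf.nn.embedding_lookup(embed_table, inputs)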
Example #25
def embedding_tied_rnn_seq2seq(encoder_inputs, decoder_inputs, cell,
                               num_symbols, output_projection=None,
                               feed_previous=False, dtype=dtypes.float32,
                               scope=None):
  """Embedding RNN sequence-to-sequence model with tied (shared) parameters.

  This model first embeds encoder_inputs by a newly created embedding (of shape
  [num_symbols x cell.input_size]). Then it runs an RNN to encode embedded
  encoder_inputs into a state vector. Next, it embeds decoder_inputs using
  the same embedding. Then it runs RNN decoder, initialized with the last
  encoder state, on embedded decoder_inputs.

  Args:
    encoder_inputs: a list of 1D int32 Tensors of shape [batch_size].
    decoder_inputs: a list of 1D int32 Tensors of shape [batch_size].
    cell: RNNCell defining the cell function and size.
    num_symbols: integer; number of symbols for both encoder and decoder.
    output_projection: None or a pair (W, B) of output projection weights and
      biases; W has shape [cell.output_size x num_symbols] and B has
      shape [num_symbols]; if provided and feed_previous=True, each
      fed previous output will first be multiplied by W and added B.
    feed_previous: Boolean or scalar Boolean Tensor; if True, only the first
      of decoder_inputs will be used (the "GO" symbol), and all other decoder
      inputs will be taken from previous outputs (as in embedding_rnn_decoder).
      If False, decoder_inputs are used as given (the standard decoder case).
    dtype: The dtype to use for the initial RNN states (default: tf.float32).
    scope: VariableScope for the created subgraph; defaults to
      "embedding_tied_rnn_seq2seq".

  Returns:
    outputs: A list of the same length as decoder_inputs of 2D Tensors with
      shape [batch_size x num_decoder_symbols] containing the generated outputs.
    states: The state of each decoder cell in each time-step. This is a list
      with length len(decoder_inputs) -- one item for each time-step.
      Each item is a 2D Tensor of shape [batch_size x cell.state_size].

  Raises:
    ValueError: when output_projection has the wrong shape.
  """
  if output_projection is not None:
    proj_weights = ops.convert_to_tensor(output_projection[0], dtype=dtype)
    proj_weights.get_shape().assert_is_compatible_with([cell.output_size,
                                                        num_symbols])
    proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype)
    proj_biases.get_shape().assert_is_compatible_with([num_symbols])

  with vs.variable_scope(scope or "embedding_tied_rnn_seq2seq"):
    with ops.device("/cpu:0"):
      embedding = vs.get_variable("embedding", [num_symbols, cell.input_size])

    emb_encoder_inputs = [embedding_ops.embedding_lookup(embedding, x)
                          for x in encoder_inputs]
    emb_decoder_inputs = [embedding_ops.embedding_lookup(embedding, x)
                          for x in decoder_inputs]

    def extract_argmax_and_embed(prev, _):
      """Loop_function that extracts the symbol from prev and embeds it."""
      if output_projection is not None:
        prev = nn_ops.xw_plus_b(
            prev, output_projection[0], output_projection[1])
      prev_symbol = array_ops.stop_gradient(math_ops.argmax(prev, 1))
      return embedding_ops.embedding_lookup(embedding, prev_symbol)

    if output_projection is None:
      cell = OutputProjectionWrapper(cell, num_symbols)

    if isinstance(feed_previous, bool):
      loop_function = extract_argmax_and_embed if feed_previous else None
      return tied_rnn_seq2seq(emb_encoder_inputs, emb_decoder_inputs, cell,
                              loop_function=loop_function, dtype=dtype)
    else:  # If feed_previous is a Tensor, we construct 2 graphs and use cond.
      outputs1, states1 = tied_rnn_seq2seq(
          emb_encoder_inputs, emb_decoder_inputs, cell,
          loop_function=extract_argmax_and_embed, dtype=dtype)
      vs.get_variable_scope().reuse_variables()
      outputs2, states2 = tied_rnn_seq2seq(
          emb_encoder_inputs, emb_decoder_inputs, cell, dtype=dtype)

      outputs = control_flow_ops.cond(feed_previous,
                                      lambda: outputs1, lambda: outputs2)
      states = control_flow_ops.cond(feed_previous,
                                     lambda: states1, lambda: states2)
      return outputs, states
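# A hypothetical usage sketch for embedding_tied_rnn_seq2seq; the sizes are made
# up, and it assumes the pre-1.0 TensorFlow API in which RNN cells expose
# `input_size` (the embedding shape above depends on it).
import tensorflow as tf

batch_size, num_symbols = 32, 1000
cell = tf.nn.rnn_cell.GRUCell(128)  # assumed to expose cell.input_size under the old API
encoder_inputs = [tf.placeholder(tf.int32, [batch_size]) for _ in range(10)]
decoder_inputs = [tf.placeholder(tf.int32, [batch_size]) for _ in range(12)]
outputs, states = embedding_tied_rnn_seq2seq(
    encoder_inputs, decoder_inputs, cell, num_symbols, feed_previous=False)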
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None,
                   pml_targets=None,
                   is_training=False,
                   gta=False,
                   locked_alignments=None,
                   logs_enabled=True):
        """
        Initializes the model for inference.

        Sets "mel_outputs", "linear_outputs", and "alignments" fields.

        Args:
          inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
            steps in the input time series, and values are character IDs
          input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
            of each sequence in inputs.
          mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
            of steps in the output time series, M is num_mels, and values are entries in the mel
            spectrogram. Only needed for training.
          linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
            of steps in the output time series, F is num_freq, and values are entries in the linear
            spectrogram. Only needed for training.
          pml_targets: float32 Tensor with shape [N, T_out, P] where N is batch_size, T_out is number of
            steps in the PML vocoder features trajectories, P is pml_dimension, and values are PML vocoder
            features. Only needed for training.
          is_training: boolean flag that is set to True during training
          gta: boolean flag that is set to True when ground truth alignment is required
          locked_alignments: when explicit attention alignment is required, the locked alignments are passed in this
            parameter and the attention alignments are locked to these values
          logs_enabled: boolean flag that defaults to True, if False no construction logs output
        """
        # fix the alignments shape to (batch_size, encoder_steps, decoder_steps) if not already including
        # batch dimension
        locked_alignments_ = locked_alignments

        if locked_alignments_ is not None:
            if np.ndim(locked_alignments_) < 3:
                locked_alignments_ = np.expand_dims(locked_alignments_, 0)

        with tf.variable_scope('inference') as scope:
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embedding_dim],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            # Encoder
            encoder_outputs = conv_and_gru(  # [N, T_in, 2*encoder_gru_units=512]
                embedded_inputs,
                input_lengths,
                conv_layers=hp.encoder_conv_layers,
                conv_width=hp.encoder_conv_width,
                conv_channels=hp.encoder_conv_channels,
                gru_units_unidirectional=hp.encoder_gru_units,
                is_training=is_training,
                scope='encoder',
            )

            # Attention
            attention_cell = LockableAttentionWrapper(
                GRUCell(hp.attention_depth),
                LocationSensitiveAttention(hp.attention_depth,
                                           encoder_outputs),
                alignment_history=True,
                locked_alignments=locked_alignments_,
                output_attention=False,
                name='attention_wrapper')  # [N, T_in, attention_depth=256]

            # Concatenate attention context vector and RNN cell output into a
            # 2*attention_depth=512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 2*attention_depth=512]

            # Decoder (layers specified bottom to top):
            cells = [
                GRUCell(hp.decoder_gru_units)
                for _ in range(hp.decoder_gru_layers)
            ]
            decoder_cell = MultiRNNCell(
                [concat_cell] + cells,
                state_is_tuple=True)  # [N, T_in, decoder_depth=1024]

            # Project onto r PML feature vectors (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.pml_dimension * hp.outputs_per_step)

            if is_training or gta:
                if hp.scheduled_sampling:
                    helper = TacoScheduledOutputTrainingHelper(
                        inputs, pml_targets, hp.pml_dimension,
                        hp.outputs_per_step, hp.scheduled_sampling_probability)
                else:
                    helper = TacoTrainingHelper(inputs, pml_targets,
                                                hp.pml_dimension,
                                                hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.pml_dimension,
                                        hp.outputs_per_step)

            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, P*r]

            # Reshape outputs to be one output per entry
            pml_intermediates = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.pml_dimension])  # [N, T_out, P]

            # Add Post-Processing Conv and GRU layer:
            expand_outputs = conv_and_gru(  # [N, T_out, 2*expand_gru_units=512]
                pml_intermediates,
                None,
                conv_layers=hp.expand_conv_layers,
                conv_width=hp.expand_conv_width,
                conv_channels=hp.expand_conv_channels,
                gru_units_unidirectional=hp.expand_gru_units,
                is_training=is_training,
                scope='expand',
            )

            pml_outputs = tf.layers.dense(expand_outputs,
                                          hp.pml_dimension)  # [N, T_out, P]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.pml_intermediates = pml_intermediates
            self.pml_outputs = pml_outputs
            self.alignments = alignments
            self.pml_targets = pml_targets

            if logs_enabled:
                log('Initialized Tacotron model. Dimensions: ')
                log('  Train mode:              {}'.format(is_training))
                log('  GTA mode:                {}'.format(gta))
                log('  Embedding:               {}'.format(
                    embedded_inputs.shape[-1]))
                log('  Encoder out:             {}'.format(
                    encoder_outputs.shape[-1]))
                log('  Attention out:           {}'.format(
                    attention_cell.output_size))
                log('  Concat attn & out:       {}'.format(
                    concat_cell.output_size))
                log('  Decoder cell out:        {}'.format(
                    decoder_cell.output_size))
                log('  Decoder out ({} frames):  {}'.format(
                    hp.outputs_per_step, decoder_outputs.shape[-1]))
                log('  Decoder out (1 frame):   {}'.format(
                    pml_intermediates.shape[-1]))
                log('  Expand out:              {}'.format(
                    expand_outputs.shape[-1]))
                log('  PML out:                 {}'.format(
                    pml_outputs.shape[-1]))
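# conv_and_gru, used above for both the encoder and the expand post-net, is not
# defined in this snippet. One plausible sketch, assuming a stack of 1-D
# convolutions followed by a bidirectional GRU (which would account for the
# 2 * gru_units output width noted in the shape comments):
import tensorflow as tf
from tensorflow.contrib.rnn import GRUCell

def conv_and_gru(inputs, input_lengths, conv_layers, conv_width, conv_channels,
                 gru_units_unidirectional, is_training, scope):
    with tf.variable_scope(scope):
        x = inputs
        for i in range(conv_layers):
            x = tf.layers.conv1d(x, filters=conv_channels, kernel_size=conv_width,
                                 padding='same', activation=tf.nn.relu,
                                 name='conv1d_%d' % i)
            x = tf.layers.batch_normalization(x, training=is_training,
                                              name='bn_%d' % i)
        outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            GRUCell(gru_units_unidirectional), GRUCell(gru_units_unidirectional),
            x, sequence_length=input_lengths, dtype=tf.float32)
        return tf.concat(outputs, axis=2)  # [N, T, 2 * gru_units_unidirectional]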
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None):
        '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings
            embedding_table = tf.get_variable(
                'embedding', [len(symbols), hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            # Encoder
            prenet_outputs = prenet(
                embedded_inputs, is_training,
                hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
            encoder_outputs = encoder_cbhg(
                prenet_outputs, input_lengths, is_training,
                hp.encoder_depth)  # [N, T_in, encoder_depth=256]

            # Attention
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_depth,
                encoder_outputs)  # [N, T_in, attention_depth=256]

            # Decoder (layers specified bottom to top):
            multi_rnn_cell = MultiRNNCell(
                [
                    ResidualWrapper(GRUCell(hp.decoder_depth)),
                    ResidualWrapper(GRUCell(hp.decoder_depth))
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            decoder_cell = TacotronDecoderWrapper(is_training,
                                                  attention_mechanism,
                                                  multi_rnn_cell)

            if is_training:
                helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels,
                                            hp.outputs_per_step)
            else:
                helper = TacoTestHelper(batch_size, hp.num_mels,
                                        hp.outputs_per_step)

            decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                         dtype=tf.float32)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(
                     OutputProjectionWrapper(decoder_cell, hp.num_mels *
                                             hp.outputs_per_step), helper,
                     decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(
                mel_outputs, hp.num_mels, is_training,
                hp.postnet_depth)  # [N, T_out, postnet_depth=256]
            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:               %d' % embedded_inputs.shape[-1])
            log('  prenet out:              %d' % prenet_outputs.shape[-1])
            log('  encoder out:             %d' % encoder_outputs.shape[-1])
            # log('  attention out:           %d' % attention_cell.output_size)
            # log('  concat attn & out:       %d' % concat_cell.output_size)
            # log('  decoder cell out:        %d' % decoder_cell.output_size)
            log('  decoder out (%d frames):  %d' %
                (hp.outputs_per_step, decoder_outputs.shape[-1]))
            log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
            log('  postnet out:             %d' % post_outputs.shape[-1])
            log('  linear out:              %d' % linear_outputs.shape[-1])
Example #28
def decoder(inputs, encoder_outputs, is_training, batch_size, mel_targets):
    """ Decoder
  
  Prenet -> Attention RNN
  Postprocessing CBHG

  @param    encoder_outputs     outputs from the encoder with shape [N, T_in, prenet_depth=256]
  @param    inputs              int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
                                steps in the input time series, and values are character IDs
  @param    is_training         flag for training or eval
  @param    batch_size          number of samples per batch
  @param    mel_targets         float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
                                of steps in the output time series, M is num_mels, and values are entries in the
                                mel spectrogram

  @return                       linear_outputs, mel_outputs and alignments
  """

    if is_training:
        helper = TacoTrainingHelper(inputs, mel_targets, hparams.num_mels,
                                    hparams.outputs_per_step)
    else:
        helper = TacoTestHelper(batch_size, hparams.num_mels,
                                hparams.outputs_per_step)

    # Attention
    attention_cell = AttentionWrapper(
        GRUCell(hparams.attention_depth),
        BahdanauAttention(hparams.attention_depth, encoder_outputs),
        alignment_history=True,
        output_attention=False)  # [N, T_in, attention_depth=256]

    # Apply prenet before concatenation in AttentionWrapper.
    attention_cell = DecoderPrenetWrapper(attention_cell, is_training,
                                          hparams.prenet_depths)

    # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
    concat_cell = ConcatOutputAndAttentionWrapper(
        attention_cell)  # [N, T_in, 2*attention_depth=512]

    # Decoder (layers specified bottom to top):
    decoder_cell = MultiRNNCell(
        [
            OutputProjectionWrapper(concat_cell, hparams.decoder_depth),
            ResidualWrapper(GRUCell(hparams.decoder_depth)),
            ResidualWrapper(GRUCell(hparams.decoder_depth))
        ],
        state_is_tuple=True)  # [N, T_in, decoder_depth=256]

    # Project onto r mel spectrograms (predict r outputs at each RNN step):
    output_cell = OutputProjectionWrapper(
        decoder_cell, hparams.num_mels * hparams.outputs_per_step)

    decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                dtype=tf.float32)

    (decoder_outputs,
     _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
         BasicDecoder(output_cell, helper, decoder_init_state),
         maximum_iterations=hparams.max_iters)  # [N, T_out/r, M*r]

    # Reshape outputs to be one output per entry
    mel_outputs = tf.reshape(
        decoder_outputs, [batch_size, -1, hparams.num_mels])  # [N, T_out, M]

    # Add post-processing CBHG:
    post_outputs = post_cbhg(
        mel_outputs,
        hparams.num_mels,
        is_training,  # [N, T_out, postnet_depth=256]
        hparams.postnet_depth)
    linear_outputs = tf.layers.dense(post_outputs,
                                     hparams.num_freq)  # [N, T_out, F]

    # Grab alignments from the final decoder state:
    alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(),
                              [1, 2, 0])

    log('Decoder Network ...')
    log('  attention out:             %d' % attention_cell.output_size)
    log('  concat attn & out:         %d' % concat_cell.output_size)
    log('  decoder cell out:          %d' % decoder_cell.output_size)
    log('  decoder out (%d frames):   %d' %
        (hparams.outputs_per_step, decoder_outputs.shape[-1]))
    log('  decoder out (1 frame):     %d' % mel_outputs.shape[-1])
    log('  postnet out:               %d' % post_outputs.shape[-1])
    log('  linear out:                %d' % linear_outputs.shape[-1])

    return linear_outputs, mel_outputs, alignments
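# A hypothetical call sketch for the decoder above; the placeholder shapes and
# the 256-wide encoder output are assumptions, and `hparams` is whatever the
# surrounding module already imports.
inputs = tf.placeholder(tf.int32, [None, None], name='inputs')              # [N, T_in]
encoder_outputs = tf.placeholder(tf.float32, [None, None, 256],
                                 name='encoder_outputs')                    # [N, T_in, 256]
mel_targets = tf.placeholder(tf.float32, [None, None, hparams.num_mels])    # [N, T_out, M]
linear_outputs, mel_outputs, alignments = decoder(
    inputs, encoder_outputs, is_training=True,
    batch_size=tf.shape(inputs)[0], mel_targets=mel_targets)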
Example #29
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   linear_targets=None):
        '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch_size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
        with tf.variable_scope('inference') as scope:
            is_training = linear_targets is not None
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings

            symbols_length = 149  # BASED ON PREVIOUS LENGTH OF LIST

            embedding_table = tf.get_variable(
                'embedding', [symbols_length, hp.embed_depth],
                dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=0.5))
            embedded_inputs = tf.nn.embedding_lookup(
                embedding_table, inputs)  # [N, T_in, embed_depth=256]

            # Encoder
            prenet_outputs = prenet(
                embedded_inputs, is_training,
                hp.prenet_depths)  # [N, T_in, prenet_depths[-1]=128]
            encoder_outputs = encoder_cbhg(
                prenet_outputs,
                input_lengths,
                is_training,  # [N, T_in, encoder_depth=256]
                hp.encoder_depth)

            # Attention
            attention_cell = AttentionWrapper(
                GRUCell(hp.attention_depth),
                BahdanauAttention(hp.attention_depth, encoder_outputs),
                alignment_history=True,
                output_attention=False)  # [N, T_in, attention_depth=256]

            # Apply prenet before concatenation in AttentionWrapper.
            attention_cell = DecoderPrenetWrapper(attention_cell, is_training,
                                                  hp.prenet_depths)

            # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
            concat_cell = ConcatOutputAndAttentionWrapper(
                attention_cell)  # [N, T_in, 2*attention_depth=512]

            # Decoder (layers specified bottom to top):
            decoder_cell = MultiRNNCell(
                [
                    OutputProjectionWrapper(concat_cell, hp.decoder_depth),
                    ResidualWrapper(GRUCell(hp.decoder_depth)),
                    ResidualWrapper(GRUCell(hp.decoder_depth))
                ],
                state_is_tuple=True)  # [N, T_in, decoder_depth=256]

            # Project onto r mel spectrograms (predict r outputs at each RNN step):
            output_cell = OutputProjectionWrapper(
                decoder_cell, hp.num_mels * hp.outputs_per_step)
            decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                        dtype=tf.float32)

            helper = TacoTestHelper(batch_size, hp.num_mels,
                                    hp.outputs_per_step)

            (decoder_outputs,
             _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
                 BasicDecoder(output_cell, helper, decoder_init_state),
                 maximum_iterations=hp.max_iters)  # [N, T_out/r, M*r]

            # Reshape outputs to be one output per entry
            mel_outputs = tf.reshape(
                decoder_outputs,
                [batch_size, -1, hp.num_mels])  # [N, T_out, M]

            # Add post-processing CBHG:
            post_outputs = post_cbhg(
                mel_outputs,
                hp.num_mels,
                is_training,  # [N, T_out, postnet_depth=256]
                hp.postnet_depth)
            linear_outputs = tf.layers.dense(post_outputs,
                                             hp.num_freq)  # [N, T_out, F]

            # Grab alignments from the final decoder state:
            alignments = tf.transpose(
                final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.mel_outputs = mel_outputs
            self.linear_outputs = linear_outputs
            self.alignments = alignments
            self.mel_targets = mel_targets
            self.linear_targets = linear_targets
    def __init__(self,
                 sess,
                 config,
                 api,
                 log_dir,
                 forward,
                 scope=None):  # forward=True builds the inference-only graph (no loss/optimizer)
        self.vocab = api.vocab
        self.rev_vocab = api.rev_vocab
        self.vocab_size = len(self.vocab)
        self.seen_intent = api.seen_intent
        self.rev_seen_intent = api.rev_seen_intent
        self.seen_intent_size = len(self.rev_seen_intent)
        self.unseen_intent = api.unseen_intent
        self.rev_unseen_intent = api.rev_unseen_intent
        self.unseen_intent_size = len(self.rev_unseen_intent)
        self.sess = sess
        self.scope = scope
        self.max_utt_len = config.max_utt_len
        self.go_id = self.rev_vocab["<s>"]
        self.eos_id = self.rev_vocab["</s>"]
        self.sent_cell_size = config.sent_cell_size
        self.dec_cell_size = config.dec_cell_size
        self.label_embed_size = config.label_embed_size
        self.latent_size = config.latent_size

        self.seed = config.seed
        self.use_ot_label = config.use_ot_label
        self.use_rand_ot_label = config.use_rand_ot_label  # only meaningful when use_ot_label is True: use a random subset of the other labels instead of all of them
        self.use_rand_fixed_ot_label = config.use_rand_fixed_ot_label  # only meaningful when use_ot_label and use_rand_ot_label are both True
        if self.use_ot_label:
            self.rand_ot_label_num = config.rand_ot_label_num  # number of random other labels (use_ot_label and use_rand_ot_label both True)
        else:
            self.rand_ot_label_num = self.seen_intent_size - 1

        with tf.name_scope("io"):
            # intent label of each utterance
            self.labels = tf.placeholder(
                dtype=tf.int32, shape=(None, ),
                name="labels")  # each utterance has one label; shape [batch_size]
            self.ot_label_rand = tf.placeholder(dtype=tf.int32,
                                                shape=(None, None),
                                                name="ot_labels_rand")
            self.ot_labels_all = tf.placeholder(
                dtype=tf.int32, shape=(None, None),
                name="ot_labels_all")  # (batch_size, seen_intent_size - 1)

            # target utterance tokens and their lengths
            self.io_tokens = tf.placeholder(dtype=tf.int32,
                                            shape=(None, None),
                                            name="output_tokens")
            self.io_lens = tf.placeholder(dtype=tf.int32,
                                          shape=(None, ),
                                          name="output_lens")
            self.output_labels = tf.placeholder(dtype=tf.int32,
                                                shape=(None, ),
                                                name="output_labels")

            # optimization related variables
            self.learning_rate = tf.Variable(float(config.init_lr),
                                             trainable=False,
                                             name="learning_rate")
            self.learning_rate_decay_op = self.learning_rate.assign(
                tf.multiply(self.learning_rate, config.lr_decay))
            self.global_t = tf.placeholder(dtype=tf.int32, name="global_t")
            self.use_prior = tf.placeholder(
                dtype=tf.bool, name="use_prior")  # whether to sample z from the prior
            self.prior_mulogvar = tf.placeholder(
                dtype=tf.float32,
                shape=(None, config.latent_size * 2),
                name="prior_mulogvar")

            self.batch_size = tf.placeholder(dtype=tf.int32, name="batch_size")

        max_out_len = array_ops.shape(self.io_tokens)[1]
        # batch_size = array_ops.shape(self.io_tokens)[0]
        batch_size = self.batch_size

        with variable_scope.variable_scope("labelEmbedding",
                                           reuse=tf.AUTO_REUSE):
            self.la_embedding = tf.get_variable(
                "embedding", [self.seen_intent_size, config.label_embed_size],
                dtype=tf.float32)
            label_embedding = embedding_ops.embedding_lookup(
                self.la_embedding, self.output_labels)  # not used downstream

        with variable_scope.variable_scope("wordEmbedding",
                                           reuse=tf.AUTO_REUSE):
            self.embedding = tf.get_variable(
                "embedding", [self.vocab_size, config.embed_size],
                dtype=tf.float32,
                trainable=False)
            embedding_mask = tf.constant(
                [0 if i == 0 else 1 for i in range(self.vocab_size)],
                dtype=tf.float32,
                shape=[self.vocab_size, 1])
            embedding = self.embedding * embedding_mask  # broadcast; row 0 (PAD) becomes a zero vector
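            # Minimal illustration of the mask above: embedding_mask is a (vocab_size, 1) column
            # of [0, 1, 1, ...], so the multiplication broadcasts across the embed_size columns
            # and forces the PAD row (token id 0) to an all-zero embedding.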

            io_embedding = embedding_ops.embedding_lookup(
                embedding, self.io_tokens)  # 3 dim

            if config.sent_type == "bow":
                io_embedding, _ = get_bow(io_embedding)

            elif config.sent_type == "rnn":
                sent_cell = self.get_rnncell("gru", self.sent_cell_size,
                                             config.keep_prob, 1)
                io_embedding, _ = get_rnn_encode(io_embedding,
                                                 sent_cell,
                                                 self.io_lens,
                                                 scope="sent_rnn",
                                                 reuse=tf.AUTO_REUSE)
            elif config.sent_type == "bi_rnn":
                fwd_sent_cell = self.get_rnncell("gru",
                                                 self.sent_cell_size,
                                                 keep_prob=1.0,
                                                 num_layer=1)
                bwd_sent_cell = self.get_rnncell("gru",
                                                 self.sent_cell_size,
                                                 keep_prob=1.0,
                                                 num_layer=1)
                io_embedding, _ = get_bi_rnn_encode(
                    io_embedding,
                    fwd_sent_cell,
                    bwd_sent_cell,
                    self.io_lens,
                    scope="sent_bi_rnn",
                    reuse=tf.AUTO_REUSE
                )  # utterance encoding x; shape (batch_size, 2 * sent_cell_size)
            else:
                raise ValueError(
                    "Unknown sent_type. Must be one of [bow, rnn, bi_rnn]")

            # convert labels into one-hot vectors
            my_label_one_hot = tf.one_hot(tf.reshape(self.labels, [-1]),
                                          depth=self.seen_intent_size,
                                          dtype=tf.float32)  # 2 dim
            if config.use_ot_label:
                if config.use_rand_ot_label:
                    ot_label_one_hot = tf.one_hot(tf.reshape(
                        self.ot_label_rand, [-1]),
                                                  depth=self.seen_intent_size,
                                                  dtype=tf.float32)
                    ot_label_one_hot = tf.reshape(
                        ot_label_one_hot,
                        [-1, self.seen_intent_size * self.rand_ot_label_num])
                else:
                    ot_label_one_hot = tf.one_hot(tf.reshape(
                        self.ot_labels_all, [-1]),
                                                  depth=self.seen_intent_size,
                                                  dtype=tf.float32)
                    ot_label_one_hot = tf.reshape(
                        ot_label_one_hot,
                        [-1, self.seen_intent_size * (self.seen_intent_size - 1)]
                    )  # (batch_size, seen_intent_size * (seen_intent_size - 1))

        with variable_scope.variable_scope("recognitionNetwork",
                                           reuse=tf.AUTO_REUSE):
            recog_input = io_embedding
            self.recog_mulogvar = recog_mulogvar = layers.fully_connected(
                recog_input,
                config.latent_size * 2,
                activation_fn=None,
                scope="muvar")  # config.latent_size=200
            recog_mu, recog_logvar = tf.split(
                recog_mulogvar, 2, axis=1
            )  # recognition network output. (batch_size, config.latent_size)
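            # Recognition (posterior) network q(z | x): a single fully connected layer maps the
            # utterance encoding to 2 * latent_size values, which are split into the mean and
            # log-variance of a diagonal Gaussian.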

        with variable_scope.variable_scope("priorNetwork",
                                           reuse=tf.AUTO_REUSE):
            # p(x, y, z) = p(z) p(x|z) p(y|x, z)
            # prior network parameters; the prior is assumed to be a diagonal Gaussian and is
            # fed in through the prior_mulogvar placeholder (building it as a tf.constant would
            # require a static batch_size).
            prior_mulogvar = self.prior_mulogvar
            prior_mu, prior_logvar = tf.split(prior_mulogvar, 2, axis=1)

            # sample z either from the prior or from the recognition (posterior) network
            latent_sample = tf.cond(
                self.use_prior,
                lambda: sample_gaussian(prior_mu, prior_logvar),
                lambda: sample_gaussian(recog_mu, recog_logvar)
            )  # (batch_size, config.latent_size)
            self.z = latent_sample
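            # A minimal sketch (assumption) of what the external `sample_gaussian` helper does,
            # i.e. the standard reparameterization trick:
            #     eps = tf.random_normal(tf.shape(mu))
            #     z = mu + tf.exp(0.5 * logvar) * eps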

        with variable_scope.variable_scope("generationNetwork",
                                           reuse=tf.AUTO_REUSE):
            bow_loss_inputs = latent_sample  # (part of) response network input
            label_inputs = latent_sample
            dec_inputs = latent_sample

            # BOW loss
            if config.use_bow_loss:
                bow_fc1 = layers.fully_connected(
                    bow_loss_inputs,
                    400,
                    activation_fn=tf.tanh,
                    scope="bow_fc1")  # MLPb network fc layer
                if config.keep_prob < 1.0:
                    bow_fc1 = tf.nn.dropout(bow_fc1, config.keep_prob)
                self.bow_logits = layers.fully_connected(
                    bow_fc1,
                    self.vocab_size,
                    activation_fn=None,
                    scope="bow_project")  # MLPb network fc output

            # Label (y) prediction heads, including the other-label heads.
            my_label_fc1 = layers.fully_connected(label_inputs,
                                                  400,
                                                  activation_fn=tf.tanh,
                                                  scope="my_label_fc1")
            if config.keep_prob < 1.0:
                my_label_fc1 = tf.nn.dropout(my_label_fc1, config.keep_prob)

            # my_label_fc2 = layers.fully_connected(my_label_fc1, 400, activation_fn=tf.tanh, scope="my_label_fc2")
            # if config.keep_prob < 1.0:
            #     my_label_fc2 = tf.nn.dropout(my_label_fc2, config.keep_prob)

            self.my_label_logits = layers.fully_connected(
                my_label_fc1, self.seen_intent_size,
                scope="my_label_project")  # MLPy fc output
            my_label_prob = tf.nn.softmax(
                self.my_label_logits
            )  # softmax output; (batch_size, seen_intent_size)
            self.my_label_prob = my_label_prob
            pred_my_label_embedding = tf.matmul(
                my_label_prob, self.la_embedding
            )  # predicted my label y. (batch_size, label_embed_size)

            if config.use_ot_label:
                if config.use_rand_ot_label:  # use randomly sampled other labels
                    ot_label_fc1 = layers.fully_connected(
                        label_inputs,
                        400,
                        activation_fn=tf.tanh,
                        scope="ot_label_fc1")
                    if config.keep_prob < 1.0:
                        ot_label_fc1 = tf.nn.dropout(ot_label_fc1,
                                                     config.keep_prob)
                    self.ot_label_logits = layers.fully_connected(
                        ot_label_fc1,
                        self.rand_ot_label_num * self.seen_intent_size,
                        scope="ot_label_rand_project")
                    ot_label_logits_split = tf.reshape(
                        self.ot_label_logits,
                        [-1, self.rand_ot_label_num, self.seen_intent_size])
                    ot_label_prob_short = tf.nn.softmax(ot_label_logits_split)
                    ot_label_prob = tf.reshape(
                        ot_label_prob_short,
                        [-1, self.rand_ot_label_num * self.seen_intent_size]
                    )  # (batch_size, self.rand_ot_label_num*self.label_vocab_size)
                    pred_ot_label_embedding = tf.reshape(
                        tf.matmul(ot_label_prob_short, self.la_embedding),
                        [-1, self.label_embed_size * self.rand_ot_label_num]
                    )  # predicted other-label embeddings; keep the batch dimension
                else:
                    ot_label_fc1 = layers.fully_connected(
                        label_inputs,
                        400,
                        activation_fn=tf.tanh,
                        scope="ot_label_fc1")
                    if config.keep_prob < 1.0:
                        ot_label_fc1 = tf.nn.dropout(ot_label_fc1,
                                                     config.keep_prob)
                    self.ot_label_logits = layers.fully_connected(
                        ot_label_fc1,
                        self.seen_intent_size * (self.seen_intent_size - 1),
                        scope="ot_label_all_project")
                    ot_label_logits_split = tf.reshape(
                        self.ot_label_logits,
                        [-1, self.seen_intent_size - 1, self.seen_intent_size])
                    ot_label_prob_short = tf.nn.softmax(ot_label_logits_split)
                    ot_label_prob = tf.reshape(
                        ot_label_prob_short, [
                            -1, self.seen_intent_size *
                            (self.seen_intent_size - 1)
                        ]
                    )  # (batch_size, self.label_vocab_size*(self.label_vocab_size-1))
                    pred_ot_label_embedding = tf.reshape(
                        tf.matmul(ot_label_prob_short, self.la_embedding),
                        [-1, self.label_embed_size * (self.seen_intent_size - 1)]
                    )  # embeddings of all other labels; (batch_size, label_embed_size * (seen_intent_size - 1))
                    # note: tf.matmul accepts a batched left operand, e.g. (3, 4, 5) x (5, 4) -> (3, 4, 4)
            else:  # only use label y.
                self.ot_label_logits = None
                pred_ot_label_embedding = None

            # Decoder, Response Network
            if config.num_layer > 1:
                dec_init_state = []
                for i in range(config.num_layer):
                    temp_init = layers.fully_connected(dec_inputs,
                                                       self.dec_cell_size,
                                                       activation_fn=None,
                                                       scope="init_state-%d" %
                                                       i)
                    if config.cell_type == 'lstm':
                        temp_init = rnn_cell.LSTMStateTuple(
                            temp_init, temp_init)

                    dec_init_state.append(temp_init)

                dec_init_state = tuple(dec_init_state)
            else:
                dec_init_state = layers.fully_connected(dec_inputs,
                                                        self.dec_cell_size,
                                                        activation_fn=None,
                                                        scope="init_state")
                if config.cell_type == 'lstm':
                    dec_init_state = rnn_cell.LSTMStateTuple(
                        dec_init_state, dec_init_state)
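            # The decoder's initial state is a learned projection of z (one projection per
            # layer); for LSTM cells the same projection is reused for both the cell state c
            # and the hidden state h via LSTMStateTuple(temp_init, temp_init).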

        with variable_scope.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            dec_cell = self.get_rnncell(config.cell_type, self.dec_cell_size,
                                        config.keep_prob, config.num_layer)
            dec_cell = OutputProjectionWrapper(dec_cell, self.vocab_size)

            if forward:  # test
                loop_func = decoder_fn_lib.context_decoder_fn_inference(
                    None,
                    dec_init_state,
                    embedding,
                    start_of_sequence_id=self.go_id,
                    end_of_sequence_id=self.eos_id,
                    maximum_length=self.max_utt_len,
                    num_decoder_symbols=self.vocab_size,
                    context_vector=None)  # a function
                dec_input_embedding = None
                dec_seq_lens = None
            else:  # train
                loop_func = decoder_fn_lib.context_decoder_fn_train(
                    dec_init_state, None)
                dec_input_embedding = embedding_ops.embedding_lookup(
                    embedding, self.io_tokens
                )  # token embeddings of x; (batch_size, utt_len, embed_size)
                dec_input_embedding = dec_input_embedding[:, :-1, :]  # drop the final token (</s>)
                dec_seq_lens = self.io_lens - 1  # lengths excluding the final token

                if config.keep_prob < 1.0:
                    dec_input_embedding = tf.nn.dropout(
                        dec_input_embedding, config.keep_prob)

                # apply word dropout: zero out the embeddings of dropped input words
                if config.dec_keep_prob < 1.0:
                    keep_mask = tf.less_equal(
                        tf.random_uniform((batch_size, max_out_len - 1),
                                          minval=0.0,
                                          maxval=1.0), config.dec_keep_prob)
                    keep_mask = tf.expand_dims(tf.to_float(keep_mask), 2)
                    dec_input_embedding = dec_input_embedding * keep_mask
                    dec_input_embedding = tf.reshape(
                        dec_input_embedding,
                        [-1, max_out_len - 1, config.embed_size])
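                # Word dropout: each input embedding is kept with probability dec_keep_prob and
                # zeroed otherwise, weakening teacher forcing so the decoder has to rely more on
                # the latent variable z.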

                # print("=======", dec_input_embedding) # Tensor("models/decoder/strided_slice:0", shape=(?, ?, 200), dtype=float32)

            dec_outs, _, final_context_state = dynamic_rnn_decoder(
                dec_cell,
                loop_func,
                inputs=dec_input_embedding,
                sequence_length=dec_seq_lens
            )  # dec_outs [batch_size, seq, features]

            if final_context_state is not None:
                final_context_state = final_context_state[:, 0:array_ops.shape(dec_outs)[1]]
                # mask out time steps where every logit is zero (i.e. beyond the decoded length)
                mask = tf.to_int32(tf.sign(tf.reduce_max(dec_outs, axis=2)))
                self.dec_out_words = tf.multiply(
                    tf.reverse(final_context_state, axis=[1]), mask)
            else:
                self.dec_out_words = tf.argmax(
                    dec_outs, 2)  # (batch_size, utt_len); each element is a word index

        if not forward:
            with variable_scope.variable_scope("loss", reuse=tf.AUTO_REUSE):
                labels = self.io_tokens[:, 1:]  # drop the leading <s>; (batch_size, utt_len - 1)
                label_mask = tf.to_float(tf.sign(labels))

                labels = tf.one_hot(labels,
                                    depth=self.vocab_size,
                                    dtype=tf.float32)

                # dec_outs: (batch_size, max_out_len - 1, vocab_size); labels: one-hot with the same shape
                rc_loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=dec_outs, labels=labels)  # response network (reconstruction) loss
                rc_loss = tf.reduce_sum(
                    rc_loss * label_mask,
                    reduction_indices=1)  # (batch_size,); padding positions are masked out
                self.avg_rc_loss = tf.reduce_mean(rc_loss)  # scalar
                # used only for perplexity calculation, not for optimization
                self.rc_ppl = tf.exp(
                    tf.reduce_sum(rc_loss) / tf.reduce_sum(label_mask))
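                # Perplexity here is exp(total masked cross entropy / number of non-pad tokens),
                # with label_mask = sign(token_id) marking the non-padding positions.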
                """ as n-trial multimodal distribution. """
                tile_bow_logits = tf.tile(
                    tf.expand_dims(self.bow_logits, 1),
                    [1, max_out_len - 1, 1
                     ])  # (batch_size, max_out_len-1, vocab_size)
                bow_loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=tile_bow_logits, labels=labels
                ) * label_mask  # labels shape less than logits shape, (batch_size, max_out_len-1)
                bow_loss = tf.reduce_sum(bow_loss,
                                         reduction_indices=1)  # (batch_size, )
                self.avg_bow_loss = tf.reduce_mean(bow_loss)  # scalar
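                # The same BOW logits are tiled across the max_out_len - 1 target positions, so
                # every non-pad target word contributes one cross-entropy term against the
                # shared bag-of-words prediction; the per-utterance terms are then summed.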

                # loss for the utterance's own label y
                # (feed the raw logits, not the softmax output, to softmax_cross_entropy_with_logits)
                my_label_loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.my_label_logits,
                    labels=my_label_one_hot)  # (batch_size,)
                self.avg_my_label_loss = tf.reduce_mean(my_label_loss)
                if config.use_ot_label:
                    # negated cross entropy: push probability mass away from the other labels
                    ot_label_loss = -tf.nn.softmax_cross_entropy_with_logits(
                        logits=self.ot_label_logits, labels=ot_label_one_hot)
                    self.avg_ot_label_loss = tf.reduce_mean(ot_label_loss)
                else:
                    self.avg_ot_label_loss = 0.0

                kld = gaussian_kld(
                    recog_mu, recog_logvar, prior_mu,
                    prior_logvar)  # kl divergence, (batch_size,)
                self.avg_kld = tf.reduce_mean(kld)  # scalar
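                # Assumed closed form implemented by the external `gaussian_kld` helper, for
                # diagonal Gaussians q = N(recog_mu, exp(recog_logvar)) and p = N(prior_mu, exp(prior_logvar)):
                #     KL(q || p) = 0.5 * sum(prior_logvar - recog_logvar
                #                            + (exp(recog_logvar) + (recog_mu - prior_mu)^2) / exp(prior_logvar)
                #                            - 1)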
                if log_dir is not None:
                    kl_weights = tf.minimum(
                        tf.to_float(self.global_t) / config.full_kl_step, 1.0)
                else:
                    kl_weights = tf.constant(1.0)
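                # KL annealing: when a log_dir is given, the KL weight ramps linearly from 0 to 1
                # over config.full_kl_step steps (kl_weights = min(global_t / full_kl_step, 1)),
                # a common trick to avoid posterior collapse early in training; otherwise it is
                # fixed at 1.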

                self.kl_w = kl_weights
                self.elbo = self.avg_rc_loss + kl_weights * self.avg_kld  # reconstruction loss plus annealed KL term
                # ----- total (augmented) loss -----
                # Both branches of the original if/else used identical weighting; an optional
                # rescaling of the other-label term (1 / rand_ot_label_num or
                # 1 / (seen_intent_size - 1)) was left disabled.
                aug_elbo = (self.avg_bow_loss + 1000 * self.avg_my_label_loss +
                            10 * self.avg_ot_label_loss + self.elbo)

                tf.summary.scalar("rc_loss", self.avg_rc_loss)
                tf.summary.scalar("elbo", self.elbo)
                tf.summary.scalar("kld", self.avg_kld)
                tf.summary.scalar("bow_loss", self.avg_bow_loss)
                tf.summary.scalar("my_label_loss", self.avg_my_label_loss)
                tf.summary.scalar("ot_label_loss", self.avg_ot_label_loss)

                self.summary_op = tf.summary.merge_all()

                self.log_p_z = norm_log_liklihood(latent_sample, prior_mu,
                                                  prior_logvar)  # log-likelihood of z under the prior
                self.log_q_z_xy = norm_log_liklihood(
                    latent_sample, recog_mu, recog_logvar)  # log-likelihood of z under the posterior
                self.est_marginal = tf.reduce_mean(rc_loss + bow_loss -
                                                   self.log_p_z +
                                                   self.log_q_z_xy)
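                # est_marginal is a single-sample, importance-style estimate used for evaluation:
                # it combines the reconstruction and BOW losses with the weight
                # log q(z | x, y) - log p(z) for the sampled z.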

            self.optimize(sess, config, aug_elbo, log_dir)

        self.saver = tf.train.Saver(tf.global_variables(),
                                    write_version=tf.train.SaverDef.V2)
        print('Model construction finished.')