Example 1
  def _decode(self, z, helper, max_length=None, x_input=None):
    """Decodes the given batch of latent vectors vectors, which may be 0-length.

    Args:
      z: Batch of latent vectors, sized `[batch_size, z_size]`, where `z_size`
        may be 0 for unconditioned decoding.
      helper: A seq2seq.Helper to use. If a TrainingHelper is passed and a
        CudnnLSTM has previously been defined, it will be used instead.
      max_length: (Optional) The maximum iterations to decode.
      x_input: (Optional) The inputs to the decoder for teacher forcing.
        Required if CudnnLSTM is to be used.

    Returns:
      final_output: The final seq2seq.BasicDecoderOutput.
    """
    initial_state = initial_cell_state_from_embedding(
        self._dec_cell, z, name='decoder/z_to_initial_state')

    # CudnnLSTM does not support sampling so it can only replace TrainingHelper.
    if self._cudnn_dec_lstm and type(helper) == seq2seq.TrainingHelper:  # pylint:disable=unidiomatic-typecheck
      rnn_output, _ = self._cudnn_dec_lstm(
          tf.transpose(x_input, [1, 0, 2]),
          initial_state=_cudnn_lstm_state(initial_state),
          training=self._is_training)
      with tf.variable_scope('decoder'):
        rnn_output = self._output_layer(rnn_output)
      final_output = seq2seq.BasicDecoderOutput(
          rnn_output=tf.transpose(rnn_output, [1, 0, 2]), sample_id=None)
    else:
      if self._cudnn_dec_lstm:
        tf.logging.warning(
            'CudnnLSTM does not support sampling. Using `dynamic_decode` '
            'instead.')
      decoder = seq2seq.BasicDecoder(
          self._dec_cell,
          helper,
          initial_state=initial_state,
          output_layer=self._output_layer)
      final_output, _, _ = seq2seq.dynamic_decode(
          decoder,
          maximum_iterations=max_length,
          swap_memory=True,
          scope='decoder')
    return final_output
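
A minimal usage sketch for `_decode`, assuming a surrounding `model` object and pre-built tensors; `x_embedded`, `seq_lengths`, `z`, `embedding_table`, `start_tokens`, `eos_id`, and `max_len` are illustrative names, not taken from the original code:

    import tensorflow as tf
    from tensorflow.contrib import seq2seq

    # Teacher forcing: with a TrainingHelper (and x_input supplied), _decode
    # can route through the faster CudnnLSTM path when one has been defined.
    train_helper = seq2seq.TrainingHelper(inputs=x_embedded,
                                          sequence_length=seq_lengths)
    train_out = model._decode(z, helper=train_helper, x_input=x_embedded)

    # Free-running sampling: CudnnLSTM cannot sample, so _decode falls back
    # to seq2seq.dynamic_decode with the given helper.
    sample_helper = seq2seq.SampleEmbeddingHelper(embedding=embedding_table,
                                                  start_tokens=start_tokens,
                                                  end_token=eos_id)
    sample_out = model._decode(z, helper=sample_helper, max_length=max_len)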
Example 2
    def build_graph(self):
        print('Building the TensorFlow graph...')
        opts = self.options

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.enc_input = tf.placeholder(
                tf.int32,
                shape=[opts.max_hist_len, opts.batch_size, opts.max_uttr_len])
            self.enc_input_e = tf.placeholder(
                tf.float32,
                shape=[opts.batch_size, opts.max_hist_len, opts.n_emot])
            self.dec_input = tf.placeholder(
                tf.int32, shape=[opts.batch_size, opts.max_uttr_len + 1])
            self.target = tf.placeholder(
                tf.int32, shape=[opts.batch_size, opts.max_uttr_len + 1])

            self.enc_input_len = tf.placeholder(
                tf.int32, shape=[opts.max_hist_len, opts.batch_size])
            self.dec_input_len = tf.placeholder(tf.int32,
                                                shape=[opts.batch_size])
            self.hist_len = tf.placeholder(tf.int32, shape=[opts.batch_size])

            with tf.variable_scope('embedding', reuse=tf.AUTO_REUSE):
                # word_embeddings = tf.Variable(tf.random_uniform([opts.vocab_size, opts.word_embed_size], -1.0, 1.0),
                #     name = 'word_embeddings')
                word_embeddings = tf.Variable(opts.word_embeddings,
                                              name='word_embeddings')
                enc_input_embed = tf.nn.embedding_lookup(
                    word_embeddings, self.enc_input)
                dec_input_embed = tf.nn.embedding_lookup(
                    word_embeddings, self.dec_input)

            with tf.variable_scope('word_level_encoding', reuse=tf.AUTO_REUSE):
                outputs_enc = []
                cell_fw = tf.nn.rnn_cell.GRUCell(opts.n_hidden_units_enc_s)
                cell_bw = tf.nn.rnn_cell.GRUCell(opts.n_hidden_units_enc_s)
                for i in range(opts.max_hist_len):
                    outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                        cell_fw,
                        cell_bw,
                        inputs=enc_input_embed[i, :, :, :],
                        sequence_length=self.enc_input_len[i, :],
                        dtype=tf.float32)
                    outputs_enc.append(tf.concat(outputs, 2))
                outputs_enc = tf.stack(outputs_enc)

            with tf.variable_scope('emotion_encoding', reuse=tf.AUTO_REUSE):
                emot_input_layer = tf.layers.Dense(
                    opts.emot_input_layer_size,
                    activation=tf.sigmoid,
                    kernel_initializer=tf.truncated_normal_initializer(
                        stddev=0.1),
                    name='emot_input_layer')
                enc_input_e = emot_input_layer(self.enc_input_e)

                cell_emot = tf.nn.rnn_cell.GRUCell(opts.n_hidden_units_enc_e)
                _, final_state = tf.nn.dynamic_rnn(
                    cell_emot,
                    inputs=enc_input_e,
                    sequence_length=self.hist_len,
                    dtype=tf.float32)
                emot_vector = final_state * opts.beta

            if opts.mode == 'PREDICT':
                outputs_enc = tf.transpose(outputs_enc, perm=[1, 0, 2, 3])
                outputs_enc = tile_batch(outputs_enc,
                                         multiplier=opts.beam_width)
                outputs_enc = tf.transpose(outputs_enc, perm=[1, 0, 2, 3])
                tiled_enc_input_len = tile_batch(tf.transpose(
                    self.enc_input_len),
                                                 multiplier=opts.beam_width)
                tiled_enc_input_len = tf.transpose(tiled_enc_input_len)
                tiled_hist_len = tile_batch(self.hist_len,
                                            multiplier=opts.beam_width)
                tiled_emot_vector = tile_batch(emot_vector,
                                               multiplier=opts.beam_width)
            else:
                tiled_enc_input_len = self.enc_input_len
                tiled_hist_len = self.hist_len
                tiled_emot_vector = emot_vector

            with tf.variable_scope('decoding', reuse=tf.AUTO_REUSE) as vs:
                attn_mechanism = UttrLevelAttentionMechanism(
                    word_level_num_units=opts.word_level_attn_depth,
                    uttr_level_num_units=opts.uttr_level_attn_depth,
                    n_hidden_units=opts.n_hidden_units_enc_s,
                    memory=outputs_enc,
                    memory_sequence_length=tiled_enc_input_len,
                    hist_length=tiled_hist_len)
                cell_dec = tf.nn.rnn_cell.GRUCell(opts.n_hidden_units_dec)
                cell_dec = MyAttentionWrapper(cell_dec, attn_mechanism,
                                              tiled_emot_vector)
                output_layer = tf.layers.Dense(
                    units=opts.vocab_size - 1,
                    kernel_initializer=tf.truncated_normal_initializer(
                        stddev=0.1),
                    name='output_layer')

                # Train
                if opts.mode == 'TRAIN':
                    outputs_dec, _ = tf.nn.dynamic_rnn(
                        cell=cell_dec,
                        inputs=dec_input_embed,
                        sequence_length=self.dec_input_len,
                        initial_state=cell_dec.zero_state(
                            opts.batch_size, tf.float32),
                        dtype=tf.float32,
                        scope=vs)
                    logits = output_layer.apply(outputs_dec)
                    weights = tf.sequence_mask(self.dec_input_len,
                                               maxlen=opts.max_uttr_len + 1,
                                               dtype=tf.float32)
                    self.loss = sequence_loss(logits, self.target, weights)
                    self.loss_batch = sequence_loss(logits,
                                                    self.target,
                                                    weights,
                                                    average_across_batch=False)
                    self.optimizer = tf.train.AdamOptimizer(
                        opts.learning_rate).minimize(self.loss)
                    self.init = tf.global_variables_initializer()

                # Predict
                if opts.mode == 'PREDICT':
                    start_tokens = tf.constant(opts.go_index,
                                               dtype=tf.int32,
                                               shape=[opts.batch_size])
                    bs_decoder = BeamSearchDecoder(
                        cell=cell_dec,
                        embedding=word_embeddings,
                        start_tokens=start_tokens,
                        end_token=opts.eos_index,
                        initial_state=cell_dec.zero_state(
                            opts.batch_size * opts.beam_width, tf.float32),
                        beam_width=opts.beam_width,
                        output_layer=output_layer)
                    final_outputs, final_state, _ = dynamic_decode(
                        bs_decoder,
                        impute_finished=False,
                        maximum_iterations=opts.max_uttr_len + 1,
                        scope=vs)
                    self.predicted_ids = final_outputs.predicted_ids
                    self.scores = final_outputs.beam_search_decoder_output.scores
                    self.uttr_level_alignments = final_state[
                        0].alignment_history_ul.stack()
                    self.word_level_alignments = final_state[
                        0].alignment_history_wl.stack()
                    self.final_sequence_lengths = final_state[3]

            self.tvars = tf.trainable_variables()
            self.saver = tf.train.Saver(max_to_keep=100)
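
Assuming `opts.mode == 'TRAIN'` and a built instance (`model` here), one training step against these placeholders might look like the following sketch; all arrays are dummies shaped to match the declarations above:

    import numpy as np
    import tensorflow as tf

    feed = {
        model.enc_input: np.zeros(
            (opts.max_hist_len, opts.batch_size, opts.max_uttr_len), np.int32),
        model.enc_input_e: np.zeros(
            (opts.batch_size, opts.max_hist_len, opts.n_emot), np.float32),
        model.dec_input: np.zeros((opts.batch_size, opts.max_uttr_len + 1), np.int32),
        model.target: np.zeros((opts.batch_size, opts.max_uttr_len + 1), np.int32),
        model.enc_input_len: np.ones((opts.max_hist_len, opts.batch_size), np.int32),
        model.dec_input_len: np.ones((opts.batch_size,), np.int32),
        model.hist_len: np.ones((opts.batch_size,), np.int32),
    }
    with tf.Session(graph=model.graph) as sess:
        sess.run(model.init)
        loss, _ = sess.run([model.loss, model.optimizer], feed_dict=feed)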
Example 3
    def _create_seq2seq(self):

        if self.core == "blstm":
            # Multilayer BLSTM Encoder
            with tf.variable_scope('encoder', reuse=tf.AUTO_REUSE):
                for layer_i in range(self.encoder_layers):
                    cell_fw, cell_bw = self._create_blstmcell(layer_i)
                    (self.encoder_inputs_embedded, self.encoder_final_state) = tf.nn.bidirectional_dynamic_rnn(
                        cell_fw=cell_fw,
                        cell_bw=cell_bw,
                        inputs=self.encoder_inputs_embedded,
                        dtype=tf.float32)
                    self.encoder_inputs_embedded = tf.add_n(self.encoder_inputs_embedded)
                    if self.is_train == 0:
                        self.encoder_inputs_embedded = tf.multiply(self.encoder_inputs_embedded, self.keep_prob)

                self.encoder_final_state_c = tf.concat(
                    (self.encoder_final_state[0].c, self.encoder_final_state[1].c), 1)
                self.encoder_final_state_h = tf.concat(
                    (self.encoder_final_state[0].h, self.encoder_final_state[1].h), 1)
                self.encoder_final_state = contrib.rnn.LSTMStateTuple(
                    c=self.encoder_final_state_c,
                    h=self.encoder_final_state_h)

            # Basic attention-based LSTM Decoder (train and infer)
            with tf.variable_scope('decoder', reuse=tf.AUTO_REUSE):
                self.decoder_cell = tf.nn.rnn_cell.LSTMCell(num_units=self.decoder_hidden_units,
                                                            state_is_tuple=True)

                self.attention_state = self.encoder_inputs_embedded
                self.attention_mechanism = contrib.seq2seq.LuongAttention(num_units=self.decoder_hidden_units,
                                                                          memory=self.attention_state,
                                                                          memory_sequence_length=self.encoder_length)
                self.attn_cell = contrib.seq2seq.AttentionWrapper(cell=self.decoder_cell,
                                                                  attention_mechanism=self.attention_mechanism,
                                                                  name="decoder_attention_cell",
                                                                  alignment_history=False
                                                                  )
                self.fc_layer = tf.layers.Dense(self.vocab_size, name='dense_layer')

                # for train
                with tf.variable_scope('decoder_train', reuse=tf.AUTO_REUSE):
                    self.helper_train = contrib.seq2seq.TrainingHelper(inputs=self.decoder_inputs_embedded,
                                                                       sequence_length=self.decoder_length)
                    self.decoder_initial_state = self.attn_cell.zero_state(self.batch_size, dtype=tf.float32).clone(
                        cell_state=self.encoder_final_state)
                    self.decoder_train = contrib.seq2seq.BasicDecoder(cell=self.attn_cell,
                                                                      initial_state=self.decoder_initial_state,
                                                                      helper=self.helper_train,
                                                                      output_layer=self.fc_layer
                                                                      )
                    self.decoder_train_logits, _, _ = s2s.dynamic_decode(
                        decoder=self.decoder_train)

                # for infer
                with tf.variable_scope('decoder_infer', reuse=tf.AUTO_REUSE):
                    self.start_tokens = tf.tile([19654], [self.batch_size])
                    self.end_tokens = 19655
                    self.helper_infer = contrib.seq2seq.GreedyEmbeddingHelper(embedding=self.embeddings_trainable,
                                                                              start_tokens=self.start_tokens,
                                                                              end_token=self.end_tokens)
                    self.decoder_infer = contrib.seq2seq.BasicDecoder(cell=self.attn_cell,
                                                                      initial_state=self.decoder_initial_state,
                                                                      helper=self.helper_infer,
                                                                      output_layer=self.fc_layer)
                    self.decoder_infer_logits, _, _ = s2s.dynamic_decode(
                        decoder=self.decoder_infer, maximum_iterations=20)

        elif self.core == "bgru":
            # single layer bgru encoder
            with tf.variable_scope('encoder', reuse=tf.AUTO_REUSE):
                inputs = self.encoder_inputs_embedded
                cell_fw, cell_bw = self._create_bgrucell()
                with tf.variable_scope(None, default_name="encoder"):
                    (output, self.encoder_final_state) = tf.nn.bidirectional_dynamic_rnn(
                        cell_fw=cell_fw,
                        cell_bw=cell_bw,
                        inputs=inputs,
                        dtype=tf.float32)

                self.encoder_final_state = tf.concat(self.encoder_final_state, 1)

            # basic gru Decoder for train and infer
            with tf.variable_scope('decoder', reuse=tf.AUTO_REUSE):
                self.decoder_cell = tf.nn.rnn_cell.GRUCell(num_units=self.decoder_hidden_units,
                                                           name='decoder_cell')
                self.attention_state = self.encoder_inputs_embedded
                self.attention_mechanism = contrib.seq2seq.LuongAttention(num_units=self.decoder_hidden_units,
                                                                          memory=self.attention_state,
                                                                          memory_sequence_length=self.encoder_length)
                self.attn_cell = contrib.seq2seq.AttentionWrapper(cell=self.decoder_cell,
                                                                  attention_mechanism=self.attention_mechanism,
                                                                  name="decoder_attention_cell",
                                                                  alignment_history=False
                                                                  )
                self.fc_layer = tf.layers.Dense(self.vocab_size, name='dense_layer')

                with tf.variable_scope('decoder_train', reuse=tf.AUTO_REUSE):
                    # for train
                    self.helper_train = contrib.seq2seq.TrainingHelper(inputs=self.decoder_inputs_embedded,
                                                                       sequence_length=self.decoder_length)
                    self.decoder_initial_state = self.attn_cell.zero_state(self.batch_size, dtype=tf.float32).clone(
                        cell_state=self.encoder_final_state)
                    self.decoder_train = contrib.seq2seq.BasicDecoder(cell=self.attn_cell,
                                                                      initial_state=self.decoder_initial_state,
                                                                      helper=self.helper_train,
                                                                      output_layer=self.fc_layer
                                                                      )
                    self.decoder_train_logits, _, _ = s2s.dynamic_decode(
                        decoder=self.decoder_train)
                with tf.variable_scope('decoder_infer', reuse=tf.AUTO_REUSE):
                    # for infer
                    self.start_tokens = tf.fill([self.batch_size], 19654)
                    self.end_tokens = 19655
                    self.helper_infer = contrib.seq2seq.GreedyEmbeddingHelper(embedding=self.embeddings_trainable,
                                                                              start_tokens=self.start_tokens,
                                                                              end_token=self.end_tokens)
                    self.decoder_infer = contrib.seq2seq.BasicDecoder(cell=self.attn_cell,
                                                                      initial_state=self.decoder_initial_state,
                                                                      helper=self.helper_infer,
                                                                      output_layer=self.fc_layer)
                    self.decoder_infer_logits, _, _ = s2s.dynamic_decode(
                        decoder=self.decoder_infer, maximum_iterations=20)
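
The literals 19654 and 19655 above are evidently the GO and EOS vocabulary ids. A small, hypothetical refactor (constant names are mine) keeps them in one place and shared between the blstm and bgru branches:

    GO_ID = 19654   # assumed start-of-sequence id in this vocabulary
    EOS_ID = 19655  # assumed end-of-sequence id

    self.start_tokens = tf.fill([self.batch_size], GO_ID)
    self.end_tokens = EOS_ID
    self.helper_infer = contrib.seq2seq.GreedyEmbeddingHelper(
        embedding=self.embeddings_trainable,
        start_tokens=self.start_tokens,
        end_token=self.end_tokens)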
Example 4
    def build_predict_decoder(self):
        # start_tokens: [batch_size,]
        start_tokens = tf.ones([
            self.batch_size,
        ], tf.int32) * self.start_token
        end_token = self.end_token

        if not self.use_beamsearch_decode:

            # Helpers that feed each step's prediction back as the next
            # input: 'sample' draws from the output distribution, 'greedy'
            # takes the argmax.
            if self.predict_mode == 'sample':
                print('Building sample decoder...')
                decoding_helper = seq2seq.SampleEmbeddingHelper(
                    start_tokens=start_tokens,
                    end_token=end_token,
                    embedding=lambda inputs: tf.nn.embedding_lookup(
                        self.embedding, inputs))
            elif self.predict_mode == 'greedy':
                print('Building greedy decoder...')
                decoding_helper = seq2seq.GreedyEmbeddingHelper(
                    start_tokens=start_tokens,
                    end_token=end_token,
                    embedding=lambda inputs: tf.nn.embedding_lookup(
                        self.embedding, inputs))
            else:
                raise NotImplementedError(
                    'Predict mode: {} is not yet implemented'.format(
                        self.predict_mode))

            inference_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=decoding_helper,
                initial_state=self.decoder_initial_state,
                output_layer=self.output_layer)
        else:
            raise NotImplementedError(
                'Beamsearch decode is not yet implemented.')

        self.decoder_outputs_decode, self.decoder_last_state_decode, self.decoder_outputs_length_decode = seq2seq.dynamic_decode(
            decoder=inference_decoder,
            output_time_major=False,
            maximum_iterations=self.max_decode_step)

        if not self.use_beamsearch_decode:
            self.decoder_pred_decode = tf.expand_dims(
                self.decoder_outputs_decode.sample_id, -1)
        else:
            raise NotImplementedError('{} mode is not recognized.'.format(
                self.mode))
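
For the beam-search branch that currently raises NotImplementedError, a typical tf.contrib.seq2seq sketch is below. It assumes a `self.beam_width` attribute exists and that `self.decoder_initial_state` can be tiled directly; if the cell is an AttentionWrapper, the attention memory must also be tiled before the wrapper state is built, which this class does not show.

    from tensorflow.contrib import seq2seq

    tiled_initial_state = seq2seq.tile_batch(self.decoder_initial_state,
                                             multiplier=self.beam_width)
    inference_decoder = seq2seq.BeamSearchDecoder(
        cell=self.decoder_cell,
        embedding=lambda ids: tf.nn.embedding_lookup(self.embedding, ids),
        start_tokens=start_tokens,
        end_token=end_token,
        initial_state=tiled_initial_state,
        beam_width=self.beam_width,
        output_layer=self.output_layer)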
Example 5
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   linear_targets=None,
                   targets_lengths=None,
                   gta=False,
                   global_step=None,
                   is_training=False,
                   is_evaluating=False):
        """
		Initializes the model for inference

		sets "mel_outputs" and "alignments" fields.

		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
			  steps in the input time series, and values are character IDs
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
			of each sequence in inputs.
			- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
			of steps in the output time series, M is num_mels, and values are entries in the mel
			spectrogram. Only needed for training.
		"""
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no mel targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')
        if not gta and self._hparams.predict_linear and linear_targets is None and is_training:
            raise ValueError(
                'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
            )
        if gta and linear_targets is not None:
            raise ValueError(
                'Linear spectrogram prediction is not supported in GTA mode!')
        if is_training and self._hparams.mask_decoder and targets_lengths is None:
            raise RuntimeError(
                'Model set to mask paddings but no targets lengths provided for the mask!'
            )
        if is_training and is_evaluating:
            raise RuntimeError(
                'Model can not be in training and evaluation modes at the same time!'
            )

        with tf.variable_scope('inference') as scope:
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams
            assert hp.tacotron_teacher_forcing_mode in ('constant',
                                                        'scheduled')
            if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                assert global_step is not None

            #GTA is only used for predicting mels to train Wavenet vocoder, so we omit post processing when doing GTA synthesis
            post_condition = hp.predict_linear and not gta

            # Embeddings ==> [batch_size, sequence_length, embedding_dim]
            embedding_table = tf.get_variable('inputs_embedding',
                                              [len(symbols), hp.embedding_dim],
                                              dtype=tf.float32)
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

            #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
            encoder_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training,
                                    hparams=hp,
                                    scope='encoder_convolutions'),
                EncoderRNN(is_training,
                           size=hp.encoder_lstm_units,
                           zoneout=hp.tacotron_zoneout_rate,
                           scope='encoder_LSTM'))

            encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

            #For shape visualization purpose
            enc_conv_output_shape = encoder_cell.conv_output_shape

            #Decoder Parts
            #Attention Decoder Prenet
            prenet = Prenet(is_training,
                            layers_sizes=hp.prenet_layers,
                            drop_rate=hp.tacotron_dropout_rate,
                            scope='decoder_prenet')
            #Attention Mechanism
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_dim,
                encoder_outputs,
                hparams=hp,
                mask_encoder=hp.mask_encoder,
                memory_sequence_length=input_lengths,
                smoothing=hp.smoothing,
                cumulate_weights=hp.cumulative_weights)
            #Decoder LSTM Cells
            decoder_lstm = DecoderRNN(is_training,
                                      layers=hp.decoder_layers,
                                      size=hp.decoder_lstm_units,
                                      zoneout=hp.tacotron_zoneout_rate,
                                      scope='decoder_LSTM')
            #Frames Projection layer
            frame_projection = FrameProjection(
                hp.num_mels * hp.outputs_per_step,
                scope='linear_transform_projection')
            #<stop_token> projection layer
            stop_projection = StopProjection(is_training or is_evaluating,
                                             shape=hp.outputs_per_step,
                                             scope='stop_token_projection')

            #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
            decoder_cell = TacotronDecoderCell(prenet, attention_mechanism,
                                               decoder_lstm, frame_projection,
                                               stop_projection)

            #Define the helper for our decoder
            if is_training or is_evaluating or gta:
                self.helper = TacoTrainingHelper(batch_size, mel_targets,
                                                 stop_token_targets, hp, gta,
                                                 is_evaluating, global_step)
            else:
                self.helper = TacoTestHelper(batch_size, hp)

            #initial decoder state
            decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                         dtype=tf.float32)

            #Only use max iterations at synthesis time
            max_iters = hp.max_iters if not (is_training
                                             or is_evaluating) else None

            #Decode
            (frames_prediction, stop_token_prediction,
             _), final_decoder_state, _ = dynamic_decode(
                 CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                 impute_finished=False,
                 maximum_iterations=max_iters,
                 swap_memory=hp.tacotron_swap_with_cpu)

            # Reshape outputs to be one output per entry
            #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
            decoder_output = tf.reshape(frames_prediction,
                                        [batch_size, -1, hp.num_mels])
            stop_token_prediction = tf.reshape(stop_token_prediction,
                                               [batch_size, -1])

            #Postnet
            postnet = Postnet(is_training,
                              hparams=hp,
                              scope='postnet_convolutions')

            #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
            residual = postnet(decoder_output)

            #Project residual to same dimension as mel spectrogram
            #==> [batch_size, decoder_steps * r, num_mels]
            residual_projection = FrameProjection(hp.num_mels,
                                                  scope='postnet_projection')
            projected_residual = residual_projection(residual)

            #Compute the mel spectrogram
            mel_outputs = decoder_output + projected_residual

            if post_condition:
                # Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs.
                post_cbhg = CBHG(hp.cbhg_kernels,
                                 hp.cbhg_conv_channels,
                                 hp.cbhg_pool_size,
                                 [hp.cbhg_projection, hp.num_mels],
                                 hp.cbhg_projection_kernel_size,
                                 hp.cbhg_highwaynet_layers,
                                 hp.cbhg_highway_units,
                                 hp.cbhg_rnn_units,
                                 is_training,
                                 name='CBHG_postnet')

                #[batch_size, decoder_steps(mel_frames), cbhg_channels]
                post_outputs = post_cbhg(mel_outputs, None)

                #Linear projection of extracted features to make linear spectrogram
                linear_specs_projection = FrameProjection(
                    hp.num_freq, scope='cbhg_linear_specs_projection')

                #[batch_size, decoder_steps(linear_frames), num_freq]
                linear_outputs = linear_specs_projection(post_outputs)

            #Grab alignments from the final decoder state
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(), [1, 2, 0])

            self.all_vars = tf.trainable_variables()

            if is_training:
                self.ratio = self.helper._ratio
            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_output = decoder_output
            self.alignments = alignments
            self.stop_token_prediction = stop_token_prediction
            self.stop_token_targets = stop_token_targets
            self.mel_outputs = mel_outputs
            if post_condition:
                self.linear_outputs = linear_outputs
                self.linear_targets = linear_targets
            self.mel_targets = mel_targets
            self.targets_lengths = targets_lengths
            log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
            log('  Train mode:               {}'.format(is_training))
            log('  Eval mode:                {}'.format(is_evaluating))
            log('  GTA mode:                 {}'.format(gta))
            log('  Synthesis mode:           {}'.format(not (
                is_training or is_evaluating)))
            log('  embedding:                {}'.format(embedded_inputs.shape))
            log('  enc conv out:             {}'.format(enc_conv_output_shape))
            log('  encoder out:              {}'.format(encoder_outputs.shape))
            log('  decoder out:              {}'.format(decoder_output.shape))
            log('  residual out:             {}'.format(residual.shape))
            log('  projected residual out:   {}'.format(
                projected_residual.shape))
            log('  mel out:                  {}'.format(mel_outputs.shape))
            if post_condition:
                log('  linear out:               {}'.format(
                    linear_outputs.shape))
            log('  <stop_token> out:         {}'.format(
                stop_token_prediction.shape))
            log('  Tacotron Parameters       {:.3f} Million.'.format(
                np.sum(
                    [np.prod(v.get_shape().as_list())
                     for v in self.all_vars]) / 1_000_000))
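
The reshape after decoding relies on the reduction factor r = outputs_per_step: each decoder step emits r stacked mel frames, so [batch, steps, num_mels * r] flattens to [batch, steps * r, num_mels]. A toy NumPy check of that layout, with made-up sizes:

    import numpy as np

    batch, steps, num_mels, r = 2, 3, 4, 5  # toy sizes, not actual hparams
    frames = np.arange(batch * steps * num_mels * r, dtype=np.float32).reshape(
        batch, steps, num_mels * r)
    unrolled = frames.reshape(batch, -1, num_mels)  # same op as tf.reshape above
    assert unrolled.shape == (batch, steps * r, num_mels)
    # Frame i of decoder step t lands at row t * r + i, preserving time order.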
Example 6
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   linear_targets=None,
                   gta=False):
        """
		Initializes the model for inference

		sets "mel_outputs" and "alignments" fields.

		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
			  steps in the input time series, and values are character IDs
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
			of each sequence in inputs.
			- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
			of steps in the output time series, M is num_mels, and values are entries in the mel
			spectrogram. Only needed for training.
		"""
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no mel targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')
        if not gta and self._hparams.predict_linear and linear_targets is None:
            raise ValueError(
                'Model is set to use post processing to predict linear spectrograms in training but no linear targets given!'
            )
        if gta and linear_targets is not None:
            raise ValueError(
                'Linear spectrogram prediction is not supported in GTA mode!')

        with tf.variable_scope('inference') as scope:
            is_training = mel_targets is not None and not gta
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams
            #GTA is only used for predicting mels to train Wavenet vocoder, so we omit post processing when doing GTA synthesis
            post_condition = hp.predict_linear and not gta

            # Embeddings ==> [batch_size, sequence_length, embedding_dim]
            embedding_table = tf.get_variable('inputs_embedding',
                                              [len(symbols), hp.embedding_dim],
                                              dtype=tf.float32)
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

            #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
            encoder_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training,
                                    kernel_size=hp.enc_conv_kernel_size,
                                    channels=hp.enc_conv_channels,
                                    scope='encoder_convolutions'),
                EncoderRNN(is_training,
                           size=hp.encoder_lstm_units,
                           zoneout=hp.tacotron_zoneout_rate,
                           scope='encoder_LSTM'))

            encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

            #For shape visualization purpose
            enc_conv_output_shape = encoder_cell.conv_output_shape

            #Decoder Parts
            #Attention Decoder Prenet
            prenet = Prenet(is_training,
                            layer_sizes=hp.prenet_layers,
                            scope='decoder_prenet')
            #Attention Mechanism
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_dim,
                encoder_outputs,
                mask_encoder=hp.mask_encoder,
                memory_sequence_length=input_lengths,
                smoothing=hp.smoothing)
            #Decoder LSTM Cells
            decoder_lstm = DecoderRNN(is_training,
                                      layers=hp.decoder_layers,
                                      size=hp.decoder_lstm_units,
                                      zoneout=hp.tacotron_zoneout_rate,
                                      scope='decoder_lstm')
            #Frames Projection layer
            frame_projection = FrameProjection(hp.num_mels *
                                               hp.outputs_per_step,
                                               scope='linear_transform')
            #<stop_token> projection layer
            stop_projection = StopProjection(is_training,
                                             scope='stop_token_projection')

            #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
            decoder_cell = TacotronDecoderCell(prenet,
                                               attention_mechanism,
                                               decoder_lstm,
                                               frame_projection,
                                               stop_projection,
                                               mask_finished=hp.mask_finished)

            #Define the helper for our decoder
            if is_training or gta:
                self.helper = TacoTrainingHelper(
                    batch_size, mel_targets, stop_token_targets, hp.num_mels,
                    hp.outputs_per_step, hp.tacotron_teacher_forcing_ratio,
                    gta)
            else:
                self.helper = TacoTestHelper(batch_size, hp.num_mels,
                                             hp.outputs_per_step)

            #initial decoder state
            decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                         dtype=tf.float32)

            #Only use max iterations at synthesis time
            max_iters = hp.max_iters if not is_training else None

            #Decode
            (frames_prediction, stop_token_prediction,
             _), final_decoder_state, _ = dynamic_decode(
                 CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                 impute_finished=hp.impute_finished,
                 maximum_iterations=max_iters)

            # Reshape outputs to be one output per entry
            #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
            decoder_output = tf.reshape(frames_prediction,
                                        [batch_size, -1, hp.num_mels])
            stop_token_prediction = tf.reshape(stop_token_prediction,
                                               [batch_size, -1])

            #Postnet
            postnet = Postnet(is_training,
                              kernel_size=hp.postnet_kernel_size,
                              channels=hp.postnet_channels,
                              scope='postnet_convolutions')

            #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
            residual = postnet(decoder_output)

            #Project residual to same dimension as mel spectrogram
            #==> [batch_size, decoder_steps * r, num_mels]
            residual_projection = FrameProjection(hp.num_mels,
                                                  scope='postnet_projection')
            projected_residual = residual_projection(residual)

            #Compute the mel spectrogram
            mel_outputs = decoder_output + projected_residual

            if post_condition:
                #Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
                #Post-processing Network to map mels to linear spectrograms using same architecture as the encoder
                post_processing_cell = TacotronEncoderCell(
                    EncoderConvolutions(is_training,
                                        kernel_size=hp.enc_conv_kernel_size,
                                        channels=hp.enc_conv_channels,
                                        scope='post_processing_convolutions'),
                    EncoderRNN(is_training,
                               size=hp.encoder_lstm_units,
                               zoneout=hp.tacotron_zoneout_rate,
                               scope='post_processing_LSTM'))

                expand_outputs = post_processing_cell(mel_outputs)
                linear_outputs = FrameProjection(
                    hp.num_freq,
                    scope='post_processing_projection')(expand_outputs)

            #Grab alignments from the final decoder state
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_output = decoder_output
            self.alignments = alignments
            self.stop_token_prediction = stop_token_prediction
            self.stop_token_targets = stop_token_targets
            self.mel_outputs = mel_outputs
            if post_condition:
                self.linear_outputs = linear_outputs
                self.linear_targets = linear_targets
            self.mel_targets = mel_targets
            log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
            log('  embedding:                {}'.format(embedded_inputs.shape))
            log('  enc conv out:             {}'.format(enc_conv_output_shape))
            log('  encoder out:              {}'.format(encoder_outputs.shape))
            log('  decoder out:              {}'.format(decoder_output.shape))
            log('  residual out:             {}'.format(residual.shape))
            log('  projected residual out:   {}'.format(
                projected_residual.shape))
            log('  mel out:                  {}'.format(mel_outputs.shape))
            if post_condition:
                log('  linear out:               {}'.format(
                    linear_outputs.shape))
            log('  <stop_token> out:         {}'.format(
                stop_token_prediction.shape))
Example 7
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   linear_targets=None,
                   targets_lengths=None,
                   gta=False,
                   global_step=None,
                   is_training=False,
                   is_evaluating=False):

        hp = self._hparams
        batch_size = tf.shape(inputs)[0]
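        # Note: gta is overridden below; this variant never runs in GTA mode
        # regardless of the argument passed in.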
        gta = False

        T2_output_range = (-hp.max_abs_value,
                           hp.max_abs_value) if hp.symmetric_mels else (
                               0, hp.max_abs_value)

        with tf.variable_scope('inference') as scope:
            assert hp.tacotron_teacher_forcing_mode in ('constant',
                                                        'scheduled')
            if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                assert global_step is not None

            # Embeddings ==> [batch_size, sequence_length, embedding_dim]
            self.embedding_table = tf.get_variable(
                'inputs_embedding', [len(symbols), hp.embedding_dim],
                dtype=tf.float32)

            embedded_inputs = tf.nn.embedding_lookup(self.embedding_table,
                                                     inputs)

            #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
            encoder_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training,
                                    hparams=hp,
                                    scope='encoder_convolutions'),
                EncoderRNN(is_training,
                           size=hp.encoder_lstm_units,
                           zoneout=hp.tacotron_zoneout_rate,
                           scope='encoder_LSTM'))

            self.encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

            #For shape visualization purpose
            self.enc_conv_output_shape = encoder_cell.conv_output_shape

            #Decoder Parts
            #Attention Decoder Prenet
            prenet = Prenet(is_training,
                            layers_sizes=hp.prenet_layers,
                            drop_rate=hp.tacotron_dropout_rate,
                            scope='decoder_prenet')
            #Attention Mechanism
            attention_mechanism = GMMAttention(self.encoder_outputs,
                                               input_lengths, is_training)

            #Decoder LSTM Cells
            decoder_lstm = DecoderRNN(is_training,
                                      layers=hp.decoder_layers,
                                      size=hp.decoder_lstm_units,
                                      zoneout=hp.tacotron_zoneout_rate,
                                      scope='decoder_LSTM')
            #Frames Projection layer
            frame_projection = FrameProjection(
                hp.num_mels * hp.outputs_per_step,
                scope='linear_transform_projection')
            #<stop_token> projection layer
            stop_projection = StopProjection(is_training or is_evaluating,
                                             shape=hp.outputs_per_step,
                                             scope='stop_token_projection')

            #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
            decoder_cell = TacotronDecoderCell(prenet,
                                               attention_mechanism,
                                               decoder_lstm,
                                               frame_projection,
                                               stop_projection,
                                               num_attn_mixture=5)

            #Define the helper for our decoder
            if is_training or is_evaluating or gta:
                self.helper = TacoTrainingHelper(batch_size, mel_targets, hp,
                                                 gta, is_evaluating,
                                                 global_step)
            else:
                self.helper = TacoTestHelper(batch_size, hp)

            #initial decoder state
            decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                         dtype=tf.float32)

            #Only use max iterations at synthesis time
            max_iters = hp.max_iters if not (is_training
                                             or is_evaluating) else None

            #Decode
            (frames_prediction, stop_token_prediction,
             _), final_decoder_state, _ = dynamic_decode(
                 CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                 impute_finished=False,
                 maximum_iterations=max_iters,
                 swap_memory=hp.tacotron_swap_with_cpu)

            # Reshape outputs to be one output per entry
            #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
            self.decoder_output = tf.reshape(frames_prediction,
                                             [batch_size, -1, hp.num_mels])
            self.stop_token_prediction = tf.reshape(stop_token_prediction,
                                                    [batch_size, -1])

            if hp.clip_outputs:
                self.decoder_output = tf.minimum(
                    tf.maximum(self.decoder_output,
                               T2_output_range[0] - hp.lower_bound_decay),
                    T2_output_range[1])

            #Postnet
            postnet = Postnet(is_training,
                              hparams=hp,
                              scope='postnet_convolutions')

            #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
            residual = postnet(self.decoder_output)

            #Project residual to same dimension as mel spectrogram
            #==> [batch_size, decoder_steps * r, num_mels]
            residual_projection = FrameProjection(hp.num_mels,
                                                  scope='postnet_projection')
            self.projected_residual = residual_projection(residual)

            #Compute the mel spectrogram
            self.mel_outputs = self.decoder_output + self.projected_residual

            if hp.clip_outputs:
                self.mel_outputs = tf.minimum(
                    tf.maximum(self.mel_outputs,
                               T2_output_range[0] - hp.lower_bound_decay),
                    T2_output_range[1])

            # Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs.
            post_cbhg = CBHG(hp.cbhg_kernels,
                             hp.cbhg_conv_channels,
                             hp.cbhg_pool_size,
                             [hp.cbhg_projection, hp.num_mels],
                             hp.cbhg_projection_kernel_size,
                             hp.cbhg_highwaynet_layers,
                             hp.cbhg_highway_units,
                             hp.cbhg_rnn_units,
                             hp.batch_norm_position,
                             is_training,
                             name='CBHG_postnet')

            #[batch_size, decoder_steps(mel_frames), cbhg_channels]
            self.post_outputs = post_cbhg(self.mel_outputs, None)

            #Linear projection of extracted features to make linear spectrogram
            linear_specs_projection = FrameProjection(
                hp.num_freq, scope='cbhg_linear_specs_projection')

            #[batch_size, decoder_steps(linear_frames), num_freq]
            self.linear_outputs = linear_specs_projection(self.post_outputs)

            if hp.clip_outputs:
                self.linear_outputs = tf.minimum(
                    tf.maximum(self.linear_outputs,
                               T2_output_range[0] - hp.lower_bound_decay),
                    T2_output_range[1])

            #Grab alignments from the final decoder state
            self.alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(), [1, 2, 0])

            log('initialisation done.')

        if is_training:
            self.ratio = self.helper._ratio

        self.inputs = inputs
        self.input_lengths = input_lengths
        self.mel_targets = mel_targets
        self.linear_targets = linear_targets
        self.targets_lengths = targets_lengths
        self.stop_token_targets = stop_token_targets
        self.gta = gta
        self.all_vars = tf.trainable_variables()
        self.is_training = is_training
        self.is_evaluating = is_evaluating

        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log('  Train mode:               {}'.format(is_training))
        log('  Eval mode:                {}'.format(is_evaluating))
        log('  GTA mode:                 {}'.format(gta))
        log('  Synthesis mode:           {}'.format(not (
            is_training or is_evaluating)))
        log('  Input:                    {}'.format(inputs.shape))
        log('  embedding:                {}'.format(embedded_inputs.shape))
        log('  enc conv out:             {}'.format(
            self.enc_conv_output_shape))
        log('  encoder out:              {}'.format(
            self.encoder_outputs.shape))
        log('  decoder out:              {}'.format(self.decoder_output.shape))
        log('  residual out:             {}'.format(residual.shape))
        log('  projected residual out:   {}'.format(
            self.projected_residual.shape))
        log('  mel out:                  {}'.format(self.mel_outputs.shape))
        log('  linear out:               {}'.format(self.linear_outputs.shape))
        log('  <stop_token> out:         {}'.format(
            self.stop_token_prediction.shape))

        #1_000_000 is causing syntax problems for some people?! Python please :)
        log('  Tacotron Parameters       {:.3f} Million.'.format(
            np.sum([np.prod(v.get_shape().as_list())
                    for v in self.all_vars]) / 1000000))
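
The clip_outputs clamp appears three times above; a small helper (the name and placement are mine) expresses the same math once:

    import tensorflow as tf

    def clip_to_range(x, output_range, lower_bound_decay):
        """Clamp x into [low - lower_bound_decay, high], matching the inline
        tf.minimum/tf.maximum pattern used for decoder, mel, and linear outputs."""
        low, high = output_range
        return tf.minimum(tf.maximum(x, low - lower_bound_decay), high)

    # e.g. self.mel_outputs = clip_to_range(self.mel_outputs, T2_output_range,
    #                                       hp.lower_bound_decay)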
Example 8
    def __init__(self, vocab_size, hidden_size, dropout,
                 num_layers, max_gradient_norm, batch_size, learning_rate,
                 lr_decay_factor, max_target_length,
                 max_source_length, decoder_mode=False):
        '''
        vocab_size: number of vocab tokens
        hidden_size: dimension of hidden layers
        dropout: input keep probability for the encoder/decoder cells
        num_layers: number of hidden layers
        max_gradient_norm: maximum gradient magnitude
        batch_size: number of training examples fed to network at once
        learning_rate: starting learning rate of network
        lr_decay_factor: amount by which to decay learning rate
        max_target_length: maximum decoder sequence length
        max_source_length: maximum encoder sequence length
        decoder_mode: if True, build the beam-search inference graph instead
            of the training (backprop) ops
        '''
        GO_ID = config.GO_ID
        EOS_ID = config.EOS_ID
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length
        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.global_step = tf.Variable(0, trainable=False)
        self.learning_rate = learning_rate
        self.encoder_inputs = tf.placeholder(shape=(None, None), dtype=tf.int32, name='encoder_inputs')
        self.source_lengths = tf.placeholder(shape=(None,), dtype=tf.int32, name='source_lengths')

        self.decoder_targets = tf.placeholder(shape=(None, None), dtype=tf.int32, name='decoder_targets')
        self.target_lengths = tf.placeholder(shape=(None,), dtype=tf.int32, name="target_lengths")

        with tf.variable_scope('embeddings') as scope:
            embeddings = tf.Variable(tf.random_uniform([vocab_size, hidden_size], -1.0, 1.0), dtype=tf.float32)
            encoder_inputs_embedded = tf.nn.embedding_lookup(embeddings, self.encoder_inputs)
            targets_embedding = tf.nn.embedding_lookup(embeddings, self.decoder_targets)


        with tf.variable_scope('encoder') as scope:
            # Build a fresh wrapped cell per layer; reusing one cell object
            # in MultiRNNCell makes the layers share variables.
            encoder_cell = rnn.MultiRNNCell([
                rnn.DropoutWrapper(rnn.LSTMCell(hidden_size),
                                   input_keep_prob=dropout)
                for _ in range(num_layers)])

            _, encoder_state = tf.nn.bidirectional_dynamic_rnn(cell_fw=encoder_cell,
                                                               cell_bw=encoder_cell,
                                                               sequence_length=self.source_lengths,
                                                               inputs=encoder_inputs_embedded,
                                                               dtype=tf.float32,
                                                               time_major=False)

        with tf.variable_scope('decoder') as scope:
            decoder_cell = rnn.MultiRNNCell([
                rnn.DropoutWrapper(rnn.LSTMCell(hidden_size),
                                   input_keep_prob=dropout)
                for _ in range(num_layers)])

            #TODO add attention
            #seq2seq.BahdanauAttention(num_units=,memory=encoder_output)

            #decoder_cell = seq2seq.AttentionWrapper(cell=decoder_cell,
            #                                        attention_mechanism=)

        if decoder_mode:
            # BeamSearchDecoder needs the cell and a beam-tiled initial state.
            decoder = seq2seq.BeamSearchDecoder(
                cell=decoder_cell,
                embedding=embeddings,
                start_tokens=tf.tile([GO_ID], [batch_size]),
                end_token=EOS_ID,
                initial_state=seq2seq.tile_batch(encoder_state[0],
                                                 multiplier=2),
                beam_width=2)
        else:
            helper = seq2seq.TrainingHelper(inputs=targets_embedding,
                                            sequence_length=self.target_lengths)

            decoder = seq2seq.BasicDecoder(cell=decoder_cell,
                                           helper=helper,
                                           initial_state=encoder_state[-1],
                                           output_layer=Dense(vocab_size))

        final_outputs, final_state, final_sequence_lengths = \
                            seq2seq.dynamic_decode(decoder=decoder,
                                                   maximum_iterations=self.max_target_length)

        if decoder_mode:
            # BeamSearchDecoder emits predicted_ids rather than logits
            self.predicted_ids = final_outputs.predicted_ids
        else:
            self.logits = final_outputs.rnn_output

        if not decoder_mode:
            with tf.variable_scope("loss") as scope:
                # dynamic_decode stops at the longest decoded sequence, so the
                # logits must be padded out to max_target_length to match the targets
                pad_size = self.max_target_length - tf.reduce_max(final_sequence_lengths)
                self.logits = tf.pad(self.logits, [[0, 0], [0, pad_size], [0, 0]])

                weights = tf.sequence_mask(lengths=final_sequence_lengths,
                                           maxlen=self.max_target_length,
                                           dtype=tf.float32,
                                           name='weights')

                x_entropy_loss = seq2seq.sequence_loss(logits=self.logits,
                                                       targets=self.decoder_targets,
                                                       weights=weights)

                self.loss = tf.reduce_mean(x_entropy_loss)

            optimizer = tf.train.AdamOptimizer(self.learning_rate)
            gradients = optimizer.compute_gradients(self.loss)
            capped_grads = [(tf.clip_by_value(grad, -max_gradient_norm, max_gradient_norm), var)
                            for grad, var in gradients if grad is not None]
            self.train_op = optimizer.apply_gradients(capped_grads,
                                                      global_step=self.global_step)
            self.saver = tf.train.Saver(tf.global_variables())
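            # The docstring advertises lr_decay_factor but this snippet never
            # applies it; a minimal sketch, assuming exponential decay with a
            # hypothetical decay_steps of 10000, would be:
            #
            #   self.learning_rate = tf.train.exponential_decay(
            #       learning_rate, self.global_step,
            #       decay_steps=10000, decay_rate=lr_decay_factor,
            #       staircase=True)
            #
            # and that tensor would replace the constant rate fed to Adam above.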
Example n. 9
    def buildModel(self):
        T_in = self.args.T_in
        T_out = self.args.T_out
        D_in = self.args.D_in
        D_out = self.args.D_out
        E = self.args.embedding_dim
        H = self.args.hidden_dim
        SOS = self.args.SOS
        EOS = self.args.EOS
        PAD = self.args.PAD
        beam_width = 3

        # Input
        with tf.name_scope('input'):
            x = tf.placeholder(shape=(None, T_in),
                               dtype=tf.int32,
                               name='encoder_inputs')
            # N, T_out
            y = tf.placeholder(shape=(None, T_out),
                               dtype=tf.int32,
                               name='decoder_inputs')
            # N
            x_len = tf.placeholder(shape=(None, ), dtype=tf.int32)
            # N
            y_len = tf.placeholder(shape=(None, ), dtype=tf.int32)
            # batch size inferred dynamically from the input
            batch_size = tf.shape(x)[0]

            # symbol mask
            sos = tf.ones(shape=(batch_size, 1), dtype=tf.int32) * SOS
            eos = tf.ones(shape=(batch_size, 1), dtype=tf.int32) * EOS
            pad = tf.ones(shape=(batch_size, 1), dtype=tf.int32) * PAD

            # input mask
            x_mask = tf.sequence_mask(x_len, T_in, dtype=tf.float32)
            y_with_sos_mask = tf.sequence_mask(y_len,
                                               T_out + 1,
                                               dtype=tf.float32)
            y_with_pad = tf.concat([y, pad], axis=1)
            eos_mask = tf.one_hot(y_len, depth=T_out + 1, dtype=tf.int32) * EOS

            # masked inputs
            y_with_eos = y_with_pad + eos_mask
            y_with_sos = tf.concat([sos, y], axis=1)
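            # e.g. with T_out = 3, EOS = 2, PAD = 0 and a row y = [5, 7, 0] with
            # y_len = 2: y_with_pad = [5, 7, 0, 0], eos_mask = [0, 0, 2, 0], so
            # y_with_eos = [5, 7, 2, 0] places EOS right after the real tokens
            # (this addition trick assumes padded positions in y hold PAD = 0)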

        ## Embedding
        with tf.name_scope('embedding'):
            if self.args.use_pretrained:
                embedding_pretrained = np.fromfile(self.args.pretrained_file,
                                                   dtype=np.float32).reshape(
                                                       (-1, E))
                embedding = tf.Variable(embedding_pretrained, trainable=False)
            else:
                embedding = tf.get_variable(name='embedding',
                                            shape=(D_in, E),
                                            dtype=tf.float32,
                                            initializer=xavier_initializer())
            e_x = tf.nn.embedding_lookup(embedding, x)
            e_y = tf.nn.embedding_lookup(embedding, y_with_sos)
            if self.args.mode == 'train':
                e_x = tf.nn.dropout(e_x, self.args.keep_prob)

        ## Encoder
        with tf.name_scope('encoder'):
            ## Multi-BiLSTM
            fw_cell = rnn.MultiRNNCell([
                rnn.BasicLSTMCell(num_units=H)
                for i in range(self.args.layer_size)
            ])
            bw_cell = rnn.MultiRNNCell([
                rnn.BasicLSTMCell(num_units=H)
                for i in range(self.args.layer_size)
            ])
            bi_encoder_output, bi_encoder_state = tf.nn.bidirectional_dynamic_rnn(
                fw_cell,
                bw_cell,
                e_x,
                sequence_length=x_len,
                dtype=tf.float32,
                time_major=False,
                scope=None)
            encoder_output = bi_encoder_output[0] + bi_encoder_output[1]
            encoder_final_state = bi_encoder_state[0]
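            # the two directions are summed (not concatenated) to keep width H,
            # and only the forward final state is used to seed the decoder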

        ## Decoder
        with tf.name_scope('decoder'):
            decoder_cell = rnn.MultiRNNCell([
                rnn.BasicLSTMCell(num_units=H)
                for i in range(self.args.layer_size)
            ])
            decoder_lengths = tf.ones(shape=[batch_size],
                                      dtype=tf.int32) * (T_out + 1)

            ## Training decoder
            with tf.variable_scope('attention'):
                attention_mechanism = LuongAttention(
                    num_units=H,
                    memory=encoder_output,
                    memory_sequence_length=x_len,
                    name='attention_fn')
            projection_layer = Dense(units=D_out,
                                     kernel_initializer=xavier_initializer())

            train_decoder_cell = AttentionWrapper(
                cell=decoder_cell,
                attention_mechanism=attention_mechanism,
                attention_layer_size=H)
            train_decoder_init_state = train_decoder_cell.zero_state(
                batch_size=batch_size,
                dtype=tf.float32).clone(cell_state=encoder_final_state)
            training_helper = TrainingHelper(e_y,
                                             decoder_lengths,
                                             time_major=False)
            train_decoder = BasicDecoder(
                cell=train_decoder_cell,
                helper=training_helper,
                initial_state=train_decoder_init_state,
                output_layer=projection_layer)
            train_decoder_outputs, _, _ = dynamic_decode(
                train_decoder,
                impute_finished=True,
                maximum_iterations=T_out + 1)
            # N, T_out+1, D_out
            train_decoder_outputs = ln(train_decoder_outputs.rnn_output)

            ## Beam_search decoder
            beam_memory = tile_batch(encoder_output, beam_width)
            beam_memory_state = tile_batch(encoder_final_state, beam_width)
            beam_memory_length = tile_batch(x_len, beam_width)
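            # tile_batch repeats each batch entry beam_width times so the
            # attention memory, its lengths and the encoder state all match the
            # beam-expanded batch of size batch_size * beam_width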

            with tf.variable_scope('attention', reuse=True):
                beam_attention_mechanism = LuongAttention(
                    num_units=H,
                    memory=beam_memory,
                    memory_sequence_length=beam_memory_length,
                    name='attention_fn')
            beam_decoder_cell = AttentionWrapper(
                cell=decoder_cell,
                attention_mechanism=beam_attention_mechanism,
                # must match the training wrapper's attention_layer_size so the
                # attention projection variables are shared at inference time
                attention_layer_size=H)
            beam_decoder_init_state = beam_decoder_cell.zero_state(
                batch_size=batch_size * beam_width,
                dtype=tf.float32).clone(cell_state=beam_memory_state)
            start_tokens = tf.ones((batch_size), dtype=tf.int32) * SOS
            beam_decoder = BeamSearchDecoder(
                cell=beam_decoder_cell,
                embedding=embedding,
                start_tokens=start_tokens,
                end_token=EOS,
                initial_state=beam_decoder_init_state,
                beam_width=beam_width,
                output_layer=projection_layer)
            beam_decoder_outputs, _, _ = dynamic_decode(
                beam_decoder,
                scope=tf.get_variable_scope(),
                maximum_iterations=T_out + 1)
            beam_decoder_result_ids = beam_decoder_outputs.predicted_ids

        with tf.name_scope('loss'):
            # `logits` actually holds probabilities (softmax already applied),
            # which is what sparse_categorical_crossentropy expects with the
            # default from_logits=False
            logits = tf.nn.softmax(train_decoder_outputs)
            cross_entropy = tf.keras.losses.sparse_categorical_crossentropy(
                y_with_eos, logits)
            loss_mask = tf.sequence_mask(y_len + 1,
                                         T_out + 1,
                                         dtype=tf.float32)
            loss = tf.reduce_sum(cross_entropy * loss_mask) / tf.cast(
                batch_size, dtype=tf.float32)
            prediction = tf.argmax(logits, 2)

        ## train_op
        with tf.name_scope('train'):
            global_step = tf.train.get_or_create_global_step()
            lr = noam_scheme(self.args.lr, global_step, self.args.warmup_steps)
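            # noam_scheme is assumed to implement the Transformer schedule,
            # roughly lr * warmup_steps**0.5 * min(step * warmup_steps**-1.5,
            # step**-0.5): linear warmup followed by inverse-square-root decay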
            optimizer = tf.train.AdamOptimizer(lr)

            ## gradient clips
            trainable_params = tf.trainable_variables()
            gradients = tf.gradients(loss, trainable_params)
            clip_gradients, _ = tf.clip_by_global_norm(
                gradients, self.args.gradient_clip_num)
            train_op = optimizer.apply_gradients(zip(clip_gradients,
                                                     trainable_params),
                                                 global_step=global_step)

        # Summary
        with tf.name_scope('summary'):
            tf.summary.scalar('lr', lr)
            tf.summary.scalar('loss', loss)
            tf.summary.scalar('global_step', global_step)
            summaries = tf.summary.merge_all()
        return x, y, x_len, y_len, logits, loss, prediction, beam_decoder_result_ids, global_step, train_op, summaries
Example n. 10
    def build_decoder(self):

        print('Building Decoder')

        with tf.variable_scope('decoder'):
            self.decoder_cell, self.decoder_initial_state = self.build_decoder_cell(
            )

            initializer = tf.random_uniform_initializer(-math.sqrt(3),
                                                        math.sqrt(3),
                                                        dtype=tf.float32)
            self.decoder_embeddings = tf.get_variable(
                "decoder_embeddings",
                [self.tgt_vocab_size, self.input_embedding_size],
                initializer=initializer,
                dtype=tf.float32)

            input_layer = Dense(self.decoder_hidden_units,
                                dtype=tf.float32,
                                name='input_projection')

            # Output projection layer to convert cell_outputs to logits
            output_layer = Dense(self.tgt_vocab_size, name='output_projection')

            if self.mode == 'train':

                # Train Mode
                self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                    params=self.decoder_embeddings,
                    ids=self.decoder_inputs_train)

                self.decoder_inputs_embedded = input_layer(
                    self.decoder_inputs_embedded)

                training_helper = seq2seq.TrainingHelper(
                    inputs=self.decoder_inputs_embedded,
                    sequence_length=self.decoder_inputs_length_train,
                    time_major=False,
                    name='training_helper')

                training_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=training_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=output_layer)

                # Maximum decoder time_steps in current batch
                max_decoder_length = tf.reduce_max(
                    self.decoder_inputs_length_train)

                (self.decoder_output_train, self.decoder_last_state_train,
                 self.decoder_outputs_length_train) = (seq2seq.dynamic_decode(
                     decoder=training_decoder,
                     output_time_major=False,
                     impute_finished=True,
                     maximum_iterations=max_decoder_length))

                # [batch_size, max_time_step + 1, num_decoder_symbols]
                self.decoder_logits_train = tf.identity(
                    self.decoder_output_train.rnn_output)
                # Use argmax over the vocabulary axis to extract decoder symbols to emit
                self.decoder_pred_train = tf.argmax(self.decoder_logits_train,
                                                    axis=-1,
                                                    name='decoder_pred_train')

                # [batch_size, max_time_steps + 1]
                masks = tf.sequence_mask(
                    lengths=self.decoder_inputs_length_train,
                    maxlen=max_decoder_length,
                    dtype=tf.float32,
                    name='masks')

                self.loss = tf.contrib.seq2seq.sequence_loss(
                    logits=self.decoder_logits_train,
                    targets=self.decoder_targets_train,
                    weights=masks,
                    average_across_timesteps=True,
                    average_across_batch=True)

                # Training summary for the current batch_loss
                tf.summary.scalar('loss', self.loss)

                # Construct graphs for minimizing loss
                self.init_optimizer()

            elif self.mode == 'decode':

                # Decode mode

                # start_token and end_token are assumed to be defined in the
                # enclosing scope (e.g. vocabulary constants)
                start_tokens = tf.ones([
                    self.batch_size,
                ], tf.int32) * start_token

                def embed_and_input_proj(inputs):

                    return input_layer(
                        tf.nn.embedding_lookup(self.decoder_embeddings,
                                               inputs))

                # Feeds input for greedy decoding: uses argmax for the output
                decoding_helper = seq2seq.GreedyEmbeddingHelper(
                    start_tokens=start_tokens,
                    end_token=end_token,
                    embedding=embed_and_input_proj)

                inference_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=decoding_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=output_layer)

                (self.decoder_outputs_decode, self.decoder_last_state_decode,
                 self.decoder_outputs_length_decode) = (seq2seq.dynamic_decode(
                     decoder=inference_decoder,
                     output_time_major=False,
                     maximum_iterations=self.max_decode_step))

                # Expand dims so the output shape stays compatible with beam search
                # decoder_pred_decode: [batch_size, max_time_step, 1] (output_time_major=False)
                self.decoder_pred_decode = tf.expand_dims(
                    self.decoder_outputs_decode.sample_id, -1)
Example n. 11
    def build_model(self):
        """
        build model
        :return:
        """
        with tf.variable_scope('g_model'):
            # 1. Define the model placeholders
            # encoder
            self.encoder_inputs = tf.placeholder(
                tf.int32, [self.max_length_encoder, None],
                name='encoder_inputs')
            self.encoder_inputs_length = tf.placeholder(
                tf.int32, [None], name='encoder_inputs_length')
            # decoder
            self.decoder_targets = tf.placeholder(
                tf.int32, [self.max_length_decoder, None],
                name='decoder_targets')
            self.decoder_targets_length = tf.placeholder(
                tf.int32, [None], name='decoder_targets_length')
            self.max_target_sequence_length = tf.reduce_max(
                self.decoder_targets_length, name='max_target_len')
            self.mask = tf.sequence_mask(self.decoder_targets_length,
                                         self.max_target_sequence_length,
                                         dtype=tf.float32,
                                         name='masks')

            # for updating
            self.reward = tf.placeholder(tf.float32,
                                         [self.max_length_decoder, None],
                                         name='reward')
            self.start_tokens = tf.placeholder(
                tf.int32, [None], name='start_tokens')  # for partial-sampling
            self.max_inference_length = tf.placeholder(
                tf.int32, [None], name='max_inference_length')  # for inference

            # 2. Define the encoder
            with tf.variable_scope('encoder'):
                encoder_cell = self.create_rnn_cell()
                embedding = tf.get_variable(
                    'embedding', [self.vocab_size, self.embedding_size])
                encoder_inputs_embedded = tf.nn.embedding_lookup(
                    embedding, self.encoder_inputs)
                encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
                    encoder_cell,
                    encoder_inputs_embedded,
                    sequence_length=self.encoder_inputs_length,
                    dtype=tf.float32)

            # 3. Define the decoder
            with tf.variable_scope('decoder'):
                encoder_inputs_length = self.encoder_inputs_length
                # Define the attention mechanism to use
                attention_mechanism = seq2seq.BahdanauAttention(
                    num_units=self.lstm_size,
                    memory=encoder_outputs,
                    memory_sequence_length=encoder_inputs_length)
                decoder_cell = self.create_rnn_cell()
                decoder_cell = seq2seq.AttentionWrapper(
                    cell=decoder_cell,
                    attention_mechanism=attention_mechanism,
                    attention_layer_size=self.lstm_size,
                    name='Attention_Wrapper')
                # Initialize the decoder state directly from the encoder's final hidden state
                decoder_initial_state = decoder_cell.zero_state(
                    batch_size=self.batch_size,
                    dtype=tf.float32).clone(cell_state=encoder_state)
                output_layer = tf.layers.Dense(
                    self.vocab_size,
                    kernel_initializer=tf.truncated_normal_initializer(
                        mean=0.0, stddev=0.1))

                ending = tf.strided_slice(self.decoder_targets, [0, 0],
                                          [self.batch_size, -1], [1, 1])
                decoder_inputs = tf.concat([
                    tf.fill([self.batch_size, 1], tf.cast(
                        GO_ID, dtype=tf.int32)), ending
                ], 1)
                decoder_inputs_embedded = tf.nn.embedding_lookup(
                    embedding, decoder_inputs)

                # train
                helper_train = seq2seq.TrainingHelper(
                    decoder_inputs_embedded,
                    self.decoder_targets_length,
                    time_major=True)
                decoder_train = seq2seq.BasicDecoder(decoder_cell,
                                                     helper_train,
                                                     decoder_initial_state,
                                                     output_layer=output_layer)
                decoder_output_train, decoder_state_train, _ = seq2seq.dynamic_decode(
                    decoder_train,
                    swap_memory=True,
                    output_time_major=True,
                    impute_finished=True,
                    # maximum_iterations must be a scalar, not the per-example lengths
                    maximum_iterations=self.max_target_sequence_length)
                self.decoder_logits_train = tf.identity(
                    decoder_output_train.rnn_output)
                self.decoder_predict_train = tf.argmax(
                    self.decoder_logits_train,
                    axis=-1,
                    name='decoder_pred_train')
                self.loss_pretrain = seq2seq.sequence_loss(
                    logits=self.decoder_logits_train,
                    targets=self.decoder_targets,
                    weights=self.mask)
                tf.summary.scalar('loss', self.loss_pretrain)
                self.summary_op = tf.summary.merge_all()
Example n. 12
    def build_decoder(self, decoder_init_embed):
        print("building attention and decoder...")
        with tf.variable_scope('decoder'):
            self.decoder_cell, self.decoder_initial_state, self.beam_decoder_cell, self.beam_decoder_initial_state \
                               = self._build_decoder_cell()
            # initializer
            self.decoder_embeddings = tf.Variable(decoder_init_embed, name="decoder_embedding", dtype=self.dtype)
            self.decoder_vocab_size = len(decoder_init_embed)
            input_layer = Dense(self.hidden_units, dtype=self.dtype, name="input_projection")
            output_layer = Dense(decoder_init_embed.shape[0], name="output_projection")

            # generate_mode
            decoder_start_tokens = tf.ones(shape=[self.batch_size, ], dtype=tf.int32) * params.start_token
            decoder_end_token = params.end_token
            def embed_and_input_proj(inputs):
                return input_layer(tf.nn.embedding_lookup(self.decoder_embeddings, inputs))


            print('greedy decoding...')
            generate_decoding_helper = seq2seq.GreedyEmbeddingHelper(start_tokens=decoder_start_tokens, \
                                                            end_token=decoder_end_token, \
                                                            embedding=embed_and_input_proj)
            generate_inference_decoder = seq2seq.BasicDecoder(cell=self.decoder_cell,
                                                         helper=generate_decoding_helper,
                                                         initial_state=self.decoder_initial_state,
                                                         output_layer=output_layer)
            with tf.variable_scope('decode_with_shared_attention'):
                self.gen_outputs, decoder_last_state, gen_outputs_len = (seq2seq.dynamic_decode(
                    decoder=generate_inference_decoder, \
                    output_time_major=False, \
                    maximum_iterations=self.max_sent_len)) # params.max_decoder_len              
            # self.gen_x: batch_size, max_decoder_len
            self.gen_x = self.gen_outputs.sample_id

            
            print("beam decoding...")
            beam_generate_inference_decoder = beam_search_decoder.BeamSearchDecoder(cell=self.beam_decoder_cell, \
                                                                                    embedding=embed_and_input_proj, \
                                                                                    start_tokens=decoder_start_tokens, \
                                                                                    end_token=decoder_end_token, \
                                                                                    initial_state=self.beam_decoder_initial_state, \
                                                                                    beam_width=self.beam_width, \
                                                                                    output_layer=output_layer)
            with tf.variable_scope('decode_with_shared_attention', reuse=True):
                self.beam_gen_outputs, beam_decoder_last_state, beam_gen_outputs_len = (seq2seq.dynamic_decode(
                    decoder=beam_generate_inference_decoder, \
                    output_time_major=False, \
                    maximum_iterations=self.max_sent_len)) # params.max_decoder_len
            self.beam_gen_x = self.beam_gen_outputs.predicted_ids



            print("decoder for rollout")
            # decoder inputs in train and rollout mode
            self.decoder_inputs_embedded = input_layer(tf.nn.embedding_lookup(params=self.decoder_embeddings, \
                                                                              ids=self.decoder_inputs))
            # rollout mode
            rollout_helper = seq2seq.GreedyEmbeddingHelper(start_tokens=decoder_start_tokens, \
                                                           end_token=decoder_end_token, \
                                                           embedding=embed_and_input_proj)
            rollout_decoder = seq2seq.BasicDecoder(cell=self.decoder_cell, \
                                                    helper=rollout_helper, \
                                                    initial_state=self.decoder_initial_state, \
                                                    output_layer=output_layer)
            # calc samples for each time step (fix sent[:given_time+1], roll out sent[given_time+1:])
            self.rollout_decoder_state = self.decoder_initial_state
            # rollout_outputs shape: max_decoder_len, batch_size

            rollout_outputs = tensor_array_ops.TensorArray(dtype=tf.int32, size=self.max_sent_len, \
                                                                dynamic_size=False, infer_shape=True)
            init_inputs_embedded = embed_and_input_proj(decoder_start_tokens)
            i = tf.constant(0)
            while_condition = lambda i, inputs_embedded, decoder_state, rollout_outputs, given_time: tf.less(i, given_time)
            def feed_body(i, inputs_embedded, decoder_state, rollout_outputs, given_time):
                print("feed body iter:", i)
                next_outputs, decoder_state, next_inputs, decoder_finished = rollout_decoder.step(i, inputs_embedded, decoder_state)
                inputs =  tf.reshape(tf.gather(params=self.decoder_inputs, indices=[i], axis=1), shape=[self.batch_size, ])
                inputs_embedded = embed_and_input_proj(inputs)
                rollout_outputs = rollout_outputs.write(i, inputs)
                return i+1, inputs_embedded, decoder_state, rollout_outputs, given_time               
            i, inputs_embedded, self.rollout_decoder_state, self.rollout_outputs,  _ = tf.while_loop(while_condition, feed_body, \
                                                                                            (0, init_inputs_embedded, \
                                                                                             self.rollout_decoder_state, rollout_outputs, self.given_time))
            # next_outputs shape: (batch_size, decoder_vocab_size)
            inputs =  tf.reshape(tf.gather(params=self.decoder_inputs, indices=[self.given_time], axis=1), shape=[self.batch_size, ])
            inputs_embedded = input_layer(tf.nn.embedding_lookup(params=self.decoder_embeddings, \
                                                             ids=inputs))
            # rollout outputs: sample from output probability
            i = self.given_time
            while_condition = lambda i, inputs_embedded, decoder_state, rollout_outputs, max_len: tf.less(i, self.max_sent_len)
            def pred_body(i, inputs_embedded, decoder_state, rollout_outputs, max_len):
                print("pred body iter", i)
                # record rollout sentences
                next_outputs, decoder_state, next_inputs, decoder_finished = rollout_decoder.step(i, inputs_embedded, \
                                                                           decoder_state)
                inputs = tf.cast(tf.reshape(tf.multinomial(next_outputs.rnn_output, 1), [self.batch_size, ]), tf.int32)
                inputs_embedded =  embed_and_input_proj(inputs)
                rollout_outputs = rollout_outputs.write(i, inputs)
                return i+1, inputs_embedded, decoder_state, rollout_outputs, max_len
            i, inputs_embedded, self.rollout_decoder_state, self.rollout_outputs, _ = tf.while_loop(while_condition, pred_body, (i, inputs_embedded, self.rollout_decoder_state, \
                                                           self.rollout_outputs, self.max_sent_len))
            self.rollout_outputs = self.rollout_outputs.stack()
            self.rollout_outputs = tf.transpose(self.rollout_outputs, perm=[1,0])


            # train mode
            print("decoder for both pre-training and RL training")
            decoder_start_token_train= tf.ones(shape=[self.batch_size, 1], dtype=tf.int32) * params.start_token
            decoder_end_token_train= tf.ones(shape=[self.batch_size, 1], dtype=tf.int32) * params.end_token
            self.decoder_inputs_train = tf.concat([decoder_start_token_train, self.decoder_inputs], axis=1)
            self.decoder_inputs_length_train= self.decoder_inputs_length + 1
            self.decoder_targets_train = tf.concat([self.decoder_inputs, decoder_end_token_train], axis=1)
            self.decoder_inputs_embedded_train = tf.nn.embedding_lookup(params=self.decoder_embeddings, \
                                                                        ids=self.decoder_inputs_train)
            self.decoder_inputs_embedded_train = input_layer(self.decoder_inputs_embedded_train)

            training_helper = seq2seq.TrainingHelper(inputs=self.decoder_inputs_embedded_train, \
                                                     sequence_length=self.decoder_inputs_length_train, \
                                                     time_major=False,
                                                     name="training_helper")
            training_decoder = seq2seq.BasicDecoder(cell=self.decoder_cell, \
                                                    helper=training_helper, \
                                                    initial_state=self.decoder_initial_state, \
                                                    output_layer=output_layer)
            max_decoder_length = tf.reduce_max(self.decoder_inputs_length_train)
            self.decoder_outputs_train,  self.decoder_last_state_train, self.decoder_ouputs_len_train \
                                        = seq2seq.dynamic_decode(\
                                            decoder = training_decoder, \
                                            output_time_major = False, \
                                            impute_finished = True, \
                                            maximum_iterations = max_decoder_length)
            # flat-and-pad: rnn_output: batch_size * max_decoder_length * decoder_vocab_size
            self.decoder_logits_train = tf.identity(self.decoder_outputs_train.rnn_output)
            logits_padding = tf.one_hot(indices=tf.ones(shape=[self.batch_size, self.max_sent_len+1-max_decoder_length], dtype=tf.int32) * params.end_token, \
                                        depth=self.decoder_vocab_size, on_value=10.0, off_value=-20.0, axis=-1, dtype=self.dtype)
            # decoder_logits_train_pad: batch_size * (params.max_decoder_len+1 )* decoder_vocab_size
            self.decoder_logits_train_pad = tf.concat([self.decoder_logits_train, logits_padding], axis=1)
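            # the one_hot padding (on_value=10.0, off_value=-20.0) makes padded
            # steps confidently predict end_token; it mainly restores the fixed
            # time dimension, since the sequence mask below zeroes those
            # positions' weight in the loss anyway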
            
            # pre-train loss
            masks = tf.sequence_mask(lengths=self.decoder_inputs_length_train, maxlen=self.max_sent_len+1, \
                                     dtype=self.dtype, name="masks")
            self.pretrain_g_loss = seq2seq.sequence_loss(logits=tf.identity(self.decoder_logits_train_pad), \
                                                         targets=self.decoder_targets_train, \
                                                         weights=masks,\
                                                         average_across_timesteps=True,\
                                                         average_across_batch=True)
            # rl loss
            self.gen_prob = tf.nn.softmax(self.decoder_logits_train_pad)
            self.g_loss = -1.0 * tf.reduce_sum(
                tf.reduce_sum(
                    tf.one_hot(tf.to_int32(tf.reshape(self.decoder_targets_train, [-1])), self.decoder_vocab_size, 1.0, 0.0) * tf.log(
                        tf.clip_by_value(tf.reshape(self.gen_prob, [-1, self.decoder_vocab_size]), 1e-20, 1.0)), 1) \
                              * tf.reshape(self.rewards, [-1]))
            self.init_optimizer()
Example n. 13
    def decode(self, encoder_outputs, batch_size):
        # Attention
        attention_cell = AttentionWrapper(
            DecoderPrenetWrapper(GRUCell(self._hparams.get('attention_depth')),
                                 self._is_training,
                                 self._hparams.get('prenet_depths')),
            BahdanauAttention(self._hparams.get('attention_depth'),
                              encoder_outputs),
            alignment_history=True,
            output_attention=False)  # [N, T_in, attention_depth=256]

        # Concatenate attention context vector and RNN cell output into a 2*attention_depth=512D vector.
        concat_cell = ConcatOutputAndAttentionWrapper(
            attention_cell)  # [N, T_in, 2*attention_depth=512]

        # Decoder (layers specified bottom to top):
        decoder_cell = MultiRNNCell(
            [
                OutputProjectionWrapper(concat_cell,
                                        self._hparams.get('decoder_depth')),
                ResidualWrapper(GRUCell(self._hparams.get('decoder_depth'))),
                ResidualWrapper(GRUCell(self._hparams.get('decoder_depth')))
            ],
            state_is_tuple=True)  # [N, T_in, decoder_depth=256]

        # Project onto r mel spectrograms (predict r outputs at each RNN step):
        output_cell = OutputProjectionWrapper(
            decoder_cell,
            self._hparams.get('num_mels') *
            self._hparams.get('outputs_per_step'))
        decoder_init_state = output_cell.zero_state(batch_size=batch_size,
                                                    dtype=tf.float32)

        (decoder_outputs, _), final_decoder_state, _ = dynamic_decode(
            BasicDecoder(output_cell, self._helper, decoder_init_state),
            maximum_iterations=self._hparams.get(
                'max_iters'))  # [N, T_out/r, M*r]

        mel_outputs = tf.reshape(
            decoder_outputs,
            [batch_size, -1, self._hparams.get('num_mels')])

        # Post processing CHBG
        kwargs = {
            'K': self._hparams.get('decoder_K'),
            'bank_num_filters': self._hparams.get('decoder_bank_num_filters'),
            'pooling_stride': self._hparams.get('decoder_pooling_stride'),
            'pooling_width': self._hparams.get('decoder_pooling_width'),
            'proj_num_filters': self._hparams.get('decoder_proj_num_filters'),
            'proj_filter_width':
            self._hparams.get('decoder_proj_filter_width'),
            'num_highway_layers':
            self._hparams.get('decoder_num_highway_layers'),
            'highway_depth': self._hparams.get('decoder_highway_depth'),
            'gru_num_cells': self._hparams.get('decoder_gru_num_cells')
        }
        post_out = cbhg(mel_outputs, None, self._is_training, 'post_cbhg',
                        **kwargs)
        lin_outputs = tf.layers.dense(post_out, self._hparams.get('num_freq'))

        return mel_outputs, lin_outputs, final_decoder_state
Example n. 14
def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
                   target_dict_dim, is_generating, beam_size,
                   max_generation_length):
    src_word_idx = tf.placeholder(tf.int32, shape=[None, None])
    src_sequence_length = tf.placeholder(tf.int32, shape=[None, ])

    src_embedding_weights = tf.get_variable("source_word_embeddings",
                                            [source_dict_dim, embedding_dim])
    src_embedding = tf.nn.embedding_lookup(src_embedding_weights, src_word_idx)

    src_forward_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
    src_reversed_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
    # no peephole
    encoder_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=src_forward_cell,
        cell_bw=src_reversed_cell,
        inputs=src_embedding,
        sequence_length=src_sequence_length,
        dtype=tf.float32)

    # concat the forward outputs and backward outputs
    encoded_vec = tf.concat(encoder_outputs, axis=2)

    # project the encoder outputs to the size of the decoder lstm
    # (the concatenated bidirectional outputs have width encoder_size * 2)
    encoded_proj = tf.contrib.layers.fully_connected(
        inputs=tf.reshape(
            encoded_vec, shape=[-1, encoder_size * 2]),
        num_outputs=decoder_size,
        activation_fn=None,
        biases_initializer=None)
    encoded_proj_reshape = tf.reshape(
        encoded_proj, shape=[-1, tf.shape(encoded_vec)[1], decoder_size])

    # get the init state for the decoder lstm's H from the first step of the
    # backward pass (whose outputs have width encoder_size)
    backward_first = tf.slice(encoder_outputs[1], [0, 0, 0], [-1, 1, -1])
    decoder_boot = tf.contrib.layers.fully_connected(
        inputs=tf.reshape(
            backward_first, shape=[-1, encoder_size]),
        num_outputs=decoder_size,
        activation_fn=tf.nn.tanh,
        biases_initializer=None)

    # prepare the initial state for decoder lstm
    cell_init = tf.zeros(tf.shape(decoder_boot), tf.float32)
    initial_state = LSTMStateTuple(cell_init, decoder_boot)
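    # LSTMStateTuple is (c, h): the cell state starts at zero while the hidden
    # state is bootstrapped from the projected first backward-direction output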

    # create decoder lstm cell
    decoder_cell = LSTMCellWithSimpleAttention(
        decoder_size,
        encoded_vec
        if not is_generating else seq2seq.tile_batch(encoded_vec, beam_size),
        encoded_proj_reshape if not is_generating else
        seq2seq.tile_batch(encoded_proj_reshape, beam_size),
        src_sequence_length if not is_generating else
        seq2seq.tile_batch(src_sequence_length, beam_size),
        forget_bias=0.0)

    output_layer = Dense(target_dict_dim, name='output_projection')

    if not is_generating:
        trg_word_idx = tf.placeholder(tf.int32, shape=[None, None])
        trg_sequence_length = tf.placeholder(tf.int32, shape=[None, ])
        trg_embedding_weights = tf.get_variable(
            "target_word_embeddings", [target_dict_dim, embedding_dim])
        trg_embedding = tf.nn.embedding_lookup(trg_embedding_weights,
                                               trg_word_idx)

        training_helper = seq2seq.TrainingHelper(
            inputs=trg_embedding,
            sequence_length=trg_sequence_length,
            time_major=False,
            name='training_helper')

        training_decoder = seq2seq.BasicDecoder(
            cell=decoder_cell,
            helper=training_helper,
            initial_state=initial_state,
            output_layer=output_layer)

        # get the max length of target sequence
        max_decoder_length = tf.reduce_max(trg_sequence_length)

        decoder_outputs_train, _, _ = seq2seq.dynamic_decode(
            decoder=training_decoder,
            output_time_major=False,
            impute_finished=True,
            maximum_iterations=max_decoder_length)

        decoder_logits_train = tf.identity(decoder_outputs_train.rnn_output)
        decoder_pred_train = tf.argmax(
            decoder_logits_train, axis=-1, name='decoder_pred_train')
        masks = tf.sequence_mask(
            lengths=trg_sequence_length,
            maxlen=max_decoder_length,
            dtype=tf.float32,
            name='masks')

        # place holder of label sequence
        lbl_word_idx = tf.placeholder(tf.int32, shape=[None, None])

        # compute the loss
        loss = seq2seq.sequence_loss(
            logits=decoder_logits_train,
            targets=lbl_word_idx,
            weights=masks,
            average_across_timesteps=True,
            average_across_batch=True)

        # return feeding list and loss operator
        return {
            'src_word_idx': src_word_idx,
            'src_sequence_length': src_sequence_length,
            'trg_word_idx': trg_word_idx,
            'trg_sequence_length': trg_sequence_length,
            'lbl_word_idx': lbl_word_idx
        }, loss
    else:
        start_tokens = tf.ones([tf.shape(src_word_idx)[0], ],
                               tf.int32) * START_TOKEN_IDX
        # share the same embedding weights with target word
        trg_embedding_weights = tf.get_variable(
            "target_word_embeddings", [target_dict_dim, embedding_dim])

        inference_decoder = beam_search_decoder.BeamSearchDecoder(
            cell=decoder_cell,
            embedding=lambda tokens: tf.nn.embedding_lookup(trg_embedding_weights, tokens),
            start_tokens=start_tokens,
            end_token=END_TOKEN_IDX,
            initial_state=tf.nn.rnn_cell.LSTMStateTuple(
                tf.contrib.seq2seq.tile_batch(initial_state[0], beam_size),
                tf.contrib.seq2seq.tile_batch(initial_state[1], beam_size)),
            beam_width=beam_size,
            output_layer=output_layer)

        decoder_outputs_decode, _, _ = seq2seq.dynamic_decode(
            decoder=inference_decoder,
            output_time_major=False,
            #impute_finished=True,# error occurs
            maximum_iterations=max_generation_length)

        predicted_ids = decoder_outputs_decode.predicted_ids

        return {
            'src_word_idx': src_word_idx,
            'src_sequence_length': src_sequence_length
        }, predicted_ids
Example n. 15
    def BuildNetwork(self, learningRate):
        self.dataInput = tensorflow.placeholder(dtype=tensorflow.float32, shape=[None, None, self.featureShape],
                                                name='DataInput')
        self.seqInput = tensorflow.placeholder(dtype=tensorflow.int32, shape=[None], name='SeqInput')

        #############################################################################
        # Batch Parameters
        #############################################################################

        self.parameters['BatchSize'], self.parameters['TimeStep'], _ = tensorflow.unstack(
            tensorflow.shape(input=self.dataInput, name='DataShape'))

        ###################################################################################################
        # Encoder
        ###################################################################################################

        with tensorflow.variable_scope('Encoder_AE'):
            self.parameters['Encoder_Cell_Forward_AE'] = tensorflow.nn.rnn_cell.MultiRNNCell(
                cells=[rnn.LSTMCell(num_units=self.hiddenNodules) for _ in range(self.rnnLayers)], state_is_tuple=True)
            self.parameters['Encoder_Cell_Backward_AE'] = tensorflow.nn.rnn_cell.MultiRNNCell(
                cells=[rnn.LSTMCell(num_units=self.hiddenNodules) for _ in range(self.rnnLayers)], state_is_tuple=True)

            self.parameters['Encoder_Output_AE'], self.parameters['Encoder_FinalState_AE'] = \
                tensorflow.nn.bidirectional_dynamic_rnn(
                    cell_fw=self.parameters['Encoder_Cell_Forward_AE'],
                    cell_bw=self.parameters['Encoder_Cell_Backward_AE'],
                    inputs=self.dataInput, sequence_length=self.seqInput, dtype=tensorflow.float32)

        if self.attention is None:
            self.parameters['Decoder_InitalState_AE'] = []
            for index in range(self.rnnLayers):
                self.parameters['Encoder_Cell_Layer%d_AE' % index] = rnn.LSTMStateTuple(
                    c=tensorflow.concat([self.parameters['Encoder_FinalState_AE'][index][0].c,
                                         self.parameters['Encoder_FinalState_AE'][index][1].c], axis=1),
                    h=tensorflow.concat([self.parameters['Encoder_FinalState_AE'][index][0].h,
                                         self.parameters['Encoder_FinalState_AE'][index][1].h], axis=1))
                self.parameters['Decoder_InitalState_AE'].append(self.parameters['Encoder_Cell_Layer%d_AE' % index])
            self.parameters['Decoder_InitalState_AE'] = tuple(self.parameters['Decoder_InitalState_AE'])
        else:
            self.attentionList = self.attention(dataInput=self.parameters['Encoder_Output_AE'],
                                                scopeName=self.attentionName, hiddenNoduleNumber=2 * self.hiddenNodules,
                                                attentionScope=self.attentionScope, blstmFlag=True)
            self.parameters['Decoder_InitalState_AE'] = []
            for index in range(self.rnnLayers):
                self.parameters['Encoder_Cell_Layer%d_AE' % index] = rnn.LSTMStateTuple(
                    c=self.attentionList['FinalResult'],
                    h=tensorflow.concat(
                        [self.parameters['Encoder_FinalState_AE'][index][0].h,
                         self.parameters['Encoder_FinalState_AE'][index][1].h],
                        axis=1))
                self.parameters['Decoder_InitalState_AE'].append(self.parameters['Encoder_Cell_Layer%d_AE' % index])
            self.parameters['Decoder_InitalState_AE'] = tuple(self.parameters['Decoder_InitalState_AE'])

        #############################################################################
        # Decoder Label Pretreatment
        #############################################################################

        self.parameters['Decoder_Helper_AE'] = seq2seq.TrainingHelper(
            inputs=self.dataInput, sequence_length=self.seqInput, name='Decoder_Helper_AE')
        with tensorflow.variable_scope('Decoder_AE'):
            self.parameters['Decoder_FC_AE'] = Dense(self.featureShape)
            self.parameters['Decoder_Cell_AE'] = tensorflow.nn.rnn_cell.MultiRNNCell(
                cells=[rnn.LSTMCell(num_units=self.hiddenNodules * 2) for _ in range(self.rnnLayers)],
                state_is_tuple=True)

            self.parameters['Decoder_AE'] = seq2seq.BasicDecoder(
                cell=self.parameters['Decoder_Cell_AE'], helper=self.parameters['Decoder_Helper_AE'],
                initial_state=self.parameters['Decoder_InitalState_AE'], output_layer=self.parameters['Decoder_FC_AE'])

            self.parameters['Decoder_Logits_AE'], self.parameters['Decoder_FinalState_AE'], self.parameters[
                'Decoder_FinalSeq_AE'] = seq2seq.dynamic_decode(decoder=self.parameters['Decoder_AE'])

        #############################################################################
        # Losses
        #############################################################################

        self.parameters['Loss_AE'] = tensorflow.losses.absolute_difference(
            labels=self.dataInput, predictions=self.parameters['Decoder_Logits_AE'][0], weights=self.weight)
        self.train = tensorflow.train.AdamOptimizer(learning_rate=learningRate).minimize(self.parameters['Loss_AE'])
Example n. 16
    def BuildNetwork(self, learningRate):
        self.dataInput = tensorflow.placeholder(
            dtype=tensorflow.float32,
            shape=[self.batchSize, 1000, 40],
            name='dataInput')
        self.dataSeqInput = tensorflow.placeholder(dtype=tensorflow.int32,
                                                   shape=[self.batchSize],
                                                   name='dataSeqInput')
        self.labelInput = tensorflow.placeholder(dtype=tensorflow.int32,
                                                 shape=[self.batchSize, None],
                                                 name='labelInput')
        self.labelSeqInput = tensorflow.placeholder(dtype=tensorflow.int32,
                                                    shape=[self.batchSize],
                                                    name='labelSeqInput')

        self.parameters['EmbeddingDictionary'] = tensorflow.Variable(
            initial_value=tensorflow.truncated_normal([50, 256]),
            dtype=tensorflow.float32,
            name='EmbeddingDictionary')
        self.parameters['EmbeddingResult'] = tensorflow.nn.embedding_lookup(
            params=self.parameters['EmbeddingDictionary'],
            ids=self.labelInput,
            name='EmbeddingResult')

        with tensorflow.name_scope('Encoder'):
            self.parameters['Encoder_FW_Cell'] = rnn.LSTMCell(
                num_units=self.hiddenNoduleNumber, name='Encoder_FW_Cell')
            self.parameters['Encoder_BW_Cell'] = rnn.LSTMCell(
                num_units=self.hiddenNoduleNumber, name='Encoder_BW_Cell')
            [self.parameters['Encoder_FW_Output'], self.parameters['Encoder_BW_Output']], \
            [self.parameters['Encoder_FW_FinalState'], self.parameters['Encoder_BW_FinalState']] = \
                tensorflow.nn.bidirectional_dynamic_rnn(
                    cell_fw=self.parameters['Encoder_FW_Cell'], cell_bw=self.parameters['Encoder_BW_Cell'],
                    inputs=self.dataInput, sequence_length=self.dataSeqInput, dtype=tensorflow.float32)
            self.parameters['EncoderOutput'] = tensorflow.concat(
                [
                    self.parameters['Encoder_FW_Output'],
                    self.parameters['Encoder_BW_Output']
                ],
                axis=2,
                name='EncoderOutput')
            self.parameters['Encoder_FinalState_C'] = tensorflow.concat(
                [
                    self.parameters['Encoder_FW_FinalState'].c,
                    self.parameters['Encoder_BW_FinalState'].c
                ],
                axis=1,
                name='Encoder_FinalState_C')
            self.parameters['Encoder_FinalState_H'] = tensorflow.concat(
                [
                    self.parameters['Encoder_FW_FinalState'].h,
                    self.parameters['Encoder_BW_FinalState'].h
                ],
                axis=1,
                name='Encoder_FinalState_H')
            self.parameters['Encoder_FinalState'] = rnn.LSTMStateTuple(
                c=self.parameters['Encoder_FinalState_C'],
                h=self.parameters['Encoder_FinalState_H'])

        #################################################################################

        self.parameters['Helper'] = seq2seq.GreedyEmbeddingHelper(
            embedding=self.parameters['EmbeddingDictionary'],
            start_tokens=tensorflow.ones(self.batchSize,
                                         dtype=tensorflow.int32) * 40,
            end_token=0)
        self.parameters['Decoder_Cell'] = rnn.LSTMCell(num_units=2 *
                                                       self.hiddenNoduleNumber)
        self.parameters['Decoder'] = seq2seq.BasicDecoder(
            cell=self.parameters['Decoder_Cell'],
            helper=self.parameters['Helper'],
            initial_state=self.parameters['Encoder_FinalState'])
        self.parameters['DecoderOutput'], self.parameters['DecoderFinalState'], self.parameters['DecoderSeqLen'] = \
            seq2seq.dynamic_decode(decoder=self.parameters['Decoder'], output_time_major=False,
                                   maximum_iterations=tensorflow.reduce_max(self.labelSeqInput))

        self.parameters['Logits'] = tensorflow.layers.dense(
            inputs=self.parameters['DecoderOutput'][0],
            units=50,
            activation=None,
            name='Logits')
        # self.parameters['Mask'] = tensorflow.to_float(tensorflow.not_equal(self.labelInput, 0))
        self.parameters['Loss'] = tensorflow.reduce_mean(
            tensorflow.nn.softmax_cross_entropy_with_logits_v2(
                labels=tensorflow.one_hot(self.labelInput,
                                          depth=50,
                                          dtype=tensorflow.float32),
                logits=self.parameters['Logits']),
            name='Loss')
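        # the commented-out mask above is never applied, so padded label
        # positions contribute to this loss; a minimal masked variant, assuming
        # PAD id 0 (hypothetical), would weight the per-step cross entropy by
        # tf.to_float(tf.not_equal(self.labelInput, 0)) before averaging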
        self.train = tensorflow.train.AdamOptimizer(
            learning_rate=learningRate).minimize(self.parameters['Loss'])
Example n. 17
    def _init_tensorflow(self, infer: bool=False) -> 'tf':
        """
        Deferred importing of tensorflow and initializing model for training
        or sampling.

        This is necessary for two reasons: first, the tensorflow graph is
        different for training and inference, so must be reset when switching
        between modes. Second, importing tensorflow takes a long time, so
        we only want to do it if we actually need to.

        Parameters
        ----------
        infer : bool
            If True, initialize model for inference. If False, initialize
            model for training.

        Returns
        -------
        module
            TensorFlow module.
        """
        # quiet tensorflow. See: https://github.com/tensorflow/tensorflow/issues/1258
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

        self.cell_fn = {
            "lstm": rnn.BasicLSTMCell,
            "gru": rnn.GRUCell,
            "rnn": rnn.BasicRNNCell
        }.get(self.model_type, None)
        if self.cell_fn is None:
            raise clgen.UserError("Unrecognized model type")

        # reset the graph when switching between training and inference
        tf.reset_default_graph()

        # corpus info:
        batch_size = 1 if infer else self.corpus.batch_size
        seq_length = 1 if infer else self.corpus.seq_length
        vocab_size = self.corpus.vocab_size

        cells_lst = [self.cell_fn(self.rnn_size, state_is_tuple=True) for _ in range(self.num_layers)]
        self.cell = rnn.MultiRNNCell(cells_lst, state_is_tuple=True)

        with tf.device("/cpu:0"):
            # Inputs 
            self.encoder_input = tf.placeholder(tf.int32, [batch_size, seq_length])
            self.decoder_input = tf.placeholder(tf.int32, [batch_size, seq_length])
            self.target_weights = tf.placeholder(tf.int32, [batch_size, seq_length])
            self.lengths = tf.placeholder(tf.int32, [batch_size])

            self.q = tf.FIFOQueue(capacity=4,
                dtypes=[tf.int32, tf.int32, tf.int32, tf.int32],
                shapes=[tf.TensorShape([batch_size, seq_length]), 
                    tf.TensorShape([batch_size, seq_length]),
                    tf.TensorShape([batch_size, seq_length]),
                    tf.TensorShape([batch_size])])
            self.enqueue_op = self.q.enqueue((self.encoder_input, self.decoder_input, self.target_weights, self.lengths))
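            # the FIFO queue decouples feeding from training: a producer thread
            # runs enqueue_op with a feed_dict while each train step dequeues
            # its own pre-staged batch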

            next_example = self.q.dequeue()

            self.inputs = next_example[0]
            self.dec_inp = next_example[1]
            self.tweights = tf.to_float(next_example[2])
            self.lens = next_example[3]
        

        scope_name = 'rnnlm'
        with tf.variable_scope(scope_name):
            softmax_w = tf.get_variable("softmax_w", [self.rnn_size, vocab_size])
            softmax_b = tf.get_variable("softmax_b", [vocab_size])

            with tf.device("/cpu:0"):
                embedding_dec = tf.get_variable("embedding_dec", [vocab_size, self.rnn_size])
                dec_inp2 = tf.nn.embedding_lookup(embedding_dec, self.dec_inp)

        encoder = SeqEncoder(self.model_type, self.rnn_size, self.num_layers, batch_size, vocab_size)
        encoder_state = encoder.encode(self.inputs, self.lens)

        self.mean_latent, self.logvar_latent = encoder_to_latent(encoder_state, self.rnn_size, 32, self.num_layers, tf.float32)
        self.latent, self.KL_obj, self.KL_cost = sample(self.mean_latent, self.logvar_latent, 32)
        self.decoder_initial_state = latent_to_decoder(self.latent, self.rnn_size, 32, self.num_layers, tf.float32)


        decoder_initial_state2 = tuple([rnn.LSTMStateTuple(*single_layer_state) for single_layer_state in self.decoder_initial_state])

        helper = seq2seq.TrainingHelper(dec_inp2, self.lens, time_major=False)
        decoder = seq2seq.BasicDecoder(self.cell, helper, decoder_initial_state2, Dense(vocab_size))
        # dynamic_decode returns (outputs, final_state, final_sequence_lengths)
        self.final_outputs, self.final_state, _ = seq2seq.dynamic_decode(
            decoder, output_time_major=False, impute_finished=True, swap_memory=True, scope='rnnlm')

        self.final_out = self.final_outputs.rnn_output

        self.probs = tf.nn.softmax(self.final_out)
        self.cost = seq2seq.sequence_loss(self.final_out, self.inputs, self.tweights)

        self.learning_rate = tf.Variable(0.0, trainable=False)
        self.epoch = tf.Variable(0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.cost + self.KL_obj, tvars, aggregation_method=2), self.grad_clip)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))

        return tf
Example n. 18
    def _build(self,
               decoding_strategy="train_greedy",
               initial_state=None,
               inputs=None,
               sequence_length=None,
               embedding=None,
               start_tokens=None,
               end_token=None,
               softmax_temperature=None,
               max_decoding_length=None,
               impute_finished=False,
               output_time_major=False,
               input_time_major=False,
               helper=None,
               mode=None,
               **kwargs):
        """Performs decoding. This is a shared interface for both
        :class:`~texar.tf.modules.BasicRNNDecoder` and
        :class:`~texar.tf.modules.AttentionRNNDecoder`.

        The function provides **3 ways** to specify the
        decoding method, with varying flexibility:

        1. The :attr:`decoding_strategy` argument: A string taking value of:

            - **"train_greedy"**: decoding in teacher-forcing fashion \
              (i.e., feeding \
              `ground truth` to decode the next step), and each sample is \
              obtained by taking the `argmax` of the RNN output logits. \
              Arguments :attr:`(inputs, sequence_length, input_time_major)` \
              are required for this strategy, and argument :attr:`embedding` \
              is optional.
            - **"infer_greedy"**: decoding in inference fashion (i.e., feeding \
              the `generated` sample to decode the next step), and each sample\
              is obtained by taking the `argmax` of the RNN output logits.\
              Arguments :attr:`(embedding, start_tokens, end_token)` are \
              required for this strategy, and argument \
              :attr:`max_decoding_length` is optional.
            - **"infer_sample"**: decoding in inference fashion, and each
              sample is obtained by `random sampling` from the RNN output
              distribution. Arguments \
              :attr:`(embedding, start_tokens, end_token)` are \
              required for this strategy, and argument \
              :attr:`max_decoding_length` is optional.

          This argument is used only when argument :attr:`helper` is `None`.

          Example:

            .. code-block:: python

                embedder = WordEmbedder(vocab_size=data.vocab.size)
                decoder = BasicRNNDecoder(vocab_size=data.vocab.size)

                # Teacher-forcing decoding
                outputs_1, _, _ = decoder(
                    decoding_strategy='train_greedy',
                    inputs=embedder(data_batch['text_ids']),
                    sequence_length=data_batch['length']-1)

                # Random sample decoding. Gets 100 sequence samples
                outputs_2, _, sequence_length = decoder(
                    decoding_strategy='infer_sample',
                    start_tokens=[data.vocab.bos_token_id]*100,
                    end_token=data.vocab.eos_token_id,
                    embedding=embedder,
                    max_decoding_length=60)

        2. The :attr:`helper` argument: An instance of subclass of \
           :class:`texar.tf.modules.Helper`. This
           provides a superset of the decoding strategies above, for example:

            - :class:`~texar.tf.modules.TrainingHelper` corresponding to the \
              "train_greedy" strategy.
            - :class:`~texar.tf.modules.GreedyEmbeddingHelper` and \
              :class:`~texar.tf.modules.SampleEmbeddingHelper` corresponding to \
              the "infer_greedy" and "infer_sample", respectively.
            - :class:`~texar.tf.modules.TopKSampleEmbeddingHelper` for Top-K \
              sample decoding.
            - :class:`ScheduledEmbeddingTrainingHelper` and \
              :class:`ScheduledOutputTrainingHelper` for scheduled \
              sampling.
            - :class:`~texar.tf.modules.SoftmaxEmbeddingHelper` and \
              :class:`~texar.tf.modules.GumbelSoftmaxEmbeddingHelper` for \
              soft decoding and gradient backpropagation.

          Helpers give the maximal flexibility in configuring the decoding\
          strategy.

          Example:

            .. code-block:: python

                embedder = WordEmbedder(vocab_size=data.vocab.size)
                decoder = BasicRNNDecoder(vocab_size=data.vocab.size)

                # Teacher-forcing decoding, same as above with
                # `decoding_strategy='train_greedy'`
                helper_1 = tx.modules.TrainingHelper(
                    inputs=embedder(data_batch['text_ids']),
                    sequence_length=data_batch['length']-1)
                outputs_1, _, _ = decoder(helper=helper_1)

                # Gumbel-softmax decoding
                helper_2 = GumbelSoftmaxEmbeddingHelper(
                    embedding=embedder,
                    start_tokens=[data.vocab.bos_token_id]*100,
                    end_token=data.vocab.eos_token_id,
                    tau=0.1)
                outputs_2, _, sequence_length = decoder(
                    max_decoding_length=60, helper=helper_2)

        3. :attr:`hparams["helper_train"]` and :attr:`hparams["helper_infer"]`:\
           Specifying the helper through hyperparameters. Train and infer \
           strategy is toggled based on :attr:`mode`. Appropriate arguments \
           (e.g., :attr:`inputs`, :attr:`start_tokens`, etc) are selected to \
           construct the helper. Additional arguments for helper constructor \
           can be provided either through :attr:`**kwargs`, or through \
           :attr:`hparams["helper_train/infer"]["kwargs"]`.

           This method is used only when both :attr:`decoding_strategy` and \
           :attr:`helper` are `None`.

           Example:

             .. code-block:: python

                 h = {
                     "helper_infer": {
                         "type": "GumbelSoftmaxEmbeddingHelper",
                         "kwargs": { "tau": 0.1 }
                     }
                 }
                 embedder = WordEmbedder(vocab_size=data.vocab.size)
                 decoder = BasicRNNDecoder(vocab_size=data.vocab.size, hparams=h)

                 # Gumbel-softmax decoding
                 output, _, _ = decoder(
                     decoding_strategy=None, # Set to None explicitly
                     embedding=embedder,
                     start_tokens=[data.vocab.bos_token_id]*100,
                     end_token=data.vocab.eos_token_id,
                     max_decoding_length=60,
                     mode=tf.estimator.ModeKeys.PREDICT)
                         # PREDICT mode also shuts down dropout

        Args:
            decoding_strategy (str): A string specifying the decoding
                strategy. Different arguments are required based on the
                strategy.
                Ignored if :attr:`helper` is given.
            initial_state (optional): Initial state of decoding.
                If `None` (default), zero state is used.

            inputs (optional): Input tensors for teacher forcing decoding.
                Used when `decoding_strategy` is set to "train_greedy", or
                when `hparams`-configured helper is used.

                - If :attr:`embedding` is `None`, `inputs` is directly \
                fed to the decoder. E.g., in `"train_greedy"` strategy, \
                `inputs` must be a 3D Tensor of shape \
                `[batch_size, max_time, emb_dim]` (or \
                `[max_time, batch_size, emb_dim]` if `input_time_major`==True).
                - If `embedding` is given, `inputs` is used as index \
                to look up embeddings and feed in the decoder. \
                E.g., if `embedding` is an instance of \
                :class:`~texar.tf.modules.WordEmbedder`, \
                then :attr:`inputs` is usually a 2D int Tensor \
                `[batch_size, max_time]` (or \
                `[max_time, batch_size]` if `input_time_major`==True) \
                containing the token indexes.
            sequence_length (optional): A 1D int Tensor containing the
                sequence length of :attr:`inputs`.
                Used when `decoding_strategy="train_greedy"` or
                `hparams`-configured helper is used.
            embedding (optional): Embedding used when:

                - "infer_greedy" or "infer_sample" `decoding_strategy` is \
                used. This can be a callable or the `params` argument for \
                :tf_main:`embedding_lookup <nn/embedding_lookup>`. \
                If a callable, it can take a vector tensor of token `ids`, \
                or take two arguments (`ids`, `times`), where `ids` \
                is a vector tensor of token ids, and `times` is a vector tensor\
                of time steps (i.e., position ids). The latter case can be used\
                when :attr:`embedding` is a combination of word embedding and\
                position embedding. `embedding` is required in this case.
                - "train_greedy" `decoding_strategy` is used.\
                This can be a callable or the `params` argument for \
                :tf_main:`embedding_lookup <nn/embedding_lookup>`. \
                If a callable, it can take :attr:`inputs` and returns \
                the input embedding. `embedding` is optional in this case.
            start_tokens (optional): An int Tensor of shape `[batch_size]`,
                the start tokens. Used when `decoding_strategy="infer_greedy"`
                or `"infer_sample"`, or when the helper specified in `hparams`
                is used.

                Example:

                    .. code-block:: python

                        data = tx.data.MonoTextData(hparams)
                        iterator = DataIterator(data)
                        batch = iterator.get_next()

                        bos_token_id = data.vocab.bos_token_id
                        start_tokens=tf.ones_like(batch['length'])*bos_token_id

            end_token (optional): An int 0D Tensor, the token that marks end
                of decoding.
                Used when `decoding_strategy="infer_greedy"` or
                `"infer_sample"`, or when the helper specified in `hparams`
                is used.
            softmax_temperature (optional): A float 0D Tensor, value to divide
                the logits by before computing the softmax. Larger values
                (above 1.0) result in more random samples. Must be > 0. If `None`,
                1.0 is used.
                Used when `decoding_strategy="infer_sample"`.
            max_decoding_length: An int scalar Tensor indicating the maximum
                allowed number of decoding steps. If `None` (default), either
                `hparams["max_decoding_length_train"]` or
                `hparams["max_decoding_length_infer"]` is used
                according to :attr:`mode`.
            impute_finished (bool): If `True`, then states for batch
                entries which are marked as finished get copied through and
                the corresponding outputs get zeroed out.  This causes some
                slowdown at each time step, but ensures that the final state
                and outputs have the correct values and that backprop ignores
                time steps that were marked as finished.
            output_time_major (bool): If `True`, outputs are returned as
                time major tensors. If `False` (default), outputs are returned
                as batch major tensors.
            input_time_major (optional): Whether the :attr:`inputs` tensor is
                time major.
                Used when `decoding_strategy="train_greedy"` or
                `hparams`-configured helper is used.
            helper (optional): An instance of
                :class:`texar.tf.modules.Helper`
                that defines the decoding strategy. If given,
                `decoding_strategy`
                and helper configs in :attr:`hparams` are ignored.
            mode (str, optional): A string taking value in
                :tf_main:`tf.estimator.ModeKeys <estimator/ModeKeys>`. If
                `TRAIN`, training related hyperparameters are used (e.g.,
                `hparams['max_decoding_length_train']`), otherwise,
                inference related hyperparameters are used (e.g.,
                `hparams['max_decoding_length_infer']`).
                If `None` (default), `TRAIN` mode is used.
            **kwargs: Other keyword arguments for constructing helpers
                defined by `hparams["helper_train"]` or
                `hparams["helper_infer"]`.

        Returns:
            `(outputs, final_state, sequence_lengths)`, where

            - **`outputs`**: an object containing the decoder output on all \
            time steps.
            - **`final_state`**: the cell state of the final time step.
            - **`sequence_lengths`**: an int Tensor of shape `[batch_size]` \
            containing the length of each sample.
        """
        # Helper
        if helper is not None:
            pass
        elif decoding_strategy is not None:
            if decoding_strategy == "train_greedy":
                helper = rnn_decoder_helpers._get_training_helper(
                    inputs, sequence_length, embedding, input_time_major)
            elif decoding_strategy == "infer_greedy":
                helper = tx_helper.GreedyEmbeddingHelper(
                    embedding, start_tokens, end_token)
            elif decoding_strategy == "infer_sample":
                helper = tx_helper.SampleEmbeddingHelper(
                    embedding, start_tokens, end_token, softmax_temperature)
            else:
                raise ValueError(
                    "Unknown decoding strategy: {}".format(decoding_strategy))
        else:
            if is_train_mode_py(mode):
                kwargs_ = copy.copy(self._hparams.helper_train.kwargs.todict())
                helper_type = self._hparams.helper_train.type
            else:
                kwargs_ = copy.copy(self._hparams.helper_infer.kwargs.todict())
                helper_type = self._hparams.helper_infer.type
            kwargs_.update({
                "inputs": inputs,
                "sequence_length": sequence_length,
                "time_major": input_time_major,
                "embedding": embedding,
                "start_tokens": start_tokens,
                "end_token": end_token,
                "softmax_temperature": softmax_temperature
            })
            kwargs_.update(kwargs)
            helper = rnn_decoder_helpers.get_helper(helper_type, **kwargs_)
        self._helper = helper

        # Initial state
        if initial_state is not None:
            self._initial_state = initial_state
        else:
            self._initial_state = self.zero_state(batch_size=self.batch_size,
                                                  dtype=tf.float32)

        # Maximum decoding length
        max_l = max_decoding_length
        if max_l is None:
            max_l_train = self._hparams.max_decoding_length_train
            if max_l_train is None:
                max_l_train = utils.MAX_SEQ_LENGTH
            max_l_infer = self._hparams.max_decoding_length_infer
            if max_l_infer is None:
                max_l_infer = utils.MAX_SEQ_LENGTH
            max_l = tf.cond(is_train_mode(mode), lambda: max_l_train,
                            lambda: max_l_infer)
        self.max_decoding_length = max_l
        # Decode
        outputs, final_state, sequence_lengths = dynamic_decode(
            decoder=self,
            impute_finished=impute_finished,
            maximum_iterations=max_l,
            output_time_major=output_time_major)

        if not self._built:
            self._add_internal_trainable_variables()
            # Add trainable variables of `self._cell` which may be
            # constructed externally.
            self._add_trainable_variable(
                layers.get_rnn_cell_trainable_variables(self._cell))
            if isinstance(self._output_layer, tf.layers.Layer):
                self._add_trainable_variable(
                    self._output_layer.trainable_variables)
            # Add trainable variables of `self._beam_search_rnn_cell` which
            # may already be constructed and used.
            if self._beam_search_cell is not None:
                self._add_trainable_variable(
                    self._beam_search_cell.trainable_variables)

            self._built = True

        return outputs, final_state, sequence_lengths
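The snippet relies on two texar utilities: `is_train_mode_py` (a Python-level check used to pick the hparams-configured helper) and `is_train_mode` (a graph-level check used as the `tf.cond` predicate for the maximum decoding length). A plausible sketch of their behavior, assuming `None` defaults to `TRAIN` as the docstring states:

import tensorflow as tf

def is_train_mode_py(mode):
    # Python bool: True for TRAIN, or when no mode is given.
    return mode is None or mode == tf.estimator.ModeKeys.TRAIN

def is_train_mode(mode):
    # Graph-level variant: returns a boolean tensor usable inside tf.cond,
    # so `mode` may itself be a string placeholder.
    if mode is None:
        return tf.constant(True)
    return tf.equal(mode, tf.estimator.ModeKeys.TRAIN)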
Example n. 19
0
    def decode(self, encoder_outputs, encoder_state, source_sequence_length):
        with tf.variable_scope("Decoder") as scope:
            beam_width = self.beam_width
            decoder_type = self.decoder_type
            seq_max_len = self.seq_max_len
            batch_size = tf.shape(encoder_outputs)[0]

            if self.path_embed_method == "lstm":
                self.decoder_cell = self._build_decode_cell()
                if self.mode == "test" and beam_width > 0:
                    memory = seq2seq.tile_batch(self.encoder_outputs, multiplier=beam_width)
                    source_sequence_length = seq2seq.tile_batch(self.source_sequence_length, multiplier=beam_width)
                    encoder_state = seq2seq.tile_batch(self.encoder_state, multiplier=beam_width)
                    batch_size = self.batch_size * beam_width
                else:
                    memory = encoder_outputs

                attention_mechanism = seq2seq.BahdanauAttention(self.hidden_layer_dim, memory,
                                                                memory_sequence_length=source_sequence_length)
                self.decoder_cell = seq2seq.AttentionWrapper(self.decoder_cell, attention_mechanism,
                                                             attention_layer_size=self.hidden_layer_dim)
                self.decoder_initial_state = self.decoder_cell.zero_state(batch_size, tf.float32).clone(cell_state=encoder_state)

            projection_layer = Dense(self.word_vocab_size, use_bias=False)

            """For training the model"""
            if self.mode == "train":
                decoder_train_helper = tf.contrib.seq2seq.TrainingHelper(self.decoder_train_inputs_embedded,
                                                                         self.decoder_train_length)
                decoder_train = seq2seq.BasicDecoder(self.decoder_cell, decoder_train_helper,
                                                     self.decoder_initial_state,
                                                     projection_layer)
                decoder_outputs_train, decoder_states_train, decoder_seq_len_train = seq2seq.dynamic_decode(decoder_train)
                decoder_logits_train = decoder_outputs_train.rnn_output
                self.decoder_logits_train = tf.reshape(decoder_logits_train, [batch_size, -1, self.word_vocab_size])

            """For test the model"""
            # if self.mode == "infer" or self.if_pred_on_dev:
            if decoder_type == "greedy":
                decoder_infer_helper = seq2seq.GreedyEmbeddingHelper(self.word_embeddings,
                                                                     tf.ones([batch_size], dtype=tf.int32),
                                                                     self.EOS)
                decoder_infer = seq2seq.BasicDecoder(self.decoder_cell, decoder_infer_helper,
                                                     self.decoder_initial_state, projection_layer)
            elif decoder_type == "beam":
                decoder_infer = seq2seq.BeamSearchDecoder(cell=self.decoder_cell, embedding=self.word_embeddings,
                                                          start_tokens=tf.ones([batch_size], dtype=tf.int32),
                                                          end_token=self.EOS,
                                                          initial_state=self.decoder_initial_state,
                                                          beam_width=beam_width,
                                                          output_layer=projection_layer)

            decoder_outputs_infer, decoder_states_infer, decoder_seq_len_infer = seq2seq.dynamic_decode(decoder_infer,
                                                                                                        maximum_iterations=seq_max_len)

            if decoder_type == "beam":
                self.decoder_logits_infer = tf.no_op()
                self.sample_id = decoder_outputs_infer.predicted_ids

            elif decoder_type == "greedy":
                self.decoder_logits_infer = decoder_outputs_infer.rnn_output
                self.sample_id = decoder_outputs_infer.sample_id
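Note the two output types above: `BasicDecoder` yields `rnn_output` and `sample_id`, while `BeamSearchDecoder` yields `predicted_ids` of shape `[batch_size, max_time, beam_width]` and no logits (hence the `tf.no_op()`). Beams come back sorted by score, so picking the best hypothesis is a single slice, e.g.:

# predicted_ids: [batch_size, max_time, beam_width], best beam first.
best_hypothesis = decoder_outputs_infer.predicted_ids[:, :, 0]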
Example n. 20
0
    def initialize(self,
                   inputs,
                   input_lengths,
                   mel_targets=None,
                   stop_token_targets=None,
                   gta=False):
        """
		Initializes the model for inference

		sets "mel_outputs" and "alignments" fields.

		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
			  steps in the input time series, and values are character IDs
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
			of each sequence in inputs.
			- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
			of steps in the output time series, M is num_mels, and values are entries in the mel
			spectrogram. Only needed for training.
		"""
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError(
                'no mel targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None:
            raise ValueError(
                'Mel targets are provided without corresponding token_targets')

        with tf.variable_scope('inference') as scope:
            is_training = mel_targets is not None and not gta
            batch_size = tf.shape(inputs)[0]
            hp = self._hparams

            # Embeddings ==> [batch_size, sequence_length, embedding_dim]
            embedding_table = tf.get_variable(
                'inputs_embedding', [len(symbols), hp.embedding_dim],
                dtype=tf.float32,
                initializer=tf.contrib.layers.xavier_initializer())
            embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)

            #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
            encoder_cell = TacotronEncoderCell(
                EncoderConvolutions(is_training,
                                    kernel_size=hp.enc_conv_kernel_size,
                                    channels=hp.enc_conv_channels),
                EncoderRNN(is_training,
                           size=hp.encoder_lstm_units,
                           zoneout=hp.zoneout_rate,
                           scope='encoder_LSTM'))

            encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

            #For shape visualization purpose
            enc_conv_output_shape = encoder_cell.conv_output_shape

            #Decoder Parts
            #Attention Decoder Prenet
            prenet = Prenet(is_training, layer_sizes=hp.prenet_layers)
            #Attention Mechanism
            attention_mechanism = LocationSensitiveAttention(
                hp.attention_dim, encoder_outputs)
            #Decoder LSTM Cells
            decoder_lstm = DecoderRNN(is_training,
                                      layers=hp.decoder_layers,
                                      size=hp.decoder_lstm_units,
                                      zoneout=hp.zoneout_rate)
            #Frames Projection layer
            frame_projection = FrameProjection(hp.num_mels *
                                               hp.outputs_per_step)
            #<stop_token> projection layer
            stop_projection = StopProjection(is_training)

            #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
            decoder_cell = TacotronDecoderCell(prenet, attention_mechanism,
                                               decoder_lstm, frame_projection,
                                               stop_projection)

            #Define the helper for our decoder
            if is_training or gta:
                self.helper = TacoTrainingHelper(inputs, mel_targets,
                                                 hp.num_mels,
                                                 hp.outputs_per_step)
            else:
                self.helper = TacoTestHelper(batch_size, hp.num_mels,
                                             hp.outputs_per_step)

            #We'll only limit decoder time steps during inference (consult hparams.py to modify the value)
            max_iterations = None if is_training else hp.max_iters

            #initial decoder state
            decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                         dtype=tf.float32)

            #Decode
            (frames_prediction, stop_token_prediction,
             _), final_decoder_state, _ = dynamic_decode(
                 CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                 impute_finished=hp.impute_finished,
                 maximum_iterations=max_iterations)

            # Reshape outputs to be one output per entry
            #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
            decoder_output = tf.reshape(frames_prediction,
                                        [batch_size, -1, hp.num_mels])
            stop_token_prediction = tf.reshape(stop_token_prediction,
                                               [batch_size, -1])

            #Postnet
            postnet = Postnet(is_training,
                              kernel_size=hp.postnet_kernel_size,
                              channels=hp.postnet_channels)

            #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
            residual = postnet(decoder_output)

            #Project residual to same dimension as mel spectrogram
            #==> [batch_size, decoder_steps * r, num_mels]
            residual_projection = FrameProjection(hp.num_mels)
            projected_residual = residual_projection(residual)

            #Compute the mel spectrogram
            mel_outputs = decoder_output + projected_residual

            #Grab alignments from the final decoder state
            alignments = tf.transpose(
                final_decoder_state.alignment_history.stack(), [1, 2, 0])

            self.inputs = inputs
            self.input_lengths = input_lengths
            self.decoder_output = decoder_output
            self.alignments = alignments
            self.stop_token_prediction = stop_token_prediction
            self.stop_token_targets = stop_token_targets
            self.mel_outputs = mel_outputs
            self.mel_targets = mel_targets
            log('Initialized Tacotron model. Dimensions: ')
            log('  embedding:                {}'.format(embedded_inputs.shape))
            log('  enc conv out:             {}'.format(enc_conv_output_shape))
            log('  encoder out:              {}'.format(encoder_outputs.shape))
            log('  decoder out:              {}'.format(decoder_output.shape))
            log('  residual out:             {}'.format(residual.shape))
            log('  projected residual out:   {}'.format(
                projected_residual.shape))
            log('  mel out:                  {}'.format(mel_outputs.shape))
            log('  <stop_token> out:         {}'.format(
                stop_token_prediction.shape))
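For reference, `alignment_history` is a TensorArray holding one `[batch_size, encoder_steps]` attention distribution per decoder step, so `stack()` yields `[decoder_steps, batch_size, encoder_steps]` and the `[1, 2, 0]` transpose produces `[batch_size, encoder_steps, decoder_steps]`, the layout usually used for alignment plots. A quick shape check with made-up sizes:

import tensorflow as tf

stacked = tf.zeros([100, 32, 50])              # [decoder_steps, batch, encoder_steps]
alignments = tf.transpose(stacked, [1, 2, 0])  # [batch, encoder_steps, decoder_steps]
print(alignments.shape)                        # (32, 50, 100)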
Example n. 21
0
def rbmE_gruD(mode, features, labels, params):
    inp = features["x"]

    if state != "Infering":
        ids = features["ids"]
        weights = features["weights"]

    batch_size = params["batch_size"]

    #Encoder
    enc_cell = rnn.NASCell(num_units=NUM_UNITS)
    enc_out, enc_state = tf.nn.dynamic_rnn(enc_cell,
                                           inp,
                                           time_major=False,
                                           dtype=tf.float32)
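    # enc_state (the final NASCell state) seeds the decoder below.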

    #Decoder
    cell = rnn.NASCell(num_units=NUM_UNITS)

    _, embeddings = load_processed_embeddings(sess=tf.InteractiveSession())
    out_lengths = tf.constant(seq_len, shape=[batch_size])
    if state != "Infering":
        #sampling method for training
        train_helper = seq2seq.TrainingHelper(labels,
                                              out_lengths,
                                              time_major=False)
        '''
        train_helper=seq2seq.ScheduledEmbeddingTrainingHelper(inputs=labels,
                                                              sequence_length=out_lengths,
                                                              embedding=embeddings,
                                                              sampling_probability=probs)
        '''
    #sampling method for evaluation
    start_tokens = tf.zeros([batch_size], dtype=tf.int32)
    infer_helper = seq2seq.GreedyEmbeddingHelper(embedding=embeddings,
                                                 start_tokens=start_tokens,
                                                 end_token=END)
    #infer_helper = seq2seq.SampleEmbeddingHelper(embeddings,start_tokens=start_tokens,end_token=END)
    #infer_helper=seq2seq.ScheduledEmbeddingTrainingHelper(inputs=inp,sequence_length=out_lengths,embedding=embeddings,sampling_probability=1.0)
    projection_layer = layers_core.Dense(vocab_size, use_bias=False)

    def decode(helper):
        decoder = seq2seq.BasicDecoder(cell=cell,
                                       helper=helper,
                                       initial_state=enc_state,
                                       output_layer=projection_layer)
        #decoder.tracks_own_finished=True
        (dec_outputs, _,
         _) = seq2seq.dynamic_decode(decoder, maximum_iterations=seq_len)
        #(dec_outputs,_,_) = seq2seq.dynamic_decode(decoder)
        dec_ids = dec_outputs.sample_id
        logits = dec_outputs.rnn_output
        return dec_ids, logits

    #equalize logits, labels and weight lengths incase of early finish in decoder
    def norm_logits_loss(logts, ids, weights):
        current_ts = tf.to_int32(
            tf.minimum(tf.shape(ids)[1],
                       tf.shape(logts)[1]))
        logts = tf.slice(logts, begin=[0, 0, 0], size=[-1, current_ts, -1])
        ids = tf.slice(ids, begin=[0, 0], size=[-1, current_ts])
        weights = tf.slice(weights, begin=[0, 0], size=[-1, current_ts])
        return logts, ids, weights

    #training mode
    if state == "Training":
        dec_ids, logits = decode(train_helper)
        # some sample_id are overwritten with '-1's
        #dec_ids = tf.argmax(logits, axis=2)
        tf.identity(dec_ids, name="predictions")
        logits, ids, weights = norm_logits_loss(logits, ids, weights)
        loss = tf.contrib.seq2seq.sequence_loss(logits, ids, weights=weights)
        learning_rate = 0.001  #0.0001

        tf.identity(learning_rate, name="learning_rate")

    #evaluation mode
    if state == "Evaluating" or state == "Testing":
        eval_dec_ids, eval_logits = decode(infer_helper)
        #eval_dec_ids = tf.argmax(eval_logits, axis=2)
        tf.identity(eval_dec_ids, name="predictions")

        #equalize logits, labels and weight lengths incase of early finish in decoder
        eval_logits, ids, weights = norm_logits_loss(eval_logits, ids, weights)
        '''
        current_ts = tf.to_int32(tf.minimum(tf.shape(ids)[1], tf.shape(eval_logits)[1]))
        ids = tf.slice(ids, begin=[0, 0], size=[-1, current_ts])
        weights = tf.slice(weights, begin=[0, 0], size=[-1, current_ts])
        #mask_ = tf.sequence_mask(lengths=target_sequence_length, maxlen=current_ts, dtype=eval_logits.dtype)
        eval_logits = tf.slice(eval_logits, begin=[0,0,0], size=[-1, current_ts, -1])       
        '''
        eval_loss = tf.contrib.seq2seq.sequence_loss(eval_logits,
                                                     ids,
                                                     weights=weights)

    #beamSearch decoder
    init_state = tf.contrib.seq2seq.tile_batch(enc_state, multiplier=5)
    beamSearch_decoder = seq2seq.BeamSearchDecoder(
        cell,
        embeddings,
        start_tokens,
        end_token=END,
        initial_state=init_state,
        beam_width=5,
        output_layer=projection_layer)
    (infer_outputs, _, _) = seq2seq.dynamic_decode(beamSearch_decoder,
                                                   maximum_iterations=seq_len)
    infer_ids = infer_outputs.predicted_ids
    infer_probs = infer_outputs.beam_search_decoder_output.scores
    infer_probs = tf.reduce_prod(infer_probs, axis=1)
    infer_pos = tf.argmax(infer_probs, axis=1)
    infers = {"ids": infer_ids, "pos": infer_pos}

    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = layers.optimize_loss(loss,
                                        tf.train.get_global_step(),
                                        optimizer='Adam',
                                        learning_rate=learning_rate,
                                        clip_gradients=5.0)

        spec = tf.estimator.EstimatorSpec(mode=mode,
                                          predictions=dec_ids,
                                          loss=loss,
                                          train_op=train_op)
    #evaluation mode
    elif mode == tf.estimator.ModeKeys.EVAL:
        spec = tf.estimator.EstimatorSpec(mode=mode,
                                          loss=eval_loss,
                                          predictions=eval_dec_ids)
    else:
        spec = tf.estimator.EstimatorSpec(mode=mode, predictions=infers)
    return spec
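Note that this model_fn branches on a module-level `state` string rather than on the `mode` key the Estimator passes in; `NUM_UNITS`, `seq_len`, `vocab_size`, `END`, and `probs` are likewise assumed to be module-level constants. If `state` were to be derived from `mode` instead, a minimal sketch (string values taken from the snippet itself):

import tensorflow as tf

def mode_to_state(mode):
    # Map Estimator mode keys onto the state strings the model_fn expects.
    if mode == tf.estimator.ModeKeys.TRAIN:
        return "Training"
    if mode == tf.estimator.ModeKeys.EVAL:
        return "Evaluating"
    return "Infering"  # PREDICT; spelling kept as in the snippet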
Example n. 22
0
	def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False,
			global_step=None, is_training=False, is_evaluating=False, split_infos=None):
		"""
		Initializes the model for inference
		sets "mel_outputs" and "alignments" fields.
		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
			  steps in the input time series, and values are character IDs
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
			of each sequence in inputs.
			- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
			of steps in the output time series, M is num_mels, and values are entries in the mel
			spectrogram. Only needed for training.
		"""
		log("tacotron.py:initialize():row42")
		if mel_targets is None and stop_token_targets is not None:
			raise ValueError('no mel targets were provided but token_targets were given')
		if mel_targets is not None and stop_token_targets is None and not gta:
			raise ValueError('Mel targets are provided without corresponding token_targets')
		if not gta and self._hparams.predict_linear and linear_targets is None and is_training:
			raise ValueError('Model is set to use post processing to predict linear spectrograms in training but no linear targets given!')
		if gta and linear_targets is not None:
			raise ValueError('Linear spectrogram prediction is not supported in GTA mode!')
		if is_training and self._hparams.mask_decoder and targets_lengths is None:
			raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!')
		if is_training and is_evaluating:
			raise RuntimeError('Model can not be in training and evaluation modes at the same time!')
		
		log("tacotron.py:initialize():row56")
		split_device = '/cpu:0' if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:{}'.format(self._hparams.tacotron_gpu_start_idx)
		with tf.device(split_device):
			hp = self._hparams
			lout_int = [tf.int32]*hp.tacotron_num_gpus
			lout_float = [tf.float32]*hp.tacotron_num_gpus

			tower_input_lengths = tf.split(input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0)
			tower_targets_lengths = tf.split(targets_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0) if targets_lengths is not None else targets_lengths

			p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]], lout_int)
			p_mel_targets = tf.py_func(split_func, [mel_targets, split_infos[:,1]], lout_float) if mel_targets is not None else mel_targets
			p_stop_token_targets = tf.py_func(split_func, [stop_token_targets, split_infos[:,2]], lout_float) if stop_token_targets is not None else stop_token_targets
			p_linear_targets = tf.py_func(split_func, [linear_targets, split_infos[:,3]], lout_float) if linear_targets is not None else linear_targets

			tower_inputs = []
			tower_mel_targets = []
			tower_stop_token_targets = []
			tower_linear_targets = []

			batch_size = tf.shape(inputs)[0]
			mel_channels = hp.num_mels
			linear_channels = hp.num_freq
			for i in range(hp.tacotron_num_gpus):
				tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
				if p_mel_targets is not None:
					tower_mel_targets.append(tf.reshape(p_mel_targets[i], [batch_size, -1, mel_channels]))
				if p_stop_token_targets is not None:
					tower_stop_token_targets.append(tf.reshape(p_stop_token_targets[i], [batch_size, -1]))
				if p_linear_targets is not None:
					tower_linear_targets.append(tf.reshape(p_linear_targets[i], [batch_size, -1, linear_channels]))

		self.tower_decoder_output = []
		self.tower_alignments = []
		self.tower_stop_token_prediction = []
		self.tower_mel_outputs = []
		self.tower_linear_outputs = []

		tower_embedded_inputs = []
		tower_enc_conv_output_shape = []
		tower_encoder_outputs = []
		tower_residual = []
		tower_projected_residual = []
		
		log("tacotron.py:initialize():row100")
		# 1. Declare GPU Devices
		gpus = ["/gpu:{}".format(i) for i in range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx+hp.tacotron_num_gpus)]
		for i in range(hp.tacotron_num_gpus):
			with tf.device(tf.train.replica_device_setter(ps_tasks=1,ps_device="/cpu:0",worker_device=gpus[i])):
				with tf.variable_scope('inference') as scope:
					assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
					if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
						assert global_step is not None

					#GTA is only used for predicting mels to train the Wavenet vocoder, so we omit post processing when doing GTA synthesis
					post_condition = hp.predict_linear and not gta
					
					log("tacotron.py:initialize():row113")
					# Embeddings ==> [batch_size, sequence_length, embedding_dim]
					self.embedding_table = tf.get_variable(
						'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32)
					embedded_inputs = tf.nn.embedding_lookup(self.embedding_table, tower_inputs[i])

					log("tacotron.py:initialize():row119")
					#Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
					encoder_cell = TacotronEncoderCell(
						EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'),
						EncoderRNN(is_training, size=hp.encoder_lstm_units,
							zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM'))

					encoder_outputs = encoder_cell(embedded_inputs, tower_input_lengths[i])

					#For shape visualization purpose
					enc_conv_output_shape = encoder_cell.conv_output_shape
					
					log("tacotron.py:initialize():row131")
					#Decoder Parts
					#Attention Decoder Prenet
					prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet')
					#Attention Mechanism
					attention_mechanism = LocationSensitiveAttention(hp.attention_dim, encoder_outputs, hparams=hp,
						mask_encoder=hp.mask_encoder, memory_sequence_length=tf.reshape(tower_input_lengths[i], [-1]), smoothing=hp.smoothing,
						cumulate_weights=hp.cumulative_weights)
					#Decoder LSTM Cells
					decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
						size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_LSTM')
					#Frames Projection layer
					frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, scope='linear_transform_projection')
					#<stop_token> projection layer
					stop_projection = StopProjection(is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection')

					log("tacotron.py:initialize():row147")
					#Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
					decoder_cell = TacotronDecoderCell(
						prenet,
						attention_mechanism,
						decoder_lstm,
						frame_projection,
						stop_projection)


					#Define the helper for our decoder
					if is_training or is_evaluating or gta:
						self.helper = TacoTrainingHelper(batch_size, tower_mel_targets[i], hp, gta, is_evaluating, global_step)
					else:
						self.helper = TacoTestHelper(batch_size, hp)


					#initial decoder state
					decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

					#Only use max iterations at synthesis time
					max_iters = hp.max_iters if not (is_training or is_evaluating) else None
					
					log("tacotron.py:initialize():row170")
					#Decode
					(frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
						CustomDecoder(decoder_cell, self.helper, decoder_init_state),
						impute_finished=False,
						maximum_iterations=max_iters,
						swap_memory=hp.tacotron_swap_with_cpu)

					log("tacotron.py:initialize():row178")
					# Reshape outputs to be one output per entry 
					#==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
					decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
					stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])
					
					log("tacotron.py:initialize():row184")
					#Postnet
					postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions')
					
					log("tacotron.py:initialize():row188")
					#Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
					residual = postnet(decoder_output)

					#Project residual to same dimension as mel spectrogram 
					#==> [batch_size, decoder_steps * r, num_mels]
					residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection')
					projected_residual = residual_projection(residual)

					log("tacotron.py:initialize():row197")
					#Compute the mel spectrogram
					mel_outputs = decoder_output + projected_residual


					if post_condition:
						# Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs.
						post_cbhg = CBHG(hp.cbhg_kernels, hp.cbhg_conv_channels, hp.cbhg_pool_size, [hp.cbhg_projection, hp.num_mels],
							hp.cbhg_projection_kernel_size, hp.cbhg_highwaynet_layers, 
							hp.cbhg_highway_units, hp.cbhg_rnn_units, is_training, name='CBHG_postnet')

						#[batch_size, decoder_steps(mel_frames), cbhg_channels]
						post_outputs = post_cbhg(mel_outputs, None)

						#Linear projection of extracted features to make linear spectrogram
						linear_specs_projection = FrameProjection(hp.num_freq, scope='cbhg_linear_specs_projection')

						#[batch_size, decoder_steps(linear_frames), num_freq]
						linear_outputs = linear_specs_projection(post_outputs)

					#Grab alignments from the final decoder state
					alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0])

					self.tower_decoder_output.append(decoder_output)
					self.tower_alignments.append(alignments)
					self.tower_stop_token_prediction.append(stop_token_prediction)
					self.tower_mel_outputs.append(mel_outputs)
					tower_embedded_inputs.append(embedded_inputs)
					tower_enc_conv_output_shape.append(enc_conv_output_shape)
					tower_encoder_outputs.append(encoder_outputs)
					tower_residual.append(residual)
					tower_projected_residual.append(projected_residual)

					if post_condition:
						self.tower_linear_outputs.append(linear_outputs)
			log('initialisation done {}'.format(gpus[i]))


		if is_training:
			self.ratio = self.helper._ratio
		self.tower_inputs = tower_inputs
		self.tower_input_lengths = tower_input_lengths
		self.tower_mel_targets = tower_mel_targets
		self.tower_linear_targets = tower_linear_targets
		self.tower_targets_lengths = tower_targets_lengths
		self.tower_stop_token_targets = tower_stop_token_targets

		self.all_vars = tf.trainable_variables()

		log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
		log('  Train mode:               {}'.format(is_training))
		log('  Eval mode:                {}'.format(is_evaluating))
		log('  GTA mode:                 {}'.format(gta))
		log('  Synthesis mode:           {}'.format(not (is_training or is_evaluating)))
		log('  Input:                    {}'.format(inputs.shape))
		for i in range(hp.tacotron_num_gpus):
			log('  device:                   {}'.format(gpus[i]))
			log('  embedding:                {}'.format(tower_embedded_inputs[i].shape))
			log('  enc conv out:             {}'.format(tower_enc_conv_output_shape[i]))
			log('  encoder out:              {}'.format(tower_encoder_outputs[i].shape))
			log('  decoder out:              {}'.format(self.tower_decoder_output[i].shape))
			log('  residual out:             {}'.format(tower_residual[i].shape))
			log('  projected residual out:   {}'.format(tower_projected_residual[i].shape))
			log('  mel out:                  {}'.format(self.tower_mel_outputs[i].shape))
			if post_condition:
				log('  linear out:               {}'.format(self.tower_linear_outputs[i].shape))
			log('  <stop_token> out:         {}'.format(self.tower_stop_token_prediction[i].shape))

			#1_000_000 is causing syntax problems for some people?! Python please :)
			log('  Tacotron Parameters       {:.3f} Million.'.format(np.sum([np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000))
Example n. 23
0
 def __call__(self,
              mean_image_features=None,
              mean_object_features=None,
              spatial_image_features=None,
              spatial_object_features=None,
              seq_inputs=None,
              lengths=None):
     assert (mean_image_features is not None
             or mean_object_features is not None
             or spatial_image_features is not None
             or spatial_object_features is not None)
     use_beam_search = (seq_inputs is None or lengths is None)
     if mean_image_features is not None:
         batch_size = tf.shape(mean_image_features)[0]
     elif mean_object_features is not None:
         batch_size = tf.shape(mean_object_features)[0]
     elif spatial_image_features is not None:
         batch_size = tf.shape(spatial_image_features)[0]
     elif spatial_object_features is not None:
         batch_size = tf.shape(spatial_object_features)[0]
     initial_state = self.image_caption_cell.zero_state(
         batch_size, tf.float32)
     if use_beam_search:
         if mean_image_features is not None:
             mean_image_features = seq2seq.tile_batch(
                 mean_image_features, multiplier=self.beam_size)
             self.image_caption_cell.mean_image_features = mean_image_features
         if mean_object_features is not None:
             mean_object_features = seq2seq.tile_batch(
                 mean_object_features, multiplier=self.beam_size)
             self.image_caption_cell.mean_object_features = mean_object_features
         if spatial_image_features is not None:
             spatial_image_features = seq2seq.tile_batch(
                 spatial_image_features, multiplier=self.beam_size)
             self.image_caption_cell.spatial_image_features = spatial_image_features
         if spatial_object_features is not None:
             spatial_object_features = seq2seq.tile_batch(
                 spatial_object_features, multiplier=self.beam_size)
             self.image_caption_cell.spatial_object_features = spatial_object_features
         initial_state = seq2seq.tile_batch(initial_state,
                                            multiplier=self.beam_size)
         decoder = seq2seq.BeamSearchDecoder(
             self.image_caption_cell,
             self.embeddings_map,
             tf.fill([batch_size], self.word_vocabulary.start_id),
             self.word_vocabulary.end_id,
             initial_state,
             self.beam_size,
             output_layer=self.logits_layer)
         outputs, state, lengths = seq2seq.dynamic_decode(
             decoder, maximum_iterations=self.maximum_iterations)
         ids = tf.transpose(outputs.predicted_ids, [0, 2, 1])
         sequence_length = tf.shape(ids)[2]
         flat_ids = tf.reshape(
             ids, [batch_size * self.beam_size, sequence_length])
         seq_inputs = tf.concat([
             tf.fill([batch_size * self.beam_size, 1],
                     self.word_vocabulary.start_id), flat_ids
         ], 1)
     if mean_image_features is not None:
         self.image_caption_cell.mean_image_features = mean_image_features
     if mean_object_features is not None:
         self.image_caption_cell.mean_object_features = mean_object_features
     if spatial_image_features is not None:
         self.image_caption_cell.spatial_image_features = spatial_image_features
     if spatial_object_features is not None:
         self.image_caption_cell.spatial_object_features = spatial_object_features
     activations, _state = tf.nn.dynamic_rnn(
         self.image_caption_cell,
         tf.nn.embedding_lookup(self.embeddings_map, seq_inputs),
         sequence_length=tf.reshape(lengths, [-1]),
         initial_state=initial_state)
     logits = self.logits_layer(activations)
     if use_beam_search:
         length = tf.shape(logits)[1]
         logits = tf.reshape(
             logits, [batch_size, self.beam_size, length, self.vocab_size])
     return logits, tf.argmax(logits, axis=-1, output_type=tf.int32)
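A hedged usage sketch (the `captioner` instance and the feature/caption tensors are assumed names): omitting `seq_inputs`/`lengths` triggers the beam-search branch, while supplying both scores ground-truth captions with a plain teacher-forced RNN pass.

# Inference: no ground-truth captions, so candidates are beam-searched and
# logits come back as [batch_size, beam_size, max_time, vocab_size].
logits, ids = captioner(mean_image_features=image_features)

# Training: ground-truth ids and lengths disable beam search; logits are
# [batch_size, max_time, vocab_size].
logits, ids = captioner(mean_image_features=image_features,
                        seq_inputs=caption_ids,
                        lengths=caption_lengths)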
Example n. 24
0
    def build_train_decoder(self):
        self.decoder_inputs_embedded = tf.nn.embedding_lookup(
            params=self.embedding, ids=self.decoder_inputs_train)
        if self.train_mode == 'ground_truth':
            training_helper = seq2seq.TrainingHelper(
                inputs=self.decoder_inputs_embedded,
                sequence_length=self.decoder_inputs_length_train,
                time_major=False,
                name='training_helper')
        elif self.train_mode == 'scheduled_sampling':
            training_helper = seq2seq.ScheduledEmbeddingTrainingHelper(
                inputs=self.decoder_inputs_embedded,
                sequence_length=self.decoder_inputs_length_train,
                embedding=lambda inputs: tf.nn.embedding_lookup(
                    self.embedding, inputs),
                sampling_probability=self.sampling_probability,
                name='scheduled_embedding_training_helper')
        else:
            raise NotImplementedError(
                'Train mode: {} is not yet implemented'.format(
                    self.train_mode))

        training_decoder = seq2seq.BasicDecoder(
            cell=self.decoder_cell,
            helper=training_helper,
            initial_state=self.decoder_initial_state,
            output_layer=self.output_layer)
        max_decoder_length = tf.reduce_max(self.decoder_inputs_length_train)

        self.decoder_outputs_train, self.decoder_last_state_train, self.decoder_outputs_length_train = seq2seq.dynamic_decode(
            decoder=training_decoder,
            output_time_major=False,
            impute_finished=True,
            maximum_iterations=max_decoder_length)

        # NOTE(sdsuo): Not sure why this is necessary
        self.decoder_logits_train = tf.identity(
            self.decoder_outputs_train.rnn_output)

        # Use argmax to extract decoder symbols to emit
        self.decoder_pred_train = tf.argmax(self.decoder_logits_train,
                                            axis=-1,
                                            name='decoder_pred_train')

        # masks: masking for valid and padded time steps, [batch_size, max_time_step + 1]
        masks = tf.sequence_mask(lengths=self.decoder_inputs_length_train,
                                 maxlen=max_decoder_length,
                                 dtype=self.dtype,
                                 name='masks')

        # Computes per word average cross-entropy over a batch
        # Internally calls 'nn_ops.sparse_softmax_cross_entropy_with_logits' by default
        self.loss = seq2seq.sequence_loss(logits=self.decoder_logits_train,
                                          targets=self.decoder_targets_train,
                                          weights=masks,
                                          average_across_timesteps=True,
                                          average_across_batch=True)

        # Training summary for the current batch_loss
        tf.summary.scalar('loss', self.loss)

        # Construct graphs for minimizing loss
        self.init_optimizer()
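`init_optimizer` is not shown; a minimal sketch of the recipe it likely follows (the optimizer choice, the clip norm of 5.0, and the `self.learning_rate` attribute are assumptions):

import tensorflow as tf

def init_optimizer(self):
    # Clip the global gradient norm, then apply Adam to the masked loss.
    params = tf.trainable_variables()
    gradients = tf.gradients(self.loss, params)
    clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
    self.opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
    self.train_op = self.opt.apply_gradients(
        zip(clipped_gradients, params),
        global_step=tf.train.get_or_create_global_step())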
Example n. 25
0
    def _build_forward(self):
        config = self.config
        N, M, JX, JQ, VW, VC, d, W = \
            config.batch_size, config.max_num_sents, config.max_sent_size, \
            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
            config.max_word_size
        beam_width = config.beam_width
        GO_TOKEN = 0
        EOS_TOKEN = 1

        JX = tf.shape(self.x)[2]
        JQ = tf.shape(self.q)[1]
        M = tf.shape(self.x)[1]
        dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

        with tf.variable_scope("emb"):
            if config.use_char_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    char_emb_mat = tf.get_variable("char_emb_mat",
                                                   shape=[VC, dc],
                                                   dtype='float')

                with tf.variable_scope("char"):
                    Acx = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cx)  # [N, M, JX, W, dc]
                    Acq = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cq)  # [N, JQ, W, dc]
                    Acx = tf.reshape(Acx, [-1, JX, W, dc])
                    Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                    filter_sizes = list(
                        map(int, config.out_channel_dims.split(',')))
                    heights = list(map(int, config.filter_heights.split(',')))
                    assert sum(filter_sizes) == dco, (filter_sizes, dco)
                    with tf.variable_scope("conv"):
                        xx = multi_conv1d(Acx,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="xx")
                        if config.share_cnn_weights:
                            tf.get_variable_scope().reuse_variables()
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="xx")
                        else:
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="qq")
                        xx = tf.reshape(xx, [-1, M, JX, dco])
                        qq = tf.reshape(qq, [-1, JQ, dco])

            if config.use_word_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    if config.mode == 'train':
                        word_emb_mat = tf.get_variable(
                            "word_emb_mat",
                            dtype='float',
                            shape=[VW, dw],
                            initializer=get_initializer(config.emb_mat),
                            trainable=True)
                    else:
                        word_emb_mat = tf.get_variable("word_emb_mat",
                                                       shape=[VW, dw],
                                                       dtype='float')
                    if config.use_glove_for_unk:
                        word_emb_mat = tf.concat(
                            axis=0, values=[word_emb_mat, self.new_emb_mat])
                with tf.name_scope("word"):
                    Ax = tf.nn.embedding_lookup(word_emb_mat,
                                                self.x)  # [N, M, JX, d]
                    Aq = tf.nn.embedding_lookup(word_emb_mat,
                                                self.q)  # [N, JQ, d]
                    self.tensor_dict['x'] = Ax
                    self.tensor_dict['q'] = Aq
                if config.use_char_emb:
                    xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
                    qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]
                else:
                    xx = Ax
                    qq = Aq

        # highway network
        if config.highway:
            with tf.variable_scope("highway"):
                xx = highway_network(xx,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)
                tf.get_variable_scope().reuse_variables()
                qq = highway_network(qq,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)

        self.tensor_dict['xx'] = xx
        self.tensor_dict['qq'] = qq

        cell_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell_fw = SwitchableDropoutWrapper(
            cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell_bw = SwitchableDropoutWrapper(
            cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        cell2_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell2_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell2_fw = SwitchableDropoutWrapper(
            cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell2_bw = SwitchableDropoutWrapper(
            cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        cell3_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell3_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell3_fw = SwitchableDropoutWrapper(
            cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell3_bw = SwitchableDropoutWrapper(
            cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        cell4_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell4_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell4_fw = SwitchableDropoutWrapper(
            cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell4_bw = SwitchableDropoutWrapper(
            cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
        q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

        with tf.variable_scope("prepro"):
            (fw_u, bw_u), ((_, fw_u_f), (_,
                                         bw_u_f)) = bidirectional_dynamic_rnn(
                                             d_cell_fw,
                                             d_cell_bw,
                                             qq,
                                             q_len,
                                             dtype='float',
                                             scope='u1')  # [N, J, d], [N, d]
            u = tf.concat(axis=2, values=[fw_u, bw_u])
            if config.share_lstm_weights:
                tf.get_variable_scope().reuse_variables()
                (fw_h, bw_h), ((_, fw_h_f),
                               (_, bw_h_f)) = bidirectional_dynamic_rnn(
                                   cell_fw,
                                   cell_bw,
                                   xx,
                                   x_len,
                                   dtype='float',
                                   scope='u1')  # [N, M, JX, 2d]
                h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
            else:
                (fw_h, bw_h), ((_, fw_h_f),
                               (_, bw_h_f)) = bidirectional_dynamic_rnn(
                                   cell_fw,
                                   cell_bw,
                                   xx,
                                   x_len,
                                   dtype='float',
                                   scope='h1')  # [N, M, JX, 2d]
                h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
            self.tensor_dict['u'] = u
            self.tensor_dict['h'] = h

        with tf.variable_scope("main"):
            if config.dynamic_att:
                p0 = h
                u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                               [N * M, JQ, 2 * d])
                q_mask = tf.reshape(
                    tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                    [N * M, JQ])
                first_cell_fw = AttentionCell(
                    cell2_fw,
                    u,
                    mask=q_mask,
                    mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)
                first_cell_bw = AttentionCell(
                    cell2_bw,
                    u,
                    mask=q_mask,
                    mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)
                second_cell_fw = AttentionCell(
                    cell3_fw,
                    u,
                    mask=q_mask,
                    mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)
                second_cell_bw = AttentionCell(
                    cell3_bw,
                    u,
                    mask=q_mask,
                    mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)
            else:
                p0 = attention_layer(config,
                                     self.is_train,
                                     h,
                                     u,
                                     h_mask=self.x_mask,
                                     u_mask=self.q_mask,
                                     scope="p0",
                                     tensor_dict=self.tensor_dict)
                first_cell_fw = d_cell2_fw
                second_cell_fw = d_cell3_fw
                first_cell_bw = d_cell2_bw
                second_cell_bw = d_cell3_bw

            (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
                first_cell_fw,
                first_cell_bw,
                p0,
                x_len,
                dtype='float',
                scope='g0')  # [N, M, JX, 2d]
            g0 = tf.concat(axis=3, values=[fw_g0, bw_g0])
            (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
                second_cell_fw,
                second_cell_bw,
                g0,
                x_len,
                dtype='float',
                scope='g1')  # [N, M, JX, 2d]
            g1 = tf.concat(axis=3, values=[fw_g1, bw_g1])

            logits = get_logits([g1, p0],
                                d,
                                True,
                                wd=config.wd,
                                input_keep_prob=config.input_keep_prob,
                                mask=self.x_mask,
                                is_train=self.is_train,
                                func=config.answer_func,
                                scope='logits1')
            a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]),
                          tf.reshape(logits, [N, M * JX]))
            a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1),
                          [1, M, JX, 1])

            (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(
                d_cell4_fw,
                d_cell4_bw,
                tf.concat(axis=3, values=[p0, g1, a1i, g1 * a1i]),
                x_len,
                dtype='float',
                scope='g2')  # [N, M, JX, 2d]
            g2 = tf.concat(axis=3, values=[fw_g2, bw_g2])
            logits2 = get_logits([g2, p0],
                                 d,
                                 True,
                                 wd=config.wd,
                                 input_keep_prob=config.input_keep_prob,
                                 mask=self.x_mask,
                                 is_train=self.is_train,
                                 func=config.answer_func,
                                 scope='logits2')

            flat_logits = tf.reshape(logits, [-1, M * JX])
            flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
            flat_logits2 = tf.reshape(logits2, [-1, M * JX])
            flat_yp2 = tf.nn.softmax(flat_logits2)

            if config.na:
                na_bias = tf.get_variable("na_bias", shape=[], dtype='float')
                na_bias_tiled = tf.tile(tf.reshape(na_bias, [1, 1]),
                                        [N, 1])  # [N, 1]
                concat_flat_logits = tf.concat(
                    axis=1, values=[na_bias_tiled, flat_logits])
                concat_flat_yp = tf.nn.softmax(concat_flat_logits)
                na_prob = tf.squeeze(tf.slice(concat_flat_yp, [0, 0], [-1, 1]),
                                     [1])
                flat_yp = tf.slice(concat_flat_yp, [0, 1], [-1, -1])

                concat_flat_logits2 = tf.concat(
                    axis=1, values=[na_bias_tiled, flat_logits2])
                concat_flat_yp2 = tf.nn.softmax(concat_flat_logits2)
                na_prob2 = tf.squeeze(
                    tf.slice(concat_flat_yp2, [0, 0], [-1, 1]), [1])  # [N]
                flat_yp2 = tf.slice(concat_flat_yp2, [0, 1], [-1, -1])

                self.concat_logits = concat_flat_logits
                self.concat_logits2 = concat_flat_logits2
                self.na_prob = na_prob * na_prob2

            yp = tf.reshape(flat_yp, [-1, M, JX])
            yp2 = tf.reshape(flat_yp2, [-1, M, JX])
            wyp = tf.nn.sigmoid(logits2)

            self.tensor_dict['g1'] = g1
            self.tensor_dict['g2'] = g2

            self.logits = flat_logits
            self.logits2 = flat_logits2
            self.yp = yp
            self.yp2 = yp2
            self.wyp = wyp

        with tf.variable_scope("q_gen"):
            # Question generation using the paragraph and predicted answer positions
            NM = config.max_num_sents * config.batch_size

            # Separated encoder
            #ss = tf.reshape(xx, (-1, JX, dw+dco))

            q_worthy = tf.reduce_sum(
                tf.to_int32(self.y), axis=2
            )  # per-sentence answer-token counts, i.e. how answer-worthy each sentence is. (N, M)
            q_worthy = tf.expand_dims(tf.to_int32(tf.argmax(q_worthy, axis=1)),
                                      axis=1)  # (N) -> (N, 1)
            q_worthy = tf.concat([
                tf.expand_dims(tf.range(0, N, dtype=tf.int32), axis=1),
                q_worthy
            ],
                                 axis=1)
            # example : [0, 9], [1, 11], [2, 8], [3, 5], [4, 0], [5, 1] ...

            ss = tf.gather_nd(xx, q_worthy)
            syp = tf.expand_dims(tf.gather_nd(yp, q_worthy), axis=-1)
            syp2 = tf.expand_dims(tf.gather_nd(yp2, q_worthy), axis=-1)
            ss_with_ans = tf.concat([ss, syp, syp2], axis=2)

            qg_dim = 600
            cell_fw, cell_bw = rnn.DropoutWrapper(rnn.GRUCell(qg_dim), input_keep_prob=config.input_keep_prob), \
                               rnn.DropoutWrapper(rnn.GRUCell(qg_dim), input_keep_prob=config.input_keep_prob)
            s_outputs, s_states = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, ss_with_ans, dtype=tf.float32)
            s_outputs = tf.concat(s_outputs, axis=2)
            s_states = tf.concat(s_states, axis=1)

            start_tokens = tf.zeros([N], dtype=tf.int32)
            self.inp_q_with_GO = tf.concat(
                [tf.expand_dims(start_tokens, axis=1), self.q], axis=1)
            # supervise if mode is train
            if config.mode == "train":
                emb_q = tf.nn.embedding_lookup(params=word_emb_mat,
                                               ids=self.inp_q_with_GO)
                #emb_q = tf.reshape(tf.tile(tf.expand_dims(emb_q, axis=1), [1, M, 1, 1]), (NM, JQ+1, dw))
                train_helper = seq2seq.TrainingHelper(emb_q, [JQ] * N)
            else:
                s_outputs = seq2seq.tile_batch(s_outputs,
                                               multiplier=beam_width)
                s_states = seq2seq.tile_batch(s_states, multiplier=beam_width)

            cell = rnn.DropoutWrapper(rnn.GRUCell(num_units=qg_dim * 2),
                                      input_keep_prob=config.input_keep_prob)
            attention_mechanism = seq2seq.BahdanauAttention(num_units=qg_dim *
                                                            2,
                                                            memory=s_outputs)
            attn_cell = seq2seq.AttentionWrapper(cell,
                                                 attention_mechanism,
                                                 attention_layer_size=qg_dim *
                                                 2,
                                                 output_attention=True,
                                                 alignment_history=False)
            total_glove_vocab_size = 78878  #72686
            out_cell = rnn.OutputProjectionWrapper(attn_cell,
                                                   VW + total_glove_vocab_size)
            if config.mode == "train":
                decoder_initial_states = out_cell.zero_state(
                    batch_size=N, dtype=tf.float32).clone(cell_state=s_states)
                decoder = seq2seq.BasicDecoder(
                    cell=out_cell,
                    helper=train_helper,
                    initial_state=decoder_initial_states)
            else:
                decoder_initial_states = out_cell.zero_state(
                    batch_size=N * beam_width,
                    dtype=tf.float32).clone(cell_state=s_states)
                decoder = seq2seq.BeamSearchDecoder(
                    cell=out_cell,
                    embedding=word_emb_mat,
                    start_tokens=start_tokens,
                    end_token=EOS_TOKEN,
                    initial_state=decoder_initial_states,
                    beam_width=beam_width,
                    length_penalty_weight=0.0)
            outputs = seq2seq.dynamic_decode(decoder=decoder,
                                             maximum_iterations=JQ)
            if config.mode == "train":
                gen_q = outputs[0].sample_id
                gen_q_prob = outputs[0].rnn_output
                gen_q_states = outputs[1]
            else:
                gen_q = outputs[0].predicted_ids[:, :, 0]
                gen_q_prob = tf.nn.embedding_lookup(
                    params=word_emb_mat, ids=outputs[0].predicted_ids[:, :, 0])
                gen_q_states = outputs[1]

            self.gen_q = gen_q
            self.gen_q_prob = gen_q_prob
            self.gen_q_states = gen_q_states
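
A note on the inference branch above: `BeamSearchDecoder` returns
`predicted_ids` shaped `[batch, time, beam_width]` with beams sorted
best-first, which is why the code keeps `[:, :, 0]`. A minimal post-processing
sketch, where `sess`, `model`, `feed_dict`, and the id-to-word list
`rev_vocab` are all assumptions not shown in the snippet:

    # Hypothetical: fetch the best-beam question ids and map them to words.
    best_ids = sess.run(model.gen_q, feed_dict=feed_dict)  # [batch, time]
    questions = [' '.join(rev_vocab[i] for i in row) for row in best_ids]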
Esempio n. 26
0
    def sample(self,
               n,
               max_length=None,
               z=None,
               temperature=None,
               start_inputs=None,
               beam_width=None,
               end_token=None):
        """Overrides BaseLstmDecoder `sample` method to add optional beam search.

    Args:
      n: Scalar number of samples to return.
      max_length: (Optional) Scalar maximum sample length to return. Required if
        data representation does not include end tokens.
      z: (Optional) Latent vectors to sample from. Required if model is
        conditional. Sized `[n, z_size]`.
      temperature: (Optional) The softmax temperature to use when not doing beam
        search. Defaults to 1.0. Ignored when `beam_width` is provided.
      start_inputs: (Optional) Initial inputs to use for batch.
        Sized `[n, output_depth]`.
      beam_width: (Optional) Width of beam to use for beam search. Beam search
        is disabled if not provided.
      end_token: (Optional) Scalar token signaling the end of the sequence to
        use for early stopping.
    Returns:
      samples: Sampled sequences. Sized `[n, max_length, output_depth]`.
      final_state: The final states of the decoder.
    Raises:
      ValueError: If `z` is provided and its first dimension does not equal `n`.
    """
        if beam_width is None:
            end_fn = (None if end_token is None else
                      lambda x: tf.equal(tf.argmax(x, axis=-1), end_token))
            return super(CategoricalLstmDecoder,
                         self).sample(n, max_length, z, temperature,
                                      start_inputs, end_fn)

        # If `end_token` is not given, use an impossible value.
        end_token = self._output_depth if end_token is None else end_token
        if z is not None and z.shape[0].value != n:
            raise ValueError(
                '`z` must have a first dimension that equals `n` when given. '
                'Got: %d vs %d' % (z.shape[0].value, n))

        if temperature is not None:
            tf.logging.warning(
                '`temperature` is ignored when using beam search.')
        # Use a dummy Z in unconditional case.
        z = tf.zeros((n, 0), tf.float32) if z is None else z

        # If not given, start with dummy `-1` token and replace with zero vectors in
        # `embedding_fn`.
        start_tokens = (tf.argmax(start_inputs, axis=-1, output_type=tf.int32)
                        if start_inputs is not None else -1 *
                        tf.ones([n], dtype=tf.int32))

        initial_state = initial_cell_state_from_embedding(
            self._dec_cell, z, name='decoder/z_to_initial_state')
        beam_initial_state = seq2seq.tile_batch(initial_state,
                                                multiplier=beam_width)

        # Tile `z` across beams.
        beam_z = tf.tile(tf.expand_dims(z, 1), [1, beam_width, 1])

        def embedding_fn(tokens):
            # If tokens are the start_tokens (negative), replace with zero vectors.
            next_inputs = tf.cond(
                tf.less(tokens[0, 0], 0),
                lambda: tf.zeros([n, beam_width, self._output_depth]),
                lambda: tf.one_hot(tokens, self._output_depth))

            # Concatenate `z` to next inputs.
            next_inputs = tf.concat([next_inputs, beam_z], axis=-1)
            return next_inputs

        decoder = seq2seq.BeamSearchDecoder(self._dec_cell,
                                            embedding_fn,
                                            start_tokens,
                                            end_token,
                                            beam_initial_state,
                                            beam_width,
                                            output_layer=self._output_layer,
                                            length_penalty_weight=0.0)

        final_output, final_state, _ = seq2seq.dynamic_decode(
            decoder,
            maximum_iterations=max_length,
            swap_memory=True,
            scope='decoder')

        # Returns samples and final states from the best beams.
        return (tf.one_hot(final_output.predicted_ids[:, :, 0],
                           self._output_depth),
                nest.map_structure(lambda x: x[:, 0], final_state.cell_state))
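
A hedged usage sketch of the method above, assuming an already-built
`CategoricalLstmDecoder` instance named `decoder` and latent vectors `z`
(both assumptions, not part of the snippet):

    # Draw 4 beam-searched samples of up to 32 steps, stopping at token 1.
    samples, final_state = decoder.sample(
        n=4,
        max_length=32,
        z=z,             # sized [4, z_size]; omit for an unconditional model
        beam_width=5,
        end_token=1)
    # `samples` is one-hot, sized [4, 32, output_depth].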
Esempio n. 27
0
def model_fn(features, labels, mode, params):
    embedding_encoder = tf.get_variable('embedding_encoder',
                                        shape=(params.vocab_size,
                                               params.emb_size))
    table = lookup_ops.index_to_string_table_from_file(params.word_vocab_file)

    question_emb = tf.nn.embedding_lookup(embedding_encoder,
                                          features['question_words'])
    passage_emb = tf.nn.embedding_lookup(embedding_encoder,
                                         features['passage_words'])

    question_words_length = features['question_length']
    passage_words_length = features['passage_length']

    answer_start, answer_end = features['answer_start'], features['answer_end']
    answer_start = tf.concat([tf.expand_dims(answer_start, -1)] * 50, -1)
    answer_end = tf.concat([tf.expand_dims(answer_end, -1)] * 50, -1)

    with tf.variable_scope('passage_encoding'):
        passage_enc, (_, passage_bw_state) = biGRU(tf.concat(
            [passage_emb, answer_start, answer_end], -1),
                                                   passage_words_length,
                                                   params,
                                                   layers=params.layers)

    with tf.variable_scope('question_encoding'):
        question_enc, (_, question_bw_state) = biGRU(question_emb,
                                                     question_words_length,
                                                     params,
                                                     layers=params.layers)

    # output_enc = masked_concat(question_enc, passage_enc, question_words_length, passage_words_length)

    decoder_state_layer = Dense(params.units,
                                activation=tf.tanh,
                                use_bias=True,
                                name='decoder_state_init')
    decoder_init_state = tuple(
        decoder_state_layer(
            tf.concat([passage_bw_state[i], question_bw_state[i]], -1))
        for i in range(params.layers))

    question_att = BahdanauAttention(
        params.units,
        question_enc,
        memory_sequence_length=question_words_length)
    passage_att = BahdanauAttention(
        params.units, passage_enc, memory_sequence_length=passage_words_length)

    decoder_cell = AttentionWrapper(MultiRNNCell(
        [GRUCell(params.units) for _ in range(params.layers)]),
                                    [question_att, passage_att],
                                    initial_cell_state=decoder_init_state)

    batch_size = params.batch_size  # if mode != tf.estimator.ModeKeys.PREDICT else 1

    if mode == tf.estimator.ModeKeys.TRAIN:
        answer_emb = tf.nn.embedding_lookup(embedding_encoder,
                                            features['answer_words'])
        helper = TrainingHelper(answer_emb, features['answer_length'])
    else:
        helper = GreedyEmbeddingHelper(
            embedding_encoder, tf.fill([batch_size], params.tgt_sos_id),
            params.tgt_eos_id)

    projection_layer = Dense(params.vocab_size, use_bias=False)

    decoder = SNetDecoder(decoder_cell,
                          helper,
                          decoder_cell.zero_state(batch_size, tf.float32),
                          output_layer=projection_layer,
                          params=params)

    outputs, _, outputs_length = dynamic_decode(
        decoder, maximum_iterations=params.answer_max_words)
    logits = outputs.rnn_output

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'answer': table.lookup(tf.cast(outputs.sample_id, tf.int64))
        }
        export_outputs = {
            'prediction': tf.estimator.export.PredictOutput(predictions)
        }

        return tf.estimator.EstimatorSpec(mode,
                                          predictions=predictions,
                                          export_outputs=export_outputs)

    # logits = tf.Print(logits, [outputs.sample_id, labels], summarize=1000)

    labels = tf.stop_gradient(labels[:, :tf.reduce_max(outputs_length)])

    crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                              logits=logits)
    target_weights = tf.sequence_mask(outputs_length, dtype=logits.dtype)
    loss = tf.reduce_sum(crossent * target_weights) / params.batch_size

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=1)
        global_step = tf.train.get_or_create_global_step()

        grads = optimizer.compute_gradients(loss)
        gradients, variables = zip(*grads)
        capped_grads, _ = tf.clip_by_global_norm(gradients, params.grad_clip)
        train_op = optimizer.apply_gradients(zip(capped_grads, variables),
                                             global_step=global_step)

        return EstimatorSpec(
            mode,
            loss=loss,
            train_op=train_op,
        )

    if mode == tf.estimator.ModeKeys.EVAL:
        return EstimatorSpec(mode,
                             loss=loss,
                             eval_metric_ops={
                                 'rouge-l':
                                 rouge_l(outputs.sample_id, labels,
                                         outputs_length,
                                         features['answer_length'], params,
                                         table),
                             })
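
For context, a minimal sketch of wiring `model_fn` into the TF Estimator API;
the `params` object and the input functions are assumptions standing in for
whatever the surrounding project defines:

    # Hypothetical driver code for the answer-synthesis model above.
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir='/tmp/answer_synthesis',
        params=params)  # must expose vocab_size, emb_size, units, layers, ...
    estimator.train(input_fn=train_input_fn, steps=10000)
    predictions = estimator.predict(input_fn=predict_input_fn)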
Esempio n. 28
0
    def build_decoder(self, encoder_outputs, encoder_state):
        """构建解码器"""
        with tf.variable_scope('decoder') as decoder_scope:
            (self.decoder_cell,
             self.decoder_initial_state) = self.build_decoder_cell(
                 encoder_outputs, encoder_state)

            # Decoder embedding
            with tf.device(_get_embed_device(self.target_vocab_size)):
                if self.share_embedding:
                    self.decoder_embeddings = self.encoder_embeddings
                elif self.pretrained_embedding:

                    self.decoder_embeddings = tf.Variable(tf.constant(
                        0.0,
                        shape=(self.target_vocab_size, self.embedding_size)),
                                                          trainable=True,
                                                          name='embeddings')
                    self.decoder_embeddings_placeholder = tf.placeholder(
                        tf.float32,
                        (self.target_vocab_size, self.embedding_size))
                    self.decoder_embeddings_init = self.decoder_embeddings.assign(
                        self.decoder_embeddings_placeholder)
                else:
                    self.decoder_embeddings = tf.get_variable(
                        name='embeddings',
                        shape=(self.target_vocab_size, self.embedding_size),
                        initializer=self.initializer,
                        dtype=tf.float32)

            self.decoder_output_projection = layers.Dense(
                self.target_vocab_size,
                dtype=tf.float32,
                use_bias=False,
                name='decoder_output_projection')

            # Training mode
            if self.mode == 'train':
                self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                    params=self.decoder_embeddings,
                    ids=self.decoder_inputs_train)
                inputs = self.decoder_inputs_embedded

                if self.time_major:
                    inputs = tf.transpose(inputs, (1, 0, 2))

                training_helper = seq2seq.TrainingHelper(
                    inputs=inputs,
                    sequence_length=self.decoder_inputs_length,
                    time_major=self.time_major,
                    name='training_helper')

                # Do not apply output_layer here during training: projecting at
                # every time step inside the decoder is slow. Note that for this
                # trick to work, the `scope` argument of dynamic_decode must be set.
                training_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=training_helper,
                    initial_state=self.decoder_initial_state,
                )

                # Maximum decoder time_steps in current batch
                max_decoder_length = tf.reduce_max(self.decoder_inputs_length)

                (
                    outputs,
                    self.final_state,  # contain attention
                    _  # self.final_sequence_lengths
                ) = seq2seq.dynamic_decode(
                    decoder=training_decoder,
                    output_time_major=self.time_major,
                    impute_finished=True,
                    maximum_iterations=max_decoder_length,
                    parallel_iterations=self.parallel_iterations,
                    swap_memory=True,
                    scope=decoder_scope)

                self.decoder_logits_train = self.decoder_output_projection(
                    outputs.rnn_output)

                # masks: masking for valid and padded time steps,
                # [batch_size, max_time_step + 1]
                self.masks = tf.sequence_mask(
                    lengths=self.decoder_inputs_length,
                    maxlen=max_decoder_length,
                    dtype=tf.float32,
                    name='masks')

                decoder_logits_train = self.decoder_logits_train
                if self.time_major:
                    decoder_logits_train = tf.transpose(
                        decoder_logits_train, (1, 0, 2))

                self.decoder_pred_train = tf.argmax(decoder_logits_train,
                                                    axis=-1,
                                                    name='decoder_pred_train')

                # The variables below are used for special training schemes.
                # Custom rewards: here this is implemented by modifying the masks.
                # train_entropy = cross entropy
                self.train_entropy = \
                    tf.nn.sparse_softmax_cross_entropy_with_logits(
                        labels=self.decoder_inputs,
                        logits=decoder_logits_train)

                self.masks_rewards = self.masks * self.rewards

                self.loss_rewards = seq2seq.sequence_loss(
                    logits=decoder_logits_train,
                    targets=self.decoder_inputs,
                    weights=self.masks_rewards,
                    average_across_timesteps=True,
                    average_across_batch=True,
                )

                self.loss = seq2seq.sequence_loss(
                    logits=decoder_logits_train,
                    targets=self.decoder_inputs,
                    weights=self.masks,
                    average_across_timesteps=True,
                    average_across_batch=True,
                )

                self.loss_add = self.loss + self.add_loss

            elif self.mode == 'decode':
                # Inference (decode) mode, not training

                start_tokens = tf.tile([WordSequence.START], [self.batch_size])
                end_token = WordSequence.END

                def embed_and_input_proj(inputs):
                    """输入层的投影层wrapper
                    """
                    return tf.nn.embedding_lookup(self.decoder_embeddings,
                                                  inputs)

                if not self.use_beamsearch_decode:
                    # Helper to feed inputs for greedy decoding:
                    # uses the argmax of the output
                    decoding_helper = seq2seq.GreedyEmbeddingHelper(
                        start_tokens=start_tokens,
                        end_token=end_token,
                        embedding=embed_and_input_proj)
                    # Basic decoder performs greedy decoding at each time step
                    # print("building greedy decoder..")
                    inference_decoder = seq2seq.BasicDecoder(
                        cell=self.decoder_cell,
                        helper=decoding_helper,
                        initial_state=self.decoder_initial_state,
                        output_layer=self.decoder_output_projection)
                else:
                    # Beamsearch is used to approximately
                    # find the most likely translation
                    # print("building beamsearch decoder..")
                    inference_decoder = BeamSearchDecoder(
                        cell=self.decoder_cell,
                        embedding=embed_and_input_proj,
                        start_tokens=start_tokens,
                        end_token=end_token,
                        initial_state=self.decoder_initial_state,
                        beam_width=self.beam_width,
                        output_layer=self.decoder_output_projection,
                    )

                if self.max_decode_step is not None:
                    max_decode_step = self.max_decode_step
                else:
                    # By default, decode up to 4x the maximum input length
                    max_decode_step = tf.round(
                        tf.reduce_max(self.encoder_inputs_length) * 4)

                (
                    self.decoder_outputs_decode,
                    self.final_state,
                    _  # self.decoder_outputs_length_decode
                ) = (
                    seq2seq.dynamic_decode(
                        decoder=inference_decoder,
                        output_time_major=self.time_major,
                        # impute_finished=True,	# error occurs
                        maximum_iterations=max_decode_step,
                        parallel_iterations=self.parallel_iterations,
                        swap_memory=True,
                        scope=decoder_scope))

                if not self.use_beamsearch_decode:

                    dod = self.decoder_outputs_decode
                    self.decoder_pred_decode = dod.sample_id

                    if self.time_major:
                        self.decoder_pred_decode = tf.transpose(
                            self.decoder_pred_decode, (1, 0))

                else:
                    self.decoder_pred_decode = \
                        self.decoder_outputs_decode.predicted_ids

                    if self.time_major:
                        self.decoder_pred_decode = tf.transpose(
                            self.decoder_pred_decode, (1, 0, 2))

                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, perm=[0, 2, 1])
                    dod = self.decoder_outputs_decode
                    self.beam_prob = dod.beam_search_decoder_output.scores
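
A short note on the beam branch above: after the final transpose,
`decoder_pred_decode` is `[batch, beam_width, time]`, and TensorFlow's beam
search keeps beams sorted best-first, so the top hypothesis is a simple slice;
a one-line sketch:

    # Hypothetical: keep only the highest-scoring beam per batch entry.
    best_pred = self.decoder_pred_decode[:, 0, :]  # [batch, time]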
Esempio n. 29
0
    def construct(self):
        self.saved_session_name = os.path.join(self.tmp_folder, self.uuid_code)
        self.input_data = tf.placeholder(tf.float32,
                                         [None, None, self.input_dim])
        self.output_data = tf.placeholder(tf.float32,
                                          [None, None, self.output_dim])
        self.start_tokens = tf.placeholder(tf.float32, [None, self.output_dim])
        self.go_tokens = tf.placeholder(tf.float32, [None, 1, self.output_dim])
        self.sequence_length = tf.placeholder(tf.int32, [None])
        self.mask = tf.placeholder(tf.float32, [None, None])
        self.target_sequence_length = tf.placeholder(
            tf.int32, (None, ), name='target_sequence_length')
        self.max_target_sequence_length = tf.reduce_max(
            self.target_sequence_length, name='max_target_len')
        self.source_sequence_length = tf.placeholder(
            tf.int32, (None, ), name='source_sequence_length')
        self.x_stopping = np.full((self.stop_pad_length, self.input_dim),
                                  self.stop_pad_token,
                                  dtype=np.float32)
        self.y_stopping = np.full((self.stop_pad_length, self.output_dim),
                                  self.stop_pad_token,
                                  dtype=np.float32)
        self.learning_rate = tf.placeholder(tf.float32)
        self.batch_size = tf.placeholder(tf.float32)

        enc_cell = make_cell(self.layer_sizes, self.keep_prob)

        # We want to train the decoder to learn the stopping point as well,
        # so the sequence lengths are extended for both the decoder and the encoder.
        # Logic: the encoder learns that the stopping token signals the end of the input,
        #        the decoder learns to produce the stopping token to match the expected output,
        #        and the inferrer learns to produce the stopping token so we can recognise it and stop inferring.
        self.source_sequence_length_padded = self.source_sequence_length + self.stop_pad_length
        self.target_sequence_length_padded = self.target_sequence_length + self.stop_pad_length
        max_target_sequence_length_padded = self.max_target_sequence_length + self.stop_pad_length

        _, self.enc_state = dynamic_rnn(
            enc_cell,
            self.input_data,
            sequence_length=self.source_sequence_length_padded,
            dtype=tf.float32,
            time_major=False,
            swap_memory=True)
        self.enc_state_centre = self.enc_state[-1]

        if self.symmetric:
            self.enc_state = self.enc_state[::-1]
            dec_cell = make_cell(self.layer_sizes[::-1], self.keep_prob)
        else:
            dec_cell = make_cell(self.layer_sizes, self.keep_prob)

        # 3. Dense layer to translate the decoder's output at each time
        # step into a choice from the target vocabulary
        projection_layer = tf.layers.Dense(
            units=self.output_dim,
            # kernel_initializer=tf.initializers.he_normal(),
            # kernel_regularizer=regularizer,
            kernel_initializer=tf.truncated_normal_initializer(mean=0.0,
                                                               stddev=0.1))

        # 4. Set up a training decoder and an inference decoder
        # Training Decoder
        with tf.variable_scope("decode"):
            # During PREDICT mode the output data is None, so we can't build a training model.
            # Helper for the training process. Used by BasicDecoder to read inputs.
            dec_input = tf.concat([self.go_tokens, self.output_data], 1)
            training_helper = TrainingHelper(
                inputs=dec_input,
                sequence_length=self.target_sequence_length_padded,
                time_major=False)

            # Basic decoder
            training_decoder = BasicDecoder(dec_cell, training_helper,
                                            self.enc_state, projection_layer)

            # Perform dynamic decoding using the decoder
            self.training_decoder_output\
                = dynamic_decode(training_decoder,
                                 # True because we're using variable length sequences, which have finish points
                                 impute_finished=True,
                                 maximum_iterations=max_target_sequence_length_padded)[0]
        # 5. Inference Decoder
        # Reuses the same parameters trained by the training process
        with tf.variable_scope("decode", reuse=True):

            def end_fn(time_step_value):
                # Ideally, the inferrer would produce the stopping token, which
                # could be compared against the modelled stop token and returned:
                # return tf.reduce_all(tf.equal(time_step_value, self.y_stopping))

                # However, due to the nature of training, the produced stop token
                # will never exactly match the modelled one. With an embedding
                # layer the stop token could be learned, but since we don't use
                # one, this function returns False, i.e. no early stopping.
                return False

            inference_helper = InferenceHelper(sample_fn=lambda x: x,
                                               sample_shape=[self.output_dim],
                                               sample_dtype=dtypes.float32,
                                               start_inputs=self.start_tokens,
                                               end_fn=end_fn)

            # Basic decoder
            inference_decoder = BasicDecoder(dec_cell, inference_helper,
                                             self.enc_state, projection_layer)

            # Perform dynamic decoding using the decoder
            self.inference_decoder_output = dynamic_decode(
                inference_decoder,
                # True because we're using variable length sequences, which have finish points
                impute_finished=True,
                maximum_iterations=max_target_sequence_length_padded)[0]
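
A minimal sketch of how the placeholders above might be fed for one training
step; `model`, the `train_op`/`loss` tensors, and the numpy batches are
assumptions (only the placeholders actually used by the fetched ops need
values):

    # Hypothetical training step; all feed values are numpy arrays.
    _, batch_loss = sess.run(
        [model.train_op, model.loss],
        feed_dict={
            model.input_data: x_batch,        # [batch, time_in, input_dim]
            model.output_data: y_batch,       # [batch, time_out, output_dim]
            model.go_tokens: go_batch,        # [batch, 1, output_dim]
            model.start_tokens: start_batch,  # [batch, output_dim]
            model.source_sequence_length: src_lens,
            model.target_sequence_length: tgt_lens,
            model.learning_rate: 1e-3,
        })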
Esempio n. 30
0
    def build_decoder(self):
        print("building decoder and attention..")
        with tf.variable_scope('decoder'):
            # Building decoder_cell and decoder_initial_state
            self.decoder_cell, self.decoder_initial_state = self.build_decoder_cell()

            # Initialize decoder embeddings to have variance=1.
            sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
            initializer = tf.random_uniform_initializer(-sqrt3, sqrt3, dtype=self.dtype)

            self.decoder_embeddings = tf.get_variable(name='embedding',
                shape=[self.num_decoder_symbols, self.embedding_size],
                initializer=initializer, dtype=self.dtype)

            # Input projection layer to feed embedded inputs to the cell
            # ** Essential when use_residual=True to match input/output dims
            input_layer = Dense(self.hidden_units, dtype=self.dtype, name='input_projection')

            # Output projection layer to convert cell_outputs to logits
            output_layer = Dense(self.num_decoder_symbols, name='output_projection')

            if self.mode == 'train':
                # decoder_inputs_embedded: [batch_size, max_time_step + 1, embedding_size]
                self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                    params=self.decoder_embeddings, ids=self.decoder_inputs_train)

                # Embedded inputs having gone through input projection layer
                self.decoder_inputs_embedded = input_layer(self.decoder_inputs_embedded)

                # Helper to feed inputs for training: read inputs from dense ground truth vectors
                training_helper = seq2seq.TrainingHelper(inputs=self.decoder_inputs_embedded,
                                                   sequence_length=self.decoder_inputs_length_train,
                                                   time_major=False,
                                                   name='training_helper')

                training_decoder = seq2seq.BasicDecoder(cell=self.decoder_cell,
                                                   helper=training_helper,
                                                   initial_state=self.decoder_initial_state,
                                                   output_layer=output_layer)
                                                   #output_layer=None)

                # Maximum decoder time_steps in current batch
                max_decoder_length = tf.reduce_max(self.decoder_inputs_length_train)

                # decoder_outputs_train: BasicDecoderOutput
                #                        namedtuple(rnn_outputs, sample_id)
                # decoder_outputs_train.rnn_output: [batch_size, max_time_step + 1, num_decoder_symbols] if output_time_major=False
                #                                   [max_time_step + 1, batch_size, num_decoder_symbols] if output_time_major=True
                # decoder_outputs_train.sample_id: [batch_size], tf.int32
                (self.decoder_outputs_train, self.decoder_last_state_train,
                 self.decoder_outputs_length_train) = (seq2seq.dynamic_decode(
                    decoder=training_decoder,
                    output_time_major=False,
                    impute_finished=True,
                    maximum_iterations=max_decoder_length))

                # More efficient to do the projection on the batch-time-concatenated tensor
                # logits_train: [batch_size, max_time_step + 1, num_decoder_symbols]
                # self.decoder_logits_train = output_layer(self.decoder_outputs_train.rnn_output)
                self.decoder_logits_train = tf.identity(self.decoder_outputs_train.rnn_output)
                # Use argmax to extract decoder symbols to emit
                self.decoder_pred_train = tf.argmax(self.decoder_logits_train, axis=-1,
                                                    name='decoder_pred_train')

                # masks: masking for valid and padded time steps, [batch_size, max_time_step + 1]
                masks = tf.sequence_mask(lengths=self.decoder_inputs_length_train,
                                         maxlen=max_decoder_length, dtype=self.dtype, name='masks')

                # Computes per word average cross-entropy over a batch
                # Internally calls 'nn_ops.sparse_softmax_cross_entropy_with_logits' by default
                self.loss = seq2seq.sequence_loss(logits=self.decoder_logits_train,
                                                  targets=self.decoder_targets_train,
                                                  weights=masks,
                                                  average_across_timesteps=True,
                                                  average_across_batch=True,)
                # Training summary for the current batch_loss
                tf.summary.scalar('loss', self.loss)

                # Construct graphs for minimizing loss
                self.init_optimizer()

            elif self.mode == 'decode':

                # Start_tokens: [batch_size,] `int32` vector
                start_tokens = tf.ones([self.batch_size,], tf.int32) * data_utils.start_token
                end_token = data_utils.end_token

                def embed_and_input_proj(inputs):
                    return input_layer(tf.nn.embedding_lookup(self.decoder_embeddings, inputs))

                if not self.use_beamsearch_decode:
                    # Helper to feed inputs for greedy decoding: uses the argmax of the output
                    decoding_helper = seq2seq.GreedyEmbeddingHelper(start_tokens=start_tokens,
                                                                    end_token=end_token,
                                                                    embedding=embed_and_input_proj)
                    # Basic decoder performs greedy decoding at each time step
                    print("building greedy decoder..")
                    inference_decoder = seq2seq.BasicDecoder(cell=self.decoder_cell,
                                                             helper=decoding_helper,
                                                             initial_state=self.decoder_initial_state,
                                                             output_layer=output_layer)
                else:
                    # Beamsearch is used to approximately find the most likely translation
                    print("building beamsearch decoder..")
                    inference_decoder = beam_search_decoder.BeamSearchDecoder(cell=self.decoder_cell,
                                                               embedding=embed_and_input_proj,
                                                               start_tokens=start_tokens,
                                                               end_token=end_token,
                                                               initial_state=self.decoder_initial_state,
                                                               beam_width=self.beam_width,
                                                               output_layer=output_layer,)
                # For GreedyDecoder, return
                # decoder_outputs_decode: BasicDecoderOutput instance
                #                         namedtuple(rnn_outputs, sample_id)
                # decoder_outputs_decode.rnn_output: [batch_size, max_time_step, num_decoder_symbols]   if output_time_major=False
                #                                    [max_time_step, batch_size, num_decoder_symbols]   if output_time_major=True
                # decoder_outputs_decode.sample_id: [batch_size, max_time_step], tf.int32               if output_time_major=False
                #                                   [max_time_step, batch_size], tf.int32               if output_time_major=True

                # For BeamSearchDecoder, return
                # decoder_outputs_decode: FinalBeamSearchDecoderOutput instance
                #                         namedtuple(predicted_ids, beam_search_decoder_output)
                # decoder_outputs_decode.predicted_ids: [batch_size, max_time_step, beam_width] if output_time_major=False
                #                                       [max_time_step, batch_size, beam_width] if output_time_major=True
                # decoder_outputs_decode.beam_search_decoder_output: BeamSearchDecoderOutput instance
                #                                                    namedtuple(scores, predicted_ids, parent_ids)

                (self.decoder_outputs_decode, self.decoder_last_state_decode,
                 self.decoder_outputs_length_decode) = (seq2seq.dynamic_decode(
                    decoder=inference_decoder,
                    output_time_major=False,
                    #impute_finished=True,      # error occurs
                    maximum_iterations=self.max_decode_step))

                if not self.use_beamsearch_decode:
                    # decoder_outputs_decode.sample_id: [batch_size, max_time_step]
                    # Or use argmax to find decoder symbols to emit:
                    # self.decoder_pred_decode = tf.argmax(self.decoder_outputs_decode.rnn_output,
                    #                                      axis=-1, name='decoder_pred_decode')

                    # Here, we use expand_dims to be compatible with the result of the beamsearch decoder
                    # decoder_pred_decode: [batch_size, max_time_step, 1] (output_major=False)
                    self.decoder_pred_decode = tf.expand_dims(self.decoder_outputs_decode.sample_id, -1)

                else:
                    # Use beam search to approximately find the most likely translation
                    # decoder_pred_decode: [batch_size, max_time_step, beam_width] (output_major=False)
                    self.decoder_pred_decode = self.decoder_outputs_decode.predicted_ids
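
A note on the pattern above: both `GreedyEmbeddingHelper` and
`BeamSearchDecoder` accept either an embedding matrix or a callable for their
`embedding` argument. Passing `embed_and_input_proj` means every sampled token
id is embedded and pushed through the input projection in one step; a rough
illustration of what the helper does internally:

    # Hypothetical illustration of the callable-embedding hook:
    sample_ids = tf.constant([3, 7, 42], dtype=tf.int32)  # one id per batch entry
    next_inputs = embed_and_input_proj(sample_ids)        # [3, hidden_units]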
Esempio n. 31
0
    def _build(
            self,  # pylint: disable=arguments-differ, too-many-statements
            decoding_strategy='train_greedy',
            inputs=None,
            memory=None,
            memory_sequence_length=None,
            memory_attention_bias=None,
            beam_width=None,
            length_penalty=0.,
            start_tokens=None,
            end_token=None,
            context=None,
            context_sequence_length=None,
            softmax_temperature=None,
            max_decoding_length=None,
            impute_finished=False,
            embedding=None,
            helper=None,
            mode=None):
        """Performs decoding.

        The interface is mostly the same as that of RNN decoders
        (see :meth:`~texar.modules.RNNDecoderBase._build`). The main difference
        is that, here, `sequence_length` is not needed, and continuation
        generation is additionally supported.

        The function provides **3 ways** to specify the decoding method, with
        varying flexibility:

        1. The :attr:`decoding_strategy` argument.

            - **"train_greedy"**: decoding in teacher-forcing fashion (i.e.,
              feeding ground truth to decode the next step), and for each step
              sample is obtained by taking the `argmax` of logits.
              Argument :attr:`inputs` is required for this strategy.
            - **"infer_greedy"**: decoding in inference fashion (i.e., feeding
              `generated` sample to decode the next step), and for each step
              sample is obtained by taking the `argmax` of logits.
              Arguments :attr:`(start_tokens, end_token)` are
              required for this strategy, and argument
              :attr:`max_decoding_length` is optional.
            - **"infer_sample"**: decoding in inference fashion, and for each
              step sample is obtained by `random sampling` from the logits.
              Arguments :attr:`(start_tokens, end_token)` are required for this
              strategy, and argument :attr:`max_decoding_length` is optional.

          This argument is used only when arguments :attr:`helper` and
          :attr:`beam_width` are both `None`.

        2. The :attr:`helper` argument: An instance of subclass of
           :class:`texar.modules.Helper`.
           This provides a superset of the decoding strategies above.
           The interface is the same as in RNN decoders.
           Please refer to :meth:`texar.modules.RNNDecoderBase._build` for
           detailed usage and examples.

           Note that, here, though using a
           :class:`~texar.modules.TrainingHelper` corresponds to the
           "train_greedy" strategy above and will get the same output results,
           the implementation is *slower* than
           directly setting `decoding_strategy="train_greedy"`.

           Argument :attr:`max_decoding_length` is optional.

        3. **Beam search**: set :attr:`beam_width` to use beam search decoding.
           Arguments :attr:`(start_tokens, end_token)` are required,
           and argument :attr:`max_decoding_length` is optional.

        Args:
            memory (optional): The memory to attend, e.g., the output of an RNN
                encoder. A Tensor of shape `[batch_size, memory_max_time, dim]`.
            memory_sequence_length (optional): A Tensor of shape `[batch_size]`
                containing the sequence lengths for the batch entries in
                memory. Used to create an attention bias if
                :attr:`memory_attention_bias` is not given. Ignored if
                `memory_attention_bias` is provided.
            memory_attention_bias (optional): A Tensor of shape
                `[batch_size, num_heads, memory_max_time, dim]`.
                An attention bias typically sets the value of a padding
                position to a large negative value for masking. If not given,
                :attr:`memory_sequence_length` is used to automatically
                create an attention bias.
            inputs (optional): Input tensor for teacher forcing decoding, of
                shape `[batch_size, target_max_time, emb_dim]` containing the
                target sequence word embeddings.
                Used when :attr:`decoding_strategy` is set to "train_greedy".
            decoding_strategy (str): A string specifying the decoding
                strategy, including "train_greedy", "infer_greedy",
                "infer_sample".
                Different arguments are required based on the
                strategy. See above for details. Ignored if
                :attr:`beam_width` or :attr:`helper` is set.
            beam_width (int): Set to use beam search. If given,
                :attr:`decoding_strategy` is ignored.
            length_penalty (float): Length penalty coefficient used in beam
                search decoding. Refer to https://arxiv.org/abs/1609.08144
                for more details.
                It should be larger if longer sentences are wanted.
            start_tokens (optional): An int Tensor of shape `[batch_size]`,
                containing the start tokens.
                Used when :attr:`decoding_strategy` = "infer_greedy" or
                "infer_sample", or :attr:`beam_width` is set.
                Ignored if :attr:`context` is given.
            end_token (optional): An int 0D Tensor, the token that marks end
                of decoding.
                Used when :attr:`decoding_strategy` = "infer_greedy" or
                "infer_sample", or :attr:`beam_width` is set.
            context (optional): An int Tensor of shape `[batch_size, length]`,
                containing the starting tokens for decoding.
                If context is set, :attr:`start_tokens` will be ignored.
            context_sequence_length (optional): An int Tensor of shape
                `[batch_size]`, specifying the length of each context sequence.
            softmax_temperature (optional): A float 0D Tensor, the value to
                divide the logits by before computing the softmax. Larger
                values (above 1.0) result in more random samples. Must be > 0.
                If `None`, 1.0 is used.
                Used when :attr:`decoding_strategy` = "infer_sample".
            max_decoding_length (optional): An int scalar Tensor indicating
                the maximum allowed number of decoding steps.
                If `None` (default), use "max_decoding_length" defined in
                :attr:`hparams`. Ignored in "train_greedy" decoding.
            impute_finished (bool): If `True`, then states for batch
                entries which are marked as finished get copied through and
                the corresponding outputs get zeroed out.  This causes some
                slowdown at each time step, but ensures that the final state
                and outputs have the correct values and that backprop ignores
                time steps that were marked as finished. Ignored in
                "train_greedy" decoding.
            embedding (optional): Embedding used when :attr:`decoding_strategy`
                is "infer_greedy" or "infer_sample", or when beam search is
                used. This can be a callable or the `params` argument for
                :tf_main:`embedding_lookup <nn/embedding_lookup>`.
                If a callable, it can take a vector tensor of token `ids`,
                or take two arguments (`ids`, `times`), where `ids`
                is a vector tensor of token ids, and `times` is a vector tensor
                of time steps (i.e., position ids). The latter case can be used
                when :attr:`embedding` is a combination of word embedding and
                position embedding.
            helper (optional): An instance of
                :tf_main:`Helper <contrib/seq2seq/Helper>` that defines the
                decoding strategy. If given, :attr:`decoding_strategy` is
                ignored.
            mode (optional): A tensor taking value in
                :tf_main:`tf.estimator.ModeKeys <estimator/ModeKeys>`, including
                `TRAIN`, `EVAL`, and `PREDICT`. Controls dropout mode.
                If `None` (default), :func:`texar.global_mode`
                is used.

        Returns:

            - For **"train_greedy"** decoding, returns an instance of \
            :class:`~texar.modules.TransformerDecoderOutput` which contains\
            `sample_id` and `logits`.

            - For **"infer_greedy"** and **"infer_sample"** decoding or\
            decoding with :attr:`helper`, returns\
            a tuple `(outputs, sequence_lengths)`, where `outputs` is an \
            instance of :class:`~texar.modules.TransformerDecoderOutput` as\
            in "train_greedy", and `sequence_lengths` is a Tensor of shape\
            `[batch_size]` containing the length of each sample.

            - For **beam search** decoding, returns a `dict` containing keys\
            "sample_id" and "log_prob".

                - **"sample_id"** is an int Tensor of shape \
                `[batch_size, max_time, beam_width]` containing generated\
                token indexes. `sample_id[:,:,0]` is the most probable \
                sample.
                - **"log_prob"** is a float Tensor of shape \
                `[batch_size, beam_width]` containing the log probability \
                of each sequence sample.
        """

        if memory is not None:
            if memory_attention_bias is None:
                if memory_sequence_length is None:
                    raise ValueError("`memory_sequence_length` is required if "
                                     "`memory_attention_bias` is not given.")

                enc_padding = 1 - tf.sequence_mask(memory_sequence_length,
                                                   shape_list(memory)[1],
                                                   dtype=tf.float32)
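                # `attention_bias_ignore_padding` maps padding positions to
                # large negative attention logits so they are masked out.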
                memory_attention_bias = attn.attention_bias_ignore_padding(
                    enc_padding)

        # context will be used in step function for dynamic_decode
        if context is not None:
            start_tokens = context[:, 0]
            self.context = context[:, 1:]
            self.context_sequence_length = context_sequence_length - 1
        else:
            self.context = None

        self.embedding = embedding

        if helper is None and beam_width is None and \
                decoding_strategy == 'train_greedy':  # Teacher-forcing

            decoder_self_attention_bias = (attn.attention_bias_lower_triangle(
                shape_list(inputs)[1]))

            decoder_output = self._self_attention_stack(
                inputs,
                memory,
                decoder_self_attention_bias=decoder_self_attention_bias,
                memory_attention_bias=memory_attention_bias,
                cache=None,
                mode=mode)
            logits = self._output_layer(decoder_output)
            preds = tf.to_int32(tf.argmax(logits, axis=-1))
            rets = TransformerDecoderOutput(logits=logits, sample_id=preds)

        else:
            if max_decoding_length is None:
                max_decoding_length = self._hparams.max_decoding_length
            self.max_decoding_length = max_decoding_length
            if beam_width is None:  # Inference-like decoding
                # Prepare helper
                if helper is None:
                    if decoding_strategy == "infer_greedy":
                        helper = tx_helper.GreedyEmbeddingHelper(
                            embedding, start_tokens, end_token)
                    elif decoding_strategy == "infer_sample":
                        helper = tx_helper.SampleEmbeddingHelper(
                            embedding, start_tokens, end_token,
                            softmax_temperature)
                    else:
                        raise ValueError(
                            "Unknown decoding strategy: {}".format(
                                decoding_strategy))
                self._helper = helper

                self._cache = self._init_cache(memory,
                                               memory_attention_bias,
                                               beam_search_decoding=False)

                if context is not None:  # To avoid out-of-range in `step`
                    paddings = [[0, 0] for _ in range(get_rank(self.context))]
                    paddings[1][1] = \
                        max_decoding_length - shape_list(self.context)[1]
                    self.context = tf.pad(self.context, paddings=paddings)

                outputs, _, sequence_lengths = dynamic_decode(
                    decoder=self,
                    impute_finished=impute_finished,
                    maximum_iterations=max_decoding_length,
                    output_time_major=False,
                    scope=self.variable_scope)

                if context is not None:
                    # The length of sample_id is one larger than that of
                    # logits, because an additional start token is prepended
                    # to the returned sample_id; the start token should be
                    # the first token of the given context.
                    outputs = TransformerDecoderOutput(
                        logits=outputs.logits,
                        sample_id=tf.concat(
                            [tf.expand_dims(start_tokens, 1),
                             outputs.sample_id],
                            axis=1))
                    sequence_lengths = sequence_lengths + 1
                rets = outputs, sequence_lengths

            else:  # Beam-search decoding
                # `decoding_strategy` is ignored; `helper` must not be set.
                if helper is not None:
                    raise ValueError("Must not set 'beam_width' and 'helper' "
                                     "simultaneously.")
                _batch_size = shape_list(start_tokens)[0]
                self._cache = self._init_cache(memory,
                                               memory_attention_bias,
                                               beam_search_decoding=True,
                                               batch_size=_batch_size)

                # The output format is different when running beam search
                sample_id, log_prob = self._beam_decode(
                    start_tokens,
                    end_token,
                    beam_width=beam_width,
                    length_penalty=length_penalty,
                    decode_length=max_decoding_length,
                )
                rets = {'sample_id': sample_id, 'log_prob': log_prob}

        if not self._built:
            self._add_internal_trainable_variables()
            self._built = True

        return rets
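
A minimal usage sketch of the three call modes documented above. This is not from the library's docs: `decoder` is assumed to be an already-built Texar TransformerDecoder, and `enc_outputs`, `enc_lengths`, `target_embeds`, `embedder`, `batch_size`, `bos_id`, and `eos_id` are hypothetical placeholders.

import tensorflow as tf

# 1. Teacher forcing ("train_greedy"): requires target embeddings as `inputs`.
train_outputs = decoder(
    memory=enc_outputs,                  # [batch, memory_max_time, dim]
    memory_sequence_length=enc_lengths,  # [batch]
    inputs=target_embeds,                # [batch, target_max_time, emb_dim]
    decoding_strategy='train_greedy')

# 2. Greedy inference: requires start/end tokens and an embedding.
infer_outputs, seq_lengths = decoder(
    memory=enc_outputs,
    memory_sequence_length=enc_lengths,
    decoding_strategy='infer_greedy',
    embedding=embedder,
    start_tokens=tf.fill([batch_size], bos_id),
    end_token=eos_id,
    max_decoding_length=50)

# 3. Beam search: `decoding_strategy` is ignored and a dict is returned.
bs_results = decoder(
    memory=enc_outputs,
    memory_sequence_length=enc_lengths,
    embedding=embedder,
    start_tokens=tf.fill([batch_size], bos_id),
    end_token=eos_id,
    beam_width=5,
    length_penalty=0.6)
best_ids = bs_results['sample_id'][:, :, 0]  # most probable beam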
Example n. 32
    def build_decoder(self):
        print("building decoder and attention..")
        with tf.variable_scope('decoder'):
            self.decoder_cell, self.decoder_initial_state = self.build_decoder_cell()

            output_layer = Dense(self.num_symbols, name='output_projection')
            start_tokens = tf.ones([self.batch_size,], tf.int32) * data_utils.start_token
            end_token = data_utils.end_token
            
            helper = GumbelSoftmaxEmbeddingHelper(
                embedding=self.embeddings, start_tokens=start_tokens,
                end_token=end_token, tau=self.tau)

            max_decoder_length = tf.reduce_max(self.encoder_inputs_length)
            decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=self.decoder_cell, helper=helper,
                initial_state=self.decoder_initial_state)  # , output_layer=output_layer
            (self.decoder_outputs_train, self.decoder_last_state_train,
             self.decoder_outputs_length_train) = seq2seq.dynamic_decode(
                 decoder=decoder, maximum_iterations=max_decoder_length,
                 impute_finished=True)
            self.decoder_logits_train = tf.identity(
                self.decoder_outputs_train.rnn_output)  # IMPORTANT
            self.decoder_pred_decode = tf.argmax(
                self.decoder_outputs_train.sample_id,
                axis=-1, output_type=tf.int32)  # IMPORTANT
            
            #newintput = data_utils.insertSequence(self.decoder_pred_decode.eval(), self.encoder_inputs.eval(),1, self.total_num)
            '''
            _loss = 0
            for i in range(self.detector.batch_size):
                source, source_len = data_utils.prepare_batch(newintput[i:i*self.detector.batch_size], self.detector.stride, self.detector.maxlen, self.detector.batch_size)
                _, logits = self.detector.predict(self.sess, source, source_len)
                _loss += logits[0] - logits[1]
            '''
            self.accuracy = tf.reduce_mean(tf.cast(
                tf.equal(self.encoder_inputs, self.decoder_pred_decode),
                self.dtype))
            masks = tf.sequence_mask(
                lengths=self.encoder_inputs_length, maxlen=max_decoder_length,
                dtype=self.dtype, name='masks')
            self.loss = seq2seq.sequence_loss(
                logits=self.decoder_logits_train,
                targets=self.encoder_inputs, weights=masks)
            #self.loss = _loss + np.sum(self.decoder_pred_decode**masks**2)/np.sum(masks)/2
            tf.summary.scalar('loss', self.loss)
            self.init_optimizer()
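
The GumbelSoftmaxEmbeddingHelper used above feeds relaxed (soft one-hot) samples back into the decoder so sampling stays differentiable. Below is a minimal sketch of the underlying Gumbel-softmax trick, not the helper's actual source:

import tensorflow as tf

def gumbel_softmax_sample(logits, tau):
    """Draw a differentiable, approximately one-hot sample from `logits`.

    Gumbel(0, 1) noise is added to the logits and a temperature-scaled
    softmax is applied; as `tau` approaches 0, the output approaches a
    discrete one-hot sample.
    """
    uniform = tf.random_uniform(tf.shape(logits), minval=1e-10, maxval=1.0)
    gumbel_noise = -tf.log(-tf.log(uniform))
    return tf.nn.softmax((logits + gumbel_noise) / tau)

# A soft sample can then be matmul'ed with the embedding matrix to form the
# next decoder input, which is the role `tau` plays in the helper above.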
Example n. 33
    def _test_beam_search(self,
                          decoder,
                          initial_state=None,
                          tiled_initial_state=None,
                          tf_initial_state=None,
                          beam_width_1=1,
                          initiated=False):
        # Compare with tf built-in BeamSearchDecoder
        outputs, final_state, _ = beam_search_decode(
            decoder_or_cell=decoder,
            embedding=self._embedding,
            start_tokens=[1] * self._batch_size,
            end_token=2,
            beam_width=beam_width_1,
            max_decoding_length=20)

        self.assertIsInstance(outputs,
                              tf.contrib.seq2seq.FinalBeamSearchDecoderOutput)
        self.assertIsInstance(final_state,
                              tf.contrib.seq2seq.BeamSearchDecoderState)

        num_trainable_variables = len(tf.trainable_variables())
        _ = decoder(decoding_strategy='infer_greedy',
                    embedding=self._embedding,
                    start_tokens=[1] * self._batch_size,
                    end_token=2,
                    max_decoding_length=20)
        self.assertEqual(num_trainable_variables,
                         len(tf.trainable_variables()))

        if tf_initial_state is None:
            tf_initial_state = decoder.cell.zero_state(
                self._batch_size * beam_width_1, tf.float32)
        beam_decoder = BeamSearchDecoder(cell=decoder.cell,
                                         embedding=self._embedding,
                                         start_tokens=[1] * self._batch_size,
                                         end_token=2,
                                         initial_state=tf_initial_state,
                                         beam_width=beam_width_1,
                                         output_layer=decoder.output_layer)

        outputs_1, final_state_1, _ = dynamic_decode(decoder=beam_decoder,
                                                     maximum_iterations=20)

        # Test time-major vs. batch-major output shapes
        outputs_2, _, _ = beam_search_decode(
            decoder_or_cell=decoder,
            embedding=self._embedding,
            start_tokens=[1] * self._batch_size,
            end_token=2,
            beam_width=self._beam_width,
            initial_state=initial_state,
            tiled_initial_state=tiled_initial_state,
            max_decoding_length=21)
        outputs_3, _, _ = beam_search_decode(
            decoder_or_cell=decoder,
            embedding=self._embedding,
            start_tokens=[1] * self._batch_size,
            end_token=2,
            beam_width=self._beam_width,
            initial_state=initial_state,
            tiled_initial_state=tiled_initial_state,
            max_decoding_length=21,
            output_time_major=True)

        with self.test_session() as sess:
            if not initiated:
                sess.run(tf.global_variables_initializer())

            outputs_, final_state_, outputs_1_, final_state_1_ = sess.run(
                [outputs, final_state, outputs_1, final_state_1],
                feed_dict={
                    context.global_mode(): tf.estimator.ModeKeys.PREDICT
                })

            np.testing.assert_array_equal(outputs_.predicted_ids,
                                          outputs_1_.predicted_ids)
            np.testing.assert_array_equal(
                outputs_.beam_search_decoder_output.scores,
                outputs_1_.beam_search_decoder_output.scores)
            np.testing.assert_array_equal(
                outputs_.beam_search_decoder_output.predicted_ids,
                outputs_1_.beam_search_decoder_output.predicted_ids)
            np.testing.assert_array_equal(
                outputs_.beam_search_decoder_output.parent_ids,
                outputs_1_.beam_search_decoder_output.parent_ids)
            np.testing.assert_array_equal(final_state_.log_probs,
                                          final_state_1_.log_probs)
            np.testing.assert_array_equal(final_state_.lengths,
                                          final_state_1_.lengths)

            outputs_2_, outputs_3_ = sess.run([outputs_2, outputs_3],
                                              feed_dict={
                                                  context.global_mode():
                                                  tf.estimator.ModeKeys.PREDICT
                                              })
            self.assertEqual(outputs_2_.predicted_ids.shape,
                             (self._batch_size, 21, 11))
            self.assertEqual(outputs_3_.predicted_ids.shape,
                             (21, self._batch_size, 11))
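
The test above compares against tf's BeamSearchDecoder, which runs batch_size * beam_width hypotheses in parallel; any per-example encoder state must therefore be tiled first. A small sketch with tf.contrib.seq2seq.tile_batch, where `encoder_final_state`, `encoder_outputs`, and `memory_sequence_length` are hypothetical encoder tensors:

from tensorflow.contrib import seq2seq

beam_width = 5
# Repeat each example `beam_width` times along the batch axis.
tiled_state = seq2seq.tile_batch(encoder_final_state, multiplier=beam_width)
tiled_memory = seq2seq.tile_batch(encoder_outputs, multiplier=beam_width)
tiled_lengths = seq2seq.tile_batch(memory_sequence_length,
                                   multiplier=beam_width)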
Example n. 34
  def sample(self, n, max_length=None, z=None, temperature=None,
             start_inputs=None, beam_width=None, end_token=None):
    """Overrides BaseLstmDecoder `sample` method to add optional beam search.

    Args:
      n: Scalar number of samples to return.
      max_length: (Optional) Scalar maximum sample length to return. Required if
        data representation does not include end tokens.
      z: (Optional) Latent vectors to sample from. Required if model is
        conditional. Sized `[n, z_size]`.
      temperature: (Optional) The softmax temperature to use when not doing beam
        search. Defaults to 1.0. Ignored when `beam_width` is provided.
      start_inputs: (Optional) Initial inputs to use for batch.
        Sized `[n, output_depth]`.
      beam_width: (Optional) Width of beam to use for beam search. Beam search
        is disabled if not provided.
      end_token: (Optional) Scalar token signaling the end of the sequence to
        use for early stopping.
    Returns:
      samples: Sampled sequences. Sized `[n, max_length, output_depth]`.
    Raises:
      ValueError: If `z` is provided and its first dimension does not equal `n`.
    """
    if beam_width is None:
      end_fn = (None if end_token is None else
                lambda x: tf.equal(tf.argmax(x, axis=-1), end_token))
      return super(CategoricalLstmDecoder, self).sample(
          n, max_length, z, temperature, start_inputs, end_fn)

    # If `end_token` is not given, use an impossible value.
    end_token = self._output_depth if end_token is None else end_token
    if z is not None and z.shape[0].value != n:
      raise ValueError(
          '`z` must have a first dimension that equals `n` when given. '
          'Got: %d vs %d' % (z.shape[0].value, n))

    if temperature is not None:
      tf.logging.warning('`temperature` is ignored when using beam search.')
    # Use a dummy Z in unconditional case.
    z = tf.zeros((n, 0), tf.float32) if z is None else z

    # If not given, start with dummy `-1` token and replace with zero vectors in
    # `embedding_fn`.
    start_tokens = (
        tf.argmax(start_inputs, axis=-1, output_type=tf.int32)
        if start_inputs is not None else
        -1 * tf.ones([n], dtype=tf.int32))

    initial_state = initial_cell_state_from_embedding(
        self._dec_cell, z, name='decoder/z_to_initial_state')
    beam_initial_state = seq2seq.tile_batch(
        initial_state, multiplier=beam_width)

    # Tile `z` across beams.
    beam_z = tf.tile(tf.expand_dims(z, 1), [1, beam_width, 1])

    def embedding_fn(tokens):
      # If tokens are the start_tokens (negative), replace with zero vectors.
      next_inputs = tf.cond(
          tf.less(tokens[0, 0], 0),
          lambda: tf.zeros([n, beam_width, self._output_depth]),
          lambda: tf.one_hot(tokens, self._output_depth))

      # Concatenate `z` to next inputs.
      next_inputs = tf.concat([next_inputs, beam_z], axis=-1)
      return next_inputs

    decoder = seq2seq.BeamSearchDecoder(
        self._dec_cell,
        embedding_fn,
        start_tokens,
        end_token,
        beam_initial_state,
        beam_width,
        output_layer=self._output_layer,
        length_penalty_weight=0.0)

    final_output, _, _ = seq2seq.dynamic_decode(
        decoder,
        maximum_iterations=max_length,
        swap_memory=True,
        scope='decoder')

    return tf.one_hot(
        final_output.predicted_ids[:, :, 0],
        self._output_depth)
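
A hypothetical usage sketch of the method above (assumes a built CategoricalLstmDecoder `decoder`, latent vectors `z` of shape `[4, z_size]`, and an `eos_id` end token):

# Beam search decoding; `temperature` would be ignored here.
samples = decoder.sample(
    n=4, max_length=64, z=z, beam_width=8, end_token=eos_id)
# samples: one-hot sequences of shape [4, 64, output_depth], taken from the
# most probable beam (predicted_ids[:, :, 0] above).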
Example n. 35
	def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False,
			global_step=None, is_training=False, is_evaluating=False):
		"""
		Initializes the model for inference

		sets "mel_outputs" and "alignments" fields.

		Args:
			- inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is the number
			  of steps in the input time series, and values are character IDs
			- input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
			  lengths of each sequence in inputs.
			- mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is
			  the number of steps in the output time series, M is num_mels, and values are entries
			  in the mel spectrogram. Only needed for training.
		"""
		if mel_targets is None and stop_token_targets is not None:
			raise ValueError('no mel targets were provided but token_targets were given')
		if mel_targets is not None and stop_token_targets is None and not gta:
			raise ValueError('Mel targets are provided without corresponding token_targets')
		if not gta and self._hparams.predict_linear and linear_targets is None and is_training:
			raise ValueError('Model is set to use post processing to predict linear spectrograms in training but no linear targets given!')
		if gta and linear_targets is not None:
			raise ValueError('Linear spectrogram prediction is not supported in GTA mode!')
		if is_training and self._hparams.mask_decoder and targets_lengths is None:
			raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!')
		if is_training and is_evaluating:
			raise RuntimeError('Model can not be in training and evaluation modes at the same time!')

		with tf.variable_scope('inference') as scope:
			batch_size = tf.shape(inputs)[0]
			hp = self._hparams
			assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
			if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
				assert global_step is not None

			#GTA is only used for predicting mels to train the Wavenet vocoder, so we omit post processing when doing GTA synthesis
			post_condition = hp.predict_linear and not gta

			# Embeddings ==> [batch_size, sequence_length, embedding_dim]
			embedding_table = tf.get_variable(
				'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32)
			embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)


			#Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
			encoder_cell = TacotronEncoderCell(
				EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'),
				EncoderRNN(is_training, size=hp.encoder_lstm_units,
					zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM'))

			encoder_outputs = encoder_cell(embedded_inputs, input_lengths)

			#For shape visualization purpose
			enc_conv_output_shape = encoder_cell.conv_output_shape


			#Decoder Parts
			#Attention Decoder Prenet
			prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet')
			#Attention Mechanism
			attention_mechanism = LocationSensitiveAttention(hp.attention_dim, encoder_outputs, hparams=hp,
				mask_encoder=hp.mask_encoder, memory_sequence_length=input_lengths, smoothing=hp.smoothing, 
				cumulate_weights=hp.cumulative_weights)
			#Decoder LSTM Cells
			decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
				size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_lstm')
			#Frames Projection layer
			frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, scope='linear_transform')
			#<stop_token> projection layer
			stop_projection = StopProjection(is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection')


			#Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
			decoder_cell = TacotronDecoderCell(
				prenet,
				attention_mechanism,
				decoder_lstm,
				frame_projection,
				stop_projection)


			#Define the helper for our decoder
			if is_training or is_evaluating or gta:
				self.helper = TacoTrainingHelper(batch_size, mel_targets, stop_token_targets, hp, gta, is_evaluating, global_step)
			else:
				self.helper = TacoTestHelper(batch_size, hp)


			#initial decoder state
			decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

			#Only use max iterations at synthesis time
			max_iters = hp.max_iters if not (is_training or is_evaluating) else None

			#Decode
			(frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
				CustomDecoder(decoder_cell, self.helper, decoder_init_state),
				impute_finished=False,
				maximum_iterations=max_iters,
				swap_memory=hp.tacotron_swap_with_cpu)


			# Reshape outputs to be one output per entry 
			#==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
			decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
			stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])

		
			#Postnet
			postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions')

			#Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
			residual = postnet(decoder_output)

			#Project residual to same dimension as mel spectrogram 
			#==> [batch_size, decoder_steps * r, num_mels]
			residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection')
			projected_residual = residual_projection(residual)


			#Compute the mel spectrogram
			mel_outputs = decoder_output + projected_residual


			if post_condition:
				#Based on https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
				#Post-processing Network to map mels to linear spectrograms using same architecture as the encoder
				post_processing_cell = TacotronEncoderCell(
					EncoderConvolutions(is_training, hparams=hp, scope='post_processing_convolutions'),
					EncoderRNN(is_training, size=hp.encoder_lstm_units,
						zoneout=hp.tacotron_zoneout_rate, scope='post_processing_LSTM'))

				expand_outputs = post_processing_cell(mel_outputs)
				linear_outputs = FrameProjection(hp.num_freq, scope='post_processing_projection')(expand_outputs)

			#Grab alignments from the final decoder state
			alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0])

			if is_training:
				self.ratio = self.helper._ratio
			self.inputs = inputs
			self.input_lengths = input_lengths
			self.decoder_output = decoder_output
			self.alignments = alignments
			self.stop_token_prediction = stop_token_prediction
			self.stop_token_targets = stop_token_targets
			self.mel_outputs = mel_outputs
			if post_condition:
				self.linear_outputs = linear_outputs
				self.linear_targets = linear_targets
			self.mel_targets = mel_targets
			self.targets_lengths = targets_lengths
			log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
			log('  Train mode:               {}'.format(is_training))
			log('  Eval mode:                {}'.format(is_evaluating))
			log('  GTA mode:                 {}'.format(gta))
			log('  Synthesis mode:           {}'.format(not (is_training or is_evaluating)))
			log('  embedding:                {}'.format(embedded_inputs.shape))
			log('  enc conv out:             {}'.format(enc_conv_output_shape))
			log('  encoder out:              {}'.format(encoder_outputs.shape))
			log('  decoder out:              {}'.format(decoder_output.shape))
			log('  residual out:             {}'.format(residual.shape))
			log('  projected residual out:   {}'.format(projected_residual.shape))
			log('  mel out:                  {}'.format(mel_outputs.shape))
			if post_condition:
				log('  linear out:               {}'.format(linear_outputs.shape))
			log('  <stop_token> out:         {}'.format(stop_token_prediction.shape))