Example #1
    def BuildNetwork(self, learningRate):
        self.dataInput = tensorflow.placeholder(
            dtype=tensorflow.float32, shape=[self.batchSize, 1000, 40], name='dataInput')
        self.dataSeqInput = tensorflow.placeholder(
            dtype=tensorflow.int32, shape=[self.batchSize], name='dataSeqInput')
        self.labelInput = tensorflow.placeholder(
            dtype=tensorflow.int32, shape=[self.batchSize, None], name='labelInput')
        self.labelSeqInput = tensorflow.placeholder(
            dtype=tensorflow.int32, shape=[self.batchSize], name='labelSeqInput')

        self.parameters['EmbeddingDictionary'] = tensorflow.Variable(
            initial_value=tensorflow.truncated_normal([50, 2 * self.hiddenNoduleNumbers]), dtype=tensorflow.float32,
            name='EmbeddingDictionary')

        with tensorflow.name_scope('Encoder'):
            self.parameters['Layer1st_Conv'] = tensorflow.layers.conv2d(
                inputs=self.dataInput[:, :, :, tensorflow.newaxis], filters=8, kernel_size=[3, 3], strides=[1, 1],
                padding='SAME', name='Layer1st_Conv')
            self.parameters['Layer1st_MaxPooling'] = tensorflow.layers.max_pooling2d(
                inputs=self.parameters['Layer1st_Conv'], pool_size=[3, 3], strides=[2, 1], padding='SAME',
                name='Layer1st_MaxPooling')
            self.parameters['Layer2nd_Conv'] = tensorflow.layers.conv2d(
                inputs=self.parameters['Layer1st_MaxPooling'], filters=16, kernel_size=[3, 3], strides=[1, 1],
                padding='SAME', name='Layer2nd_Conv')
            self.parameters['Layer2nd_MaxPooling'] = tensorflow.layers.max_pooling2d(
                inputs=self.parameters['Layer2nd_Conv'], pool_size=[3, 3], strides=[2, 1], padding='SAME',
                name='Layer2nd_MaxPooling')
            self.parameters['Layer3rd_Conv'] = tensorflow.layers.conv2d(
                inputs=self.parameters['Layer2nd_MaxPooling'], filters=16, kernel_size=[3, 3], strides=[1, 1],
                padding='SAME', name='Layer3rd_Conv')

        ###############################################################################

        self.parameters['AttentionList'] = CNN_StandardAttention_Initializer(
            inputData=self.parameters['Layer3rd_Conv'], inputSeq=self.dataSeqInput, attentionScope=None,
            hiddenNoduleNumber=16, scopeName='CSA')
        self.parameters['AttentionResult'] = self.parameters['AttentionList']['FinalResult']

        ###############################################################################

        self.parameters['DecoderInitialState_C'] = tensorflow.layers.dense(
            inputs=self.parameters['AttentionResult'], units=2 * self.hiddenNoduleNumbers, activation=None,
            name='DecoderInitialState_C')
        self.parameters['DecoderInitialState_H'] = tensorflow.layers.dense(
            inputs=self.parameters['AttentionResult'], units=2 * self.hiddenNoduleNumbers, activation=None,
            name='DecoderInitialState_H')
        self.parameters['DecoderInitialState'] = rnn.LSTMStateTuple(
            c=self.parameters['DecoderInitialState_C'], h=self.parameters['DecoderInitialState_H'])

        ###############################################################################

        self.parameters['Helper'] = seq2seq.GreedyEmbeddingHelper(
            embedding=self.parameters['EmbeddingDictionary'],
            start_tokens=tensorflow.ones(self.batchSize, dtype=tensorflow.int32) * 40,
            end_token=0)
        self.parameters['Decoder_Cell'] = rnn.LSTMCell(num_units=2 * self.hiddenNoduleNumbers)
        self.parameters['Decoder'] = seq2seq.BasicDecoder(
            cell=self.parameters['Decoder_Cell'], helper=self.parameters['Helper'],
            initial_state=self.parameters['DecoderInitialState'])

        self.parameters['DecoderOutput'], self.parameters['DecoderFinalState'], self.parameters['DecoderSeqLen'] = \
            seq2seq.dynamic_decode(decoder=self.parameters['Decoder'], output_time_major=False,
                                   maximum_iterations=tensorflow.reduce_max(self.labelSeqInput))

        self.parameters['Logits'] = tensorflow.layers.dense(
            inputs=self.parameters['DecoderOutput'][0], units=50, activation=None, name='Logits')
        # self.parameters['Mask'] = tensorflow.to_float(tensorflow.not_equal(self.labelInput, 0))
        self.parameters['Loss'] = tensorflow.reduce_mean(tensorflow.nn.softmax_cross_entropy_with_logits_v2(
            labels=tensorflow.one_hot(self.labelInput, depth=50, dtype=tensorflow.float32),
            logits=self.parameters['Logits']), name='Loss')
        self.train = tensorflow.train.AdamOptimizer(learning_rate=learningRate).minimize(self.parameters['Loss'])
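A hypothetical sketch of feeding the placeholders above; `net`, the feed shapes, and the label length of 20 are assumptions, not part of the original class. Note that because the decoder uses GreedyEmbeddingHelper even for this training graph, it can emit fewer steps than `labelSeqInput` early in training, in which case the logits/labels shapes in the loss no longer line up.

# Hypothetical usage sketch for the graph above; `net` is an assumed instance
# on which BuildNetwork() has already been called.
import numpy as np
import tensorflow

session = tensorflow.Session()
session.run(tensorflow.global_variables_initializer())
loss, _ = session.run(
    [net.parameters['Loss'], net.train],
    feed_dict={net.dataInput: np.zeros([net.batchSize, 1000, 40], np.float32),
               net.dataSeqInput: np.full([net.batchSize], 1000, np.int32),
               net.labelInput: np.zeros([net.batchSize, 20], np.int32),
               net.labelSeqInput: np.full([net.batchSize], 20, np.int32)})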
Example #2
    def decoder(self, memory):
        """
        Implementation of the Tacotron decoder network.

        Arguments:
            memory (tf.Tensor):
                The output states of the encoder RNN concatenated over time. Its shape is
                expected to be shape=(B, T_sent, 2 * encoder.n_gru_units) with B being the batch
                size, T_sent being the number of tokens in the sentence including the EOS token.

        Returns:
            tf.tensor:
                Generated reduced Mel. spectrogram. The shape is
                shape=(B, T_spec // r, n_mels * r), with B being the batch size, T_spec being
                the number of frames in the spectrogram and r being the reduction factor.
        """
        with tf.variable_scope('decoder2'):
            # Query the current batch size.
            batch_size = tf.shape(memory)[0]

            # Query the number of layers for the decoder RNN.
            n_decoder_layers = self.hparams.decoder.n_gru_layers

            # Query the number of units for the decoder cells.
            n_decoder_units = self.hparams.decoder.n_decoder_gru_units

            # Query the number of units for the attention cell.
            n_attention_units = self.hparams.decoder.n_attention_units

            # General attention mechanism parameters that are the same for all mechanisms.
            mechanism_params = {
                'num_units': n_attention_units,
                'memory': memory,
            }

            if model_params.attention.mechanism == LocalLuongAttention:
                # Update the parameters with additional parameters for the local attention case.
                mechanism_params.update({
                    'attention_mode': model_params.attention.luong_local_mode,
                    'score_mode': model_params.attention.luong_local_score,
                    'd': model_params.attention.luong_local_window_D,
                    'force_gaussian': model_params.attention.luong_force_gaussian,
                    'const_batch_size': 16
                })

            # Create the attention mechanism.
            attention_mechanism = model_params.attention.mechanism(
                **mechanism_params)

            # Create the attention RNN cell.
            if model_params.force_cudnn:
                attention_cell = tfcrnn.CudnnCompatibleGRUCell(
                    num_units=n_attention_units)
            else:
                attention_cell = tf.nn.rnn_cell.GRUCell(
                    num_units=n_attention_units)

            # Apply the pre-net to each decoder input, as shown in [1], figure 1.
            attention_cell = PrenetWrapper(attention_cell,
                                           self.hparams.decoder.pre_net_layers,
                                           self.is_training())

            # Select the attention wrapper needed for the current attention mechanism.
            if model_params.attention.mechanism == LocalLuongAttention:
                wrapper = AdvancedAttentionWrapper
            else:
                wrapper = tfc.seq2seq.AttentionWrapper

            # Connect the attention cell with the attention mechanism.
            wrapped_attention_cell = wrapper(
                cell=attention_cell,
                attention_mechanism=attention_mechanism,
                attention_layer_size=n_attention_units,
                alignment_history=True,
                output_attention=True,
                initial_cell_state=None
            )  # => (B, T_sent, n_attention_units) = (B, T_sent, 256)

            # Stack several GRU cells and apply a residual connection after each cell.
            # Before the input reaches the decoder RNN it passes through the attention cell.
            cells = [wrapped_attention_cell]
            for _ in range(n_decoder_layers):
                # Create a decoder GRU cell.
                if model_params.force_cudnn:
                    # => (B, T_spec, n_decoder_units) = (B, T_spec, 256)
                    cell = tfcrnn.CudnnCompatibleGRUCell(
                        num_units=n_decoder_units)
                else:
                    # => (B, T_spec, n_decoder_units) = (B, T_spec, 256)
                    cell = tf.nn.rnn_cell.GRUCell(num_units=n_decoder_units)

                # => (B, T_spec, n_decoder_units) = (B, T_spec, 256)
                cell = tf.nn.rnn_cell.ResidualWrapper(cell)
                cells.append(cell)

            # => (B, T_spec, n_decoder_units) = (B, T_spec, 256)
            decoder_cell = tf.nn.rnn_cell.MultiRNNCell(cells,
                                                       state_is_tuple=True)

            # Project the final cells output to the decoder target size.
            # => (B, T_spec, target_size * reduction) = (B, T_spec, 80 * reduction)
            output_cell = tfc.rnn.OutputProjectionWrapper(
                cell=decoder_cell,
                output_size=self.hparams.decoder.target_size *
                self.hparams.reduction,
                # activation=tf.nn.sigmoid
            )

            decoder_initial_state = output_cell.zero_state(
                batch_size=batch_size, dtype=tf.float32)

            if self.is_training():
                # During training we do not stop decoding manually. The decoder automatically
                # decodes as many time steps as are contained in the ground truth data.
                maximum_iterations = None

                # Unfold the reduced spectrogram in order to grab the r'th ground truth frames.
                mel_targets = tf.reshape(self.inp_mel_spec,
                                         [batch_size, -1, self.hparams.n_mels])

                # Create a custom training helper for feeding ground truth frames during training.
                helper = TacotronTrainingHelper(
                    batch_size=batch_size,
                    outputs=mel_targets,
                    input_size=self.hparams.decoder.target_size,
                    reduction_factor=self.hparams.reduction,
                )
            elif self._mode == Mode.EVAL:
                # During evaluation we stop decoding after the same number of frames the ground
                # truth has.
                maximum_iterations = tf.shape(self.inp_mel_spec)[1]

                # Create a custom inference helper that handles proper evaluation data feeding.
                helper = TacotronInferenceHelper(
                    batch_size=batch_size,
                    input_size=self.hparams.decoder.target_size)
            else:
                # During inference we stop decoding after `maximum_iterations` frames.
                maximum_iterations = self.hparams.decoder.maximum_iterations // self.hparams.reduction

                # Create a custom inference helper that handles proper inference data feeding.
                helper = TacotronInferenceHelper(
                    batch_size=batch_size,
                    input_size=self.hparams.decoder.target_size)

            decoder = seq2seq.BasicDecoder(cell=output_cell,
                                           helper=helper,
                                           initial_state=decoder_initial_state)

            # Start decoding.
            decoder_outputs, final_state, final_sequence_lengths = seq2seq.dynamic_decode(
                decoder=decoder,
                output_time_major=False,
                impute_finished=False,
                maximum_iterations=maximum_iterations)

            # decoder_outputs => type=BasicDecoderOutput, (rnn_output, _)
            # final_state => type=AttentionWrapperState, (attention_wrapper_state, _, _)
            # final_sequence_lengths.shape = (B)

            # Create an attention alignment summary image.
            self.alignment_history = final_state[0].alignment_history.stack()

        # shape => (B, T_spec // r, n_mels * r)
        return decoder_outputs.rnn_output
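At inference time the reduced output can be unfolded back to full frame resolution, mirroring the reshape applied to `self.inp_mel_spec` above; a minimal sketch (the function name is assumed):

# Hypothetical post-processing for the decoder above:
# (B, T_spec // r, n_mels * r) -> (B, T_spec, n_mels).
import tensorflow as tf

def unfold_reduced_spectrogram(reduced_spec, n_mels=80):
    batch_size = tf.shape(reduced_spec)[0]
    return tf.reshape(reduced_spec, [batch_size, -1, n_mels])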
Example #3
    def init_decoder_variable(self):
        # Building decoder_cell and decoder_initial_state
        self.decoder_cell, self.decoder_initial_state = self.build_decoder_cell()

        # Initialize decoder embeddings to have variance=1.
        sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
        initializer = tf.random_uniform_initializer(-sqrt3,
                                                    sqrt3,
                                                    dtype=self.dtype)

        self.decoder_embeddings = tf.get_variable(
            name='embedding',
            shape=[self.num_decoder_symbols, self.embedding_size],
            initializer=initializer,
            dtype=self.dtype)

        # Input projection layer to feed embedded inputs to the cell
        # ** Essential when use_residual=True to match input/output dims
        input_layer = Dense(self.hidden_units,
                            dtype=self.dtype,
                            name='input_projection')

        # Output projection layer to convert cell_outputs to logits
        output_layer = Dense(self.num_decoder_symbols,
                             name='output_projection')

        if self.mode == 'train':
            # decoder_inputs_embedded: [batch_size, max_time_step + 1, embedding_size]
            self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                params=self.decoder_embeddings, ids=self.decoder_inputs_train)

            # Embedded inputs having gone through input projection layer
            self.decoder_inputs_embedded = input_layer(
                self.decoder_inputs_embedded)

            # Helper to feed inputs for training: read inputs from dense ground truth vectors
            training_helper = seq2seq.TrainingHelper(
                inputs=self.decoder_inputs_embedded,
                sequence_length=self.decoder_inputs_length_train,
                time_major=False,
                name='training_helper')

            training_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=training_helper,
                initial_state=self.decoder_initial_state,
                output_layer=output_layer)
            # output_layer=None)

            # Maximum decoder time_steps in current batch
            max_decoder_length = tf.reduce_max(
                self.decoder_inputs_length_train)

            # decoder_outputs_train: BasicDecoderOutput
            #                        namedtuple(rnn_outputs, sample_id)
            # decoder_outputs_train.rnn_output: [batch_size, max_time_step + 1, num_decoder_symbols] if output_time_major=False
            #                                   [max_time_step + 1, batch_size, num_decoder_symbols] if output_time_major=True
            # decoder_outputs_train.sample_id: [batch_size, max_time_step + 1], tf.int32
            (self.decoder_outputs_train, self.decoder_last_state_train,
             self.decoder_outputs_length_train) = (seq2seq.dynamic_decode(
                 decoder=training_decoder,
                 output_time_major=False,
                 impute_finished=True,
                 maximum_iterations=max_decoder_length))

            # More efficient to do the projection on the batch-time-concatenated tensor
            # logits_train: [batch_size, max_time_step + 1, num_decoder_symbols]
            # self.decoder_logits_train = output_layer(self.decoder_outputs_train.rnn_output)
            self.decoder_logits_train = tf.identity(
                self.decoder_outputs_train.rnn_output)
            # Use argmax to extract decoder symbols to emit
            self.decoder_pred_train = tf.argmax(self.decoder_logits_train,
                                                axis=-1,
                                                name='decoder_pred_train')

            # masks: masking for valid and padded time steps, [batch_size, max_time_step + 1]
            masks = tf.sequence_mask(lengths=self.decoder_inputs_length_train,
                                     maxlen=max_decoder_length,
                                     dtype=self.dtype,
                                     name='masks')

            # Computes per word average cross-entropy over a batch
            # Internally calls 'nn_ops.sparse_softmax_cross_entropy_with_logits' by default
            self.loss = seq2seq.sequence_loss(
                logits=self.decoder_logits_train,
                targets=self.decoder_targets_train,
                weights=masks,
                average_across_timesteps=True,
                average_across_batch=True,
            )
            # Training summary for the current batch_loss
            tf.summary.scalar('loss', self.loss)

            # Construct the graph for minimizing loss
            self.init_optimizer()

        elif self.mode == 'decode':

            # start_tokens: [batch_size,] `int32` vector
            start_tokens = tf.ones([self.batch_size], tf.int32) * self.start_token
            end_token = self.end_token

            def embed_and_input_proj(inputs):
                return input_layer(
                    tf.nn.embedding_lookup(self.decoder_embeddings, inputs))

            if not self.use_beamsearch_decode:
                # Helper to feed inputs for greedy decoding: uses the argmax of the output
                decoding_helper = seq2seq.GreedyEmbeddingHelper(
                    start_tokens=start_tokens,
                    end_token=end_token,
                    embedding=embed_and_input_proj)
                # Basic decoder performs greedy decoding at each time step
                print("building greedy decoder..")
                inference_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=decoding_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=output_layer)
            else:
                # Beamsearch is used to approximately find the most likely translation
                print("building beamsearch decoder..")
                inference_decoder = beam_search_decoder.BeamSearchDecoder(
                    cell=self.decoder_cell,
                    embedding=embed_and_input_proj,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=self.decoder_initial_state,
                    beam_width=self.beam_width,
                    output_layer=output_layer,
                )

            (self.decoder_outputs_decode, self.decoder_last_state_decode,
             self.decoder_outputs_length_decode) = seq2seq.dynamic_decode(
                 decoder=inference_decoder,
                 output_time_major=False,
                 # impute_finished=True,  # raises an error when enabled
                 maximum_iterations=self.max_decode_step)

            if not self.use_beamsearch_decode:
                # decoder_outputs_decode.sample_id: [batch_size, max_time_step]
                # Or use argmax to find decoder symbols to emit:
                # self.decoder_pred_decode = tf.argmax(self.decoder_outputs_decode.rnn_output,
                #                                      axis=-1, name='decoder_pred_decode')

                # Here, we use expand_dims to be compatible with the result of the beamsearch decoder
                # decoder_pred_decode: [batch_size, max_time_step, 1] (output_time_major=False)
                self.decoder_pred_decode = tf.expand_dims(
                    self.decoder_outputs_decode.sample_id, -1)

            else:
                # Use beam search to approximately find the most likely translation
                # decoder_pred_decode: [batch_size, max_time_step, beam_width] (output_time_major=False)
                self.decoder_pred_decode = self.decoder_outputs_decode.predicted_ids
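As the `embed_and_input_proj` closure above relies on, `GreedyEmbeddingHelper` accepts either an embedding matrix or a callable mapping ids to inputs; a toy sketch with assumed vocabulary size and token ids:

# Toy sketch of the callable-embedding form of GreedyEmbeddingHelper.
import tensorflow as tf
from tensorflow.contrib import seq2seq

embeddings = tf.get_variable('toy_embeddings', [50, 32])  # vocab=50, dim=32
helper = seq2seq.GreedyEmbeddingHelper(
    embedding=lambda ids: tf.nn.embedding_lookup(embeddings, ids),
    start_tokens=tf.fill([8], 1),  # batch of 8, assumed <s> id = 1
    end_token=2)                   # assumed </s> id = 2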
Example #4
def create_model(model,
                 labels,
                 decoder_inputs,
                 batch_size,
                 model_type="decode",
                 sep_positions=None):
  """Creates a classification model.

  Args:
    model: the BERT model from modeling.py
    labels: ground truth paragraph order
    decoder_inputs: the input to the decoder if used
    batch_size: the batch size
    model_type: one of decode, pooled, attn
    sep_positions: (optional) for "pooled", indices of SEP tokens

  Returns:
    tuple of (loss, per_example_loss, logits, probabilities) for model
  """

  output_layer = model.get_pooled_output()

  hidden_size = output_layer.shape[-1].value
  tpu_batch_size = tf.shape(output_layer)[0]

  num_labels = 5  # GOOGLE-INTERNAL TODO(daniter) this shouldn't be hardcoded

  with tf.variable_scope("paragraph_reconstruct"):
    if model_type == "decode":
      lstm_cell = tf.nn.rnn_cell.LSTMCell(
          num_units=hidden_size, use_peepholes=True, state_is_tuple=True)

      def sample_fn(x):
        return tf.to_float(tf.reshape(tf.argmax(x, axis=-1), (-1, 1)))

      helper = FixedSizeInferenceHelper(
          sample_fn=sample_fn,
          sample_shape=[1],
          sample_dtype=tf.float32,
          start_inputs=decoder_inputs[:, 0],
          end_fn=None)

      # Decoder
      project_layer = tf.layers.Dense(
          num_labels, use_bias=False, name="output_projection")

      my_decoder = contrib_seq2seq.BasicDecoder(
          lstm_cell,
          helper,
          tf.nn.rnn_cell.LSTMStateTuple(output_layer, output_layer),
          output_layer=project_layer)

      # Dynamic decoding
      outputs, _, _ = contrib_seq2seq.dynamic_decode(
          my_decoder,
          swap_memory=True,
          scope="paragraph_reconstruct",
          maximum_iterations=5)

      logits = outputs.rnn_output

      cross_ent = tf.nn.sparse_softmax_cross_entropy_with_logits(
          labels=labels, logits=logits)

      per_example_loss = cross_ent
      loss = tf.reduce_sum(cross_ent) / tf.to_float(batch_size)
      probabilities = tf.nn.softmax(logits, axis=-1)

    # GOOGLE-INTERNAL: TODO(daniter) currently neither of these actually trains
    elif model_type == "pooled":
      token_embeddings = model.get_sequence_output()
      # SEP positions come out batch by batch, so we need to add the batch
      # index. We do that explicitly here since we don't know the batch size
      # in the record decoder.
      batch_idx = tf.range(tpu_batch_size)
      batch_idx = tf.reshape(batch_idx, [tpu_batch_size, 1])
      batch_idx = tf.tile(batch_idx, [1, 5])  # double check
      batch_idx = tf.reshape(batch_idx, [tpu_batch_size, 5, 1])
      # batch_idx = tf.Print(batch_idx, [batch_idx],
      #                      message="batch_idx", summarize=999999)
      sep_positions = tf.concat([batch_idx, sep_positions], axis=2)
      # sep_positions = tf.Print(sep_positions, [sep_positions],
      #                          message="sep_positions", summarize=999999)

      sep_vecs = tf.gather_nd(token_embeddings, sep_positions)
      sep_vecs = tf.reshape(sep_vecs, [tpu_batch_size, 5, hidden_size])
      # sep_vecs = tf.Print(sep_vecs, [sep_vecs], message="sep_vecs",
      #                     summarize=999999)

      logits = tf.layers.dense(
          inputs=sep_vecs, units=num_labels, name="output_projection")
      # logits = tf.Print(logits, [logits], message="logits", summarize=999999)
      cross_ent = tf.nn.sparse_softmax_cross_entropy_with_logits(
          labels=labels, logits=logits)

      per_example_loss = cross_ent
      loss = tf.reduce_sum(cross_ent) / tf.to_float(batch_size)
      probabilities = tf.nn.softmax(logits, axis=-1)

    elif model_type == "attn":
      # change size to match sequence embedding size
      input_consts = tf.constant([0, 1, 2, 3, 4])
      position_encoding = tf.broadcast_to(input_consts, [tpu_batch_size, 5])
      # position_encoding = tf.to_float(
      # tf.reshape(position_encoding, (-1, 5, 1)))
      token_type_table = tf.get_variable(
          name="attention_embedding",
          shape=[5, 512],  # don't hardcode
          initializer=tf.truncated_normal_initializer(stddev=0.02))
      # This vocab will be small so we always do one-hot here, since it is
      # always faster for a small vocabulary.
      flat_token_type_ids = tf.reshape(position_encoding, [-1])
      one_hot_ids = tf.one_hot(flat_token_type_ids, depth=5)
      token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
      token_type_embeddings = tf.reshape(token_type_embeddings,
                                         [tpu_batch_size, 5, 512])

      token_embeddings = model.get_sequence_output()
      attn = modeling.attention_layer(token_type_embeddings, token_embeddings)
      attn = tf.reshape(attn, (-1, 5, 512))  # head size
      logits = tf.layers.dense(
          inputs=attn, units=num_labels, name="output_projection")
      cross_ent = tf.nn.sparse_softmax_cross_entropy_with_logits(
          labels=labels, logits=logits)

      per_example_loss = cross_ent
      loss = tf.reduce_sum(cross_ent) / tf.to_float(batch_size)
      probabilities = tf.nn.softmax(logits, axis=-1)

  return (loss, per_example_loss, logits, probabilities)
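The `pooled` branch above pairs every SEP position with its batch row so that `tf.gather_nd` can select per-example vectors; a toy sketch with one SEP token per example (all sizes assumed):

# Toy illustration of the gather_nd indexing used in the "pooled" branch.
import tensorflow as tf

token_embeddings = tf.reshape(tf.range(2 * 4 * 3), [2, 4, 3])  # [B=2, T=4, H=3]
sep_positions = tf.constant([[[1]], [[3]]])          # SEP index per example
batch_idx = tf.reshape(tf.range(2), [2, 1, 1])       # [[[0]], [[1]]]
indices = tf.concat([batch_idx, sep_positions], 2)   # [[[0, 1]], [[1, 3]]]
sep_vecs = tf.gather_nd(token_embeddings, indices)   # shape [2, 1, 3]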
Example #5
def _build_decoder_action(model, dialogue_state, hparams, start_token,
                          end_token, output_layer):
  """build the decoder for action states."""

  iterator = model.iterator

  start_token_id = tf.cast(
      model.vocab_table.lookup(tf.constant(start_token)), tf.int32)
  end_token_id = tf.cast(
      model.vocab_table.lookup(tf.constant(end_token)), tf.int32)

  start_tokens = tf.fill([model.batch_size], start_token_id)
  end_token = end_token_id

  # kb is not used again
  ## Decoder.
  with tf.variable_scope("action_decoder") as decoder_scope:
    # we initialize the cell with the last layer of the last hidden state
    cell, decoder_initial_state = _build_action_decoder_cell(
        model, hparams, dialogue_state, model.global_gpu_num)
    model.global_gpu_num += 1
    ## Train or eval
    # Case one: train, eval, and mutable train.
    # decoder_emb_inp: [batch_size, max_time, num_units] (time_major=False)
    action = iterator.action
    # shift action
    paddings = tf.constant([[0, 0], [1, 0]])
    action = tf.pad(action, paddings, "CONSTANT", constant_values=0)[:, :-1]
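    # E.g. action [[5, 6, 7]] becomes [[0, 5, 6]]: the targets shifted right by
    # one step, with id 0 serving as the start padding.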
    decoder_emb_inp = tf.nn.embedding_lookup(model.embedding_decoder,
                                             action)

    # Helper
    helper_train = seq2seq.TrainingHelper(
        decoder_emb_inp, iterator.action_len, time_major=False)

    # Decoder
    my_decoder_train = seq2seq.BasicDecoder(
        cell, helper_train, decoder_initial_state, output_layer)

    # Dynamic decoding
    outputs_train, _, _ = seq2seq.dynamic_decode(
        my_decoder_train,
        output_time_major=False,
        swap_memory=True,
        scope=decoder_scope)

    sample_id_train = outputs_train.sample_id
    logits_train = outputs_train.rnn_output
    # inference

    beam_width = hparams.beam_width
    length_penalty_weight = hparams.length_penalty_weight

    if model.mode == tf.estimator.ModeKeys.PREDICT and beam_width > 0:
      my_decoder_infer = seq2seq.BeamSearchDecoder(
          cell=cell,
          embedding=model.embedding_decoder,
          start_tokens=start_tokens,
          end_token=end_token,
          initial_state=decoder_initial_state,
          beam_width=beam_width,
          output_layer=output_layer,
          length_penalty_weight=length_penalty_weight)
    else:
      # Helper
      if model.mode in dialogue_utils.self_play_modes:
        helper_infer = seq2seq.SampleEmbeddingHelper(
            model.embedding_decoder, start_tokens, end_token)
      else:
        helper_infer = seq2seq.GreedyEmbeddingHelper(
            model.embedding_decoder, start_tokens, end_token)

      # Decoder
      my_decoder_infer = seq2seq.BasicDecoder(
          cell,
          helper_infer,
          decoder_initial_state,
          output_layer=output_layer  # applied per timestep
      )

    # Dynamic decoding
    outputs_infer, _, _ = seq2seq.dynamic_decode(
        my_decoder_infer,
        maximum_iterations=hparams.len_action,
        output_time_major=False,
        swap_memory=True,
        scope=decoder_scope)

    if model.mode == tf.estimator.ModeKeys.PREDICT and beam_width > 0:
      logits_infer = tf.no_op()
      sample_id_infer = outputs_infer.predicted_ids
    else:
      logits_infer = outputs_infer.rnn_output
      sample_id_infer = outputs_infer.sample_id

  return logits_train, logits_infer, sample_id_train, sample_id_infer
Example #6
    def build_decoder(self, encoder_outputs, encoder_state):
        """ 
        构建解码器
        """
        with tf.variable_scope('decoder') as decoder_scope:
            (self.decoder_cell,
             self.decoder_initial_state) = self.build_decoder_cell(
                 encoder_outputs, encoder_state)
            # Build the decoder embeddings
            with tf.device(_get_embed_device(self.target_vocab_size)):
                if self.share_embedding:
                    self.decoder_embeddings = self.encoder_embeddings
                elif self.pretrained_embedding:

                    self.decoder_embeddings = tf.Variable(tf.constant(
                        0.0,
                        shape=(self.target_vocab_size, self.embedding_size)),
                                                          trainable=True,
                                                          name='embeddings')
                    self.decoder_embeddings_placeholder = tf.placeholder(
                        tf.float32,
                        (self.target_vocab_size, self.embedding_size))
                    self.decoder_embeddings_init = self.decoder_embeddings.assign(
                        self.decoder_embeddings_placeholder)
                else:
                    self.decoder_embeddings = tf.get_variable(
                        name='embeddings',
                        shape=(self.target_vocab_size, self.embedding_size),
                        initializer=self.initializer,
                        dtype=tf.float32)
            self.decoder_output_projection = layers.Dense(
                self.target_vocab_size,
                dtype=tf.float32,
                use_bias=False,
                name='decoder_output_projection')

            if self.mode == 'train':
                self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                    params=self.decoder_embeddings,
                    ids=self.decoder_inputs_train)

                inputs = self.decoder_inputs_embedded
                if self.time_major:
                    inputs = tf.transpose(inputs, (1, 0, 2))

                training_helper = seq2seq.TrainingHelper(
                    # Supplies the next-step input from the ground-truth sequence
                    inputs=inputs,
                    sequence_length=self.decoder_inputs_length,
                    time_major=self.time_major,
                    name='training_helper')
                # Do not apply output_layer here during training: projecting at
                # every time step is slow. Note that for this trick to work, the
                # scope argument of dynamic_decode must be set.
                training_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=training_helper,
                    initial_state=self.decoder_initial_state
                    # output_layer=self.decoder_output_projection  # projection mapping rnn_size to vocab_size
                )
                # Maximum number of decoder time steps in the current batch
                max_decoder_length = tf.reduce_max(self.decoder_inputs_length)

                outputs, self.final_state, _ = seq2seq.dynamic_decode(
                    decoder=training_decoder,
                    output_time_major=self.time_major,
                    # When True, the final state is copied through and the
                    # outputs are zeroed once a sequence finishes, which gives
                    # correct final states/outputs and ignores finished steps in
                    # backpropagation, at some cost in speed.
                    impute_finished=True,
                    # Maximum number of decoding steps: decoder_inputs_length for
                    # training; at inference, set whatever maximum length you
                    # want (decoding stops at <eos> or at this limit).
                    maximum_iterations=max_decoder_length,
                    # Number of loop iterations to run in parallel
                    parallel_iterations=self.parallel_iterations,
                    swap_memory=True,
                    scope=decoder_scope)

                self.decoder_logits_train = self.decoder_output_projection(
                    outputs.rnn_output)
                self.masks = tf.sequence_mask(
                    # Mask flagging valid vs. padded time steps
                    lengths=self.decoder_inputs_length,
                    maxlen=max_decoder_length,
                    dtype=tf.float32,
                    name='masks')

                decoder_logits_train = self.decoder_logits_train
                if self.time_major:
                    decoder_logits_train = tf.transpose(
                        decoder_logits_train, (1, 0, 2))

                self.decoder_pred_train = tf.argmax(decoder_logits_train,
                                                    axis=-1,
                                                    name='decoder_pred_train')

                self.train_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.decoder_inputs,  # ground truth y
                    logits=decoder_logits_train  # predictions y_
                )

                self.masks_rewards = self.masks * self.rewards

                self.loss_rewards = seq2seq.sequence_loss(
                    logits=decoder_logits_train,  # [batch_size, sequence_length, num_decoder_symbols]
                    targets=self.decoder_inputs,  # [batch_size, sequence_length], no one-hot needed
                    # [batch_size, sequence_length]: the mask, which filters the
                    # padded positions out of the loss
                    weights=self.masks_rewards,
                    average_across_timesteps=True,
                    average_across_batch=True)

                self.loss = seq2seq.sequence_loss(
                    # Sequence loss
                    logits=decoder_logits_train,  # [batch_size, sequence_length, num_decoder_symbols]
                    targets=self.decoder_inputs,  # [batch_size, sequence_length], no one-hot needed
                    weights=self.masks,  # the mask, filtering padding out of the loss
                    average_across_timesteps=True,
                    average_across_batch=True)

                self.loss_add = self.loss + self.add_loss

            elif self.mode == 'decode':
                start_tokens = tf.tile([WordSequence.START], [self.batch_size])
                end_token = WordSequence.END

                def embed_and_input_proj(inputs):
                    return tf.nn.embedding_lookup(self.decoder_embeddings,
                                                  inputs)

                if not self.use_beamsearch_decode:
                    decoding_helper = seq2seq.GreedyEmbeddingHelper(
                        start_tokens=start_tokens,
                        end_token=end_token,
                        embedding=embed_and_input_proj)

                    inference_decoder = seq2seq.BasicDecoder(
                        cell=self.decoder_cell,
                        helper=decoding_helper,
                        initial_state=self.decoder_initial_state,
                        output_layer=self.decoder_output_projection)
                else:
                    inference_decoder = BeamSearchDecoder(
                        cell=self.decoder_cell,
                        embedding=embed_and_input_proj,
                        start_tokens=start_tokens,
                        end_token=end_token,
                        initial_state=self.decoder_initial_state,
                        beam_width=self.beam_width,
                        output_layer=self.decoder_output_projection)

                if self.max_decode_step is not None:
                    max_decoder_step = self.max_decode_step
                else:
                    max_decoder_step = tf.round(
                        tf.reduce_max(self.encoder_inputs_length) * 4)

                self.decoder_outputs_decode, self.final_state, _ = seq2seq.dynamic_decode(
                    decoder=inference_decoder,
                    output_time_major=self.time_major,
                    maximum_iterations=max_decoder_step,
                    parallel_iterations=self.parallel_iterations,
                    swap_memory=True,
                    scope=decoder_scope)

                if not self.use_beamsearch_decode:
                    dod = self.decoder_outputs_decode
                    self.decoder_pred_decode = dod.sample_id
                    if self.time_major:
                        self.decoder_pred_decode = tf.transpose(
                            self.decoder_pred_decode, (1, 0))
                else:
                    self.decoder_pred_decode = self.decoder_outputs_decode.predicted_ids

                    if self.time_major:
                        self.decoder_pred_decode = tf.transpose(
                            self.decoder_pred_decode, (1, 0, 2))
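                    # predicted_ids is [batch, time, beam_width]; transposing to
                    # [batch, beam_width, time] makes each beam a contiguous row.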
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, perm=[0, 2, 1])
                    dod = self.decoder_outputs_decode
                    self.beam_prob = dod.beam_search_decoder_output.scores
Example #7
    def decode(self, encoder_outputs, encoder_state, source_sequence_length):
        with tf.variable_scope("Decoder") as scope:
            beam_width = self.beam_width
            decoder_type = self.decoder_type
            seq_max_len = self.seq_max_len
            batch_size = tf.shape(encoder_outputs)[0]

            if self.path_embed_method == "lstm":
                self.decoder_cell = self._build_decode_cell()
                if self.decoder_type == "beam" and self.beam_width > 0:
                    beam_memory = seq2seq.tile_batch(
                        encoder_outputs, multiplier=self.beam_width)
                    beam_source_sequence_length = seq2seq.tile_batch(
                        source_sequence_length, multiplier=self.beam_width)
                    beam_encoder_state = seq2seq.tile_batch(
                        encoder_state, multiplier=self.beam_width)
                    beam_batch_size = batch_size * self.beam_width
                    attention_mechanism = seq2seq.BahdanauAttention(
                        self.hidden_layer_dim,
                        beam_memory,
                        memory_sequence_length=beam_source_sequence_length)
                    self.beam_decoder_cell = seq2seq.AttentionWrapper(
                        self.decoder_cell,
                        attention_mechanism,
                        attention_layer_size=self.hidden_layer_dim)
                    self.beam_decoder_initial_state = self.beam_decoder_cell.zero_state(
                        beam_batch_size,
                        tf.float32).clone(cell_state=beam_encoder_state)

                memory = encoder_outputs

                attention_mechanism = seq2seq.BahdanauAttention(
                    self.hidden_layer_dim,
                    memory,
                    memory_sequence_length=source_sequence_length)
                self.decoder_cell = seq2seq.AttentionWrapper(
                    self.decoder_cell,
                    attention_mechanism,
                    attention_layer_size=self.hidden_layer_dim)
                self.decoder_initial_state = self.decoder_cell.zero_state(
                    batch_size, tf.float32).clone(cell_state=encoder_state)

            projection_layer = Dense(self.word_vocab_size, use_bias=False)
            """For training the model"""
            if self.mode == "train":
                decoder_train_helper = tf.contrib.seq2seq.TrainingHelper(
                    self.decoder_train_inputs_embedded,
                    self.decoder_train_length)
                decoder_train = seq2seq.BasicDecoder(
                    self.decoder_cell, decoder_train_helper,
                    self.decoder_initial_state, projection_layer)
                decoder_outputs_train, decoder_states_train, decoder_seq_len_train = seq2seq.dynamic_decode(
                    decoder_train)
                decoder_logits_train = decoder_outputs_train.rnn_output
                self.decoder_logits_train = tf.reshape(
                    decoder_logits_train,
                    [batch_size, -1, self.word_vocab_size])
            """For test the model"""
            # if self.mode == "infer" or self.if_pred_on_dev:
            if decoder_type == "greedy":
                decoder_infer_helper = seq2seq.GreedyEmbeddingHelper(
                    self.word_embeddings, tf.ones([batch_size],
                                                  dtype=tf.int32), self.EOS)
                decoder_infer = seq2seq.BasicDecoder(
                    self.decoder_cell, decoder_infer_helper,
                    self.decoder_initial_state, projection_layer)
            elif decoder_type == "beam":
                decoder_infer = seq2seq.BeamSearchDecoder(
                    cell=self.beam_decoder_cell,
                    embedding=self.word_embeddings,
                    start_tokens=tf.ones([batch_size], dtype=tf.int32),
                    end_token=self.EOS,
                    initial_state=self.beam_decoder_initial_state,
                    beam_width=beam_width,
                    output_layer=projection_layer)

            decoder_outputs_infer, decoder_states_infer, decoder_seq_len_infer = seq2seq.dynamic_decode(
                decoder_infer, maximum_iterations=seq_max_len)

            if decoder_type == "beam":
                self.decoder_logits_infer = tf.no_op()
                self.sample_id = decoder_outputs_infer.predicted_ids

            elif decoder_type == "greedy":
                self.decoder_logits_infer = decoder_outputs_infer.rnn_output
                self.sample_id = decoder_outputs_infer.sample_id
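`seq2seq.tile_batch` above repeats each batch entry `beam_width` times so the encoder memory, lengths, and state line up with the beam-expanded decoder batch; a toy sketch:

# Toy illustration of seq2seq.tile_batch as used for the beam search memory.
import tensorflow as tf
from tensorflow.contrib import seq2seq

memory = tf.constant([[1], [2]])                  # batch of 2
tiled = seq2seq.tile_batch(memory, multiplier=3)  # [[1], [1], [1], [2], [2], [2]]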
Example #8
    def _build_main_graph(self, xs, xlens, ys, ylens):
        with tf.variable_scope('word_model', reuse=self._reuse_vars):
            embeds = self._variable(
                'embeddings',
                dtype=tf.float32,
                shape=[self._word_symbols, self._word_embedding_size])

            with tf.variable_scope('encoder', reuse=self._reuse_vars):
                fw_cells = self._rnn_cells(self._word_model_rnn_hidden_size,
                                           self._word_model_rnn_layers // 2)
                bw_cells = self._rnn_cells(self._word_model_rnn_hidden_size,
                                           self._word_model_rnn_layers // 2)

                batch_input_embeds = tf.nn.embedding_lookup(embeds, xs)

                rnn_out, rnn_state = tf.nn.bidirectional_dynamic_rnn(
                    fw_cells,
                    bw_cells,
                    batch_input_embeds,
                    xlens,
                    dtype=tf.float32)

            with tf.variable_scope('decoder', reuse=self._reuse_vars):
                # Attention only consumes encoder outputs.
                attention = seq2seq.LuongAttention(
                    self._decoder_attention_size, tf.concat(rnn_out, -1),
                    xlens)
                cells = self._rnn_cells(self._word_model_rnn_hidden_size,
                                        self._word_model_rnn_layers)
                cells = seq2seq.AttentionWrapper(cells, attention)
                decode_init_state = cells.zero_state(self._batch_size,
                                                     tf.float32)

                # This layer sits just before softmax. It seems that if an activation is placed here,
                # the network will not converge well. Why?
                def apply_dropout(v):
                    if self._mode == 'train':
                        return tf.nn.dropout(v, KEEP)
                    else:
                        return v

                final_projection = tf.layers.Dense(
                    self._word_symbols,
                    kernel_regularizer=apply_dropout,
                    use_bias=False)
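                # Note: a kernel_regularizer is only invoked once on the kernel
                # to produce a regularization loss term; returning a dropped-out
                # copy of the kernel here does not change the weights used in
                # the forward pass, so this is not standard activation dropout.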

                if self._mode != 'infer':
                    batch_target_embeds = tf.nn.embedding_lookup(embeds, ys)
                    helper = seq2seq.TrainingHelper(batch_target_embeds, ylens)
                    decoder = seq2seq.BasicDecoder(cells, helper,
                                                   decode_init_state,
                                                   final_projection)
                    (logits,
                     ids), state, lengths = seq2seq.dynamic_decode(decoder)
                    return logits, ids, lengths
                else:
                    helper = seq2seq.GreedyEmbeddingHelper(
                        embeds, tf.tile([self._start_token],
                                        [self._batch_size]), self._end_token)
                    decoder = seq2seq.BasicDecoder(cells, helper,
                                                   decode_init_state,
                                                   final_projection)
                    max_iters = tf.reduce_max(xlens) * 2
                    (logits, ids), state, lengths = seq2seq.dynamic_decode(
                        decoder, maximum_iterations=max_iters)
                    return logits, ids, lengths
Example #9
    def build_decoder(self):
        with tf.variable_scope("Decoder"):
            with tf.name_scope("Decoder_cell"):
                # Create a single cell
                def get_a_cell(lstm_size, keep_prob):
                    lstm = tf.nn.rnn_cell.BasicLSTMCell(lstm_size)
                    drop = tf.nn.rnn_cell.DropoutWrapper(
                        lstm, output_keep_prob=keep_prob)
                    return drop

                # Stack multiple layers of cells
                cell = tf.nn.rnn_cell.MultiRNNCell([
                    get_a_cell(self.lstm_size, self.keep_prob)
                    for _ in range(self.n_layers)
                ])

            with tf.name_scope("Decoder_Dense"):
                # Output dense (fully connected) layer
                output_layer = tf.layers.Dense(
                    units=self.data.decoder_vocab_size,
                    kernel_initializer=tf.truncated_normal_initializer(
                        mean=0.0, stddev=0.1))

            with tf.variable_scope("decoder"):
                # Create the training helper
                training_helper = tcs.TrainingHelper(
                    inputs=self.decoder_input_embedding,
                    sequence_length=self.decoder_target_sequence_length,
                    time_major=False)
                # Build the training decoder
                training_decoder = tcs.BasicDecoder(
                    cell=cell,
                    helper=training_helper,
                    initial_state=self.encoder_final_state,
                    output_layer=output_layer)

                self.training_decoder_output, \
                self.training_decoder_final_state, \
                self.training_decoder_final_sequence_lengths = tcs.dynamic_decode(decoder=training_decoder,
                                                                                  output_time_major=False,
                                                                                  impute_finished=True,
                                                                                  maximum_iterations=self.decoder_max_target_sequence_length)
            # prediction
            with tf.variable_scope("decoder", reuse=True):
                # Create a constant tensor and tile it to batch_size
                start_tokens = tf.tile(
                    tf.constant([self.data.decoder_word_to_int['<GO>']],
                                dtype=tf.int32), [self.batch_size],
                    name='start_tokens')

                predicting_helper = tcs.GreedyEmbeddingHelper(
                    embedding=self.decoder_embedding,
                    start_tokens=start_tokens,
                    end_token=self.data.decoder_word_to_int['<EOS>'])

                predicting_decoder = tcs.BasicDecoder(
                    cell=cell,
                    helper=predicting_helper,
                    initial_state=self.encoder_final_state,
                    output_layer=output_layer)

                self.predicting_decoder_output, \
                self.predicting_decoder_final_state, \
                self.predicting_decoder_final_sequence_lengths = tcs.dynamic_decode(decoder=predicting_decoder,
                                                                                    output_time_major=False,
                                                                                    impute_finished=True,
                                                                                    maximum_iterations=self.decoder_max_target_sequence_length)
Example #10
    def build_decoder(self, encoder_outputs, encoder_state):
        """
        Build the decoder: multiple LSTM layers with a global attention mechanism.
        """

        sos_id_2 = tf.cast(self.dict_lab2idx_tgt[self.SOS], tf.int32)
        eos_id_2 = tf.cast(self.dict_lab2idx_tgt[self.EOS], tf.int32)

        self.output_layer = Dense(self.vocab_size_tgt,
                                  name='output_projection')

        # Decoder.
        with tf.variable_scope("decoder") as decoder_scope:

            cell, decoder_initial_state = self.build_decoder_cell(
                encoder_outputs, encoder_state, self.seq_len_src)

            # Train
            if self.mode != 'INFER':

                # TF helper for the training decoder: feeds back the target,
                # not the predicted value.
                helper = s2s.TrainingHelper(self.word_embeddings_tgt,
                                            self.seq_len_tgt)

                # Training decoder
                decoder = s2s.BasicDecoder(cell,
                                           helper,
                                           decoder_initial_state,
                                           output_layer=self.output_layer)

                # Dynamic decoding
                outputs, final_context_state, _ = s2s.dynamic_decode(
                    decoder,
                    maximum_iterations=self.maximum_iterations,
                    swap_memory=False,
                    impute_finished=True,
                    scope=decoder_scope)
                # Outputs of decoding
                sample_id = outputs.sample_id
                logits = outputs.rnn_output

            else:
                start_tokens = tf.fill([self.batch_size], sos_id_2)
                end_token = eos_id_2

                # TF helper for the inference decoder: feeds back the predicted
                # value rather than the target (unlike training).
                # NOTE: there must be a bug in the helper or in the way I feed
                # the inputs: training reaches very high accuracy (close to
                # perfect translation) with TrainingHelper, but performance
                # drops at inference time with GreedyEmbeddingHelper, giving
                # inaccurate translations. My guess is that the targets are
                # shifted somewhere, so that TrainingHelper feeds the decoder
                # target t at time t and the decoder simply learns the
                # identity; GreedyEmbeddingHelper feeds back the last
                # prediction instead, so the network repeats the same word, as
                # if it had learned nothing.
                # This is essentially the reason why I was not able to move
                # further, finish training the CoVe vectors, and reproduce the
                # results.
                helper = s2s.GreedyEmbeddingHelper(self.embedding_tgt,
                                                   start_tokens, end_token)

                decoder = s2s.BasicDecoder(cell,
                                           helper,
                                           decoder_initial_state,
                                           output_layer=self.output_layer)

                # Dynamic decoding
                outputs, final_context_state, _ = s2s.dynamic_decode(
                    decoder,
                    maximum_iterations=self.maximum_iterations,
                    impute_finished=False,
                    swap_memory=False,
                    scope=decoder_scope)

                logits = outputs.rnn_output
                sample_id = outputs.sample_id

        self.logits = logits
        self.sample_id = sample_id

        return logits, sample_id, final_context_state
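The shift suspected in the NOTE above is the usual fix: the decoder input at step t should be the target from step t-1. A minimal sketch with assumed <sos>/<eos> ids (not taken from the example above):

# Hypothetical illustration of the standard one-step shift between decoder
# inputs and decoder targets.
import tensorflow as tf

targets = tf.constant([[11, 12, 13]])                # [batch, time]
sos = tf.fill([tf.shape(targets)[0], 1], 1)          # assumed <sos> id = 1
eos = tf.fill([tf.shape(targets)[0], 1], 2)          # assumed <eos> id = 2
decoder_inputs = tf.concat([sos, targets], axis=1)   # [[1, 11, 12, 13]]
decoder_targets = tf.concat([targets, eos], axis=1)  # [[11, 12, 13, 2]]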
Example #11
    def _model(self):
        graph = tf.Graph()
        with graph.as_default():
            embedding = tf.Variable(np.zeros(
                shape=[self.num_words, self.embedding_size], dtype=np.float32),
                                    trainable=False,
                                    name='embedding')  # word embedding table
            lr = tf.placeholder(tf.float32, [], name='learning_rate')
            # Input data
            x_input = tf.placeholder(tf.int32, [None, None],
                                     name='x_input')  # input sequences X
            x_sequence_length = tf.placeholder(tf.int32, [None],
                                               name='x_length')  # length of each X
            x_embedding = tf.nn.embedding_lookup(embedding,
                                                 x_input)  # map input ids to vectors
            y_input = tf.placeholder(tf.int32, [None, None],
                                     name='y_input')  # target sequences Y
            y_sequence_length = tf.placeholder(tf.int32, [None],
                                               name='y_length')  # length of each Y
            y_embedding = tf.nn.embedding_lookup(embedding, y_input)  # embed Y
            batch_size = tf.placeholder(tf.int32, [], name='batch_size')
            # batch_size = tf.shape(x_input)[0]
            # Use GRUs instead of LSTMs, stacking 4 cells
            encoder_cell = rnn.MultiRNNCell(
                [rnn.GRUCell(128, activation=tf.tanh) for _ in range(4)])
            decoder_cell = rnn.MultiRNNCell(
                [rnn.GRUCell(128, activation=tf.tanh) for _ in range(4)])
            # Run the encoder
            output, encoder_state = tf.nn.dynamic_rnn(
                cell=encoder_cell,
                inputs=x_embedding,
                initial_state=encoder_cell.zero_state(batch_size, tf.float32),
                sequence_length=x_sequence_length)

            attention_mechanism = seq2seq.BahdanauAttention(
                64, output, x_sequence_length)
            attention_cell = seq2seq.AttentionWrapper(decoder_cell,
                                                      attention_mechanism)
            decoder_cell = rnn.OutputProjectionWrapper(attention_cell,
                                                       64,
                                                       activation=tf.tanh)
            encoder_state = decoder_cell.zero_state(
                batch_size, tf.float32).clone(cell_state=encoder_state)

            output_layer = tf.layers.Dense(
                self.num_words,
                kernel_initializer=tf.truncated_normal_initializer(mean=0.0,
                                                                   stddev=0.1))

            with tf.variable_scope(tf.get_variable_scope(),
                                   reuse=tf.AUTO_REUSE):
                # Define the training decoder
                training_helper = seq2seq.TrainingHelper(
                    inputs=y_embedding, sequence_length=y_sequence_length)
                training_decoder = seq2seq.BasicDecoder(
                    decoder_cell, training_helper, encoder_state, output_layer)
                # When impute_finished is True, no further computation happens once a sequence reads <eos>: the state is held constant and the outputs are zeroed
                training_output, _, _ = seq2seq.dynamic_decode(
                    training_decoder,
                    # leave room for <GO> and <EOS>
                    maximum_iterations=self.max_sentence_length + 2,
                    impute_finished=True)

                # predict decoder
                predict_helper = seq2seq.GreedyEmbeddingHelper(
                    embedding, tf.fill([batch_size], self.word2index['GO']),
                    self.word2index['EOS'])
                predict_decoder = seq2seq.BasicDecoder(decoder_cell,
                                                       predict_helper,
                                                       encoder_state,
                                                       output_layer)
                predict_output, _, _ = seq2seq.dynamic_decode(
                    predict_decoder,
                    maximum_iterations=self.max_sentence_length + 2,
                    impute_finished=True)

            # loss function
            training_logits = tf.identity(training_output.rnn_output,
                                          name='training_logits')
            predicting_logits = tf.identity(predict_output.rnn_output,
                                            name='predicting')

            masks = tf.sequence_mask(y_sequence_length,
                                     dtype=tf.float32,
                                     name='mask')

            with tf.variable_scope('optimization'):
                loss = seq2seq.sequence_loss(training_logits, y_input, masks)
                optimizer = tf.train.AdamOptimizer(lr)
                gradients = optimizer.compute_gradients(loss)
                capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var)
                                    for grad, var in gradients
                                    if grad is not None]
                train_op = optimizer.apply_gradients(capped_gradients)

        return graph, loss, train_op, predicting_logits
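A minimal sketch of driving the returned graph in a session. The feed keys match the placeholder names defined in _model() above; the `model` handle and the zero-filled batch are illustrative assumptions only (real feeds must be consistent with self.max_sentence_length).

# Hedged sketch: feeding the graph returned by _model() by tensor name.
import numpy as np
import tensorflow as tf

graph, loss, train_op, predicting_logits = model._model()  # `model`: hypothetical instance
with graph.as_default():
    init_op = tf.global_variables_initializer()
feed = {
    'x_input:0': np.zeros([2, 10], dtype=np.int32),
    'x_length:0': np.full([2], 10, dtype=np.int32),
    'y_input:0': np.zeros([2, 12], dtype=np.int32),
    'y_length:0': np.full([2], 12, dtype=np.int32),
    'batch_size:0': 2,
    'learning_rate:0': 1e-3,
}
with tf.Session(graph=graph) as sess:
    sess.run(init_op)
    batch_loss, _ = sess.run([loss, train_op], feed_dict=feed)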
    def BuildNetwork(self, learningRate):
        #############################################################################
        # Input Data
        #############################################################################

        self.dataInput = tensorflow.placeholder(
            dtype=tensorflow.float32,
            shape=[None, None, self.featureShape],
            name='dataInput')
        self.labelInput = tensorflow.placeholder(dtype=tensorflow.int32,
                                                 shape=[None, None],
                                                 name='labelInput')
        self.dataLenInput = tensorflow.placeholder(dtype=tensorflow.int32,
                                                   shape=[None],
                                                   name='dataLenInput')
        self.labelLenInput = tensorflow.placeholder(dtype=tensorflow.int32,
                                                    shape=[None],
                                                    name='labelLenInput')

        #############################################################################
        # Batch Parameters
        #############################################################################

        self.parameters['BatchSize'], self.parameters[
            'TimeStep'], _ = tensorflow.unstack(
                tensorflow.shape(input=self.dataInput, name='DataShape'))
        self.parameters['LabelStep'] = tensorflow.shape(input=self.labelInput,
                                                        name='LabelShape')[1]

        ###################################################################################################
        # Encoder
        ###################################################################################################

        with tensorflow.variable_scope('Encoder'):
            self.parameters[
                'Encoder_Cell_Forward'] = tensorflow.nn.rnn_cell.MultiRNNCell(
                    cells=[
                        rnn.LSTMCell(num_units=self.hiddenNodules)
                        for _ in range(self.rnnLayers)
                    ],
                    state_is_tuple=True)
            self.parameters[
                'Encoder_Cell_Backward'] = tensorflow.nn.rnn_cell.MultiRNNCell(
                    cells=[
                        rnn.LSTMCell(num_units=self.hiddenNodules)
                        for _ in range(self.rnnLayers)
                    ],
                    state_is_tuple=True)

            self.parameters['Encoder_Output'], self.parameters['Encoder_FinalState'] = \
                tensorflow.nn.bidirectional_dynamic_rnn(
                    cell_fw=self.parameters['Encoder_Cell_Forward'], cell_bw=self.parameters['Encoder_Cell_Backward'],
                    inputs=self.dataInput, sequence_length=self.dataLenInput, dtype=tensorflow.float32)

        if self.attention is None:
            self.parameters['Decoder_InitalState'] = []
            for index in range(self.rnnLayers):
                self.parameters[
                    'Encoder_Cell_Layer%d' % index] = rnn.LSTMStateTuple(
                        c=tensorflow.concat([
                            self.parameters['Encoder_FinalState'][index][0].c,
                            self.parameters['Encoder_FinalState'][index][1].c
                        ],
                                            axis=1),
                        h=tensorflow.concat([
                            self.parameters['Encoder_FinalState'][index][0].h,
                            self.parameters['Encoder_FinalState'][index][1].h
                        ],
                                            axis=1))
                self.parameters['Decoder_InitalState'].append(
                    self.parameters['Encoder_Cell_Layer%d' % index])
            self.parameters['Decoder_InitalState'] = tuple(
                self.parameters['Decoder_InitalState'])
        else:
            self.attentionList = self.attention(
                dataInput=self.parameters['Encoder_Output'],
                scopeName=self.attentionName,
                hiddenNoduleNumber=2 * self.hiddenNodules,
                attentionScope=self.attentionScope,
                blstmFlag=True)
            self.parameters['Decoder_InitalState'] = []
            for index in range(self.rnnLayers):
                self.parameters[
                    'Encoder_Cell_Layer%d' % index] = rnn.LSTMStateTuple(
                        c=self.attentionList['FinalResult'],
                        h=tensorflow.concat([
                            self.parameters['Encoder_FinalState'][index][0].h,
                            self.parameters['Encoder_FinalState'][index][1].h
                        ],
                                            axis=1))
                self.parameters['Decoder_InitalState'].append(
                    self.parameters['Encoder_Cell_Layer%d' % index])
            self.parameters['Decoder_InitalState'] = tuple(
                self.parameters['Decoder_InitalState'])

        #############################################################################
        # Decoder Label Pretreatment
        #############################################################################

        self.parameters['DecoderEmbedding'] = tensorflow.Variable(
            initial_value=tensorflow.truncated_normal(
                shape=[VOCABULAR, self.hiddenNodules * 2], stddev=0.1),
            name='DecoderEmbedding')

        self.parameters[
            'DecoderEmbeddingResult'] = tensorflow.nn.embedding_lookup(
                params=self.parameters['DecoderEmbedding'],
                ids=self.labelInput,
                name='DecoderEmbeddingResult')

        #############################################################################
        # Decoder
        #############################################################################

        self.parameters['Decoder_Helper'] = seq2seq.TrainingHelper(
            inputs=self.parameters['DecoderEmbeddingResult'],
            sequence_length=self.labelLenInput,
            name='Decoder_Helper')
        with tensorflow.variable_scope('Decoder'):
            self.parameters['Decoder_FC'] = Dense(VOCABULAR)

            self.parameters[
                'Decoder_Cell'] = tensorflow.nn.rnn_cell.MultiRNNCell(
                    cells=[
                        rnn.LSTMCell(num_units=self.hiddenNodules * 2)
                        for _ in range(self.rnnLayers)
                    ],
                    state_is_tuple=True)

            self.parameters['Decoder'] = seq2seq.BasicDecoder(
                cell=self.parameters['Decoder_Cell'],
                helper=self.parameters['Decoder_Helper'],
                initial_state=self.parameters['Decoder_InitalState'],
                output_layer=self.parameters['Decoder_FC'])

            self.parameters['Decoder_Logits'], self.parameters[
                'Decoder_FinalState'], self.parameters[
                    'Decoder_FinalSeq'] = seq2seq.dynamic_decode(
                        decoder=self.parameters['Decoder'])

        with tensorflow.name_scope('Loss'):
            self.parameters['TargetsReshape'] = tensorflow.reshape(
                tensor=self.labelInput, shape=[-1], name='TargetsReshape')
            self.parameters['Decoder_Reshape'] = tensorflow.reshape(
                self.parameters['Decoder_Logits'].rnn_output, [-1, VOCABULAR],
                name='Decoder_Reshape')
            self.parameters[
                'Cost'] = tensorflow.losses.sparse_softmax_cross_entropy(
                    labels=self.parameters['TargetsReshape'],
                    logits=self.parameters['Decoder_Reshape'])

            self.train = tensorflow.train.AdamOptimizer(
                learning_rate=learningRate).minimize(self.parameters['Cost'])
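The non-attention branch above concatenates each layer's forward and backward final states, which is why the decoder cell is built with num_units=self.hiddenNodules * 2. A minimal sketch of that merge in isolation (batch size and hidden width here are illustrative, not from the class):

# Hedged sketch: merging bidirectional LSTM final states into one
# double-width LSTMStateTuple, mirroring the per-layer loop above.
import tensorflow as tf
from tensorflow.contrib import rnn

batch, hidden = 4, 128
fw = rnn.LSTMStateTuple(c=tf.zeros([batch, hidden]), h=tf.zeros([batch, hidden]))
bw = rnn.LSTMStateTuple(c=tf.zeros([batch, hidden]), h=tf.zeros([batch, hidden]))
merged = rnn.LSTMStateTuple(
    c=tf.concat([fw.c, bw.c], axis=1),  # [batch, 2 * hidden]
    h=tf.concat([fw.h, bw.h], axis=1))  # [batch, 2 * hidden]
# a decoder LSTMCell(num_units=2 * hidden) can consume `merged` directly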
Example #13
    def _build_decoder(self):
        with tf.variable_scope("dialog_decoder"):
            with tf.variable_scope("decoder_output_projection"):  # 全连接层
                output_layer = layers_core.Dense(
                    self.config.vocab_size,
                    use_bias=False,
                    name="output_projection")  # units单元个数  词表大小

            with tf.variable_scope("decoder_rnn"):
                attn_mech = tc_seq2seq.BahdanauAttention(
                    self.config.dec_hidden_size, self.word_outputs, None)
                attn_mech1 = tc_seq2seq.BahdanauAttention(
                    self.config.dec_hidden_size, self.uttn_outputs, None)
                attn_mech2 = tc_seq2seq.BahdanauAttention(
                    self.config.dec_hidden_size, self.encoder_outputs, None)

                self.att1 = attn_mech.batch_size
                self.att2 = attn_mech1.batch_size
                self.att3 = attn_mech2.batch_size

                dec_cell = GRUCell(self.config.dec_hidden_size)

                #dec_cell = grucell_cond.GRUCellCond(self.config.dec_hidden_size)
                #self.encoder_outputs = tf.reshape(self.encoder_outputs,[-1,self.config.dec_hidden_size*2])
                #dec_cell = grucell_cond.CondWrapper(dec_cell, self.encoder_outputs)
                #word_outputs = tf.reshape(self.word_outputs,[self.batch_size,-1])

                dec_cell = EAttentionWrapper(
                    dec_cell, [attn_mech, attn_mech1, attn_mech2],
                    attention_layer_size=[
                        self.config.dec_hidden_size,
                        self.config.dec_hidden_size,
                        self.config.dec_hidden_size
                    ])
                #print('self.batch_size',self.batch_size)
                dec_init_state = dec_cell.zero_state(
                    batch_size=self.batch_size, dtype=tf.float32)

                # Training or Eval
                if self.mode != ModelMode.infer:  # not infer, do decode turn by turn

                    resp_emb_inp = tf.nn.embedding_lookup(
                        self.decoder_embeddings, self.target_input)
                    helper = tc_seq2seq.TrainingHelper(resp_emb_inp,
                                                       self.target_length)
                    decoder = tc_seq2seq.BasicDecoder(
                        cell=dec_cell,
                        helper=helper,
                        initial_state=dec_init_state,  # final state of the encoder
                        output_layer=output_layer  # fully connected layer
                    )

                    dec_outputs, dec_state, _ = tc_seq2seq.dynamic_decode(
                        decoder)
                    sample_id = dec_outputs.sample_id
                    logits = dec_outputs.rnn_output

                else:
                    start_tokens = tf.fill([self.batch_size],
                                           self.config.sos_idx)
                    end_token = self.config.eos_idx
                    maximum_iterations = tf.to_int32(self.config.infer_max_len)

                    helper = tc_seq2seq.GreedyEmbeddingHelper(
                        self.decoder_embeddings,
                        start_tokens=start_tokens,
                        end_token=tf.constant(end_token, dtype=tf.int32))

                    decoder = tc_seq2seq.BasicDecoder(
                        cell=dec_cell,
                        helper=helper,
                        initial_state=dec_init_state,
                        output_layer=output_layer  # fully connected layer
                    )

                    dec_outputs, dec_state, _ = tc_seq2seq.dynamic_decode(
                        decoder, maximum_iterations=maximum_iterations)
                    logits = tf.no_op()  # no logits tensor is exposed in inference mode
                    sample_id = dec_outputs.sample_id

                self.logits = logits
                self.sample_id = sample_id
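The EAttentionWrapper above combines three Bahdanau mechanisms over different memories. For reference, the stock tf.contrib.seq2seq.AttentionWrapper also accepts a list of mechanisms with per-mechanism attention_layer_size values; a minimal sketch with illustrative shapes (the memories and sizes are assumptions, not taken from this model):

# Hedged sketch: one decoder cell attending over two memories at once.
import tensorflow as tf
from tensorflow.contrib import seq2seq
from tensorflow.contrib.rnn import GRUCell

memory_a = tf.zeros([8, 20, 64])  # e.g. word-level encoder outputs
memory_b = tf.zeros([8, 5, 64])   # e.g. utterance-level encoder outputs
mechanisms = [seq2seq.BahdanauAttention(64, memory_a),
              seq2seq.BahdanauAttention(64, memory_b)]
cell = seq2seq.AttentionWrapper(GRUCell(64), mechanisms,
                                attention_layer_size=[64, 64])
init_state = cell.zero_state(batch_size=8, dtype=tf.float32)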
Example #14
def get_model(config, embeddings=None, num_words=None, stitch_inputs=None):
    inputs = dict()
    outputs = dict()

    if stitch_inputs is None:
        inputs['x'] = tf.placeholder(tf.int32, shape=[None, None], name="x")
        inputs['y'] = tf.placeholder(tf.int32, shape=[None, None], name="y")
        inputs['seq_length'] = tf.placeholder(tf.int32,
                                              shape=[None],
                                              name="seq_length")
    else:
        inputs['x'] = stitch_inputs['x']
        inputs['y'] = stitch_inputs['y']
        inputs['seq_length'] = stitch_inputs['seq_length']

    if embeddings is None:
        logging.info('initialize embeddings')
        embeddings = tf.get_variable(
            name="embedding",
            shape=[num_words, config['embedding_size']],
            dtype=tf.float32,
            initializer=tf.random_normal_initializer(stddev=0.1),
            trainable=True)
    else:
        logging.info('use pretrained embeddings')
        logging.info('embeddings trainable: {}'.format(
            config.get('embedding_trainable', False)))
        embeddings = tf.get_variable(
            "embeddings",
            shape=embeddings.shape,
            initializer=tf.constant_initializer(embeddings),
            trainable=config.get('embedding_trainable', False))

    inputs['input_keep_prob'] = tf.placeholder_with_default(
        tf.constant(1, dtype=tf.float32), shape=[], name="input_keep_prob")
    inputs['output_keep_prob'] = tf.placeholder_with_default(
        tf.constant(1, dtype=tf.float32), shape=[], name="output_keep_prob")
    inputs['learning_rate'] = tf.placeholder_with_default(tf.constant(
        config['learning_rate'], dtype=tf.float32),
                                                          shape=[],
                                                          name="learning_rate")
    batch_size = tf.shape(inputs['x'])[0]

    def create_cell():
        rnn_cell_type = config.get('rnn_cell', 'lnlstm')
        if rnn_cell_type == 'lstm':
            logging.info('Use LSTMBlockCell cell')
            _cell = tf.contrib.rnn.LSTMBlockCell(config['rnn_size'])
        else:
            logging.info('Use LayerNormBasicLSTMCell cell')
            _cell = tf.contrib.rnn.LayerNormBasicLSTMCell(config['rnn_size'])
        _cell = tf.nn.rnn_cell.DropoutWrapper(
            _cell,
            input_keep_prob=inputs['input_keep_prob'],
            output_keep_prob=inputs['output_keep_prob'])
        return _cell

    cells = [create_cell() for _ in range(config['num_layers'])]
    cell = tf.nn.rnn_cell.MultiRNNCell(cells)

    x_embedded = tf.nn.embedding_lookup(embeddings, inputs['x'])
    helper = seq2seq.TrainingHelper(x_embedded, inputs['seq_length'])

    projection_layer = Dense(embeddings.shape[0],
                             name='projection_layer',
                             use_bias=True,
                             dtype=tf.float32)
    initial_state = cell.zero_state(batch_size, dtype=tf.float32)
    mask = sequence_mask(inputs['seq_length'], dtype=tf.float32)

    decoder = seq2seq.BasicDecoder(cell, helper, initial_state=initial_state)
    decode_output, _, _ = seq2seq.dynamic_decode(decoder,
                                                 impute_finished=True,
                                                 swap_memory=config.get(
                                                     'swap_memory', False))

    if config.get('sampled_softmax', 0) > 0:
        projection_layer.build(input_shape=decode_output.rnn_output.shape)

        def _sampled_loss(labels, logits):
            return tf.nn.sampled_softmax_loss(
                tf.transpose(projection_layer.kernel),
                projection_layer.bias,
                tf.expand_dims(labels, -1),
                logits,
                num_sampled=config['sampled_softmax'],
                num_classes=num_words)

        softmax_loss_function = _sampled_loss
        logits_input = decode_output.rnn_output
    else:
        softmax_loss_function = None
        logits_input = projection_layer(decode_output.rnn_output)

    losses = seq2seq.sequence_loss(logits_input,
                                   inputs['y'],
                                   mask,
                                   softmax_loss_function=softmax_loss_function,
                                   average_across_batch=False,
                                   average_across_timesteps=False)

    outputs['total_loss'] = tf.reduce_sum(losses)
    outputs['num_tokens'] = tf.reduce_sum(mask)

    outputs['loss'] = outputs['total_loss'] / outputs['num_tokens']
    outputs['perplexity'] = tf.exp(outputs['loss'], name='perplexity')

    if stitch_inputs is not None:
        losses = seq2seq.sequence_loss(logits_input,
                                       inputs['y'],
                                       mask,
                                       average_across_batch=False)
        outputs['losses'] = tf.identity(losses, name='losses')
        outputs['perplexities'] = tf.exp(losses, name='perplexities')

    if stitch_inputs is None:
        with tf.variable_scope('Optimizer'):
            optimizer_name = config.get('optimizer', 'sgd')
            if optimizer_name == 'adam':
                logging.info('use adam optimizer')
                optimizer = tf.train.AdamOptimizer(
                    learning_rate=inputs['learning_rate'])
            else:
                logging.info('use sgd optimizer')
                optimizer = tf.contrib.opt.MomentumWOptimizer(
                    weight_decay=config['weight_decay'],
                    learning_rate=inputs['learning_rate'],
                    momentum=config['momentum'])
            if config.get('aggregation_method', 'default') == 'experimental':
                logging.info('use gradient aggregation method: experimental')
                gradient_var_pairs = optimizer.compute_gradients(
                    outputs['total_loss'],
                    var_list=tf.trainable_variables(),
                    aggregation_method=tf.AggregationMethod.
                    EXPERIMENTAL_ACCUMULATE_N)
            else:
                logging.info('use gradient aggregation method: default')
                gradient_var_pairs = optimizer.compute_gradients(
                    outputs['total_loss'], var_list=tf.trainable_variables())
            grad_vars = [x[1] for x in gradient_var_pairs if x[0] is not None]
            gradients = [x[0] for x in gradient_var_pairs if x[0] is not None]
            gc = config.get('gradient_clipping', 120.0)
            gradients, _ = tf.clip_by_global_norm(gradients, clip_norm=gc)
            outputs['train_op'] = optimizer.apply_gradients(
                zip(gradients, grad_vars))

    return inputs, outputs
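When config['sampled_softmax'] > 0, the full projection is replaced during training by a sampled softmax built from the projection layer's own kernel and bias. A minimal sketch of that call in isolation (all shapes and variable names here are illustrative assumptions):

# Hedged sketch: sampled_softmax_loss expects weights of shape
# [num_classes, dim], hence the transpose of the Dense kernel ([dim, num_classes]).
import tensorflow as tf

dim, num_classes, num_sampled = 128, 10000, 512
inputs = tf.zeros([32, dim])             # decoder outputs, one step flattened
labels = tf.zeros([32], dtype=tf.int64)  # target token ids
kernel = tf.get_variable('proj_kernel', [dim, num_classes])
bias = tf.get_variable('proj_bias', [num_classes])
loss = tf.nn.sampled_softmax_loss(
    weights=tf.transpose(kernel),        # -> [num_classes, dim]
    biases=bias,
    labels=tf.expand_dims(labels, -1),   # [batch, num_true=1]
    inputs=inputs,
    num_sampled=num_sampled,
    num_classes=num_classes)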
Example #15
    def build_decoder(self, encoder_outputs, encoder_state):
        """构建解码器
        """
        with tf.variable_scope('decoder') as decoder_scope:
            # Building decoder_cell and decoder_initial_state
            (
                self.decoder_cell,
                self.decoder_initial_state
            ) = self.build_decoder_cell(encoder_outputs, encoder_state)

            # decoder embedding
            with tf.device(_get_embed_device(self.target_vocab_size)):
                if self.share_embedding:
                    self.decoder_embeddings = self.encoder_embeddings
                elif self.pretrained_embedding:

                    self.decoder_embeddings = tf.Variable(
                        tf.constant(
                            0.0,
                            shape=(self.target_vocab_size,
                                   self.embedding_size)
                        ),
                        trainable=True,
                        name='embeddings'
                    )
                    self.decoder_embeddings_placeholder = tf.placeholder(
                        tf.float32,
                        (self.target_vocab_size, self.embedding_size)
                    )
                    self.decoder_embeddings_init = \
                        self.decoder_embeddings.assign(
                            self.decoder_embeddings_placeholder)
                else:
                    self.decoder_embeddings = tf.get_variable(
                        name='embeddings',
                        shape=(self.target_vocab_size, self.embedding_size),
                        initializer=self.initializer,
                        dtype=tf.float32
                    )

            self.decoder_output_projection = layers.Dense(
                self.target_vocab_size,
                dtype=tf.float32,
                use_bias=False,
                name='decoder_output_projection'
            )

            if self.mode == 'train':
                # decoder_inputs_embedded:
                # [batch_size, max_time_step + 1, embedding_size]
                self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                    params=self.decoder_embeddings,
                    ids=self.decoder_inputs_train
                )

                # Helper to feed inputs for training:
                # read inputs from dense ground truth vectors
                inputs = self.decoder_inputs_embedded

                if self.time_major:
                    inputs = tf.transpose(inputs, (1, 0, 2))

                training_helper = seq2seq.TrainingHelper(
                    inputs=inputs,
                    sequence_length=self.decoder_inputs_length,
                    time_major=self.time_major,
                    name='training_helper'
                )

                # output_layer is not applied here during training, because
                # projecting at every time_step would be slow; note that for
                # this trick to work, dynamic_decode's scope argument must be set
                training_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=training_helper,
                    initial_state=self.decoder_initial_state,
                    # output_layer=self.decoder_output_projection
                )

                # Maximum decoder time_steps in current batch
                max_decoder_length = tf.reduce_max(
                    self.decoder_inputs_length
                )

                # decoder_outputs_train: BasicDecoderOutput
                #     namedtuple(rnn_outputs, sample_id)
                # decoder_outputs_train.rnn_output:
                #     if output_time_major=False:
                #         [batch_size, max_time_step + 1, num_decoder_symbols]
                #     if output_time_major=True:
                #         [max_time_step + 1, batch_size, num_decoder_symbols]
                # decoder_outputs_train.sample_id: [batch_size], tf.int32

                (
                    outputs,
                    self.final_state, # contain attention
                    _ # self.final_sequence_lengths
                ) = seq2seq.dynamic_decode(
                    decoder=training_decoder,
                    output_time_major=self.time_major,
                    impute_finished=True,
                    maximum_iterations=max_decoder_length,
                    parallel_iterations=self.parallel_iterations,
                    swap_memory=True,
                    scope=decoder_scope
                )

                # More efficient to do the projection
                # on the batch-time-concatenated tensor
                # logits_train:
                # [batch_size, max_time_step + 1, num_decoder_symbols]
                # during training, the output_layer projection is applied to
                # all time steps at once; the official NMT library reports a
                # 10-20% speedup from this, and in practice I saw even more
                self.decoder_logits_train = self.decoder_output_projection(
                    outputs.rnn_output
                )

                # masks: masking for valid and padded time steps,
                # [batch_size, max_time_step + 1]
                self.masks = tf.sequence_mask(
                    lengths=self.decoder_inputs_length,
                    maxlen=max_decoder_length,
                    dtype=tf.float32, name='masks'
                )

                # Computes per word average cross-entropy over a batch
                # Internally calls
                # 'nn_ops.sparse_softmax_cross_entropy_with_logits' by default

                decoder_logits_train = self.decoder_logits_train
                if self.time_major:
                    decoder_logits_train = tf.transpose(decoder_logits_train,
                                                        (1, 0, 2))

                self.decoder_pred_train = tf.argmax(
                    decoder_logits_train, axis=-1,
                    name='decoder_pred_train')

                # the variables below support special training schemes
                # custom rewards: in effect, a modification of the masks
                # train_entropy = cross entropy
                self.train_entropy = \
                    tf.nn.sparse_softmax_cross_entropy_with_logits(
                        labels=self.decoder_inputs,
                        logits=decoder_logits_train)

                self.masks_rewards = self.masks * self.rewards

                self.loss_rewards = seq2seq.sequence_loss(
                    logits=decoder_logits_train,
                    targets=self.decoder_inputs,
                    weights=self.masks_rewards,
                    average_across_timesteps=True,
                    average_across_batch=True,
                )

                self.loss = seq2seq.sequence_loss(
                    logits=decoder_logits_train,
                    targets=self.decoder_inputs,
                    weights=self.masks,
                    average_across_timesteps=True,
                    average_across_batch=True,
                )

                self.loss_add = self.loss + self.add_loss

            elif self.mode == 'decode':
                # inference mode, not training

                start_tokens = tf.tile(
                    [WordSequence.START],
                    [self.batch_size]
                )
                end_token = WordSequence.END

                def embed_and_input_proj(inputs):
                    """输入层的投影层wrapper
                    """
                    return tf.nn.embedding_lookup(
                        self.decoder_embeddings,
                        inputs
                    )

                if not self.use_beamsearch_decode:
                    # Helper to feed inputs for greedy decoding:
                    # uses the argmax of the output
                    decoding_helper = seq2seq.GreedyEmbeddingHelper(
                        start_tokens=start_tokens,
                        end_token=end_token,
                        embedding=embed_and_input_proj
                    )
                    # Basic decoder performs greedy decoding at each time step
                    # print("building greedy decoder..")
                    inference_decoder = seq2seq.BasicDecoder(
                        cell=self.decoder_cell,
                        helper=decoding_helper,
                        initial_state=self.decoder_initial_state,
                        output_layer=self.decoder_output_projection
                    )
                else:
                    # Beamsearch is used to approximately
                    # find the most likely translation
                    # print("building beamsearch decoder..")
                    inference_decoder = BeamSearchDecoder(
                        cell=self.decoder_cell,
                        embedding=embed_and_input_proj,
                        start_tokens=start_tokens,
                        end_token=end_token,
                        initial_state=self.decoder_initial_state,
                        beam_width=self.beam_width,
                        output_layer=self.decoder_output_projection,
                    )

                # For GreedyDecoder, return
                # decoder_outputs_decode: BasicDecoderOutput instance
                #     namedtuple(rnn_outputs, sample_id)
                # decoder_outputs_decode.rnn_output:
                # if output_time_major=False:
                #     [batch_size, max_time_step, num_decoder_symbols]
                # if output_time_major=True
                #     [max_time_step, batch_size, num_decoder_symbols]
                # decoder_outputs_decode.sample_id:
                # if output_time_major=False
                #     [batch_size, max_time_step], tf.int32
                # if output_time_major=True
                #     [max_time_step, batch_size], tf.int32

                # For BeamSearchDecoder, return
                # decoder_outputs_decode: FinalBeamSearchDecoderOutput instance
                #     namedtuple(predicted_ids, beam_search_decoder_output)
                # decoder_outputs_decode.predicted_ids:
                # if output_time_major=False:
                #     [batch_size, max_time_step, beam_width]
                # if output_time_major=True
                #     [max_time_step, batch_size, beam_width]
                # decoder_outputs_decode.beam_search_decoder_output:
                #     BeamSearchDecoderOutput instance
                #     namedtuple(scores, predicted_ids, parent_ids)

                # a potential maximum-length choice mentioned in the official docs;
                # changed here to * 4
                # maximum_iterations = tf.round(tf.reduce_max(source_sequence_length) * 2)
                # https://www.tensorflow.org/tutorials/seq2seq

                if self.max_decode_step is not None:
                    max_decode_step = self.max_decode_step
                else:
                    # by default, decode up to 4x the input length
                    max_decode_step = tf.round(tf.reduce_max(
                        self.encoder_inputs_length) * 4)

                (
                    self.decoder_outputs_decode,
                    self.final_state,
                    _ # self.decoder_outputs_length_decode
                ) = (seq2seq.dynamic_decode(
                    decoder=inference_decoder,
                    output_time_major=self.time_major,
                    # impute_finished=True,	# error occurs
                    maximum_iterations=max_decode_step,
                    parallel_iterations=self.parallel_iterations,
                    swap_memory=True,
                    scope=decoder_scope
                ))

                if not self.use_beamsearch_decode:
                    # decoder_outputs_decode.sample_id:
                    #     [batch_size, max_time_step]
                    # Or use argmax to find decoder symbols to emit:
                    # self.decoder_pred_decode = tf.argmax(
                    #     self.decoder_outputs_decode.rnn_output,
                    #     axis=-1, name='decoder_pred_decode')

                    # Here, we use expand_dims to be compatible with
                    # the result of the beamsearch decoder
                    # decoder_pred_decode:
                    #     [batch_size, max_time_step, 1] (output_major=False)

                    # self.decoder_pred_decode = tf.expand_dims(
                    #     self.decoder_outputs_decode.sample_id,
                    #     -1
                    # )

                    dod = self.decoder_outputs_decode
                    self.decoder_pred_decode = dod.sample_id

                    if self.time_major:
                        self.decoder_pred_decode = tf.transpose(
                            self.decoder_pred_decode, (1, 0))

                else:
                    # Use beam search to approximately
                    # find the most likely translation
                    # decoder_pred_decode:
                    # [batch_size, max_time_step, beam_width] (output_major=False)
                    self.decoder_pred_decode = \
                        self.decoder_outputs_decode.predicted_ids

                    if self.time_major:
                        self.decoder_pred_decode = tf.transpose(
                            self.decoder_pred_decode, (1, 0, 2))

                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode,
                        perm=[0, 2, 1])
                    dod = self.decoder_outputs_decode
                    self.beam_prob = dod.beam_search_decoder_output.scores
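After the transposes above, decoder_pred_decode in the beam-search branch has shape [batch_size, beam_width, max_time_step], with beam 0 the highest-scoring hypothesis. A minimal consumption sketch (the random array is a stand-in for a fetched decoder_pred_decode):

# Hedged sketch: pulling the best hypothesis per example out of
# beam-search predictions with this layout.
import numpy as np

predicted = np.random.randint(0, 100, size=(4, 5, 12))  # [batch, beam, time]
best_per_example = predicted[:, 0, :]  # beam 0 = most likely translation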
Example #16
    def inference(self):
        with tf.variable_scope("embedding"):
            embedding = tf.get_variable(
                "embedding",
                shape=[self.vocab_size, self.embedding_size],
                initializer=tf.truncated_normal_initializer(stddev=0.1,
                                                            dtype=tf.float32))
            encoder_input_data_embedding = tf.nn.embedding_lookup(
                embedding, self.encoder_input_data)
            decoder_input_data_embedding = tf.nn.embedding_lookup(
                embedding, self.decoder_input_data)

        with tf.variable_scope("encoder"):
            en_lstm1 = rnn.BasicLSTMCell(256)
            en_lstm1 = rnn.DropoutWrapper(en_lstm1,
                                          output_keep_prob=self.keep_prob)
            en_lstm2 = rnn.BasicLSTMCell(256)
            en_lstm2 = rnn.DropoutWrapper(en_lstm2,
                                          output_keep_prob=self.keep_prob)
            encoder_cell_fw = rnn.MultiRNNCell([en_lstm1])
            encoder_cell_bw = rnn.MultiRNNCell([en_lstm2])
        bi_encoder_outputs, bi_encoder_state = tf.nn.bidirectional_dynamic_rnn(
            encoder_cell_fw,
            encoder_cell_bw,
            encoder_input_data_embedding,
            sequence_length=self.input_seq_len,
            dtype=tf.float32)
        encoder_outputs = tf.concat(bi_encoder_outputs, -1)
        encoder_state = []
        for layer_id in range(1):  # layer_num
            encoder_state.append(bi_encoder_state[0][layer_id])  # forward
            encoder_state.append(bi_encoder_state[1][layer_id])  # backward
        encoder_state = tuple(encoder_state)

        with tf.variable_scope("decoder"):
            de_lstm1 = rnn.BasicLSTMCell(256)
            de_lstm1 = rnn.DropoutWrapper(de_lstm1,
                                          output_keep_prob=self.keep_prob)
            de_lstm2 = rnn.BasicLSTMCell(256)
            de_lstm2 = rnn.DropoutWrapper(de_lstm2,
                                          output_keep_prob=self.keep_prob)
            decoder_cell = rnn.MultiRNNCell([de_lstm1, de_lstm2])

            attention_mechanism = seq2seq.LuongAttention(
                256, encoder_outputs, self.input_seq_len)
            decoder_cell = seq2seq.AttentionWrapper(decoder_cell,
                                                    attention_mechanism, 256)
            decoder_initial_state = decoder_cell.zero_state(self.batch_size,
                                                            dtype=tf.float32)
            decoder_initial_state = decoder_initial_state.clone(
                cell_state=encoder_state)

            output_projection = Dense(self.vocab_size,
                                      name="output_projection")
            if self.is_train:
                helper = seq2seq.TrainingHelper(decoder_input_data_embedding,
                                                self.output_seq_len)
                decoder = seq2seq.BasicDecoder(decoder_cell,
                                               helper,
                                               decoder_initial_state,
                                               output_layer=output_projection)
                decoder_outputs, _, _ = seq2seq.dynamic_decode(decoder)
                logits = decoder_outputs.rnn_output
                pred = decoder_outputs.sample_id
            else:
                # #################SampleEmbedding#################
                helper = seq2seq.SampleEmbeddingHelper(
                    embedding,
                    start_tokens=[input_data.GO_ID] * self.batch_size,
                    end_token=input_data.EOS_ID)
                # #################GreedyEmbedding#################
                # helper = seq2seq.GreedyEmbeddingHelper(embedding,
                #                                        start_tokens=[input_data.GO_ID] * self.batch_size,
                #                                        end_token=input_data.EOS_ID)
                decoder = seq2seq.BasicDecoder(decoder_cell,
                                               helper,
                                               decoder_initial_state,
                                               output_layer=output_projection)
                decoder_outputs, _, _ = seq2seq.dynamic_decode(
                    decoder, maximum_iterations=10)
                logits = decoder_outputs.rnn_output
                pred = decoder_outputs.sample_id
            return logits, pred
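The inference branch above uses SampleEmbeddingHelper, which draws the next token from the output distribution, whereas the commented-out GreedyEmbeddingHelper takes the argmax. A minimal side-by-side sketch (the vocabulary size and GO/EOS ids here are assumptions for illustration):

# Hedged sketch: stochastic vs. greedy decoding helpers.
import tensorflow as tf
from tensorflow.contrib import seq2seq

embedding = tf.get_variable('embedding_demo', [100, 32])  # illustrative vocab of 100
start_tokens = tf.fill([8], 1)                            # assumed GO id = 1
eos_id = 2                                                # assumed EOS id
sampled_helper = seq2seq.SampleEmbeddingHelper(
    embedding, start_tokens=start_tokens, end_token=eos_id,
    softmax_temperature=0.8)  # <1.0 sharpens, >1.0 flattens the distribution
greedy_helper = seq2seq.GreedyEmbeddingHelper(
    embedding, start_tokens=start_tokens, end_token=eos_id)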
Example #17
def get_train_model(hps, vocab_size, img_feature_dim):
    # img_feature_dim: dimensionality of the image features
    batch_size = hps.batch_size

    img_feature = tf.placeholder(tf.float32, (batch_size, img_feature_dim))
    sentence = tf.placeholder(tf.int32, (batch_size, None))
    input_sentence_len = tf.placeholder(tf.int32, shape=(batch_size,))
    mask = tf.placeholder(tf.float32, (batch_size, None))
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    global_step = tf.Variable(tf.zeros([], tf.int64), name='global_step', trainable=False)

    # prediction process
    # sentence: [a,b,c,d,e] is the ground truth
    # input: [img,a,b,c,d,e], so img_feature must be reshaped to look like an
    # embedded word, allowing the two to be concatenated along dimension 1
    # img_feature: [0.4,0.3,10,2,5]

    # the actual prediction scenario:
    # predict #1: img_feature -> embedding_img -> lstm -> a
    # predict #2: a -> embedding_word -> lstm -> (b)
    # predict #3: b -> embedding_word -> lstm -> (c)
    # predict #4: c -> embedding_word -> lstm -> (d)
    # predict #5: d -> embedding_word -> lstm -> (e)
    # predict #6: e -> embedding_word -> lstm -> eos

    # Sets up the embedding layer.
    embedding_initializer = tf.random_uniform_initializer(-1.0, 1.0)
    with tf.variable_scope('embedding', initializer=embedding_initializer):
        embeddings = tf.get_variable(
            'embeddings',
            [vocab_size, hps.num_embedding_nodes],
            tf.float32)

        embed_token_ids = tf.nn.embedding_lookup(embeddings, sentence[:, 0:-1])  # drops the last position, which is mostly padding anyway

    img_feature_embed_init = tf.uniform_unit_scaling_initializer(factor=1.0)
    with tf.variable_scope('image_feature_embed', initializer=img_feature_embed_init):
        embed_img = tf.layers.dense(img_feature, hps.num_embedding_nodes)
        embed_img = tf.layers.batch_normalization(embed_img)
        embed_img = tf.nn.relu(embed_img)


        embed_img = tf.expand_dims(embed_img, 1)  # expand along dimension 1,
        # giving embed_img the shape (batch_size, 1, 32)
        # embed_token_ids has shape (batch_size, num_timesteps - 1, 32),
        # so the two are concatenated below
        embed_inputs = tf.concat([embed_img, embed_token_ids], axis=1)
        # embed_inputs now has shape (batch_size, num_timesteps, 32)

    decoder_output_projection = layers.Dense(
        vocab_size,
        dtype=tf.float32,
        use_bias=False,
        name='decoder_output_projection'
    )

    # Sets up LSTM network.
    scale = 1.0 / math.sqrt(hps.num_embedding_nodes + hps.num_lstm_nodes[-1]) / 3.0
    lstm_init = tf.random_uniform_initializer(-scale, scale)
    with tf.variable_scope('lstm_nn', initializer=lstm_init) as train_scope:
        cells = []
        for i in range(hps.num_lstm_layers):
            cell = create_rnn_cell(hps.num_lstm_nodes[i], hps.cell_type)
            cell = dropout(cell, keep_prob)
            cells.append(cell)
        cell = tf.contrib.rnn.MultiRNNCell(cells)

        initial_state = cell.zero_state(hps.batch_size, tf.float32)
        # rnn_outputs: [batch_size, num_timesteps, hps.num_lstm_nodes[-1]]

        # a seq2seq helper class for feeding decoder inputs.
        training_helper = seq2seq.TrainingHelper(
            inputs=embed_inputs,
            sequence_length=input_sentence_len,
            time_major=False,
            name='training_helper'
        )

        training_decoder = seq2seq.BasicDecoder(
            cell=cell,
            helper=training_helper,
            initial_state=initial_state
        )

        # maximum number of decoder time_steps in the current batch
        max_decoder_length = tf.reduce_max(
            input_sentence_len
        )

        (
            outputs,
            final_state,
            final_sequence_lengths
        ) = seq2seq.dynamic_decode(
            decoder=training_decoder,
            output_time_major=False,
            impute_finished=True,
            maximum_iterations=max_decoder_length,
            swap_memory=True,
            scope=train_scope
        )
        # rnn_outputs, _ = tf.nn.dynamic_rnn(cell,
        #                                    embed_inputs,
        #                                    initial_state=initial_state)  # dynamic_rnn also takes a sequence_length argument,
        # but in practice it is ignored here, presumably because the LSTM input has the
        # image feature prepended, and the mask marks data vs. padding positions

        rnn_outputs = outputs.rnn_output
        print('rnn_outputs ', rnn_outputs)


    # Sets up the fully-connected layer. The dense layer acts on the 3rd dimension
    # of [batch_size, num_timesteps, hps.num_lstm_nodes[-1]], so batch_size and
    # num_timesteps would be merged into one dimension via reshape.
    # fc_init = tf.uniform_unit_scaling_initializer(factor=1.0)
    # with tf.variable_scope('lstm_nn/fc', initializer=fc_init):
    #     rnn_outputs_2d = tf.reshape(rnn_outputs, [-1, hps.num_lstm_nodes[-1]])
    #     fc1 = tf.layers.dense(rnn_outputs_2d, hps.num_fc_nodes, name='fc1')
    #     fc1_dropout = tf.contrib.layers.dropout(fc1, keep_prob)
    #     fc1_dropout = tf.nn.relu(fc1_dropout)
    #     logits = tf.layers.dense(fc1_dropout, vocab_size, name='logits')

        decoder_logits_train = decoder_output_projection(
            outputs.rnn_output
        )

    masks = tf.sequence_mask(
        lengths=input_sentence_len,
        maxlen=max_decoder_length,
        dtype=tf.float32,
        name='masks'
    )

    with tf.variable_scope('loss'):

        '''
        Inside tf.nn.sparse_softmax_cross_entropy_with_logits three things happen:
        1. softmax is applied to the logits
        2. the labels are one-hot encoded; the encoded labels then have shape
           (sentence, vocab_size), the same size as the logits, so the cross
           entropy can be computed against them
        3. the cross entropy between the two is computed
        '''

        loss = seq2seq.sequence_loss(
            logits=decoder_logits_train,
            targets=sentence,
            weights=masks,
            average_across_timesteps=True,
            average_across_batch=True
        )

        logits_flatted = tf.reshape(decoder_logits_train, (-1, vocab_size))
        prediction = tf.argmax(logits_flatted, 1, output_type=tf.int32)
        sentence_flatten = tf.reshape(sentence, [-1])  # the LSTM outputs were flattened, so sentence must be flattened too
        mask_flatten = tf.reshape(mask, [-1])
        mask_flatten = tf.cast(mask_flatten, tf.float32)

        correct_prediction = tf.equal(prediction, sentence_flatten)
        print(correct_prediction.get_shape())
        print(mask_flatten.get_shape())
        correct_prediction_with_mask = tf.multiply(
            tf.cast(correct_prediction, tf.float32),
            mask_flatten)

        mask_sum = tf.reduce_sum(mask_flatten)
        accuracy = tf.reduce_sum(correct_prediction_with_mask) / mask_sum

        tf.summary.scalar('loss', loss)

    with tf.variable_scope('train_op'):
        tvars = tf.trainable_variables()
        for var in tvars:
            logging.info("variable name: %s" % (var.name))
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(loss, tvars), hps.clip_lstm_grads)

        for grad, var in zip(grads, tvars):
            tf.summary.histogram('%s_grad' % (var.name), grad)
        optimizer = tf.train.AdamOptimizer(hps.learning_rate)
        train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)

    return ((img_feature, sentence, input_sentence_len, mask, keep_prob),
            (loss, accuracy, train_op),
            global_step)
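The accuracy above counts correct predictions only on real (mask == 1) positions. A minimal numpy sketch of the same computation (the toy arrays are illustrative):

# Hedged sketch: masked accuracy over a flattened batch.
import numpy as np

prediction = np.array([3, 5, 2, 0, 0])
target     = np.array([3, 5, 9, 0, 0])
mask       = np.array([1., 1., 1., 0., 0.])  # last two positions are padding
accuracy = ((prediction == target) * mask).sum() / mask.sum()  # -> 2/3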
    def build_decoder(self):
        print('build decoder...')
        with tf.variable_scope('decoder'):
            self.decoder_cell, self.decoder_initial_state = \
                self.build_decoder_cell()

            self.decoder_embedding = tf.get_variable(
                name='embedding',
                shape=[self.para.decoder_vocab_size, self.para.embedding_size],
                dtype=self.dtype)
            output_projection_layer = Dense(units=self.para.decoder_vocab_size,
                                            name='output_projection')

            if self.para.mode == 'train':
                self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                    params=self.decoder_embedding, ids=self.decoder_inputs)

                if self.para.scheduled_sampling == 0:
                    training_helper = seq2seq.TrainingHelper(
                        inputs=self.decoder_inputs_embedded,
                        sequence_length=self.decoder_inputs_len,
                        name='training_helper')
                else:
                    self.sampling_probability = tf.cond(
                        self.global_step < self.para.start_decay_step * 2,
                        lambda: tf.cast(tf.divide(
                            self.global_step, self.para.start_decay_step * 2),
                                        dtype=self.dtype),
                        lambda: tf.constant(1.0, dtype=self.dtype),
                        name='sampling_probability')
                    training_helper = seq2seq.ScheduledEmbeddingTrainingHelper(
                        inputs=self.decoder_inputs_embedded,
                        sequence_length=self.decoder_inputs_len,
                        embedding=self.decoder_embedding,
                        sampling_probability=self.sampling_probability,
                        name='training_helper')

                training_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=training_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=output_projection_layer)
                max_decoder_length = tf.reduce_max(self.decoder_inputs_len)
                self.decoder_outputs, decoder_states, decoder_outputs_len = \
                    seq2seq.dynamic_decode(
                        decoder=training_decoder,
                        maximum_iterations=max_decoder_length
                    )

                rnn_output = self.decoder_outputs.rnn_output
                # rnn_output should be padded to max_len
                # calculation of loss will be handled by masks
                self.rnn_output_padded = tf.pad(
                    rnn_output,
                    [[0, 0],
                     [0, self.para.max_len - tf.shape(rnn_output)[1]],
                     [0, 0]])
                self.loss = self.compute_loss(logits=self.rnn_output_padded,
                                              labels=self.decoder_targets)

            elif self.para.mode == 'test':
                start_tokens = tf.fill([self.para.batch_size], 1)

                if self.para.beam_search == 0:
                    inference_helper = seq2seq.GreedyEmbeddingHelper(
                        start_tokens=start_tokens,
                        end_token=2,
                        embedding=self.decoder_embedding)
                    inference_decoder = seq2seq.BasicDecoder(
                        cell=self.decoder_cell,
                        helper=inference_helper,
                        initial_state=self.decoder_initial_state,
                        output_layer=output_projection_layer)
                else:
                    inference_decoder = seq2seq.BeamSearchDecoder(
                        cell=self.decoder_cell,
                        embedding=self.decoder_embedding,
                        start_tokens=start_tokens,
                        end_token=2,
                        initial_state=self.decoder_initial_state,
                        beam_width=self.para.beam_width,
                        output_layer=output_projection_layer)

                self.decoder_outputs, decoder_states, decoder_outputs_len = \
                    seq2seq.dynamic_decode(
                        decoder=inference_decoder,
                        maximum_iterations=self.para.max_len
                    )
                if self.para.beam_search == 0:
                    # self.decoder_predicted_ids: [batch_size, max_len, 1]
                    self.decoder_predicted_ids = tf.expand_dims(
                        input=self.decoder_outputs.sample_id, axis=-1)
                else:
                    # self.decoder_predicted_ids: [batch_size, <= max_len, beam_width]
                    self.decoder_predicted_ids = self.decoder_outputs.predicted_ids
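The scheduled-sampling branch above ramps sampling_probability linearly from 0 to 1 over the first 2 * start_decay_step steps, then holds it at 1. The same schedule as a plain function (a sketch, restating the tf.cond without TensorFlow):

# Hedged sketch: the scheduled-sampling probability ramp.
def sampling_probability(global_step, start_decay_step):
    ramp_end = start_decay_step * 2
    return min(global_step / float(ramp_end), 1.0)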
    def build_decoder(self):
        with tf.variable_scope("decoder"):
            decoder_cell, decoder_initial_state = self.build_decoder_cell()

            # start tokens : [batch_size], which is fed to BeamsearchDecoder during inference
            start_tokens = tf.ones([self.batch_size],
                                   dtype=tf.int32) * data_util.ID_GO
            end_token = data_util.ID_EOS
            input_layer = Dense(self.state_size,
                                dtype=tf.float32,
                                name="input_layer")
            output_layer = Dense(self.decoder_vocab_size,
                                 name="output_projection")
            if self.mode == "train":
                # feed ground truth decoder input token every time step
                decoder_input_lookup = tf.nn.embedding_lookup(
                    self.embedding_matrix, self.decoder_input)
                decoder_input_lookup = input_layer(decoder_input_lookup)
                training_helper = seq2seq.TrainingHelper(
                    inputs=decoder_input_lookup,
                    sequence_length=self.decoder_train_len,
                    name="training_helper")
                training_decoder = seq2seq.BasicDecoder(
                    cell=decoder_cell,
                    initial_state=decoder_initial_state,
                    helper=training_helper,
                    output_layer=output_layer)

                # decoder_outputs_train: BasicDecoderOutput
                #                        namedtuple(rnn_outputs, sample_id)
                # decoder_outputs_train.rnn_output: [batch_size, max_time_step + 1, num_decoder_symbols] if output_time_major=False
                #                                   [max_time_step + 1, batch_size, num_decoder_symbols] if output_time_major=True
                # decoder_outputs_train.sample_id: [batch_size], tf.int32
                max_decoder_len = tf.reduce_max(self.decoder_train_len)
                decoder_outputs_train, final_state, _ = seq2seq.dynamic_decode(
                    training_decoder,
                    impute_finished=True,
                    swap_memory=True,
                    maximum_iterations=max_decoder_len)
                self.decoder_logits_train = tf.identity(
                    decoder_outputs_train.rnn_output)
                decoder_pred = tf.argmax(self.decoder_logits_train, axis=2)
                # sequence mask for get valid sequence except zero padding
                weights = tf.sequence_mask(self.decoder_len,
                                           maxlen=max_decoder_len,
                                           dtype=tf.float32)
                # compute cross entropy loss for all sequence prediction and ignore loss from zero padding
                self.loss = seq2seq.sequence_loss(
                    logits=self.decoder_logits_train,
                    targets=self.decoder_target,
                    weights=weights,
                    average_across_batch=True,
                    average_across_timesteps=True)
                tf.summary.scalar("loss", self.loss)

                with tf.variable_scope("train_optimizer") and tf.device(
                        "/device:GPU:1"):
                    # use AdamOptimizer and clip gradient by max_norm 5.0
                    # use global step for counting every iteration
                    params = tf.trainable_variables()
                    gradients = tf.gradients(self.loss, params)
                    clipped_gradients, _ = tf.clip_by_global_norm(
                        gradients, 5.0)
                    learning_rate = tf.train.exponential_decay(
                        self.lr, self.global_step, 100000, 0.96)
                    opt = tf.train.AdagradOptimizer(learning_rate)

                    self.train_op = opt.apply_gradients(
                        zip(clipped_gradients, params),
                        global_step=self.global_step)

            elif self.mode == "test":

                def embedding_proj(inputs):
                    return input_layer(
                        tf.nn.embedding_lookup(self.embedding_matrix, inputs))

                inference_decoder = seq2seq.BeamSearchDecoder(
                    cell=decoder_cell,
                    embedding=embedding_proj,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=decoder_initial_state,
                    beam_width=self.beam_depth,
                    output_layer=output_layer)

                # For GreedyDecoder, return
                # decoder_outputs_decode: BasicDecoderOutput instance
                #                         namedtuple(rnn_outputs, sample_id)
                # decoder_outputs_decode.rnn_output: [batch_size, max_time_step, num_decoder_symbols] 	if output_time_major=False
                #                                    [max_time_step, batch_size, num_decoder_symbols] 	if output_time_major=True
                # decoder_outputs_decode.sample_id: [batch_size, max_time_step], tf.int32		if output_time_major=False
                #                                   [max_time_step, batch_size], tf.int32               if output_time_major=True

                # For BeamSearchDecoder, return
                # decoder_outputs_decode: FinalBeamSearchDecoderOutput instance
                #                         namedtuple(predicted_ids, beam_search_decoder_output)
                # decoder_outputs_decode.predicted_ids: [batch_size, max_time_step, beam_width] if output_time_major=False
                #                                       [max_time_step, batch_size, beam_width] if output_time_major=True
                # decoder_outputs_decode.beam_search_decoder_output: BeamSearchDecoderOutput instance
                #                                                    namedtuple(scores, predicted_ids, parent_ids)
                with tf.device("/device:GPU:1"):
                    decoder_outputs, decoder_last_state, decoder_output_length = \
                        seq2seq.dynamic_decode(decoder=inference_decoder,
                                               output_time_major=False,
                                               swap_memory=False,
                                               maximum_iterations=self.max_iter)
                    self.decoder_pred_test = decoder_outputs.predicted_ids
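The training branch above decays the learning rate with tf.train.exponential_decay(self.lr, self.global_step, 100000, 0.96). Without the staircase option this is a smooth schedule; a minimal plain-Python restatement of the formula (not library code):

# Hedged sketch: lr(step) = base_lr * decay_rate ** (step / decay_steps)
def decayed_lr(base_lr, global_step, decay_steps=100000, decay_rate=0.96):
    return base_lr * decay_rate ** (global_step / float(decay_steps))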
def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
                   target_dict_dim, is_generating, beam_size,
                   max_generation_length):
    src_word_idx = tf.placeholder(tf.int32, shape=[None, None])
    src_sequence_length = tf.placeholder(tf.int32, shape=[None])

    src_embedding_weights = tf.get_variable("source_word_embeddings",
                                            [source_dict_dim, embedding_dim])
    src_embedding = tf.nn.embedding_lookup(src_embedding_weights, src_word_idx)

    src_forward_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
    src_reversed_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
    # no peephole
    encoder_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=src_forward_cell,
        cell_bw=src_reversed_cell,
        inputs=src_embedding,
        sequence_length=src_sequence_length,
        dtype=tf.float32)

    # concat the forward outputs and backward outputs
    encoded_vec = tf.concat(encoder_outputs, axis=2)

    # project the encoder outputs to the size of the decoder lstm;
    # encoded_vec has 2 * encoder_size features (forward + backward outputs)
    encoded_proj = tf.contrib.layers.fully_connected(
        inputs=tf.reshape(encoded_vec, shape=[-1, encoder_size * 2]),
        num_outputs=decoder_size,
        activation_fn=None,
        biases_initializer=None)
    encoded_proj_reshape = tf.reshape(
        encoded_proj, shape=[-1, tf.shape(encoded_vec)[1], decoder_size])

    # get init state for decoder lstm's H: the backward RNN's first output
    # summarizes the whole source sequence
    backward_first = tf.slice(encoder_outputs[1], [0, 0, 0], [-1, 1, -1])
    decoder_boot = tf.contrib.layers.fully_connected(
        inputs=tf.reshape(backward_first, shape=[-1, encoder_size]),
        num_outputs=decoder_size,
        activation_fn=tf.nn.tanh,
        biases_initializer=None)

    # prepare the initial state for decoder lstm
    cell_init = tf.zeros(tf.shape(decoder_boot), tf.float32)
    initial_state = LSTMStateTuple(cell_init, decoder_boot)

    # create decoder lstm cell
    decoder_cell = LSTMCellWithSimpleAttention(
        decoder_size,
        encoded_vec if not is_generating else seq2seq.tile_batch(
            encoded_vec, beam_size),
        encoded_proj_reshape if not is_generating else seq2seq.tile_batch(
            encoded_proj_reshape, beam_size),
        src_sequence_length if not is_generating else seq2seq.tile_batch(
            src_sequence_length, beam_size),
        forget_bias=0.0)

    output_layer = Dense(target_dict_dim, name='output_projection')

    if not is_generating:
        trg_word_idx = tf.placeholder(tf.int32, shape=[None, None])
        trg_sequence_length = tf.placeholder(tf.int32, shape=[None])
        trg_embedding_weights = tf.get_variable(
            "target_word_embeddings", [target_dict_dim, embedding_dim])
        trg_embedding = tf.nn.embedding_lookup(trg_embedding_weights,
                                               trg_word_idx)

        training_helper = seq2seq.TrainingHelper(
            inputs=trg_embedding,
            sequence_length=trg_sequence_length,
            time_major=False,
            name='training_helper')

        training_decoder = seq2seq.BasicDecoder(cell=decoder_cell,
                                                helper=training_helper,
                                                initial_state=initial_state,
                                                output_layer=output_layer)

        # get the max length of target sequence
        max_decoder_length = tf.reduce_max(trg_sequence_length)

        decoder_outputs_train, _, _ = seq2seq.dynamic_decode(
            decoder=training_decoder,
            output_time_major=False,
            impute_finished=True,
            maximum_iterations=max_decoder_length)

        decoder_logits_train = tf.identity(decoder_outputs_train.rnn_output)
        decoder_pred_train = tf.argmax(decoder_logits_train,
                                       axis=-1,
                                       name='decoder_pred_train')
        masks = tf.sequence_mask(lengths=trg_sequence_length,
                                 maxlen=max_decoder_length,
                                 dtype=tf.float32,
                                 name='masks')

        # place holder of label sequence
        lbl_word_idx = tf.placeholder(tf.int32, shape=[None, None])

        # compute the loss
        loss = seq2seq.sequence_loss(logits=decoder_logits_train,
                                     targets=lbl_word_idx,
                                     weights=masks,
                                     average_across_timesteps=True,
                                     average_across_batch=True)

        # return feeding list and loss operator
        return {
            'src_word_idx': src_word_idx,
            'src_sequence_length': src_sequence_length,
            'trg_word_idx': trg_word_idx,
            'trg_sequence_length': trg_sequence_length,
            'lbl_word_idx': lbl_word_idx
        }, loss
    else:
        start_tokens = tf.ones([tf.shape(src_word_idx)[0]],
                               tf.int32) * START_TOKEN_IDX
        # share the same embedding weights with target word
        trg_embedding_weights = tf.get_variable(
            "target_word_embeddings", [target_dict_dim, embedding_dim])

        inference_decoder = beam_search_decoder.BeamSearchDecoder(
            cell=decoder_cell,
            embedding=lambda tokens: tf.nn.embedding_lookup(
                trg_embedding_weights, tokens),
            start_tokens=start_tokens,
            end_token=END_TOKEN_IDX,
            initial_state=tf.nn.rnn_cell.LSTMStateTuple(
                tf.contrib.seq2seq.tile_batch(initial_state[0], beam_size),
                tf.contrib.seq2seq.tile_batch(initial_state[1], beam_size)),
            beam_width=beam_size,
            output_layer=output_layer)

        decoder_outputs_decode, _, _ = seq2seq.dynamic_decode(
            decoder=inference_decoder,
            output_time_major=False,
            # impute_finished=True raises an error with BeamSearchDecoder
            maximum_iterations=max_generation_length)

        predicted_ids = decoder_outputs_decode.predicted_ids

        return {
            'src_word_idx': src_word_idx,
            'src_sequence_length': src_sequence_length
        }, predicted_ids
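A minimal training-loop sketch for seq_to_seq_net above (the hyperparameter values, the optimizer choice, and the numpy batch names are assumptions, not part of the snippet):

    import tensorflow as tf

    feeding_dict, loss = seq_to_seq_net(
        embedding_dim=512, encoder_size=512, decoder_size=512,
        source_dict_dim=30000, target_dict_dim=30000,
        is_generating=False, beam_size=3, max_generation_length=100)
    train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # src_batch: [batch, src_len] int32; trg_batch, lbl_batch: [batch, trg_len]
        _, batch_loss = sess.run([train_op, loss], feed_dict={
            feeding_dict['src_word_idx']: src_batch,
            feeding_dict['src_sequence_length']: src_lengths,
            feeding_dict['trg_word_idx']: trg_batch,
            feeding_dict['trg_sequence_length']: trg_lengths,
            feeding_dict['lbl_word_idx']: lbl_batch,
        })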
Beispiel #21
    def build_model(self):
        """
        Build the seq2seq model.
        """
        self.query_input = tf.placeholder(tf.int32, [None, None])
        self.query_length = tf.placeholder(tf.int32, [None])

        self.answer_input = tf.placeholder(tf.int32, [None, None])
        self.answer_target = tf.placeholder(tf.int32, [None, None])
        self.answer_length = tf.placeholder(tf.int32, [None])
        self.batch_size = array_ops.shape(self.query_input)[0]

        if self.mode == "train":
            self.max_decode_step = tf.reduce_max(self.answer_length)
            self.sequence_mask = tf.sequence_mask(self.answer_length,
                                                  self.max_decode_step, dtype=tf.float32)
        elif self.mode == "decode":
            self.max_decode_step = tf.reduce_max(self.query_length) * 10

        # input and output embedding: map word ids to dense vectors
        self.embeddings_matrix = tf.Variable(tf.random_uniform([
            self.vocab_size, EMBEDDING_SIZE], -1.0, 1.0), dtype=tf.float32)

        self.query_embeddings = tf.nn.embedding_lookup(self.embeddings_matrix, self.query_input)
        self.answer_embeddings = tf.nn.embedding_lookup(self.embeddings_matrix, self.answer_input)

        # encoder process
        self.encoder_outputs, self.encoder_state = tf.nn.dynamic_rnn(
            rnn.BasicLSTMCell(ENCODER_HIDDEN_SIZE), self.query_embeddings,
            sequence_length=self.query_length, dtype=tf.float32)

        # prepare a batch of intermediate tensors for beam search, reused below
        batch_size, encoder_outputs, encoder_state, encoder_length = (self.batch_size,
                                                                      self.encoder_outputs, self.encoder_state,
                                                                      self.query_length)

        if self.mode == "decode":
            batch_size = batch_size * BEAM_WIDTH
            encoder_outputs = seq2seq.tile_batch(t=self.encoder_outputs, multiplier=BEAM_WIDTH)
            encoder_state = nest.map_structure(lambda s: seq2seq.tile_batch(
                t=s, multiplier=BEAM_WIDTH), self.encoder_state)
            encoder_length = seq2seq.tile_batch(t=self.query_length, multiplier=BEAM_WIDTH)

        # attention wrapper
        self.attention_mechanism = seq2seq.BahdanauAttention(num_units=ENCODER_HIDDEN_SIZE,
                                                             memory=encoder_outputs,
                                                             memory_sequence_length=encoder_length)
        self.decoder_cell = seq2seq.AttentionWrapper(rnn.BasicLSTMCell(DECODER_HIDDEN_SIZE),
                                                     attention_mechanism=self.attention_mechanism,
                                                     attention_layer_size=ATTENTION_SIZE)
        self.decoder_initial_state = self.decoder_cell.zero_state(batch_size=batch_size,
                                                                  dtype=tf.float32).clone(cell_state=encoder_state)

        self.decoder_dense = tf.layers.Dense(self.vocab_size, dtype=tf.float32, use_bias=False,
                                             kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))

        # use a training helper when training; otherwise a greedy or beam-search helper
        if self.mode == "train":
            training_helper = seq2seq.TrainingHelper(inputs=self.answer_embeddings,
                                                     sequence_length=self.answer_length)
            training_decoder = seq2seq.BasicDecoder(cell=self.decoder_cell, helper=training_helper,
                                                    initial_state=self.decoder_initial_state,
                                                    output_layer=self.decoder_dense)

            decoder_outputs, _, _ = seq2seq.dynamic_decode(
                decoder=training_decoder,
                impute_finished=True,
                maximum_iterations=self.max_decode_step)
            self.decoder_logits = tf.identity(decoder_outputs.rnn_output)

            self.loss = seq2seq.sequence_loss(logits=decoder_outputs.rnn_output,
                                              targets=self.answer_target, weights=self.sequence_mask)
            self.sample_ids = decoder_outputs.sample_id

            self.optimizer = tf.train.AdamOptimizer(LR_RATE)
            self.train_op = self.optimizer.minimize(self.loss)

            tf.summary.scalar('loss', self.loss)
            self.summary_op = tf.summary.merge_all()
        elif self.mode == "decode":
            start_tokens = tf.ones([self.batch_size], tf.int32) * self.go
            end_token = self.eos

            # in the beam-search case, the tensors passed to the decoder here
            # must not be tiled by BEAM_WIDTH again (that already happened above)
            # either the beam-search or the greedy decoder works here; they are
            # equivalent when only one reply is returned
            if USE_BEAMSEARCH:
                inference_decoder = seq2seq.BeamSearchDecoder(cell=self.decoder_cell,
                                                              embedding=self.embeddings_matrix,
                                                              start_tokens=start_tokens,
                                                              end_token=end_token,
                                                              initial_state=self.decoder_initial_state,
                                                              beam_width=BEAM_WIDTH, output_layer=self.decoder_dense)
                # with beam_search, the result is (predicted_ids, beam_search_decoder_output)
                # predicted_ids: [batch_size, decoder_targets_length, beam_size]
                # beam_search_decoder_output: scores, predicted_ids, parent_ids
                decoder_outputs, _, _ = seq2seq.dynamic_decode(
                    decoder=inference_decoder, maximum_iterations=self.max_decode_step)
                self.sample_ids = decoder_outputs.predicted_ids
                self.sample_ids = tf.transpose(self.sample_ids, perm=[0, 2, 1])  # transpose so each row is one sentence
            else:
                decoding_helper = seq2seq.GreedyEmbeddingHelper(start_tokens=start_tokens,
                                                                end_token=end_token, embedding=self.embeddings_matrix)
                inference_decoder = seq2seq.BasicDecoder(cell=self.decoder_cell,
                                                         helper=decoding_helper,
                                                         initial_state=self.decoder_initial_state,
                                                         output_layer=self.decoder_dense)
                # without beam_search, the result is (rnn_output, sample_id)
                # rnn_output: [batch_size, decoder_targets_length, vocab_size]
                # sample_id: [batch_size, decoder_targets_length], tf.int32
                self.decoder_outputs_decode, self.final_state, _ = seq2seq.dynamic_decode(
                    decoder=inference_decoder, maximum_iterations=self.max_decode_step)
                self.sample_ids = self.decoder_outputs_decode.sample_id
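A decoding sketch for the model above (the session, model handle, and numpy feed names are assumptions): in the beam-search branch sample_ids comes back as [batch_size, beam_width, time] after the transpose, with beam 0 the best hypothesis; in the greedy branch it is [batch_size, time].

    ids = sess.run(model.sample_ids, feed_dict={
        model.query_input: query_batch,      # [batch, src_len] int32
        model.query_length: query_lengths})  # [batch] int32
    best = ids[:, 0, :] if USE_BEAMSEARCH else ids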
Beispiel #22
    def build_decoder(self):
        print("building decoder and attention..")
        with tf.variable_scope('decoder'):
            # Building decoder_cell and decoder_initial_state
            self.decoder_cell, self.decoder_initial_state = self.build_decoder_cell(
            )

            input_layer = Dense(self.hidden_units,
                                dtype=self.dtype,
                                name='input_projection')
            # Output projection layer to convert cell_outputs to logits
            output_layer = Dense(self.num_decoder_symbols,
                                 name='output_projection')

            if self.mode == 'train':
                initializer = tf.random_uniform_initializer(-math.sqrt(3),
                                                            math.sqrt(3),
                                                            dtype=self.dtype)
                self.decoder_embeddings = tf.get_variable(
                    name='embedding',
                    shape=[self.num_decoder_symbols, self.embedding_size],
                    initializer=initializer,
                    dtype=self.dtype)
                self.decoder_encoded = tf.nn.embedding_lookup(
                    params=self.decoder_embeddings,
                    ids=self.decoder_inputs_train)
                self.decoder_inputs_encoded = input_layer(self.decoder_encoded)
                print(" Decoder input encoded is ",
                      self.decoder_inputs_encoded.shape)

                # Helper to feed inputs for training: read inputs from dense ground truth vectors
                training_helper = seq2seq.TrainingHelper(
                    inputs=self.decoder_inputs_encoded,
                    sequence_length=self.decoder_inputs_length_train,
                    time_major=False,
                    name='training_helper')

                training_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=training_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=output_layer)

                (self.decoder_outputs_train, self.decoder_last_state_train,
                 self.decoder_outputs_length_train) = (seq2seq.dynamic_decode(
                     decoder=training_decoder,
                     output_time_major=False,
                     impute_finished=True,
                     swap_memory=True,
                     maximum_iterations=self.max_decoder_length))

                # More efficient to do the projection on the batch-time-concatenated tensor
                # logits_train: [batch_size, max_time_step + 1, num_decoder_symbols]
                # self.decoder_logits_train = output_layer(self.decoder_outputs_train.rnn_output)
                self.decoder_logits_train = tf.identity(
                    self.decoder_outputs_train.rnn_output)
                # Use argmax to extract decoder symbols to emit
                self.decoder_pred_train = tf.argmax(self.decoder_logits_train,
                                                    axis=-1,
                                                    name='decoder_pred_train')

                # masks: masking for valid and padded time steps, [batch_size, max_time_step + 1]
                masks = tf.sequence_mask(
                    lengths=self.decoder_inputs_length_masks,
                    maxlen=self.max_decoder_length,
                    dtype=self.dtype,
                    name='masks')

                print("logits train shape is ",
                      self.decoder_logits_train.shape)
                print("decoder_targets_train train shape is ",
                      self.decoder_targets_train.shape)

                self.loss = tf.reduce_sum(
                    seq2seq.sequence_loss(
                        logits=self.decoder_logits_train,
                        targets=self.decoder_targets_train,
                        weights=masks,
                        average_across_timesteps=False,
                        average_across_batch=True,
                    ))

                # Compute predictions
                self.accuracy, self.accuracy_op = tf.metrics.accuracy(
                    labels=self.decoder_targets_train,
                    predictions=self.decoder_pred_train,
                    name="accuracy")

                # Training summary for the current batch_loss
                tf.summary.scalar('loss', self.loss)
                tf.summary.scalar('teacher_forcing_accuracy', self.accuracy)

                # Construct graphs for minimizing loss
                self.init_optimizer()

            elif self.mode == 'decode':
                self.decoder_embeddings = tf.get_variable(
                    name='embedding',
                    shape=[self.num_decoder_symbols, self.embedding_size],
                    dtype=self.dtype)

                # Start_tokens: [batch_size,] `int32` vector
                start_tokens = tf.ones([self.batch_size],
                                       tf.int32) * self.dest_start_token_index
                end_token = self.dest_eos_token_index

                def embed_and_input_proj(inputs):
                    encoded_input = tf.nn.embedding_lookup(
                        self.decoder_embeddings, inputs)
                    return input_layer(encoded_input)

                if not self.use_beamsearch_decode:
                    # Helper to feed inputs for greedy decoding: uses the argmax of the output
                    decoding_helper = seq2seq.GreedyEmbeddingHelper(
                        start_tokens=start_tokens,
                        end_token=end_token,
                        embedding=embed_and_input_proj)
                    # Basic decoder performs greedy decoding at each time step
                    print("building greedy decoder..")
                    inference_decoder = seq2seq.BasicDecoder(
                        cell=self.decoder_cell,
                        helper=decoding_helper,
                        initial_state=self.decoder_initial_state,
                        output_layer=output_layer)
                else:
                    # Beamsearch is used to approximately find the most likely translation
                    print("building beamsearch decoder..")
                    inference_decoder = beam_search_decoder.BeamSearchDecoder(
                        cell=self.decoder_cell,
                        embedding=embed_and_input_proj,
                        start_tokens=start_tokens,
                        end_token=end_token,
                        initial_state=self.decoder_initial_state,
                        beam_width=self.beam_width,
                        output_layer=output_layer,
                    )

                (self.decoder_outputs_decode, self.decoder_last_state_decode,
                 self.decoder_outputs_length_decode) = (seq2seq.dynamic_decode(
                     decoder=inference_decoder,
                     output_time_major=False,
                     swap_memory=True,
                     maximum_iterations=self.max_decode_step))

                if not self.use_beamsearch_decode:
                    # Here, we use expand_dims to be compatible with the result of the beamsearch decoder
                    # decoder_pred_decode: [batch_size, max_time_step, 1] (output_major=False)
                    self.decoder_pred_decode = tf.expand_dims(
                        self.decoder_outputs_decode.sample_id, -1)

                else:
                    # Use beam search to approximately find the most likely translation
                    # decoder_pred_decode: [batch_size, max_time_step, beam_width] (output_major=False)
                    self.decoder_pred_decode = self.decoder_outputs_decode.predicted_ids
def _build_decoder(model, encoder_outputs, encoder_state, hparams, start_token,
                   end_token, output_layer, aux_hidden_state):
  """build decoder for the seq2seq model."""

  iterator = model.iterator

  start_token_id = tf.cast(
      model.vocab_table.lookup(tf.constant(start_token)), tf.int32)
  end_token_id = tf.cast(
      model.vocab_table.lookup(tf.constant(end_token)), tf.int32)

  start_tokens = tf.fill([model.batch_size], start_token_id)
  end_token = end_token_id

  ## Decoder.
  with tf.variable_scope("decoder") as decoder_scope:
    cell, decoder_initial_state = _build_decoder_cell(
        model, hparams, encoder_state, base_gpu=model.global_gpu_num)
    model.global_gpu_num += hparams.num_layers
    ## Train or eval

    decoder_emb_inp = tf.nn.embedding_lookup(model.embedding_decoder,
                                             iterator.target)
    # Helper
    helper_train = help_py.TrainingHelper(
        decoder_emb_inp, iterator.dialogue_len, time_major=False)

    # Decoder
    my_decoder_train = basic_decoder.BasicDecoder(
        cell,
        helper_train,
        decoder_initial_state,
        encoder_outputs,
        iterator.turns,
        output_layer=output_layer,
        aux_hidden_state=aux_hidden_state)

    # Dynamic decoding
    outputs_train, _, _ = seq2seq.dynamic_decode(
        my_decoder_train,
        output_time_major=False,
        swap_memory=True,
        scope=decoder_scope)

    sample_id_train = outputs_train.sample_id
    logits_train = outputs_train.rnn_output
    ## Inference

    beam_width = hparams.beam_width
    length_penalty_weight = hparams.length_penalty_weight

    if model.mode == tf.estimator.ModeKeys.PREDICT and beam_width > 0:
      my_decoder_infer = seq2seq.BeamSearchDecoder(
          cell=cell,
          embedding=model.embedding_decoder,
          start_tokens=start_tokens,
          end_token=end_token,
          initial_state=decoder_initial_state,
          beam_width=beam_width,
          output_layer=output_layer,
          length_penalty_weight=length_penalty_weight)
    else:
      # Helper
      if model.mode in dialogue_utils.self_play_modes:
        helper_infer = seq2seq.SampleEmbeddingHelper(
            model.embedding_decoder, start_tokens, end_token)
      else:  # inference
        helper_infer = seq2seq.GreedyEmbeddingHelper(
            model.embedding_decoder, start_tokens, end_token)

      # Decoder
      my_decoder_infer = seq2seq.BasicDecoder(
          cell,
          helper_infer,
          decoder_initial_state,
          output_layer=output_layer  # applied per timestep
      )

    # Dynamic decoding
    outputs_infer, _, _ = seq2seq.dynamic_decode(
        my_decoder_infer,
        maximum_iterations=hparams.max_inference_len,
        output_time_major=False,
        swap_memory=True,
        scope=decoder_scope)

    if model.mode == tf.estimator.ModeKeys.PREDICT and beam_width > 0:
      logits_infer = tf.no_op()
      sample_id_infer = outputs_infer.predicted_ids
    else:
      logits_infer = outputs_infer.rnn_output
      sample_id_infer = outputs_infer.sample_id

  return logits_train, logits_infer, sample_id_train, sample_id_infer
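A usage sketch for the helper above (the model construction and hparams are assumptions, not shown in the snippet): both graphs are always built, and the caller picks the pair that matches model.mode.

    logits_train, logits_infer, ids_train, ids_infer = _build_decoder(
        model, encoder_outputs, encoder_state, hparams,
        start_token='<s>', end_token='</s>',
        output_layer=output_layer, aux_hidden_state=aux_hidden_state)
    sample_ids = (ids_infer if model.mode == tf.estimator.ModeKeys.PREDICT
                  else ids_train)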
Beispiel #24
    def build_decoder(self):
        """构建解码器
        """
        with tf.variable_scope('decoder') as decoder_scope:
            # Building decoder_cell and decoder_initial_state
            (self.decoder_cell,
             self.decoder_initial_state) = self.build_decoder_cell()

            # decoder embedding
            if self.share_embedding:
                self.decoder_embeddings = self.encoder_embeddings
            else:
                with tf.device(_get_embed_device(self.target_vocab_size)):
                    self.decoder_embeddings = tf.get_variable(
                        name='embeddings',
                        shape=(self.target_vocab_size, self.embedding_size),
                        initializer=self.initializer,
                        dtype=tf.float32)

            # On Using Very Large Target Vocabulary
            # for Neural Machine Translation
            # https://arxiv.org/pdf/1412.2007v2.pdf

            # Input projection layer to feed embedded inputs to the cell
            # ** Essential when use_residual=True to match input/output dims
            hidden_units = self.hidden_units
            if self.bidirectional:
                hidden_units *= 2

            input_layer = layers.Dense(hidden_units,
                                       dtype=tf.float32,
                                       use_bias=False,
                                       name='input_projection')

            self.output_layer = layers.Dense(self.target_vocab_size,
                                             dtype=tf.float32,
                                             use_bias=False,
                                             name='output_projection')

            if self.mode == 'train':
                # decoder_inputs_embedded:
                # [batch_size, max_time_step + 1, embedding_size]
                self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                    params=self.decoder_embeddings,
                    ids=self.decoder_inputs_train)

                # Embedded inputs having gone through input projection layer
                self.decoder_inputs_embedded = input_layer(
                    self.decoder_inputs_embedded)

                # Helper to feed inputs for training:
                # read inputs from dense ground truth vectors
                inputs = self.decoder_inputs_embedded
                if self.time_major:
                    inputs = tf.transpose(inputs, (1, 0, 2))

                training_helper = seq2seq.TrainingHelper(
                    inputs=inputs,
                    sequence_length=self.decoder_inputs_length_train,
                    time_major=self.time_major,
                    name='training_helper')

                # do not apply output_layer here during training, because
                # projecting at every time step is slow
                # note: for this trick to work, the scope argument of
                # dynamic_decode must be set
                training_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=training_helper,
                    initial_state=self.decoder_initial_state,
                    # output_layer=self.output_layer
                )

                # Maximum decoder time_steps in current batch
                max_decoder_length = tf.reduce_max(
                    self.decoder_inputs_length_train)

                # decoder_outputs_train: BasicDecoderOutput
                #     namedtuple(rnn_outputs, sample_id)
                # decoder_outputs_train.rnn_output:
                #     if output_time_major=False:
                #         [batch_size, max_time_step + 1, num_decoder_symbols]
                #     if output_time_major=True:
                #         [max_time_step + 1, batch_size, num_decoder_symbols]
                # decoder_outputs_train.sample_id: [batch_size], tf.int32

                (
                    outputs,
                    self.final_state,  # contain attention
                    _  # self.final_sequence_lengths
                ) = seq2seq.dynamic_decode(
                    decoder=training_decoder,
                    output_time_major=self.time_major,
                    impute_finished=True,
                    maximum_iterations=max_decoder_length,
                    parallel_iterations=self.parallel_iterations,
                    swap_memory=True,
                    scope=decoder_scope)

                # More efficient to do the projection
                # on the batch-time-concatenated tensor
                # logits_train:
                # [batch_size, max_time_step + 1, num_decoder_symbols]
                # during training, apply the output_layer projection to all
                # outputs at once
                # the official NMT repo reports a 10-20% speedup from this
                # in practice my speedup was even larger
                self.decoder_logits_train = self.output_layer(
                    outputs.rnn_output)

                # masks: masking for valid and padded time steps,
                # [batch_size, max_time_step + 1]
                self.masks = tf.sequence_mask(
                    lengths=self.decoder_inputs_length_train,
                    maxlen=max_decoder_length,
                    dtype=tf.float32,
                    name='masks')

                # Computes per word average cross-entropy over a batch
                # Internally calls
                # 'nn_ops.sparse_softmax_cross_entropy_with_logits' by default

                decoder_logits_train = self.decoder_logits_train
                if self.time_major:
                    decoder_logits_train = tf.transpose(
                        decoder_logits_train, (1, 0, 2))

                self.decoder_pred_train = tf.argmax(decoder_logits_train,
                                                    axis=-1,
                                                    name='decoder_pred_train')

                # the following variables are used for reinforcement-learning training
                self.train_entropy = \
                    tf.nn.sparse_softmax_cross_entropy_with_logits(
                        labels=self.decoder_targets_train,
                        logits=decoder_logits_train)
                # self.train_entropy *= self.masks
                # print(self.train_entropy.shape)
                self.train_entropy_rewards = tf.multiply(
                    self.train_entropy, self.rewards)
                # print('self.train_entropy_rewards.shape', self.train_entropy_rewards.shape)
                self.train_entropy_rewards *= self.masks

                # https://github.com/tensorflow/tensorflow/blob/r1.5/tensorflow/contrib/seq2seq/python/ops/loss.py
                # if average_across_timesteps and average_across_batch:
                #   crossent = math_ops.reduce_sum(crossent)
                #   total_size = math_ops.reduce_sum(weights)
                #   total_size += 1e-12  # to avoid division by 0 for all-0 weights
                #   crossent /= total_size

                self.loss_without_rewards = tf.reduce_sum(self.train_entropy)
                self.loss_rewards = tf.reduce_sum(self.train_entropy_rewards)

                total_size = tf.reduce_sum(self.masks)
                total_size += 1e-12
                self.loss_without_rewards /= total_size
                self.loss_rewards /= total_size

                self.loss = seq2seq.sequence_loss(
                    logits=decoder_logits_train,
                    targets=self.decoder_targets_train,
                    weights=self.masks,
                    average_across_timesteps=True,
                    average_across_batch=True,
                )

                # Training summary for the current batch_loss
                tf.summary.scalar('loss', self.loss)

            elif self.mode == 'decode':
                # prediction mode (not training)

                start_tokens = tf.fill([self.batch_size], WordSequence.START)
                end_token = WordSequence.END

                def embed_and_input_proj(inputs):
                    """输入层的投影层wrapper
                    """
                    return input_layer(
                        tf.nn.embedding_lookup(self.decoder_embeddings,
                                               inputs))

                if not self.use_beamsearch_decode:
                    # Helper to feed inputs for greedy decoding:
                    # uses the argmax of the output
                    decoding_helper = seq2seq.GreedyEmbeddingHelper(
                        start_tokens=start_tokens,
                        end_token=end_token,
                        embedding=embed_and_input_proj)
                    # Basic decoder performs greedy decoding at each time step
                    # print("building greedy decoder..")
                    inference_decoder = seq2seq.BasicDecoder(
                        cell=self.decoder_cell,
                        helper=decoding_helper,
                        initial_state=self.decoder_initial_state,
                        output_layer=self.output_layer)
                else:
                    # Beamsearch is used to approximately
                    # find the most likely translation
                    # print("building beamsearch decoder..")
                    inference_decoder = BeamSearchDecoder(
                        cell=self.decoder_cell,
                        embedding=embed_and_input_proj,
                        start_tokens=start_tokens,
                        end_token=end_token,
                        initial_state=self.decoder_initial_state,
                        beam_width=self.beam_width,
                        output_layer=self.output_layer,
                    )

                # For GreedyDecoder, return
                # decoder_outputs_decode: BasicDecoderOutput instance
                #     namedtuple(rnn_outputs, sample_id)
                # decoder_outputs_decode.rnn_output:
                # if output_time_major=False:
                #     [batch_size, max_time_step, num_decoder_symbols]
                # if output_time_major=True
                #     [max_time_step, batch_size, num_decoder_symbols]
                # decoder_outputs_decode.sample_id:
                # if output_time_major=False
                #     [batch_size, max_time_step], tf.int32
                # if output_time_major=True
                #     [max_time_step, batch_size], tf.int32

                # For BeamSearchDecoder, return
                # decoder_outputs_decode: FinalBeamSearchDecoderOutput instance
                #     namedtuple(predicted_ids, beam_search_decoder_output)
                # decoder_outputs_decode.predicted_ids:
                # if output_time_major=False:
                #     [batch_size, max_time_step, beam_width]
                # if output_time_major=True
                #     [max_time_step, batch_size, beam_width]
                # decoder_outputs_decode.beam_search_decoder_output:
                #     BeamSearchDecoderOutput instance
                #     namedtuple(scores, predicted_ids, parent_ids)

                # a possible maximum-length choice mentioned in the official docs:
                # maximum_iterations = tf.round(tf.reduce_max(source_sequence_length) * 2)
                # https://www.tensorflow.org/tutorials/seq2seq

                if self.max_decode_step is not None:
                    max_decode_step = self.max_decode_step
                else:
                    # by default, decode up to 4x the input length
                    max_decode_step = tf.round(
                        tf.reduce_max(self.encoder_inputs_length) * 4)

                (
                    self.decoder_outputs_decode,
                    self.final_state,
                    _  # self.decoder_outputs_length_decode
                ) = (
                    seq2seq.dynamic_decode(
                        decoder=inference_decoder,
                        output_time_major=self.time_major,
                        # impute_finished=True raises an error with BeamSearchDecoder
                        maximum_iterations=max_decode_step,
                        parallel_iterations=self.parallel_iterations,
                        swap_memory=True,
                        scope=decoder_scope))

                if not self.use_beamsearch_decode:
                    # decoder_outputs_decode.sample_id:
                    #     [batch_size, max_time_step]
                    # Or use argmax to find decoder symbols to emit:
                    # self.decoder_pred_decode = tf.argmax(
                    #     self.decoder_outputs_decode.rnn_output,
                    #     axis=-1, name='decoder_pred_decode')

                    # Here, we use expand_dims to be compatible with
                    # the result of the beamsearch decoder
                    # decoder_pred_decode:
                    #     [batch_size, max_time_step, 1] (output_major=False)

                    # self.decoder_pred_decode = tf.expand_dims(
                    #     self.decoder_outputs_decode.sample_id,
                    #     -1
                    # )

                    dod = self.decoder_outputs_decode
                    self.decoder_pred_decode = dod.sample_id

                    if self.time_major:
                        self.decoder_pred_decode = tf.transpose(
                            self.decoder_pred_decode, (1, 0))

                else:
                    # Use beam search to approximately
                    # find the most likely translation
                    # decoder_pred_decode:
                    # [batch_size, max_time_step, beam_width] (output_major=False)
                    self.decoder_pred_decode = \
                        self.decoder_outputs_decode.predicted_ids

                    if self.time_major:
                        self.decoder_pred_decode = tf.transpose(
                            self.decoder_pred_decode, (1, 0, 2))

                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, perm=[0, 2, 1])
                    dod = self.decoder_outputs_decode
                    self.beam_prob = dod.beam_search_decoder_output.scores
Beispiel #25
    def __init__(self, vocab_size, hidden_size, dropout,
                 num_layers, max_gradient_norm, batch_size, learning_rate,
                 lr_decay_factor, max_target_length,
                 max_source_length, decoder_mode=False):
        '''
        vocab_size: number of vocab tokens
        hidden_size: dimension of hidden layers
        dropout: input keep probability for dropout wrappers
        num_layers: number of hidden layers
        max_gradient_norm: maximum gradient magnitude
        batch_size: number of training examples fed to network at once
        learning_rate: starting learning rate of network
        lr_decay_factor: amount by which to decay learning rate
        max_target_length: maximum length of target sequences
        max_source_length: maximum length of source sequences
        decoder_mode: whether to build the inference (beam search) graph
        '''
        GO_ID = config.GO_ID
        EOS_ID = config.EOS_ID
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length
        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.global_step = tf.Variable(0, trainable=False)
        self.learning_rate = learning_rate
        self.encoder_inputs = tf.placeholder(shape=(None, None), dtype=tf.int32, name='encoder_inputs')
        self.source_lengths = tf.placeholder(shape=(None,), dtype=tf.int32, name='source_lengths')

        self.decoder_targets = tf.placeholder(shape=(None, None), dtype=tf.int32, name='decoder_targets')
        self.target_lengths = tf.placeholder(shape=(None,), dtype=tf.int32, name="target_lengths")

        with tf.variable_scope('embeddings') as scope:
            embeddings = tf.Variable(tf.random_uniform([vocab_size, hidden_size], -1.0, 1.0), dtype=tf.float32)
            encoder_inputs_embedded = tf.nn.embedding_lookup(embeddings, self.encoder_inputs)
            targets_embedding = tf.nn.embedding_lookup(embeddings, self.decoder_targets)


        with tf.variable_scope('encoder') as scope:
            # each layer needs its own cell instance; [cell] * num_layers
            # would share one set of weights across all layers
            encoder_cell = rnn.MultiRNNCell([
                rnn.DropoutWrapper(rnn.LSTMCell(hidden_size),
                                   input_keep_prob=dropout)
                for _ in range(num_layers)
            ])

            _, encoder_state = tf.nn.bidirectional_dynamic_rnn(cell_fw=encoder_cell,
                                                               cell_bw=encoder_cell,
                                                               sequence_length=self.source_lengths,
                                                               inputs=encoder_inputs_embedded,
                                                               dtype=tf.float32,
                                                               time_major=False)

        with tf.variable_scope('decoder') as scope:
            decoder_cell = rnn.MultiRNNCell([
                rnn.DropoutWrapper(rnn.LSTMCell(hidden_size),
                                   input_keep_prob=dropout)
                for _ in range(num_layers)
            ])

            #TODO add attention
            #seq2seq.BahdanauAttention(num_units=,memory=encoder_output)

            #decoder_cell = seq2seq.AttentionWrapper(cell=decoder_cell,
            #                                        attention_mechanism=)

        if decoder_mode:
            decoder = seq2seq.BeamSearchDecoder(cell=decoder_cell,
                                                embedding=embeddings,
                                                start_tokens=tf.tile([GO_ID], [batch_size]),
                                                end_token=EOS_ID,
                                                # beam search needs the initial
                                                # state tiled by beam_width
                                                initial_state=seq2seq.tile_batch(
                                                    encoder_state[0], multiplier=2),
                                                beam_width=2)
        else:
            helper = seq2seq.TrainingHelper(inputs=targets_embedding,
                                            sequence_length=self.target_lengths)

            decoder = seq2seq.BasicDecoder(cell=decoder_cell,
                                           helper=helper,
                                           initial_state=encoder_state[-1],
                                           output_layer=Dense(vocab_size))

        final_outputs, final_state, final_sequence_lengths = \
                            seq2seq.dynamic_decode(decoder=decoder,
                                                   maximum_iterations=max_target_length)

        if decoder_mode:
            # beam-search outputs expose predicted_ids instead of rnn_output
            self.predicted_ids = final_outputs.predicted_ids
        else:
            self.logits = final_outputs.rnn_output

        if not decoder_mode:
            with tf.variable_scope("loss") as scope:
                #have to pad logits, dynamic decode produces results not consistent
                #in shape with targets
                pad_size = self.max_target_length - tf.reduce_max(final_sequence_lengths)
                self.logits = tf.pad(self.logits, [[0, 0], [0, pad_size], [0, 0]])

                weights = tf.sequence_mask(lengths=final_sequence_lengths,
                                           maxlen=self.max_target_length,
                                           dtype=tf.float32,
                                           name='weights')

                x_entropy_loss = seq2seq.sequence_loss(logits=self.logits,
                                                       targets=self.decoder_targets,
                                                       weights=weights)

                self.loss = tf.reduce_mean(x_entropy_loss)

            optimizer = tf.train.AdamOptimizer()
            gradients = optimizer.compute_gradients(x_entropy_loss)
            capped_grads = [(tf.clip_by_value(grad, -max_gradient_norm, max_gradient_norm), var) for grad, var in gradients]
            self.train_op = optimizer.apply_gradients(capped_grads,
                                                      global_step=self.global_step)
            self.saver = tf.train.Saver(tf.global_variables())
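The value clipping above bounds each gradient element independently; a global-norm alternative, used elsewhere in this collection, would look roughly like this sketch:

    grads_and_vars = optimizer.compute_gradients(x_entropy_loss)
    grads, variables = zip(*grads_and_vars)
    clipped, _ = tf.clip_by_global_norm(grads, max_gradient_norm)
    self.train_op = optimizer.apply_gradients(zip(clipped, variables),
                                              global_step=self.global_step)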
Beispiel #26
    def build_decoder(self, encoder_outputs, encoder_state):
        """构建解码器"""

        with tf.variable_scope('decoder') as decoder_scope:
            (self.decoder_cell,
             self.decoder_initial_state) = self.build_decoder_cell(
                 encoder_outputs, encoder_state)

            # decoder embedding:
            # _get_embed_device decides whether it lives on GPU or CPU
            with tf.device(_get_embed_device(self.target_vocab_size)):

                # whether the decoder shares the encoder's embedding
                if self.share_embedding:
                    self.decoder_embeddings = self.encoder_embeddings

                # not shared with the encoder, but pretrained
                elif self.pretrained_embedding:
                    self.decoder_embeddings = tf.Variable(tf.constant(
                        0.0,
                        shape=(self.target_vocab_size, self.embedding_size)),
                                                          trainable=True,
                                                          name='embeddings')
                    self.decoder_embeddings_placeholder = tf.placeholder(
                        tf.float32,
                        shape=(self.target_vocab_size, self.embedding_size))
                    self.decoder_embeddings_init = self.decoder_embeddings.assign(
                        self.decoder_embeddings_placeholder)

                # not shared with the encoder and not pretrained
                else:
                    self.decoder_embeddings = tf.get_variable(
                        name='embeddings',
                        shape=(self.target_vocab_size, self.embedding_size),
                        initializer=self.initializer,
                        dtype=tf.float32)

            # output projection (fully connected layer)
            self.decoder_output_projection = layers.Dense(
                self.target_vocab_size,
                dtype=tf.float32,
                use_bias=False,
                name='decoder_output_projection')

            # training mode
            if self.mode == 'train':
                self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                    params=self.decoder_embeddings,
                    ids=self.decoder_inputs_train)
                inputs = self.decoder_inputs_embedded

                if self.time_major:
                    inputs = tf.transpose(inputs, (1, 0, 2))

                training_helper = seq2seq.TrainingHelper(
                    inputs=inputs,
                    sequence_length=self.decoder_inputs_length,
                    time_major=self.time_major,
                    name='training_helper')

                # note: for this trick to work, the scope argument of dynamic_decode must be set
                training_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=training_helper,
                    initial_state=self.decoder_initial_state,
                )

                # Maximum decoder time_steps in current batch
                max_decoder_length = tf.reduce_max(self.decoder_inputs_length)

                (
                    outputs,
                    self.final_state,  # contain attention
                    _  # self.final_sequence_lengths
                ) = seq2seq.dynamic_decode(
                    decoder=training_decoder,
                    output_time_major=self.time_major,
                    impute_finished=True,
                    maximum_iterations=max_decoder_length,
                    parallel_iterations=self.parallel_iterations,
                    swap_memory=True,
                    scope=decoder_scope)

                self.decoder_logits_train = self.decoder_output_projection(
                    outputs.rnn_output)

                # masks: masking for valid and padded time steps,
                # [batch_size, max_time_step + 1]
                self.masks = tf.sequence_mask(
                    lengths=self.decoder_inputs_length,
                    maxlen=max_decoder_length,
                    dtype=tf.float32,
                    name='masks')

                decoder_logits_train = self.decoder_logits_train
                if self.time_major:
                    decoder_logits_train = tf.transpose(
                        decoder_logits_train, (1, 0, 2))

                self.decoder_pred_train = tf.argmax(decoder_logits_train,
                                                    axis=-1,
                                                    name='decoder_pred_train')

                # the following variables support training with custom rewards
                # train_entropy = cross entropy
                self.train_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.decoder_inputs, logits=decoder_logits_train)

                self.masks_rewards = self.masks * self.rewards

                self.loss_rewards = seq2seq.sequence_loss(
                    logits=decoder_logits_train,
                    targets=self.decoder_inputs,
                    weights=self.masks_rewards,
                    average_across_timesteps=True,
                    average_across_batch=True,
                )

                self.loss = seq2seq.sequence_loss(
                    logits=decoder_logits_train,
                    targets=self.decoder_inputs,
                    weights=self.masks,
                    average_across_timesteps=True,
                    average_across_batch=True,
                )

                self.loss_add = self.loss + self.add_loss

            # prediction mode (not training)
            elif self.mode == 'decode':

                start_tokens = tf.tile([WordSequence.START], [self.batch_size])
                end_token = WordSequence.END

                def embed_and_input_proj(inputs):
                    """输入层的投影层wrapper"""
                    return tf.nn.embedding_lookup(self.decoder_embeddings,
                                                  inputs)

                if not self.use_beamsearch_decode:
                    # Helper to feed inputs for greedy decoding:
                    # uses the argmax of the output
                    decoding_helper = seq2seq.GreedyEmbeddingHelper(
                        start_tokens=start_tokens,
                        end_token=end_token,
                        embedding=embed_and_input_proj)
                    # Basic decoder performs greedy decoding at each time step
                    # print("building greedy decoder..")
                    inference_decoder = seq2seq.BasicDecoder(
                        cell=self.decoder_cell,
                        helper=decoding_helper,
                        initial_state=self.decoder_initial_state,
                        output_layer=self.decoder_output_projection)
                else:
                    # Beamsearch is used to approximately
                    # find the most likely translation
                    inference_decoder = BeamSearchDecoder(
                        cell=self.decoder_cell,
                        embedding=embed_and_input_proj,
                        start_tokens=start_tokens,
                        end_token=end_token,
                        initial_state=self.decoder_initial_state,
                        beam_width=self.beam_width,
                        output_layer=self.decoder_output_projection,
                    )

                if self.max_decode_step is not None:
                    max_decode_step = self.max_decode_step
                else:
                    # by default, decode up to 4x the input length
                    max_decode_step = tf.round(
                        tf.reduce_max(self.encoder_inputs_length) * 4)

                (
                    self.decoder_outputs_decode,
                    self.final_state,
                    _  # self.decoder_outputs_length_decode
                ) = seq2seq.dynamic_decode(
                    decoder=inference_decoder,
                    output_time_major=self.time_major,
                    # impute_finished=True raises an error with BeamSearchDecoder
                    maximum_iterations=max_decode_step,
                    parallel_iterations=self.parallel_iterations,
                    swap_memory=True,
                    scope=decoder_scope)

                if not self.use_beamsearch_decode:
                    dod = self.decoder_outputs_decode
                    self.decoder_pred_decode = dod.sample_id

                    if self.time_major:
                        self.decoder_pred_decode = tf.transpose(
                            self.decoder_pred_decode, (1, 0))

                else:
                    self.decoder_pred_decode = self.decoder_outputs_decode.predicted_ids

                    if self.time_major:
                        self.decoder_pred_decode = tf.transpose(
                            self.decoder_pred_decode, (1, 0, 2))

                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, perm=[0, 2, 1])
                    dod = self.decoder_outputs_decode
                    self.beam_prob = dod.beam_search_decoder_output.scores
    def build_train_decoder(self):
        self.decoder_inputs_embedded = tf.nn.embedding_lookup(
            params=self.embedding, ids=self.decoder_inputs_train)
        if self.train_mode == 'ground_truth':
            training_helper = seq2seq.TrainingHelper(
                inputs=self.decoder_inputs_embedded,
                sequence_length=self.decoder_inputs_length_train,
                time_major=False,
                name='training_helper')
        elif self.train_mode == 'scheduled_sampling':
            training_helper = seq2seq.ScheduledEmbeddingTrainingHelper(
                inputs=self.decoder_inputs_embedded,
                sequence_length=self.decoder_inputs_length_train,
                embedding=lambda inputs: tf.nn.embedding_lookup(
                    self.embedding, inputs),
                sampling_probability=self.sampling_probability,
                name='scheduled_embedding_training_helper')
        else:
            raise NotImplementedError(
                'Train mode: {} is not yet implemented'.format(
                    self.train_mode))

        training_decoder = seq2seq.BasicDecoder(
            cell=self.decoder_cell,
            helper=training_helper,
            initial_state=self.decoder_initial_state,
            output_layer=self.output_layer)
        max_decoder_length = tf.reduce_max(self.decoder_inputs_length_train)

        (self.decoder_outputs_train, self.decoder_last_state_train,
         self.decoder_outputs_length_train) = seq2seq.dynamic_decode(
            decoder=training_decoder,
            output_time_major=False,
            impute_finished=True,
            maximum_iterations=max_decoder_length)

        # NOTE(sdsuo): Not sure why this is necessary
        self.decoder_logits_train = tf.identity(
            self.decoder_outputs_train.rnn_output)

        # Use argmax to extract decoder symbols to emit
        self.decoder_pred_train = tf.argmax(self.decoder_logits_train,
                                            axis=-1,
                                            name='decoder_pred_train')

        # masks: masking for valid and padded time steps, [batch_size, max_time_step + 1]
        masks = tf.sequence_mask(lengths=self.decoder_inputs_length_train,
                                 maxlen=max_decoder_length,
                                 dtype=self.dtype,
                                 name='masks')

        # Computes per word average cross-entropy over a batch
        # Internally calls 'nn_ops.sparse_softmax_cross_entropy_with_logits' by default
        self.loss = seq2seq.sequence_loss(logits=self.decoder_logits_train,
                                          targets=self.decoder_targets_train,
                                          weights=masks,
                                          average_across_timesteps=True,
                                          average_across_batch=True)

        # Training summary for the current batch_loss
        tf.summary.scalar('loss', self.loss)

        # Construct graphs for minimizing loss
        self.init_optimizer()
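A sketch of one way to drive the sampling_probability consumed by the ScheduledEmbeddingTrainingHelper above (the placeholder and the linear warm-up schedule are assumptions, not part of the snippet):

    import tensorflow as tf

    # scalar fed each step; 0.0 = pure teacher forcing, 1.0 = always sample
    sampling_probability = tf.placeholder(tf.float32, [], name='sampling_probability')

    def scheduled_sampling_prob(step, warmup_steps=10000):
        # linear ramp from 0 to 1 over warmup_steps, then constant at 1
        return min(1.0, step / float(warmup_steps))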
Beispiel #28
    def BuildNetwork(self, learningRate):
        self.dataInput = tensorflow.placeholder(dtype=tensorflow.float32,
                                                shape=[None, None, 40],
                                                name='DataInput')
        self.labelInput = tensorflow.placeholder(dtype=tensorflow.float32,
                                                 name='LabelInput')
        self.seqInput = tensorflow.placeholder(dtype=tensorflow.int32,
                                               shape=[None],
                                               name='SeqInput')

        #############################################################################
        # Batch Parameters
        #############################################################################

        self.parameters['BatchSize'], self.parameters['TimeStep'], _ = \
            tensorflow.unstack(tensorflow.shape(input=self.dataInput, name='DataShape'))
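        # tensorflow.unstack on the rank-1 shape tensor yields one scalar tensor per
        # dimension; e.g. a [32, 1000, 40] batch gives BatchSize=32, TimeStep=1000.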

        ###################################################################################################
        # Encoder
        ###################################################################################################

        with tensorflow.variable_scope('Encoder_AE'):
            self.parameters[
                'Encoder_Cell_Forward_AE'] = tensorflow.nn.rnn_cell.MultiRNNCell(
                    cells=[
                        rnn.LSTMCell(num_units=self.hiddenNodules)
                        for _ in range(self.rnnLayers)
                    ],
                    state_is_tuple=True)
            self.parameters[
                'Encoder_Cell_Backward_AE'] = tensorflow.nn.rnn_cell.MultiRNNCell(
                    cells=[
                        rnn.LSTMCell(num_units=self.hiddenNodules)
                        for _ in range(self.rnnLayers)
                    ],
                    state_is_tuple=True)

            self.parameters['Encoder_Output_AE'], self.parameters['Encoder_FinalState_AE'] = \
                tensorflow.nn.bidirectional_dynamic_rnn(
                    cell_fw=self.parameters['Encoder_Cell_Forward_AE'],
                    cell_bw=self.parameters['Encoder_Cell_Backward_AE'],
                    inputs=self.dataInput, sequence_length=self.seqInput, dtype=tensorflow.float32)

        # Encoder_FinalState_AE is a (forward_states, backward_states) pair, each
        # holding one LSTMStateTuple per layer. Concatenating the two directions
        # gives states of width 2 * hiddenNodules, matching the decoder cells below.
        encoderFinalState = self.parameters['Encoder_FinalState_AE']
        if self.attention is None:
            self.parameters['Decoder_InitalState_AE'] = []
            for index in range(self.rnnLayers):
                self.parameters['Encoder_Cell_Layer%d_AE' % index] = rnn.LSTMStateTuple(
                    c=tensorflow.concat(
                        [encoderFinalState[0][index].c, encoderFinalState[1][index].c],
                        axis=1),
                    h=tensorflow.concat(
                        [encoderFinalState[0][index].h, encoderFinalState[1][index].h],
                        axis=1))
                self.parameters['Decoder_InitalState_AE'].append(
                    self.parameters['Encoder_Cell_Layer%d_AE' % index])
            self.parameters['Decoder_InitalState_AE'] = tuple(
                self.parameters['Decoder_InitalState_AE'])
        else:
            self.attentionList = self.attention(
                dataInput=self.parameters['Encoder_Output_AE'],
                scopeName=self.attentionName,
                hiddenNoduleNumber=2 * self.hiddenNodules,
                attentionScope=self.attentionScope,
                blstmFlag=True)
            # With attention, the cell state is seeded from the attention summary
            # while the hidden state still concatenates both directions.
            self.parameters['Decoder_InitalState_AE'] = []
            for index in range(self.rnnLayers):
                self.parameters['Encoder_Cell_Layer%d_AE' % index] = rnn.LSTMStateTuple(
                    c=self.attentionList['FinalResult'],
                    h=tensorflow.concat(
                        [encoderFinalState[0][index].h, encoderFinalState[1][index].h],
                        axis=1))
                self.parameters['Decoder_InitalState_AE'].append(
                    self.parameters['Encoder_Cell_Layer%d_AE' % index])
            self.parameters['Decoder_InitalState_AE'] = tuple(
                self.parameters['Decoder_InitalState_AE'])

        #############################################################################
        # Decoder Label Pretreatment
        #############################################################################

        self.parameters['Decoder_Helper_AE'] = seq2seq.TrainingHelper(
            inputs=self.dataInput,
            sequence_length=self.seqInput,
            name='Decoder_Helper_AE')
        with tensorflow.variable_scope('Decoder_AE'):
            self.parameters['Decoder_FC_AE'] = Dense(40)
            self.parameters[
                'Decoder_Cell_AE'] = tensorflow.nn.rnn_cell.MultiRNNCell(
                    cells=[
                        rnn.LSTMCell(num_units=self.hiddenNodules * 2)
                        for _ in range(self.rnnLayers)
                    ],
                    state_is_tuple=True)

            self.parameters['Decoder_AE'] = seq2seq.BasicDecoder(
                cell=self.parameters['Decoder_Cell_AE'],
                helper=self.parameters['Decoder_Helper_AE'],
                initial_state=self.parameters['Decoder_InitalState_AE'],
                output_layer=self.parameters['Decoder_FC_AE'])

            self.parameters['Decoder_Logits_AE'], self.parameters[
                'Decoder_FinalState_AE'], self.parameters[
                    'Decoder_FinalSeq_AE'] = seq2seq.dynamic_decode(
                        decoder=self.parameters['Decoder_AE'])

        #############################################################################
        # Losses
        #############################################################################

        self.parameters['Loss_AE'] = tensorflow.losses.absolute_difference(
            labels=self.dataInput,
            predictions=self.parameters['Decoder_Logits_AE'][0],
            weights=self.weight)
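        # Decoder_Logits_AE is a BasicDecoderOutput namedtuple; element [0] above is
        # its rnn_output field, i.e. the reconstructed 40-dim features.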
        self.trainAE = tensorflow.train.AdamOptimizer(
            learning_rate=learningRate).minimize(self.parameters['Loss_AE'])

        #############################################################################
        # DBLSTM (First and Second BLSTM stacks)
        #############################################################################

        with tensorflow.variable_scope('FirstBLSTM'):
            self.parameters[
                'First_Cell_Forward'] = tensorflow.nn.rnn_cell.MultiRNNCell(
                    cells=[
                        rnn.LSTMCell(num_units=self.hiddenNodules)
                        for _ in range(self.rnnLayers)
                    ],
                    state_is_tuple=True)
            self.parameters[
                'First_Cell_Backward'] = tensorflow.nn.rnn_cell.MultiRNNCell(
                    cells=[
                        rnn.LSTMCell(num_units=self.hiddenNodules)
                        for _ in range(self.rnnLayers)
                    ],
                    state_is_tuple=True)

            self.parameters['First_Output'], self.parameters['First_FinalState'] = \
                tensorflow.nn.bidirectional_dynamic_rnn(
                    cell_fw=self.parameters['First_Cell_Forward'], cell_bw=self.parameters['First_Cell_Backward'],
                    inputs=self.dataInput, sequence_length=self.seqInput, dtype=tensorflow.float32)

        if self.attention is None:
            # Concatenate the top layer's forward/backward hidden states.
            self.parameters['First_FinalOutput'] = tensorflow.concat(
                [
                    self.parameters['First_FinalState'][0][self.rnnLayers - 1].h,
                    self.parameters['First_FinalState'][1][self.rnnLayers - 1].h
                ],
                axis=1)
        else:
            self.firstAttentionList = self.attention(
                dataInput=self.parameters['First_Output'],
                scopeName=self.attentionName + '_DBLTM',
                hiddenNoduleNumber=2 * self.hiddenNodules,
                attentionScope=self.attentionScope,
                blstmFlag=True)
            self.parameters['First_FinalOutput'] = self.firstAttentionList[
                'FinalResult']

        if self.concatType == 'Concat':
            self.parameters['First_Concat'] = tensorflow.concat(
                [
                    self.parameters['First_FinalOutput'],
                    self.attentionList['FinalResult']
                ],
                axis=1,
                name='First_Concat')
        elif self.concatType == 'Plus':
            self.parameters['First_Concat'] = tensorflow.add(
                self.parameters['First_FinalOutput'],
                self.attentionList['FinalResult'],
                name='First_Plus')
        elif self.concatType == 'Multiply':
            self.parameters['First_Concat'] = tensorflow.multiply(
                self.parameters['First_FinalOutput'],
                self.attentionList['FinalResult'],
                name='First_Multiply')
        else:
            raise NotImplementedError(
                'Concat type: {} is not yet implemented'.format(self.concatType))
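        # Shape note: 'Concat' doubles the feature width, while 'Plus' and
        # 'Multiply' require First_FinalOutput and the attention result to have
        # identical shapes (both are intended to be 2 * hiddenNodules wide here).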

        with tensorflow.variable_scope('SecondBLSTM'):
            self.parameters[
                'Second_Cell_Forward'] = tensorflow.nn.rnn_cell.MultiRNNCell(
                    cells=[
                        rnn.LSTMCell(num_units=self.hiddenNodules)
                        for _ in range(self.rnnLayers)
                    ],
                    state_is_tuple=True)
            self.parameters[
                'Second_Cell_Backward'] = tensorflow.nn.rnn_cell.MultiRNNCell(
                    cells=[
                        rnn.LSTMCell(num_units=self.hiddenNodules)
                        for _ in range(self.rnnLayers)
                    ],
                    state_is_tuple=True)

            self.parameters['Second_Output'], self.parameters['Second_FinalState'] = \
                tensorflow.nn.bidirectional_dynamic_rnn(
                    cell_fw=self.parameters['Second_Cell_Forward'], cell_bw=self.parameters['Second_Cell_Backward'],
                    inputs=self.parameters['First_Concat'][tensorflow.newaxis, :, :], dtype=tensorflow.float32)
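            # Note: First_Concat is [batch_size, features]; the newaxis above turns
            # it into a single sequence of length batch_size, so this second BLSTM
            # runs across utterances rather than across time.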

        if self.secondAttention is None:
            self.parameters['Second_FinalOutput'] = tensorflow.concat(
                [
                    self.parameters['Second_FinalState'][0][self.rnnLayers - 1].h,
                    self.parameters['Second_FinalState'][1][self.rnnLayers - 1].h
                ],
                axis=1)
        else:
            self.secondAttentionList = self.secondAttention(
                dataInput=self.parameters['Second_Output'],
                scopeName=self.secondAttentionName,
                hiddenNoduleNumber=2 * self.hiddenNodules,
                attentionScope=self.secondAttentionScope,
                blstmFlag=True)
            self.parameters['Second_FinalOutput'] = self.secondAttentionList[
                'FinalResult']

        self.parameters['FinalPredict'] = tensorflow.reshape(
            tensor=tensorflow.layers.dense(
                inputs=self.parameters['Second_FinalOutput'],
                units=1,
                activation=None,
                name='FinalPredict'),
            shape=[1])

        if self.lossType == 'MSE':
            self.parameters['Loss'] = tensorflow.losses.mean_squared_error(
                labels=self.labelInput,
                predictions=self.parameters['FinalPredict'])
        elif self.lossType == 'RMSE':
            self.parameters['Loss'] = tensorflow.sqrt(
                tensorflow.losses.mean_squared_error(
                    labels=self.labelInput,
                    predictions=self.parameters['FinalPredict']))
        elif self.lossType == 'MAE':
            self.parameters['Loss'] = tensorflow.losses.absolute_difference(
                labels=self.labelInput,
                predictions=self.parameters['FinalPredict'])
        else:
            raise NotImplementedError(
                'Loss type: {} is not yet implemented'.format(self.lossType))

        self.train = tensorflow.train.AdamOptimizer(
            learning_rate=learningRate).minimize(
                self.parameters['Loss'],
                var_list=tensorflow.global_variables()
                [NETWORK_LENGTH[self.attentionName]:])
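        # var_list trick above: tensorflow.global_variables() returns variables in
        # creation order, so slicing from NETWORK_LENGTH[self.attentionName]
        # presumably skips the variables created by the pretrained autoencoder
        # part, leaving them frozen under this optimizer.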
    def create_model_predict(self, input, mode='decode'):
        use_beam_search = self.params.beam_with > 1
        with tf.variable_scope("attetnion_seq2seq", reuse=tf.AUTO_REUSE):
            embeddings_matrix = self._create_embedding()

            keep_prob = 1 - self.params.dropout_rate
            batch_size = tf.shape(input)[0]
            # encoder
            encoder_outputs, encoder_last_states, encoder_inputs_length = self._create_encoder(
                embeddings_matrix, input, keep_prob)

            # decoder
            with tf.variable_scope('decoder'):
                # Output projection layer to convert cell outputs to logits
                output_layer = Dense(self.params.vocab_size,
                                     name='output_project')
                input_layer = Dense(self.params.hidden_units * 2,
                                    dtype=tf.float32,
                                    name='input_projection')
                decoder_cell, decoder_initial_state = create_decoder_cell(
                    enc_outputs=encoder_outputs,
                    enc_states=encoder_last_states,
                    enc_seq_len=encoder_inputs_length,
                    num_layers=self.params.depth,
                    num_units=self.params.hidden_units * 2,
                    keep_prob=keep_prob,
                    use_residual=self.params.use_residual,
                    use_beam_search=use_beam_search,
                    beam_size=self.params.beam_with,
                    batch_size=batch_size,
                    top_attention=self.params.top_attention)

                # Start_tokens: [batch_size,] `int32` vector
                start_tokens = tf.ones([
                    batch_size,
                ], tf.int32) * data_utils.GO_ID
                end_token = data_utils.EOS_ID

                def embed_and_input_proj(inputs):
                    return input_layer(
                        tf.nn.embedding_lookup(embeddings_matrix, inputs))
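                # The input projection maps embedded tokens up to the decoder's
                # width (hidden_units * 2, matching the bidirectional encoder)
                # before they are fed to each decode step.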

                if self.params.beam_with <= 1:
                    decode_helper = seq2seq.GreedyEmbeddingHelper(
                        start_tokens=start_tokens,
                        end_token=end_token,
                        embedding=embed_and_input_proj)
                    inference_decoder = seq2seq.BasicDecoder(
                        cell=decoder_cell,
                        helper=decode_helper,
                        initial_state=decoder_initial_state,
                        output_layer=output_layer)
                    decoder_output, _, _ = seq2seq.dynamic_decode(
                        decoder=inference_decoder,
                        output_time_major=False,
                        impute_finished=True,
                        maximum_iterations=self.params.max_seq_length)
                else:
                    inference_decoder = seq2seq.BeamSearchDecoder(
                        cell=decoder_cell,
                        embedding=embed_and_input_proj,
                        start_tokens=start_tokens,
                        end_token=end_token,
                        initial_state=decoder_initial_state,
                        beam_width=self.params.beam_with,
                        output_layer=output_layer)

                    # BeamSearchDecoder does not support impute_finished, so the
                    # beam path decodes without it.
                    decoder_output, _, _ = seq2seq.dynamic_decode(
                        decoder=inference_decoder,
                        output_time_major=False,
                        maximum_iterations=self.params.max_seq_length)

                if self.params.beam_with <= 1:
                    decoder_predict = tf.expand_dims(decoder_output.sample_id,
                                                     -1)
                else:
                    decoder_predict = decoder_output.predicted_ids

        decoder_predict = tf.identity(decoder_predict, 'predicts')
        return decoder_predict
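    # Hedged usage sketch (the placeholder and session names below are
    # assumptions, not part of the original example):
    #   predicts = model.create_model_predict(input_ids)            # build graph once
    #   ids = sess.run(predicts, feed_dict={input_ids: batch_ids})  # then decode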
    def build_decoder(self):
        print("building decoder and attention..")
        with tf.variable_scope('decoder'):
            self.decoder_cell, self.decoder_initial_state = self.build_decoder_cell()

            initializer = tf.contrib.layers.xavier_initializer(seed=0, dtype=self.dtype)
            
            self.decoder_embeddings = tf.get_variable(name='embedding',
                shape=[self.num_decoder_symbols, self.decoder_embedding_size],
                initializer=initializer, dtype=self.dtype)

            input_layer = Dense(self.decoder_hidden_units, dtype=self.dtype, name='input_projection')
            output_layer = Dense(self.num_decoder_symbols, name='output_projection')

            if self.mode == 'train':
                self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                    params=self.decoder_embeddings, ids=self.decoder_inputs_train)
               
                self.decoder_inputs_embedded = input_layer(self.decoder_inputs_embedded)

                training_helper = seq2seq.TrainingHelper(inputs=self.decoder_inputs_embedded,
                                                   sequence_length=self.decoder_inputs_length_train,
                                                   time_major=False,
                                                   name='training_helper')

                training_decoder = seq2seq.BasicDecoder(cell=self.decoder_cell,
                                                   helper=training_helper,
                                                   initial_state=self.decoder_initial_state,
                                                   output_layer=output_layer)
                                                   #output_layer=None)
                    
                max_decoder_length = tf.reduce_max(self.decoder_inputs_length_train)

                (self.decoder_outputs_train, self.decoder_last_state_train, 
                 self.decoder_outputs_length_train) = (seq2seq.dynamic_decode(
                    decoder=training_decoder,
                    output_time_major=False,
                    impute_finished=True,
                    maximum_iterations=max_decoder_length))
                 
                self.decoder_logits_train = tf.identity(self.decoder_outputs_train.rnn_output) 
                self.decoder_pred_train = tf.argmax(self.decoder_logits_train, axis=-1,
                                                    name='decoder_pred_train')

                masks = tf.sequence_mask(lengths=self.decoder_inputs_length_train, 
                                         maxlen=max_decoder_length, dtype=self.dtype, name='masks')

                self.loss = seq2seq.sequence_loss(logits=self.decoder_logits_train, 
                                                  targets=self.decoder_targets_train,
                                                  weights=masks,
                                                  average_across_timesteps=True,
                                                  average_across_batch=True,)

                tf.summary.scalar('loss', self.loss)

                # Construct graphs for minimizing loss
                self.init_optimizer()

            elif self.mode == 'decode':
        
                # Start_tokens: [batch_size,] `int32` vector
                start_tokens = tf.ones([self.batch_size,], tf.int32) * data_utils.start_token
                end_token = data_utils.end_token

                def embed_and_input_proj(inputs):
                    return input_layer(tf.nn.embedding_lookup(self.decoder_embeddings, inputs))
                    
                if not self.use_beamsearch_decode:
                    # Helper to feed inputs for greedy decoding: uses the argmax of the output
                    decoding_helper = seq2seq.GreedyEmbeddingHelper(start_tokens=start_tokens,
                                                                    end_token=end_token,
                                                                    embedding=embed_and_input_proj)
                    # Basic decoder performs greedy decoding at each time step
                    print("building greedy decoder..")
                    inference_decoder = seq2seq.BasicDecoder(cell=self.decoder_cell,
                                                             helper=decoding_helper,
                                                             initial_state=self.decoder_initial_state,
                                                             output_layer=output_layer)
                else:
                    # Beamsearch is used to approximately find the most likely translation
                    print("building beamsearch decoder..")
                    inference_decoder = beam_search_decoder.BeamSearchDecoder(cell=self.decoder_cell,
                                                               embedding=embed_and_input_proj,
                                                               start_tokens=start_tokens,
                                                               end_token=end_token,
                                                               initial_state=self.decoder_initial_state,
                                                               beam_width=self.beam_width,
                                                               output_layer=output_layer,)
                # For GreedyDecoder, return
                # decoder_outputs_decode: BasicDecoderOutput instance
                #                         namedtuple(rnn_outputs, sample_id)
                # decoder_outputs_decode.rnn_output: [batch_size, max_time_step, num_decoder_symbols] if output_time_major=False
                #                                    [max_time_step, batch_size, num_decoder_symbols] if output_time_major=True
                # decoder_outputs_decode.sample_id:  [batch_size, max_time_step], tf.int32 if output_time_major=False
                #                                    [max_time_step, batch_size], tf.int32 if output_time_major=True
                
                # For BeamSearchDecoder, return
                # decoder_outputs_decode: FinalBeamSearchDecoderOutput instance
                #                         namedtuple(predicted_ids, beam_search_decoder_output)
                # decoder_outputs_decode.predicted_ids: [batch_size, max_time_step, beam_width] if output_time_major=False
                #                                       [max_time_step, batch_size, beam_width] if output_time_major=True
                # decoder_outputs_decode.beam_search_decoder_output: BeamSearchDecoderOutput instance
                #                                                    namedtuple(scores, predicted_ids, parent_ids)

                (self.decoder_outputs_decode, self.decoder_last_state_decode,
                 self.decoder_outputs_length_decode) = (seq2seq.dynamic_decode(
                    decoder=inference_decoder,
                    output_time_major=False,
                    #impute_finished=True,	# error occurs
                    maximum_iterations=self.max_decode_step))

                ### get alignment from decoder_last_state
                if self.use_attention:
                    self.alignment = self.decoder_last_state_decode[0].alignment_history.stack()
                else:
                    self.alignment = []
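                # alignment_history is only populated when the decoder cell is an
                # AttentionWrapper created with alignment_history=True; stack()
                # converts its TensorArray into [decode_steps, batch_size, encoder_time].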

                if not self.use_beamsearch_decode:
                    # decoder_outputs_decode.sample_id: [batch_size, max_time_step]
                    # Or use argmax to find decoder symbols to emit:
                    # self.decoder_pred_decode = tf.argmax(self.decoder_outputs_decode.rnn_output,
                    #                                      axis=-1, name='decoder_pred_decode')

                    # Here, we use expand_dims to be compatible with the result of the beamsearch decoder
                    # decoder_pred_decode: [batch_size, max_time_step, 1] (output_major=False)
                    self.decoder_pred_decode = tf.expand_dims(self.decoder_outputs_decode.sample_id, -1)

                else:
                    # Use beam search to approximately find the most likely translation
                    # decoder_pred_decode: [batch_size, max_time_step, beam_width] (output_major=False)
                    self.decoder_pred_decode = self.decoder_outputs_decode.predicted_ids