Example #1
    def _build_bidirectional_rnn(self, inputs, dtype, hparams, num_bi_layers):
        # Construct forward and backward cells.
        # Each one has num_bi_layers layers, and each layer has num_units units.
        fw_cell = model_helper.create_rnn_cell(
            hparams.unit_type, hparams.num_units, num_bi_layers,
            hparams.forget_bias, hparams.in_to_hidden_dropout, self.mode)
        bw_cell = model_helper.create_rnn_cell(
            hparams.unit_type, hparams.num_units, num_bi_layers,
            hparams.forget_bias, hparams.in_to_hidden_dropout, self.mode)

        # initial_state_fw and initial_state_bw default to zero states.
        # bi_outputs is a tuple (output_fw, output_bw) with the forward and backward RNN output Tensors.
        # bi_state is a tuple (output_state_fw, output_state_bw) with the forward and backward final states;
        # each state has num_units units.
        # If num_bi_layers > 1, each of these is itself a tuple of num_bi_layers per-layer states.
        bi_outputs, bi_state = tf.nn.bidirectional_dynamic_rnn(
            fw_cell,
            bw_cell,
            inputs,
            dtype=dtype,
            sequence_length=self.input_sequence_length,
            time_major=self.time_major)

        # Return the concatenated fw and bw outputs, i.e., ([h1_fw;h1_bw], ..., [hT_fw;hT_bw]).
        return tf.concat(bi_outputs, -1), bi_state
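The examples on this page call model_helper.create_rnn_cell positionally with (unit_type, num_units, num_layers, forget_bias, dropout, mode). The helper itself is not shown here; a minimal sketch consistent with that call, assuming LSTM/GRU cells and train-only dropout (not the actual NMT implementation, which takes more options), could look like:

    import tensorflow as tf

    def create_rnn_cell(unit_type, num_units, num_layers, forget_bias, dropout, mode):
        """Hypothetical sketch of the helper used above; the real one differs."""
        def single_cell():
            if unit_type == "lstm":
                cell = tf.contrib.rnn.BasicLSTMCell(num_units, forget_bias=forget_bias)
            elif unit_type == "gru":
                cell = tf.contrib.rnn.GRUCell(num_units)
            else:
                raise ValueError("Unknown unit_type %s" % unit_type)
            # Apply dropout to the cell inputs only while training.
            if mode == tf.contrib.learn.ModeKeys.TRAIN and dropout > 0.0:
                cell = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=1.0 - dropout)
            return cell
        if num_layers == 1:
            return single_cell()
        return tf.contrib.rnn.MultiRNNCell([single_cell() for _ in range(num_layers)])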
Example #2
    def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state,
                            source_sequence_length):
        """Build an RNN cell that can be used by decoder."""
        # We only make use of encoder_outputs in attention-based models
        if hparams.attention:
            raise ValueError("BasicModel doesn't support attention.")

        cell = model_helper.create_rnn_cell(
            unit_type=hparams.unit_type,
            num_units=hparams.num_units,
            num_layers=self.num_decoder_layers,
            num_residual_layers=self.num_decoder_residual_layers,
            forget_bias=hparams.forget_bias,
            dropout=hparams.dropout,
            num_gpus=self.num_gpus,
            mode=self.mode,
            single_cell_fn=self.single_cell_fn)

        # For beam search, we need to replicate the encoder info beam_width times
        if self.mode == tf.contrib.learn.ModeKeys.INFER and hparams.beam_width > 0:
            decoder_initial_state = tf.contrib.seq2seq.tile_batch(
                encoder_state, multiplier=hparams.beam_width)
        else:
            decoder_initial_state = encoder_state

        return cell, decoder_initial_state
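tf.contrib.seq2seq.tile_batch repeats each batch entry beam_width times so every beam hypothesis gets its own copy of the encoder state. A tiny illustrative check (shapes assumed):

    state = tf.constant([[1.0, 2.0], [3.0, 4.0]])               # batch_size = 2
    tiled = tf.contrib.seq2seq.tile_batch(state, multiplier=3)  # shape [6, 2]
    # Rows come out grouped per source entry: b0, b0, b0, b1, b1, b1.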
Example #3
  def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state,
                          source_sequence_length, base_gpu=0):
    """Build an RNN cell that can be used by decoder."""
    # We only make use of encoder_outputs in attention-based models
    if hparams.attention:
      raise ValueError("BasicModel doesn't support attention.")

    cell = model_helper.create_rnn_cell(
        unit_type=hparams.unit_type,
        num_units=self.num_units,
        num_layers=self.num_decoder_layers,
        num_residual_layers=self.num_decoder_residual_layers,
        forget_bias=hparams.forget_bias,
        dropout=hparams.dropout,
        num_gpus=self.num_gpus,
        mode=self.mode,
        single_cell_fn=self.single_cell_fn,
        base_gpu=base_gpu
    )

    if hparams.language_model:
      encoder_state = cell.zero_state(self.batch_size, self.dtype)
    elif not hparams.pass_hidden_state:
      raise ValueError("For non-attentional model, "
                       "pass_hidden_state needs to be set to True")

    # For beam search, we need to replicate the encoder info beam_width times
    if (self.mode == tf.contrib.learn.ModeKeys.INFER and
        hparams.infer_mode == "beam_search"):
      decoder_initial_state = tf.contrib.seq2seq.tile_batch(
          encoder_state, multiplier=hparams.beam_width)
    else:
      decoder_initial_state = encoder_state

    return cell, decoder_initial_state
Example #4
  def _build_all_encoder_layers(self, bi_encoder_outputs,
                                num_uni_layers, dtype, hparams):
    """Build encoder layers all at once."""
    uni_cell = model_helper.create_rnn_cell(
        unit_type=hparams.unit_type,
        num_units=hparams.num_units,
        num_layers=num_uni_layers,
        num_residual_layers=self.num_encoder_residual_layers,
        forget_bias=hparams.forget_bias,
        dropout=hparams.dropout,
        num_gpus=self.num_gpus,
        base_gpu=1,
        mode=self.mode,
        single_cell_fn=self.single_cell_fn)
    encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
        uni_cell,
        bi_encoder_outputs,
        dtype=dtype,
        sequence_length=self.iterator.source_sequence_length,
        time_major=self.time_major)

    # Use the top layer for now
    self.encoder_state_list = [encoder_outputs]

    return encoder_state, encoder_outputs
Example #5
  def _build_all_encoder_layers(self, bi_encoder_outputs,
                                num_uni_layers, dtype, hparams):
    """Build encoder layers all at once."""
    uni_cell = model_helper.create_rnn_cell(
        unit_type=hparams.unit_type,
        num_units=hparams.num_units,
        num_layers=num_uni_layers,
        num_residual_layers=self.num_encoder_residual_layers,
        forget_bias=hparams.forget_bias,
        dropout=hparams.dropout,
        mode=self.mode,
        single_cell_fn=self.single_cell_fn,
        global_step=self.global_step)
    if hparams.use_dynamic_rnn:
      encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
          uni_cell,
          bi_encoder_outputs,
          dtype=dtype,
          sequence_length=self.features["source_sequence_length"],
          time_major=self.time_major)
    else:
      encoder_outputs, encoder_state = tf.contrib.recurrent.functional_rnn(
          uni_cell,
          bi_encoder_outputs,
          dtype=dtype,
          sequence_length=self.features["source_sequence_length"],
          time_major=self.time_major,
          use_tpu=hparams.use_tpu)

    # Use the top layer for now
    self.encoder_state_list = [encoder_outputs]

    return encoder_state, encoder_outputs
Example #6
    def _build_encoder_cell(self, hparams, num_layers):
        return model_helper.create_rnn_cell(unit_type=hparams.unit_type,
                                            num_units=hparams.num_units,
                                            num_layers=num_layers,
                                            forget_bias=hparams.forget_bias,
                                            dropout=hparams.dropout,
                                            mode=self.mode,
                                            single_cell_fn=self.single_cell_fn)
Example #7
    def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state, source_sequence_length):
        """Build a RNN cell with attention mechanism that can be used by decoder."""
        attention_option = hparams.attention
        attention_architecture = hparams.attention_architecture

        if attention_architecture != "standard":
            raise ValueError(
                "Unknown attention architecture %s" % attention_architecture)

        num_units = hparams.num_units
        num_layers = hparams.num_layers
        num_residual_layers = hparams.num_residual_layers
        beam_width = hparams.beam_width

        dtype = tf.float32

        if self.time_major:
            memory = tf.transpose(encoder_outputs, [1, 0, 2])
        else:
            memory = encoder_outputs

        if self.mode == tf.contrib.learn.ModeKeys.INFER and beam_width > 0:
            memory = tf.contrib.seq2seq.tile_batch(memory, multiplier=beam_width)
            source_sequence_length = tf.contrib.seq2seq.tile_batch(source_sequence_length, multiplier=beam_width)
            encoder_state = tf.contrib.seq2seq.tile_batch(encoder_state, multiplier=beam_width)
            batch_size = self.batch_size * beam_width
        else:
            batch_size = self.batch_size

        attention_mechanism = create_attention_mechanism(attention_option, num_units, memory, source_sequence_length)

        cell = model_helper.create_rnn_cell(unit_type=hparams.unit_type,
                                            num_units=num_units,
                                            num_layers=num_layers,
                                            num_residual_layers=num_residual_layers,
                                            forget_bias=hparams.forget_bias,
                                            dropout=hparams.dropout,
                                            base_gpu=hparams.base_gpu,
                                            mode=self.mode,
                                            single_cell_fn=self.single_cell_fn)

        # Only generate alignment in greedy INFER mode.
        alignment_history = (self.mode == tf.contrib.learn.ModeKeys.INFER and beam_width == 0)

        cell = tf.contrib.seq2seq.AttentionWrapper(cell, attention_mechanism, attention_layer_size=num_units,
                                                   alignment_history=alignment_history, name="attention")

        cell = tf.contrib.rnn.DeviceWrapper(cell, model_helper.get_device_str(hparams.base_gpu))

        if hparams.pass_hidden_state:
            decoder_initial_state = cell.zero_state(batch_size, dtype).clone(cell_state=encoder_state)
        else:
            decoder_initial_state = cell.zero_state(batch_size, dtype)

        return cell, decoder_initial_state
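Example #7 depends on a create_attention_mechanism helper that is not shown on this page. A plausible sketch, assuming it simply dispatches attention_option to the standard contrib mechanisms (the real helper may differ):

    def create_attention_mechanism(attention_option, num_units, memory,
                                   source_sequence_length):
        """Hypothetical dispatcher over the stock contrib attention classes."""
        if attention_option == "luong":
            return tf.contrib.seq2seq.LuongAttention(
                num_units, memory, memory_sequence_length=source_sequence_length)
        elif attention_option == "scaled_luong":
            return tf.contrib.seq2seq.LuongAttention(
                num_units, memory,
                memory_sequence_length=source_sequence_length, scale=True)
        elif attention_option == "bahdanau":
            return tf.contrib.seq2seq.BahdanauAttention(
                num_units, memory, memory_sequence_length=source_sequence_length)
        raise ValueError("Unknown attention option %s" % attention_option)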
Example #8
    def _build_encoder_cell(self,
                            hparams,
                            num_layers,
                            num_residual_layers,
                            base_gpu=0):
        """Build a multi-layer RNN cell that can be used by encoder."""

        if hparams.model == 'model3':
            if hparams.mann == 'ntm':
                return NTMCell(hparams.num_layers,
                               hparams.num_units,
                               use_att_memory=False,
                               att_memory=False,
                               att_memory_size=None,
                               att_memory_vector_dim=None,
                               use_ext_memory=True,
                               ext_memory_size=hparams.num_memory_locations,
                               ext_memory_vector_dim=hparams.memory_unit_size,
                               ext_read_head_num=hparams.read_heads,
                               ext_write_head_num=hparams.write_heads,
                               dropout=hparams.dropout,
                               batch_size=hparams.batch_size,
                               mode=self.mode,
                               shift_range=1,
                               output_dim=hparams.num_units,
                               reuse=False,
                               record_w_history=hparams.record_w_history)
            elif hparams.mann == 'dnc':
                access_config = {
                    'memory_size': hparams.num_memory_locations,
                    'word_size': hparams.memory_unit_size,
                    'num_reads': hparams.read_heads,
                    'num_writes': hparams.write_heads
                }
                controller_config = {
                    'num_units': hparams.num_units,
                    'num_layers': hparams.num_layers
                }

                return DNC(access_config, controller_config, hparams.num_units,
                           20, hparams.dropout, self.mode, hparams.batch_size)
        else:
            return model_helper.create_rnn_cell(
                unit_type=hparams.unit_type,
                num_units=hparams.num_units,
                num_layers=num_layers,
                num_residual_layers=num_residual_layers,
                forget_bias=hparams.forget_bias,
                dropout=hparams.dropout,
                num_gpus=hparams.num_gpus,
                mode=self.mode,
                base_gpu=base_gpu,
                single_cell_fn=self.single_cell_fn,
                num_proj=None)
Example #9
    def create_rnn_cell(self):
        """Create an RNN cell."""
        return model_helper.create_rnn_cell(
            num_layers=self.rnn_num_layers,
            cell_type=self.cell_type,
            num_units=self.num_units,
            dropout=self.dropout,
            mode=self.mode,
            residual_connect=self.residual_connect,
            residual_fn=None,
            num_gpus=self.num_gpus)
Example #10
    def _build_encoder_cell(self, hparams, num_layers, base_gpu=0):
        """Build a multi-layer RNN cell that can be used by encoder."""

        return model_helper.create_rnn_cell(unit_type=hparams.unit_type,
                                            num_units=hparams.num_units,
                                            num_layers=num_layers,
                                            forget_bias=hparams.forget_bias,
                                            dropout=hparams.dropout,
                                            mode=self.mode,
                                            base_gpu=base_gpu,
                                            single_cell_fn=self.single_cell_fn)
Example #11
 def _build_encoder_cell(self, hparams, num_layers, num_residual_layers,
                         dtype=None):
   """Build a multi-layer RNN cell that can be used by encoder."""
   return model_helper.create_rnn_cell(
       unit_type=hparams.unit_type,
       num_units=self.num_units,
       num_layers=num_layers,
       num_residual_layers=num_residual_layers,
       forget_bias=hparams.forget_bias,
       dropout=hparams.dropout,
       mode=self.mode,
       dtype=dtype,
       single_cell_fn=self.single_cell_fn,
       use_block_lstm=hparams.use_block_lstm)
Example #12
def _build_encoder_cell(model,
                        hparams,
                        num_layers,
                        num_residual_layers,
                        base_gpu=0,
                        all_layer_outputs=False):
    """multi rnn cell for the seq2seq encoder."""
    return model_helper.create_rnn_cell(
        num_units=hparams.num_units,
        num_layers=num_layers,
        num_residual_layers=num_residual_layers,
        dropout=hparams.dropout,
        num_gpus=hparams.num_gpus,
        mode=model.mode,
        base_gpu=base_gpu,
        single_cell_fn=model.single_cell_fn,
        all_layer_outputs=all_layer_outputs)
Example #13
    def _build_rnn(self, hparams):
        if self.time_major:
            self.inputs = tf.transpose(self.inputs)

        emb_inp = tf.nn.embedding_lookup(self.input_embedding, self.inputs)
        last_hidden_state = []
        # RNN outputs: [max_time, batch_size, num_units]
        with tf.variable_scope("rnn") as scope:
            dtype = scope.dtype
            # Look up embedding, emb_inp: [max_time, batch_size, num_units]
            if hparams.rnn_type == "uni":
                cell = model_helper.create_rnn_cell(
                    hparams.unit_type, hparams.num_units, hparams.num_layers,
                    hparams.forget_bias, hparams.in_to_hidden_dropout,
                    self.mode)
                # last_hidden_state --> a Tensor of shape [batch_size, cell.state_size], or a tuple of such Tensors for multi-layer cells
                _, last_hidden_state = tf.nn.dynamic_rnn(
                    cell,
                    emb_inp,
                    dtype=dtype,
                    sequence_length=self.input_sequence_length,
                    time_major=self.time_major)
            elif hparams.rnn_type == "bi":
                num_bi_layers = int(hparams.num_layers / 2)
                print("num_bi_layers %d" % num_bi_layers)
                _, bi_last_hidden_state = self._build_bidirectional_rnn(
                    emb_inp, dtype, hparams, num_bi_layers)
                # If each direction has one layer, the encoder has one forward and one
                # backward layer (two in total), each with num_units units, so the
                # concatenated state has 2 * num_units.
                if num_bi_layers == 1:
                    last_hidden_state = bi_last_hidden_state
                else:
                    # Alternately concatenate forward and backward states, layer by layer.
                    last_hidden_state = []
                    for layer_id in range(num_bi_layers):
                        # bi_last_hidden_state[0] holds the num_bi_layers forward states.
                        last_hidden_state.append(
                            bi_last_hidden_state[0][layer_id])  # forward
                        # bi_last_hidden_state[1] holds the num_bi_layers backward states.
                        last_hidden_state.append(
                            bi_last_hidden_state[1][layer_id])  # backward
                    # Convert to a tuple once all layers have been collected.
                    last_hidden_state = tuple(last_hidden_state)
            else:
                raise ValueError("Unknown rnn type: %s" % hparams.rnn_type)
        return last_hidden_state
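The else-branch above interleaves the per-layer final states as (fw_0, bw_0, fw_1, bw_1, ...). The same reordering in plain Python, for illustration:

    fw_states = ("fw0", "fw1")   # bi_last_hidden_state[0], num_bi_layers = 2
    bw_states = ("bw0", "bw1")   # bi_last_hidden_state[1]
    interleaved = []
    for layer_id in range(2):
        interleaved.append(fw_states[layer_id])
        interleaved.append(bw_states[layer_id])
    print(tuple(interleaved))    # ('fw0', 'bw0', 'fw1', 'bw1')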
Example #14
    def build_rnn(self, hparams):
        # Look up embedding: emb_inp.shape = [batch_size, max_seq_length, num_units]
        emb_inp = tf.nn.embedding_lookup(self.input_embedding, self.inputs)
        # rnn_outputs.shape = [batch_size, max_seq_length, num_units]
        with tf.variable_scope("rnn") as scope:
            dtype = scope.dtype
            cell = model_helper.create_rnn_cell(
                hparams.unit_type, hparams.num_units, hparams.num_layers,
                hparams.forget_bias, hparams.in_to_hidden_dropout, self.mode)
            # last_hidden_state --> a Tensor of shape [batch_size, num_units], or a tuple of such Tensors for multi-layer cells
            # rnn_outputs --> a Tensor of shape [batch_size, max_seq_length, num_units]
            # sequence_length zeroes out outputs past each batch element's true sequence length
            rnn_outputs, last_hidden_state = tf.nn.dynamic_rnn(
                cell,
                emb_inp,
                dtype=dtype,
                sequence_length=self.input_sequence_length)
        return rnn_outputs, last_hidden_state
Example #15
    def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state,
                            source_sequence_length):
        cell = model_helper.create_rnn_cell(
            unit_type=hparams.unit_type,
            num_units=hparams.num_units,
            num_layers=hparams.num_decoder_layers,
            forget_bias=hparams.forget_bias,
            dropout=hparams.dropout,
            mode=self.mode,
            single_cell_fn=self.single_cell_fn)

        # For beam search, we need to replicate the encoder info beam_width times
        if self.mode == tf.estimator.ModeKeys.PREDICT and hparams.beam_width > 0:
            decoder_initial_state = tf.contrib.seq2seq.tile_batch(
                encoder_state, multiplier=hparams.beam_width)
        else:
            decoder_initial_state = encoder_state

        return cell, decoder_initial_state
Example #16
 def _build_encoder_cell(self,
                         hparams,
                         num_layers,
                         num_residual_layers,
                         fast_reverse=False,
                         reverse=False):
   """Build a multi-layer RNN cell that can be used by encoder."""
   mlperf_log.gnmt_print(key=mlperf_log.MODEL_HP_DROPOUT,
                         value=hparams.dropout)
   return model_helper.create_rnn_cell(
       unit_type=hparams.unit_type,
       num_units=self.num_units,
       num_layers=num_layers,
       num_residual_layers=num_residual_layers,
       forget_bias=hparams.forget_bias,
       dropout=hparams.dropout,
       mode=self.mode,
       single_cell_fn=self.single_cell_fn,
       global_step=self.global_step,
       fast_reverse=fast_reverse,
       seq_len=self.features["source_sequence_length"] if reverse else None)
Example #17
def _build_action_decoder_cell(model, hparams, encoder_state, base_gpu):
    """Decoder cell constructor for action states."""
    num_residual_layers = hparams.num_residual_layers
    cell = model_helper.create_rnn_cell(
        num_units=hparams.num_units,
        num_layers=1,
        num_residual_layers=num_residual_layers,
        dropout=hparams.dropout,
        num_gpus=hparams.num_gpus,
        mode=model.mode,
        single_cell_fn=model.single_cell_fn,
        base_gpu=base_gpu)

    # For beam search, we need to replicate the encoder info beam_width times
    if model.mode == tf.contrib.learn.ModeKeys.INFER and hparams.beam_width > 0:
        decoder_initial_state = tf.contrib.seq2seq.tile_batch(
            encoder_state[-1], multiplier=hparams.beam_width)
    else:
        decoder_initial_state = encoder_state[-1]

    return cell, decoder_initial_state
Example #18
def _build_decoder_cell(model, hparams, encoder_state, base_gpu):
  """Multi-layer RNN cell for the seq2seq decoder."""

  num_layers = hparams.num_layers
  num_residual_layers = hparams.num_residual_layers
  cell = model_helper.create_rnn_cell(
      num_units=hparams.num_units,
      num_layers=num_layers,
      num_residual_layers=num_residual_layers,
      dropout=hparams.dropout,
      num_gpus=hparams.num_gpus,
      mode=model.mode,
      single_cell_fn=model.single_cell_fn,
      base_gpu=base_gpu)

  # For beam search, we need to replicate the encoder info beam_width times
  if model.mode == tf.estimator.ModeKeys.PREDICT and hparams.beam_width > 0:
    decoder_initial_state = seq2seq.tile_batch(
        encoder_state, multiplier=hparams.beam_width)
  else:
    decoder_initial_state = encoder_state

  return cell, decoder_initial_state
Example #19
    def _build_encoder(self, hparams):
        """Build a GNMT encoder."""
        if hparams.encoder_type == "uni" or hparams.encoder_type == "bi":
            return super(GNMTModel, self)._build_encoder(hparams)

        if hparams.encoder_type != "gnmt":
            raise ValueError("Unknown encoder_type %s" % hparams.encoder_type)

        # Build GNMT encoder.
        num_layers = hparams.num_layers
        num_residual_layers = hparams.num_residual_layers
        num_bi_layers = 1
        num_uni_layers = num_layers - num_bi_layers
        utils.print_out("  num_bi_layers = %d" % num_bi_layers)
        utils.print_out("  num_uni_layers = %d" % num_uni_layers)

        iterator = self.iterator
        source = iterator.source
        if self.time_major:
            source = tf.transpose(source)

        with tf.variable_scope("encoder") as scope:
            dtype = scope.dtype

            # Look up embedding, emb_inp: [max_time, batch_size, num_units]
            #   when time_major = True
            encoder_emb_inp = tf.nn.embedding_lookup(self.embedding_encoder,
                                                     source)

            # Execute _build_bidirectional_rnn from Model class
            bi_encoder_outputs, bi_encoder_state = self._build_bidirectional_rnn(
                inputs=encoder_emb_inp,
                sequence_length=iterator.source_sequence_length,
                dtype=dtype,
                hparams=hparams,
                num_bi_layers=num_bi_layers,
                num_bi_residual_layers=0,  # no residual connection
            )

            uni_cell = model_helper.create_rnn_cell(
                unit_type=hparams.unit_type,
                num_units=hparams.num_units,
                num_layers=num_uni_layers,
                num_residual_layers=num_residual_layers,
                forget_bias=hparams.forget_bias,
                dropout=hparams.dropout,
                num_gpus=hparams.num_gpus,
                base_gpu=1,
                mode=self.mode,
                single_cell_fn=self.single_cell_fn)

            # encoder_outputs: size [max_time, batch_size, num_units]
            #   when time_major = True
            encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
                uni_cell,
                bi_encoder_outputs,
                dtype=dtype,
                sequence_length=iterator.source_sequence_length,
                time_major=self.time_major)

            # Pass all encoder state except the first bi-directional layer's state to
            # decoder.
            encoder_state = (bi_encoder_state[1], ) + (
                (encoder_state, ) if num_uni_layers == 1 else encoder_state)

        return encoder_outputs, encoder_state
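The final tuple arithmetic drops the forward state of the single bidirectional layer and keeps everything else. For num_uni_layers = 3, for instance, the decoder receives (bw_state, uni_0, uni_1, uni_2); sketched with strings:

    bi_encoder_state = ("fw_state", "bw_state")
    encoder_state = ("uni0", "uni1", "uni2")     # num_uni_layers == 3
    encoder_state = (bi_encoder_state[1],) + encoder_state
    print(encoder_state)  # ('bw_state', 'uni0', 'uni1', 'uni2')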
Example #20
  def _build_encoder_layers_unidi(self, inputs, sequence_length,
                                  num_uni_layers, hparams, dtype):
    """Build encoder layers all at once."""
    encoder_outputs = None
    encoder_state = tuple()

    if hparams.use_fused_lstm:
      for i in range(num_uni_layers):
        if (not np.isclose(hparams.dropout, 0.) and
            self.mode == tf.contrib.learn.ModeKeys.TRAIN):
          cell_inputs = tf.nn.dropout(inputs, keep_prob=1-hparams.dropout)
        else:
          cell_inputs = inputs

        cell = block_lstm.LSTMBlockFusedCell(
            hparams.num_units, hparams.forget_bias, dtype=dtype)
        encoder_outputs, (final_c, final_h) = cell(
            cell_inputs,
            dtype=dtype,
            sequence_length=sequence_length)
        encoder_state += (tf.nn.rnn_cell.LSTMStateTuple(final_c, final_h),)
        if i >= num_uni_layers - self.num_encoder_residual_layers:
          # Add the pre-dropout inputs. Residual wrapper is applied after
          # dropout wrapper.
          encoder_outputs += inputs
        inputs = encoder_outputs
    elif hparams.use_cudnn_lstm:
      # Single-layer cudnn RNN; dropout isn't applied inside the kernel.
      for i in range(num_uni_layers):
        if (not np.isclose(hparams.dropout, 0.) and
            self.mode == tf.contrib.learn.ModeKeys.TRAIN):
          inputs = tf.nn.dropout(inputs, keep_prob=1-hparams.dropout)

        encoder_outputs, encoder_states = self._build_unidi_rnn_cudnn(
            inputs,
            None,  # initial_state
            sequence_length,
            dtype,
            hparams,
            1,  # num_layer
            is_fwd=True)
        encoder_state += (tf.nn.rnn_cell.LSTMStateTuple(encoder_states.c,
                                                        encoder_states.h),)
        if i >= num_uni_layers - self.num_encoder_residual_layers:
          encoder_outputs += inputs
        inputs = encoder_outputs
    else:
      uni_cell = model_helper.create_rnn_cell(
          unit_type=hparams.unit_type,
          num_units=hparams.num_units,
          num_layers=num_uni_layers,
          num_residual_layers=self.num_encoder_residual_layers,
          forget_bias=hparams.forget_bias,
          dropout=hparams.dropout,
          dtype=dtype,
          mode=self.mode,
          single_cell_fn=self.single_cell_fn,
          use_block_lstm=hparams.use_block_lstm)

      if hparams.use_dynamic_rnn:
        encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
            uni_cell,
            inputs,
            dtype=dtype,
            sequence_length=sequence_length,
            time_major=self.time_major)
      else:
        encoder_outputs, encoder_state = tf.contrib.recurrent.functional_rnn(
            uni_cell,
            inputs,
            dtype=dtype,
            sequence_length=sequence_length,
            time_major=self.time_major,
            use_tpu=False)

    return encoder_state, encoder_outputs
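Example #20 adds residual connections by hand (encoder_outputs += inputs) because the fused and cudnn paths bypass the usual cell wrappers. With ordinary cells the same effect is normally obtained from tf.contrib.rnn.ResidualWrapper, e.g. (num_units illustrative):

    num_units = 512
    cell = tf.contrib.rnn.BasicLSTMCell(num_units)
    # The wrapper adds the cell input to the cell output elementwise,
    # so the input depth must equal num_units.
    cell = tf.contrib.rnn.ResidualWrapper(cell)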
Example #21
    def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state,
                            source_sequence_length):
        """Build an RNN cell with attention mechanism that can be used by decoder."""
        # No Attention
        if not self.has_attention:
            return super(AttentionModel,
                         self)._build_decoder_cell(hparams, encoder_outputs,
                                                   encoder_state,
                                                   source_sequence_length)
        elif hparams.attention_architecture != "standard":
            raise ValueError("Unknown attention architecture %s" %
                             hparams.attention_architecture)

        num_units = hparams.num_units
        num_layers = self.num_decoder_layers
        #num_residual_layers = self.num_decoder_residual_layers
        infer_mode = hparams.infer_mode

        dtype = tf.float32

        # Ensure memory is batch-major
        if self.time_major:
            memory = tf.transpose(encoder_outputs, [1, 0, 2])
        else:
            memory = encoder_outputs

        if (self.mode == tf.contrib.learn.ModeKeys.INFER
                and infer_mode == "beam_search"):
            memory, source_sequence_length, encoder_state, batch_size = (
                self._prepare_beam_search_decoder_inputs(
                    hparams.beam_width, memory, source_sequence_length,
                    encoder_state))
        else:
            batch_size = self.batch_size

        # Attention
        attention_mechanism = self.attention_mechanism_fn(
            hparams.attention, num_units, memory, source_sequence_length,
            self.mode)

        cell = model_helper.create_rnn_cell(unit_type=hparams.unit_type,
                                            num_units=num_units,
                                            num_layers=num_layers,
                                            forget_bias=hparams.forget_bias,
                                            dropout=hparams.dropout,
                                            num_gpus=self.num_gpus,
                                            mode=self.mode,
                                            single_cell_fn=self.single_cell_fn)

        # Only generate alignment in greedy INFER mode.
        alignment_history = (self.mode == tf.contrib.learn.ModeKeys.INFER
                             and infer_mode != "beam_search")
        cell = tf.contrib.seq2seq.AttentionWrapper(
            cell,
            attention_mechanism,
            attention_layer_size=num_units,
            alignment_history=alignment_history,
            output_attention=hparams.output_attention,
            name="attention")

        # TODO(thangluong): do we need num_layers, num_gpus?
        cell = tf.contrib.rnn.DeviceWrapper(
            cell, model_helper.get_device_str(num_layers - 1, self.num_gpus))

        if hparams.pass_hidden_state:
            decoder_initial_state = cell.zero_state(
                batch_size, dtype).clone(cell_state=encoder_state)
        else:
            decoder_initial_state = cell.zero_state(batch_size, dtype)

        return cell, decoder_initial_state
Example #22
    def _build_graph(self, hparams, scope=None):
        """Construct the train, evaluation, and inference graphs.
        Args:
            hparams: The hyperparameters for configuration
            scope: The variable scope name for this subgraph, default "dynamic_seq2seq"
        Returns:
            A tuple with (logits, loss, metrics, update_ops)
        """

        enc_inputs, dec_inputs, dec_outputs, seq_len = self.iterator.get_next()

        # get the size of the batch
        batch_size = tf.shape(enc_inputs)[0]

        with tf.variable_scope(scope or "dynamic_seq2seq", dtype=tf.float32):
            # create encoder
            dense_input_layer = tf.layers.Dense(hparams.num_units, use_bias=False)

            if hparams.dense_input:
                enc_inputs = dense_input_layer(enc_inputs)

            enc_cells = mdl_help.create_rnn_cell(unit_type=hparams.unit_type,
                                                 num_units=hparams.num_units,
                                                 num_layers=hparams.num_layers,
                                                 depth=hparams.depth,
                                                 num_residual_layers=hparams.num_residual_layers,
                                                 forget_bias=hparams.forget_bias,
                                                 dropout=hparams.dropout,
                                                 mode=self.mode,
                                                 use_highway_as_residual=hparams.use_highway_as_residual)

            # run encoder
            enc_outputs, enc_state = tf.nn.dynamic_rnn(cell=enc_cells,
                                                       inputs=enc_inputs,
                                                       sequence_length=seq_len,
                                                       swap_memory=True,
                                                       dtype=tf.float32,
                                                       scope="encoder")

            tgt_seq_len = tf.add(seq_len, tf.constant(1, tf.int32))

            # TODO: Add Inference decoder
            # create decoder
            dec_cells = mdl_help.create_rnn_cell(unit_type=hparams.unit_type,
                                                 num_units=hparams.num_units,
                                                 num_layers=hparams.num_layers,
                                                 depth=hparams.depth,
                                                 num_residual_layers=hparams.num_residual_layers,
                                                 forget_bias=hparams.forget_bias,
                                                 dropout=hparams.dropout,
                                                 mode=self.mode,
                                                 use_highway_as_residual=hparams.use_highway_as_residual)

            # decoder embedding
            decoder_embedding = tf.get_variable("decoder_embedding",
                                                [hparams.num_labels, hparams.num_units])
            if hparams.dense_input:
                # convert to int32 argmax values for embedding to work
                dec_inputs = tf.argmax(dec_inputs, axis=-1, output_type=tf.int32)
                dec_inputs = tf.nn.embedding_lookup(decoder_embedding, dec_inputs)

            # output projection layer
            projection_layer = tf.layers.Dense(hparams.num_labels, use_bias=False)

            if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
                if hparams.train_helper == "teacher":
                    # teacher forcing
                    helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_inputs,
                                                               sequence_length=tgt_seq_len)
                elif hparams.train_helper == "sched":
                    if hparams.dense_input:
                        embedding = decoder_embedding
                    else:
                        embedding = tf.eye(hparams.num_labels)
                    # scheduled sampling
                    helper = tf.contrib.seq2seq.\
                             ScheduledEmbeddingTrainingHelper(inputs=dec_inputs,
                                                              sequence_length=tgt_seq_len,
                                                              embedding=embedding,
                                                              sampling_probability=self.sample_probability,
                                                              )
            elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
                if hparams.dense_input:
                    embedding = decoder_embedding
                else:
                    embedding = tf.eye(hparams.num_labels)
                helper = tf.contrib.seq2seq.\
                         ScheduledEmbeddingTrainingHelper(inputs=dec_inputs,
                                                          sequence_length=tgt_seq_len,
                                                          embedding=embedding,
                                                          sampling_probability=tf.constant(1.0))

            decoder = tf.contrib.seq2seq.BasicDecoder(cell=dec_cells,
                                                      helper=helper,
                                                      initial_state=enc_state,
                                                      output_layer=projection_layer)

            # run decoder
            final_outputs, final_states, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder=decoder,
                    impute_finished=True,
                    swap_memory=True,
                    scope="decoder")

            logits = final_outputs.rnn_output

            # Mask out entries beyond each sequence's true target length.
            mask = tf.sequence_mask(tgt_seq_len, dtype=tf.float32)

            # Stop gradients from flowing through the labels into the crossent op.
            labels = tf.stop_gradient(dec_outputs)

            crossent = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                                  labels=labels,
                                                                  name="crossent")

            # Average the per-token loss over each sequence's true length,
            # then over the batch.
            loss = tf.reduce_sum(
                (crossent * mask) /
                tf.expand_dims(tf.cast(tgt_seq_len, tf.float32), -1)
            ) / tf.cast(batch_size, tf.float32)

            metrics = []
            update_ops = []
            if self.mode == tf.contrib.learn.ModeKeys.EVAL:
                predictions = tf.argmax(input=logits, axis=-1)
                targets = tf.argmax(input=dec_outputs, axis=-1)
                acc, acc_update = tf.metrics.accuracy(predictions=predictions,
                                                      labels=targets,
                                                      weights=mask)
                # flatten for confusion matrix
                targets_flat = tf.reshape(targets, [-1])
                predictions_flat = tf.reshape(predictions, [-1])
                mask_flat = tf.reshape(mask, [-1])
                cm, cm_update = streaming_confusion_matrix(labels=targets_flat,
                                                           predictions=predictions_flat,
                                                           num_classes=hparams.num_labels,
                                                           weights=mask_flat)
                tf.add_to_collection("eval", cm_summary(cm, hparams.num_labels))
                metrics = [acc, cm]
                update_ops = [acc_update, cm_update]

            return logits, loss, metrics, update_ops
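The loss above normalizes each token's cross-entropy by its sequence's true length and then averages over the batch. A numpy sanity check of the same arithmetic, with made-up values:

    import numpy as np

    crossent = np.array([[0.5, 0.5, 0.0],
                         [1.0, 1.0, 1.0]])   # [batch=2, time=3]
    mask = np.array([[1.0, 1.0, 0.0],
                     [1.0, 1.0, 1.0]])
    tgt_seq_len = np.array([2.0, 3.0])
    loss = np.sum((crossent * mask) / tgt_seq_len[:, None]) / 2.0
    print(loss)  # (1.0/2 + 3.0/3) / 2 = 0.75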
Example #23
    def _build_graph(self, hparams, scope=None):
        """Construct the train, evaluation, and inference graphs.
        Args:
            hparams: The hyperparameters for configuration
            scope: The variable scope name for this subgraph
        Returns:
            A tuple with (logits, loss, metrics, update_ops)
        """

        sample = self.iterator.get_next()

        inputs, tgt_outputs, seq_len = sample

        with tf.variable_scope(scope or "dynamic_bdrnn", dtype=tf.float32):
            # TODO: hidden activations are passed thru FC net
            # TODO: hidden-to-hidden network has skip connections (residual)
            # TODO: initial hidden and cell states are learned

            # create bdrnn
            fw_cells = mdl_help.create_rnn_cell(
                unit_type=hparams.unit_type,
                num_units=hparams.num_units,
                num_layers=hparams.num_layers,
                depth=0,
                num_residual_layers=0,
                forget_bias=hparams.forget_bias,
                dropout=0.,
                mode=self.mode,
                num_gpus=1,
                base_gpu=0)

            bw_cells = mdl_help.create_rnn_cell(
                unit_type=hparams.unit_type,
                num_units=hparams.num_units,
                num_layers=hparams.num_layers,
                depth=0,
                num_residual_layers=0,
                forget_bias=hparams.forget_bias,
                dropout=0.,
                mode=self.mode,
                num_gpus=1,
                base_gpu=0)

            #            print(fw_cells.zero_state(1, dtype=tf.float32))
            #            initial_fw_state = tf.get_variable("initial_fw_state", shape=fw_cells.state_size)
            #            initial_bw_state = tf.get_variable("initial_bw_state", shape=bw_cells.state_size)
            #            initial_fw_state_tiled = tf.tile(initial_fw_state, [hparams.batch_size, 1])
            #            initial_bw_state_tiled = tf.tile(initial_bw_state, [hparams.batch_size, 1])

            # run bdrnn
            outputs, output_states = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=fw_cells,
                cell_bw=bw_cells,
                inputs=inputs,
                sequence_length=seq_len,
                initial_state_fw=None,
                initial_state_bw=None,
                dtype=tf.float32)
            # outputs is a tuple (output_fw, output_bw)
            # output_fw/output_bw are tensors [batch_size, max_time, cell.output_size]
            # outputs_states is a tuple (output_state_fw, output_state_bw) containing final states for
            # forward and backward rnn

            # concatenate the outputs of each direction
            combined_outputs = tf.concat([outputs[0], outputs[1]], axis=-1)

            # dense output layers
            dense1 = tf.layers.dense(inputs=combined_outputs,
                                     units=hparams.num_dense_units,
                                     activation=tf.nn.relu,
                                     use_bias=True)
            drop1 = tf.layers.dropout(
                inputs=dense1,
                rate=hparams.dropout,
                training=self.mode == tf.contrib.learn.ModeKeys.TRAIN)
            dense2 = tf.layers.dense(inputs=drop1,
                                     units=hparams.num_dense_units,
                                     activation=tf.nn.relu,
                                     use_bias=True)
            drop2 = tf.layers.dropout(
                inputs=dense2,
                rate=hparams.dropout,
                training=self.mode == tf.contrib.learn.ModeKeys.TRAIN)

            logits = tf.layers.dense(inputs=drop2,
                                     units=hparams.num_labels,
                                     use_bias=False)

            # Mask out entries beyond each sequence's true length.
            mask = tf.sequence_mask(seq_len, dtype=tf.float32)

            # Stop gradients from flowing through the labels into the crossent op.
            tgt_outputs = tf.stop_gradient(tgt_outputs)

            crossent = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=logits, labels=tgt_outputs, name="crossent")

            # divide loss by batch_size * mean(seq_len)
            loss = (tf.reduce_sum(crossent * mask) /
                    (hparams.batch_size *
                     tf.reduce_mean(tf.cast(seq_len, tf.float32))))

            metrics = []
            update_ops = []
            if self.mode == tf.contrib.learn.ModeKeys.EVAL:
                predictions = tf.argmax(input=logits, axis=-1)
                tgt_labels = tf.argmax(input=tgt_outputs, axis=-1)
                acc, acc_update = tf.metrics.accuracy(predictions=predictions,
                                                      labels=tgt_labels,
                                                      weights=mask)
                # confusion matrix
                targets_flat = tf.reshape(tgt_labels, [-1])
                predictions_flat = tf.reshape(predictions, [-1])
                mask_flat = tf.reshape(mask, [-1])
                cm, cm_update = streaming_confusion_matrix(
                    labels=targets_flat,
                    predictions=predictions_flat,
                    num_classes=hparams.num_labels,
                    weights=mask_flat)
                tf.add_to_collection("eval",
                                     cm_summary(cm, hparams.num_labels))
                metrics = [acc, cm]
                update_ops = [acc_update, cm_update]

            return logits, loss, metrics, update_ops
Example #24
    def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state,
                            source_sequence_length):
        """Build a RNN cell with attention mechanism that can be used by decoder."""
        attention_option = hparams.attention
        attention_architecture = hparams.attention_architecture

        if attention_architecture != "standard":
            raise ValueError("Unknown attention architecture %s" %
                             attention_architecture)

        num_units = hparams.num_units
        num_layers = hparams.num_layers
        num_residual_layers = hparams.num_residual_layers
        num_gpus = hparams.num_gpus
        beam_width = hparams.beam_width

        dtype = tf.float32

        if self.time_major:
            memory = tf.transpose(encoder_outputs, [1, 0, 2])
        else:
            memory = encoder_outputs

        if self.mode == tf.contrib.learn.ModeKeys.INFER and beam_width > 0:
            memory = tf.contrib.seq2seq.tile_batch(memory,
                                                   multiplier=beam_width)
            source_sequence_length = tf.contrib.seq2seq.tile_batch(
                source_sequence_length, multiplier=beam_width)
            encoder_state = tf.contrib.seq2seq.tile_batch(
                encoder_state, multiplier=beam_width)
            batch_size = self.batch_size * beam_width
        else:
            batch_size = self.batch_size

        if hparams.model in ('model0', 'model1', 'model2'):
            att_memory = tf.contrib.layers.fully_connected(
                memory,
                num_units,
                activation_fn=None,
                weights_initializer=tf.random_uniform_initializer(-0.1, 0.1))

            cell = NTMCell(num_layers,
                           num_units,
                           use_att_memory=True,
                           att_memory=att_memory,
                           att_memory_size=hparams.src_max_len,
                           att_memory_vector_dim=num_units,
                           use_ext_memory=(hparams.model == 'model2'),
                           ext_memory_size=hparams.num_memory_locations
                           if hparams.model == 'model2' else None,
                           ext_memory_vector_dim=hparams.memory_unit_size
                           if hparams.model == 'model2' else None,
                           ext_read_head_num=hparams.read_heads
                           if hparams.model == 'model2' else None,
                           ext_write_head_num=hparams.write_heads
                           if hparams.model == 'model2' else None,
                           dropout=hparams.dropout,
                           batch_size=batch_size,
                           mode=self.mode,
                           output_dim=num_units,
                           addressing_mode='content' if hparams.model
                           == 'model0' else 'content_and_location')

            decoder_initial_state = cell.zero_state(batch_size, dtype)

            if hparams.pass_hidden_state:
                decoder_initial_state = tuple([encoder_state] +
                                              list(decoder_initial_state[1:]))
        else:
            attention_mechanism = create_attention_mechanism(
                attention_option, num_units, memory, source_sequence_length)

            cell = model_helper.create_rnn_cell(
                unit_type=hparams.unit_type,
                num_units=num_units,
                num_layers=num_layers,
                num_residual_layers=num_residual_layers,
                forget_bias=hparams.forget_bias,
                dropout=hparams.dropout,
                num_gpus=num_gpus,
                mode=self.mode,
                single_cell_fn=self.single_cell_fn,
                num_proj=None,
                num_cells=2 if (hparams.encoder_type == "bi") else 1)

            # Only generate alignment in greedy INFER mode.
            alignment_history = (self.mode == tf.contrib.learn.ModeKeys.INFER
                                 and beam_width == 0)
            cell = tf.contrib.seq2seq.AttentionWrapper(
                cell,
                attention_mechanism,
                attention_layer_size=num_units,
                alignment_history=alignment_history,
                name="attention")

            # TODO(thangluong): do we need num_layers, num_gpus?
            cell = tf.contrib.rnn.DeviceWrapper(
                cell, model_helper.get_device_str(num_layers - 1, num_gpus))

            if hparams.pass_hidden_state:
                decoder_initial_state = cell.zero_state(
                    batch_size, dtype).clone(cell_state=encoder_state)
            else:
                decoder_initial_state = cell.zero_state(batch_size, dtype)

        return cell, decoder_initial_state