Example #1
      def SampleGRUSeq2Seq(enc_inp, dec_inp, weights):
        """Example sequence-to-sequence model that uses GRU cells."""

        def GRUSeq2Seq(enc_inp, dec_inp):
          # Note: later TensorFlow releases require a separate GRUCell instance
          # per layer (as Example #2 does); reusing one instance like this only
          # works with the older cell API this snippet targets.
          cell = core_rnn_cell_impl.MultiRNNCell(
              [core_rnn_cell_impl.GRUCell(24)] * 2, state_is_tuple=True)
          return seq2seq_lib.embedding_attention_seq2seq(
              enc_inp,
              dec_inp,
              cell,
              num_encoder_symbols=classes,
              num_decoder_symbols=classes,
              embedding_size=24,
              output_projection=(w, b))

        targets = [dec_inp[i + 1] for i in range(len(dec_inp) - 1)] + [0]

        def SampledLoss(labels, inputs):
          labels = array_ops.reshape(labels, [-1, 1])
          return nn_impl.sampled_softmax_loss(
              weights=w_t,
              biases=b,
              labels=labels,
              inputs=inputs,
              num_sampled=8,
              num_classes=classes)

        return seq2seq_lib.model_with_buckets(
            enc_inp,
            dec_inp,
            targets,
            weights,
            buckets,
            GRUSeq2Seq,
            softmax_loss_function=SampledLoss)
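
Examples #1-#3 are nested helpers taken from TensorFlow's legacy seq2seq test suite, so they rely on names defined in the enclosing test scope (classes, w, w_t, b, buckets, and the bucketed input lists). A minimal sketch of what that enclosing scope might look like follows; the concrete values are illustrative assumptions, not the original test's settings.

import tensorflow as tf

# Hypothetical enclosing-scope setup for Examples #1-#3 (values are
# illustrative only).
classes = 10                # shared encoder/decoder vocabulary size
buckets = [(4, 4), (8, 8)]  # (max encoder length, max decoder length) pairs
batch_size = 8

# Output projection from the 24-unit cell output to the vocabulary.
w = tf.get_variable("proj_w", [24, classes])
w_t = tf.transpose(w)       # sampled_softmax_loss wants [num_classes, dim]
b = tf.get_variable("proj_b", [classes])

# One int32 placeholder per time step, sized for the largest bucket.
enc_inp = [tf.placeholder(tf.int32, shape=[None], name="enc%d" % i)
           for i in range(buckets[-1][0])]
dec_inp = [tf.placeholder(tf.int32, shape=[None], name="dec%d" % i)
           for i in range(buckets[-1][1])]
weights = [tf.ones_like(d, dtype=tf.float32) for d in dec_inp]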
Example #2
      def SampleGRUSeq2Seq(enc_inp, dec_inp, weights):
        """Example sequence-to-sequence model that uses GRU cells."""

        def GRUSeq2Seq(enc_inp, dec_inp):
          cell = rnn_cell.MultiRNNCell(
              [rnn_cell.GRUCell(24) for _ in range(2)], state_is_tuple=True)
          return seq2seq_lib.embedding_attention_seq2seq(
              enc_inp,
              dec_inp,
              cell,
              num_encoder_symbols=classes,
              num_decoder_symbols=classes,
              embedding_size=24,
              output_projection=(w, b))

        targets = [dec_inp[i + 1] for i in range(len(dec_inp) - 1)] + [0]

        def SampledLoss(labels, logits):
          # Despite the name, `logits` receives the pre-projection decoder
          # outputs here, which is what sampled_softmax_loss expects as inputs.
          labels = array_ops.reshape(labels, [-1, 1])
          return nn_impl.sampled_softmax_loss(
              weights=w_t,
              biases=b,
              labels=labels,
              inputs=logits,
              num_sampled=8,
              num_classes=classes)

        return seq2seq_lib.model_with_buckets(
            enc_inp,
            dec_inp,
            targets,
            weights,
            buckets,
            GRUSeq2Seq,
            softmax_loss_function=SampledLoss)
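
Example #2 is the same helper after the API updates: each layer gets its own GRUCell instance and the loss callback uses the (labels, logits) signature that newer model_with_buckets versions expect. Building on the hypothetical scope sketched after Example #1, evaluating the loss of the smallest bucket might look like this; the feed values and bucket choice are illustrative assumptions.

import numpy as np

# Build the bucketed graph once; model_with_buckets returns one output list
# and one loss tensor per bucket.
outputs, losses = SampleGRUSeq2Seq(enc_inp, dec_inp, weights)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  bucket = 0
  enc_len, dec_len = buckets[bucket]
  feed = {}
  for i in range(enc_len):
    feed[enc_inp[i]] = np.random.randint(0, classes, size=batch_size,
                                         dtype=np.int32)
  # Targets are the decoder inputs shifted by one, so feed one extra step.
  for i in range(min(dec_len + 1, len(dec_inp))):
    feed[dec_inp[i]] = np.random.randint(0, classes, size=batch_size,
                                         dtype=np.int32)
  print(sess.run(losses[bucket], feed_dict=feed))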
Example #3
      def SampleGRUSeq2Seq(enc_inp, dec_inp, weights, per_example_loss):
        """Example sequence-to-sequence model that uses GRU cells."""

        def GRUSeq2Seq(enc_inp, dec_inp):
          # As in Example #1, newer TensorFlow releases need one GRUCell
          # instance per layer rather than the same instance repeated.
          cell = core_rnn_cell_impl.MultiRNNCell(
              [core_rnn_cell_impl.GRUCell(24)] * 2, state_is_tuple=True)
          return seq2seq_lib.embedding_attention_seq2seq(
              enc_inp,
              dec_inp,
              cell,
              num_encoder_symbols=classes,
              num_decoder_symbols=classes,
              embedding_size=24)

        targets = [dec_inp[i + 1] for i in range(len(dec_inp) - 1)] + [0]
        return seq2seq_lib.model_with_buckets(
            enc_inp,
            dec_inp,
            targets,
            weights,
            buckets,
            GRUSeq2Seq,
            per_example_loss=per_example_loss)
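
Example #3 forwards per_example_loss straight through to model_with_buckets. When that flag is true, each bucket's loss is a batch-sized tensor with one value per sequence instead of a single weighted average, which is useful when individual examples need to be reweighted. A hypothetical call, reusing the names from the sketches above:

# With per_example_loss=True each entry of per_seq_losses should be a
# [batch_size] tensor; with False it would be a scalar average over the batch.
outputs, per_seq_losses = SampleGRUSeq2Seq(
    enc_inp, dec_inp, weights, per_example_loss=True)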
Example #4
  def __init__(self,
               source_vocab_size,
               target_vocab_size,
               buckets,
               size,
               num_layers,
               max_gradient_norm,
               batch_size,
               learning_rate,
               learning_rate_decay_factor,
               use_lstm=False,
               num_samples=512,
               forward_only=False,
               dtype=tf.float32):
    """Create the model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded accordingly.
        We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to at most this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, we do not construct the backward pass in the model.
      dtype: the data type to use to store internal variables.
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(
        float(learning_rate), trainable=False, dtype=dtype)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary size.
    if 0 < num_samples < self.target_vocab_size:
      w_t = tf.get_variable("proj_w", [self.target_vocab_size, size],
                            dtype=dtype)
      w = tf.transpose(w_t)
      b = tf.get_variable("proj_b", [self.target_vocab_size], dtype=dtype)
      output_projection = (w, b)

      def sampled_loss(labels, logits):
        # Despite the name, `logits` receives the pre-projection decoder
        # outputs; the sampled softmax applies the projection internally.
        labels = tf.reshape(labels, [-1, 1])
        # We need to compute the sampled_softmax_loss using 32bit floats to
        # avoid numerical instabilities.
        local_w_t = tf.cast(w_t, tf.float32)
        local_b = tf.cast(b, tf.float32)
        local_inputs = tf.cast(logits, tf.float32)
        return tf.cast(
            tf.nn.sampled_softmax_loss(
                weights=local_w_t,
                biases=local_b,
                labels=labels,
                inputs=local_inputs,
                num_sampled=num_samples,
                num_classes=self.target_vocab_size),
            dtype)
      softmax_loss_function = sampled_loss

    # Create the internal multi-layer cell for our RNN.
    def single_cell():
      return tf.contrib.rnn.GRUCell(size)
    if use_lstm:
      def single_cell():
        return tf.contrib.rnn.BasicLSTMCell(size)
    cell = single_cell()
    if num_layers > 1:
      cell = tf.contrib.rnn.MultiRNNCell([single_cell() for _ in range(num_layers)])

    # The seq2seq function: we use embedding for the input and attention.
    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
      return seq2seq_patch.embedding_attention_seq2seq(
          encoder_inputs,
          decoder_inputs,
          cell,
          cell,
          num_encoder_symbols=source_vocab_size,
          num_decoder_symbols=target_vocab_size,
          embedding_size=size,
          output_projection=output_projection,
          feed_previous=do_decode,
          dtype=dtype)

    # Feeds for inputs.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
      self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                                name="encoder{0}".format(i)))
    for i in xrange(buckets[-1][1] + 1):
      self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                                name="decoder{0}".format(i)))
      self.target_weights.append(tf.placeholder(dtype, shape=[None],
                                                name="weight{0}".format(i)))

    # Our targets are decoder inputs shifted by one.
    targets = [self.decoder_inputs[i + 1]
               for i in xrange(len(self.decoder_inputs) - 1)]

    # Training outputs and losses.
    if forward_only:
      self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
          self.encoder_inputs, self.decoder_inputs, targets,
          self.target_weights, buckets, lambda x, y: seq2seq_f(x, y, True),
          softmax_loss_function=softmax_loss_function)
      # If we use output projection, we need to project outputs for decoding.
      if output_projection is not None:
        # Use a distinct loop name so the projection bias `b` is not shadowed.
        for bucket_id in xrange(len(buckets)):
          self.outputs[bucket_id] = [
              tf.matmul(output, output_projection[0]) + output_projection[1]
              for output in self.outputs[bucket_id]
          ]
    else:
      self.outputs, self.losses = seq2seq.model_with_buckets(
          self.encoder_inputs, self.decoder_inputs, targets,
          self.target_weights, buckets,
          lambda x, y: seq2seq_f(x, y, False),
          softmax_loss_function=softmax_loss_function)

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()
    if not forward_only:
      self.gradient_norms = []
      self.updates = []
      opt = tf.train.GradientDescentOptimizer(self.learning_rate)
      for bucket_id in xrange(len(buckets)):
        gradients = tf.gradients(self.losses[bucket_id], params)
        clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                         max_gradient_norm)
        self.gradient_norms.append(norm)
        self.updates.append(opt.apply_gradients(
            zip(clipped_gradients, params), global_step=self.global_step))

    self.saver = tf.train.Saver(tf.global_variables())
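
Example #4 is the constructor of the bucketed seq2seq model from the classic TensorFlow translate tutorial. Assuming this __init__ belongs to a class named Seq2SeqModel, as in that tutorial, building a training model might look like the following; the hyperparameters are representative values, not requirements.

buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]

model = Seq2SeqModel(
    source_vocab_size=40000,
    target_vocab_size=40000,
    buckets=buckets,
    size=1024,
    num_layers=2,
    max_gradient_norm=5.0,
    batch_size=64,
    learning_rate=0.5,
    learning_rate_decay_factor=0.99,
    use_lstm=False,
    num_samples=512,
    forward_only=False)

# For decoding, the tutorial constructs the model again with forward_only=True
# and restores the trained variables from a checkpoint.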