Example #1
 def get_cell(n_hidden):
     logging.info("Constructing cell of size={}".format(n_hidden))
     if use_lstm:
         logging.info("Using LSTM cells")
         if initializer:
             cell = rnn_cell.LSTMCell(n_hidden, initializer=initializer)
         else:
             # NOTE: to use peephole connections, cell clipping or a projection layer, use LSTMCell instead
             cell = rnn_cell.BasicLSTMCell(n_hidden)
     else:
         logging.info("Using GRU cells")
         cell = rnn_cell.GRUCell(n_hidden)
     if not forward_only and use_lstm and keep_prob < 1:
         logging.info("Adding dropout wrapper around lstm cells")
         cell = rnn_cell.DropoutWrapper(cell,
                                        output_keep_prob=keep_prob)
     if encoder == "bidirectional":
         logging.info("Bidirectional model")
         if init_backward:
             logging.info(
                 "Use backward encoder state to initialize decoder state"
             )
         cell = BidirectionalRNNCell([cell] * 2)
     elif encoder == "bow":
         logging.info("BOW model")
         if num_layers > 1:
             logging.info("Model with %d layers for the decoder" %
                          num_layers)
             cell = BOWCell(rnn_cell.MultiRNNCell([cell] * num_layers))
         else:
             cell = BOWCell(cell)
     elif num_layers > 1:
         logging.info("Model with %d layers" % num_layers)
         cell = rnn_cell.MultiRNNCell([cell] * num_layers)
     return cell
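
As written, get_cell closes over several names from its enclosing scope instead of taking them as arguments. The call-site sketch below is purely illustrative (the flag values and the 512-unit size are assumptions, not taken from the example); it simply lists what that enclosing scope has to provide:

use_lstm = True            # LSTM vs. GRU cells
initializer = None         # optional weight initializer for LSTMCell
keep_prob = 0.8            # dropout keep probability (training only)
forward_only = False       # True at decode time, which disables dropout
encoder = "bidirectional"  # one of "reverse", "bidirectional", "bow"
init_backward = False      # init the decoder from the backward encoder state
num_layers = 2             # stack depth for the "bow" and plain encoders

cell = get_cell(512)
# With the flags above this yields a BidirectionalRNNCell wrapping the same
# dropout-wrapped BasicLSTMCell(512) twice.
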
Example #2
    def __init__(self,
                 vocab_size,
                 buckets_or_sentence_length,
                 size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 model_type,
                 use_lstm=True,
                 num_samples=512,
                 forward_only=False):
        """Create the model.  This constructor can be used to created an embedded or embedded-attention, bucketed or non-bucketed model made of single or multi-layer RNN cells. 

    Args:
      vocab_size: size of the vocabulary (shared by the encoder and decoder).
      buckets_or_sentence_length: 
        if using buckets:
          a list of pairs (I, O), where I specifies maximum input length
          that will be processed in that bucket, and O specifies maximum output
          length. Training instances that have inputs longer than I or outputs
          longer than O will be pushed to the next bucket and padded accordingly.
          We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
        else:
          the maximum number of words per sentence.
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      model_type: "embedding_attention" builds the attention model; any other
        value builds the plain embedding model.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, we do not construct the backward pass in the model.
    """
        # Need to determine if we're using buckets or not:
        if isinstance(buckets_or_sentence_length, list):
            self.buckets = buckets_or_sentence_length
        else:
            self.max_sentence_length = buckets_or_sentence_length

        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        # Summary variables. NOTE: added these.
        # self.summary_op_learning_rate = tf.scalar_summary('learning rate', self.learning_rate)
        # self.summary_op_global_step = tf.scalar_summary('global step', self.global_step)

        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None
        # Sampled softmax only makes sense if we sample less than vocabulary size.
        if num_samples > 0 and num_samples < self.vocab_size:
            with tf.device("/cpu:0"):
                w = tf.get_variable("proj_w", [size, self.vocab_size])
                w_t = tf.transpose(w)
                b = tf.get_variable("proj_b", [self.vocab_size])
            output_projection = (w, b)

            def sampled_loss(inputs, labels):
                with tf.device("/cpu:0"):
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels,
                                                      num_samples,
                                                      self.vocab_size)

            softmax_loss_function = sampled_loss

        # Create the internal multi-layer cell for our RNN.
        single_cell = rnn_cell.GRUCell(size)
        if use_lstm:
            single_cell = rnn_cell.BasicLSTMCell(size)
        cell = single_cell
        if num_layers > 1:
            cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)

        # The seq2seq function: we use embedding for the input and attention (if applicable).
        if model_type == 'embedding_attention':

            def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
                return seq2seq.embedding_attention_seq2seq(
                    encoder_inputs,
                    decoder_inputs,
                    cell,
                    vocab_size,
                    vocab_size,
                    output_projection=output_projection,
                    feed_previous=do_decode)
        else:  # any other model_type falls back to the plain embedding model (arguably this should raise an error instead)

            def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
                return seq2seq.embedding_rnn_seq2seq(
                    encoder_inputs,
                    decoder_inputs,
                    cell,
                    vocab_size,
                    vocab_size,
                    output_projection=output_projection,
                    feed_previous=do_decode)

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []

        # NOTE: if the model is not bucketed, self.buckets does not exist, so the
        # try blocks below fall back (via AttributeError) to building a non-bucketed model.
        try:
            encoder_range = self.buckets[-1][0]
            decoder_range = self.buckets[-1][1]
        except AttributeError:
            encoder_range, decoder_range = self.max_sentence_length, self.max_sentence_length

        for i in xrange(encoder_range):  # Last bucket is the biggest one.
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="encoder{0}".format(i)))
        for i in xrange(decoder_range + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(tf.float32,
                               shape=[None],
                               name="weight{0}".format(i)))

        # Our targets are decoder inputs shifted by one.
        targets = [
            self.decoder_inputs[i + 1]
            for i in xrange(len(self.decoder_inputs) - 1)
        ]

        # Training outputs and losses.
        try:
            if forward_only:
                self.outputs, self.losses = seq2seq.model_with_buckets(
                    self.encoder_inputs,
                    self.decoder_inputs,
                    targets,
                    self.target_weights,
                    self.buckets,
                    self.vocab_size,
                    lambda x, y: seq2seq_f(x, y, True),
                    softmax_loss_function=softmax_loss_function)
                # If we use output projection, we need to project outputs for decoding.
                if output_projection is not None:
                    for b in xrange(len(self.buckets)):
                        self.outputs[b] = [
                            tf.nn.xw_plus_b(output, output_projection[0],
                                            output_projection[1])
                            for output in self.outputs[b]
                        ]
            else:
                self.outputs, self.losses = seq2seq.model_with_buckets(
                    self.encoder_inputs,
                    self.decoder_inputs,
                    targets,
                    self.target_weights,
                    self.buckets,
                    self.vocab_size,
                    lambda x, y: seq2seq_f(x, y, False),
                    softmax_loss_function=softmax_loss_function)

        except AttributeError:
            if forward_only:
                self.outputs, self.states = seq2seq_f(self.encoder_inputs,
                                                      self.decoder_inputs[:-1],
                                                      True)
                self.losses = seq2seq.sequence_loss(
                    self.outputs,
                    targets,
                    self.target_weights[:-1],
                    self.vocab_size,
                    softmax_loss_function=softmax_loss_function)
                # Project outputs for decoding
                if output_projection is not None:
                    self.outputs = [
                        tf.nn.xw_plus_b(output, output_projection[0],
                                        output_projection[1])
                        for output in self.outputs
                    ]
            else:
                self.outputs, self.states = seq2seq_f(self.encoder_inputs,
                                                      self.decoder_inputs[:-1],
                                                      False)
                self.losses = (seq2seq.sequence_loss(
                    self.outputs,
                    targets,
                    self.target_weights[:-1],
                    self.vocab_size,
                    softmax_loss_function=softmax_loss_function))

        # Gradients and SGD update operation for training the model.
        params = tf.trainable_variables()
        self.params = params  # Hold onto this for Woz
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)

            try:
                for b in xrange(len(self.buckets)):
                    gradients = tf.gradients(self.losses[b], params)
                    clipped_gradients, norm = tf.clip_by_global_norm(
                        gradients, max_gradient_norm)
                    self.gradient_norms.append(norm)
                    self.updates.append(
                        opt.apply_gradients(zip(clipped_gradients, params),
                                            global_step=self.global_step))
            except AttributeError:
                gradients = tf.gradients(self.losses, params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms = norm
                self.updates = opt.apply_gradients(
                    zip(clipped_gradients, params),
                    global_step=self.global_step)

        self.saver = tf.train.Saver(tf.all_variables())
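
The bucketing rule documented in the constructor's docstring can be summarized with a small, framework-free sketch. This is an illustration only, not code from the model; PAD_ID and the exact padding convention are assumptions (the real data pipeline also reverses the source and prepends a GO symbol to the decoder input):

PAD_ID = 0

def assign_and_pad(source_ids, target_ids, buckets):
    """Put a (source, target) pair into the first bucket it fits and pad it."""
    for bucket_id, (enc_size, dec_size) in enumerate(buckets):
        # leave one decoder slot free for the GO symbol
        if len(source_ids) <= enc_size and len(target_ids) < dec_size:
            enc = source_ids + [PAD_ID] * (enc_size - len(source_ids))
            dec = target_ids + [PAD_ID] * (dec_size - len(target_ids))
            return bucket_id, enc, dec
    raise ValueError("sentence longer than the largest bucket")

bucket_id, enc, dec = assign_and_pad([4, 7, 9], [5, 6, 8, 2], [(5, 10), (10, 15)])
# bucket_id == 0; enc is padded to length 5 and dec to length 10
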
Example #3
    def __init__(self, enc_out, target_vocab_size, buckets, embedding_size, hidden_size,
                 num_layers, batch_size, use_lstm=False, num_samples=512, 
                 encoder="reverse", use_src_mask=False, maxout_layer=False, init_backward=False,
                 variable_prefix=None, init_const=False, use_bow_mask=False, initializer=None):
        super(TFSeq2SeqSingleStepDecodingGraph, self).__init__(buckets, batch_size)
        self.target_vocab_size = target_vocab_size
        self.num_heads = 1
    
        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None
        # Sampled softmax only makes sense if we sample less than vocabulary size.
        if num_samples > 0 and num_samples < self.target_vocab_size:
          with variable_scope.variable_scope(variable_scope.get_variable_scope(),
                                               reuse=True), tf.device("/cpu:0"):
            w = tf.get_variable("proj_w", [hidden_size, self.target_vocab_size])
            w_t = tf.transpose(w)
            b = tf.get_variable("proj_b", [self.target_vocab_size])
          output_projection = (w, b)
            
          def sampled_loss(inputs, labels):
            with tf.device("/cpu:0"):
              labels = tf.reshape(labels, [-1, 1])
              return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, num_samples,
                                                self.target_vocab_size)
          softmax_loss_function = sampled_loss
        else:
          logging.info("Using maxout_layer=%d and full softmax loss" % maxout_layer)          
    
        # Create the internal multi-layer cell for our RNN.
        if use_lstm:
          logging.info("Using LSTM cells of size={}".format(hidden_size))
          if initializer:
            single_cell = rnn_cell.LSTMCell(hidden_size, initializer=initializer)
          else:
            # NOTE: to use peephole connections, cell clipping or a projection layer, use LSTMCell instead
            single_cell = rnn_cell.BasicLSTMCell(hidden_size)
        else:
          logging.info("Using GRU cells of size={}".format(hidden_size))
          single_cell = rnn_cell.GRUCell(hidden_size)
        cell = single_cell

        if encoder == "bidirectional":
          logging.info("Bidirectional model")
          if init_backward:
            logging.info("Use backward encoder state to initialize decoder state")
          cell = BidirectionalRNNCell([single_cell] * 2)
        elif encoder == "bow":
          logging.info("BOW model")
          if num_layers > 1:
            logging.info("Model with %d layers for the decoder" % num_layers)
            cell = BOWCell(rnn_cell.MultiRNNCell([single_cell] * num_layers))
          else:
            cell = BOWCell(single_cell)
        elif num_layers > 1:
          logging.info("Model with %d layers" % num_layers)
          cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)
    
        # List of placeholders deeper within the decoder (i.e. bucket dependent)
        self.enc_hidden = []
        self.enc_hidden_features = []
        self.enc_v = []
        self.dec_attns = []

        # Placeholder for last state
        if encoder == "bidirectional":
          if cell._cells[0]._state_is_tuple:
            dec_state_c = tf.placeholder(dtypes.float32, shape=[None, cell.fw_state_size/2], name="dec_state_c")
            dec_state_h = tf.placeholder(dtypes.float32, shape=[None, cell.fw_state_size/2], name="dec_state_h")
            self.dec_state = rnn_cell.LSTMStateTuple(dec_state_c, dec_state_h)
          else:
            self.dec_state = tf.placeholder(dtypes.float32, shape=[None, cell.fw_state_size], name="dec_state")
        elif encoder == "reverse" or encoder == "bow":
          if cell._state_is_tuple:
            dec_state_c = tf.placeholder(dtypes.float32, shape=[None, cell.state_size/2], name="dec_state_c")
            dec_state_h = tf.placeholder(dtypes.float32, shape=[None, cell.state_size/2], name="dec_state_h")
            self.dec_state = rnn_cell.LSTMStateTuple(dec_state_c, dec_state_h)
          else:
            self.dec_state = tf.placeholder(dtypes.float32, shape=[None, cell.state_size], name="dec_state")

        if use_src_mask:
          logging.info("Using source mask for decoder") 
          self.src_mask = tf.placeholder(dtypes.float32, shape=[None, None],
                                         name="src_mask")
        else:
          self.src_mask = None

        if use_bow_mask:
          logging.info("Using bow mask for output layer") 
          self.bow_mask = tf.placeholder(dtypes.float32, shape=[None, None],
                                         name="bow_mask")
        else:
          self.bow_mask = None          

        # placeholder to indicate whether we're at the start of the target sentence
        self.start = tf.placeholder(tf.bool, name="start")

        # The seq2seq function: we use embedding for the input and attention.
        scope = None
        if variable_prefix is not None:
          scope = variable_prefix+"/embedding_attention_seq2seq"
          logging.info("Using variable scope {}".format(scope))
        def seq2seq_f(bucket_enc_out, decoder_input):
            return self._tf_dec_embedding_attention_seq2seq(bucket_enc_out,
                decoder_input, self.dec_state, cell, target_vocab_size, embedding_size, 
                output_projection=output_projection, encoder=encoder, 
                src_mask=self.src_mask, maxout_layer=maxout_layer, init_backward=init_backward,
                start=self.start, scope=scope, init_const=init_const, bow_mask=self.bow_mask)
    
        self.dec_decoder_input = tf.placeholder(tf.int32, shape=[None],
                                                    name="dec_decoder_input")
        self.outputs = self._tf_dec_model_with_buckets(enc_out,
            self.dec_decoder_input, buckets, seq2seq_f)
        # If we use output projection, we need to project outputs for decoding.
        if output_projection is not None:
            # self.outputs contains outputs, new_attns, new_state in flattened list
            for b in xrange(len(buckets)): 
                output = self.outputs[b][0]
                # The standard implementation (_extract_argmax_and_embed) would
                # compute the previous output here as:
                #   output = tf.nn.xw_plus_b(output, output_projection[0],
                #                            output_projection[1])
                # However, during decoding we have to normalize with a full softmax
                # (and then take a log to produce logprobs). As noted in nn.py,
                # sampled_softmax_loss is for training only; at inference time,
                # compute full softmax probabilities with
                #   tf.nn.softmax(tf.matmul(inputs, weights) + biases)
                # (tf.matmul(i, w) + b is equivalent to tf.nn.xw_plus_b(i, w, b)).
                output = tf.log(tf.nn.softmax(tf.nn.xw_plus_b(output, output_projection[0],
                                                 output_projection[1])))
                self.outputs[b][0] = output
        else:
          logging.info("Apply full softmax")
          for b in xrange(len(buckets)):
            self.outputs[b][0] = tf.log(tf.nn.softmax(self.outputs[b][0]))
                
        # for update_buckets
        self.enc_out = enc_out
        self.seq2seq_f = seq2seq_f
        self.output_projection = output_projection
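
The comment block above boils down to: at decode time the sampled-softmax shortcut is replaced by a full output projection followed by a log-softmax. A small numpy sketch of that computation (illustrative only, written against plain numpy rather than the graph ops used above):

import numpy as np

def decode_logprobs(output, w, b):
    """output: [batch, hidden], w: [hidden, vocab], b: [vocab]."""
    logits = output.dot(w) + b                    # same as tf.nn.xw_plus_b(output, w, b)
    logits -= logits.max(axis=1, keepdims=True)   # stabilize before exponentiating
    log_z = np.log(np.exp(logits).sum(axis=1, keepdims=True))
    return logits - log_z                         # log(softmax(logits))

rng = np.random.default_rng(0)
lp = decode_logprobs(rng.normal(size=(2, 8)), rng.normal(size=(8, 5)), np.zeros(5))
assert np.allclose(np.exp(lp).sum(axis=1), 1.0)   # each row is a proper distribution
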
Example #4
    def __init__(self, source_vocab_size, buckets, embedding_size, hidden_size,
                 num_layers, batch_size, use_lstm=False, num_samples=512, 
                 encoder="reverse", use_sequence_length=False, init_backward=False,
                 variable_prefix=None, initializer=None):
        super(TFSeq2SeqEncodingGraph, self).__init__(buckets, batch_size)
        self.source_vocab_size = source_vocab_size
        self.num_heads = 1
    
        # Create the internal multi-layer cell for our RNN.
        if use_lstm:
          logging.info("Using LSTM cells of size={}".format(hidden_size))
          if initializer:
            single_cell = rnn_cell.LSTMCell(hidden_size, initializer=initializer)
          else:
            # NOTE: to use peephole connections, cell clipping or a projection layer, use LSTMCell instead
            single_cell = rnn_cell.BasicLSTMCell(hidden_size)
        else:
          logging.info("Using GRU cells of size={}".format(hidden_size))
          single_cell = rnn_cell.GRUCell(hidden_size)
        cell = single_cell

        if encoder == "bidirectional":
          logging.info("Bidirectional model")
          if init_backward:
            logging.info("Use backward encoder state to initialize decoder state")
          cell = BidirectionalRNNCell([single_cell] * 2)
        elif encoder == "bow":
          logging.info("BOW model")
          if num_layers > 1:
            logging.info("Model with %d layers for the decoder" % num_layers)
            cell = BOWCell(rnn_cell.MultiRNNCell([single_cell] * num_layers))
          else:
            cell = BOWCell(single_cell)
        elif num_layers > 1:
          logging.info("Model with %d layers" % num_layers)
          cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)
    
        # The seq2seq function: we use embedding for the input and attention.
        scope = None
        if variable_prefix is not None:
          scope = variable_prefix+"/embedding_attention_seq2seq"
          logging.info("Using variable scope {}".format(scope))    
        def seq2seq_f(encoder_inputs, bucket_length):
          return self._tf_enc_embedding_attention_seq2seq(encoder_inputs, cell, source_vocab_size, embedding_size, 
                                                          encoder=encoder, 
                                                          sequence_length=self.sequence_length,
                                                          bucket_length=bucket_length,
                                                          init_backward=init_backward,
                                                          bow_emb_size=hidden_size,
                                                          scope=scope)
    
        # Feeds for inputs.
        self.encoder_inputs = []
        self.sequence_lengths = []
        for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
          self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                                    name="encoder{0}".format(i)))
        if use_sequence_length:
          logging.info("Using sequence length for encoder")                                            
          self.sequence_length = tf.placeholder(tf.int32, shape=[None], name="seq_len")          
        else:
          self.sequence_length = None
    
        self.outputs = self._tf_enc_model_with_buckets(self.encoder_inputs, buckets, seq2seq_f)
            
        # for update_buckets            
        self.seq2seq_f = seq2seq_f
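
A hypothetical feeding sketch for this encoding graph, assuming an instance enc_graph of the class above, an open tf.Session sess, and a time-major batch of already padded token ids (one row per position, matching the per-position "encoder{i}" placeholders created above); indexing self.outputs by bucket is likewise an assumption about _tf_enc_model_with_buckets:

def run_encoder(sess, enc_graph, batch, lengths, bucket_id):
    feed = {}
    for t, step_ids in enumerate(batch):          # batch: [bucket_length][batch_size]
        feed[enc_graph.encoder_inputs[t]] = step_ids
    if enc_graph.sequence_length is not None:     # only set when use_sequence_length=True
        feed[enc_graph.sequence_length] = lengths
    return sess.run(enc_graph.outputs[bucket_id], feed_dict=feed)
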
Example #5
    def __init__(self,
                 vocab_size,
                 max_sentence_length,
                 size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 model_type,
                 num_samples=512,
                 forward_only=False):

        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.max_sentence_length = max_sentence_length
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        # output projection for sampled softmax:
        output_projection = None
        softmax_loss_function = None

        if num_samples > 0 and num_samples < self.vocab_size:
            with tf.device("/cpu:0"):
                w = tf.get_variable("proj_w", [size, self.vocab_size])
                w_t = tf.transpose(w)
                b = tf.get_variable("proj_b", [self.vocab_size])
            output_projection = (w, b)

            def sampled_loss(inputs, labels):
                with tf.device("/cpu:0"):
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels,
                                                      num_samples,
                                                      self.vocab_size)

            softmax_loss_function = sampled_loss

        # single LSTM cell creation, use to build hidden layers
        single_cell = rnn_cell.BasicLSTMCell(size)
        cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)

        if model_type == 'embedding_attention':

            def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
                return seq2seq.embedding_attention_seq2seq(
                    encoder_inputs,
                    decoder_inputs,
                    cell,
                    vocab_size,
                    vocab_size,
                    output_projection=output_projection,
                    feed_previous=do_decode)
        else:

            def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
                return seq2seq.embedding_rnn_seq2seq(
                    encoder_inputs,
                    decoder_inputs,
                    cell,
                    vocab_size,
                    vocab_size,
                    output_projection=output_projection,
                    feed_previous=do_decode)

        # feeds for inputs are limited to max_sentence_length
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        for i in xrange(max_sentence_length):
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="encoder{0}".format(i)))
        for i in xrange(max_sentence_length + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(tf.float32,
                               shape=[None],
                               name="weight{0}".format(i)))

        # targets are decoder inputs shifted by one
        targets = [
            self.decoder_inputs[i + 1]
            for i in xrange(len(self.decoder_inputs) - 1)
        ]

        # training outputs and losses
        if forward_only:
            self.outputs, self.states = seq2seq_f(self.encoder_inputs,
                                                  self.decoder_inputs[:-1],
                                                  True)
            self.losses = seq2seq.sequence_loss(
                self.outputs,
                targets,
                self.target_weights[:-1],
                self.vocab_size,
                softmax_loss_function=softmax_loss_function)
            # project outputs for decoding
            if output_projection is not None:
                self.outputs = [
                    tf.nn.xw_plus_b(output, output_projection[0],
                                    output_projection[1])
                    for output in self.outputs
                ]
        else:
            self.outputs, self.states = seq2seq_f(self.encoder_inputs,
                                                  self.decoder_inputs[:-1],
                                                  False)
            self.losses = (seq2seq.sequence_loss(
                self.outputs,
                targets,
                self.target_weights[:-1],
                self.vocab_size,
                softmax_loss_function=softmax_loss_function))

        # gradients and SGD update operation for training
        params = tf.trainable_variables()
        self.params = params
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)

            gradients = tf.gradients(self.losses, params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms = norm
            self.updates = opt.apply_gradients(zip(clipped_gradients, params),
                                               global_step=self.global_step)

        self.saver = tf.train.Saver(tf.all_variables())
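
The target construction above is simply a left shift of the decoder inputs, which carry a leading GO symbol. A framework-free illustration (the GO_ID/PAD_ID values are assumptions for the example):

GO_ID, PAD_ID = 1, 0

decoder_inputs = [GO_ID, 42, 17, 99, PAD_ID]   # length max_sentence_length + 1
targets = [decoder_inputs[i + 1] for i in range(len(decoder_inputs) - 1)]
# targets == [42, 17, 99, PAD_ID]: the token at step i is predicted from
# decoder_inputs[:i + 1]. target_weights line up with targets and are typically
# 0.0 on PAD positions so padding does not contribute to the loss.
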
Example #6
    def __init__(self,
                 source_vocab_size,
                 target_vocab_size,
                 buckets,
                 embedding_size,
                 hidden_size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 use_lstm=False,
                 num_samples=512,
                 forward_only=False,
                 dtype=tf.float32,
                 opt_algorithm="sgd",
                 encoder="reverse",
                 use_sequence_length=False,
                 use_src_mask=False,
                 maxout_layer=False,
                 init_backward=False,
                 no_pad_symbol=False,
                 variable_prefix=None,
                 rename_variable_prefix=None,
                 init_const=False,
                 use_bow_mask=False,
                 max_to_keep=0,
                 keep_prob=1.0,
                 initializer=None,
                 legacy=False,
                 train_align=None):
        """Create the model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded accordingly.
        We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
      embedding_size: size of the word embedding vectors.
      hidden_size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, we do not construct the backward pass in the model.
      dtype: the data type to use to store internal variables.
    """
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        with tf.variable_scope(variable_prefix or ""):
            self.learning_rate = tf.Variable(float(learning_rate),
                                             trainable=False)
            self.global_step = tf.Variable(0, trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.no_pad_symbol = no_pad_symbol

        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None
        # Sampled softmax only makes sense if we sample less than vocabulary size.
        if num_samples > 0 and num_samples < self.target_vocab_size:
            w_t = tf.get_variable("proj_w",
                                  [self.target_vocab_size, hidden_size],
                                  dtype=dtype)
            w = tf.transpose(w_t)
            b = tf.get_variable("proj_b", [self.target_vocab_size],
                                dtype=dtype)
            logging.info("Using output projection of shape (%d, %d)" %
                         (hidden_size, self.target_vocab_size))
            output_projection = (w, b)

            def sampled_loss(inputs, labels):
                labels = tf.reshape(labels, [-1, 1])
                # We need to compute the sampled_softmax_loss using 32bit floats to
                # avoid numerical instabilities.
                local_w_t = tf.cast(w_t, tf.float32)
                local_b = tf.cast(b, tf.float32)
                local_inputs = tf.cast(inputs, tf.float32)
                return tf.cast(
                    tf.nn.sampled_softmax_loss(local_w_t, local_b,
                                               local_inputs, labels,
                                               num_samples,
                                               self.target_vocab_size), dtype)

            softmax_loss_function = sampled_loss
        else:
            logging.info("Using maxout_layer=%r and full softmax loss" %
                         maxout_layer)

        # Create the internal multi-layer cell for our RNN.
        if use_lstm:
            logging.info("Using LSTM cells of size={}".format(hidden_size))
            if initializer:
                single_cell = rnn_cell.LSTMCell(hidden_size,
                                                initializer=initializer)
            else:
                # NOTE: to use peephole connections, cell clipping or a projection layer, use LSTMCell instead
                single_cell = rnn_cell.BasicLSTMCell(hidden_size)
        else:
            logging.info("Using GRU cells of size={}".format(hidden_size))
            single_cell = rnn_cell.GRUCell(hidden_size)
        cell = single_cell

        if encoder == "bidirectional":
            logging.info("Bidirectional model")
            if init_backward:
                logging.info(
                    "Use backward encoder state to initialize decoder state")
            cell = BidirectionalRNNCell([single_cell] * 2)
        elif encoder == "bow":
            logging.info("BOW model")
            if not forward_only and use_lstm and keep_prob < 1:
                logging.info("Adding dropout wrapper around lstm cells")
                single_cell = rnn_cell.DropoutWrapper(
                    single_cell, output_keep_prob=keep_prob)
            if num_layers > 1:
                logging.info("Model with %d layers for the decoder" %
                             num_layers)
                cell = BOWCell(
                    rnn_cell.MultiRNNCell([single_cell] * num_layers))
            else:
                cell = BOWCell(single_cell)
        elif num_layers > 1:
            logging.info("Model with %d layers" % num_layers)
            cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)

        # The seq2seq function: we use embedding for the input and attention.
        logging.info("Embedding size={}".format(embedding_size))
        scope = None
        if variable_prefix is not None:
            scope = variable_prefix + "/embedding_attention_seq2seq"
            logging.info("Using variable scope {}".format(scope))

        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode,
                      bucket_length):
            return embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                cell,
                num_encoder_symbols=source_vocab_size,
                num_decoder_symbols=target_vocab_size,
                embedding_size=embedding_size,
                output_projection=output_projection,
                feed_previous=do_decode,
                dtype=dtype,
                encoder=encoder,
                sequence_length=self.sequence_length,
                bucket_length=bucket_length,
                src_mask=self.src_mask,
                maxout_layer=maxout_layer,
                init_backward=init_backward,
                bow_emb_size=hidden_size,
                scope=scope,
                init_const=init_const,
                bow_mask=self.bow_mask,
                keep_prob=keep_prob,
                legacy=legacy)

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        self.alignments = []
        for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="encoder{0}".format(i)))
        for i in xrange(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(dtype, shape=[None],
                               name="weight{0}".format(i)))
        if train_align is not None and not forward_only:
            for i in xrange(self.batch_size):
                self.alignments.append(
                    tf.placeholder(tf.float32,
                                   shape=[None],
                                   name="align{0}".format(i)))

        if use_sequence_length:
            logging.info("Using sequence length for encoder")
            self.sequence_length = tf.placeholder(tf.int32,
                                                  shape=[None],
                                                  name="seq_len")
        else:
            self.sequence_length = None

        if use_src_mask:
            logging.info("Using source mask for decoder")
            self.src_mask = tf.placeholder(tf.float32,
                                           shape=[None, None],
                                           name="src_mask")
        else:
            self.src_mask = None

        if use_bow_mask:
            logging.info("Using bow mask for output layer")
            self.bow_mask = tf.placeholder(tf.float32,
                                           shape=[None, None],
                                           name="bow_mask")
        else:
            self.bow_mask = None

        # Our targets are decoder inputs shifted by one.
        targets = [
            self.decoder_inputs[i + 1]
            for i in xrange(len(self.decoder_inputs) - 1)
        ]

        # Training outputs and losses.
        if forward_only:
            self.outputs, self.losses = model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                buckets,
                lambda x, y, z: seq2seq_f(x, y, True, z),
                softmax_loss_function=softmax_loss_function)
            # If we use output projection, we need to project outputs for decoding.
            if output_projection is not None:
                for b in xrange(len(buckets)):
                    # This mirrors the loop function (which uses xw_plus_b instead of matmul).
                    # The loop function also takes the argmax, but that result is not saved;
                    # we pass the logits on and take the argmax again in the vanilla decoder.
                    self.outputs[b] = [
                        tf.matmul(output, output_projection[0]) +
                        output_projection[1] for output in self.outputs[b]
                    ]
        else:
            self.outputs, self.losses = model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                buckets,
                lambda x, y, z: seq2seq_f(x, y, False, z),
                softmax_loss_function=softmax_loss_function,
                alignments=self.alignments)

        # Gradients and SGD update operation for training the model.
        params = tf.trainable_variables()
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            if opt_algorithm == "sgd":
                logging.info("Using optimizer GradientDescentOptimizer")
                opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            elif opt_algorithm == "adagrad":
                print("Using optimizer AdagradOptimizer")
                lr = 3.0
                init_acc = 0.1
                opt = tf.train.AdagradOptimizer(lr, init_acc)
            elif opt_algorithm == "adadelta":
                print("Using optimizer AdadeltaOptimizer")
                rho = 0.95
                epsilon = 1e-6
                opt = tf.train.AdadeltaOptimizer(rho=rho, epsilon=epsilon)

            for b in xrange(len(buckets)):
                gradients = tf.gradients(self.losses[b], params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients, params),
                                        global_step=self.global_step))

        if variable_prefix:
            # save only the variables that belong to the prefix
            logging.info("Using variable prefix={}".format(variable_prefix))
            self.saver = tf.train.Saver(
                {
                    v.op.name: v
                    for v in tf.global_variables()
                    if v.op.name.startswith(variable_prefix)
                },
                max_to_keep=max_to_keep,
                write_version=saver_pb2.SaverDef.V1)
        else:
            self.saver = tf.train.Saver(tf.global_variables(),
                                        max_to_keep=max_to_keep,
                                        write_version=saver_pb2.SaverDef.V1)

        if rename_variable_prefix:
            # create a saver that explicitly stores model variables with a prefix
            logging.info("Saving model with new prefix={}".format(
                rename_variable_prefix))
            self.saver_prefix = tf.train.Saver(
                {
                    v.op.name.replace(variable_prefix, rename_variable_prefix):
                    v
                    for v in tf.global_variables()
                },
                write_version=saver_pb2.SaverDef.V1)
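
The per-bucket training update above relies on tf.clip_by_global_norm, which rescales the whole gradient list by max_gradient_norm / max(global_norm, max_gradient_norm), so gradient directions are preserved. A numpy sketch of that rule (illustrative, not the TF implementation):

import numpy as np

def clip_by_global_norm(grads, max_norm):
    global_norm = np.sqrt(sum(np.sum(g ** 2) for g in grads))
    scale = max_norm / max(global_norm, max_norm)   # <= 1, i.e. only ever shrinks
    return [g * scale for g in grads], global_norm

clipped, norm = clip_by_global_norm([np.array([3.0, 4.0]), np.array([12.0])], max_norm=5.0)
# norm == 13.0, so every gradient is scaled by 5/13 before apply_gradients
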
Example #7
    def __init__(self,
                 config,
                 variable_prefix,
                 is_training,
                 use_log_probs=False,
                 optimizer="sgd",
                 rename_variable_prefix=None):
        self.batch_size = batch_size = config.batch_size
        self.num_steps = num_steps = config.num_steps
        hidden_size = config.hidden_size
        vocab_size = config.vocab_size

        self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
        self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])

        self.global_step = tf.Variable(0, trainable=False)

        # Slightly better results can be obtained with forget gate biases
        # initialized to 1 but the hyperparameters of the model would need to be
        # different than reported in the paper.
        lstm_cell = rnn_cell.BasicLSTMCell(hidden_size, forget_bias=0.0)
        if is_training and config.keep_prob < 1:
            lstm_cell = rnn_cell.DropoutWrapper(
                lstm_cell, output_keep_prob=config.keep_prob)
        cell = rnn_cell.MultiRNNCell([lstm_cell] * config.num_layers)

        if is_training or use_log_probs:
            logging.info("Using LSTM cells of size={}".format(hidden_size))
            logging.info("Model with %d layer(s)" % config.num_layers)
            logging.info("Model with %i unrolled step(s)" % config.num_steps)

        self._initial_state = cell.zero_state(batch_size, tf.float32)

        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [vocab_size, hidden_size])
            inputs = tf.nn.embedding_lookup(embedding, self._input_data)

        if is_training and config.keep_prob < 1:
            inputs = tf.nn.dropout(inputs, config.keep_prob)

        outputs = []
        state = self._initial_state
        # Simplified version of tensorflow.models.rnn.rnn.py's rnn().
        # This builds an unrolled LSTM for tutorial purposes only.
        # In general, use the rnn() or state_saving_rnn() from rnn.py.
        #with tf.variable_scope("RNN"):
        #  for time_step in range(num_steps):
        #    if time_step > 0: tf.get_variable_scope().reuse_variables()
        #    (cell_output, state) = cell(inputs[:, time_step, :], state)
        #    outputs.append(cell_output)
        inputs = [
            tf.squeeze(input_, [1])
            for input_ in tf.split(1, num_steps, inputs)
        ]
        outputs, state = rnn.rnn(cell,
                                 inputs,
                                 initial_state=self._initial_state)
        self._final_state = state

        output = tf.reshape(tf.concat(1, outputs), [-1, hidden_size])
        softmax_w = tf.get_variable("softmax_w", [hidden_size, vocab_size])
        softmax_b = tf.get_variable("softmax_b", [vocab_size])
        logits = tf.matmul(output, softmax_w) + softmax_b

        if use_log_probs:
            logging.info("Softmax")
            probs = tf.nn.softmax(logits)
            self._log_probs = tf.log(probs)
        else:
            loss = tf.nn.seq2seq.sequence_loss_by_example(
                [logits], [tf.reshape(self._targets, [-1])],
                [tf.ones([batch_size * num_steps])])
            self._cost = cost = tf.reduce_sum(loss) / batch_size

        if is_training:
            self._lr = tf.Variable(0.0, trainable=False)
            tvars = tf.trainable_variables()
            grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                              config.max_grad_norm)
            if optimizer == "adadelta":
                self.lr = 1.0
                rho = 0.95
                epsilon = 1e-6
                logging.info("Use AdaDeltaOptimizer with lr={}".format(
                    self.lr))
                optimizer = tf.train.AdadeltaOptimizer(self.lr,
                                                       rho=rho,
                                                       epsilon=epsilon)
            elif optimizer == "adagrad":
                self.lr = 0.5
                logging.info("Use AdaGradOptimizer with lr={}".format(self.lr))
                optimizer = tf.train.AdagradOptimizer(self.lr)
            elif optimizer == "adam":
                # Default values are same as in Keras library
                logging.info("Use AdamOptimizer with default values")
                optimizer = tf.train.AdamOptimizer()
            elif optimizer == "rmsprop":
                self.lr = 0.5
                logging.info("Use RMSPropOptimizer with lr={}".format(self.lr))
                optimizer = tf.train.RMSPropOptimizer(self.lr)
            else:
                logging.info("Use GradientDescentOptimizer")
                optimizer = tf.train.GradientDescentOptimizer(self._lr)  # the learning-rate variable created above
            self._train_op = optimizer.apply_gradients(
                zip(grads, tvars), global_step=self.global_step)

        self.saver = tf.train.Saver(
            {
                v.op.name: v
                for v in tf.all_variables()
                if v.op.name.startswith(variable_prefix)
            },
            max_to_keep=2)

        if rename_variable_prefix:
            self.saver_prefix = tf.train.Saver(
                {
                    v.op.name.replace(variable_prefix, rename_variable_prefix): v
                    for v in tf.all_variables()
                    if v.op.name.startswith(variable_prefix)
                },
                max_to_keep=2)
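
For reference, the two branches above are closely related: the use_log_probs branch computes log-softmax scores, while sequence_loss_by_example with unit weights reduces to the negative log-probability of each target token, summed and divided by batch_size to form the cost. A numpy sketch of that relationship (illustrative only):

import numpy as np

def lm_cost(logits, targets, batch_size):
    """logits: [batch * num_steps, vocab], targets: [batch * num_steps] int ids."""
    shifted = logits - logits.max(axis=1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    nll = -log_probs[np.arange(len(targets)), targets]   # per-token cross-entropy
    return nll.sum() / batch_size

cost = lm_cost(np.zeros((6, 10)), np.zeros(6, dtype=int), batch_size=2)
# uniform logits over 10 symbols give cost == 6 * log(10) / 2
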