Esempio n. 1
0
        def SampleGRUSeq2Seq(enc_inp, dec_inp, weights):
            """Build a bucketed GRU attention seq2seq model with a sampled loss.

            Args:
              enc_inp: encoder inputs, forwarded to `seq2seq.model_with_buckets`.
              dec_inp: decoder inputs; the targets are these shifted by one
                position, with a literal 0 appended as the last target.
              weights: per-step target weights, forwarded unchanged.

            Returns:
              Whatever `seq2seq.model_with_buckets` returns for these inputs.

            `classes`, `buckets`, `w`, `w_t` and `b` are read from the
            enclosing scope.
            """
            # Targets are the decoder inputs shifted left by one, then padded
            # with a single 0 so the list keeps the same length as dec_inp.
            shifted = [dec_inp[pos] for pos in xrange(1, len(dec_inp))]
            targets = shifted + [0]

            def GRUSeq2Seq(enc_inp, dec_inp):
                # Two-layer stack; the same 24-unit GRUCell object is repeated.
                gru_stack = [rnn_cell.GRUCell(24)] * 2
                cell = rnn_cell.MultiRNNCell(gru_stack)
                projection = (w, b)
                return seq2seq.embedding_attention_seq2seq(
                    enc_inp, dec_inp, cell, classes, classes,
                    output_projection=projection)

            def SampledLoss(inputs, labels):
                # Labels are reshaped into a single column before sampling.
                column_labels = tf.reshape(labels, [-1, 1])
                return tf.nn.sampled_softmax_loss(
                    w_t, b, inputs, column_labels, 8, classes)

            return seq2seq.model_with_buckets(
                enc_inp, dec_inp, targets, weights, buckets, classes,
                GRUSeq2Seq, softmax_loss_function=SampledLoss)
Esempio n. 2
0
 def SampleGRUSeq2Seq(enc_inp, dec_inp, weights):
   """Bucketed attention seq2seq over a 2-layer GRU, using a sampled loss.

   The targets are `dec_inp` shifted by one step with a literal 0 appended.
   `classes`, `buckets`, `w`, `w_t` and `b` come from the enclosing scope.
   Returns the result of `seq2seq.model_with_buckets`.
   """
   def SampledLoss(inputs, labels):
     # The sampled softmax call below is given labels as a single column.
     label_column = tf.reshape(labels, [-1, 1])
     return tf.nn.sampled_softmax_loss(
         w_t, b, inputs, label_column, 8, classes)

   def GRUSeq2Seq(enc_inp, dec_inp):
     # The same 24-unit GRUCell object is used for both layers.
     layers = [rnn_cell.GRUCell(24)] * 2
     stacked = rnn_cell.MultiRNNCell(layers)
     return seq2seq.embedding_attention_seq2seq(
         enc_inp, dec_inp, stacked, classes, classes,
         output_projection=(w, b))

   next_steps = [dec_inp[step] for step in xrange(1, len(dec_inp))]
   targets = next_steps + [0]
   return seq2seq.model_with_buckets(
       enc_inp, dec_inp, targets, weights, buckets, classes, GRUSeq2Seq,
       softmax_loss_function=SampledLoss)
Esempio n. 3
0
    def __init__(self,
                 source_vocab_size,
                 target_vocab_size,
                 buckets,
                 source_proj_size,
                 target_proj_size,
                 encoder_size,
                 decoder_size,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 optimizer='sgd',
                 input_feeding=False,
                 combine_inp_attn=False,
                 dropout=0.0,
                 attention_f=None,
                 window_size=10,
                 content_function=None,
                 decoder_attention_f="None",
                 num_samples=512,
                 forward_only=False,
                 max_len=100,
                 cpu_only=False,
                 early_stop_patience=0,
                 save_best_model=True,
                 dtype=tf.float32):
        """Build the TensorFlow graph for the NMT model.

        Depending on `forward_only`, this builds either the bucketed training
        graph (losses, clipped gradients and update ops per bucket) or a
        decoding graph driven through placeholders (`decoder_init_plcholder`,
        `attn_plcholder` and optionally `decoder_states_holders`).

        Args:
          source_vocab_size: number of rows of the source embedding matrix.
          target_vocab_size: number of rows of the target embedding matrix and
            width of the output projection.
          buckets: list of (encoder length, decoder length) pairs; the last
            bucket is taken to be the largest one.
          source_proj_size: source embedding dimension.
          target_proj_size: target embedding dimension; also used to size the
            decode-time state and attention placeholders.
          encoder_size: forwarded to `cells.build_nmt_bidirectional_cell`.
          decoder_size: forwarded to `cells.build_nmt_bidirectional_cell`;
            also the input dimension of the output projection.
          max_gradient_norm: bound passed to `tf.clip_by_global_norm`.
          batch_size: stored on the model; also the increment applied by
            `samples_seen_update_op`.
          learning_rate: initial value of the `learning_rate` variable, and
            the value handed to the optimizer factory.
          learning_rate_decay_factor: multiplier applied by
            `learning_rate_decay_op`.
          optimizer: optimizer name passed to
            `optimization_ops.get_optimizer`.
          input_feeding: stored; forwarded to the decoder in decode mode.
          combine_inp_attn: stored; forwarded to the decoder in decode mode.
          dropout: stored and passed to the cell builder.
          attention_f: stored; forwarded to the decoder in decode mode.
          window_size: stored; forwarded to the decoder in decode mode.
          content_function: stored; forwarded to the decoder in decode mode.
          decoder_attention_f: the string "None" means disabled and is stored
            as `None` on the model.
          num_samples: sampled softmax is used as the loss only when
            0 < num_samples < target_vocab_size.
          forward_only: if set, build the decoding graph and skip the
            backward pass.
          max_len: in decode mode, encoder placeholders are extended up to
            this count.
          cpu_only: place the graph on "/cpu:0" instead of "/gpu:0".
          early_stop_patience: when > 0 (or when `save_best_model` is set) the
            early-stopping variables and ops are created.
          save_best_model: see `early_stop_patience`.
          dtype: stored and passed to the decoder in decode mode.
        """
        super(NMTModel, self).__init__()

        device = "/cpu:0" if cpu_only else "/gpu:0"

        with tf.device(device):

            self.source_vocab_size = source_vocab_size
            self.target_vocab_size = target_vocab_size
            self.buckets = buckets
            self.batch_size = batch_size
            self.attention_f = attention_f
            self.content_function = content_function
            self.window_size = window_size

            self.combine_inp_attn = combine_inp_attn

            # The string "None" is the caller-facing way to disable the
            # decoder attention function; normalise it to a real None.
            if decoder_attention_f == "None":
                self.decoder_attention_f = None
            else:
                self.decoder_attention_f = decoder_attention_f

            # learning rate ops
            self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
            self.learning_rate_decay_op = self.learning_rate.assign(self.learning_rate * learning_rate_decay_factor)

            # epoch ops
            self.epoch = tf.Variable(0, trainable=False)
            self.epoch_update_op = self.epoch.assign(self.epoch + 1)

            # samples seen ops
            self.samples_seen = tf.Variable(0, trainable=False)
            self.samples_seen_update_op = self.samples_seen.assign(self.samples_seen + batch_size)
            self.samples_seen_reset_op = self.samples_seen.assign(0)

            # global step variable - controlled by the model.
            # Kept as a float because it is the divisor of the average loss.
            self.global_step = tf.Variable(0.0, trainable=False)

            # average loss ops
            self.current_loss = tf.Variable(0.0, trainable=False)
            self.current_loss_update_op = None
            self.avg_loss = tf.Variable(0.0, trainable=False)
            self.avg_loss_update_op = self.avg_loss.assign(tf.div(self.current_loss, self.global_step))

            # Early-stopping bookkeeping is only created when it can be used.
            if early_stop_patience > 0 or save_best_model:
                self.best_eval_loss = tf.Variable(numpy.inf, trainable=False)
                self.estop_counter = tf.Variable(0, trainable=False)
                self.estop_counter_update_op = self.estop_counter.assign(self.estop_counter + 1)
                self.estop_counter_reset_op = self.estop_counter.assign(0)
            else:
                self.best_eval_loss = None
                self.estop_counter = None
                self.estop_counter_update_op = None
                self.estop_counter_reset_op = None

            self.source_proj_size = source_proj_size
            self.target_proj_size = target_proj_size
            self.encoder_size = encoder_size
            self.decoder_size = decoder_size

            self.input_feeding = input_feeding

            self.max_len = max_len
            self.dropout = dropout
            self.dropout_feed = tf.placeholder(tf.float32, name="dropout_rate")
            self.step_num = tf.Variable(0, trainable=False)

            self.dtype = dtype

            # The output projection is always created; the sampled softmax
            # loss below is only used when num_samples allows it.
            loss_function = None

            with tf.device("/cpu:0"):
                w = tf.get_variable("proj_w", [decoder_size, self.target_vocab_size])
                w_t = tf.transpose(w)
                b = tf.get_variable("proj_b", [self.target_vocab_size])
            self.output_projection = (w, b)

            self.sampled_softmax = False

            # Sampled softmax only makes sense if we sample less than vocabulary size.
            if 0 < num_samples < self.target_vocab_size:
                self.sampled_softmax = True

                def sampled_loss(inputs, labels):
                    # Closes over w_t and b; those names must not be rebound
                    # later in this method.
                    with tf.device("/cpu:0"):
                        labels = tf.reshape(labels, [-1, 1])
                        return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, num_samples,
                                                          self.target_vocab_size)

                loss_function = sampled_loss

            # create the embedding matrix - this must be done in the CPU for now
            with tf.device("/cpu:0"):
                self.src_embedding = tf.Variable(
                    tf.truncated_normal(
                        [source_vocab_size, source_proj_size], stddev=0.01
                    ),
                    name='embedding_src'
                )

                # decoder with attention
                with tf.name_scope('decoder_with_attention') as scope:
                    # create this variable to be used inside the embedding_attention_decoder
                    self.tgt_embedding = tf.Variable(
                        tf.truncated_normal(
                            [target_vocab_size, target_proj_size], stddev=0.01
                        ),
                        name='embedding'
                    )

            # Create the internal multi-layer cell for our RNN.
            self.encoder_cell_fw, self.encoder_cell_bw, self.decoder_cell = cells.build_nmt_bidirectional_cell(
                encoder_size, decoder_size, source_proj_size, target_proj_size, dropout=dropout)

            # The seq2seq function: we use embedding for the input and attention.
            def seq2seq_f(encoder_inputs, decoder_inputs):
                return self.inference(encoder_inputs, decoder_inputs)

            # Feeds for inputs.
            self.encoder_inputs = []
            self.decoder_inputs = []
            self.target_weights = []

            for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
                self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))

            # One extra decoder step so that shifted targets exist for every
            # position of the largest bucket.
            for i in xrange(buckets[-1][1] + 1):
                self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None, ], name="decoder{0}".format(i)))
                self.target_weights.append(tf.placeholder(tf.float32, shape=[None], name="weight{0}".format(i)))

            # Our targets are decoder inputs shifted by one.
            targets = [self.decoder_inputs[i + 1]
                       for i in xrange(len(self.decoder_inputs) - 1)]

            self.decoder_states_holders = None

            # Training outputs and losses.
            if forward_only:

                # Extend the encoder placeholders up to max_len for decoding.
                for i in xrange(len(self.encoder_inputs), self.max_len):
                    self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))

                b_size = array_ops.shape(self.encoder_inputs[0])[0]

                # NOTE(review): the original comment listed four return values
                # (context, decoder_initial_state, attention_states,
                # input_length) but three are unpacked here; confirm against
                # `self.encode`.
                self.ret0, self.ret1, self.ret2 = self.encode(self.encoder_inputs, b_size)

                self.decoder_init_plcholder = tf.placeholder(tf.float32,
                                                             shape=[None, (target_proj_size) * 2],
                                                             name="decoder_init")

                # shape of this placeholder: the first None indicate the batch size and the second the input length
                self.attn_plcholder = tf.placeholder(tf.float32,
                                                     shape=[None, self.ret2.get_shape()[1], target_proj_size],
                                                     name="attention_states")

                if self.decoder_attention_f is not None:
                    self.decoder_states_holders = tf.placeholder(tf.float32, shape=[None, None, 1, decoder_size],
                                                                 name="decoder_state")

                # NOTE(review): the raw `decoder_attention_f` argument (which
                # may be the string "None") is forwarded here rather than the
                # normalised `self.decoder_attention_f`; confirm that
                # `attention_decoder_nmt` handles the string sentinel.
                self.logits, self.states = attention_decoder_nmt(
                    decoder_inputs=[self.decoder_inputs[0]], initial_state=self.decoder_init_plcholder,
                    attention_states=self.attn_plcholder, cell=self.decoder_cell,
                    num_symbols=target_vocab_size, attention_f=attention_f,
                    window_size=window_size, content_function=content_function,
                    decoder_attention_f=decoder_attention_f, combine_inp_attn=combine_inp_attn,
                    input_feeding=input_feeding, dropout=self.dropout_feed, initializer=None,
                    dtype=dtype
                )

                # If we use output projection, we need to project outputs for decoding.
                self.logits = tf.nn.xw_plus_b(self.logits[-1], self.output_projection[0], self.output_projection[1])
                self.logits = nn_ops.softmax(self.logits)

            else:

                tf_version = pkg_resources.get_distribution("tensorflow").version

                # TensorFlow 0.5.0 / 0.6.0 use the library model_with_buckets,
                # which takes the number of decoder symbols; other versions
                # use the model_with_buckets that is in scope in this module.
                if tf_version == "0.6.0" or tf_version == "0.5.0":

                    self.outputs, self.losses = seq2seq.model_with_buckets(
                        encoder_inputs=self.encoder_inputs, decoder_inputs=self.decoder_inputs,
                        targets=targets, weights=self.target_weights, num_decoder_symbols=self.target_vocab_size,
                        buckets=buckets, seq2seq=seq2seq_f, softmax_loss_function=loss_function)

                else:

                    self.outputs, self.losses = model_with_buckets(
                        encoder_inputs=self.encoder_inputs, decoder_inputs=self.decoder_inputs,
                        targets=targets, weights=self.target_weights, buckets=buckets,
                        seq2seq_f=seq2seq_f, softmax_loss_function=loss_function)

            # Gradients and SGD update operation for training the model.
            if not forward_only:
                params = tf.trainable_variables()
                self.gradient_norms = []
                self.updates = []
                # NOTE(review): the optimizer receives the Python-level
                # `learning_rate`, not the `self.learning_rate` variable, so
                # `learning_rate_decay_op` does not appear to affect it;
                # confirm whether `get_optimizer` accepts a tensor before
                # switching.
                opt = optimization_ops.get_optimizer(optimizer, learning_rate)
                # The loop variable must not be called `b`: that name is the
                # projection bias captured by `sampled_loss`.
                for bucket_id in xrange(len(buckets)):
                    gradients = tf.gradients(self.losses[bucket_id], params)
                    clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                                     max_gradient_norm)
                    self.gradient_norms.append(norm)
                    self.updates.append(opt.apply_gradients(
                        zip(clipped_gradients, params), global_step=self.global_step))

            self.saver = tf.train.Saver(tf.all_variables())
            self.saver_best = tf.train.Saver(tf.all_variables())
Esempio n. 4
0
    def __init__(self,
                 source_vocab_size,
                 target_vocab_size,
                 buckets,
                 size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 use_lstm=False,
                 num_samples=512,
                 forward_only=False):
        """Create the model.

        Args:
          source_vocab_size: size of the source vocabulary.
          target_vocab_size: size of the target vocabulary.
          buckets: a list of pairs (I, O), where I specifies maximum input
            length that will be processed in that bucket, and O specifies
            maximum output length. Training instances that have inputs longer
            than I or outputs longer than O will be pushed to the next bucket
            and padded accordingly. We assume that the list is sorted, e.g.,
            [(2, 4), (8, 16)].
          size: number of units in each layer of the model.
          num_layers: number of layers in the model.
          max_gradient_norm: gradients will be clipped to maximally this norm.
          batch_size: the size of the batches used during training;
            the model construction is independent of batch_size, so it can be
            changed after initialization if this is convenient, e.g., for
            decoding.
          learning_rate: learning rate to start with.
          learning_rate_decay_factor: decay learning rate by this much when
            needed.
          use_lstm: if true, we use LSTM cells instead of GRU cells.
          num_samples: number of samples for sampled softmax; sampled softmax
            (and the output projection) is only used when
            0 < num_samples < target_vocab_size.
          forward_only: if set, we do not construct the backward pass in the
            model, and the decoder is built with feed_previous=True.
        """
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None
        # Sampled softmax only makes sense if we sample less than vocabulary size.
        if 0 < num_samples < self.target_vocab_size:
            with tf.device("/cpu:0"):
                w = tf.get_variable("proj_w", [size, self.target_vocab_size])
                w_t = tf.transpose(w)
                b = tf.get_variable("proj_b", [self.target_vocab_size])
            output_projection = (w, b)

            def sampled_loss(inputs, labels):
                # Closes over w_t and b; those names must not be rebound
                # later in this method.
                with tf.device("/cpu:0"):
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels,
                                                      num_samples,
                                                      self.target_vocab_size)

            softmax_loss_function = sampled_loss

        # Create the internal multi-layer cell for our RNN.
        if use_lstm:
            single_cell = rnn_cell.BasicLSTMCell(size)
        else:
            single_cell = rnn_cell.GRUCell(size)
        cell = single_cell
        if num_layers > 1:
            # The same cell object is repeated for every layer.
            cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            return seq2seq.embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                cell,
                source_vocab_size,
                target_vocab_size,
                output_projection=output_projection,
                feed_previous=do_decode)

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="encoder{0}".format(i)))
        # One extra decoder step so that shifted targets exist for every
        # position of the largest bucket.
        for i in xrange(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(tf.float32,
                               shape=[None],
                               name="weight{0}".format(i)))

        # Our targets are decoder inputs shifted by one.
        targets = self.decoder_inputs[1:]

        # Training outputs and losses. The graph is the same in both modes
        # except that decoding feeds back the previous output.
        do_decode = bool(forward_only)
        self.outputs, self.losses = seq2seq.model_with_buckets(
            self.encoder_inputs,
            self.decoder_inputs,
            targets,
            self.target_weights,
            buckets,
            lambda x, y: seq2seq_f(x, y, do_decode),
            softmax_loss_function=softmax_loss_function)

        # If we use output projection, we need to project outputs for decoding.
        if forward_only and output_projection is not None:
            for bucket_id in xrange(len(buckets)):
                self.outputs[bucket_id] = [
                    tf.matmul(output, output_projection[0]) +
                    output_projection[1]
                    for output in self.outputs[bucket_id]
                ]

        # Gradients and SGD update operation for training the model.
        if not forward_only:
            params = tf.trainable_variables()
            self.gradient_norms = []
            self.updates = []
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            # The loop variable must not be called `b`: that name is the
            # projection bias captured by `sampled_loss`.
            for bucket_id in xrange(len(buckets)):
                gradients = tf.gradients(self.losses[bucket_id], params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients, params),
                                        global_step=self.global_step))

        self.saver = tf.train.Saver(tf.all_variables())
Esempio n. 5
0
  def __init__(self, source_vocab_size, target_vocab_size, buckets, size,
               num_layers, max_gradient_norm, batch_size, learning_rate,
               learning_rate_decay_factor, use_lstm=False,
               num_samples=512, forward_only=False):
    """Create the model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded accordingly.
        We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax; sampled softmax (and
        the output projection) is only used when
        0 < num_samples < target_vocab_size.
      forward_only: if set, we do not construct the backward pass in the model,
        and the decoder is built with feed_previous=True.
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary size.
    if 0 < num_samples < self.target_vocab_size:
      with tf.device("/cpu:0"):
        w = tf.get_variable("proj_w", [size, self.target_vocab_size])
        w_t = tf.transpose(w)
        b = tf.get_variable("proj_b", [self.target_vocab_size])
      output_projection = (w, b)

      def sampled_loss(inputs, labels):
        # Closes over w_t and b; those names must not be rebound later in
        # this method.
        with tf.device("/cpu:0"):
          labels = tf.reshape(labels, [-1, 1])
          return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, num_samples,
                                            self.target_vocab_size)
      softmax_loss_function = sampled_loss

    # Create the internal multi-layer cell for our RNN.
    if use_lstm:
      single_cell = rnn_cell.BasicLSTMCell(size)
    else:
      single_cell = rnn_cell.GRUCell(size)
    cell = single_cell
    if num_layers > 1:
      # The same cell object is repeated for every layer.
      cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)

    # The seq2seq function: we use embedding for the input and attention.
    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
      return seq2seq.embedding_attention_seq2seq(
          encoder_inputs, decoder_inputs, cell, source_vocab_size,
          target_vocab_size, output_projection=output_projection,
          feed_previous=do_decode)

    # Feeds for inputs.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
      self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                                name="encoder{0}".format(i)))
    # One extra decoder step so that shifted targets exist for every position
    # of the largest bucket.
    for i in xrange(buckets[-1][1] + 1):
      self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                                name="decoder{0}".format(i)))
      self.target_weights.append(tf.placeholder(tf.float32, shape=[None],
                                                name="weight{0}".format(i)))

    # Our targets are decoder inputs shifted by one.
    targets = self.decoder_inputs[1:]

    # Training outputs and losses. The graph is the same in both modes except
    # that decoding feeds back the previous output.
    do_decode = bool(forward_only)
    self.outputs, self.losses = seq2seq.model_with_buckets(
        self.encoder_inputs, self.decoder_inputs, targets,
        self.target_weights, buckets, self.target_vocab_size,
        lambda x, y: seq2seq_f(x, y, do_decode),
        softmax_loss_function=softmax_loss_function)

    # If we use output projection, we need to project outputs for decoding.
    if forward_only and output_projection is not None:
      for bucket_id in xrange(len(buckets)):
        self.outputs[bucket_id] = [
            tf.matmul(output, output_projection[0]) + output_projection[1]
            for output in self.outputs[bucket_id]
        ]

    # Gradients and SGD update operation for training the model.
    if not forward_only:
      params = tf.trainable_variables()
      self.gradient_norms = []
      self.updates = []
      opt = tf.train.GradientDescentOptimizer(self.learning_rate)
      # The loop variable must not be called `b`: that name is the projection
      # bias captured by `sampled_loss`.
      for bucket_id in xrange(len(buckets)):
        gradients = tf.gradients(self.losses[bucket_id], params)
        clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                         max_gradient_norm)
        self.gradient_norms.append(norm)
        self.updates.append(opt.apply_gradients(
            zip(clipped_gradients, params), global_step=self.global_step))

    self.saver = tf.train.Saver(tf.all_variables())
Esempio n. 6
0
    def __init__(self,
                 vocab_size,
                 buckets_or_sentence_length,
                 size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 model_type,
                 use_lstm=True,
                 num_samples=512,
                 forward_only=False):
        """Create the model.  This constructor can be used to created an embedded or embedded-attention, bucketed or non-bucketed model made of single or multi-layer RNN cells. 
    Args:
      vocab_size: Size of the vocabulary.
      target_vocab_size: Size of the target vocabulary.
      buckets_or_sentence_length: 
        If using buckets:
          A list of pairs (I, O), where I specifies maximum input length
          that will be processed in that bucket, and O specifies maximum output
          length. Training instances that have inputs longer than I or outputs
          longer than O will be pushed to the next bucket and padded accordingly.
          We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
        Else:
          Number of the maximum number of words per sentence.
      size: Number of units in each layer of the model.
      num_layers: Number of layers in the model.
      max_gradient_norm: Gradients will be clipped to maximally this norm.
      batch_size: The size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for decoding.
      learning_rate: Learning rate to start with.
      learning_rate_decay_factor: Decay learning rate by this much when needed.
      num_samples: Number of samples for sampled softmax.
      forward_only: If set, we do not construct the backward pass in the model.
    """
        # Need to determine if we're using buckets or not:
        if type(buckets_or_sentence_length) == list:
            self.buckets = buckets_or_sentence_length
        else:
            self.max_sentence_length = buckets_or_sentence_length

        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None
        # Sampled softmax only makes sense if we sample less than vocabulary size.
        if num_samples > 0 and num_samples < self.vocab_size:
            with tf.device("/cpu:0"):
                w = tf.get_variable("proj_w", [size, self.vocab_size])
                w_t = tf.transpose(w)
                b = tf.get_variable("proj_b", [self.vocab_size])
            output_projection = (w, b)

            def sampled_loss(inputs, labels):
                with tf.device("/cpu:0"):
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels,
                                                      num_samples,
                                                      self.vocab_size)

            softmax_loss_function = sampled_loss

        # Create the internal multi-layer cell for our RNN.
        single_cell = rnn_cell.GRUCell(size)
        if use_lstm:
            single_cell = rnn_cell.BasicLSTMCell(size)
        cell = single_cell  #i, j, f, o = array_ops.split(1, 4, concat)
        if num_layers > 1:
            cell = rnn_cell.MultiRNNCell(
                [single_cell] *
                num_layers)  #cur_inp, array_ops.concat(1, new_states)

        # The seq2seq function: we use embedding for the input and attention (if applicable).
        if model_type is 'embedding_attention':

            def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
                return seq2seq.embedding_attention_seq2seq(
                    encoder_inputs,
                    decoder_inputs,
                    cell,
                    vocab_size,
                    vocab_size,
                    output_projection=output_projection,
                    feed_previous=do_decode)
        else:  # just build embedding model, I should probably change this to throw an error

            def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
                return seq2seq.embedding_rnn_seq2seq(
                    encoder_inputs,
                    decoder_inputs,
                    cell,
                    vocab_size,
                    vocab_size,
                    output_projection=output_projection,
                    feed_previous=do_decode)

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []

        # NOTE: If the model is not bucketed, these try blocks will throw an AttributeError and execute code to build a non-bucketed model.
        try:
            encoder_range = self.buckets[-1][0]
            decoder_range = self.buckets[-1][1]
        except AttributeError:
            encoder_range, decoder_range = self.max_sentence_length, self.max_sentence_length

        for i in xrange(encoder_range):  # Last bucket is the biggest one.
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="encoder{0}".format(i)))
        for i in xrange(decoder_range + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(tf.float32,
                               shape=[None],
                               name="weight{0}".format(i)))

        # Our targets are decoder inputs shifted by one.
        targets = [
            self.decoder_inputs[i + 1]
            for i in xrange(len(self.decoder_inputs) - 1)
        ]

        # Training outputs and losses.
        try:
            if forward_only:
                self.outputs, self.losses = seq2seq.model_with_buckets(
                    self.encoder_inputs,
                    self.decoder_inputs,
                    targets,
                    self.target_weights,
                    self.buckets,
                    self.vocab_size,
                    lambda x, y: seq2seq_f(x, y, True),
                    softmax_loss_function=softmax_loss_function)
                # If we use output projection, we need to project outputs for decoding.
                if output_projection is not None:
                    for b in xrange(len(self.buckets)):
                        self.outputs[b] = [
                            tf.nn.xw_plus_b(output, output_projection[0],
                                            output_projection[1])
                            for output in self.outputs[b]
                        ]
            else:
                self.outputs, self.losses = seq2seq.model_with_buckets(
                    self.encoder_inputs,
                    self.decoder_inputs,
                    targets,
                    self.target_weights,
                    self.buckets,
                    self.vocab_size,
                    lambda x, y: seq2seq_f(x, y, False),
                    softmax_loss_function=softmax_loss_function)

        except AttributeError:
            if forward_only:
                self.outputs, self.states = seq2seq_f(self.encoder_inputs,
                                                      self.decoder_inputs[:-1],
                                                      True)
                self.losses = seq2seq.sequence_loss(
                    self.outputs,
                    targets,
                    self.target_weights[:-1],
                    self.vocab_size,
                    softmax_loss_function=softmax_loss_function)
                # Project outputs for decoding
                if output_projection is not None:
                    self.outputs = [
                        tf.nn.xw_plus_b(output, output_projection[0],
                                        output_projection[1])
                        for output in self.outputs
                    ]
            else:
                self.outputs, self.states = seq2seq_f(self.encoder_inputs,
                                                      self.decoder_inputs[:-1],
                                                      False)
                self.losses = (seq2seq.sequence_loss(
                    self.outputs,
                    targets,
                    self.target_weights[:-1],
                    self.vocab_size,
                    softmax_loss_function=softmax_loss_function))

        # Gradients and SGD update operation for training the model.
        params = tf.trainable_variables()
        self.params = params  # Hold onto this for Woz
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)

            try:
                for b in xrange(len(self.buckets)):
                    gradients = tf.gradients(self.losses[b], params)
                    clipped_gradients, norm = tf.clip_by_global_norm(
                        gradients, max_gradient_norm)
                    self.gradient_norms.append(norm)
                    self.updates.append(
                        opt.apply_gradients(zip(clipped_gradients, params),
                                            global_step=self.global_step))
            except AttributeError:
                gradients = tf.gradients(self.losses, params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms = norm
                self.updates = opt.apply_gradients(
                    zip(clipped_gradients, params),
                    global_step=self.global_step)

        self.saver = tf.train.Saver(tf.all_variables())
Esempio n. 7
0
  def __init__(self, source_vocab_size, target_vocab_size, buckets, size,
               num_layers, max_gradient_norm, batch_size, learning_rate,
               learning_rate_decay_factor, use_lstm=False,
               num_samples=512, forward_only=False):
    """Build the bucketed embedding-attention seq2seq graph.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: list of (input_length, output_length) pairs; the placeholders
        are sized from the last pair, so it has to be the largest one.
      size: number of units in each RNN cell; also the input dimension of the
        output projection.
      num_layers: number of stacked cells; a value above 1 wraps the cell in
        a MultiRNNCell.
      max_gradient_norm: global norm that gradients are clipped to.
      batch_size: stored on the instance; not used while building the graph.
      learning_rate: initial learning rate.
      learning_rate_decay_factor: factor the learning rate is multiplied by
        when `learning_rate_decay_op` is run.
      use_lstm: if true, use BasicLSTMCell instead of GRUCell.
      num_samples: number of classes drawn by the sampled softmax; the
        sampled loss is only set up when 0 < num_samples < target_vocab_size.
      forward_only: if true, the model is built with feed_previous enabled
        and no gradient/update ops are created.
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    decayed_rate = self.learning_rate * learning_rate_decay_factor
    self.learning_rate_decay_op = self.learning_rate.assign(decayed_rate)
    self.global_step = tf.Variable(0, trainable=False)

    # Sampled softmax needs an explicit output projection, and is only
    # set up when fewer classes are sampled than the target vocabulary holds.
    output_projection = None
    softmax_loss_function = None
    if 0 < num_samples < self.target_vocab_size:
      with tf.device("/cpu:0"):
        proj_w = tf.get_variable("proj_w", [size, self.target_vocab_size])
        proj_w_t = tf.transpose(proj_w)
        proj_b = tf.get_variable("proj_b", [self.target_vocab_size])
      output_projection = (proj_w, proj_b)

      def sampled_loss(inputs, labels):
        with tf.device("/cpu:0"):
          column_labels = tf.reshape(labels, [-1, 1])
          return tf.nn.sampled_softmax_loss(
              proj_w_t, proj_b, inputs, column_labels, num_samples,
              self.target_vocab_size)

      softmax_loss_function = sampled_loss

    # Recurrent cell: GRU by default, LSTM on request, stacked if asked for.
    single_cell = rnn_cell.GRUCell(size)
    if use_lstm:
      single_cell = rnn_cell.BasicLSTMCell(size)
    if num_layers > 1:
      cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)
    else:
      cell = single_cell

    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
      return seq2seq.embedding_attention_seq2seq(
          encoder_inputs, decoder_inputs, cell, source_vocab_size,
          target_vocab_size, output_projection=output_projection,
          feed_previous=do_decode)

    # Input feeds, sized for the last (largest) bucket.
    self.encoder_inputs = [
        tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(step))
        for step in xrange(buckets[-1][0])]
    self.decoder_inputs = []
    self.target_weights = []
    for step in xrange(buckets[-1][1] + 1):
      decoder_feed = tf.placeholder(tf.int32, shape=[None],
                                    name="decoder{0}".format(step))
      self.decoder_inputs.append(decoder_feed)
      weight_feed = tf.placeholder(tf.float32, shape=[None],
                                   name="weight{0}".format(step))
      self.target_weights.append(weight_feed)

    # Targets are the decoder inputs shifted left by one step.
    targets = self.decoder_inputs[1:]

    # Both modes build the same bucketed model; only feed_previous differs.
    decode_mode = bool(forward_only)
    self.outputs, self.losses = seq2seq.model_with_buckets(
        self.encoder_inputs, self.decoder_inputs, targets,
        self.target_weights, buckets, self.target_vocab_size,
        lambda x, y: seq2seq_f(x, y, decode_mode),
        softmax_loss_function=softmax_loss_function)

    # When decoding with an output projection, project the raw cell outputs.
    if forward_only and output_projection is not None:
      for bucket_id in xrange(len(buckets)):
        projected = [tf.matmul(output, output_projection[0]) +
                     output_projection[1]
                     for output in self.outputs[bucket_id]]
        self.outputs[bucket_id] = projected

    # Clipped-gradient SGD update, one op per bucket (training only).
    params = tf.trainable_variables()
    if not forward_only:
      self.gradient_norms = []
      self.updates = []
      opt = tf.train.GradientDescentOptimizer(self.learning_rate)
      for bucket_id in xrange(len(buckets)):
        gradients = tf.gradients(self.losses[bucket_id], params)
        clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                         max_gradient_norm)
        self.gradient_norms.append(norm)
        update_op = opt.apply_gradients(
            zip(clipped_gradients, params), global_step=self.global_step)
        self.updates.append(update_op)

    self.saver = tf.train.Saver(tf.all_variables())
Esempio n. 8
0
    def __init__(self,
                 source_vocab_size,
                 target_vocab_size,
                 buckets,
                 size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 use_lstm=False,
                 num_samples=512,
                 forward_only=False):
        """Create the bucketed attention seq2seq model.

        Args:
          source_vocab_size: size of the source vocabulary.
          target_vocab_size: size of the target vocabulary.
          buckets: list of (input_length, output_length) pairs; the last
            pair determines how many placeholders are created.
          size: number of units per RNN cell and input width of the output
            projection.
          num_layers: number of stacked cells (MultiRNNCell when > 1).
          max_gradient_norm: gradients are clipped to this global norm.
          batch_size: kept on the instance; graph construction does not
            read it.
          learning_rate: starting learning rate.
          learning_rate_decay_factor: multiplier applied by
            `learning_rate_decay_op`.
          use_lstm: use BasicLSTMCell instead of GRUCell when true.
          num_samples: sample count for the sampled softmax, which is only
            enabled when 0 < num_samples < target_vocab_size.
          forward_only: when true, build with feed_previous enabled and skip
            the gradient/update ops.
        """
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        # Output projection and sampled loss, only if sampling is useful.
        output_projection = None
        softmax_loss_function = None
        sampling_enabled = 0 < num_samples < self.target_vocab_size
        if sampling_enabled:
            with tf.device("/cpu:0"):
                weights = tf.get_variable("proj_w",
                                          [size, self.target_vocab_size])
                weights_t = tf.transpose(weights)
                biases = tf.get_variable("proj_b", [self.target_vocab_size])
            output_projection = (weights, biases)

            def sampled_loss(inputs, labels):
                with tf.device("/cpu:0"):
                    labels_2d = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(
                        weights_t, biases, inputs, labels_2d, num_samples,
                        self.target_vocab_size)

            softmax_loss_function = sampled_loss

        # GRU unless LSTM is requested; stack the cell for deeper models.
        single_cell = rnn_cell.GRUCell(size)
        if use_lstm:
            single_cell = rnn_cell.BasicLSTMCell(size)
        cell = (rnn_cell.MultiRNNCell([single_cell] * num_layers)
                if num_layers > 1 else single_cell)

        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            return seq2seq.embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                cell,
                source_vocab_size,
                target_vocab_size,
                output_projection=output_projection,
                feed_previous=do_decode)

        # Placeholders are sized for the last bucket.
        self.encoder_inputs = [
            tf.placeholder(tf.int32,
                           shape=[None],
                           name="encoder{0}".format(pos))
            for pos in xrange(buckets[-1][0])
        ]
        self.decoder_inputs = []
        self.target_weights = []
        for pos in xrange(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder{0}".format(pos)))
            self.target_weights.append(
                tf.placeholder(tf.float32,
                               shape=[None],
                               name="weight{0}".format(pos)))

        # Each target is the decoder input of the following step.
        targets = self.decoder_inputs[1:]

        # One bucketed model for both modes; feed_previous follows
        # forward_only.
        feed_previous = bool(forward_only)
        self.outputs, self.losses = seq2seq.model_with_buckets(
            self.encoder_inputs,
            self.decoder_inputs,
            targets,
            self.target_weights,
            buckets,
            self.target_vocab_size,
            lambda x, y: seq2seq_f(x, y, feed_previous),
            softmax_loss_function=softmax_loss_function)

        # Decoding with a projection: map the cell outputs through it.
        if forward_only and output_projection is not None:
            for bucket_id in xrange(len(buckets)):
                self.outputs[bucket_id] = [
                    tf.matmul(output, output_projection[0]) +
                    output_projection[1]
                    for output in self.outputs[bucket_id]
                ]

        # Training ops: per-bucket clipped gradients applied with SGD.
        params = tf.trainable_variables()
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            for bucket_id in xrange(len(buckets)):
                gradients = tf.gradients(self.losses[bucket_id], params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients, params),
                                        global_step=self.global_step))

        self.saver = tf.train.Saver(tf.all_variables())
Esempio n. 9
0
  def __init__(self, vocab_size, buckets_or_sentence_length, size,
               num_layers, max_gradient_norm, batch_size, learning_rate,
               learning_rate_decay_factor, model_type, use_lstm=True,
               num_samples=512, forward_only=False):
    """Create the model.

    This constructor can be used to create an embedding or
    embedding-attention, bucketed or non-bucketed model made of single or
    multi-layer RNN cells.

    Args:
      vocab_size: Size of the vocabulary; used for both the encoder and the
        decoder symbols.
      buckets_or_sentence_length:
        If using buckets:
          A list of pairs (I, O), where I specifies maximum input length
          that will be processed in that bucket, and O specifies maximum output
          length. Training instances that have inputs longer than I or outputs
          longer than O will be pushed to the next bucket and padded accordingly.
          We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
        Else:
          The maximum number of words per sentence.
      size: Number of units in each layer of the model.
      num_layers: Number of layers in the model.
      max_gradient_norm: Gradients will be clipped to maximally this norm.
      batch_size: The size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for decoding.
      learning_rate: Learning rate to start with.
      learning_rate_decay_factor: Decay learning rate by this much when needed.
      model_type: 'embedding_attention' builds the attention model; any other
        value falls back to the plain embedding model.
      use_lstm: If true, use LSTM cells instead of GRU cells.
      num_samples: Number of samples for sampled softmax.
      forward_only: If set, we do not construct the backward pass in the model.
    """
    # Decide once whether the model is bucketed. The rest of the constructor
    # branches on this flag instead of catching AttributeError, so a genuine
    # AttributeError raised while building the graph is no longer mistaken
    # for "this model has no buckets".
    bucketed = isinstance(buckets_or_sentence_length, list)
    if bucketed:
      self.buckets = buckets_or_sentence_length
    else:
      self.max_sentence_length = buckets_or_sentence_length

    self.vocab_size = vocab_size
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary size.
    if num_samples > 0 and num_samples < self.vocab_size:
      with tf.device("/cpu:0"):
        w = tf.get_variable("proj_w", [size, self.vocab_size])
        w_t = tf.transpose(w)
        b = tf.get_variable("proj_b", [self.vocab_size])
      output_projection = (w, b)

      def sampled_loss(inputs, labels):
        with tf.device("/cpu:0"):
          labels = tf.reshape(labels, [-1, 1])
          return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, num_samples,
                                            self.vocab_size)
      softmax_loss_function = sampled_loss

    # Create the internal multi-layer cell for our RNN.
    single_cell = rnn_cell.GRUCell(size)
    if use_lstm:
      single_cell = rnn_cell.BasicLSTMCell(size)
    cell = single_cell
    if num_layers > 1:
      cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)

    # The seq2seq function: embedding for the input, plus attention when
    # requested. Compare by value: `is` only matched strings that happened to
    # be the same interned object as the literal.
    if model_type == 'embedding_attention':
      model_fn = seq2seq.embedding_attention_seq2seq
    else:
      # Any other model_type silently gets the plain embedding model.
      model_fn = seq2seq.embedding_rnn_seq2seq

    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
      return model_fn(encoder_inputs, decoder_inputs, cell, vocab_size,
                      vocab_size, output_projection=output_projection,
                      feed_previous=do_decode)

    # Feeds for inputs.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []

    if bucketed:
      # Last bucket is the biggest one.
      encoder_range = self.buckets[-1][0]
      decoder_range = self.buckets[-1][1]
    else:
      encoder_range = self.max_sentence_length
      decoder_range = self.max_sentence_length

    for i in xrange(encoder_range):
      self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                                name="encoder{0}".format(i)))
    for i in xrange(decoder_range + 1):
      self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                                name="decoder{0}".format(i)))
      self.target_weights.append(tf.placeholder(tf.float32, shape=[None],
                                                name="weight{0}".format(i)))

    # Our targets are decoder inputs shifted by one.
    targets = self.decoder_inputs[1:]

    # Training outputs and losses. Decoding mode only changes feed_previous
    # and, afterwards, whether the outputs get projected.
    do_decode = bool(forward_only)
    if bucketed:
      self.outputs, self.losses = seq2seq.model_with_buckets(
          self.encoder_inputs, self.decoder_inputs, targets,
          self.target_weights, self.buckets, self.vocab_size,
          lambda x, y: seq2seq_f(x, y, do_decode),
          softmax_loss_function=softmax_loss_function)
      # If we use output projection, we need to project outputs for decoding.
      if forward_only and output_projection is not None:
        for b in xrange(len(self.buckets)):
          self.outputs[b] = [tf.nn.xw_plus_b(output, output_projection[0],
                                             output_projection[1])
                             for output in self.outputs[b]]
    else:
      # Non-bucketed: one model over the full sentence length. The last
      # decoder input / weight has no target, so it is dropped.
      self.outputs, self.states = seq2seq_f(
          self.encoder_inputs, self.decoder_inputs[:-1], do_decode)
      self.losses = seq2seq.sequence_loss(
          self.outputs, targets, self.target_weights[:-1], self.vocab_size,
          softmax_loss_function=softmax_loss_function)
      # Project outputs for decoding.
      if forward_only and output_projection is not None:
        self.outputs = [tf.nn.xw_plus_b(output, output_projection[0],
                                        output_projection[1])
                        for output in self.outputs]

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()
    self.params = params  # Hold onto this for Woz
    if not forward_only:
      opt = tf.train.GradientDescentOptimizer(self.learning_rate)
      if bucketed:
        # One norm and one update op per bucket.
        self.gradient_norms = []
        self.updates = []
        for b in xrange(len(self.buckets)):
          gradients = tf.gradients(self.losses[b], params)
          clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                           max_gradient_norm)
          self.gradient_norms.append(norm)
          self.updates.append(opt.apply_gradients(
              zip(clipped_gradients, params), global_step=self.global_step))
      else:
        # Single loss: gradient_norms and updates are single ops, not lists.
        gradients = tf.gradients(self.losses, params)
        clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                         max_gradient_norm)
        self.gradient_norms = norm
        self.updates = opt.apply_gradients(
            zip(clipped_gradients, params), global_step=self.global_step)

    self.saver = tf.train.Saver(tf.all_variables())
    def __init__(self, vocab_size, buckets, size,
                 num_layers, max_gradient_norm, batch_size, learning_rate,
                 learning_rate_decay_factor, use_lstm=False,
                 num_samples=512):
        """Create the training graph of a bucketed attention seq2seq model.

        Args:
          vocab_size: size of the source/target vocabulary.
          buckets: a list of pairs (I, O), where I specifies maximum input
            length that will be processed in that bucket, and O specifies
            maximum output length. Training instances that have inputs longer
            than I or outputs longer than O will be pushed to the next bucket
            and padded accordingly. We assume that the list is sorted, e.g.,
            [(2, 4), (8, 16)].
          size: number of units in each layer of the model.
          num_layers: number of layers in the model.
          max_gradient_norm: gradients will be clipped to maximally this norm.
          batch_size: the size of the batches used during training; model
            construction does not depend on it.
          learning_rate: learning rate to start with.
          learning_rate_decay_factor: decay learning rate by this much when
            needed.
          use_lstm: if true, we use LSTM cells instead of GRU cells.
          num_samples: accepted but not used by this constructor.
        """
        self.vocab_size = vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False)
        decayed_rate = self.learning_rate * learning_rate_decay_factor
        self.learning_rate_decay_op = self.learning_rate.assign(decayed_rate)
        self.global_step = tf.Variable(0, trainable=False)

        # Recurrent cell: GRU by default, LSTM on request, stacked if needed.
        single_cell = rnn_cell.GRUCell(size)
        if use_lstm:
            single_cell = rnn_cell.BasicLSTMCell(size)
        if num_layers > 1:
            cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)
        else:
            cell = single_cell

        # Embedding + attention model; previous outputs are never fed back.
        def seq2seq_f(encoder_inputs, decoder_inputs):
            return seq2seq.embedding_attention_seq2seq(
                encoder_inputs, decoder_inputs, cell, vocab_size,
                vocab_size, feed_previous=False)

        # Input feeds; the last bucket is the biggest one.
        self.encoder_inputs = [
            tf.placeholder(tf.int32, shape=[None],
                           name="encoder{0}".format(step))
            for step in xrange(buckets[-1][0])]
        self.decoder_inputs = []
        self.target_weights = []
        for step in xrange(buckets[-1][1] + 1):
            decoder_feed = tf.placeholder(tf.int32, shape=[None],
                                          name="decoder{0}".format(step))
            self.decoder_inputs.append(decoder_feed)
            weight_feed = tf.placeholder(tf.float32, shape=[None],
                                         name="weight{0}".format(step))
            self.target_weights.append(weight_feed)

        # Targets are the decoder inputs shifted by one.
        targets = self.decoder_inputs[1:]

        # Per-bucket outputs and losses with the default softmax loss.
        self.outputs, self.losses = seq2seq.model_with_buckets(
            self.encoder_inputs, self.decoder_inputs, targets,
            self.target_weights, buckets, self.vocab_size,
            seq2seq_f, softmax_loss_function=None)

        # Clipped-gradient SGD update, one op per bucket.
        params = tf.trainable_variables()
        self.gradient_norms = []
        self.updates = []
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        for bucket_id in xrange(len(buckets)):
            gradients = tf.gradients(self.losses[bucket_id], params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(norm)
            update_op = opt.apply_gradients(
                zip(clipped_gradients, params), global_step=self.global_step)
            self.updates.append(update_op)
        self.saver = tf.train.Saver(tf.all_variables())
Esempio n. 11
0
    def __init__(self,
                 num_input_tokens, num_target_tokens,  # vocabulary sizes
                 max_input_seq_length, max_target_seq_length,  # sentence lengths
                 input_word2idx, target_word2idx,  # {token: index}
                 input_idx2word, target_idx2word,  # {index: token}
                 num_samples=512, use_lstm=False,
                 NUM_HIDDEN_UNITS=None, NUM_HIDDEN_LAYERS=None,
                 buckets=None,
                 batch_size=None,
                 learning_rate=None, learning_rate_decay_factor=None,
                 forward_only=False,
                 max_gradient_norm=None,
                 ):
        """Build the bucketed embedding-attention seq2seq graph.

        The parameters that follow `num_samples`/`use_lstm` in the signature
        are required. They carry a `None` default only so that the original
        parameter order stays valid Python (a parameter without a default may
        not follow one with a default); leaving any of them out raises
        TypeError.

        Args:
          num_input_tokens: size of the input (source) vocabulary.
          num_target_tokens: size of the target vocabulary.
          max_input_seq_length: stored as `max_input_seq_length`.
          max_target_seq_length: stored as `max_output_seq_length`.
          input_word2idx: stored as `input_word2idx`.
          target_word2idx: stored as `output_word2idx`.
          input_idx2word: stored as `input_idx2word`.
          target_idx2word: stored as `output_idx2word`.
          num_samples: number of classes drawn by the sampled softmax; the
            sampled loss is only set up when
            0 < num_samples < num_target_tokens.
          use_lstm: if true, use BasicLSTMCell instead of GRUCell.
          NUM_HIDDEN_UNITS: number of units in each RNN cell (required).
          NUM_HIDDEN_LAYERS: number of stacked cells (required).
          buckets: list of (input_length, output_length) pairs; placeholders
            are sized from the last pair (required).
          batch_size: stored on the instance (required).
          learning_rate: initial learning rate (required).
          learning_rate_decay_factor: multiplier applied by
            `learning_rate_decay_op` (required).
          forward_only: if true, build with feed_previous enabled and create
            no gradient/update ops.
          max_gradient_norm: global norm gradients are clipped to (required).

        Raises:
          TypeError: if one of the required arguments is missing.
        """
        required = (
            ("NUM_HIDDEN_UNITS", NUM_HIDDEN_UNITS),
            ("NUM_HIDDEN_LAYERS", NUM_HIDDEN_LAYERS),
            ("buckets", buckets),
            ("batch_size", batch_size),
            ("learning_rate", learning_rate),
            ("learning_rate_decay_factor", learning_rate_decay_factor),
            ("max_gradient_norm", max_gradient_norm),
        )
        missing = [name for name, value in required if value is None]
        if missing:
            raise TypeError("__init__() missing required argument(s): "
                            + ", ".join(missing))

        self.num_input_tokens = num_input_tokens
        self.num_output_tokens = num_target_tokens
        self.max_input_seq_length = max_input_seq_length
        self.max_output_seq_length = max_target_seq_length
        self.input_word2idx = input_word2idx
        self.output_word2idx = target_word2idx
        self.input_idx2word = input_idx2word
        self.output_idx2word = target_idx2word

        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        # --- Output layer ---------------------------------------------------
        output_projection = None
        softmax_loss_function = None
        # Build the sampled softmax only when the number of samples is
        # smaller than the target vocabulary size.
        if 0 < num_samples < num_target_tokens:
            with tf.device("/cpu:0"):
                # tf.get_variable returns the variable with this name if it
                # already exists and creates it otherwise.
                weight_matrix = tf.get_variable(
                    "proj_w", [NUM_HIDDEN_UNITS, num_target_tokens])
                weight_matrix_T = tf.transpose(weight_matrix)
                bias = tf.get_variable("proj_b", [num_target_tokens])
            # Pair of weight matrix and bias vector. The RNN cell emits
            # vectors of width NUM_HIDDEN_UNITS rather than num_target_tokens,
            # so logits are obtained by multiplying with the weight matrix
            # and adding the bias.
            output_projection = (weight_matrix, bias)

            # Loss used in place of the full softmax.
            def sampled_loss(inputs, labels):
                with tf.device("/cpu:0"):
                    labels = tf.reshape(labels, [-1, 1])
                    # Keyword arguments keep inputs and labels from being
                    # swapped by position.
                    return tf.nn.sampled_softmax_loss(
                        weight_matrix_T, bias,
                        inputs=inputs, labels=labels,
                        num_sampled=num_samples,
                        num_classes=num_target_tokens)

            softmax_loss_function = sampled_loss

        # --- Hidden layers: cell type and depth -----------------------------
        single_cell = rnn_cell.GRUCell(NUM_HIDDEN_UNITS)
        if use_lstm:
            single_cell = rnn_cell.BasicLSTMCell(NUM_HIDDEN_UNITS)
        cell = single_cell
        if NUM_HIDDEN_LAYERS > 1:
            cell = rnn_cell.MultiRNNCell([single_cell] * NUM_HIDDEN_LAYERS)

        # --- Put the pieces of the network together -------------------------
        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            return seq2seq.embedding_attention_seq2seq(
                encoder_inputs, decoder_inputs,
                cell, num_input_tokens,
                num_target_tokens, output_projection=output_projection,
                feed_previous=do_decode)

        # --- Input feeds ----------------------------------------------------
        # Bucketing avoids padding short sentences far more than necessary:
        # input/output lengths are fixed to a few sizes, e.g.
        # [(5, 10), (10, 15), (20, 25), (40, 50)], one bucket per size.
        # tf.placeholder is a slot in the graph whose value does not have to
        # be defined in advance; the batches are fed in at run time.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []

        for i in xrange(buckets[-1][0]):
            self.encoder_inputs.append(
                tf.placeholder(tf.int32, shape=[None],
                               name="encoder{0}".format(i)))

        for i in xrange(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32, shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(tf.float32, shape=[None],
                               name="weight{0}".format(i)))

        # Targets are the decoder inputs shifted by one.
        targets = self.decoder_inputs[1:]

        # --- Outputs and losses ---------------------------------------------
        # Both modes build the same bucketed model; only feed_previous
        # differs.
        do_decode = bool(forward_only)
        self.outputs, self.losses = seq2seq.model_with_buckets(
            self.encoder_inputs, self.decoder_inputs,
            targets, self.target_weights,
            buckets, num_target_tokens,
            lambda x, y: seq2seq_f(encoder_inputs=x, decoder_inputs=y,
                                   do_decode=do_decode),
            softmax_loss_function=softmax_loss_function)

        # With an output projection the model emits NUM_HIDDEN_UNITS-wide
        # vectors; project them to vocabulary-sized logits for decoding.
        if forward_only and output_projection is not None:
            for b in xrange(len(buckets)):
                self.outputs[b] = [
                    tf.matmul(output, output_projection[0]) +
                    output_projection[1]
                    for output in self.outputs[b]
                ]

        # --- Backpropagation: clipped-gradient SGD, one update per bucket ---
        params = tf.trainable_variables()
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            for b in xrange(len(buckets)):
                gradients = tf.gradients(self.losses[b], params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(opt.apply_gradients(
                    zip(clipped_gradients, params),
                    global_step=self.global_step))

        self.saver = tf.train.Saver(tf.all_variables())