Example #1
0
  def testSequenceLoss(self):
    """Checks seq2seq.sequence_loss for each combination of averaging flags.

    Three time steps of constant logits (batch of 2, 5 classes) with unit
    weights are scored against constant targets. The expected numbers are
    1.60944 (averaged over steps and batch), three times that (summed over
    the 3 steps, averaged over the batch) and six times that (summed over
    both the 3 steps and the batch of 2).
    """
    with self.test_session() as sess:
      output_classes = 5
      logits = [tf.constant(i + 0.5, shape=[2, 5]) for i in xrange(3)]
      targets = [tf.constant(i, tf.int32, shape=[2]) for i in xrange(3)]
      weights = [tf.constant(1.0, shape=[2]) for i in xrange(3)]

      # (average_across_timesteps, average_across_batch, expected loss)
      scenarios = [
          (True, True, 1.60944),
          (False, True, 4.828314),
          (False, False, 9.656628),
      ]
      for across_timesteps, across_batch, expected in scenarios:
        loss = seq2seq.sequence_loss(
            logits, targets, weights, output_classes,
            average_across_timesteps=across_timesteps,
            average_across_batch=across_batch)
        res = sess.run(loss)
        self.assertAllClose(res, expected)
Example #2
0
def build_autoencoder(dpg):
    """Attach an autoencoding decoder to `dpg` and build its training op.

    A GRU decoder with an output projection to `FLAGS.vocab_size` is run
    from the last encoder state (`dpg.encoder_states[-1]`) with
    `feed_previous=True`; its outputs are scored with
    `seq2seq.sequence_loss` against per-step label placeholders, using
    all-ones weights.

    Returns:
        (labels, loss, train_op): the list of int32 label placeholders (one
        per time step), the sequence loss tensor, and the Adam update op.
    """
    hidden_dim = dpg.spec.policy_dims[0]
    gru = util.GRUCell(FLAGS.embedding_dim, hidden_dim)
    dec_cell = rnn_cell.OutputProjectionWrapper(gru, FLAGS.vocab_size)

    # Dummy decoder inputs: zeros shaped like the first input token tensor.
    dec_inp = []
    for t in range(dpg.seq_length):
        dec_inp.append(
            tf.zeros_like(dpg.input_tokens[0], name="adec_inp%i" % t))

    dec_out, _ = util.embedding_rnn_decoder(
        dec_inp,
        dpg.encoder_states[-1],
        dec_cell,
        FLAGS.vocab_size,
        feed_previous=True,
        embedding=dpg.embeddings,
        scope="adec")

    # One label placeholder per time step.
    labels = []
    for t in range(dpg.seq_length):
        labels.append(
            tf.placeholder(tf.int32, shape=(None, ), name="labels%i" % t))

    # Every position contributes equally to the loss.
    weights = []
    for labels_t in labels:
        weights.append(tf.ones_like(labels_t, dtype=tf.float32))

    loss = seq2seq.sequence_loss(dec_out, labels, weights, FLAGS.vocab_size)

    # TODO wrt what?  (no var_list is passed to minimize)
    train_op = tf.train.AdamOptimizer(0.01).minimize(loss)

    return labels, loss, train_op
Example #3
0
    def __init__(self, vocab_size, sequence_length, num_units,
        max_gradient_norm, batch_size, learning_rate,
        learning_rate_decay_factor):
        """Build the seq2seq graph, its loss, and the clipped-SGD update op.

        Args:
            vocab_size: width of the one-hot input/output vectors.
            sequence_length: number of encoder steps (the decoder gets one
                extra input placeholder).
            num_units: size passed to the LSTM cell and to the output
                projection matrix.
            max_gradient_norm: global-norm clipping threshold for gradients.
            batch_size: fixed batch dimension of every placeholder.
            learning_rate: initial learning rate (stored in a non-trainable
                variable).
            learning_rate_decay_factor: multiplier applied by
                `self.learning_rate_decay_op`.
        """
        self.vocab_size = vocab_size
        self.sequence_length = sequence_length
        self.batch_size = batch_size

        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        decayed_rate = self.learning_rate * learning_rate_decay_factor
        self.learning_rate_decay_op = self.learning_rate.assign(decayed_rate)
        self.global_step = tf.Variable(0, trainable=False)

        # Output projection from cell outputs to vocabulary logits.
        proj_w = training.utils.gaussian_weights_variable(
            [num_units, self.vocab_size])
        proj_b = tf.Variable(tf.zeros([self.vocab_size]))

        cell = rnn_cell.LSTMCell(num_units, vocab_size)

        step_shape = (batch_size, self.vocab_size)
        weight_shape = (batch_size,)

        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        # Placeholders are created step by step (encoder, decoder, weight).
        for _ in range(sequence_length):
            self.encoder_inputs.append(
                tf.placeholder(tf.float32, shape=step_shape))
            self.decoder_inputs.append(
                tf.placeholder(tf.float32, shape=step_shape))
            self.target_weights.append(
                tf.placeholder(tf.float32, shape=weight_shape))

        # Decoder has one extra cell because it starts with the GO symbol,
        # and the targets are shifted by one.
        # Not sure this is actually useful, as it is always set to 0.
        # As this is inspired by TensorFlow seq2seq models, there might be
        # something dodgy in there.
        self.decoder_inputs.append(
            tf.placeholder(tf.float32, shape=step_shape))
        self.target_weights.append(np.ones(weight_shape))

        # Targets used by the sequence loss must be integer indices, so the
        # (shifted) decoder inputs are converted with an argmax.
        targets = []
        for shifted_input in self.decoder_inputs[1:]:
            targets.append(
                tf.cast(tf.argmax(shifted_input, 1), dtype=tf.int32))

        outputs, self.state = seq2seq.basic_rnn_seq2seq(
            self.encoder_inputs, self.decoder_inputs, cell)

        self.logits = [tf.nn.xw_plus_b(out, proj_w, proj_b) for out in outputs]

        # Only the first `sequence_length` steps enter the loss.
        n_steps = self.sequence_length
        self.loss = seq2seq.sequence_loss(
            self.logits[:n_steps], targets, self.target_weights[:n_steps],
            self.vocab_size)

        # Plain SGD on gradients clipped by their global norm.
        params = tf.trainable_variables()
        sgd = tf.train.GradientDescentOptimizer(self.learning_rate)
        raw_gradients = tf.gradients(self.loss, params)
        clipped, self.gradient_norms = tf.clip_by_global_norm(
            raw_gradients, max_gradient_norm)
        self.updates = sgd.apply_gradients(
            zip(clipped, params), global_step=self.global_step)

        self.saver = tf.train.Saver(tf.all_variables())
    def __load_optimizer(self):
        """Create `self.loss`, `self.optimizer` and `self.train_op`."""
        # Sequence loss of the decoder outputs against the labels.
        self.loss = seq2seq.sequence_loss(
            self.dec_outputs, self.labels, self.weights, self.vocab_size)

        # Momentum optimizer and the op that minimizes the loss.
        self.optimizer = tf.train.MomentumOptimizer(
            self.learning_rate, self.momentum)
        self.train_op = self.optimizer.minimize(self.loss)
    def __load_optimizer(self):
        """Create a loss, optimizer and train op for each decoding direction.

        The forward objects are built under the "forward" variable scope and
        the backward ones under "backward"; both use the same labels,
        weights, learning rate and momentum.
        """
        with tf.variable_scope("forward"):
            # Loss on the forward decoder outputs.
            self.loss_fwd = seq2seq.sequence_loss(
                self.dec_outputs_fwd, self.labels, self.weights,
                self.vocab_size)

            # Momentum optimizer for the forward loss.
            self.optimizer_fwd = tf.train.MomentumOptimizer(
                self.learning_rate, self.momentum)
            self.train_op_fwd = self.optimizer_fwd.minimize(self.loss_fwd)

        with tf.variable_scope("backward"):
            # Loss on the backward decoder outputs.
            self.loss_bwd = seq2seq.sequence_loss(
                self.dec_outputs_bwd, self.labels, self.weights,
                self.vocab_size)

            # Momentum optimizer for the backward loss.
            self.optimizer_bwd = tf.train.MomentumOptimizer(
                self.learning_rate, self.momentum)
            self.train_op_bwd = self.optimizer_bwd.minimize(self.loss_bwd)
    def __load_optimizer(self):
        """Build the forward and backward losses with their momentum train ops.

        Sets `loss_fwd`/`optimizer_fwd`/`train_op_fwd` inside the "forward"
        variable scope and the `_bwd` counterparts inside "backward".
        """
        momentum_args = (self.learning_rate, self.momentum)

        with tf.variable_scope("forward"):
            # loss function
            self.loss_fwd = seq2seq.sequence_loss(
                self.dec_outputs_fwd,
                self.labels,
                self.weights,
                self.vocab_size)

            # optimizer
            self.optimizer_fwd = tf.train.MomentumOptimizer(*momentum_args)
            self.train_op_fwd = self.optimizer_fwd.minimize(self.loss_fwd)

        with tf.variable_scope("backward"):
            # loss function
            self.loss_bwd = seq2seq.sequence_loss(
                self.dec_outputs_bwd,
                self.labels,
                self.weights,
                self.vocab_size)

            # optimizer
            self.optimizer_bwd = tf.train.MomentumOptimizer(*momentum_args)
            self.train_op_bwd = self.optimizer_bwd.minimize(self.loss_bwd)
Example #7
0
def build_autoencoder(dpg):
  """Build the autoencoder decoder graph for `dpg` and its Adam train op.

  The decoder (a GRU wrapped in an output projection to the vocabulary) is
  started from `dpg.encoder_states[-1]` and fed its own previous output.
  Its outputs are compared to label placeholders with uniform weights.

  Returns:
    A tuple `(labels, loss, train_op)`: the per-step int32 label
    placeholders, the sequence loss, and the op minimizing it.
  """
  vocab_size = FLAGS.vocab_size
  hidden_dim = dpg.spec.policy_dims[0]
  dec_cell = rnn_cell.OutputProjectionWrapper(
      util.GRUCell(FLAGS.embedding_dim, hidden_dim), vocab_size)

  # Placeholder decoder inputs: zeros shaped like the first input token.
  dec_inp = [
      tf.zeros_like(dpg.input_tokens[0], name="adec_inp%i" % step)
      for step in range(dpg.seq_length)
  ]
  decoder_result = util.embedding_rnn_decoder(
      dec_inp, dpg.encoder_states[-1], dec_cell, vocab_size,
      feed_previous=True, embedding=dpg.embeddings, scope="adec")
  dec_out, _ = decoder_result

  labels = [
      tf.placeholder(tf.int32, shape=(None,), name="labels%i" % step)
      for step in range(dpg.seq_length)
  ]
  # All label positions are weighted 1.0.
  weights = [tf.ones_like(step_labels, dtype=tf.float32)
             for step_labels in labels]

  loss = seq2seq.sequence_loss(dec_out, labels, weights, vocab_size)

  optimizer = tf.train.AdamOptimizer(0.01)
  # TODO wrt what?  (minimize is called without an explicit var_list)
  train_op = optimizer.minimize(loss)

  return labels, loss, train_op
Example #8
0
# Fetch the existing "embedding" variable from the RNN/EmbeddingWrapper scope
# (reuse=True) and embed every input step with it.
with tf.variable_scope("RNN/EmbeddingWrapper", reuse=True):
    embeddings = tf.get_variable("embedding")
    inp_embedded = []
    for inp_t in inp:
        inp_embedded.append(tf.nn.embedding_lookup(embeddings, inp_t))

cell = rnn_cell.GRUCell(memory_dim)

# Give each encoder output a length-1 middle axis, then concatenate along it.
attn_states = tf.concat(
    1,
    [tf.reshape(e, [-1, 1, cell.output_size]) for e in enc_outputs])

# All-zero decoder inputs, one per step.
dec_inp = [
    tf.zeros((batch_size, cell.input_size), dtype=tf.float32)
    for _ in range(seq_length)
]

dec_outputs, dec_states = seq2seq.attention_decoder(
    dec_inp,
    enc_states[-1],
    attn_states,
    cell,
    output_size=seq_length,
    loop_function=make_loop_function(inp_embedded, cell))
loss = seq2seq.sequence_loss(dec_outputs, labels, weights, seq_length)

# Momentum SGD on the sequence loss.
learning_rate = 0.05
momentum = 0.9
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
train_op = optimizer.minimize(loss)
summary_op = loss  # tf.merge_all_summaries()

sess = tf.InteractiveSession()
sess.run(tf.initialize_all_variables())

def train_batch(batch_size):
    """Generate `batch_size` random sequences and their argsort targets.

    NOTE(review): this definition appears to be cut off after `y` in this
    listing; as shown, `X` and `y` are computed and nothing is returned.
    """
    # Each entry holds `seq_length` values sampled without replacement
    # (presumably distinct ids below `vocab_size` -- confirm it is an int).
    X = [np.random.choice(vocab_size, size=(seq_length,), replace=False)
         for _ in range(batch_size)]
    # Target for each sequence is the index order that would sort it.
    y = [np.argsort(x) for x in X] # [np.arange(seq_length) for _ in X]
    
Example #9
0
cell = rnn_cell.GRUCell(memory_dim)

# Reshape each encoder output to [-1, 1, cell.output_size] and join the
# pieces along axis 1 to form the attention states.
attn_states = tf.concat(1, [tf.reshape(e, [-1, 1, cell.output_size])
                            for e in enc_outputs])

# The decoder is fed all-zero inputs at every step.
dec_inp = [tf.zeros((batch_size, cell.input_size), dtype=tf.float32)
           for _ in range(seq_length)]

dec_outputs, dec_states = seq2seq.attention_decoder(
    dec_inp, enc_states[-1], attn_states, cell,
    output_size=seq_length,
    loop_function=make_loop_function(inp_embedded, cell))
loss = seq2seq.sequence_loss(dec_outputs, labels, weights, seq_length)

# Training setup: momentum SGD minimizing the sequence loss.
learning_rate = 0.05
momentum = 0.9
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
train_op = optimizer.minimize(loss)
# The loss tensor stands in for a merged summary op.
summary_op = loss  # tf.merge_all_summaries()

sess = tf.InteractiveSession()
sess.run(tf.initialize_all_variables())


def train_batch(batch_size):
    X = [
        np.random.choice(vocab_size, size=(seq_length, ), replace=False)
        for _ in range(batch_size)
Example #10
0
    def __init__(self,
                 vocab,
                 tagset,
                 alphabet,
                 word_embedding_size,
                 char_embedding_size,
                 num_chars,
                 num_steps,
                 optimizer_desc,
                 generate_lemmas,
                 l2,
                 dropout_prob_values,
                 experiment_name,
                 supply_form_characters_to_lemma,
                 threads=0,
                 seed=None,
                 write_summaries=True,
                 use_attention=True,
                 scheduled_sampling=None):
        """
        Builds the tagger computation graph and initializes it in a TensorFlow
        session.

        Arguments:

            vocab: Vocabulary of word forms.

            tagset: Vocabulary of possible tags.

            alphabet: Vocabulary of possible characters.

            word_embedding_size (int): Size of the form-based word embedding.
                A falsy value disables the word-level input.

            char_embedding_size (int): Size of character embeddings, i.e. a
                half of the size of the character-based words embeddings.
                A falsy value disables the character-level input.

            num_chars: Maximum length of a word.

            num_steps: Maximum length of a sentence.

            optimizer_desc: Description of the optimizer: a Python expression
                that is appended to 'tf.train.' and evaluated. It must come
                from a trusted source.

            generate_lemmas: Generate lemmas during tagging. The lemma
                decoder reads the character-level tensors, so it presumably
                requires a non-zero char_embedding_size.

            l2: Multiplier of the summed L2 losses of the regularized
                weight matrices added to the cost.

            dropout_prob_values: Stored on the instance; not used while
                building the graph.

            experiment_name: Suffix of the summary log directory name.

            supply_form_characters_to_lemma: If true, the embedded form
                characters are concatenated to the lemma decoder inputs.

            threads: If positive, number of inter-op and intra-op threads
                of the session.

            seed: TensorFlow seed. When not None, it is set as the
                graph-level random seed before any op is created.

            write_summaries: Write summaries using TensorFlow interface.

            use_attention: Use the attention decoder for lemmas instead of
                the plain RNN decoder.

            scheduled_sampling: If truthy, the training decoder picks between
                the ground-truth and the decoded previous character with a
                threshold of k / (k + exp(global_step)), where k is this
                value.
        """

        # The seed has to be set before the first random op is created,
        # otherwise it would not affect the initializers below.
        if seed is not None:
            tf.set_random_seed(seed)

        self.num_steps = num_steps
        self.num_chars = num_chars

        self.word_embedding_size = word_embedding_size
        self.char_embedding_size = char_embedding_size
        # Word embedding plus forward and backward character-LSTM states.
        self.lstm_size = word_embedding_size + 2 * char_embedding_size  ###

        self.vocab = vocab
        self.tagset = tagset
        self.alphabet = alphabet

        self.dropout_prob_values = dropout_prob_values

        self.forward_initial_state = tf.placeholder(
            tf.float32,
            [None, rnn_cell.BasicLSTMCell(self.lstm_size).state_size],
            name="forward_lstm_initial_state")
        self.backward_initial_state = tf.placeholder(
            tf.float32,
            [None, rnn_cell.BasicLSTMCell(self.lstm_size).state_size],
            name="backward_lstm_initial_state")
        self.sentence_lengths = tf.placeholder(tf.int64, [None],
                                               name="sentence_lengths")
        self.tags = tf.placeholder(tf.int32, [None, num_steps],
                                   name="ground_truth_tags")
        # Index 0 is applied to the inputs, index 1 to the bi-LSTM outputs.
        self.dropout_prob = tf.placeholder(tf.float32, [None],
                                           name="dropout_keep_p")
        self.generate_lemmas = generate_lemmas

        global_step = tf.Variable(0, trainable=False)

        input_list = []
        # Weight matrices that enter the L2 penalty.
        regularize = []

        # Word-level embeddings
        if word_embedding_size:
            self.words = tf.placeholder(tf.int32, [None, num_steps],
                                        name='words')
            word_embeddings = tf.Variable(
                tf.random_uniform([len(vocab), word_embedding_size], -1.0,
                                  1.0))
            we_lookup = tf.nn.embedding_lookup(word_embeddings, self.words)

            input_list.append(we_lookup)

        # Character-level embeddings
        if char_embedding_size:
            self.chars = tf.placeholder(tf.int32, [None, num_steps, num_chars],
                                        name='chars')
            self.chars_lengths = tf.placeholder(tf.int64, [None, num_steps],
                                                name='chars_lengths')

            char_embeddings = \
                tf.Variable(tf.random_uniform([len(alphabet), char_embedding_size], -1.0, 1.0))
            ce_lookup = tf.nn.embedding_lookup(char_embeddings, self.chars)

            # Flatten sentences so that every word is one row of the batch.
            reshaped_ce_lookup = tf.reshape(
                ce_lookup, [-1, num_chars, char_embedding_size],
                name="reshape-char_inputs")
            char_inputs = [
                tf.squeeze(input_, [1])
                for input_ in tf.split(1, num_chars, reshaped_ce_lookup)
            ]

            char_inputs_lengths = tf.reshape(self.chars_lengths, [-1])

            with tf.variable_scope('char_forward'):
                char_lstm = rnn_cell.BasicLSTMCell(char_embedding_size)
                _, char_last_state = rnn.rnn(
                    cell=char_lstm,
                    inputs=char_inputs,
                    sequence_length=char_inputs_lengths,
                    dtype=tf.float32)
                # Switch to reuse mode to fetch the already created matrix.
                tf.get_variable_scope().reuse_variables()
                regularize.append(
                    tf.get_variable('RNN/BasicLSTMCell/Linear/Matrix'))

            with tf.variable_scope('char_backward'):
                char_lstm_rev = rnn_cell.BasicLSTMCell(char_embedding_size)
                _, char_last_state_rev = rnn.rnn(
                    cell=char_lstm_rev,
                    inputs=self._reverse_seq(char_inputs, char_inputs_lengths),
                    sequence_length=char_inputs_lengths,
                    dtype=tf.float32)
                tf.get_variable_scope().reuse_variables()
                regularize.append(
                    tf.get_variable('RNN/BasicLSTMCell/Linear/Matrix'))

            # Keep only the second half of each final LSTM state.
            last_char_lstm_state = tf.split(1, 2, char_last_state)[1]
            last_char_lstm_state_rev = tf.split(1, 2, char_last_state_rev)[1]

            last_char_states = \
                tf.reshape(last_char_lstm_state, [-1, num_steps, char_embedding_size],
                           name="reshape-charstates")
            last_char_states_rev = tf.reshape(
                last_char_lstm_state_rev, [-1, num_steps, char_embedding_size],
                name="reshape-charstates_rev")

            char_output = tf.concat(2,
                                    [last_char_states, last_char_states_rev])

            input_list.append(char_output)

        # All inputs correctly sliced
        input_list_dropped = [
            tf.nn.dropout(x, self.dropout_prob[0]) for x in input_list
        ]
        inputs = [
            tf.squeeze(input_, [1]) for input_ in tf.split(
                1, num_steps, tf.concat(2, input_list_dropped))
        ]

        with tf.variable_scope('forward'):
            lstm = rnn_cell.BasicLSTMCell(self.lstm_size)
            outputs, last_state = rnn.rnn(
                cell=lstm,
                inputs=inputs,
                dtype=tf.float32,
                initial_state=self.forward_initial_state,
                sequence_length=self.sentence_lengths)

            tf.get_variable_scope().reuse_variables()
            regularize.append(
                tf.get_variable('RNN/BasicLSTMCell/Linear/Matrix'))

        with tf.variable_scope('backward'):
            lstm_rev = rnn_cell.BasicLSTMCell(self.lstm_size)
            outputs_rev_rev, last_state_rev = rnn.rnn(
                cell=lstm_rev,
                inputs=self._reverse_seq(inputs, self.sentence_lengths),
                dtype=tf.float32,
                initial_state=self.backward_initial_state,
                sequence_length=self.sentence_lengths)

            outputs_rev = self._reverse_seq(outputs_rev_rev,
                                            self.sentence_lengths)

            tf.get_variable_scope().reuse_variables()
            regularize.append(
                tf.get_variable('RNN/BasicLSTMCell/Linear/Matrix'))

        #outputs_forward = tf.reshape(tf.concat(1, outputs), [-1, self.lstm_size],
        #                    name="reshape-outputs_forward")

        #outputs_backward = tf.reshape(tf.concat(1, outputs_rev), [-1, self.lstm_size],
        #                    name="reshape-outputs_backward")

        #forward_w = tf.get_variable("forward_w", [self.lstm_size, self.lstm_size])
        #backward_w = tf.get_variable("backward_w", [self.lstm_size, self.lstm_size])
        #non_linearity_bias = tf.get_variable("non_linearity_b", [self.lstm_size])

        # NOTE(review): outputs_rev has already been passed through
        # _reverse_seq above; confirm that reversing the list once more
        # here is intended.
        outputs_bidi = [
            tf.concat(1, [o1, o2])
            for o1, o2 in zip(outputs, reversed(outputs_rev))
        ]

        #output = tf.tanh(tf.matmul(outputs_forward, forward_w) + tf.matmul(outputs_backward, backward_w) + non_linearity_bias)
        output = tf.reshape(tf.concat(1, outputs_bidi),
                            [-1, 2 * self.lstm_size],
                            name="reshape-outputs_bidi")
        output_dropped = tf.nn.dropout(output, self.dropout_prob[1])

        # We are computing only the logits, not the actual softmax -- while
        # computing the loss, it is done by the sequence_loss_by_example and
        # during the runtime classification, the argmax over logits is enough.

        softmax_w = tf.get_variable(
            "softmax_w", [2 * self.lstm_size, len(tagset)])
        logits_flatten = tf.nn.xw_plus_b(
            output_dropped, softmax_w,
            tf.get_variable("softmax_b", [len(tagset)]))
        #tf.get_variable_scope().reuse_variables()
        regularize.append(softmax_w)

        self.logits = tf.reshape(logits_flatten,
                                 [-1, num_steps, len(tagset)],
                                 name="reshape-logits")
        estimated_tags_flat = tf.to_int32(
            tf.argmax(logits_flatten, dimension=1))
        self.last_state = last_state

        # output mask: compute loss only if it isn't a padded word (i.e. zero index)
        output_mask = tf.reshape(tf.to_float(tf.not_equal(self.tags, 0)), [-1])

        gt_tags_flat = tf.reshape(self.tags, [-1])
        tagging_loss = seq2seq.sequence_loss_by_example(
            logits=[logits_flatten],
            targets=[gt_tags_flat],
            weights=[output_mask])

        # Accuracy over the non-padded positions only.
        tagging_accuracy = \
            tf.reduce_sum(tf.to_float(tf.equal(estimated_tags_flat, gt_tags_flat)) * output_mask) \
                / tf.reduce_sum(output_mask)
        tf.scalar_summary('train_accuracy',
                          tagging_accuracy,
                          collections=["train"])
        tf.scalar_summary('dev_accuracy',
                          tagging_accuracy,
                          collections=["dev"])

        self.cost = tf.reduce_mean(tagging_loss)

        tf.scalar_summary('train_tagging_loss',
                          tf.reduce_mean(tagging_loss),
                          collections=["train"])
        tf.scalar_summary('dev_tagging_loss',
                          tf.reduce_mean(tagging_loss),
                          collections=["dev"])

        if generate_lemmas:
            with tf.variable_scope('decoder'):
                self.lemma_chars = tf.placeholder(
                    tf.int32, [None, num_steps, num_chars + 2],
                    name='lemma_chars')

                lemma_state_size = self.lstm_size

                # NOTE(review): 0.5 is passed positionally as `minval` here,
                # while the embeddings below use the range (-0.5, 0.5) --
                # confirm that a one-sided range is intended.
                lemma_w = tf.Variable(tf.random_uniform(
                    [lemma_state_size, len(alphabet)], 0.5),
                                      name="state_to_char_w")
                lemma_b = tf.Variable(tf.fill([len(alphabet)],
                                              -math.log(len(alphabet))),
                                      name="state_to_char_b")
                # Floor division keeps the shape dimension an integer even
                # under true division; when form characters are supplied,
                # two half-sized embeddings are concatenated.
                lemma_char_embeddings = tf.Variable(tf.random_uniform([
                    len(alphabet), lemma_state_size //
                    (2 if supply_form_characters_to_lemma else 1)
                ], -0.5, 0.5),
                                                    name="char_embeddings")

                lemma_char_inputs = \
                    [tf.squeeze(input_, [1]) for input_ in
                        tf.split(1, num_chars + 2, tf.reshape(self.lemma_chars, [-1, num_chars + 2],
                                                              name="reshape-lemma_char_inputs"))]

                if supply_form_characters_to_lemma:
                    char_inputs_zeros = \
                        [tf.squeeze(chars, [1]) for chars in
                            tf.split(1, num_chars, tf.reshape(self.chars, [-1, num_chars],
                                                              name="reshape-char_inputs_zeros"))]
                    # One extra all-zero step appended after the form
                    # characters.
                    char_inputs_zeros.append(char_inputs_zeros[0] * 0)

                    def loop(prev_state, i):
                        # it takes the previous hidden state, finds the character and formats it
                        # as input for the next time step ... used in the decoder in the "real decoding scenario"
                        out_activation = tf.matmul(prev_state,
                                                   lemma_w) + lemma_b
                        prev_char_index = tf.argmax(out_activation, 1)
                        return tf.concat(1, [
                            tf.nn.embedding_lookup(lemma_char_embeddings,
                                                   prev_char_index),
                            tf.nn.embedding_lookup(lemma_char_embeddings,
                                                   char_inputs_zeros[i])
                        ])

                    embedded_lemma_characters = []
                    for lemma_chars, form_chars in zip(lemma_char_inputs[:-1],
                                                       char_inputs_zeros):
                        embedded_lemma_characters.append(
                            tf.concat(1, [
                                tf.nn.embedding_lookup(lemma_char_embeddings,
                                                       lemma_chars),
                                tf.nn.embedding_lookup(lemma_char_embeddings,
                                                       form_chars)
                            ]))
                else:

                    def loop(prev_state, _):
                        # it takes the previous hidden state, finds the character and formats it
                        # as input for the next time step ... used in the decoder in the "real decoding scenario"
                        out_activation = tf.matmul(prev_state,
                                                   lemma_w) + lemma_b
                        prev_char_index = tf.argmax(out_activation, 1)
                        return tf.nn.embedding_lookup(lemma_char_embeddings,
                                                      prev_char_index)

                    embedded_lemma_characters = []
                    for lemma_chars in lemma_char_inputs[:-1]:
                        embedded_lemma_characters.append(
                            tf.nn.embedding_lookup(lemma_char_embeddings,
                                                   lemma_chars))

                def sampling_loop(prev_state, i):
                    # Element-wise choice between the ground-truth input and
                    # the decoded one; the threshold shrinks as global_step
                    # grows.
                    threshold = scheduled_sampling / (
                        scheduled_sampling + tf.exp(tf.to_float(global_step)))
                    condition = tf.less_equal(
                        tf.random_uniform(
                            tf.shape(embedded_lemma_characters[0])), threshold)
                    return tf.select(condition, embedded_lemma_characters[i],
                                     loop(prev_state, i))

                decoder_cell = rnn_cell.BasicLSTMCell(lemma_state_size)

                # Without scheduled sampling the training decoder always
                # consumes the ground-truth inputs.
                if scheduled_sampling:
                    lf = sampling_loop
                else:
                    lf = None

                if use_attention:
                    lemma_outputs_train, _ = seq2seq.attention_decoder(
                        embedded_lemma_characters,
                        output_dropped,
                        reshaped_ce_lookup,
                        decoder_cell,
                        loop_function=lf)
                else:
                    lemma_outputs_train, _ = seq2seq.rnn_decoder(
                        embedded_lemma_characters,
                        output_dropped,
                        decoder_cell,
                        loop_function=lf)

                # The runtime decoder below shares the variables of the
                # training decoder.
                tf.get_variable_scope().reuse_variables()
                #regularize.append(tf.get_variable('attention_decoder/BasicLSTMCell/Linear/Matrix'))

                if use_attention:
                    lemma_outputs_runtime, _ = \
                        seq2seq.attention_decoder(embedded_lemma_characters, output_dropped, reshaped_ce_lookup, decoder_cell,
                            loop_function=loop)
                else:
                    lemma_outputs_runtime, _ = \
                        seq2seq.rnn_decoder(embedded_lemma_characters, output_dropped, decoder_cell,
                            loop_function=loop)

                lemma_char_logits_train = \
                    [tf.matmul(o, lemma_w) + lemma_b for o in lemma_outputs_train]

                lemma_char_logits_runtime = \
                    [tf.matmul(o, lemma_w) + lemma_b for o in lemma_outputs_runtime]

                self.lemmas_decoded = \
                    tf.reshape(tf.transpose(tf.argmax(tf.pack(lemma_char_logits_runtime), 2)), [-1, num_steps, num_chars + 1])

                # Targets are the lemma characters shifted by one step;
                # zero-index (padding) characters get zero weight.
                lemma_char_weights = []
                for lemma_chars in lemma_char_inputs[1:]:
                    lemma_char_weights.append(
                        tf.to_float(tf.not_equal(lemma_chars, 0)))

                lemmatizer_loss = seq2seq.sequence_loss(
                    lemma_char_logits_train, lemma_char_inputs[1:],
                    lemma_char_weights)

                lemmatizer_loss_runtime = \
                        seq2seq.sequence_loss(lemma_char_logits_runtime, lemma_char_inputs[1:],
                                              lemma_char_weights)

                tf.scalar_summary('train_lemma_loss_with_gt_inputs',
                                  tf.reduce_mean(lemmatizer_loss),
                                  collections=["train"])
                tf.scalar_summary('dev_lemma_loss_with_gt_inputs',
                                  tf.reduce_mean(lemmatizer_loss),
                                  collections=["dev"])

                tf.scalar_summary('train_lemma_loss_with_decoded_inputs',
                                  tf.reduce_mean(lemmatizer_loss_runtime),
                                  collections=["train"])
                tf.scalar_summary('dev_lemma_loss_with_decoded_inputs',
                                  tf.reduce_mean(lemmatizer_loss_runtime),
                                  collections=["dev"])

                # Both the teacher-forced and the decoded-input losses are
                # optimized.
                self.cost += tf.reduce_mean(lemmatizer_loss) + tf.reduce_mean(
                    lemmatizer_loss_runtime)

        self.cost += l2 * sum(
            [tf.nn.l2_loss(variable) for variable in regularize])

        tf.scalar_summary('train_optimization_cost',
                          self.cost,
                          collections=["train"])
        tf.scalar_summary('dev_optimization_cost',
                          self.cost,
                          collections=["dev"])

        # Not called directly in this method; presumably available so that
        # the evaluated optimizer_desc expression can refer to it.
        def decay(learning_rate, exponent, iteration_steps):
            return tf.train.exponential_decay(learning_rate,
                                              global_step,
                                              iteration_steps,
                                              exponent,
                                              staircase=True)

        # NOTE(security): optimizer_desc is executed as Python code. It must
        # never be taken from untrusted input.
        optimizer = eval('tf.train.' + optimizer_desc)
        self.train = optimizer.minimize(self.cost, global_step=global_step)

        if threads > 0:
            self.session = tf.Session(
                config=tf.ConfigProto(inter_op_parallelism_threads=threads,
                                      intra_op_parallelism_threads=threads))
        else:
            self.session = tf.Session()
        self.session.run(tf.initialize_all_variables())

        if write_summaries:
            self.summary_train = tf.merge_summary(tf.get_collection("train"))
            self.summary_dev = tf.merge_summary(tf.get_collection("dev"))
            timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
            self.summary_writer = tf.train.SummaryWriter("logs/" + timestamp +
                                                         "_" + experiment_name)

        self.steps = 0
Example #11
0
    def __init__(
        self,
        vocab,
        tagset,
        alphabet,
        word_embedding_size,
        char_embedding_size,
        num_chars,
        num_steps,
        optimizer_desc,
        generate_lemmas,
        l2,
        dropout_prob_values,
        experiment_name,
        supply_form_characters_to_lemma,
        threads=0,
        seed=None,
        write_summaries=True,
        use_attention=True,
        scheduled_sampling=None,
    ):
        """
        Builds the tagger computation graph and initializes it in a TensorFlow
        session.

        The graph consists of optional word-level and character-level
        embeddings, a bidirectional sentence-level LSTM, a linear layer
        producing tag logits and, optionally, a character-level lemma decoder.
        The resulting cost, training op, session and (optionally) summary
        writer are stored as attributes of the instance.

        Arguments:

            vocab: Vocabulary of word forms. Only its length is used here
                (number of rows of the word embedding matrix).

            tagset: Vocabulary of possible tags. Its length is the number of
                output classes.

            alphabet: Vocabulary of possible characters. Its length is the
                number of rows of the character embedding matrices.

            word_embedding_size (int): Size of the form-based word embedding.
                A falsy value skips the word-level branch.

            char_embedding_size (int): Size of character embeddings, i.e. a
                half of the size of the character-based words embeddings.
                A falsy value skips the character-level branch; note that
                the lemma decoder refers to tensors created in that branch.

            num_chars: Maximum length of a word.

            num_steps: Maximum length of a sentence.

            optimizer_desc: Description of the optimizer: a string that is
                appended to "tf.train." and passed to eval(). The expression
                is evaluated inside this method, so it may refer to the
                local names `global_step` and `decay`.

            generate_lemmas: Generate lemmas during tagging. If truthy, the
                lemma decoder is built and its losses are added to the cost.

            l2: Weight of the L2 penalty computed over the collected LSTM
                matrices and the softmax matrix.

            dropout_prob_values: Stored on the instance; not used while
                building the graph.

            experiment_name: Suffix of the summary log directory name.

            supply_form_characters_to_lemma: If truthy, every decoder input
                is the concatenation of a lemma character embedding and the
                embedding of the form character at the same position.

            threads: If positive, used for both the inter-op and the
                intra-op parallelism of the session.

            seed: TensorFlow seed. NOTE(review): this argument is not
                referenced anywhere in the body of this method.

            write_summaries: Write summaries using TensorFlow interface.

            use_attention: Use seq2seq.attention_decoder for the lemma
                decoder instead of seq2seq.rnn_decoder.

            scheduled_sampling: If truthy, the training decoder mixes
                ground-truth and previously decoded characters; the value is
                the constant in the sampling threshold (see sampling_loop).
        """

        self.num_steps = num_steps
        self.num_chars = num_chars

        self.word_embedding_size = word_embedding_size
        self.char_embedding_size = char_embedding_size
        # The sentence-level LSTM input is the word embedding concatenated with
        # the final states of the forward and backward character LSTMs.
        self.lstm_size = word_embedding_size + 2 * char_embedding_size  ###

        self.vocab = vocab
        self.tagset = tagset
        self.alphabet = alphabet

        self.dropout_prob_values = dropout_prob_values

        # Initial states of the sentence-level LSTMs are fed from outside.
        self.forward_initial_state = tf.placeholder(
            tf.float32, [None, rnn_cell.BasicLSTMCell(self.lstm_size).state_size], name="forward_lstm_initial_state"
        )
        self.backward_initial_state = tf.placeholder(
            tf.float32, [None, rnn_cell.BasicLSTMCell(self.lstm_size).state_size], name="backward_lstm_initial_state"
        )
        self.sentence_lengths = tf.placeholder(tf.int64, [None], name="sentence_lengths")
        self.tags = tf.placeholder(tf.int32, [None, num_steps], name="ground_truth_tags")
        # Vector of keep probabilities: index 0 is applied to the inputs,
        # index 1 to the output of the bidirectional LSTM.
        self.dropout_prob = tf.placeholder(tf.float32, [None], name="dropout_keep_p")
        self.generate_lemmas = generate_lemmas

        global_step = tf.Variable(0, trainable=False)

        # Per-word input features to be concatenated along the last axis.
        input_list = []
        # Variables included in the L2 penalty.
        regularize = []

        # Word-level embeddings
        if word_embedding_size:
            self.words = tf.placeholder(tf.int32, [None, num_steps], name="words")
            word_embeddings = tf.Variable(tf.random_uniform([len(vocab), word_embedding_size], -1.0, 1.0))
            we_lookup = tf.nn.embedding_lookup(word_embeddings, self.words)

            input_list.append(we_lookup)

        # Character-level embeddings
        if char_embedding_size:
            self.chars = tf.placeholder(tf.int32, [None, num_steps, num_chars], name="chars")
            self.chars_lengths = tf.placeholder(tf.int64, [None, num_steps], name="chars_lengths")

            char_embeddings = tf.Variable(tf.random_uniform([len(alphabet), char_embedding_size], -1.0, 1.0))
            ce_lookup = tf.nn.embedding_lookup(char_embeddings, self.chars)

            # Merge the batch and sentence axes so that every word becomes one
            # sequence of characters, then split it into per-character inputs.
            reshaped_ce_lookup = tf.reshape(ce_lookup, [-1, num_chars, char_embedding_size], name="reshape-char_inputs")
            char_inputs = [tf.squeeze(input_, [1]) for input_ in tf.split(1, num_chars, reshaped_ce_lookup)]

            char_inputs_lengths = tf.reshape(self.chars_lengths, [-1])

            with tf.variable_scope("char_forward"):
                char_lstm = rnn_cell.BasicLSTMCell(char_embedding_size)
                _, char_last_state = rnn.rnn(
                    cell=char_lstm, inputs=char_inputs, sequence_length=char_inputs_lengths, dtype=tf.float32
                )
                # Switch the scope to reuse mode to fetch the already created
                # LSTM matrix for regularization.
                tf.get_variable_scope().reuse_variables()
                regularize.append(tf.get_variable("RNN/BasicLSTMCell/Linear/Matrix"))

            with tf.variable_scope("char_backward"):
                char_lstm_rev = rnn_cell.BasicLSTMCell(char_embedding_size)
                _, char_last_state_rev = rnn.rnn(
                    cell=char_lstm_rev,
                    inputs=self._reverse_seq(char_inputs, char_inputs_lengths),
                    sequence_length=char_inputs_lengths,
                    dtype=tf.float32,
                )
                tf.get_variable_scope().reuse_variables()
                regularize.append(tf.get_variable("RNN/BasicLSTMCell/Linear/Matrix"))

            # Keep only the second half of each final state -- presumably the
            # hidden (h) part of the concatenated LSTM state; TODO confirm
            # against the BasicLSTMCell state layout.
            last_char_lstm_state = tf.split(1, 2, char_last_state)[1]
            last_char_lstm_state_rev = tf.split(1, 2, char_last_state_rev)[1]

            # Restore the separate batch and sentence axes.
            last_char_states = tf.reshape(
                last_char_lstm_state, [-1, num_steps, char_embedding_size], name="reshape-charstates"
            )
            last_char_states_rev = tf.reshape(
                last_char_lstm_state_rev, [-1, num_steps, char_embedding_size], name="reshape-charstates_rev"
            )

            char_output = tf.concat(2, [last_char_states, last_char_states_rev])

            input_list.append(char_output)

        # All inputs correctly sliced: apply input dropout, concatenate the
        # features and split them into one tensor per sentence position.
        input_list_dropped = [tf.nn.dropout(x, self.dropout_prob[0]) for x in input_list]
        inputs = [tf.squeeze(input_, [1]) for input_ in tf.split(1, num_steps, tf.concat(2, input_list_dropped))]

        with tf.variable_scope("forward"):
            lstm = rnn_cell.BasicLSTMCell(self.lstm_size)
            outputs, last_state = rnn.rnn(
                cell=lstm,
                inputs=inputs,
                dtype=tf.float32,
                initial_state=self.forward_initial_state,
                sequence_length=self.sentence_lengths,
            )

            tf.get_variable_scope().reuse_variables()
            regularize.append(tf.get_variable("RNN/BasicLSTMCell/Linear/Matrix"))

        with tf.variable_scope("backward"):
            lstm_rev = rnn_cell.BasicLSTMCell(self.lstm_size)
            outputs_rev_rev, last_state_rev = rnn.rnn(
                cell=lstm_rev,
                inputs=self._reverse_seq(inputs, self.sentence_lengths),
                dtype=tf.float32,
                initial_state=self.backward_initial_state,
                sequence_length=self.sentence_lengths,
            )

            outputs_rev = self._reverse_seq(outputs_rev_rev, self.sentence_lengths)

            tf.get_variable_scope().reuse_variables()
            regularize.append(tf.get_variable("RNN/BasicLSTMCell/Linear/Matrix"))

        # outputs_forward = tf.reshape(tf.concat(1, outputs), [-1, self.lstm_size],
        #                    name="reshape-outputs_forward")

        # outputs_backward = tf.reshape(tf.concat(1, outputs_rev), [-1, self.lstm_size],
        #                    name="reshape-outputs_backward")

        # forward_w = tf.get_variable("forward_w", [self.lstm_size, self.lstm_size])
        # backward_w = tf.get_variable("backward_w", [self.lstm_size, self.lstm_size])
        # non_linearity_bias = tf.get_variable("non_linearity_b", [self.lstm_size])

        # NOTE(review): outputs_rev has already been passed through
        # _reverse_seq above and the list is reversed once more here --
        # verify that forward and backward outputs are paired at the same
        # sentence position.
        outputs_bidi = [tf.concat(1, [o1, o2]) for o1, o2 in zip(outputs, reversed(outputs_rev))]

        # output = tf.tanh(tf.matmul(outputs_forward, forward_w) + tf.matmul(outputs_backward, backward_w) + non_linearity_bias)
        output = tf.reshape(tf.concat(1, outputs_bidi), [-1, 2 * self.lstm_size], name="reshape-outputs_bidi")
        output_dropped = tf.nn.dropout(output, self.dropout_prob[1])

        # We are computing only the logits, not the actual softmax -- while
        # computing the loss, it is done by the sequence_loss_by_example and
        # during the runtime classification, the argmax over logits is enough.

        softmax_w = tf.get_variable("softmax_w", [2 * self.lstm_size, len(tagset)])
        logits_flatten = tf.nn.xw_plus_b(output_dropped, softmax_w, tf.get_variable("softmax_b", [len(tagset)]))
        # tf.get_variable_scope().reuse_variables()
        regularize.append(softmax_w)

        self.logits = tf.reshape(logits_flatten, [-1, num_steps, len(tagset)], name="reshape-logits")
        estimated_tags_flat = tf.to_int32(tf.argmax(logits_flatten, dimension=1))
        self.last_state = last_state

        # Output mask: compute the loss only if it isn't a padded word (i.e. zero index)
        output_mask = tf.reshape(tf.to_float(tf.not_equal(self.tags, 0)), [-1])

        gt_tags_flat = tf.reshape(self.tags, [-1])
        tagging_loss = seq2seq.sequence_loss_by_example(
            logits=[logits_flatten], targets=[gt_tags_flat], weights=[output_mask]
        )

        # Accuracy over the non-padded positions only.
        tagging_accuracy = tf.reduce_sum(
            tf.to_float(tf.equal(estimated_tags_flat, gt_tags_flat)) * output_mask
        ) / tf.reduce_sum(output_mask)
        # Every summary is registered twice, in the "train" and in the "dev"
        # collection, so that the two sets can be merged separately below.
        tf.scalar_summary("train_accuracy", tagging_accuracy, collections=["train"])
        tf.scalar_summary("dev_accuracy", tagging_accuracy, collections=["dev"])

        self.cost = tf.reduce_mean(tagging_loss)

        tf.scalar_summary("train_tagging_loss", tf.reduce_mean(tagging_loss), collections=["train"])
        tf.scalar_summary("dev_tagging_loss", tf.reduce_mean(tagging_loss), collections=["dev"])

        if generate_lemmas:
            with tf.variable_scope("decoder"):
                # Lemma characters; two positions longer than a word form.
                # The first num_chars + 1 positions are fed to the decoder and
                # the last num_chars + 1 positions serve as its targets.
                self.lemma_chars = tf.placeholder(tf.int32, [None, num_steps, num_chars + 2], name="lemma_chars")

                lemma_state_size = self.lstm_size

                # Projection from the decoder output to character logits.
                lemma_w = tf.Variable(tf.random_uniform([lemma_state_size, len(alphabet)], 0.5), name="state_to_char_w")
                lemma_b = tf.Variable(tf.fill([len(alphabet)], -math.log(len(alphabet))), name="state_to_char_b")
                # When form characters are supplied as well, two embeddings are
                # concatenated, so each of them gets half of the size.
                # NOTE(review): "/" yields a float under Python 3 -- confirm
                # the interpreter version this file targets.
                lemma_char_embeddings = tf.Variable(
                    tf.random_uniform(
                        [len(alphabet), lemma_state_size / (2 if supply_form_characters_to_lemma else 1)], -0.5, 0.5
                    ),
                    name="char_embeddings",
                )

                # One tensor of character indices per lemma position, with
                # the batch and sentence axes merged.
                lemma_char_inputs = [
                    tf.squeeze(input_, [1])
                    for input_ in tf.split(
                        1,
                        num_chars + 2,
                        tf.reshape(self.lemma_chars, [-1, num_chars + 2], name="reshape-lemma_char_inputs"),
                    )
                ]

                if supply_form_characters_to_lemma:
                    # Form characters per position, extended by one all-zero
                    # position to match the num_chars + 1 decoder inputs.
                    char_inputs_zeros = [
                        tf.squeeze(chars, [1])
                        for chars in tf.split(
                            1, num_chars, tf.reshape(self.chars, [-1, num_chars], name="reshape-char_inputs_zeros")
                        )
                    ]
                    char_inputs_zeros.append(char_inputs_zeros[0] * 0)

                    def loop(prev_state, i):
                        # it takes the previous hidden state, finds the character and formats it
                        # as input for the next time step ... used in the decoder in the "real decoding scenario"
                        out_activation = tf.matmul(prev_state, lemma_w) + lemma_b
                        prev_char_index = tf.argmax(out_activation, 1)
                        return tf.concat(
                            1,
                            [
                                tf.nn.embedding_lookup(lemma_char_embeddings, prev_char_index),
                                tf.nn.embedding_lookup(lemma_char_embeddings, char_inputs_zeros[i]),
                            ],
                        )

                    # Ground-truth decoder inputs: lemma character embedding
                    # concatenated with the form character embedding.
                    embedded_lemma_characters = []
                    for lemma_chars, form_chars in zip(lemma_char_inputs[:-1], char_inputs_zeros):
                        embedded_lemma_characters.append(
                            tf.concat(
                                1,
                                [
                                    tf.nn.embedding_lookup(lemma_char_embeddings, lemma_chars),
                                    tf.nn.embedding_lookup(lemma_char_embeddings, form_chars),
                                ],
                            )
                        )
                else:

                    def loop(prev_state, _):
                        # it takes the previous hidden state, finds the character and formats it
                        # as input for the next time step ... used in the decoder in the "real decoding scenario"
                        out_activation = tf.matmul(prev_state, lemma_w) + lemma_b
                        prev_char_index = tf.argmax(out_activation, 1)
                        return tf.nn.embedding_lookup(lemma_char_embeddings, prev_char_index)

                    # Ground-truth decoder inputs: lemma character embeddings.
                    embedded_lemma_characters = []
                    for lemma_chars in lemma_char_inputs[:-1]:
                        embedded_lemma_characters.append(tf.nn.embedding_lookup(lemma_char_embeddings, lemma_chars))

                def sampling_loop(prev_state, i):
                    # Scheduled sampling: element-wise, take the ground-truth
                    # input where a uniform random number is at most
                    # k / (k + exp(global_step)), with k = scheduled_sampling,
                    # and the input produced by loop() elsewhere.
                    threshold = scheduled_sampling / (scheduled_sampling + tf.exp(tf.to_float(global_step)))
                    condition = tf.less_equal(tf.random_uniform(tf.shape(embedded_lemma_characters[0])), threshold)
                    return tf.select(condition, embedded_lemma_characters[i], loop(prev_state, i))

                decoder_cell = rnn_cell.BasicLSTMCell(lemma_state_size)

                # Training decoder: ground-truth inputs unless scheduled
                # sampling is requested.
                if scheduled_sampling:
                    lf = sampling_loop
                else:
                    lf = None

                if use_attention:
                    lemma_outputs_train, _ = seq2seq.attention_decoder(
                        embedded_lemma_characters, output_dropped, reshaped_ce_lookup, decoder_cell, loop_function=lf
                    )
                else:
                    lemma_outputs_train, _ = seq2seq.rnn_decoder(
                        embedded_lemma_characters, output_dropped, decoder_cell, loop_function=lf
                    )

                # The runtime decoder below shares the variables of the
                # training decoder.
                tf.get_variable_scope().reuse_variables()
                # regularize.append(tf.get_variable('attention_decoder/BasicLSTMCell/Linear/Matrix'))

                tf.get_variable_scope().reuse_variables()

                # Runtime decoder: always feeds back its own previous output.
                if use_attention:
                    lemma_outputs_runtime, _ = seq2seq.attention_decoder(
                        embedded_lemma_characters, output_dropped, reshaped_ce_lookup, decoder_cell, loop_function=loop
                    )
                else:
                    lemma_outputs_runtime, _ = seq2seq.rnn_decoder(
                        embedded_lemma_characters, output_dropped, decoder_cell, loop_function=loop
                    )

                lemma_char_logits_train = [tf.matmul(o, lemma_w) + lemma_b for o in lemma_outputs_train]

                lemma_char_logits_runtime = [tf.matmul(o, lemma_w) + lemma_b for o in lemma_outputs_runtime]

                # Indices of the most probable characters, reshaped back to
                # one row of num_chars + 1 characters per sentence position.
                self.lemmas_decoded = tf.reshape(
                    tf.transpose(tf.argmax(tf.pack(lemma_char_logits_runtime), 2)), [-1, num_steps, num_chars + 1]
                )

                # Zero weight for target positions holding the zero index.
                lemma_char_weights = []
                for lemma_chars in lemma_char_inputs[1:]:
                    lemma_char_weights.append(tf.to_float(tf.not_equal(lemma_chars, 0)))

                lemmatizer_loss = seq2seq.sequence_loss(
                    lemma_char_logits_train, lemma_char_inputs[1:], lemma_char_weights
                )

                lemmatizer_loss_runtime = seq2seq.sequence_loss(
                    lemma_char_logits_runtime, lemma_char_inputs[1:], lemma_char_weights
                )

                tf.scalar_summary(
                    "train_lemma_loss_with_gt_inputs", tf.reduce_mean(lemmatizer_loss), collections=["train"]
                )
                tf.scalar_summary("dev_lemma_loss_with_gt_inputs", tf.reduce_mean(lemmatizer_loss), collections=["dev"])

                tf.scalar_summary(
                    "train_lemma_loss_with_decoded_inputs",
                    tf.reduce_mean(lemmatizer_loss_runtime),
                    collections=["train"],
                )
                tf.scalar_summary(
                    "dev_lemma_loss_with_decoded_inputs", tf.reduce_mean(lemmatizer_loss_runtime), collections=["dev"]
                )

                # Both decoder losses contribute to the optimized cost.
                self.cost += tf.reduce_mean(lemmatizer_loss) + tf.reduce_mean(lemmatizer_loss_runtime)

        # L2 penalty over the collected weight matrices.
        self.cost += l2 * sum([tf.nn.l2_loss(variable) for variable in regularize])

        tf.scalar_summary("train_optimization_cost", self.cost, collections=["train"])
        tf.scalar_summary("dev_optimization_cost", self.cost, collections=["dev"])

        # Not called directly in this method; it is in scope for the
        # expression evaluated from optimizer_desc below.
        def decay(learning_rate, exponent, iteration_steps):
            return tf.train.exponential_decay(learning_rate, global_step, iteration_steps, exponent, staircase=True)

        # NOTE(review): eval() executes arbitrary code -- optimizer_desc must
        # never come from an untrusted source.
        optimizer = eval("tf.train." + optimizer_desc)
        self.train = optimizer.minimize(self.cost, global_step=global_step)

        if threads > 0:
            self.session = tf.Session(
                config=tf.ConfigProto(inter_op_parallelism_threads=threads, intra_op_parallelism_threads=threads)
            )
        else:
            self.session = tf.Session()
        self.session.run(tf.initialize_all_variables())

        if write_summaries:
            self.summary_train = tf.merge_summary(tf.get_collection("train"))
            self.summary_dev = tf.merge_summary(tf.get_collection("dev"))
            # Log directory: logs/<timestamp>_<experiment_name>
            timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
            self.summary_writer = tf.train.SummaryWriter("logs/" + timestamp + "_" + experiment_name)

        # Counter initialized to zero; it is not updated in this method.
        self.steps = 0
Example #12
0
    def _init_neural_network(self):
        """Initializing the NN (building a TensorFlow graph and initializing session).

        Reads the network configuration from the instance (`max_da_len`,
        `max_tree_len`, `emb_size`, `cell_type`, `nn_type`, `scope_name`,
        `da_dict_size`, `tree_dict_size`, `dropout_keep_prob`, `use_dec_cost`,
        `optimizer_type`, `max_cores`) and stores the resulting placeholders,
        cell, outputs, costs, optimizer, training op, session and saver back
        on the instance. Variables are not initialized here.
        """

        # set TensorFlow random seed
        tf.set_random_seed(rnd.randint(-sys.maxint, sys.maxint))

        # create placeholders for input & output (always batch-size * 1, list of up to num. steps)
        self.enc_inputs = []
        self.enc_inputs_drop = []
        for i in xrange(self.max_da_len):
            enc_input = tf.placeholder(tf.int32, [None],
                                       name=('enc_inp-%d' % i))
            self.enc_inputs.append(enc_input)
            # the dropped-out copies are created only if dropout is switched on
            if self.dropout_keep_prob < 1:
                enc_input_drop = tf.nn.dropout(enc_input,
                                               self.dropout_keep_prob,
                                               name=('enc_inp-drop-%d' % i))
                self.enc_inputs_drop.append(enc_input_drop)

        self.dec_inputs = []
        for i in xrange(self.max_tree_len):
            self.dec_inputs.append(
                tf.placeholder(tf.int32, [None], name=('dec_inp-%d' % i)))

        # targets are just decoder inputs shifted by one (+pad with one empty spot)
        self.targets = [
            self.dec_inputs[i + 1] for i in xrange(len(self.dec_inputs) - 1)
        ]
        self.targets.append(
            tf.placeholder(tf.int32, [None], name=('target-pad')))

        # prepare cells: GRU if the type starts with 'gru', LSTM otherwise;
        # a '/2' suffix stacks two layers of the same cell
        self.initial_state = tf.placeholder(tf.float32, [None, self.emb_size])
        if self.cell_type.startswith('gru'):
            self.cell = rnn_cell.GRUCell(self.emb_size)
        else:
            self.cell = rnn_cell.BasicLSTMCell(self.emb_size)

        if self.cell_type.endswith('/2'):
            self.cell = rnn_cell.MultiRNNCell([self.cell] * 2)

        # build the actual LSTM Seq2Seq network (for training and decoding)
        with tf.variable_scope(self.scope_name) as scope:

            # plain embedding seq2seq is the default network type
            rnn_func = embedding_rnn_seq2seq
            if self.nn_type == 'emb_attention_seq2seq':
                rnn_func = embedding_attention_seq2seq
            elif self.nn_type == 'emb_attention2_seq2seq':
                rnn_func = partial(embedding_attention_seq2seq, num_heads=2)
            elif self.nn_type == 'emb_attention_seq2seq_context':
                rnn_func = embedding_attention_seq2seq_context
            elif self.nn_type == 'emb_attention2_seq2seq_context':
                rnn_func = partial(embedding_attention_seq2seq_context,
                                   num_heads=2)

            # for training: feed_previous == False, using dropout if available
            # outputs = batch_size * num_decoder_symbols ~ i.e. output logits at each steps
            # states = cell states at each steps
            self.outputs, self.states = rnn_func(
                self.enc_inputs_drop
                if self.enc_inputs_drop else self.enc_inputs,
                self.dec_inputs,
                self.cell,
                self.da_dict_size,
                self.tree_dict_size,
                scope=scope)

            # the decoding network shares all variables with the training one
            scope.reuse_variables()

            # for decoding: feed_previous == True
            self.dec_outputs, self.dec_states = rnn_func(self.enc_inputs,
                                                         self.dec_inputs,
                                                         self.cell,
                                                         self.da_dict_size,
                                                         self.tree_dict_size,
                                                         feed_previous=True,
                                                         scope=scope)

        # TODO use output projection ???

        # target weights
        # TODO change to actual weights, zero after the end of tree ???
        self.cost_weights = [
            tf.ones_like(trg, tf.float32, name='cost_weights')
            for trg in self.targets
        ]

        # cost: either the training (teacher-forced) cost alone, or its
        # average with the cost of the decoding network
        self.tf_cost = sequence_loss(self.outputs, self.targets,
                                     self.cost_weights, self.tree_dict_size)
        self.dec_cost = sequence_loss(self.dec_outputs, self.targets,
                                      self.cost_weights, self.tree_dict_size)
        if self.use_dec_cost:
            self.cost = 0.5 * (self.tf_cost + self.dec_cost)
        else:
            self.cost = self.tf_cost

        self.learning_rate = tf.placeholder(tf.float32, name="learning_rate")

        # optimizer (default to Adam)
        # The three cases must form a single if/elif/else chain: with two
        # independent `if` statements, 'sgd' would fall into the `else` of
        # the 'adagrad' check and be silently replaced by Adam.
        if self.optimizer_type == 'sgd':
            self.optimizer = tf.train.GradientDescentOptimizer(
                self.learning_rate)
        elif self.optimizer_type == 'adagrad':
            self.optimizer = tf.train.AdagradOptimizer(self.learning_rate)
        else:
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.train_func = self.optimizer.minimize(self.cost)

        # initialize session (limiting the parallelism if max_cores is set)
        session_config = None
        if self.max_cores:
            session_config = tf.ConfigProto(
                inter_op_parallelism_threads=self.max_cores,
                intra_op_parallelism_threads=self.max_cores)
        self.session = tf.Session(config=session_config)

        # this helps us load/save the model
        self.saver = tf.train.Saver(tf.all_variables())
Example #13
0
File: model.py Project: lacker/ai
  def __init__(self):
    """Build the seq2seq graph, its training op, and start a session.

    Creates the stacked LSTM cell, the encoder/label placeholders, the
    embedding seq2seq model with its sequence loss and the training op,
    then opens a session and initializes all variables.
    """
    # Hyperparameters
    self.num_layers = 3
    self.layer_size = 256

    # Recurrent core: the same LSTM cell repeated num_layers times
    base_cell = rnn_cell.BasicLSTMCell(self.layer_size)
    self.cell = rnn_cell.MultiRNNCell([base_cell] * self.num_layers)

    # Placeholders are declared with shape [None], i.e. the batch size
    # is left unspecified.

    # The input problem, one placeholder per source position
    self.encoder_inputs = []
    for position in range(SOURCE_LEN):
      self.encoder_inputs.append(
          tf.placeholder(tf.int32,
                         shape=[None],
                         name='encoder{0}'.format(position)))

    # The correct answers, one placeholder per target position
    self.labels = []
    for position in range(TARGET_LEN):
      self.labels.append(
          tf.placeholder(tf.int32,
                         shape=[None],
                         name='labels{0}'.format(position)))

    # All target positions count equally, hence all-ones weights
    self.weights = []
    for label in self.labels:
      self.weights.append(tf.ones_like(label, dtype=tf.float32))

    # The decoder is fed the correct output of the previous timestep;
    # its very first input is an all-zero "go" token.
    go_token = tf.zeros_like(self.labels[0], dtype=np.int32, name="GO")
    self.decoder_inputs = [go_token] + self.labels[:-1]

    # The model itself. The same graph serves training and testing,
    # so feed_previous stays off.
    num_source_symbols = len(SOURCE_VOCAB)
    num_target_symbols = len(TARGET_VOCAB)
    self.outputs, self.states = seq2seq.embedding_rnn_seq2seq(
      self.encoder_inputs,
      self.decoder_inputs,
      self.cell,
      num_source_symbols,
      num_target_symbols,
      feed_previous=False)

    self.loss = seq2seq.sequence_loss(self.outputs, self.labels, self.weights)

    # Training ops. The momentum variant is hard-wired; the Adam variant
    # is kept as the alternative branch.
    use_momentum = True
    if use_momentum:
      step_size = 0.05
      momentum_coefficient = 0.9
      self.optimizer = tf.train.MomentumOptimizer(step_size,
                                                  momentum_coefficient)
      self.train_op = self.optimizer.minimize(self.loss)
    else:
      # Assumes batch size of 100
      self.cost = tf.reduce_sum(self.loss) / TARGET_LEN / 100
      self.lr = tf.Variable(0.0, trainable=False)
      trainable = tf.trainable_variables()
      # Clip gradients at 5.0
      raw_grads = tf.gradients(self.cost, trainable)
      clipped_grads, _ = tf.clip_by_global_norm(raw_grads, 5.0)
      adam = tf.train.AdamOptimizer(self.lr)
      self.train_op = adam.apply_gradients(zip(clipped_grads, trainable))

    self.sess = tf.Session()
    self.sess.run(tf.initialize_all_variables())
Example #14
0
    def __init__(self, vocab_size, sequence_length, num_units,
                 max_gradient_norm, batch_size, learning_rate,
                 learning_rate_decay_factor):
        """Build the seq2seq training graph.

        Args:
            vocab_size: width of every encoder/decoder input row and number
                of output classes.
            sequence_length: number of encoder steps; the decoder gets one
                extra step.
            num_units: number of units of the LSTM cell.
            max_gradient_norm: global norm the gradients are clipped to.
            batch_size: fixed batch dimension of all placeholders.
            learning_rate: initial value of the learning-rate variable.
            learning_rate_decay_factor: factor the learning rate is
                multiplied by whenever `learning_rate_decay_op` is run.
        """
        self.vocab_size = vocab_size
        self.sequence_length = sequence_length
        self.batch_size = batch_size

        # Learning rate lives in a variable so that it can be decayed by
        # running the assign op.
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        decayed_rate = self.learning_rate * learning_rate_decay_factor
        self.learning_rate_decay_op = self.learning_rate.assign(decayed_rate)
        self.global_step = tf.Variable(0, trainable=False)

        # Projection from cell outputs to vocabulary logits.
        projection_w = training.utils.gaussian_weights_variable(
            [num_units, self.vocab_size])
        projection_b = tf.Variable(tf.zeros([self.vocab_size]))

        cell = rnn_cell.LSTMCell(num_units, vocab_size)

        # One encoder input, one decoder input and one weight placeholder per
        # step, created in this interleaved order.
        row_shape = (batch_size, self.vocab_size)
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        for _ in range(sequence_length):
            encoder_step = tf.placeholder(tf.float32, shape=row_shape)
            self.encoder_inputs.append(encoder_step)
            decoder_step = tf.placeholder(tf.float32, shape=row_shape)
            self.decoder_inputs.append(decoder_step)
            weight_step = tf.placeholder(tf.float32, shape=(batch_size, ))
            self.target_weights.append(weight_step)

        # Decoder has one extra cell because it starts with the GO symbol,
        # and the targets are shifted by one.
        # Not sure this is actually useful, as it is always set to 0.
        # As this is inspired by TensorFlow seq2seq models, there might be
        # something dodgy in there.
        extra_decoder_step = tf.placeholder(tf.float32, shape=row_shape)
        self.decoder_inputs.append(extra_decoder_step)
        self.target_weights.append(np.ones((batch_size, )))

        # Targets used by the sequence loss must be integer indices, so the
        # shifted decoder inputs are turned into class indices by argmax.
        target_indices = []
        for shifted_input in self.decoder_inputs[1:]:
            target_indices.append(
                tf.cast(tf.argmax(shifted_input, 1), dtype=tf.int32))

        cell_outputs, self.state = seq2seq.basic_rnn_seq2seq(
            self.encoder_inputs, self.decoder_inputs, cell)

        self.logits = []
        for cell_output in cell_outputs:
            self.logits.append(
                tf.nn.xw_plus_b(cell_output, projection_w, projection_b))

        # Only the first sequence_length steps enter the loss.
        used_logits = self.logits[:self.sequence_length]
        used_weights = self.target_weights[:self.sequence_length]
        self.loss = seq2seq.sequence_loss(
            used_logits, target_indices, used_weights, self.vocab_size)

        # Plain SGD on gradients clipped by their global norm.
        trainable = tf.trainable_variables()
        sgd = tf.train.GradientDescentOptimizer(self.learning_rate)
        raw_gradients = tf.gradients(self.loss, trainable)
        clipped, self.gradient_norms = tf.clip_by_global_norm(
            raw_gradients, max_gradient_norm)
        self.updates = sgd.apply_gradients(zip(clipped, trainable),
                                           global_step=self.global_step)

        self.saver = tf.train.Saver(tf.all_variables())
Example #15
0
# Recurrent cell passed to the seq2seq model below.
cell = rnn_cell.GRUCell(memory_dim)

# Argument order of the call below:
# encoder_inputs, decoder_inputs, cell, num_encoder_symbols, num_decoder_symbols, embedding_size

# The same vocabulary size is used for the encoder and the decoder symbols.
dec_outputs, dec_state, enc_state = seq2seq_new.embedding_rnn_seq2seq_new(
    enc_inp, dec_inp, cell, vocab_size, vocab_size, embedding_dim)

#print dec_outputs[0], len(dec_outputs), tf.shape(dec_state)
# Report the static shapes of the encoder and decoder states.
print '** enc memory ', enc_state.get_shape()
print '** dec memory ', dec_state.get_shape()

# Objective 1

# loss function - mean cross entropy across sequence
loss = seq2seq.sequence_loss(dec_outputs, labels, weights, vocab_size)

# Momentum optimizer with fixed hyperparameters, minimizing the sequence loss.
learning_rate = 0.05
momentum = 0.9
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
train_op = optimizer.minimize(loss)

# Objective 2

# Set model weights
# NOTE(review): the hard-coded 100 has to match the last dimension of
# enc_state for the matmul below -- presumably memory_dim; confirm.

W = tf.Variable(tf.random_normal([100, 1], stddev=0.35), name="weights")
b = tf.Variable(tf.zeros([1]), name="biases")

# Construct a linear model on top of the encoder state
activation = tf.add(tf.matmul(enc_state, W), b)
Example #16
0
    def __init__(self):
        """Build the seq2seq graph, its training op, and an initialized session.

        Creates a stacked LSTM cell, integer placeholders for the source and
        target sequences, unit loss weights, the embedding seq2seq model
        (without feed_previous), the sequence loss and a training op, then
        opens a `tf.Session` and initializes all variables.

        Relies on the module-level names SOURCE_LEN, TARGET_LEN, SOURCE_VOCAB
        and TARGET_VOCAB.
        """
        # Hyperparameters.
        self.num_layers = 3
        self.layer_size = 256

        # Core recurrent unit: one LSTM cell object repeated num_layers times.
        base_cell = rnn_cell.BasicLSTMCell(self.layer_size)
        self.cell = rnn_cell.MultiRNNCell([base_cell] * self.num_layers)

        # Placeholders; the batch dimension is left unspecified (None).

        # Source-side token ids, one placeholder per time step.
        self.encoder_inputs = []
        for step in range(SOURCE_LEN):
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name='encoder{0}'.format(step)))

        # Expected target token ids, one placeholder per time step.
        self.labels = []
        for step in range(TARGET_LEN):
            self.labels.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name='labels{0}'.format(step)))

        # Every target position contributes equally to the loss.
        self.weights = []
        for target in self.labels:
            self.weights.append(tf.ones_like(target, dtype=tf.float32))

        # The decoder is fed the previous step's correct label; the first
        # step gets an all-zero "GO" tensor shaped like a label.
        go_token = tf.zeros_like(self.labels[0], dtype=np.int32, name="GO")
        self.decoder_inputs = [go_token]
        self.decoder_inputs.extend(self.labels[:-1])

        # The same graph is used for training and testing, hence
        # feed_previous=False.
        model = seq2seq.embedding_rnn_seq2seq(self.encoder_inputs,
                                              self.decoder_inputs,
                                              self.cell,
                                              len(SOURCE_VOCAB),
                                              len(TARGET_VOCAB),
                                              feed_previous=False)
        self.outputs, self.states = model

        self.loss = seq2seq.sequence_loss(
            self.outputs, self.labels, self.weights)

        # Training op. The momentum variant is hard-wired on; the Adam
        # variant below is kept as the alternative configuration.
        use_momentum = True
        if use_momentum:
            learning_rate = 0.05
            momentum = 0.9
            self.optimizer = tf.train.MomentumOptimizer(
                learning_rate, momentum)
            self.train_op = self.optimizer.minimize(self.loss)
        else:
            # Assumes batch size of 100
            self.cost = tf.reduce_sum(self.loss) / TARGET_LEN / 100
            self.lr = tf.Variable(0.0, trainable=False)
            tvars = tf.trainable_variables()
            # Gradients are clipped to a global norm of 5.0.
            raw_grads = tf.gradients(self.cost, tvars)
            grads, _ = tf.clip_by_global_norm(raw_grads, 5.0)
            optimizer = tf.train.AdamOptimizer(self.lr)
            self.train_op = optimizer.apply_gradients(zip(grads, tvars))

        # Open the session and initialize every variable created above.
        self.sess = tf.Session()
        self.sess.run(tf.initialize_all_variables())
Example #17
0
    def __init__(self,
                 vocab_size,
                 buckets_or_sentence_length,
                 size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 model_type,
                 use_lstm=True,
                 num_samples=512,
                 forward_only=False):
        """Create the model.

        Builds an embedding or embedding-attention seq2seq model, bucketed or
        non-bucketed, made of single or multi-layer RNN cells.

        Args:
          vocab_size: Size of the vocabulary; used for both the encoder and
            the decoder symbols.
          buckets_or_sentence_length:
            If using buckets:
              A list of pairs (I, O), where I specifies maximum input length
              that will be processed in that bucket, and O specifies maximum
              output length. Training instances that have inputs longer than
              I or outputs longer than O will be pushed to the next bucket
              and padded accordingly. We assume that the list is sorted,
              e.g., [(2, 4), (8, 16)]. Stored as `self.buckets`.
            Else:
              The maximum number of words per sentence. Stored as
              `self.max_sentence_length` (`self.buckets` is then not set).
          size: Number of units in each layer of the model.
          num_layers: Number of layers in the model.
          max_gradient_norm: Gradients will be clipped to maximally this norm.
          batch_size: The size of the batches used during training. It is only
            stored on the instance; the placeholders are created with an
            unspecified batch dimension.
          learning_rate: Learning rate to start with.
          learning_rate_decay_factor: Decay learning rate by this much when
            needed (applied by running `self.learning_rate_decay_op`).
          model_type: 'embedding_attention' builds the attention model; any
            other value builds the plain embedding model.
          use_lstm: If true, use BasicLSTMCell; otherwise GRUCell.
          num_samples: Number of samples for sampled softmax. Sampled softmax
            is only used when 0 < num_samples < vocab_size.
          forward_only: If set, we do not construct the backward pass in the
            model and the decoder is fed its own previous output.
        """
        # Decide once whether the model is bucketed and branch on this flag
        # below. The flag replaces try/except AttributeError around whole
        # graph-construction calls, which would also have swallowed genuine
        # AttributeErrors raised inside TensorFlow.
        use_buckets = isinstance(buckets_or_sentence_length, list)
        if use_buckets:
            self.buckets = buckets_or_sentence_length
        else:
            self.max_sentence_length = buckets_or_sentence_length

        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None
        # Sampled softmax only makes sense if we sample less than vocabulary size.
        if 0 < num_samples < self.vocab_size:
            with tf.device("/cpu:0"):
                w = tf.get_variable("proj_w", [size, self.vocab_size])
                w_t = tf.transpose(w)
                b = tf.get_variable("proj_b", [self.vocab_size])
            output_projection = (w, b)

            def sampled_loss(inputs, labels):
                with tf.device("/cpu:0"):
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels,
                                                      num_samples,
                                                      self.vocab_size)

            softmax_loss_function = sampled_loss

        # Create the internal multi-layer cell for our RNN.
        if use_lstm:
            single_cell = rnn_cell.BasicLSTMCell(size)
        else:
            single_cell = rnn_cell.GRUCell(size)
        cell = single_cell
        if num_layers > 1:
            cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)

        # The seq2seq function: we use embedding for the input and attention
        # (if applicable). The comparison uses == because `is` tests object
        # identity and only matches a string that happens to be interned.
        if model_type == 'embedding_attention':
            seq2seq_builder = seq2seq.embedding_attention_seq2seq
        else:
            # Any other model_type falls back to the plain embedding model.
            seq2seq_builder = seq2seq.embedding_rnn_seq2seq

        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            return seq2seq_builder(
                encoder_inputs,
                decoder_inputs,
                cell,
                vocab_size,
                vocab_size,
                output_projection=output_projection,
                feed_previous=do_decode)

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []

        if use_buckets:
            # Last bucket is the biggest one.
            encoder_range = self.buckets[-1][0]
            decoder_range = self.buckets[-1][1]
        else:
            encoder_range = self.max_sentence_length
            decoder_range = self.max_sentence_length

        for i in xrange(encoder_range):
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="encoder{0}".format(i)))
        # One extra decoder slot so that the shifted targets have full length.
        for i in xrange(decoder_range + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(tf.float32,
                               shape=[None],
                               name="weight{0}".format(i)))

        # Our targets are decoder inputs shifted by one.
        targets = self.decoder_inputs[1:]

        # Training outputs and losses. When decoding (forward_only) the
        # decoder feeds back its own previous output.
        do_decode = bool(forward_only)
        if use_buckets:
            self.outputs, self.losses = seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                self.buckets,
                self.vocab_size,
                lambda x, y: seq2seq_f(x, y, do_decode),
                softmax_loss_function=softmax_loss_function)
            # If we use output projection, we need to project outputs for decoding.
            if forward_only and output_projection is not None:
                # The loop variable is deliberately not called `b`: that name
                # is the projection bias captured by sampled_loss.
                for bucket_id in xrange(len(self.buckets)):
                    self.outputs[bucket_id] = [
                        tf.nn.xw_plus_b(output, output_projection[0],
                                        output_projection[1])
                        for output in self.outputs[bucket_id]
                    ]
        else:
            self.outputs, self.states = seq2seq_f(self.encoder_inputs,
                                                  self.decoder_inputs[:-1],
                                                  do_decode)
            self.losses = seq2seq.sequence_loss(
                self.outputs,
                targets,
                self.target_weights[:-1],
                self.vocab_size,
                softmax_loss_function=softmax_loss_function)
            # Project outputs for decoding
            if forward_only and output_projection is not None:
                self.outputs = [
                    tf.nn.xw_plus_b(output, output_projection[0],
                                    output_projection[1])
                    for output in self.outputs
                ]

        # Gradients and SGD update operation for training the model.
        params = tf.trainable_variables()
        self.params = params  # Hold onto this for Woz
        if not forward_only:
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)

            if use_buckets:
                # One gradient norm and one update op per bucket.
                self.gradient_norms = []
                self.updates = []
                for bucket_id in xrange(len(self.buckets)):
                    gradients = tf.gradients(self.losses[bucket_id], params)
                    clipped_gradients, norm = tf.clip_by_global_norm(
                        gradients, max_gradient_norm)
                    self.gradient_norms.append(norm)
                    self.updates.append(
                        opt.apply_gradients(zip(clipped_gradients, params),
                                            global_step=self.global_step))
            else:
                # Non-bucketed: a single norm tensor and a single update op.
                gradients = tf.gradients(self.losses, params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms = norm
                self.updates = opt.apply_gradients(
                    zip(clipped_gradients, params),
                    global_step=self.global_step)

        self.saver = tf.train.Saver(tf.all_variables())
Example #18
0
    def _init_neural_network(self):
        """Initialize the NN: build the TensorFlow graph and create the session.

        Creates the encoder/decoder placeholders, the RNN cell, a training
        network and a decoding network that share variables, the costs, the
        optimizer with its training op, the session, and a saver.

        The optimizer is chosen by `self.optimizer_type`: 'sgd' selects
        gradient descent, 'adagrad' selects Adagrad, and anything else
        defaults to Adam.
        """

        # set TensorFlow random seed
        tf.set_random_seed(rnd.randint(-sys.maxint, sys.maxint))

        # create placeholders for input & output (always batch-size * 1, list of up to num. steps)
        self.enc_inputs = []
        self.enc_inputs_drop = []
        for i in xrange(self.max_da_len):
            enc_input = tf.placeholder(tf.int32, [None], name=('enc_inp-%d' % i))
            self.enc_inputs.append(enc_input)
            if self.dropout_keep_prob < 1:
                # NOTE(review): dropout is applied directly to the int32 token-id
                # placeholder -- confirm this is intended and supported by the
                # TensorFlow version in use.
                enc_input_drop = tf.nn.dropout(enc_input, self.dropout_keep_prob,
                                               name=('enc_inp-drop-%d' % i))
                self.enc_inputs_drop.append(enc_input_drop)

        self.dec_inputs = []
        for i in xrange(self.max_tree_len):
            self.dec_inputs.append(tf.placeholder(tf.int32, [None], name=('dec_inp-%d' % i)))

        # targets are just decoder inputs shifted by one (+pad with one empty spot)
        self.targets = self.dec_inputs[1:]
        self.targets.append(tf.placeholder(tf.int32, [None], name=('target-pad')))

        # prepare cells
        self.initial_state = tf.placeholder(tf.float32, [None, self.emb_size])
        if self.cell_type.startswith('gru'):
            self.cell = rnn_cell.GRUCell(self.emb_size)
        else:
            self.cell = rnn_cell.BasicLSTMCell(self.emb_size)

        # a '/2' suffix on the cell type stacks two layers of the cell
        if self.cell_type.endswith('/2'):
            self.cell = rnn_cell.MultiRNNCell([self.cell] * 2)

        # build the actual LSTM Seq2Seq network (for training and decoding)
        with tf.variable_scope(self.scope_name) as scope:

            # pick the network builder; unknown nn_type values keep the default
            rnn_func = embedding_rnn_seq2seq
            if self.nn_type == 'emb_attention_seq2seq':
                rnn_func = embedding_attention_seq2seq
            elif self.nn_type == 'emb_attention2_seq2seq':
                rnn_func = partial(embedding_attention_seq2seq, num_heads=2)
            elif self.nn_type == 'emb_attention_seq2seq_context':
                rnn_func = embedding_attention_seq2seq_context
            elif self.nn_type == 'emb_attention2_seq2seq_context':
                rnn_func = partial(embedding_attention_seq2seq_context, num_heads=2)

            # for training: feed_previous == False, using dropout if available
            # outputs = batch_size * num_decoder_symbols ~ i.e. output logits at each steps
            # states = cell states at each steps
            self.outputs, self.states = rnn_func(
                self.enc_inputs_drop if self.enc_inputs_drop else self.enc_inputs,
                self.dec_inputs, self.cell,
                self.da_dict_size, self.tree_dict_size,
                scope=scope)

            # the decoding network below shares the variables created above
            scope.reuse_variables()

            # for decoding: feed_previous == True
            self.dec_outputs, self.dec_states = rnn_func(
                self.enc_inputs, self.dec_inputs, self.cell,
                self.da_dict_size, self.tree_dict_size,
                feed_previous=True, scope=scope)

        # TODO use output projection ???

        # target weights
        # TODO change to actual weights, zero after the end of tree ???
        self.cost_weights = [tf.ones_like(trg, tf.float32, name='cost_weights')
                             for trg in self.targets]

        # cost: tf_cost comes from the training (teacher-forced) outputs,
        # dec_cost from the feed_previous decoding outputs
        self.tf_cost = sequence_loss(self.outputs, self.targets,
                                     self.cost_weights, self.tree_dict_size)
        self.dec_cost = sequence_loss(self.dec_outputs, self.targets,
                                      self.cost_weights, self.tree_dict_size)
        if self.use_dec_cost:
            self.cost = 0.5 * (self.tf_cost + self.dec_cost)
        else:
            self.cost = self.tf_cost

        self.learning_rate = tf.placeholder(tf.float32, name="learning_rate")

        # optimizer (default to Adam)
        # This must be a single if/elif/else chain: with two independent `if`
        # statements the 'sgd' choice fell through to the final `else` and was
        # silently replaced by Adam.
        if self.optimizer_type == 'sgd':
            self.optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
        elif self.optimizer_type == 'adagrad':
            self.optimizer = tf.train.AdagradOptimizer(self.learning_rate)
        else:
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.train_func = self.optimizer.minimize(self.cost)

        # initialize session
        session_config = None
        if self.max_cores:
            # cap both inter-op and intra-op thread pools at max_cores
            session_config = tf.ConfigProto(inter_op_parallelism_threads=self.max_cores,
                                            intra_op_parallelism_threads=self.max_cores)
        self.session = tf.Session(config=session_config)

        # this helps us load/save the model
        self.saver = tf.train.Saver(tf.all_variables())
Example #19
0
  def __init__(self, vocab_size, buckets_or_sentence_length, size,
               num_layers, max_gradient_norm, batch_size, learning_rate,
               learning_rate_decay_factor, model_type, use_lstm=True,
               num_samples=512, forward_only=False):
    """Create the model.

    Builds an embedding or embedding-attention seq2seq model, bucketed or
    non-bucketed, made of single or multi-layer RNN cells.

    Args:
      vocab_size: Size of the vocabulary; used for both the encoder and the
        decoder symbols.
      buckets_or_sentence_length:
        If using buckets:
          A list of pairs (I, O), where I specifies maximum input length
          that will be processed in that bucket, and O specifies maximum output
          length. Training instances that have inputs longer than I or outputs
          longer than O will be pushed to the next bucket and padded accordingly.
          We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
          Stored as `self.buckets`.
        Else:
          The maximum number of words per sentence. Stored as
          `self.max_sentence_length` (`self.buckets` is then not set).
      size: Number of units in each layer of the model.
      num_layers: Number of layers in the model.
      max_gradient_norm: Gradients will be clipped to maximally this norm.
      batch_size: The size of the batches used during training. It is only
        stored on the instance; the placeholders are created with an
        unspecified batch dimension.
      learning_rate: Learning rate to start with.
      learning_rate_decay_factor: Decay learning rate by this much when needed
        (applied by running `self.learning_rate_decay_op`).
      model_type: 'embedding_attention' builds the attention model; any other
        value builds the plain embedding model.
      use_lstm: If true, use BasicLSTMCell; otherwise GRUCell.
      num_samples: Number of samples for sampled softmax. Sampled softmax is
        only used when 0 < num_samples < vocab_size.
      forward_only: If set, we do not construct the backward pass in the model
        and the decoder is fed its own previous output.
    """
    # Decide once whether the model is bucketed and branch on this flag below.
    # The flag replaces try/except AttributeError around whole
    # graph-construction calls, which would also have swallowed genuine
    # AttributeErrors raised inside TensorFlow.
    use_buckets = isinstance(buckets_or_sentence_length, list)
    if use_buckets:
      self.buckets = buckets_or_sentence_length
    else:
      self.max_sentence_length = buckets_or_sentence_length

    self.vocab_size = vocab_size
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # Summary variables. NOTE: added these.
    # self.summary_op_learning_rate = tf.scalar_summary('learning rate', self.learning_rate)
    # self.summary_op_global_step = tf.scalar_summary('global step', self.global_step)

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary size.
    if 0 < num_samples < self.vocab_size:
      with tf.device("/cpu:0"):
        w = tf.get_variable("proj_w", [size, self.vocab_size])
        w_t = tf.transpose(w)
        b = tf.get_variable("proj_b", [self.vocab_size])
      output_projection = (w, b)

      def sampled_loss(inputs, labels):
        with tf.device("/cpu:0"):
          labels = tf.reshape(labels, [-1, 1])
          return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, num_samples,
                                            self.vocab_size)
      softmax_loss_function = sampled_loss

    # Create the internal multi-layer cell for our RNN.
    if use_lstm:
      single_cell = rnn_cell.BasicLSTMCell(size)
    else:
      single_cell = rnn_cell.GRUCell(size)
    cell = single_cell
    if num_layers > 1:
      cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)

    # The seq2seq function: we use embedding for the input and attention (if
    # applicable). The comparison uses == because `is` tests object identity
    # and only matches a string that happens to be interned.
    if model_type == 'embedding_attention':
      seq2seq_builder = seq2seq.embedding_attention_seq2seq
    else:
      # Any other model_type falls back to the plain embedding model.
      seq2seq_builder = seq2seq.embedding_rnn_seq2seq

    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
      return seq2seq_builder(
          encoder_inputs, decoder_inputs, cell, vocab_size, vocab_size,
          output_projection=output_projection, feed_previous=do_decode)

    # Feeds for inputs.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []

    if use_buckets:
      # Last bucket is the biggest one.
      encoder_range = self.buckets[-1][0]
      decoder_range = self.buckets[-1][1]
    else:
      encoder_range = self.max_sentence_length
      decoder_range = self.max_sentence_length

    for i in xrange(encoder_range):
      self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                                name="encoder{0}".format(i)))
    # One extra decoder slot so that the shifted targets have full length.
    for i in xrange(decoder_range + 1):
      self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                                name="decoder{0}".format(i)))
      self.target_weights.append(tf.placeholder(tf.float32, shape=[None],
                                                name="weight{0}".format(i)))

    # Our targets are decoder inputs shifted by one.
    targets = self.decoder_inputs[1:]

    # Training outputs and losses. When decoding (forward_only) the decoder
    # feeds back its own previous output.
    do_decode = bool(forward_only)
    if use_buckets:
      self.outputs, self.losses = seq2seq.model_with_buckets(
          self.encoder_inputs, self.decoder_inputs, targets,
          self.target_weights, self.buckets, self.vocab_size,
          lambda x, y: seq2seq_f(x, y, do_decode),
          softmax_loss_function=softmax_loss_function)
      # If we use output projection, we need to project outputs for decoding.
      if forward_only and output_projection is not None:
        # The loop variable is deliberately not called `b`: that name is the
        # projection bias captured by sampled_loss.
        for bucket_id in xrange(len(self.buckets)):
          self.outputs[bucket_id] = [
              tf.nn.xw_plus_b(output, output_projection[0],
                              output_projection[1])
              for output in self.outputs[bucket_id]]
    else:
      self.outputs, self.states = seq2seq_f(
          self.encoder_inputs, self.decoder_inputs[:-1], do_decode)
      self.losses = seq2seq.sequence_loss(
          self.outputs, targets, self.target_weights[:-1], self.vocab_size,
          softmax_loss_function=softmax_loss_function)
      # Project outputs for decoding
      if forward_only and output_projection is not None:
        self.outputs = [
            tf.nn.xw_plus_b(output, output_projection[0], output_projection[1])
            for output in self.outputs]

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()
    self.params = params # Hold onto this for Woz
    if not forward_only:
      opt = tf.train.GradientDescentOptimizer(self.learning_rate)

      if use_buckets:
        # One gradient norm and one update op per bucket.
        self.gradient_norms = []
        self.updates = []
        for bucket_id in xrange(len(self.buckets)):
          gradients = tf.gradients(self.losses[bucket_id], params)
          clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                           max_gradient_norm)
          self.gradient_norms.append(norm)
          self.updates.append(opt.apply_gradients(
              zip(clipped_gradients, params), global_step=self.global_step))
      else:
        # Non-bucketed: a single norm tensor and a single update op.
        gradients = tf.gradients(self.losses, params)
        clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                         max_gradient_norm)
        self.gradient_norms = norm
        self.updates = opt.apply_gradients(
            zip(clipped_gradients, params), global_step=self.global_step)

    self.saver = tf.train.Saver(tf.all_variables())
# Script fragment: wires a basic (non-embedding) GRU seq2seq graph, a sequence
# loss with a summary, a momentum optimizer and a summary writer.
# `enc_inp`, `labels`, `weights`, `vocab_size`, `memory_dim` and `sess` are not
# defined in this fragment and must already be in scope.

# Decoder input: an all-zero "GO" tensor shaped like the first encoder input,
# followed by the encoder inputs with the last step dropped.
dec_inp = ([tf.zeros_like(enc_inp[0], dtype=np.float32, name="GO")]
           + enc_inp[:-1])

# Initial memory value for recurrence.
#prev_mem = tf.zeros((batch_size, memory_dim))

# Debug output of the Python-side shapes of the input lists.
print("shapes", np.array(enc_inp).shape, np.array(dec_inp).shape, np.array(labels).shape)
cell = rnn_cell.GRUCell(memory_dim)

dec_outputs, dec_memory = seq2seq.basic_rnn_seq2seq(
    enc_inp, dec_inp, cell)

# NOTE(review): the target shape [5, 100] is hard-coded (presumably
# sequence length x batch size -- confirm), and `labels_t` is passed to
# sequence_loss as a single tensor rather than a list of per-step tensors.
labels_t = tf.reshape(labels, [5,100])
print(labels_t)
print(dec_outputs)
loss = seq2seq.sequence_loss(dec_outputs, labels_t, weights, vocab_size)
tf.scalar_summary("loss", loss)
#magnitude = tf.sqrt(tf.reduce_sum(tf.square(dec_memory[1])))
#tf.scalar_summary("magnitude at t=1", magnitude)
summary_op = tf.merge_all_summaries()


# Momentum SGD with fixed hyperparameters minimizes the sequence loss.
learning_rate = 0.05
momentum = 0.9
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
train_op = optimizer.minimize(loss)
# Summaries are written to a fresh temporary directory; its path is printed.
logdir = tempfile.mkdtemp()
print(logdir)
summary_writer = tf.train.SummaryWriter(logdir, sess.graph_def)

# Initialize every variable created above.
sess.run(tf.initialize_all_variables())
Example #21
0
# Script fragment: builds an embedding GRU seq2seq graph with a sequence loss,
# two scalar summaries and a momentum optimizer.
# `seq_length`, `batch_size` and `vocab_size` are not defined in this fragment
# and must already be in scope.
memory_dim    = 100

# One int32 placeholder per time step for the inputs (x) and the targets (t);
# the batch dimension is left unspecified. Every target gets a weight of one.
x_seq   = [tf.placeholder(tf.int32, shape=(None,), name="x%i" % t) for t in range(seq_length)]
t_seq   = [tf.placeholder(tf.int32, shape=(None,), name="t%i" % t) for t in range(seq_length)]
weights = [tf.ones_like(t_i, dtype=tf.float32) for t_i in t_seq]

# Decoder input: prepend some "GO" token and drop the final token of the encoder input
dec_inp = ([tf.zeros_like(x_seq[0], dtype=np.int32, name="GO")] + x_seq[:-1])

# Initial memory value for recurrence.
# NOTE(review): `prev_mem` is not referenced anywhere in the visible part of
# this script.
prev_mem = tf.zeros((batch_size, memory_dim))

# GRU
cell = rnn_cell.GRUCell(memory_dim)
# The same vocab_size is used for both encoder and decoder symbols.
dec_outputs, dec_memory = seq2seq.embedding_rnn_seq2seq(x_seq, dec_inp, cell, vocab_size, vocab_size)
loss = seq2seq.sequence_loss(dec_outputs, t_seq, weights, vocab_size)
tf.scalar_summary("loss", loss)

# L2 norm of the element at index 1 of the second seq2seq return value.
magnitude = tf.sqrt(tf.reduce_sum(tf.square(dec_memory[1])))
tf.scalar_summary("magnitude at t=1", magnitude)

# Merge both scalar summaries into one op.
summary_op = tf.merge_all_summaries()

# Momentum SGD with fixed hyperparameters minimizes the sequence loss.
learning_rate = 0.05
momentum = 0.9
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
train_op = optimizer.minimize(loss)
# Fresh temporary directory for logs; its path is printed (Python 2 print).
logdir = tempfile.mkdtemp()
print logdir

def generate_data():
Example #22
0
def model_with_buckets(encoder_inputs, decoder_inputs, targets, weights,
                       buckets, seq2seq_f, softmax_loss_function=None,
                       per_example_loss=False, name=None):
    """Create a sequence-to-sequence model with support for bucketing.

    The seq2seq argument is a function that defines a sequence-to-sequence model,
    e.g., seq2seq = lambda x, y: basic_rnn_seq2seq(x, y, rnn_cell.GRUCell(24))

    Args:
      encoder_inputs: A list of Tensors to feed the encoder; first seq2seq input.
      decoder_inputs: A list of Tensors to feed the decoder; second seq2seq input.
      targets: A list of 1D batch-sized int32 Tensors (desired output sequence).
      weights: List of 1D batch-sized float-Tensors to weight the targets.
      buckets: A list of pairs of (input size, output size) for each bucket.
        The last bucket is taken to be the largest one.
      seq2seq_f: A sequence-to-sequence model function; it takes 2 input that
        agree with encoder_inputs and decoder_inputs, and returns a pair
        consisting of outputs and states (as, e.g., basic_rnn_seq2seq).
      softmax_loss_function: Function (inputs-batch, labels-batch) -> loss-batch
        to be used instead of the standard softmax (the default if this is None).
      per_example_loss: Boolean. If set, the returned loss will be a batch-sized
        tensor of losses for each sequence in the batch. If unset, it will be
        a scalar with the averaged loss from all examples.
      name: Optional name for this operation, defaults to "model_with_buckets".

    Returns:
      A tuple of the form (outputs, losses), where:
        outputs: The outputs for each bucket. Its j'th element consists of a list
          of 2D Tensors of shape [batch_size x num_decoder_symbols] (jth outputs).
        losses: List of scalar Tensors, representing losses for each bucket, or,
          if per_example_loss is set, a list of 1D batch-sized float Tensors.

    Raises:
      ValueError: If length of encoder_inputs, targets, or weights is smaller
        than the largest (last) bucket.
    """
    max_encoder_len = buckets[-1][0]
    max_decoder_len = buckets[-1][1]
    if len(encoder_inputs) < max_encoder_len:
        raise ValueError("Length of encoder_inputs (%d) must be at least that "
                         "of last bucket (%d)."
                         % (len(encoder_inputs), max_encoder_len))
    if len(targets) < max_decoder_len:
        raise ValueError("Length of targets (%d) must be at least that of last "
                         "bucket (%d)." % (len(targets), max_decoder_len))
    if len(weights) < max_decoder_len:
        raise ValueError("Length of weights (%d) must be at least that of last "
                         "bucket (%d)." % (len(weights), max_decoder_len))

    # Both loss variants take the same arguments, so choose the function once.
    if per_example_loss:
        loss_fn = seq2seq.sequence_loss_by_example
    else:
        loss_fn = seq2seq.sequence_loss

    all_inputs = encoder_inputs + decoder_inputs + targets + weights
    losses = []
    outputs = []
    with ops.op_scope(all_inputs, name, "model_with_buckets"):
        for j, bucket in enumerate(buckets):
            # Every bucket after the first reuses the variables created by
            # the first bucket's model.
            with variable_scope.variable_scope(variable_scope.get_variable_scope(),
                                               reuse=True if j > 0 else None):
                bucket_outputs, _ = seq2seq_f(encoder_inputs[:bucket[0]],
                                              decoder_inputs[:bucket[1]])
                outputs.append(bucket_outputs)
                losses.append(loss_fn(
                    bucket_outputs, targets[:bucket[1]], weights[:bucket[1]],
                    average_across_timesteps=True,
                    softmax_loss_function=softmax_loss_function))

    return outputs, losses
Example #23
0
# Script fragment: wires a basic LSTM seq2seq graph, a sequence loss with a
# scalar summary, a state-magnitude summary and a momentum optimizer.
# `batch_size`, `memory_dim`, `enc_inp`, `dec_inp`, `labels` and `weights` are
# not defined in this fragment and must already be in scope.

# Initial memory value for recurrence.
# NOTE(review): `prev_mem` is not referenced anywhere in the visible part of
# this script.
prev_mem = tf.zeros((batch_size, memory_dim))

cell = rnn_cell.BasicLSTMCell(memory_dim)


#enc_inp = np.tile(enc_inp, 2).tolist()
logits, state = seq2seq.basic_rnn_seq2seq(
        enc_inp, dec_inp, cell)#, vocab_size, vocab_size)

# Debug output: every encoder input tensor, then the logits and labels.
for i, inp in enumerate(enc_inp):
    print(i, inp)
print("logits", logits)
print('labels', labels)
# No number-of-classes argument is passed here, unlike the other sequence_loss
# calls in this file -- NOTE(review): confirm this matches the signature of
# the TensorFlow version in use.
loss = seq2seq.sequence_loss(logits, labels, weights)
# `summary_op` holds only the loss summary; the magnitude summary created
# below is not merged into it.
summary_op = tf.scalar_summary("loss", loss)

# L2 norm of the final seq2seq state.
# NOTE(review): `sum` shadows the Python builtin of the same name for the rest
# of the module.
square = tf.square(state)
sum = tf.reduce_sum(square)
magnitude = tf.sqrt(sum)
tf.scalar_summary("magnitude at t=1", magnitude)

# Momentum SGD with fixed hyperparameters minimizes the sequence loss.
learning_rate = 0.05
momentum = 0.9
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
train_op = optimizer.minimize(loss)


# Fresh temporary directory for logs; its path is printed.
logdir = tempfile.mkdtemp()
print(logdir)