def single_test(l, model, sess, task, nprint, batch_size, print_out=True,
                offset=None):
  """Test model on test data of length l using the given session."""
  inpt, target = data.get_batch(l, batch_size, False, task, offset)
  _, res, _, steps = model.step(sess, inpt, target, False)
  errors, total, seq_err = data.accuracy(inpt, res, target, batch_size, nprint)
  seq_err = float(seq_err) / batch_size
  if total > 0:
    errors = float(errors) / total
  if print_out:
    data.print_out("  %s len %d errors %.2f sequence-errors %.2f"
                   % (task, l, 100*errors, 100*seq_err))
  return errors, seq_err, (steps, inpt, [np.argmax(o, axis=1) for o in res])
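
# A minimal sketch of the normalization above, with hypothetical counts:
# `errors` is wrong symbols out of `total` non-padding positions, and
# `seq_err` is sequences with at least one wrong symbol out of `batch_size`.
def _normalize_errors(errors, total, seq_err, batch_size):
  seq_err = float(seq_err) / batch_size
  if total > 0:
    errors = float(errors) / total
  return errors, seq_err

assert _normalize_errors(7, 140, 3, 32) == (0.05, 0.09375)
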
def assign_vectors(word_vector_file, embedding_key, vocab_path, sess):
  """Assign the embedding_key variable from the given word vectors file."""
  # For words in the word vector file, set their embedding at start.
  if not tf.gfile.Exists(word_vector_file):
    data.print_out("Word vector file does not exist: %s" % word_vector_file)
    sys.exit(1)
  vocab, _ = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  data.print_out("Pre-setting word vectors from %s" % word_vector_file)
  with tf.gfile.GFile(word_vector_file, mode="r") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for line in f:
      line_parts = line.split()
      # The first part is the word.
      word = line_parts[0]
      if word in vocab:
        # Remaining parts are components of the vector.
        # List comprehension rather than a bare map so this also works on
        # Python 3, where map returns an iterator.
        word_vector = np.array([float(p) for p in line_parts[1:]])
        if len(word_vector) != FLAGS.vec_size:
          data.print_out("Warn: Word '%s', Expecting vector size %d, "
                         "found %d" % (word, FLAGS.vec_size,
                                       len(word_vector)))
        else:
          vectors[vocab[word]] = word_vector
  # Assign the modified vectors to the vectors_variable in the graph.
  sess.run([vectors_variable.initializer],
           {vectors_variable.initializer.inputs[1]: vectors})
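
# Hedged sketch of the word-vector file format parsed above: each line is a
# token followed by float components, as in GloVe-style text files.
# `_parse_vector_line` is a hypothetical helper, not part of this module.
def _parse_vector_line(line):
  parts = line.split()
  return parts[0], [float(x) for x in parts[1:]]

word, vec = _parse_vector_line("dog 0.045123 -0.61323 0.413667")
assert word == "dog" and len(vec) == 3
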
def print_vectors(embedding_key, vocab_path, word_vector_file):
  """Print vectors from the given variable."""
  _, rev_vocab = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  l, s = vectors.shape[0], vectors.shape[1]
  data.print_out("Printing %d word vectors from %s to %s."
                 % (l, embedding_key, word_vector_file))
  with tf.gfile.GFile(word_vector_file, mode="w") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for i in xrange(l):
      f.write(rev_vocab[i])
      for j in xrange(s):
        f.write(" %.8f" % vectors[i][j])
      f.write("\n")
def single_test(bin_id, model, sess, nprint, batch_size, dev, p, print_out=True,
                offset=None, beam_model=None):
  """Test model on test data of length l using the given session."""
  if not dev[p][bin_id]:
    data.print_out("  bin %d (%d)\t%s\tppl NA errors NA seq-errors NA"
                   % (bin_id, data.bins[bin_id], p))
    return 1.0, 1.0, 0.0
  inpt, target = data.get_batch(
      bin_id, batch_size, dev[p], FLAGS.height, offset)
  if FLAGS.beam_size > 1 and beam_model:
    loss, res, new_tgt, scores = m_step(
        model, beam_model, sess, batch_size, inpt, target, bin_id,
        FLAGS.eval_beam_steps, p)
    score_avgs = [sum(s) / float(len(s)) for s in scores]
    score_maxs = [max(s) for s in scores]
    score_str = ["(%.2f, %.2f)" % (score_avgs[i], score_maxs[i])
                 for i in xrange(FLAGS.eval_beam_steps)]
    data.print_out("  == scores (avg, max): %s" % "; ".join(score_str))
    errors, total, seq_err = data.accuracy(inpt, res, target, batch_size,
                                           nprint, new_tgt, scores[-1])
  else:
    loss, res, _, _ = model.step(sess, inpt, target, False)
    errors, total, seq_err = data.accuracy(inpt, res, target, batch_size,
                                           nprint)
  seq_err = float(seq_err) / batch_size
  if total > 0:
    errors = float(errors) / total
  if print_out:
    data.print_out("  bin %d (%d)\t%s\tppl %.2f errors %.2f seq-errors %.2f"
                   % (bin_id, data.bins[bin_id], p, data.safe_exp(loss),
                      100 * errors, 100 * seq_err))
  return (errors, seq_err, loss)
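
# `data.safe_exp(loss)` reports perplexity = exp(cross-entropy) with an
# overflow guard. A minimal sketch of that idea (the clip values here are
# hypothetical, not necessarily the ones data.safe_exp uses):
import math

def _safe_exp(x, max_perp=10000.0):
  return min(math.exp(min(x, 100.0)), max_perp)

assert _safe_exp(0.0) == 1.0        # zero loss -> perplexity 1.0
assert _safe_exp(500.0) == 10000.0  # clipped instead of overflowing
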
def multi_test(l,
               model,
               sess,
               task,
               nprint,
               batch_size,
               offset=None,
               ensemble=None):
    """Run multiple tests at lower batch size to save memory."""
    errors, seq_err = 0.0, 0.0
    to_print = nprint
    low_batch = FLAGS.low_batch_size
    low_batch = min(low_batch, batch_size)
    # Split the batch into batch_size // low_batch chunks (assumes the
    # batch size is divisible by low_batch).
    for mstep in xrange(batch_size // low_batch):
        cur_offset = None if offset is None else offset + mstep * low_batch
        err, sq_err, _ = single_test(l,
                                     model,
                                     sess,
                                     task,
                                     to_print,
                                     low_batch,
                                     False,
                                     cur_offset,
                                     ensemble=ensemble)
        to_print = max(0, to_print - low_batch)
        errors += err
        seq_err += sq_err
        if FLAGS.mode > 0:
            cur_errors = float(low_batch * errors) / ((mstep + 1) * low_batch)
            cur_seq_err = float(low_batch * seq_err) / (
                (mstep + 1) * low_batch)
            data.print_out(
                "    %s multitest current errors %.2f sequence-errors %.2f" %
                (task, 100 * cur_errors, 100 * cur_seq_err))
    errors = float(low_batch) * float(errors) / batch_size
    seq_err = float(low_batch) * float(seq_err) / batch_size
    data.print_out("  %s len %d errors %.2f sequence-errors %.2f" %
                   (task, l, 100 * errors, 100 * seq_err))
    return errors, seq_err
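
# Sketch of the batch-splitting arithmetic above: a batch of size B is
# evaluated in B // low_batch chunks, and the summed per-chunk averages are
# rescaled by low_batch / B to match a single full-batch average.
batch_size_demo, low_batch_demo = 32, 8      # hypothetical sizes
chunk_errors = [0.10, 0.20, 0.10, 0.20]      # hypothetical per-chunk averages
overall = float(low_batch_demo) * sum(chunk_errors) / batch_size_demo
assert abs(overall - 0.15) < 1e-9
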
def multi_test(l, model, sess, task, nprint, batch_size, offset=None):
  """Run multiple tests at lower batch size to save memory."""
  errors, seq_err = 0.0, 0.0
  to_print = nprint
  low_batch = FLAGS.low_batch_size
  low_batch = min(low_batch, batch_size)
  # Split the batch into batch_size // low_batch chunks (assumes the batch
  # size is divisible by low_batch).
  for mstep in xrange(batch_size // low_batch):
    cur_offset = None if offset is None else offset + mstep * low_batch
    err, sq_err, _ = single_test(l, model, sess, task, to_print, low_batch,
                                 False, cur_offset)
    to_print = max(0, to_print - low_batch)
    errors += err
    seq_err += sq_err
    if FLAGS.mode > 0:
      cur_errors = float(low_batch * errors) / ((mstep+1) * low_batch)
      cur_seq_err = float(low_batch * seq_err) / ((mstep+1) * low_batch)
      data.print_out("    %s multitest current errors %.2f sequence-errors %.2f"
                     % (task, 100*cur_errors, 100*cur_seq_err))
  errors = float(low_batch) * float(errors) / batch_size
  seq_err = float(low_batch) * float(seq_err) / batch_size
  data.print_out("  %s len %d errors %.2f sequence-errors %.2f"
                 % (task, l, 100*errors, 100*seq_err))
  return errors, seq_err
  def spec(self, inp, task, nclass):
     """Return the target given the input for some tasks."""
     if task == "sort":
         return sorted(inp)
     elif task == "id":
         return inp
     elif task == "rev":
         return [i for i in reversed(inp)]
     elif task == "shuffle":  # bit reverse permutation
         n_bits = (len(inp) - 1).bit_length()
         res = []
         for i in range(len(inp)):
             i1 = reverse_bit(i, n_bits) % len(inp)
             res.append(inp[i1])
         return res
     elif task == "incr":
         carry = 1
         res = []
         for i in range(len(inp)):
             if inp[i] + carry < nclass:
                 res.append(inp[i] + carry)
                 carry = 0
             else:
                 res.append(1)
                 carry = 1
         return res
     elif task == "left":
         return [inp[0]]
     elif task == "right":
         return [inp[-1]]
     elif task == "left-shift":
         return [inp[l - 1] for l in range(len(inp))]
     elif task == "right-shift":
         return [inp[l + 1] for l in range(len(inp))]
     else:
         data_utils.print_out("Unknown spec for task " + str(task))
         sys.exit()
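
# Hedged demo of the "shuffle" (bit-reverse permutation) branch above.
# `reverse_bit` is defined elsewhere in this module; `_reverse_bit` below is
# a minimal stand-in for illustration.
def _reverse_bit(i, n_bits):
    return int(("{:0%db}" % n_bits).format(i)[::-1], 2)

inp = [10, 11, 12, 13, 14, 15, 16, 17]  # length 8 -> 3 address bits
perm = [inp[_reverse_bit(i, 3) % len(inp)] for i in range(len(inp))]
assert perm == [10, 14, 12, 16, 11, 15, 13, 17]
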
def single_test(l,
                model,
                sess,
                task,
                nprint,
                batch_size,
                print_out=True,
                offset=None,
                ensemble=None,
                get_steps=False):
    """Test model on test data of length l using the given session."""
    inpt, target = data.get_batch(l, batch_size, False, task, offset)
    _, res, _, steps = model.step(sess,
                                  inpt,
                                  target,
                                  False,
                                  get_steps=get_steps)
    errors, total, seq_err = data.accuracy(inpt, res, target, batch_size,
                                           nprint)
    seq_err = float(seq_err) / batch_size
    if total > 0:
        errors = float(errors) / total
    if print_out:
        data.print_out("  %s len %d errors %.2f sequence-errors %.2f" %
                       (task, l, 100 * errors, 100 * seq_err))
    # Ensemble eval.
    if ensemble:
        results = []
        for m in ensemble:
            model.saver.restore(sess, m)
            _, result, _, _ = model.step(sess, inpt, target, False)
            m_errors, m_total, m_seq_err = data.accuracy(
                inpt, result, target, batch_size, nprint)
            m_seq_err = float(m_seq_err) / batch_size
            if m_total > 0:
                m_errors = float(m_errors) / m_total
            data.print_out(
                "     %s len %d m-errors %.2f m-sequence-errors %.2f" %
                (task, l, 100 * m_errors, 100 * m_seq_err))
            results.append(result)
        ens = [sum(o) for o in zip(*results)]
        errors, total, seq_err = data.accuracy(inpt, ens, target, batch_size,
                                               nprint)
        seq_err = float(seq_err) / batch_size
        if total > 0:
            errors = float(errors) / total
        if print_out:
            data.print_out(
                "  %s len %d ens-errors %.2f ens-sequence-errors %.2f" %
                (task, l, 100 * errors, 100 * seq_err))
    return errors, seq_err, (steps, inpt, [np.argmax(o, axis=1) for o in res])
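
# Sketch of the ensembling step above: per-model softmax outputs for the
# same output position are summed element-wise, and the argmax of the sum
# is the ensemble prediction (values below are hypothetical).
import numpy as np

model_a = [np.array([[0.5, 0.4, 0.1]])]  # one position, three classes
model_b = [np.array([[0.2, 0.5, 0.3]])]
ens = [sum(o) for o in zip(*[model_a, model_b])]
assert np.argmax(ens[0], axis=1).tolist() == [1]  # class 1 wins overall
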
def single_test(l, model, sess, task, nprint, batch_size, print_out=True,
                offset=None, ensemble=None, get_steps=False):
  """Test model on test data of length l using the given session."""
  inpt, target = data.get_batch(l, batch_size, False, task, offset)
  _, res, _, steps = model.step(sess, inpt, target, False, get_steps=get_steps)
  errors, total, seq_err = data.accuracy(inpt, res, target, batch_size, nprint)
  seq_err = float(seq_err) / batch_size
  if total > 0:
    errors = float(errors) / total
  if print_out:
    data.print_out("  %s len %d errors %.2f sequence-errors %.2f"
                   % (task, l, 100*errors, 100*seq_err))
  # Ensemble eval.
  if ensemble:
    results = []
    for m in ensemble:
      model.saver.restore(sess, m)
      _, result, _, _ = model.step(sess, inpt, target, False)
      m_errors, m_total, m_seq_err = data.accuracy(inpt, result, target,
                                                   batch_size, nprint)
      m_seq_err = float(m_seq_err) / batch_size
      if m_total > 0:
        m_errors = float(m_errors) / m_total
      data.print_out("     %s len %d m-errors %.2f m-sequence-errors %.2f"
                     % (task, l, 100*m_errors, 100*m_seq_err))
      results.append(result)
    ens = [sum(o) for o in zip(*results)]
    errors, total, seq_err = data.accuracy(inpt, ens, target,
                                           batch_size, nprint)
    seq_err = float(seq_err) / batch_size
    if total > 0:
      errors = float(errors) / total
    if print_out:
      data.print_out("  %s len %d ens-errors %.2f ens-sequence-errors %.2f"
                     % (task, l, 100*errors, 100*seq_err))
  return errors, seq_err, (steps, inpt, [np.argmax(o, axis=1) for o in res])
def train():
  """Train the model."""
  batch_size = FLAGS.batch_size * FLAGS.num_gpus
  (model, beam_model, min_length, max_length, checkpoint_dir,
   (train_set, dev_set, en_vocab_path, fr_vocab_path), sv, sess) = initialize()
  with sess.as_default():
    quant_op = model.quantize_op
    max_cur_length = min(min_length + 3, max_length)
    prev_acc_perp = [1000000 for _ in xrange(5)]
    prev_seq_err = 1.0
    is_chief = FLAGS.task < 1
    do_report = False

    # Main training loop.
    while not sv.ShouldStop():
      global_step, max_cur_length, learning_rate = sess.run(
          [model.global_step, model.cur_length, model.lr])
      acc_loss, acc_l1, acc_total, acc_errors, acc_seq_err = 0.0, 0.0, 0, 0, 0
      acc_grad_norm, step_count, step_c1, step_time = 0.0, 0, 0, 0.0

      # For words in the word vector file, set their embedding at start.
      bound1 = FLAGS.steps_per_checkpoint - 1
      if FLAGS.word_vector_file_en and global_step < bound1 and is_chief:
        assign_vectors(FLAGS.word_vector_file_en, "embedding:0",
                       en_vocab_path, sess)
        if FLAGS.max_target_vocab < 1:
          assign_vectors(FLAGS.word_vector_file_en, "target_embedding:0",
                         en_vocab_path, sess)

      if FLAGS.word_vector_file_fr and global_step < bound1 and is_chief:
        assign_vectors(FLAGS.word_vector_file_fr, "embedding:0",
                       fr_vocab_path, sess)
        if FLAGS.max_target_vocab < 1:
          assign_vectors(FLAGS.word_vector_file_fr, "target_embedding:0",
                         fr_vocab_path, sess)

      for _ in xrange(FLAGS.steps_per_checkpoint):
        step_count += 1
        step_c1 += 1
        global_step = int(model.global_step.eval())
        train_beam_anneal = global_step / float(FLAGS.train_beam_anneal)
        train_beam_freq = FLAGS.train_beam_freq * min(1.0, train_beam_anneal)
        p = random.choice(FLAGS.problem.split("-"))
        train_set = global_train_set[p][-1]
        bucket_id = get_bucket_id(train_buckets_scale[p][-1], max_cur_length,
                                  train_set)
        # Prefer longer sequences 60% of the time, except for WMT.
        if np.random.randint(100) < 60 and FLAGS.problem != "wmt":
          bucket1 = get_bucket_id(train_buckets_scale[p][-1], max_cur_length,
                                  train_set)
          bucket_id = max(bucket1, bucket_id)

        # Run a step and time it.
        start_time = time.time()
        inp, target = data.get_batch(bucket_id, batch_size, train_set,
                                     FLAGS.height)
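        # Gradient-noise schedule: the noise std decays as
        # (global_step + 1)**-0.275 and scales with sqrt(prev_seq_err), so
        # it fades as training improves.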
        noise_param = math.sqrt(math.pow(global_step + 1, -0.55) *
                                prev_seq_err) * FLAGS.grad_noise_scale
        # In multi-step mode, we use best from beam for middle steps.
        state, new_target, scores, history = None, None, None, []
        while (FLAGS.beam_size > 1 and
               train_beam_freq > np.random.random_sample()):
          # Get the best beam (no training, just forward model).
          new_target, new_first, new_inp, scores = get_best_beam(
              beam_model, sess, inp, target,
              batch_size, FLAGS.beam_size, bucket_id, history, p)
          history.append(new_first)
          # Training step with the previous input and the best beam as target.
          _, _, _, state = model.step(sess, inp, new_target, FLAGS.do_train,
                                      noise_param, update_mem=True, state=state)
          # Change input to the new one for the next step.
          inp = new_inp
          # If all results are great, stop (todo: not to wait for all?).
          if FLAGS.nprint > 1:
            print(scores)
          if sum(scores) / float(len(scores)) >= 10.0:
            break
        # The final step with the true target.
        loss, res, gnorm, _ = model.step(
            sess, inp, target, FLAGS.do_train, noise_param,
            update_mem=True, state=state)
        step_time += time.time() - start_time
        acc_grad_norm += 0.0 if gnorm is None else float(gnorm)

        # Accumulate statistics.
        acc_loss += loss
        acc_l1 += loss
        errors, total, seq_err = data.accuracy(
            inp, res, target, batch_size, 0, new_target, scores)
        if FLAGS.nprint > 1:
          print("seq_err: ", seq_err)
        acc_total += total
        acc_errors += errors
        acc_seq_err += seq_err

        # Report summary every 10 steps.
        if step_count + 3 > FLAGS.steps_per_checkpoint:
          do_report = True  # Don't pollute the plot too early.
        if is_chief and step_count % 10 == 1 and do_report:
          cur_loss = acc_l1 / float(step_c1)
          acc_l1, step_c1 = 0.0, 0
          cur_perp = data.safe_exp(cur_loss)
          summary = tf.Summary()
          summary.value.extend(
              [tf.Summary.Value(tag="log_perplexity", simple_value=cur_loss),
               tf.Summary.Value(tag="perplexity", simple_value=cur_perp)])
          sv.SummaryComputed(sess, summary, global_step)

      # Normalize and print out accumulated statistics.
      acc_loss /= step_count
      step_time /= FLAGS.steps_per_checkpoint
      acc_seq_err = float(acc_seq_err) / (step_count * batch_size)
      prev_seq_err = max(0.0, acc_seq_err - 0.02)  # No noise at error < 2%.
      acc_errors = float(acc_errors) / acc_total if acc_total > 0 else 1.0
      t_size = float(sum([len(x) for x in train_set])) / float(1000000)
      msg = ("step %d step-time %.2f train-size %.3f lr %.6f grad-norm %.4f"
             % (global_step + 1, step_time, t_size, learning_rate,
                acc_grad_norm / FLAGS.steps_per_checkpoint))
      data.print_out("%s len %d ppl %.6f errors %.2f sequence-errors %.2f" %
                     (msg, max_cur_length, data.safe_exp(acc_loss),
                      100*acc_errors, 100*acc_seq_err))

      # If errors are below the curriculum threshold, move curriculum forward.
      is_good = FLAGS.curriculum_ppx > data.safe_exp(acc_loss)
      is_good = is_good and FLAGS.curriculum_seq > acc_seq_err
      if is_good and is_chief:
        if FLAGS.quantize:
          # Quantize weights.
          data.print_out("  Quantizing parameters.")
          sess.run([quant_op])
        # Increase current length (until the next with training data).
        sess.run(model.cur_length_incr_op)
        # Forget last perplexities if we're not yet at the end.
        if max_cur_length < max_length:
          prev_acc_perp.append(1000000)

      # Lower learning rate if we're worse than the last 5 checkpoints.
      acc_perp = data.safe_exp(acc_loss)
      if acc_perp > max(prev_acc_perp[-5:]) and is_chief:
        sess.run(model.lr_decay_op)
      prev_acc_perp.append(acc_perp)

      # Save checkpoint.
      if is_chief:
        checkpoint_path = os.path.join(checkpoint_dir, "neural_gpu.ckpt")
        model.saver.save(sess, checkpoint_path,
                         global_step=model.global_step)

        # Run evaluation.
        bin_bound = 4
        for p in FLAGS.problem.split("-"):
          total_loss, total_err, tl_counter = 0.0, 0.0, 0
          for bin_id in xrange(len(data.bins)):
            if bin_id < bin_bound or bin_id % FLAGS.eval_bin_print == 1:
              err, _, loss = single_test(bin_id, model, sess, FLAGS.nprint,
                                         batch_size * 4, dev_set, p,
                                         beam_model=beam_model)
              if loss > 0.0:
                total_loss += loss
                total_err += err
                tl_counter += 1
          test_loss = total_loss / max(1, tl_counter)
          test_err = total_err / max(1, tl_counter)
          test_perp = data.safe_exp(test_loss)
          summary = tf.Summary()
          summary.value.extend(
              [tf.Summary.Value(tag="test/%s/loss" % p, simple_value=test_loss),
               tf.Summary.Value(tag="test/%s/error" % p, simple_value=test_err),
               tf.Summary.Value(tag="test/%s/perplexity" % p,
                                simple_value=test_perp)])
          sv.SummaryComputed(sess, summary, global_step)
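
# Hedged sketch of the learning-rate control above: the lr decays whenever
# the checkpoint perplexity is worse than all of the last 5 checkpoints.
prev_acc_perp_demo = [1000000.0] * 5   # hypothetical history
def _should_decay(acc_perp, history):
  return acc_perp > max(history[-5:])

assert not _should_decay(50.0, prev_acc_perp_demo)    # beats history
prev_acc_perp_demo += [40.0, 41.0, 39.0, 38.0, 42.0]
assert _should_decay(43.0, prev_acc_perp_demo)        # worse than last 5
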
    def __init__(self,
                 nmaps,
                 vec_size,
                 niclass,
                 noclass,
                 dropout,
                 max_grad_norm,
                 cutoff,
                 nconvs,
                 kw,
                 kh,
                 height,
                 mem_size,
                 learning_rate,
                 min_length,
                 num_gpus,
                 num_replicas,
                 grad_noise_scale,
                 sampling_rate,
                 act_noise=0.0,
                 do_rnn=False,
                 atrous=False,
                 beam_size=1,
                 backward=True,
                 do_layer_norm=False,
                 autoenc_decay=1.0):
        # Feeds for parameters and ops to update them.
        self.nmaps = nmaps
        if backward:
            self.global_step = tf.Variable(0,
                                           trainable=False,
                                           name="global_step")
            self.cur_length = tf.Variable(min_length, trainable=False)
            self.cur_length_incr_op = self.cur_length.assign_add(1)
            self.lr = tf.Variable(learning_rate, trainable=False)
            self.lr_decay_op = self.lr.assign(self.lr * 0.995)
        self.do_training = tf.placeholder(tf.float32, name="do_training")
        self.update_mem = tf.placeholder(tf.int32, name="update_mem")
        self.noise_param = tf.placeholder(tf.float32, name="noise_param")

        self.input = tf.placeholder(tf.int32, name="inp")
        self.target = tf.placeholder(tf.int32, name="tgt")
        self.prev_step = tf.placeholder(tf.float32, name="prev_step")
        gpu_input = tf.split(axis=0,
                             num_or_size_splits=num_gpus,
                             value=self.input)
        gpu_target = tf.split(axis=0,
                              num_or_size_splits=num_gpus,
                              value=self.target)
        gpu_prev_step = tf.split(axis=0,
                                 num_or_size_splits=num_gpus,
                                 value=self.prev_step)
        batch_size = tf.shape(gpu_input[0])[0]

        if backward:
            adam_lr = 0.005 * self.lr
            adam = tf.train.AdamOptimizer(adam_lr, epsilon=1e-3)

            def adam_update(grads):
                return adam.apply_gradients(zip(grads,
                                                tf.trainable_variables()),
                                            global_step=self.global_step,
                                            name="adam_update")

        # When switching from Adam to SGD we perform reverse-decay.
        if backward:
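            # Reverse decay: self.sampling ramps from 0.05 * sampling_rate
            # toward sampling_rate, doubling every 100k steps until the 0.05
            # floor on sampling_decay caps it.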
            global_step_float = tf.cast(self.global_step, tf.float32)
            sampling_decay_exponent = global_step_float / 100000.0
            sampling_decay = tf.maximum(0.05,
                                        tf.pow(0.5, sampling_decay_exponent))
            self.sampling = sampling_rate * 0.05 / sampling_decay
        else:
            self.sampling = tf.constant(0.0)

        # Cache variables on cpu if needed.
        if num_replicas > 1 or num_gpus > 1:
            with tf.device("/cpu:0"):
                caching_const = tf.constant(0)
            tf.get_variable_scope().set_caching_device(caching_const.op.device)

        def gpu_avg(l):
            if l[0] is None:
                for elem in l:
                    assert elem is None
                return 0.0
            if len(l) < 2:
                return l[0]
            return sum(l) / float(num_gpus)

        self.length_tensor = tf.placeholder(tf.int32, name="length")

        with tf.device("/cpu:0"):
            emb_weights = tf.get_variable(
                "embedding", [niclass, vec_size],
                initializer=tf.random_uniform_initializer(-1.7, 1.7))
            if beam_size > 0:
                target_emb_weights = tf.get_variable(
                    "target_embedding", [noclass, nmaps],
                    initializer=tf.random_uniform_initializer(-1.7, 1.7))
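            # Zero out the embedding of the padding symbol (id 0).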
            e0 = tf.scatter_update(emb_weights,
                                   tf.constant(0, dtype=tf.int32, shape=[1]),
                                   tf.zeros([1, vec_size]))
            output_w = tf.get_variable("output_w", [nmaps, noclass],
                                       tf.float32)

        def conv_rate(layer):
            if atrous:
                return 2**layer
            return 1

        # pylint: disable=cell-var-from-loop
        def enc_step(step):
            """Encoder step."""
            if autoenc_decay < 1.0:
                quant_step = autoenc_quantize(step, 16, nmaps,
                                              self.do_training)
                if backward:
                    exp_glob = tf.train.exponential_decay(
                        1.0, self.global_step - 10000, 1000, autoenc_decay)
                    dec_factor = 1.0 - exp_glob  # * self.do_training
                    dec_factor = tf.cond(tf.less(self.global_step, 10500),
                                         lambda: tf.constant(0.05),
                                         lambda: dec_factor)
                else:
                    dec_factor = 1.0
                cur = tf.cond(tf.less(tf.random_uniform([]), dec_factor),
                              lambda: quant_step, lambda: step)
            else:
                cur = step
            if dropout > 0.0001:
                cur = tf.nn.dropout(cur, keep_prob)
            if act_noise > 0.00001:
                cur += tf.truncated_normal(tf.shape(cur)) * act_noise_scale
            # Do nconvs-many CGRU steps.
            if do_jit and tf.get_variable_scope().reuse:
                with jit_scope():
                    for layer in range(nconvs):
                        cur = conv_gru([], cur, kw, kh, nmaps,
                                       conv_rate(layer), cutoff,
                                       "ecgru_%d" % layer, do_layer_norm)
            else:
                for layer in range(nconvs):
                    cur = conv_gru([], cur, kw, kh, nmaps, conv_rate(layer),
                                   cutoff, "ecgru_%d" % layer, do_layer_norm)
            return cur

        zero_tgt = tf.zeros([batch_size, nmaps, 1])
        zero_tgt.set_shape([None, nmaps, 1])

        def dec_substep(step, decided):
            """Decoder sub-step."""
            cur = step
            if dropout > 0.0001:
                cur = tf.nn.dropout(cur, keep_prob)
            if act_noise > 0.00001:
                cur += tf.truncated_normal(tf.shape(cur)) * act_noise_scale
            # Do nconvs-many CGRU steps.
            if do_jit and tf.get_variable_scope().reuse:
                with jit_scope():
                    for layer in range(nconvs):
                        cur = conv_gru([decided], cur, kw, kh, nmaps,
                                       conv_rate(layer), cutoff,
                                       "dcgru_%d" % layer, do_layer_norm)
            else:
                for layer in range(nconvs):
                    cur = conv_gru([decided], cur, kw, kh, nmaps,
                                   conv_rate(layer), cutoff,
                                   "dcgru_%d" % layer, do_layer_norm)
            return cur

        # pylint: enable=cell-var-from-loop

        def dec_step(step, it, it_int, decided, output_ta, tgts, mloss,
                     nupd_in, out_idx, beam_cost):
            """Decoder step."""
            nupd, mem_loss = 0, 0.0
            if mem_size > 0:
                it_incr = tf.minimum(it + 1, length - 1)
                mem, mem_loss, nupd = memory_run(
                    step, nmaps, mem_size, batch_size, noclass,
                    self.global_step, self.do_training, self.update_mem, 10,
                    num_gpus, target_emb_weights, output_w, gpu_targets_tn,
                    it_incr)
            step = dec_substep(step, decided)
            output_l = tf.expand_dims(tf.expand_dims(step[:, it, 0, :], 1), 1)
            # Calculate argmax output.
            output = tf.reshape(output_l, [-1, nmaps])
            # pylint: disable=cell-var-from-loop
            output = tf.matmul(output, output_w)
            if beam_size > 1:
                beam_cost, output, out, reordered = reorder_beam(
                    beam_size, batch_size, beam_cost, output, it_int == 0,
                    [output_l, out_idx, step, decided])
                [output_l, out_idx, step, decided] = reordered
            else:
                # Scheduled sampling.
                out = tf.multinomial(tf.stop_gradient(output), 1)
                out = tf.to_int32(tf.squeeze(out, [1]))
            out_write = output_ta.write(it, output_l[:batch_size, :, :, :])
            output = tf.gather(target_emb_weights, out)
            output = tf.reshape(output, [-1, 1, nmaps])
            output = tf.concat(axis=1, values=[output] * height)
            tgt = tgts[it, :, :, :]
            selected = tf.cond(tf.less(tf.random_uniform([]), self.sampling),
                               lambda: output, lambda: tgt)
            # pylint: enable=cell-var-from-loop
            dec_write = place_at14(decided, tf.expand_dims(selected, 1), it)
            out_idx = place_at13(
                out_idx, tf.reshape(out, [beam_size * batch_size, 1, 1]), it)
            if mem_size > 0:
                mem = tf.concat(axis=2, values=[mem] * height)
                dec_write = place_at14(dec_write, mem, it_incr)
            return (step, dec_write, out_write, mloss + mem_loss,
                    nupd_in + nupd, out_idx, beam_cost)

        # Main model construction.
        gpu_outputs = []
        gpu_losses = []
        gpu_grad_norms = []
        grads_list = []
        gpu_out_idx = []
        self.after_enc_step = []
        # Multi-GPU towers; gradients are averaged across GPUs later.
        for gpu in range(num_gpus):
            length = self.length_tensor
            length_float = tf.cast(length, tf.float32)
            if gpu > 0:
                tf.get_variable_scope().reuse_variables()
            gpu_outputs.append([])
            gpu_losses.append([])
            gpu_grad_norms.append([])
            with tf.name_scope("gpu%d" % gpu), tf.device("/gpu:%d" % gpu):
                # Main graph creation loop.
                data.print_out("Creating model.")
                start_time = time.time()

                # Embed inputs and calculate mask.
                with tf.device("/cpu:0"):
                    tgt_shape = tf.shape(tf.squeeze(gpu_target[gpu], [1]))
                    weights = tf.where(
                        tf.squeeze(gpu_target[gpu], [1]) > 0,
                        tf.ones(tgt_shape), tf.zeros(tgt_shape))

                    # Embed inputs and targets.
                    with tf.control_dependencies([e0]):
                        start = tf.gather(emb_weights,
                                          gpu_input[gpu])  # b x h x l x nmaps
                        gpu_targets_tn = gpu_target[gpu]  # b x 1 x len
                        if beam_size > 0:
                            embedded_targets_tn = tf.gather(
                                target_emb_weights, gpu_targets_tn)
                            embedded_targets_tn = tf.transpose(
                                embedded_targets_tn,
                                [2, 0, 1, 3])  # len x b x 1 x nmaps
                            embedded_targets_tn = tf.concat(
                                axis=2, values=[embedded_targets_tn] * height)

                # First image comes from start by applying convolution and adding 0s.
                start = tf.transpose(start,
                                     [0, 2, 1, 3])  # Now b x len x h x vec_s
                first = conv_linear(start, 1, 1, vec_size, nmaps, 1, True, 0.0,
                                    "input")
                first = layer_norm(first, nmaps, "input")

                # Computation steps.
                keep_prob = dropout * 3.0 / tf.sqrt(length_float)
                keep_prob = 1.0 - self.do_training * keep_prob
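                # keep_prob is 1.0 at eval time (do_training == 0); during
                # training the drop probability shrinks as 1 / sqrt(length).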
                act_noise_scale = act_noise * self.do_training

                # Start with a convolutional gate merging previous step.
                step = conv_gru([gpu_prev_step[gpu]], first, kw, kh, nmaps, 1,
                                cutoff, "first", do_layer_norm)

                # This is just for running a baseline RNN seq2seq model.
                if do_rnn:
                    self.after_enc_step.append(
                        step)  # Not meaningful here, but needed.

                    def lstm_cell():
                        return tf.contrib.rnn.BasicLSTMCell(height * nmaps)

                    cell = tf.contrib.rnn.MultiRNNCell(
                        [lstm_cell() for _ in range(nconvs)])
                    with tf.variable_scope("encoder"):
                        encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
                            cell,
                            tf.reshape(step,
                                       [batch_size, length, height * nmaps]),
                            dtype=tf.float32,
                            time_major=False)

                    # Attention.
                    attn = tf.layers.dense(encoder_outputs,
                                           height * nmaps,
                                           name="attn1")

                    # pylint: disable=cell-var-from-loop
                    @function.Defun(noinline=True)
                    def attention_query(query, attn_v):
                        vecs = tf.tanh(attn + tf.expand_dims(query, 1))
                        mask = tf.reduce_sum(
                            vecs * tf.reshape(attn_v, [1, 1, -1]), 2)
                        mask = tf.nn.softmax(mask)
                        return tf.reduce_sum(
                            encoder_outputs * tf.expand_dims(mask, 2), 1)

                    with tf.variable_scope("decoder"):

                        def decoder_loop_fn(state_prev_cell_out, _,
                                            cell_inp_cur_tgt):
                            """Decoder loop function."""
                            state, prev_cell_out = state_prev_cell_out
                            cell_inp, cur_tgt = cell_inp_cur_tgt
                            attn_q = tf.layers.dense(prev_cell_out,
                                                     height * nmaps,
                                                     name="attn_query")
                            attn_res = attention_query(
                                attn_q,
                                tf.get_variable(
                                    "attn_v", [height * nmaps],
                                    initializer=tf.random_uniform_initializer(
                                        -0.1, 0.1)))
                            concatenated = tf.reshape(
                                tf.concat(axis=1, values=[cell_inp, attn_res]),
                                [batch_size, 2 * height * nmaps])
                            cell_inp = tf.layers.dense(concatenated,
                                                       height * nmaps,
                                                       name="attn_merge")
                            output, new_state = cell(cell_inp, state)

                            mem_loss = 0.0
                            if mem_size > 0:
                                res, mask, mem_loss = memory_call(
                                    output, cur_tgt, height * nmaps, mem_size,
                                    noclass, num_gpus, self.update_mem)
                                res = tf.gather(target_emb_weights, res)
                                res *= tf.expand_dims(mask[:, 0], 1)
                                output = tf.layers.dense(tf.concat(
                                    axis=1, values=[output, res]),
                                                         height * nmaps,
                                                         name="rnnmem")

                            return new_state, output, mem_loss

                        # pylint: enable=cell-var-from-loop
                        gpu_targets = tf.squeeze(gpu_target[gpu],
                                                 [1])  # b x len
                        gpu_tgt_trans = tf.transpose(gpu_targets, [1, 0])
                        dec_zero = tf.zeros([batch_size, 1], dtype=tf.int32)
                        dec_inp = tf.concat(axis=1,
                                            values=[dec_zero, gpu_targets])
                        dec_inp = dec_inp[:, :length]
                        embedded_dec_inp = tf.gather(target_emb_weights,
                                                     dec_inp)
                        embedded_dec_inp_proj = tf.layers.dense(
                            embedded_dec_inp, height * nmaps, name="dec_proj")
                        embedded_dec_inp_proj = tf.transpose(
                            embedded_dec_inp_proj, [1, 0, 2])
                        init_vals = (encoder_state,
                                     tf.zeros([batch_size,
                                               height * nmaps]), 0.0)
                        _, dec_outputs, mem_losses = tf.scan(
                            decoder_loop_fn,
                            (embedded_dec_inp_proj, gpu_tgt_trans),
                            initializer=init_vals)
                    mem_loss = tf.reduce_mean(mem_losses)
                    outputs = tf.layers.dense(dec_outputs,
                                              nmaps,
                                              name="out_proj")
                    # Final convolution to get logits, list outputs.
                    outputs = tf.matmul(tf.reshape(outputs, [-1, nmaps]),
                                        output_w)
                    outputs = tf.reshape(outputs,
                                         [length, batch_size, noclass])
                    gpu_out_idx.append(tf.argmax(outputs, 2))
                else:  # Here we go with the Neural GPU.
                    # Encoder.
                    enc_length = length
                    step = enc_step(step)  # First step hard-coded.
                    # pylint: disable=cell-var-from-loop
                    i = tf.constant(1)
                    c = lambda i, _s: tf.less(i, enc_length)

                    def enc_step_lambda(i, step):
                        with tf.variable_scope(tf.get_variable_scope(),
                                               reuse=True):
                            new_step = enc_step(step)
                        return (i + 1, new_step)

                    _, step = tf.while_loop(c,
                                            enc_step_lambda, [i, step],
                                            parallel_iterations=1,
                                            swap_memory=True)
                    # pylint: enable=cell-var-from-loop

                    self.after_enc_step.append(step)

                    # Decoder.
                    if beam_size > 0:
                        output_ta = tf.TensorArray(dtype=tf.float32,
                                                   size=length,
                                                   dynamic_size=False,
                                                   infer_shape=False,
                                                   name="outputs")
                        out_idx = tf.zeros([beam_size * batch_size, length, 1],
                                           dtype=tf.int32)
                        decided_t = tf.zeros(
                            [beam_size * batch_size, length, height, vec_size])

                        # Prepare for beam search.
                        tgts = tf.concat(axis=1,
                                         values=[embedded_targets_tn] *
                                         beam_size)
                        beam_cost = tf.zeros([batch_size, beam_size])
                        step = tf.concat(axis=0, values=[step] * beam_size)
                        # First step hard-coded.
                        step, decided_t, output_ta, mem_loss, nupd, oi, bc = dec_step(
                            step, 0, 0, decided_t, output_ta, tgts, 0.0, 0,
                            out_idx, beam_cost)
                        tf.get_variable_scope().reuse_variables()

                        # pylint: disable=cell-var-from-loop
                        def step_lambda(i, step, dec_t, out_ta, ml, nu, oi,
                                        bc):
                            with tf.variable_scope(tf.get_variable_scope(),
                                                   reuse=True):
                                s, d, t, nml, nu, oi, bc = dec_step(
                                    step, i, 1, dec_t, out_ta, tgts, ml, nu,
                                    oi, bc)
                            return (i + 1, s, d, t, nml, nu, oi, bc)

                        i = tf.constant(1)
                        c = lambda i, _s, _d, _o, _ml, _nu, _oi, _bc: tf.less(
                            i, length)
                        _, step, _, output_ta, mem_loss, nupd, out_idx, _ = tf.while_loop(
                            c,
                            step_lambda, [
                                i, step, decided_t, output_ta, mem_loss, nupd,
                                oi, bc
                            ],
                            parallel_iterations=1,
                            swap_memory=True)
                        # pylint: enable=cell-var-from-loop
                        gpu_out_idx.append(tf.squeeze(out_idx, [2]))
                        outputs = output_ta.stack()
                        outputs = tf.squeeze(outputs,
                                             [2, 3])  # Now l x b x nmaps
                    else:
                        # If beam_size is 0 or less, we don't have a decoder.
                        mem_loss = 0.0
                        outputs = tf.transpose(step[:, :, 1, :], [1, 0, 2])
                        gpu_out_idx.append(tf.argmax(outputs, 2))

                    # Final convolution to get logits, list outputs.
                    outputs = tf.matmul(tf.reshape(outputs, [-1, nmaps]),
                                        output_w)
                    outputs = tf.reshape(outputs,
                                         [length, batch_size, noclass])
                gpu_outputs[gpu] = tf.nn.softmax(outputs)

                # Calculate cross-entropy loss and normalize it.
                targets_soft = make_dense(tf.squeeze(gpu_target[gpu], [1]),
                                          noclass, 0.1)
                targets_soft = tf.reshape(targets_soft, [-1, noclass])
                targets_hard = make_dense(tf.squeeze(gpu_target[gpu], [1]),
                                          noclass, 0.0)
                targets_hard = tf.reshape(targets_hard, [-1, noclass])
                output = tf.transpose(outputs, [1, 0, 2])
                xent_soft = tf.reshape(
                    tf.nn.softmax_cross_entropy_with_logits(
                        logits=tf.reshape(output, [-1, noclass]),
                        labels=targets_soft), [batch_size, length])
                xent_hard = tf.reshape(
                    tf.nn.softmax_cross_entropy_with_logits(
                        logits=tf.reshape(output, [-1, noclass]),
                        labels=targets_hard), [batch_size, length])
                low, high = 0.1 / float(noclass - 1), 0.9
                const = high * tf.log(high) + float(noclass -
                                                    1) * low * tf.log(low)
                weight_sum = tf.reduce_sum(weights) + 1e-20
                true_perp = tf.reduce_sum(xent_hard * weights) / weight_sum
                soft_loss = tf.reduce_sum(xent_soft * weights) / weight_sum
                perp_loss = soft_loss + const
                # Final loss: cross-entropy + shared parameter relaxation part + extra.
                mem_loss = 0.5 * tf.reduce_mean(mem_loss) / length_float
                total_loss = perp_loss + mem_loss
                gpu_losses[gpu].append(true_perp)

                # Gradients.
                if backward:
                    data.print_out("Creating backward pass for the model.")
                    grads = tf.gradients(total_loss,
                                         tf.trainable_variables(),
                                         colocate_gradients_with_ops=True)
                    for g_i, g in enumerate(grads):
                        if isinstance(g, tf.IndexedSlices):
                            grads[g_i] = tf.convert_to_tensor(g)
                    grads, norm = tf.clip_by_global_norm(grads, max_grad_norm)
                    gpu_grad_norms[gpu].append(norm)
                    if grad_noise_scale > 0.001:
                        # Rebind the list; adding noise to the loop variable
                        # alone would leave `grads` unchanged.
                        grads = [g + tf.truncated_normal(tf.shape(g)) *
                                 self.noise_param for g in grads]
                    grads_list.append(grads)
                else:
                    gpu_grad_norms[gpu].append(0.0)
                data.print_out("Created model for gpu %d in %.2f s." %
                               (gpu, time.time() - start_time))

        self.updates = []
        self.after_enc_step = tf.concat(
            axis=0, values=self.after_enc_step)  # Concat GPUs.
        if backward:
            tf.get_variable_scope()._reuse = False
            tf.get_variable_scope().set_caching_device(None)
            grads = [
                gpu_avg([grads_list[g][i] for g in range(num_gpus)])
                for i in range(len(grads_list[0]))
            ]
            update = adam_update(grads)
            self.updates.append(update)
        else:
            self.updates.append(tf.no_op())

        self.losses = [
            gpu_avg([gpu_losses[g][i] for g in range(num_gpus)])
            for i in range(len(gpu_losses[0]))
        ]
        self.out_idx = tf.concat(axis=0, values=gpu_out_idx)
        self.grad_norms = [
            gpu_avg([gpu_grad_norms[g][i] for g in range(num_gpus)])
            for i in range(len(gpu_grad_norms[0]))
        ]
        self.outputs = [
            tf.concat(axis=1, values=[gpu_outputs[g] for g in range(num_gpus)])
        ]
        self.quantize_op = quantize_weights_op(512, 8)
        if backward:
            self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)

  def __init__(self, nmaps, vec_size, niclass, noclass, dropout, rx_step,
               max_grad_norm, cutoff, nconvs, kw, kh, height, mode,
               learning_rate, pull, pull_incr, min_length, act_noise=0.0):
    # Feeds for parameters and ops to update them.
    self.global_step = tf.Variable(0, trainable=False)
    self.cur_length = tf.Variable(min_length, trainable=False)
    self.cur_length_incr_op = self.cur_length.assign_add(1)
    self.lr = tf.Variable(float(learning_rate), trainable=False)
    self.lr_decay_op = self.lr.assign(self.lr * 0.98)
    self.pull = tf.Variable(float(pull), trainable=False)
    self.pull_incr_op = self.pull.assign(self.pull * pull_incr)
    self.do_training = tf.placeholder(tf.float32, name="do_training")
    self.noise_param = tf.placeholder(tf.float32, name="noise_param")

    # Feeds for inputs, targets, outputs, losses, etc.
    self.input = []
    self.target = []
    for l in xrange(data_utils.forward_max + 1):
      self.input.append(tf.placeholder(tf.int32, name="inp{0}".format(l)))
      self.target.append(tf.placeholder(tf.int32, name="tgt{0}".format(l)))
    self.outputs = []
    self.losses = []
    self.grad_norms = []
    self.updates = []

    # Computation.
    inp0_shape = tf.shape(self.input[0])
    batch_size = inp0_shape[0]
    with tf.device("/cpu:0"):
      emb_weights = tf.get_variable(
          "embedding", [niclass, vec_size],
          initializer=tf.random_uniform_initializer(-1.7, 1.7))
      e0 = tf.scatter_update(emb_weights,
                             tf.constant(0, dtype=tf.int32, shape=[1]),
                             tf.zeros([1, vec_size]))

    adam = tf.train.AdamOptimizer(self.lr, epsilon=1e-4)

    # Main graph creation loop, for every bin in data_utils.
    self.steps = []
    for length in sorted(list(set(data_utils.bins + [data_utils.forward_max]))):
      data_utils.print_out("Creating model for bin of length %d." % length)
      start_time = time.time()
      if length > data_utils.bins[0]:
        tf.get_variable_scope().reuse_variables()

      # Embed inputs and calculate mask.
      with tf.device("/cpu:0"):
        with tf.control_dependencies([e0]):
          embedded = [tf.nn.embedding_lookup(emb_weights, self.input[l])
                      for l in xrange(length)]
        # Mask to 0-out padding space in each step.
        imask = [check_for_zero(self.input[l]) for l in xrange(length)]
        omask = [check_for_zero(self.target[l]) for l in xrange(length)]
        mask = [1.0 - (imask[i] * omask[i]) for i in xrange(length)]
        mask = [tf.reshape(m, [-1, 1]) for m in mask]
        # Use a shifted mask for step scaling and concatenated for weights.
        shifted_mask = mask + [tf.zeros_like(mask[0])]
        scales = [shifted_mask[i] * (1.0 - shifted_mask[i+1])
                  for i in xrange(length)]
        scales = [tf.reshape(s, [-1, 1, 1, 1]) for s in scales]
        mask = tf.concat(1, mask[0:length])  # batch x length
        weights = mask
        # Add a height dimension to mask to use later for masking.
        mask = tf.reshape(mask, [-1, length, 1, 1])
        mask = tf.concat(2, [mask for _ in xrange(height)]) + tf.zeros(
            tf.pack([batch_size, length, height, nmaps]), dtype=tf.float32)

      # Start is a length-list of batch-by-nmaps tensors, reshape and concat.
      start = [tf.tanh(embedded[l]) for l in xrange(length)]
      start = [tf.reshape(start[l], [-1, 1, nmaps]) for l in xrange(length)]
      start = tf.reshape(tf.concat(1, start), [-1, length, 1, nmaps])

      # First image comes from start by applying one convolution and adding 0s.
      first = conv_linear(start, 1, 1, vec_size, nmaps, True, 0.0, "input")
      first = [first] + [tf.zeros(tf.pack([batch_size, length, 1, nmaps]),
                                  dtype=tf.float32) for _ in xrange(height - 1)]
      first = tf.concat(2, first)

      # Computation steps.
      keep_prob = 1.0 - self.do_training * (dropout * 8.0 / float(length))
      step = [tf.nn.dropout(first, keep_prob) * mask]
      act_noise_scale = act_noise * self.do_training * self.pull
      outputs = []
      for it in xrange(length):
        with tf.variable_scope("RX%d" % (it % rx_step)) as vs:
          if it >= rx_step:
            vs.reuse_variables()
          cur = step[it]
          # Do nconvs-many CGRU steps.
          for layer in xrange(nconvs):
            cur = conv_gru([], cur, kw, kh, nmaps, cutoff, "cgru_%d" % layer)
            cur *= mask
          outputs.append(tf.slice(cur, [0, 0, 0, 0], [-1, -1, 1, -1]))
          cur = tf.nn.dropout(cur, keep_prob)
          if act_noise > 0.00001:
            cur += tf.truncated_normal(tf.shape(cur)) * act_noise_scale
          step.append(cur * mask)

      self.steps.append([tf.reshape(s, [-1, length, height * nmaps])
                         for s in step])
      # Output is the n-th step output; n = current length, as in scales.
      output = tf.add_n([outputs[i] * scales[i] for i in xrange(length)])
      # Final convolution to get logits, list outputs.
      output = conv_linear(output, 1, 1, nmaps, noclass, True, 0.0, "output")
      output = tf.reshape(output, [-1, length, noclass])
      external_output = [tf.reshape(o, [-1, noclass])
                         for o in list(tf.split(1, length, output))]
      external_output = [tf.nn.softmax(o) for o in external_output]
      self.outputs.append(external_output)

      # Calculate cross-entropy loss and normalize it.
      targets = tf.concat(1, [make_dense(self.target[l], noclass)
                              for l in xrange(length)])
      targets = tf.reshape(targets, [-1, noclass])
      xent = tf.reshape(tf.nn.softmax_cross_entropy_with_logits(
          tf.reshape(output, [-1, noclass]), targets), [-1, length])
      perp_loss = tf.reduce_sum(xent * weights)
      perp_loss /= tf.cast(batch_size, dtype=tf.float32)
      perp_loss /= length

      # Final loss: cross-entropy + shared parameter relaxation part.
      relax_dist, self.avg_op = relaxed_distance(rx_step)
      total_loss = perp_loss + relax_dist * self.pull
      self.losses.append(perp_loss)

      # Gradients and Adam update operation.
      if length == data_utils.bins[0] or (mode == 0 and
                                          length < data_utils.bins[-1] + 1):
        data_utils.print_out("Creating backward for bin of length %d." % length)
        params = tf.trainable_variables()
        grads = tf.gradients(total_loss, params)
        grads, norm = tf.clip_by_global_norm(grads, max_grad_norm)
        self.grad_norms.append(norm)
        # Rebind the list entries; adding noise to the loop variable alone
        # would leave `grads` unchanged.
        grads = [g + tf.truncated_normal(tf.shape(g)) * self.noise_param
                 if isinstance(g, tf.Tensor) else g for g in grads]
        update = adam.apply_gradients(zip(grads, params),
                                      global_step=self.global_step)
        self.updates.append(update)
      data_utils.print_out("Created model for bin of length %d in"
                           " %.2f s." % (length, time.time() - start_time))
    self.saver = tf.train.Saver(tf.all_variables())
  def __init__(self, nmaps, vec_size, niclass, noclass, dropout,
               max_grad_norm, cutoff, nconvs, kw, kh, height, mem_size,
               learning_rate, min_length, num_gpus, num_replicas,
               grad_noise_scale, sampling_rate, act_noise=0.0, do_rnn=False,
               atrous=False, beam_size=1, backward=True, do_layer_norm=False,
               autoenc_decay=1.0):
    # Feeds for parameters and ops to update them.
    self.nmaps = nmaps
    if backward:
      self.global_step = tf.Variable(0, trainable=False, name="global_step")
      self.cur_length = tf.Variable(min_length, trainable=False)
      self.cur_length_incr_op = self.cur_length.assign_add(1)
      self.lr = tf.Variable(learning_rate, trainable=False)
      self.lr_decay_op = self.lr.assign(self.lr * 0.995)
    self.do_training = tf.placeholder(tf.float32, name="do_training")
    self.update_mem = tf.placeholder(tf.int32, name="update_mem")
    self.noise_param = tf.placeholder(tf.float32, name="noise_param")

    # Feeds for inputs, targets, outputs, losses, etc.
    self.input = tf.placeholder(tf.int32, name="inp")
    self.target = tf.placeholder(tf.int32, name="tgt")
    self.prev_step = tf.placeholder(tf.float32, name="prev_step")
    gpu_input = tf.split(axis=0, num_or_size_splits=num_gpus, value=self.input)
    gpu_target = tf.split(axis=0, num_or_size_splits=num_gpus, value=self.target)
    gpu_prev_step = tf.split(axis=0, num_or_size_splits=num_gpus, value=self.prev_step)
    batch_size = tf.shape(gpu_input[0])[0]

    if backward:
      adam_lr = 0.005 * self.lr
      adam = tf.train.AdamOptimizer(adam_lr, epsilon=1e-3)

      def adam_update(grads):
        return adam.apply_gradients(zip(grads, tf.trainable_variables()),
                                    global_step=self.global_step,
                                    name="adam_update")

    # When switching from Adam to SGD we perform reverse-decay.
    if backward:
      global_step_float = tf.cast(self.global_step, tf.float32)
      sampling_decay_exponent = global_step_float / 100000.0
      sampling_decay = tf.maximum(0.05, tf.pow(0.5, sampling_decay_exponent))
      self.sampling = sampling_rate * 0.05 / sampling_decay
    else:
      self.sampling = tf.constant(0.0)
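    # Concretely: at step 0 the decay is 1.0, so sampling is 5% of
    # sampling_rate; it doubles every 100k steps and reaches the full
    # sampling_rate once the decay floors at 0.05 (around step 432k).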

    # Cache variables on cpu if needed.
    if num_replicas > 1 or num_gpus > 1:
      with tf.device("/cpu:0"):
        caching_const = tf.constant(0)
      tf.get_variable_scope().set_caching_device(caching_const.op.device)
      # partitioner = tf.variable_axis_size_partitioner(1024*256*4)
      # tf.get_variable_scope().set_partitioner(partitioner)

    def gpu_avg(l):
      if l[0] is None:
        for elem in l:
          assert elem is None
        return 0.0
      if len(l) < 2:
        return l[0]
      return sum(l) / float(num_gpus)

    self.length_tensor = tf.placeholder(tf.int32, name="length")

    with tf.device("/cpu:0"):
      emb_weights = tf.get_variable(
          "embedding", [niclass, vec_size],
          initializer=tf.random_uniform_initializer(-1.7, 1.7))
      if beam_size > 0:
        target_emb_weights = tf.get_variable(
            "target_embedding", [noclass, nmaps],
            initializer=tf.random_uniform_initializer(-1.7, 1.7))
      e0 = tf.scatter_update(emb_weights,
                             tf.constant(0, dtype=tf.int32, shape=[1]),
                             tf.zeros([1, vec_size]))
      output_w = tf.get_variable("output_w", [nmaps, noclass], tf.float32)

    def conv_rate(layer):
      if atrous:
        return 2**layer
      return 1

    # pylint: disable=cell-var-from-loop
    def enc_step(step):
      """Encoder step."""
      if autoenc_decay < 1.0:
        quant_step = autoenc_quantize(step, 16, nmaps, self.do_training)
        if backward:
          exp_glob = tf.train.exponential_decay(1.0, self.global_step - 10000,
                                                1000, autoenc_decay)
          dec_factor = 1.0 - exp_glob  # * self.do_training
          dec_factor = tf.cond(tf.less(self.global_step, 10500),
                               lambda: tf.constant(0.05), lambda: dec_factor)
        else:
          dec_factor = 1.0
        cur = tf.cond(tf.less(tf.random_uniform([]), dec_factor),
                      lambda: quant_step, lambda: step)
      else:
        cur = step
      if dropout > 0.0001:
        cur = tf.nn.dropout(cur, keep_prob)
      if act_noise > 0.00001:
        cur += tf.truncated_normal(tf.shape(cur)) * act_noise_scale
      # Do nconvs-many CGRU steps.
      if do_jit and tf.get_variable_scope().reuse:
        with jit_scope():
          for layer in xrange(nconvs):
            cur = conv_gru([], cur, kw, kh, nmaps, conv_rate(layer),
                           cutoff, "ecgru_%d" % layer, do_layer_norm)
      else:
        for layer in xrange(nconvs):
          cur = conv_gru([], cur, kw, kh, nmaps, conv_rate(layer),
                         cutoff, "ecgru_%d" % layer, do_layer_norm)
      return cur

    zero_tgt = tf.zeros([batch_size, nmaps, 1])
    zero_tgt.set_shape([None, nmaps, 1])

    def dec_substep(step, decided):
      """Decoder sub-step."""
      cur = step
      if dropout > 0.0001:
        cur = tf.nn.dropout(cur, keep_prob)
      if act_noise > 0.00001:
        cur += tf.truncated_normal(tf.shape(cur)) * act_noise_scale
      # Do nconvs-many CGRU steps.
      if do_jit and tf.get_variable_scope().reuse:
        with jit_scope():
          for layer in xrange(nconvs):
            cur = conv_gru([decided], cur, kw, kh, nmaps, conv_rate(layer),
                           cutoff, "dcgru_%d" % layer, do_layer_norm)
      else:
        for layer in xrange(nconvs):
          cur = conv_gru([decided], cur, kw, kh, nmaps, conv_rate(layer),
                         cutoff, "dcgru_%d" % layer, do_layer_norm)
      return cur
    # pylint: enable=cell-var-from-loop

    def dec_step(step, it, it_int, decided, output_ta, tgts,
                 mloss, nupd_in, out_idx, beam_cost):
      """Decoder step."""
      nupd, mem_loss = 0, 0.0
      if mem_size > 0:
        it_incr = tf.minimum(it+1, length - 1)
        mem, mem_loss, nupd = memory_run(
            step, nmaps, mem_size, batch_size, noclass, self.global_step,
            self.do_training, self.update_mem, 10, num_gpus,
            target_emb_weights, output_w, gpu_targets_tn, it_incr)
      step = dec_substep(step, decided)
      output_l = tf.expand_dims(tf.expand_dims(step[:, it, 0, :], 1), 1)
      # Calculate argmax output.
      output = tf.reshape(output_l, [-1, nmaps])
      # pylint: disable=cell-var-from-loop
      output = tf.matmul(output, output_w)
      if beam_size > 1:
        beam_cost, output, out, reordered = reorder_beam(
            beam_size, batch_size, beam_cost, output, it_int == 0,
            [output_l, out_idx, step, decided])
        [output_l, out_idx, step, decided] = reordered
      else:
        # Scheduled sampling.
        out = tf.multinomial(tf.stop_gradient(output), 1)
        out = tf.to_int32(tf.squeeze(out, [1]))
      out_write = output_ta.write(it, output_l[:batch_size, :, :, :])
      output = tf.gather(target_emb_weights, out)
      output = tf.reshape(output, [-1, 1, nmaps])
      output = tf.concat(axis=1, values=[output] * height)
      tgt = tgts[it, :, :, :]
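      # Scheduled sampling: with probability self.sampling feed back the
      # embedding of the model's own sampled token; otherwise feed the
      # ground-truth target embedding for this position.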
      selected = tf.cond(tf.less(tf.random_uniform([]), self.sampling),
                         lambda: output, lambda: tgt)
      # pylint: enable=cell-var-from-loop
      dec_write = place_at14(decided, tf.expand_dims(selected, 1), it)
      out_idx = place_at13(
          out_idx, tf.reshape(out, [beam_size * batch_size, 1, 1]), it)
      if mem_size > 0:
        mem = tf.concat(axis=2, values=[mem] * height)
        dec_write = place_at14(dec_write, mem, it_incr)
      return (step, dec_write, out_write, mloss + mem_loss, nupd_in + nupd,
              out_idx, beam_cost)

    # Main model construction.
    gpu_outputs = []
    gpu_losses = []
    gpu_grad_norms = []
    grads_list = []
    gpu_out_idx = []
    self.after_enc_step = []
    for gpu in xrange(num_gpus):  # Multi-GPU towers, average gradients later.
      length = self.length_tensor
      length_float = tf.cast(length, tf.float32)
      if gpu > 0:
        tf.get_variable_scope().reuse_variables()
      gpu_outputs.append([])
      gpu_losses.append([])
      gpu_grad_norms.append([])
      with tf.name_scope("gpu%d" % gpu), tf.device("/gpu:%d" % gpu):
        # Main graph creation loop.
        data.print_out("Creating model.")
        start_time = time.time()

        # Embed inputs and calculate mask.
        with tf.device("/cpu:0"):
          tgt_shape = tf.shape(tf.squeeze(gpu_target[gpu], [1]))
          weights = tf.where(tf.squeeze(gpu_target[gpu], [1]) > 0,
                             tf.ones(tgt_shape), tf.zeros(tgt_shape))

          # Embed inputs and targets.
          with tf.control_dependencies([e0]):
            start = tf.gather(emb_weights, gpu_input[gpu])  # b x h x l x nmaps
            gpu_targets_tn = gpu_target[gpu]  # b x 1 x len
            if beam_size > 0:
              embedded_targets_tn = tf.gather(target_emb_weights,
                                              gpu_targets_tn)
              embedded_targets_tn = tf.transpose(
                  embedded_targets_tn, [2, 0, 1, 3])  # len x b x 1 x nmaps
              embedded_targets_tn = tf.concat(axis=2, values=[embedded_targets_tn] * height)

        # First image comes from start by applying convolution and adding 0s.
        start = tf.transpose(start, [0, 2, 1, 3])  # Now b x len x h x vec_s
        first = conv_linear(start, 1, 1, vec_size, nmaps, 1, True, 0.0, "input")
        first = layer_norm(first, nmaps, "input")

        # Computation steps.
        keep_prob = dropout * 3.0 / tf.sqrt(length_float)
        keep_prob = 1.0 - self.do_training * keep_prob
        act_noise_scale = act_noise * self.do_training
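        # Dropout above shrinks as 1/sqrt(length): longer unrolled
        # computations get less perturbation per step. Both dropout and
        # activation noise are gated by do_training, so they vanish at
        # inference time.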

        # Start with a convolutional gate merging previous step.
        step = conv_gru([gpu_prev_step[gpu]], first,
                        kw, kh, nmaps, 1, cutoff, "first", do_layer_norm)

        # This is just for running a baseline RNN seq2seq model.
        if do_rnn:
          self.after_enc_step.append(step)  # Not meaningful here, but needed.
          # Build a fresh cell per layer; reusing one object would tie weights.
          cell = tf.contrib.rnn.MultiRNNCell(
              [tf.contrib.rnn.BasicLSTMCell(height * nmaps)
               for _ in xrange(nconvs)])
          with tf.variable_scope("encoder"):
            encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
                cell, tf.reshape(step, [batch_size, length, height * nmaps]),
                dtype=tf.float32, time_major=False)

          # Attention.
          attn = tf.layers.dense(
              encoder_outputs, height * nmaps, name="attn1")

          # pylint: disable=cell-var-from-loop
          @function.Defun(noinline=True)
          def attention_query(query, attn_v):
            vecs = tf.tanh(attn + tf.expand_dims(query, 1))
            mask = tf.reduce_sum(vecs * tf.reshape(attn_v, [1, 1, -1]), 2)
            mask = tf.nn.softmax(mask)
            return tf.reduce_sum(encoder_outputs * tf.expand_dims(mask, 2), 1)
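          # This is additive (Bahdanau-style) attention: tanh(attn + query) is
          # scored against the learned vector attn_v, softmaxed over encoder
          # positions, and used to average encoder_outputs into one context.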

          with tf.variable_scope("decoder"):
            def decoder_loop_fn((state, prev_cell_out, _), (cell_inp, cur_tgt)):
              """Decoder loop function."""
              attn_q = tf.layers.dense(prev_cell_out, height * nmaps,
                                       name="attn_query")
              attn_res = attention_query(attn_q, tf.get_variable(
                  "attn_v", [height * nmaps],
                  initializer=tf.random_uniform_initializer(-0.1, 0.1)))
              concatenated = tf.reshape(tf.concat(axis=1, values=[cell_inp, attn_res]),
                                        [batch_size, 2 * height * nmaps])
              cell_inp = tf.layers.dense(
                  concatenated, height * nmaps, name="attn_merge")
              output, new_state = cell(cell_inp, state)

              mem_loss = 0.0
              if mem_size > 0:
                res, mask, mem_loss = memory_call(
                    output, cur_tgt, height * nmaps, mem_size, noclass,
                    num_gpus, self.update_mem)
                res = tf.gather(target_emb_weights, res)
                res *= tf.expand_dims(mask[:, 0], 1)
                output = tf.layers.dense(
                    tf.concat(axis=1, values=[output, res]), height * nmaps, name="rnnmem")

              return new_state, output, mem_loss
            # pylint: enable=cell-var-from-loop
            gpu_targets = tf.squeeze(gpu_target[gpu], [1])  # b x len
            gpu_tgt_trans = tf.transpose(gpu_targets, [1, 0])
            dec_zero = tf.zeros([batch_size, 1], dtype=tf.int32)
            dec_inp = tf.concat(axis=1, values=[dec_zero, gpu_targets])
            dec_inp = dec_inp[:, :length]
            embedded_dec_inp = tf.gather(target_emb_weights, dec_inp)
            embedded_dec_inp_proj = tf.layers.dense(
                embedded_dec_inp, height * nmaps, name="dec_proj")
            embedded_dec_inp_proj = tf.transpose(embedded_dec_inp_proj,
                                                 [1, 0, 2])
            init_vals = (encoder_state,
                         tf.zeros([batch_size, height * nmaps]), 0.0)
            _, dec_outputs, mem_losses = tf.scan(
                decoder_loop_fn, (embedded_dec_inp_proj, gpu_tgt_trans),
                initializer=init_vals)
          mem_loss = tf.reduce_mean(mem_losses)
          outputs = tf.layers.dense(dec_outputs, nmaps, name="out_proj")
          # Final convolution to get logits, list outputs.
          outputs = tf.matmul(tf.reshape(outputs, [-1, nmaps]), output_w)
          outputs = tf.reshape(outputs, [length, batch_size, noclass])
          gpu_out_idx.append(tf.argmax(outputs, 2))
        else:  # Here we go with the Neural GPU.
          # Encoder.
          enc_length = length
          step = enc_step(step)  # First step hard-coded.
          # pylint: disable=cell-var-from-loop
          i = tf.constant(1)
          c = lambda i, _s: tf.less(i, enc_length)
          def enc_step_lambda(i, step):
            with tf.variable_scope(tf.get_variable_scope(), reuse=True):
              new_step = enc_step(step)
            return (i + 1, new_step)
          _, step = tf.while_loop(
              c, enc_step_lambda, [i, step],
              parallel_iterations=1, swap_memory=True)
          # pylint: enable=cell-var-from-loop

          self.after_enc_step.append(step)

          # Decoder.
          if beam_size > 0:
            output_ta = tf.TensorArray(
                dtype=tf.float32, size=length, dynamic_size=False,
                infer_shape=False, name="outputs")
            out_idx = tf.zeros([beam_size * batch_size, length, 1],
                               dtype=tf.int32)
            decided_t = tf.zeros([beam_size * batch_size, length,
                                  height, vec_size])

            # Prepare for beam search.
            tgts = tf.concat(axis=1, values=[embedded_targets_tn] * beam_size)
            beam_cost = tf.zeros([batch_size, beam_size])
            step = tf.concat(axis=0, values=[step] * beam_size)
            # First step hard-coded.
            step, decided_t, output_ta, mem_loss, nupd, oi, bc = dec_step(
                step, 0, 0, decided_t, output_ta, tgts, 0.0, 0, out_idx,
                beam_cost)
            tf.get_variable_scope().reuse_variables()
            # pylint: disable=cell-var-from-loop
            def step_lambda(i, step, dec_t, out_ta, ml, nu, oi, bc):
              with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                s, d, t, nml, nu, oi, bc = dec_step(
                    step, i, 1, dec_t, out_ta, tgts, ml, nu, oi, bc)
              return (i + 1, s, d, t, nml, nu, oi, bc)
            i = tf.constant(1)
            c = lambda i, _s, _d, _o, _ml, _nu, _oi, _bc: tf.less(i, length)
            _, step, _, output_ta, mem_loss, nupd, out_idx, _ = tf.while_loop(
                c, step_lambda,
                [i, step, decided_t, output_ta, mem_loss, nupd, oi, bc],
                parallel_iterations=1, swap_memory=True)
            # pylint: enable=cell-var-from-loop
            gpu_out_idx.append(tf.squeeze(out_idx, [2]))
            outputs = output_ta.stack()
            outputs = tf.squeeze(outputs, [2, 3])  # Now l x b x nmaps
          else:
            # If beam_size is 0 or less, we don't have a decoder.
            mem_loss = 0.0
            outputs = tf.transpose(step[:, :, 1, :], [1, 0, 2])
            gpu_out_idx.append(tf.argmax(outputs, 2))

          # Final convolution to get logits, list outputs.
          outputs = tf.matmul(tf.reshape(outputs, [-1, nmaps]), output_w)
          outputs = tf.reshape(outputs, [length, batch_size, noclass])
        gpu_outputs[gpu] = tf.nn.softmax(outputs)

        # Calculate cross-entropy loss and normalize it.
        targets_soft = make_dense(tf.squeeze(gpu_target[gpu], [1]),
                                  noclass, 0.1)
        targets_soft = tf.reshape(targets_soft, [-1, noclass])
        targets_hard = make_dense(tf.squeeze(gpu_target[gpu], [1]),
                                  noclass, 0.0)
        targets_hard = tf.reshape(targets_hard, [-1, noclass])
        output = tf.transpose(outputs, [1, 0, 2])
        xent_soft = tf.reshape(tf.nn.softmax_cross_entropy_with_logits(
            logits=tf.reshape(output, [-1, noclass]), labels=targets_soft),
                               [batch_size, length])
        xent_hard = tf.reshape(tf.nn.softmax_cross_entropy_with_logits(
            logits=tf.reshape(output, [-1, noclass]), labels=targets_hard),
                               [batch_size, length])
        low, high = 0.1 / float(noclass - 1), 0.9
        const = high * tf.log(high) + float(noclass - 1) * low * tf.log(low)
        weight_sum = tf.reduce_sum(weights) + 1e-20
        true_perp = tf.reduce_sum(xent_hard * weights) / weight_sum
        soft_loss = tf.reduce_sum(xent_soft * weights) / weight_sum
        perp_loss = soft_loss + const
        # Final loss: cross-entropy + shared parameter relaxation part + extra.
        mem_loss = 0.5 * tf.reduce_mean(mem_loss) / length_float
        total_loss = perp_loss + mem_loss
        gpu_losses[gpu].append(true_perp)

        # Gradients.
        if backward:
          data.print_out("Creating backward pass for the model.")
          grads = tf.gradients(
              total_loss, tf.trainable_variables(),
              colocate_gradients_with_ops=True)
          for g_i, g in enumerate(grads):
            if isinstance(g, tf.IndexedSlices):
              grads[g_i] = tf.convert_to_tensor(g)
          grads, norm = tf.clip_by_global_norm(grads, max_grad_norm)
          gpu_grad_norms[gpu].append(norm)
          # Rebind into the list; "g +=" alone would leave grads unchanged.
          for g_i, g in enumerate(grads):
            if grad_noise_scale > 0.001:
              grads[g_i] = g + tf.truncated_normal(tf.shape(g)) * self.noise_param
          grads_list.append(grads)
        else:
          gpu_grad_norms[gpu].append(0.0)
        data.print_out("Created model for gpu %d in %.2f s."
                       % (gpu, time.time() - start_time))
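
The loss above is computed against label-smoothed targets (0.9 on the true class, 0.1 spread over the rest), and const adds back the entropy of that smoothed distribution so a perfect prediction scores near zero. A self-contained NumPy sketch of the arithmetic; make_dense below is my stand-in for the helper used above, assuming it builds smoothed one-hot rows:

import numpy as np

def make_dense(labels, noclass, smooth):
  """Assumed behavior: one-hot rows with `smooth` mass spread off the label."""
  dense = np.full((len(labels), noclass), smooth / float(noclass - 1))
  dense[np.arange(len(labels)), labels] = 1.0 - smooth
  return dense

def smoothed_xent(logits, labels, noclass, smooth=0.1):
  targets = make_dense(labels, noclass, smooth)
  logp = logits - np.log(np.sum(np.exp(logits), axis=1, keepdims=True))
  xent = -np.sum(targets * logp, axis=1)
  low, high = smooth / float(noclass - 1), 1.0 - smooth
  # Same constant as above: the (negative) entropy of the smoothed target.
  const = high * np.log(high) + float(noclass - 1) * low * np.log(low)
  return xent + const

# Predicting the smoothed distribution itself gives a loss of ~0.
logits = np.log(make_dense(np.array([2]), 5, 0.1))
print(smoothed_xent(logits, np.array([2]), 5))  # -> [~0.]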
# Example #14
  def __init__(self, nmaps, vec_size, niclass, noclass, dropout, rx_step,
               max_grad_norm, cutoff, nconvs, kw, kh, height, mode,
               learning_rate, pull, pull_incr, min_length, act_noise=0.0):
    # Feeds for parameters and ops to update them.
    self.global_step = tf.Variable(0, trainable=False)
    self.cur_length = tf.Variable(min_length, trainable=False)
    self.cur_length_incr_op = self.cur_length.assign_add(1)
    self.lr = tf.Variable(float(learning_rate), trainable=False)
    self.lr_decay_op = self.lr.assign(self.lr * 0.98)
    self.pull = tf.Variable(float(pull), trainable=False)
    self.pull_incr_op = self.pull.assign(self.pull * pull_incr)
    self.do_training = tf.placeholder(tf.float32, name="do_training")
    self.noise_param = tf.placeholder(tf.float32, name="noise_param")

    # Feeds for inputs, targets, outputs, losses, etc.
    self.input = []
    self.target = []
    for l in xrange(data_utils.forward_max + 1):
      self.input.append(tf.placeholder(tf.int32, name="inp{0}".format(l)))
      self.target.append(tf.placeholder(tf.int32, name="tgt{0}".format(l)))
    self.outputs = []
    self.losses = []
    self.grad_norms = []
    self.updates = []

    # Computation.
    inp0_shape = tf.shape(self.input[0])
    batch_size = inp0_shape[0]
    with tf.device("/cpu:0"):
      emb_weights = tf.get_variable(
          "embedding", [niclass, vec_size],
          initializer=tf.random_uniform_initializer(-1.7, 1.7))
      e0 = tf.scatter_update(emb_weights,
                             tf.constant(0, dtype=tf.int32, shape=[1]),
                             tf.zeros([1, vec_size]))

    adam = tf.train.AdamOptimizer(self.lr, epsilon=1e-4)

    # Main graph creation loop, for every bin in data_utils.
    self.steps = []
    for length in sorted(list(set(data_utils.bins + [data_utils.forward_max]))):
      data_utils.print_out("Creating model for bin of length %d." % length)
      start_time = time.time()
      if length > data_utils.bins[0]:
        tf.get_variable_scope().reuse_variables()

      # Embed inputs and calculate mask.
      with tf.device("/cpu:0"):
        with tf.control_dependencies([e0]):
          embedded = [tf.nn.embedding_lookup(emb_weights, self.input[l])
                      for l in xrange(length)]
        # Mask to 0-out padding space in each step.
        imask = [check_for_zero(self.input[l]) for l in xrange(length)]
        omask = [check_for_zero(self.target[l]) for l in xrange(length)]
        mask = [1.0 - (imask[i] * omask[i]) for i in xrange(length)]
        mask = [tf.reshape(m, [-1, 1]) for m in mask]
        # Use a shifted mask for step scaling and concatenated for weights.
        shifted_mask = mask + [tf.zeros_like(mask[0])]
        scales = [shifted_mask[i] * (1.0 - shifted_mask[i+1])
                  for i in xrange(length)]
        scales = [tf.reshape(s, [-1, 1, 1, 1]) for s in scales]
        mask = tf.concat(axis=1, values=mask[0:length])  # batch x length
        weights = mask
        # Add a height dimension to mask to use later for masking.
        mask = tf.reshape(mask, [-1, length, 1, 1])
        mask = tf.concat(axis=2, values=[mask for _ in xrange(height)]) + tf.zeros(
            tf.stack([batch_size, length, height, nmaps]), dtype=tf.float32)

      # Start is a length-list of batch-by-nmaps tensors, reshape and concat.
      start = [tf.tanh(embedded[l]) for l in xrange(length)]
      start = [tf.reshape(start[l], [-1, 1, nmaps]) for l in xrange(length)]
      start = tf.reshape(tf.concat(axis=1, values=start), [-1, length, 1, nmaps])

      # First image comes from start by applying one convolution and adding 0s.
      first = conv_linear(start, 1, 1, vec_size, nmaps, True, 0.0, "input")
      first = [first] + [tf.zeros(tf.stack([batch_size, length, 1, nmaps]),
                                  dtype=tf.float32) for _ in xrange(height - 1)]
      first = tf.concat(axis=2, values=first)

      # Computation steps.
      keep_prob = 1.0 - self.do_training * (dropout * 8.0 / float(length))
      step = [tf.nn.dropout(first, keep_prob) * mask]
      act_noise_scale = act_noise * self.do_training * self.pull
      outputs = []
      for it in xrange(length):
        with tf.variable_scope("RX%d" % (it % rx_step)) as vs:
          if it >= rx_step:
            vs.reuse_variables()
          cur = step[it]
          # Do nconvs-many CGRU steps.
          for layer in xrange(nconvs):
            cur = conv_gru([], cur, kw, kh, nmaps, cutoff, "cgru_%d" % layer)
            cur *= mask
          outputs.append(tf.slice(cur, [0, 0, 0, 0], [-1, -1, 1, -1]))
          cur = tf.nn.dropout(cur, keep_prob)
          if act_noise > 0.00001:
            cur += tf.truncated_normal(tf.shape(cur)) * act_noise_scale
          step.append(cur * mask)

      self.steps.append([tf.reshape(s, [-1, length, height * nmaps])
                         for s in step])
      # Output is the n-th step output; n = current length, as in scales.
      output = tf.add_n([outputs[i] * scales[i] for i in xrange(length)])
      # Final convolution to get logits, list outputs.
      output = conv_linear(output, 1, 1, nmaps, noclass, True, 0.0, "output")
      output = tf.reshape(output, [-1, length, noclass])
      external_output = [tf.reshape(o, [-1, noclass])
                         for o in tf.split(axis=1, num_or_size_splits=length,
                                           value=output)]
      external_output = [tf.nn.softmax(o) for o in external_output]
      self.outputs.append(external_output)

      # Calculate cross-entropy loss and normalize it.
      targets = tf.concat(axis=1, values=[make_dense(self.target[l], noclass)
                                          for l in xrange(length)])
      targets = tf.reshape(targets, [-1, noclass])
      xent = tf.reshape(tf.nn.softmax_cross_entropy_with_logits(
          logits=tf.reshape(output, [-1, noclass]), labels=targets), [-1, length])
      perp_loss = tf.reduce_sum(xent * weights)
      perp_loss /= tf.cast(batch_size, dtype=tf.float32)
      perp_loss /= length

      # Final loss: cross-entropy + shared parameter relaxation part.
      relax_dist, self.avg_op = relaxed_distance(rx_step)
      total_loss = perp_loss + relax_dist * self.pull
      self.losses.append(perp_loss)

      # Gradients and Adam update operation.
      if length == data_utils.bins[0] or (mode == 0 and
                                          length < data_utils.bins[-1] + 1):
        data_utils.print_out("Creating backward for bin of length %d." % length)
        params = tf.trainable_variables()
        grads = tf.gradients(total_loss, params)
        grads, norm = tf.clip_by_global_norm(grads, max_grad_norm)
        self.grad_norms.append(norm)
        # Rebind into the list; "grad +=" alone would leave grads unchanged.
        for g_i, grad in enumerate(grads):
          if isinstance(grad, tf.Tensor):
            grads[g_i] = grad + tf.truncated_normal(
                tf.shape(grad)) * self.noise_param
        update = adam.apply_gradients(zip(grads, params),
                                      global_step=self.global_step)
        self.updates.append(update)
      data_utils.print_out("Created model for bin of length %d in"
                           " %.2f s." % (length, time.time() - start_time))
    self.saver = tf.train.Saver(tf.global_variables())
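
In the snippet above, scales[i] = shifted_mask[i] * (1 - shifted_mask[i+1]) is nonzero exactly where the padding mask flips from 1 to 0, so tf.add_n([outputs[i] * scales[i] ...]) picks, per example, the step indexed by its own unpadded length. A NumPy illustration of the trick, simplified to an input-only mask:

import numpy as np

inp = np.array([[3, 1, 4, 0, 0],   # 0 marks padding; lengths 3 and 2
                [2, 7, 0, 0, 0]])
mask = (inp > 0).astype(np.float32)                    # batch x length
shifted = np.concatenate([mask, np.zeros((2, 1))], 1)  # append a final zero
scales = shifted[:, :-1] * (1.0 - shifted[:, 1:])      # 1 at last valid slot
print(scales)
# [[0. 0. 1. 0. 0.]
#  [0. 1. 0. 0. 0.]]

# Weighting per-step outputs by scales and summing selects one step per row.
step_index = np.tile(np.arange(5, dtype=np.float32), (2, 1))
print(np.sum(step_index * scales, axis=1))  # [2. 1.]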
# Example #16
def train():
  """Train the model."""
  batch_size = FLAGS.batch_size
  tasks = FLAGS.task.split("-")
  with tf.Session() as sess:
    (model, min_length, max_length, checkpoint_dir,
     curriculum, _) = initialize(sess)
    quant_op = neural_gpu.quantize_weights_op(512, 8)
    max_cur_length = min(min_length + 3, max_length)
    prev_acc_perp = [1000000 for _ in xrange(3)]
    prev_seq_err = 1.0

    # Main training loop.
    while True:
      global_step, pull, max_cur_length, learning_rate = sess.run(
          [model.global_step, model.pull, model.cur_length, model.lr])
      acc_loss, acc_total, acc_errors, acc_seq_err = 0.0, 0, 0, 0
      acc_grad_norm, step_count, step_time = 0.0, 0, 0.0
      for _ in xrange(FLAGS.steps_per_checkpoint):
        global_step += 1
        task = random.choice(tasks)

        # Select the length for curriculum learning.
        l = np.random.randint(max_cur_length - min_length + 1) + min_length
        # Prefer longer stuff 60% of time.
        if np.random.randint(100) < 60:
          l1 = np.random.randint(max_cur_length - min_length+1) + min_length
          l = max(l, l1)
        # Mixed curriculum learning: in 25% of cases go to any larger length.
        if np.random.randint(100) < 25:
          l1 = np.random.randint(max_length - min_length + 1) + min_length
          l = max(l, l1)

        # Run a step and time it.
        start_time = time.time()
        inp, target = data.get_batch(l, batch_size, True, task)
        noise_param = math.sqrt(math.pow(global_step, -0.55) *
                                prev_seq_err) * FLAGS.grad_noise_scale
        loss, res, gnorm, _ = model.step(sess, inp, target, True, noise_param)
        step_time += time.time() - start_time
        acc_grad_norm += float(gnorm)

        # Accumulate statistics only if we did not exceed curriculum length.
        if l < max_cur_length + 1:
          step_count += 1
          acc_loss += loss
          errors, total, seq_err = data.accuracy(inp, res, target,
                                                 batch_size, 0)
          acc_total += total
          acc_errors += errors
          acc_seq_err += seq_err

      # Normalize and print out accumulated statistics.
      acc_loss /= step_count
      step_time /= FLAGS.steps_per_checkpoint
      acc_seq_err = float(acc_seq_err) / (step_count * batch_size)
      prev_seq_err = max(0.0, acc_seq_err - 0.02)  # No noise at error < 2%.
      acc_errors = float(acc_errors) / acc_total if acc_total > 0 else 1.0
      msg1 = "step %d step-time %.2f" % (global_step, step_time)
      msg2 = "lr %.8f pull %.3f" % (learning_rate, pull)
      msg3 = ("%s %s grad-norm %.8f"
              % (msg1, msg2, acc_grad_norm / FLAGS.steps_per_checkpoint))
      data.print_out("%s len %d ppx %.8f errors %.2f sequence-errors %.2f" %
                     (msg3, max_cur_length, data.safe_exp(acc_loss),
                      100*acc_errors, 100*acc_seq_err))

      # If errors are below the curriculum threshold, move curriculum forward.
      if curriculum > acc_seq_err:
        if FLAGS.quantize:
          # Quantize weights.
          data.print_out("  Quantizing parameters.")
          sess.run([quant_op])
        # Increase current length (until the next with training data).
        do_incr = True
        while do_incr and max_cur_length < max_length:
          sess.run(model.cur_length_incr_op)
          for t in tasks:
            if data.train_set[t]: do_incr = False
        # Forget last perplexities if we're not yet at the end.
        if max_cur_length < max_length:
          prev_acc_perp.append(1000000)
        # Either increase pull or, if it's large, average parameters.
        if pull < 0.1:
          sess.run(model.pull_incr_op)
        else:
          data.print_out("  Averaging parameters.")
          sess.run(model.avg_op)
          if acc_seq_err < (curriculum / 3.0):
            sess.run(model.lr_decay_op)

      # Lower learning rate if we're worse than the last 3 checkpoints.
      acc_perp = data.safe_exp(acc_loss)
      if acc_perp > max(prev_acc_perp[-3:]):
        sess.run(model.lr_decay_op)
      prev_acc_perp.append(acc_perp)

      # Save checkpoint.
      checkpoint_path = os.path.join(checkpoint_dir, "neural_gpu.ckpt")
      model.saver.save(sess, checkpoint_path,
                       global_step=model.global_step)

      # Run evaluation.
      bound = data.bins[-1] + 1
      for t in tasks:
        l = min_length
        while l < max_length + EXTRA_EVAL and l < bound:
          _, seq_err, _ = single_test(l, model, sess, t,
                                      FLAGS.nprint, batch_size)
          l += 1
          while l < bound + 1 and not data.test_set[t][l]:
            l += 1
        if seq_err < 0.05:  # Run larger test if we're good enough.
          _, seq_err = multi_test(data.forward_max, model, sess, t,
                                  FLAGS.nprint, batch_size * 4)
      if seq_err < 0.01:  # Super-large test on 1-task large-forward models.
        if data.forward_max > 4000 and len(tasks) == 1:
          multi_test(data.forward_max, model, sess, tasks[0], FLAGS.nprint,
                     batch_size * 16, 0)
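
The curriculum length selection in train() draws a length within the current bound, takes the max of two draws 60% of the time to prefer longer sequences, and in 25% of cases mixes in a draw up to the full maximum. A pure-Python sketch of the same rules with made-up bounds:

import numpy as np

def sample_length(min_length, max_cur_length, max_length, rng):
  l = rng.randint(max_cur_length - min_length + 1) + min_length
  if rng.randint(100) < 60:  # Prefer longer stuff 60% of time.
    l = max(l, rng.randint(max_cur_length - min_length + 1) + min_length)
  if rng.randint(100) < 25:  # Sometimes jump to any larger length.
    l = max(l, rng.randint(max_length - min_length + 1) + min_length)
  return l

rng = np.random.RandomState(0)
draws = [sample_length(3, 6, 41, rng) for _ in range(10000)]
print("share at curriculum lengths 3-6: %.2f"
      % (sum(1 for d in draws if d <= 6) / float(len(draws))))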
# Example #17
def initialize(sess):
  """Initialize data and model."""
  if FLAGS.jobid >= 0:
    data.log_filename = os.path.join(FLAGS.train_dir, "log%d" % FLAGS.jobid)
  data.print_out("NN ", newline=False)

  # Set random seed.
  seed = FLAGS.random_seed + max(0, FLAGS.jobid)
  tf.set_random_seed(seed)
  random.seed(seed)
  np.random.seed(seed)

  # Check data sizes.
  assert data.bins
  min_length = 3
  max_length = min(FLAGS.max_length, data.bins[-1])
  assert max_length + 1 > min_length
  while len(data.bins) > 1 and data.bins[-2] > max_length + EXTRA_EVAL:
    data.bins = data.bins[:-1]
  assert data.bins[0] > FLAGS.rx_step
  data.forward_max = max(FLAGS.forward_max, data.bins[-1])
  nclass = min(FLAGS.niclass, FLAGS.noclass)
  data_size = FLAGS.train_data_size if FLAGS.mode == 0 else 1000

  # Initialize data for each task.
  tasks = FLAGS.task.split("-")
  for t in tasks:
    for l in xrange(max_length + EXTRA_EVAL - 1):
      data.init_data(t, l, data_size, nclass)
    data.init_data(t, data.bins[-2], data_size, nclass)
    data.init_data(t, data.bins[-1], data_size, nclass)
    end_size = 4 * 1024 if FLAGS.mode > 0 else 1024
    data.init_data(t, data.forward_max, end_size, nclass)

  # Print out parameters.
  curriculum = FLAGS.curriculum_bound
  msg1 = ("layers %d kw %d h %d kh %d relax %d batch %d noise %.2f task %s"
          % (FLAGS.nconvs, FLAGS.kw, FLAGS.height, FLAGS.kh, FLAGS.rx_step,
             FLAGS.batch_size, FLAGS.grad_noise_scale, FLAGS.task))
  msg2 = "data %d %s" % (FLAGS.train_data_size, msg1)
  msg3 = ("cut %.2f pull %.3f lr %.2f iw %.2f cr %.2f nm %d d%.4f gn %.2f %s" %
          (FLAGS.cutoff, FLAGS.pull_incr, FLAGS.lr, FLAGS.init_weight,
           curriculum, FLAGS.nmaps, FLAGS.dropout, FLAGS.max_grad_norm, msg2))
  data.print_out(msg3)

  # Create checkpoint directory if it does not exist.
  checkpoint_dir = os.path.join(FLAGS.train_dir, "neural_gpu%s"
                                % ("" if FLAGS.jobid < 0 else str(FLAGS.jobid)))
  if not gfile.IsDirectory(checkpoint_dir):
    data.print_out("Creating checkpoint directory %s." % checkpoint_dir)
    gfile.MkDir(checkpoint_dir)

  # Create model and initialize it.
  tf.get_variable_scope().set_initializer(
      tf.uniform_unit_scaling_initializer(factor=1.8 * FLAGS.init_weight))
  model = neural_gpu.NeuralGPU(
      FLAGS.nmaps, FLAGS.nmaps, FLAGS.niclass, FLAGS.noclass, FLAGS.dropout,
      FLAGS.rx_step, FLAGS.max_grad_norm, FLAGS.cutoff, FLAGS.nconvs,
      FLAGS.kw, FLAGS.kh, FLAGS.height, FLAGS.mode, FLAGS.lr,
      FLAGS.pull, FLAGS.pull_incr, min_length + 3)
  data.print_out("Created model.")
  sess.run(tf.global_variables_initializer())
  data.print_out("Initialized variables.")

  # Load model from parameters if a checkpoint exists.
  ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
  if ckpt and gfile.Exists(ckpt.model_checkpoint_path):
    data.print_out("Reading model parameters from %s"
                   % ckpt.model_checkpoint_path)
    model.saver.restore(sess, ckpt.model_checkpoint_path)

  # Check if there are ensemble models and get their checkpoints.
  ensemble = []
  ensemble_dir_list = [d for d in FLAGS.ensemble.split(",") if d]
  for ensemble_dir in ensemble_dir_list:
    ckpt = tf.train.get_checkpoint_state(ensemble_dir)
    if ckpt and gfile.Exists(ckpt.model_checkpoint_path):
      data.print_out("Found ensemble model %s" % ckpt.model_checkpoint_path)
      ensemble.append(ckpt.model_checkpoint_path)

  # Return the model and needed variables.
  return (model, min_length, max_length, checkpoint_dir, curriculum, ensemble)
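
The bin trimming in initialize() drops trailing bins until only one extends past max_length + EXTRA_EVAL. A quick illustration with hypothetical bin sizes:

# Hypothetical bins; same trimming rule as in initialize() above.
bins = [8, 12, 16, 20, 32, 64, 128]
max_length, EXTRA_EVAL = 20, 2
while len(bins) > 1 and bins[-2] > max_length + EXTRA_EVAL:
  bins = bins[:-1]
print(bins)  # [8, 12, 16, 20, 32] -- one bin left beyond the eval range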
# Example #18
def initialize(sess=None):
  """Initialize data and model."""
  global MAXLEN_F
  # Create training directory if it does not exist.
  if not tf.gfile.IsDirectory(FLAGS.train_dir):
    data.print_out("Creating training directory %s." % FLAGS.train_dir)
    tf.gfile.MkDir(FLAGS.train_dir)
  decode_suffix = "beam%dln%d" % (FLAGS.beam_size,
                                  int(100 * FLAGS.length_norm))
  if FLAGS.mode == 0:
    decode_suffix = ""
  if FLAGS.task >= 0:
    data.log_filename = os.path.join(FLAGS.train_dir,
                                     "log%d%s" % (FLAGS.task, decode_suffix))
  else:
    data.log_filename = os.path.join(FLAGS.train_dir, "neural_gpu/log")

  # Set random seed.
  if FLAGS.random_seed > 0:
    seed = FLAGS.random_seed + max(0, FLAGS.task)
    tf.set_random_seed(seed)
    random.seed(seed)
    np.random.seed(seed)

  # Check data sizes.
  assert data.bins
  max_length = min(FLAGS.max_length, data.bins[-1])
  while len(data.bins) > 1 and data.bins[-2] >= max_length + EXTRA_EVAL:
    data.bins = data.bins[:-1]
  if sess is None and FLAGS.task == 0 and FLAGS.num_replicas > 1:
    if max_length > 60:
      max_length = max_length // 2  # Save memory on chief.
  min_length = min(14, max_length - 3) if FLAGS.problem == "wmt" else 3
  for p in FLAGS.problem.split("-"):
    if p in ["progeval", "progsynth"]:
      min_length = max(26, min_length)
  assert max_length + 1 > min_length
  while len(data.bins) > 1 and data.bins[-2] >= max_length + EXTRA_EVAL:
    data.bins = data.bins[:-1]

  # Create checkpoint directory if it does not exist.
  if FLAGS.mode == 0 or FLAGS.task < 0:
    checkpoint_dir = os.path.join(FLAGS.train_dir, "neural_gpu%s"
                                  % ("" if FLAGS.task < 0 else str(FLAGS.task)))
  else:
    checkpoint_dir = FLAGS.train_dir
  if not tf.gfile.IsDirectory(checkpoint_dir):
    data.print_out("Creating checkpoint directory %s." % checkpoint_dir)
    tf.gfile.MkDir(checkpoint_dir)

  # Prepare data.
  if FLAGS.problem == "wmt":
    # Prepare WMT data.
    data.print_out("Preparing WMT data in %s" % FLAGS.data_dir)
    if FLAGS.simple_tokenizer:
      MAXLEN_F = 3.5
      (en_train, fr_train, en_dev, fr_dev,
       en_path, fr_path) = wmt.prepare_wmt_data(
           FLAGS.data_dir, FLAGS.vocab_size,
           tokenizer=wmt.space_tokenizer,
           normalize_digits=FLAGS.normalize_digits)
    else:
      (en_train, fr_train, en_dev, fr_dev,
       en_path, fr_path) = wmt.prepare_wmt_data(
           FLAGS.data_dir, FLAGS.vocab_size)

    # Read data into buckets and compute their sizes.
    fr_vocab, rev_fr_vocab = wmt.initialize_vocabulary(fr_path)
    data.vocab = fr_vocab
    data.rev_vocab = rev_fr_vocab
    data.print_out("Reading development and training data (limit: %d)."
                   % FLAGS.max_train_data_size)
    dev_set = {}
    dev_set["wmt"] = read_data(en_dev, fr_dev, data.bins)
    def data_read(size, print_out):
      read_data_into_global(en_train, fr_train, data.bins, size, print_out)
    data_read(50000, False)
    read_thread_small = threading.Thread(
        name="reading-data-small", target=lambda: data_read(900000, False))
    read_thread_small.start()
    read_thread_full = threading.Thread(
        name="reading-data-full",
        target=lambda: data_read(FLAGS.max_train_data_size, True))
    read_thread_full.start()
    data.print_out("Data reading set up.")
  else:
    # Prepare algorithmic data.
    en_path, fr_path = None, None
    tasks = FLAGS.problem.split("-")
    data_size = FLAGS.train_data_size
    for t in tasks:
      data.print_out("Generating data for %s." % t)
      if t in ["progeval", "progsynth"]:
        data.init_data(t, data.bins[-1], 20 * data_size, FLAGS.vocab_size)
        if len(program_utils.prog_vocab) > FLAGS.vocab_size - 2:
          raise ValueError("Increase vocab_size to %d for prog-tasks."
                           % (len(program_utils.prog_vocab) + 2))
        data.rev_vocab = program_utils.prog_vocab
        data.vocab = program_utils.prog_rev_vocab
      else:
        for l in xrange(max_length + EXTRA_EVAL - 1):
          data.init_data(t, l, data_size, FLAGS.vocab_size)
        data.init_data(t, data.bins[-2], data_size, FLAGS.vocab_size)
        data.init_data(t, data.bins[-1], data_size, FLAGS.vocab_size)
      if t not in global_train_set:
        global_train_set[t] = []
      global_train_set[t].append(data.train_set[t])
      calculate_buckets_scale(data.train_set[t], data.bins, t)
    dev_set = data.test_set

  # Grid-search parameters.
  lr = FLAGS.lr
  init_weight = FLAGS.init_weight
  max_grad_norm = FLAGS.max_grad_norm
  if sess is not None and FLAGS.task > -1:
    def job_id_factor(step):
      """If (task // step) mod 3 is 0, 1, 2: return 0, 1, -1."""
      return ((((FLAGS.task // step) % 3) + 1) % 3) - 1
    lr *= math.pow(2, job_id_factor(1))
    init_weight *= math.pow(1.5, job_id_factor(3))
    max_grad_norm *= math.pow(2, job_id_factor(9))

  # Print out parameters.
  curriculum = FLAGS.curriculum_seq
  msg1 = ("layers %d kw %d h %d kh %d batch %d noise %.2f"
          % (FLAGS.nconvs, FLAGS.kw, FLAGS.height, FLAGS.kh,
             FLAGS.batch_size, FLAGS.grad_noise_scale))
  msg2 = ("cut %.2f lr %.3f iw %.2f cr %.2f nm %d d%.4f gn %.2f %s"
          % (FLAGS.cutoff, lr, init_weight, curriculum, FLAGS.nmaps,
             FLAGS.dropout, max_grad_norm, msg1))
  data.print_out(msg2)

  # Create model and initialize it.
  tf.get_variable_scope().set_initializer(
      tf.orthogonal_initializer(gain=1.8 * init_weight))
  max_sampling_rate = FLAGS.max_sampling_rate if FLAGS.mode == 0 else 0.0
  o = FLAGS.vocab_size if FLAGS.max_target_vocab < 1 else FLAGS.max_target_vocab
  ngpu.CHOOSE_K = FLAGS.soft_mem_size
  do_beam_model = FLAGS.train_beam_freq > 0.0001 and FLAGS.beam_size > 1
  beam_size = FLAGS.beam_size if FLAGS.mode > 0 and not do_beam_model else 1
  beam_size = min(beam_size, FLAGS.beam_size)
  beam_model = None
  def make_ngpu(cur_beam_size, back):
    return ngpu.NeuralGPU(
        FLAGS.nmaps, FLAGS.vec_size, FLAGS.vocab_size, o,
        FLAGS.dropout, max_grad_norm, FLAGS.cutoff, FLAGS.nconvs,
        FLAGS.kw, FLAGS.kh, FLAGS.height, FLAGS.mem_size,
        lr / math.sqrt(FLAGS.num_replicas), min_length + 3, FLAGS.num_gpus,
        FLAGS.num_replicas, FLAGS.grad_noise_scale, max_sampling_rate,
        atrous=FLAGS.atrous, do_rnn=FLAGS.rnn_baseline,
        do_layer_norm=FLAGS.layer_norm, beam_size=cur_beam_size, backward=back)
  if sess is None:
    with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks)):
      model = make_ngpu(beam_size, True)
      if do_beam_model:
        tf.get_variable_scope().reuse_variables()
        beam_model = make_ngpu(FLAGS.beam_size, False)
  else:
    model = make_ngpu(beam_size, True)
    if do_beam_model:
      tf.get_variable_scope().reuse_variables()
      beam_model = make_ngpu(FLAGS.beam_size, False)

  sv = None
  if sess is None:
    # The supervisor configuration has a few overridden options.
    sv = tf.train.Supervisor(logdir=checkpoint_dir,
                             is_chief=(FLAGS.task < 1),
                             saver=model.saver,
                             summary_op=None,
                             save_summaries_secs=60,
                             save_model_secs=15 * 60,
                             global_step=model.global_step)

    config = tf.ConfigProto(allow_soft_placement=True)
    sess = sv.PrepareSession(FLAGS.master, config=config)

  data.print_out("Created model. Checkpoint dir %s" % checkpoint_dir)

  # Load model from parameters if a checkpoint exists.
  ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
  if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path + ".index"):
    data.print_out("Reading model parameters from %s"
                   % ckpt.model_checkpoint_path)
    model.saver.restore(sess, ckpt.model_checkpoint_path)
  elif sv is None:
    sess.run(tf.global_variables_initializer())
    data.print_out("Initialized variables (no supervisor mode).")
  elif FLAGS.task < 1 and FLAGS.mem_size > 0:
    # sess.run(model.mem_norm_op)
    data.print_out("Created new model and normalized mem (on chief).")

  # Return the model and needed variables.
  return (model, beam_model, min_length, max_length, checkpoint_dir,
          (global_train_set, dev_set, en_path, fr_path), sv, sess)
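
The grid search in Example #18 derives three hyperparameter factors from the task id: (task // step) % 3 values of 0, 1, 2 map to factors 0, 1, -1, with steps 1, 3, 9 giving a 27-point grid over learning rate, init weight, and gradient-norm cap. A quick check of the mapping:

def job_id_factor(task, step):
  """Map (task // step) % 3 values 0, 1, 2 to factors 0, 1, -1."""
  return ((((task // step) % 3) + 1) % 3) - 1

for task in range(5):
  print(task, [job_id_factor(task, s) for s in (1, 3, 9)])
# 0 [0, 0, 0]
# 1 [1, 0, 0]
# 2 [-1, 0, 0]
# 3 [0, 1, 0]
# 4 [1, 1, 0]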
# Example #20
def evaluate():
  """Evaluate an existing model."""
  batch_size = FLAGS.batch_size * FLAGS.num_gpus
  with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
    (model, beam_model, _, _, _,
     (_, dev_set, en_vocab_path, fr_vocab_path), _, sess) = initialize(sess)
    for p in FLAGS.problem.split("-"):
      for bin_id in xrange(len(data.bins)):
        if (FLAGS.task >= 0 and bin_id > 4) or (FLAGS.nprint == 0 and
                                                bin_id > 8 and p == "wmt"):
          break
        single_test(bin_id, model, sess, FLAGS.nprint, batch_size, dev_set, p,
                    beam_model=beam_model)
    path = FLAGS.test_file_prefix
    xid = "" if FLAGS.task < 0 else ("%.4d" % (FLAGS.task+FLAGS.decode_offset))
    en_path, fr_path = path + ".en" + xid, path + ".fr" + xid
    # Evaluate the test file if they exist.
    if path and tf.gfile.Exists(en_path) and tf.gfile.Exists(fr_path):
      data.print_out("Translating test set %s" % en_path)
      # Read lines.
      en_lines, fr_lines = [], []
      with tf.gfile.GFile(en_path, mode="r") as f:
        for line in f:
          en_lines.append(line.strip())
      with tf.gfile.GFile(fr_path, mode="r") as f:
        for line in f:
          fr_lines.append(line.strip())
      # Tokenize and convert to ids.
      en_vocab, _ = wmt.initialize_vocabulary(en_vocab_path)
      _, rev_fr_vocab = wmt.initialize_vocabulary(fr_vocab_path)
      if FLAGS.simple_tokenizer:
        en_ids = [wmt.sentence_to_token_ids(
            l, en_vocab, tokenizer=wmt.space_tokenizer,
            normalize_digits=FLAGS.normalize_digits)
                  for l in en_lines]
      else:
        en_ids = [wmt.sentence_to_token_ids(l, en_vocab) for l in en_lines]
      # Translate.
      results = []
      for idx, token_ids in enumerate(en_ids):
        if idx % 5 == 0:
          data.print_out("Translating example %d of %d." % (idx, len(en_ids)))
        # Which bucket does it belong to?
        buckets = [b for b in xrange(len(data.bins))
                   if data.bins[b] >= len(token_ids)]
        if buckets:
          result, result_cost = [], 100000000.0
          for bucket_id in buckets:
            if data.bins[bucket_id] > MAXLEN_F * len(token_ids) + EVAL_LEN_INCR:
              break
            # Get a 1-element batch to feed the sentence to the model.
            used_batch_size = 1  # batch_size
            inp, target = data.get_batch(
                bucket_id, used_batch_size, None, FLAGS.height,
                preset=([token_ids], [[]]))
            loss, output_logits, _, _ = model.step(
                sess, inp, target, None, beam_size=FLAGS.beam_size)
            outputs = [int(o[0]) for o in output_logits]
            loss = loss[0] - (data.bins[bucket_id] * FLAGS.length_norm)
            if FLAGS.simple_tokenizer:
              cur_out = outputs
              if wmt.EOS_ID in cur_out:
                cur_out = cur_out[:cur_out.index(wmt.EOS_ID)]
              res_tags = [rev_fr_vocab[o] for o in cur_out]
              bad_words, bad_brack = wmt.parse_constraints(token_ids, res_tags)
              loss += 1000.0 * bad_words + 100.0 * bad_brack
            # print (bucket_id, loss)
            if loss < result_cost:
              result = outputs
              result_cost = loss
          final = linearize(result, rev_fr_vocab)
          results.append("%s\t%s\n" % (final, fr_lines[idx]))
          # print result_cost
          sys.stderr.write(results[-1])
          sys.stderr.flush()
        else:
          sys.stderr.write("TOOO_LONG\t%s\n" % fr_lines[idx])
          sys.stderr.flush()
      if xid:
        decode_suffix = "beam%dln%dn" % (FLAGS.beam_size,
                                         int(100 * FLAGS.length_norm))
        with tf.gfile.GFile(path + ".res" + decode_suffix + xid, mode="w") as f:
          for line in results:
            f.write(line)
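
When several buckets fit a sentence, evaluate() keeps the translation with the lowest cost after subtracting bins[bucket_id] * length_norm, a bonus that favors longer buckets. A toy rerun of that selection with made-up losses:

# Made-up per-bucket losses for one sentence; scoring rule as in evaluate().
bins = [16, 32, 64]
losses = [4.0, 3.6, 3.5]
length_norm = 0.02
adjusted = [l - b * length_norm for l, b in zip(losses, bins)]
print(adjusted)                                  # [3.68, 2.96, 2.22]
print(min(range(3), key=lambda i: adjusted[i]))  # bucket 2 wins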