def single_test(l, model, sess, task, nprint, batch_size, print_out=True,
                offset=None):
  """Test model on test data of length l using the given session."""
  inpt, target = data.get_batch(l, batch_size, False, task, offset)
  _, res, _, steps = model.step(sess, inpt, target, False)
  errors, total, seq_err = data.accuracy(inpt, res, target, batch_size, nprint)
  seq_err = float(seq_err) / batch_size
  if total > 0:
    errors = float(errors) / total
  if print_out:
    data.print_out("  %s len %d errors %.2f sequence-errors %.2f"
                   % (task, l, 100 * errors, 100 * seq_err))
  return errors, seq_err, (steps, inpt, [np.argmax(o, axis=1) for o in res])
def assign_vectors(word_vector_file, embedding_key, vocab_path, sess):
  """Assign the embedding_key variable from the given word vectors file."""
  # For words in the word vector file, set their embedding at start.
  if not tf.gfile.Exists(word_vector_file):
    data.print_out("Word vector file does not exist: %s" % word_vector_file)
    sys.exit(1)
  vocab, _ = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  data.print_out("Pre-setting word vectors from %s" % word_vector_file)
  with tf.gfile.GFile(word_vector_file, mode="r") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for line in f:
      line_parts = line.split()
      # The first part is the word.
      word = line_parts[0]
      if word in vocab:
        # Remaining parts are components of the vector.
        word_vector = np.array(map(float, line_parts[1:]))
        if len(word_vector) != FLAGS.vec_size:
          data.print_out("Warn: Word '%s', Expecting vector size %d, "
                         "found %d" % (word, FLAGS.vec_size,
                                       len(word_vector)))
        else:
          vectors[vocab[word]] = word_vector
  # Assign the modified vectors to the vectors_variable in the graph.
  sess.run([vectors_variable.initializer],
           {vectors_variable.initializer.inputs[1]: vectors})
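# A minimal, self-contained sketch (not part of the original code) of the
# word-vector file format assign_vectors() above expects: each line holds a
# word followed by whitespace-separated float components. The 3-dimensional
# example vector is an illustrative assumption; real files carry
# FLAGS.vec_size components per word.
_example_line = "dog 0.045123 -0.61323 0.413667"
_example_parts = _example_line.split()
_example_word = _example_parts[0]  # "dog"
_example_vector = [float(x) for x in _example_parts[1:]]  # 3 components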
def print_vectors(embedding_key, vocab_path, word_vector_file):
  """Print vectors from the given variable."""
  _, rev_vocab = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  l, s = vectors.shape[0], vectors.shape[1]
  data.print_out("Printing %d word vectors from %s to %s."
                 % (l, embedding_key, word_vector_file))
  with tf.gfile.GFile(word_vector_file, mode="w") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for i in xrange(l):
      f.write(rev_vocab[i])
      for j in xrange(s):
        f.write(" %.8f" % vectors[i][j])
      f.write("\n")
def single_test(bin_id, model, sess, nprint, batch_size, dev, p,
                print_out=True, offset=None, beam_model=None):
  """Test model on test data from the given bin using the given session."""
  if not dev[p][bin_id]:
    data.print_out("  bin %d (%d)\t%s\tppl NA errors NA seq-errors NA"
                   % (bin_id, data.bins[bin_id], p))
    return 1.0, 1.0, 0.0
  inpt, target = data.get_batch(bin_id, batch_size, dev[p], FLAGS.height,
                                offset)
  if FLAGS.beam_size > 1 and beam_model:
    loss, res, new_tgt, scores = m_step(
        model, beam_model, sess, batch_size, inpt, target, bin_id,
        FLAGS.eval_beam_steps, p)
    score_avgs = [sum(s) / float(len(s)) for s in scores]
    score_maxs = [max(s) for s in scores]
    score_str = ["(%.2f, %.2f)" % (score_avgs[i], score_maxs[i])
                 for i in xrange(FLAGS.eval_beam_steps)]
    data.print_out("  == scores (avg, max): %s" % "; ".join(score_str))
    errors, total, seq_err = data.accuracy(inpt, res, target, batch_size,
                                           nprint, new_tgt, scores[-1])
  else:
    loss, res, _, _ = model.step(sess, inpt, target, False)
    errors, total, seq_err = data.accuracy(inpt, res, target, batch_size,
                                           nprint)
  seq_err = float(seq_err) / batch_size
  if total > 0:
    errors = float(errors) / total
  if print_out:
    data.print_out("  bin %d (%d)\t%s\tppl %.2f errors %.2f seq-errors %.2f"
                   % (bin_id, data.bins[bin_id], p, data.safe_exp(loss),
                      100 * errors, 100 * seq_err))
  return (errors, seq_err, loss)
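# A minimal sketch (not from the original) of how the perplexity printed by
# single_test() above relates to the returned loss: ppl = exp(loss). The cap
# below is an illustrative assumption standing in for whatever overflow
# protection data.safe_exp() actually applies.
def _safe_exp_example(loss, cap=100.0):
  return math.exp(min(loss, cap))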
def multi_test(l, model, sess, task, nprint, batch_size, offset=None,
               ensemble=None):
  """Run multiple tests at lower batch size to save memory."""
  errors, seq_err = 0.0, 0.0
  to_print = nprint
  low_batch = FLAGS.low_batch_size
  low_batch = min(low_batch, batch_size)
  for mstep in xrange(batch_size / low_batch):
    cur_offset = None if offset is None else offset + mstep * low_batch
    err, sq_err, _ = single_test(l, model, sess, task, to_print, low_batch,
                                 False, cur_offset, ensemble=ensemble)
    to_print = max(0, to_print - low_batch)
    errors += err
    seq_err += sq_err
    if FLAGS.mode > 0:
      cur_errors = float(low_batch * errors) / ((mstep + 1) * low_batch)
      cur_seq_err = float(low_batch * seq_err) / ((mstep + 1) * low_batch)
      data.print_out("  %s multitest current errors %.2f sequence-errors %.2f"
                     % (task, 100 * cur_errors, 100 * cur_seq_err))
  errors = float(low_batch) * float(errors) / batch_size
  seq_err = float(low_batch) * float(seq_err) / batch_size
  data.print_out("  %s len %d errors %.2f sequence-errors %.2f"
                 % (task, l, 100 * errors, 100 * seq_err))
  return errors, seq_err
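# A quick numeric check (not from the original) of the aggregation in
# multi_test() above: summed per-sub-batch error rates rescaled by
# low_batch / batch_size equal their plain average over the sub-batches.
# E.g. batch_size=8, low_batch=4, sub-batch sequence errors 0.25 and 0.75:
assert abs(float(4) * (0.25 + 0.75) / 8 - (0.25 + 0.75) / 2) < 1e-9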
def multi_test(l, model, sess, task, nprint, batch_size, offset=None):
  """Run multiple tests at lower batch size to save memory."""
  errors, seq_err = 0.0, 0.0
  to_print = nprint
  low_batch = FLAGS.low_batch_size
  low_batch = min(low_batch, batch_size)
  for mstep in xrange(batch_size / low_batch):
    cur_offset = None if offset is None else offset + mstep * low_batch
    err, sq_err, _ = single_test(l, model, sess, task, to_print, low_batch,
                                 False, cur_offset)
    to_print = max(0, to_print - low_batch)
    errors += err
    seq_err += sq_err
    if FLAGS.mode > 0:
      cur_errors = float(low_batch * errors) / ((mstep + 1) * low_batch)
      cur_seq_err = float(low_batch * seq_err) / ((mstep + 1) * low_batch)
      data.print_out("  %s multitest current errors %.2f sequence-errors %.2f"
                     % (task, 100 * cur_errors, 100 * cur_seq_err))
  errors = float(low_batch) * float(errors) / batch_size
  seq_err = float(low_batch) * float(seq_err) / batch_size
  data.print_out("  %s len %d errors %.2f sequence-errors %.2f"
                 % (task, l, 100 * errors, 100 * seq_err))
  return errors, seq_err
def spec(self, inp, task, nclass):
  """Return the target given the input for some tasks."""
  if task == "sort":
    return sorted(inp)
  elif task == "id":
    return inp
  elif task == "rev":
    return [i for i in reversed(inp)]
  elif task == "shuffle":
    # Bit-reverse permutation.
    n_bits = (len(inp) - 1).bit_length()
    res = []
    for i in range(len(inp)):
      i1 = reverse_bit(i, n_bits) % len(inp)
      res.append(inp[i1])
    return res
  elif task == "incr":
    carry = 1
    res = []
    for i in range(len(inp)):
      if inp[i] + carry < nclass:
        res.append(inp[i] + carry)
        carry = 0
      else:
        res.append(1)
        carry = 1
    return res
  elif task == "left":
    return [inp[0]]
  elif task == "right":
    return [inp[-1]]
  elif task == "left-shift":
    return [inp[l - 1] for l in range(len(inp))]
  elif task == "right-shift":
    # Wrap around so the last position does not index past the end.
    return [inp[(l + 1) % len(inp)] for l in range(len(inp))]
  else:
    data_utils.print_out("Unknown spec for task " + str(task))
    sys.exit()
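# A minimal, self-contained sketch (not part of the original) of the
# bit-reverse permutation behind the "shuffle" task above. The module's
# reverse_bit helper is assumed to behave like this standalone version.
def _reverse_bit_example(index, n_bits):
  """Reverse the lowest n_bits bits of index, e.g. 0b011 -> 0b110."""
  result = 0
  for _ in range(n_bits):
    result = (result << 1) | (index & 1)
    index >>= 1
  return result

# For inp of length 8 (3 bits), indices map to 0,4,2,6,1,5,3,7, so
# inp = [10, 11, 12, 13, 14, 15, 16, 17] -> [10, 14, 12, 16, 11, 15, 13, 17].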
def single_test(l, model, sess, task, nprint, batch_size, print_out=True,
                offset=None, ensemble=None, get_steps=False):
  """Test model on test data of length l using the given session."""
  inpt, target = data.get_batch(l, batch_size, False, task, offset)
  _, res, _, steps = model.step(sess, inpt, target, False,
                                get_steps=get_steps)
  errors, total, seq_err = data.accuracy(inpt, res, target, batch_size, nprint)
  seq_err = float(seq_err) / batch_size
  if total > 0:
    errors = float(errors) / total
  if print_out:
    data.print_out("  %s len %d errors %.2f sequence-errors %.2f"
                   % (task, l, 100 * errors, 100 * seq_err))
  # Ensemble eval.
  if ensemble:
    results = []
    for m in ensemble:
      model.saver.restore(sess, m)
      _, result, _, _ = model.step(sess, inpt, target, False)
      m_errors, m_total, m_seq_err = data.accuracy(inpt, result, target,
                                                   batch_size, nprint)
      m_seq_err = float(m_seq_err) / batch_size
      if total > 0:
        m_errors = float(m_errors) / m_total
      data.print_out("  %s len %d m-errors %.2f m-sequence-errors %.2f"
                     % (task, l, 100 * m_errors, 100 * m_seq_err))
      results.append(result)
    ens = [sum(o) for o in zip(*results)]
    errors, total, seq_err = data.accuracy(inpt, ens, target,
                                           batch_size, nprint)
    seq_err = float(seq_err) / batch_size
    if total > 0:
      errors = float(errors) / total
    if print_out:
      data.print_out("  %s len %d ens-errors %.2f ens-sequence-errors %.2f"
                     % (task, l, 100 * errors, 100 * seq_err))
  return errors, seq_err, (steps, inpt, [np.argmax(o, axis=1) for o in res])
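# A minimal sketch (not from the original) of the ensembling rule in
# single_test() above: the per-position output distributions produced by
# each restored checkpoint are summed with zip(*results), and the argmax of
# the sum is the ensemble prediction. The toy two-position, two-class
# distributions below are illustrative assumptions; np is the module-level
# numpy import.
_model_a = [np.array([[0.6, 0.4]]), np.array([[0.2, 0.8]])]
_model_b = [np.array([[0.3, 0.7]]), np.array([[0.9, 0.1]])]
_ens = [sum(o) for o in zip(*[_model_a, _model_b])]
# np.argmax(_ens[0], axis=1) -> [1], np.argmax(_ens[1], axis=1) -> [0]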
def train():
  """Train the model."""
  batch_size = FLAGS.batch_size * FLAGS.num_gpus
  (model, beam_model, min_length, max_length, checkpoint_dir,
   (train_set, dev_set, en_vocab_path, fr_vocab_path), sv, sess) = initialize()
  with sess.as_default():
    quant_op = model.quantize_op
    max_cur_length = min(min_length + 3, max_length)
    prev_acc_perp = [1000000 for _ in xrange(5)]
    prev_seq_err = 1.0
    is_chief = FLAGS.task < 1
    do_report = False

    # Main training loop.
    while not sv.ShouldStop():
      global_step, max_cur_length, learning_rate = sess.run(
          [model.global_step, model.cur_length, model.lr])
      acc_loss, acc_l1, acc_total, acc_errors, acc_seq_err = 0.0, 0.0, 0, 0, 0
      acc_grad_norm, step_count, step_c1, step_time = 0.0, 0, 0, 0.0

      # For words in the word vector file, set their embedding at start.
      bound1 = FLAGS.steps_per_checkpoint - 1
      if FLAGS.word_vector_file_en and global_step < bound1 and is_chief:
        assign_vectors(FLAGS.word_vector_file_en, "embedding:0",
                       en_vocab_path, sess)
        if FLAGS.max_target_vocab < 1:
          assign_vectors(FLAGS.word_vector_file_en, "target_embedding:0",
                         en_vocab_path, sess)

      if FLAGS.word_vector_file_fr and global_step < bound1 and is_chief:
        assign_vectors(FLAGS.word_vector_file_fr, "embedding:0",
                       fr_vocab_path, sess)
        if FLAGS.max_target_vocab < 1:
          assign_vectors(FLAGS.word_vector_file_fr, "target_embedding:0",
                         fr_vocab_path, sess)

      for _ in xrange(FLAGS.steps_per_checkpoint):
        step_count += 1
        step_c1 += 1
        global_step = int(model.global_step.eval())
        train_beam_anneal = global_step / float(FLAGS.train_beam_anneal)
        train_beam_freq = FLAGS.train_beam_freq * min(1.0, train_beam_anneal)
        p = random.choice(FLAGS.problem.split("-"))
        train_set = global_train_set[p][-1]
        bucket_id = get_bucket_id(train_buckets_scale[p][-1], max_cur_length,
                                  train_set)
        # Prefer longer stuff 60% of time if not wmt.
        if np.random.randint(100) < 60 and FLAGS.problem != "wmt":
          bucket1 = get_bucket_id(train_buckets_scale[p][-1], max_cur_length,
                                  train_set)
          bucket_id = max(bucket1, bucket_id)

        # Run a step and time it.
        start_time = time.time()
        inp, target = data.get_batch(bucket_id, batch_size, train_set,
                                     FLAGS.height)
        noise_param = math.sqrt(math.pow(global_step + 1, -0.55) *
                                prev_seq_err) * FLAGS.grad_noise_scale
        # In multi-step mode, we use best from beam for middle steps.
        state, new_target, scores, history = None, None, None, []
        while (FLAGS.beam_size > 1 and
               train_beam_freq > np.random.random_sample()):
          # Get the best beam (no training, just forward model).
          new_target, new_first, new_inp, scores = get_best_beam(
              beam_model, sess, inp, target,
              batch_size, FLAGS.beam_size, bucket_id, history, p)
          history.append(new_first)
          # Training step with the previous input and the best beam as target.
          _, _, _, state = model.step(sess, inp, new_target, FLAGS.do_train,
                                      noise_param, update_mem=True,
                                      state=state)
          # Change input to the new one for the next step.
          inp = new_inp
          # If all results are great, stop (todo: not to wait for all?).
          if FLAGS.nprint > 1:
            print(scores)
          if sum(scores) / float(len(scores)) >= 10.0:
            break
        # The final step with the true target.
        loss, res, gnorm, _ = model.step(sess, inp, target, FLAGS.do_train,
                                         noise_param, update_mem=True,
                                         state=state)
        step_time += time.time() - start_time
        acc_grad_norm += 0.0 if gnorm is None else float(gnorm)

        # Accumulate statistics.
        acc_loss += loss
        acc_l1 += loss
        errors, total, seq_err = data.accuracy(
            inp, res, target, batch_size, 0, new_target, scores)
        if FLAGS.nprint > 1:
          print("seq_err: ", seq_err)
        acc_total += total
        acc_errors += errors
        acc_seq_err += seq_err

        # Report summary every 10 steps.
        if step_count + 3 > FLAGS.steps_per_checkpoint:
          do_report = True  # Don't pollute plot too early.
        if is_chief and step_count % 10 == 1 and do_report:
          cur_loss = acc_l1 / float(step_c1)
          acc_l1, step_c1 = 0.0, 0
          cur_perp = data.safe_exp(cur_loss)
          summary = tf.Summary()
          summary.value.extend(
              [tf.Summary.Value(tag="log_perplexity", simple_value=cur_loss),
               tf.Summary.Value(tag="perplexity", simple_value=cur_perp)])
          sv.SummaryComputed(sess, summary, global_step)

      # Normalize and print out accumulated statistics.
      acc_loss /= step_count
      step_time /= FLAGS.steps_per_checkpoint
      acc_seq_err = float(acc_seq_err) / (step_count * batch_size)
      prev_seq_err = max(0.0, acc_seq_err - 0.02)  # No noise at error < 2%.
      acc_errors = float(acc_errors) / acc_total if acc_total > 0 else 1.0
      t_size = float(sum([len(x) for x in train_set])) / float(1000000)
      msg = ("step %d step-time %.2f train-size %.3f lr %.6f grad-norm %.4f"
             % (global_step + 1, step_time, t_size, learning_rate,
                acc_grad_norm / FLAGS.steps_per_checkpoint))
      data.print_out("%s len %d ppl %.6f errors %.2f sequence-errors %.2f" %
                     (msg, max_cur_length, data.safe_exp(acc_loss),
                      100 * acc_errors, 100 * acc_seq_err))

      # If errors are below the curriculum threshold, move curriculum forward.
      is_good = FLAGS.curriculum_ppx > data.safe_exp(acc_loss)
      is_good = is_good and FLAGS.curriculum_seq > acc_seq_err
      if is_good and is_chief:
        if FLAGS.quantize:
          # Quantize weights.
          data.print_out("  Quantizing parameters.")
          sess.run([quant_op])
        # Increase current length (until the next with training data).
        sess.run(model.cur_length_incr_op)
        # Forget last perplexities if we're not yet at the end.
        if max_cur_length < max_length:
          prev_acc_perp.append(1000000)

      # Lower learning rate if we're worse than the last 5 checkpoints.
      acc_perp = data.safe_exp(acc_loss)
      if acc_perp > max(prev_acc_perp[-5:]) and is_chief:
        sess.run(model.lr_decay_op)
      prev_acc_perp.append(acc_perp)

      # Save checkpoint.
      if is_chief:
        checkpoint_path = os.path.join(checkpoint_dir, "neural_gpu.ckpt")
        model.saver.save(sess, checkpoint_path,
                         global_step=model.global_step)

        # Run evaluation.
        bin_bound = 4
        for p in FLAGS.problem.split("-"):
          total_loss, total_err, tl_counter = 0.0, 0.0, 0
          for bin_id in xrange(len(data.bins)):
            if bin_id < bin_bound or bin_id % FLAGS.eval_bin_print == 1:
              err, _, loss = single_test(bin_id, model, sess, FLAGS.nprint,
                                         batch_size * 4, dev_set, p,
                                         beam_model=beam_model)
              if loss > 0.0:
                total_loss += loss
                total_err += err
                tl_counter += 1
          test_loss = total_loss / max(1, tl_counter)
          test_err = total_err / max(1, tl_counter)
          test_perp = data.safe_exp(test_loss)
          summary = tf.Summary()
          summary.value.extend(
              [tf.Summary.Value(tag="test/%s/loss" % p,
                                simple_value=test_loss),
               tf.Summary.Value(tag="test/%s/error" % p,
                                simple_value=test_err),
               tf.Summary.Value(tag="test/%s/perplexity" % p,
                                simple_value=test_perp)])
          sv.SummaryComputed(sess, summary, global_step)
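# A minimal sketch (not from the original) of the annealed gradient-noise
# schedule used in train() above: noise decays like (step + 1)**-0.55 and is
# gated by the previous checkpoint's sequence error, which train() already
# discounts by 2%, so the noise vanishes once sequence errors drop below 2%.
def _noise_param_example(global_step, prev_seq_err, grad_noise_scale):
  return math.sqrt(math.pow(global_step + 1, -0.55) *
                   prev_seq_err) * grad_noise_scale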
def __init__(self, nmaps, vec_size, niclass, noclass, dropout, max_grad_norm,
             cutoff, nconvs, kw, kh, height, mem_size, learning_rate,
             min_length, num_gpus, num_replicas, grad_noise_scale,
             sampling_rate, act_noise=0.0, do_rnn=False, atrous=False,
             beam_size=1, backward=True, do_layer_norm=False,
             autoenc_decay=1.0):
  # Feeds for parameters and ops to update them.
  self.nmaps = nmaps
  if backward:
    self.global_step = tf.Variable(0, trainable=False, name="global_step")
    self.cur_length = tf.Variable(min_length, trainable=False)
    self.cur_length_incr_op = self.cur_length.assign_add(1)
    self.lr = tf.Variable(learning_rate, trainable=False)
    self.lr_decay_op = self.lr.assign(self.lr * 0.995)
  self.do_training = tf.placeholder(tf.float32, name="do_training")
  self.update_mem = tf.placeholder(tf.int32, name="update_mem")
  self.noise_param = tf.placeholder(tf.float32, name="noise_param")

  self.input = tf.placeholder(tf.int32, name="inp")
  self.target = tf.placeholder(tf.int32, name="tgt")
  self.prev_step = tf.placeholder(tf.float32, name="prev_step")
  gpu_input = tf.split(axis=0, num_or_size_splits=num_gpus, value=self.input)
  gpu_target = tf.split(
      axis=0, num_or_size_splits=num_gpus, value=self.target)
  gpu_prev_step = tf.split(
      axis=0, num_or_size_splits=num_gpus, value=self.prev_step)
  batch_size = tf.shape(gpu_input[0])[0]

  if backward:
    adam_lr = 0.005 * self.lr
    adam = tf.train.AdamOptimizer(adam_lr, epsilon=1e-3)

    def adam_update(grads):
      return adam.apply_gradients(
          zip(grads, tf.trainable_variables()),
          global_step=self.global_step, name="adam_update")

  # When switching from Adam to SGD we perform reverse-decay.
  if backward:
    global_step_float = tf.cast(self.global_step, tf.float32)
    sampling_decay_exponent = global_step_float / 100000.0
    sampling_decay = tf.maximum(0.05, tf.pow(0.5, sampling_decay_exponent))
    self.sampling = sampling_rate * 0.05 / sampling_decay
  else:
    self.sampling = tf.constant(0.0)

  # Cache variables on cpu if needed.
  if num_replicas > 1 or num_gpus > 1:
    with tf.device("/cpu:0"):
      caching_const = tf.constant(0)
    tf.get_variable_scope().set_caching_device(caching_const.op.device)

  def gpu_avg(l):
    if l[0] is None:
      for elem in l:
        assert elem is None
      return 0.0
    if len(l) < 2:
      return l[0]
    return sum(l) / float(num_gpus)

  self.length_tensor = tf.placeholder(tf.int32, name="length")

  with tf.device("/cpu:0"):
    emb_weights = tf.get_variable(
        "embedding", [niclass, vec_size],
        initializer=tf.random_uniform_initializer(-1.7, 1.7))
    if beam_size > 0:
      target_emb_weights = tf.get_variable(
          "target_embedding", [noclass, nmaps],
          initializer=tf.random_uniform_initializer(-1.7, 1.7))
    e0 = tf.scatter_update(emb_weights,
                           tf.constant(0, dtype=tf.int32, shape=[1]),
                           tf.zeros([1, vec_size]))
    output_w = tf.get_variable("output_w", [nmaps, noclass], tf.float32)

  def conv_rate(layer):
    if atrous:
      return 2**layer
    return 1

  # pylint: disable=cell-var-from-loop
  def enc_step(step):
    """Encoder step."""
    if autoenc_decay < 1.0:
      quant_step = autoenc_quantize(step, 16, nmaps, self.do_training)
      if backward:
        exp_glob = tf.train.exponential_decay(
            1.0, self.global_step - 10000, 1000, autoenc_decay)
        dec_factor = 1.0 - exp_glob  # * self.do_training
        dec_factor = tf.cond(tf.less(self.global_step, 10500),
                             lambda: tf.constant(0.05), lambda: dec_factor)
      else:
        dec_factor = 1.0
      cur = tf.cond(tf.less(tf.random_uniform([]), dec_factor),
                    lambda: quant_step, lambda: step)
    else:
      cur = step
    if dropout > 0.0001:
      cur = tf.nn.dropout(cur, keep_prob)
    if act_noise > 0.00001:
      cur += tf.truncated_normal(tf.shape(cur)) * act_noise_scale
    # Do nconvs-many CGRU steps.
    if do_jit and tf.get_variable_scope().reuse:
      with jit_scope():
        for layer in range(nconvs):
          cur = conv_gru([], cur, kw, kh, nmaps, conv_rate(layer), cutoff,
                         "ecgru_%d" % layer, do_layer_norm)
    else:
      for layer in range(nconvs):
        cur = conv_gru([], cur, kw, kh, nmaps, conv_rate(layer), cutoff,
                       "ecgru_%d" % layer, do_layer_norm)
    return cur

  zero_tgt = tf.zeros([batch_size, nmaps, 1])
  zero_tgt.set_shape([None, nmaps, 1])

  def dec_substep(step, decided):
    """Decoder sub-step."""
    cur = step
    if dropout > 0.0001:
      cur = tf.nn.dropout(cur, keep_prob)
    if act_noise > 0.00001:
      cur += tf.truncated_normal(tf.shape(cur)) * act_noise_scale
    # Do nconvs-many CGRU steps.
    if do_jit and tf.get_variable_scope().reuse:
      with jit_scope():
        for layer in range(nconvs):
          cur = conv_gru([decided], cur, kw, kh, nmaps, conv_rate(layer),
                         cutoff, "dcgru_%d" % layer, do_layer_norm)
    else:
      for layer in range(nconvs):
        cur = conv_gru([decided], cur, kw, kh, nmaps, conv_rate(layer),
                       cutoff, "dcgru_%d" % layer, do_layer_norm)
    return cur
  # pylint: enable=cell-var-from-loop

  def dec_step(step, it, it_int, decided, output_ta, tgts,
               mloss, nupd_in, out_idx, beam_cost):
    """Decoder step."""
    nupd, mem_loss = 0, 0.0
    if mem_size > 0:
      it_incr = tf.minimum(it + 1, length - 1)
      mem, mem_loss, nupd = memory_run(
          step, nmaps, mem_size, batch_size, noclass, self.global_step,
          self.do_training, self.update_mem, 10, num_gpus,
          target_emb_weights, output_w, gpu_targets_tn, it_incr)
    step = dec_substep(step, decided)
    output_l = tf.expand_dims(tf.expand_dims(step[:, it, 0, :], 1), 1)
    # Calculate argmax output.
    output = tf.reshape(output_l, [-1, nmaps])
    # pylint: disable=cell-var-from-loop
    output = tf.matmul(output, output_w)
    if beam_size > 1:
      beam_cost, output, out, reordered = reorder_beam(
          beam_size, batch_size, beam_cost, output, it_int == 0,
          [output_l, out_idx, step, decided])
      [output_l, out_idx, step, decided] = reordered
    else:
      # Scheduled sampling.
      out = tf.multinomial(tf.stop_gradient(output), 1)
      out = tf.to_int32(tf.squeeze(out, [1]))
    out_write = output_ta.write(it, output_l[:batch_size, :, :, :])
    output = tf.gather(target_emb_weights, out)
    output = tf.reshape(output, [-1, 1, nmaps])
    output = tf.concat(axis=1, values=[output] * height)
    tgt = tgts[it, :, :, :]
    selected = tf.cond(tf.less(tf.random_uniform([]), self.sampling),
                       lambda: output, lambda: tgt)
    # pylint: enable=cell-var-from-loop
    dec_write = place_at14(decided, tf.expand_dims(selected, 1), it)
    out_idx = place_at13(
        out_idx, tf.reshape(out, [beam_size * batch_size, 1, 1]), it)
    if mem_size > 0:
      mem = tf.concat(axis=2, values=[mem] * height)
      dec_write = place_at14(dec_write, mem, it_incr)
    return (step, dec_write, out_write, mloss + mem_loss,
            nupd_in + nupd, out_idx, beam_cost)

  # Main model construction.
  gpu_outputs = []
  gpu_losses = []
  gpu_grad_norms = []
  grads_list = []
  gpu_out_idx = []
  self.after_enc_step = []
  for gpu in range(num_gpus):  # Multi-GPU towers, average gradients later.
    length = self.length_tensor
    length_float = tf.cast(length, tf.float32)
    if gpu > 0:
      tf.get_variable_scope().reuse_variables()
    gpu_outputs.append([])
    gpu_losses.append([])
    gpu_grad_norms.append([])
    with tf.name_scope("gpu%d" % gpu), tf.device("/gpu:%d" % gpu):
      # Main graph creation loop.
      data.print_out("Creating model.")
      start_time = time.time()

      # Embed inputs and calculate mask.
      with tf.device("/cpu:0"):
        tgt_shape = tf.shape(tf.squeeze(gpu_target[gpu], [1]))
        weights = tf.where(tf.squeeze(gpu_target[gpu], [1]) > 0,
                           tf.ones(tgt_shape), tf.zeros(tgt_shape))
        # Embed inputs and targets.
        with tf.control_dependencies([e0]):
          start = tf.gather(emb_weights, gpu_input[gpu])  # b x h x l x nmaps
          gpu_targets_tn = gpu_target[gpu]  # b x 1 x len
          if beam_size > 0:
            embedded_targets_tn = tf.gather(target_emb_weights,
                                            gpu_targets_tn)
            embedded_targets_tn = tf.transpose(
                embedded_targets_tn, [2, 0, 1, 3])  # len x b x 1 x nmaps
            embedded_targets_tn = tf.concat(
                axis=2, values=[embedded_targets_tn] * height)

      # First image comes from start by applying convolution and adding 0s.
      start = tf.transpose(start, [0, 2, 1, 3])  # Now b x len x h x vec_s
      first = conv_linear(start, 1, 1, vec_size, nmaps, 1, True, 0.0, "input")
      first = layer_norm(first, nmaps, "input")

      # Computation steps.
      keep_prob = dropout * 3.0 / tf.sqrt(length_float)
      keep_prob = 1.0 - self.do_training * keep_prob
      act_noise_scale = act_noise * self.do_training

      # Start with a convolutional gate merging previous step.
      step = conv_gru([gpu_prev_step[gpu]], first, kw, kh,
                      nmaps, 1, cutoff, "first", do_layer_norm)

      # This is just for running a baseline RNN seq2seq model.
      if do_rnn:
        self.after_enc_step.append(step)  # Not meaningful here, but needed.

        def lstm_cell():
          return tf.contrib.rnn.BasicLSTMCell(height * nmaps)

        cell = tf.contrib.rnn.MultiRNNCell(
            [lstm_cell() for _ in range(nconvs)])
        with tf.variable_scope("encoder"):
          encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
              cell, tf.reshape(step, [batch_size, length, height * nmaps]),
              dtype=tf.float32, time_major=False)

        # Attention.
        attn = tf.layers.dense(encoder_outputs, height * nmaps, name="attn1")

        # pylint: disable=cell-var-from-loop
        @function.Defun(noinline=True)
        def attention_query(query, attn_v):
          vecs = tf.tanh(attn + tf.expand_dims(query, 1))
          mask = tf.reduce_sum(vecs * tf.reshape(attn_v, [1, 1, -1]), 2)
          mask = tf.nn.softmax(mask)
          return tf.reduce_sum(encoder_outputs * tf.expand_dims(mask, 2), 1)

        with tf.variable_scope("decoder"):

          def decoder_loop_fn(state_prev_cell_out, cell_inp_cur_tgt):
            """Decoder loop function."""
            state, prev_cell_out, _ = state_prev_cell_out
            cell_inp, cur_tgt = cell_inp_cur_tgt
            attn_q = tf.layers.dense(prev_cell_out, height * nmaps,
                                     name="attn_query")
            attn_res = attention_query(attn_q, tf.get_variable(
                "attn_v", [height * nmaps],
                initializer=tf.random_uniform_initializer(-0.1, 0.1)))
            concatenated = tf.reshape(
                tf.concat(axis=1, values=[cell_inp, attn_res]),
                [batch_size, 2 * height * nmaps])
            cell_inp = tf.layers.dense(concatenated, height * nmaps,
                                       name="attn_merge")
            output, new_state = cell(cell_inp, state)
            mem_loss = 0.0
            if mem_size > 0:
              res, mask, mem_loss = memory_call(
                  output, cur_tgt, height * nmaps, mem_size, noclass,
                  num_gpus, self.update_mem)
              res = tf.gather(target_emb_weights, res)
              res *= tf.expand_dims(mask[:, 0], 1)
              output = tf.layers.dense(
                  tf.concat(axis=1, values=[output, res]),
                  height * nmaps, name="rnnmem")
            return new_state, output, mem_loss
          # pylint: enable=cell-var-from-loop

          gpu_targets = tf.squeeze(gpu_target[gpu], [1])  # b x len
          gpu_tgt_trans = tf.transpose(gpu_targets, [1, 0])
          dec_zero = tf.zeros([batch_size, 1], dtype=tf.int32)
          dec_inp = tf.concat(axis=1, values=[dec_zero, gpu_targets])
          dec_inp = dec_inp[:, :length]
          embedded_dec_inp = tf.gather(target_emb_weights, dec_inp)
          embedded_dec_inp_proj = tf.layers.dense(
              embedded_dec_inp, height * nmaps, name="dec_proj")
          embedded_dec_inp_proj = tf.transpose(
              embedded_dec_inp_proj, [1, 0, 2])
          init_vals = (encoder_state,
                       tf.zeros([batch_size, height * nmaps]), 0.0)
          _, dec_outputs, mem_losses = tf.scan(
              decoder_loop_fn, (embedded_dec_inp_proj, gpu_tgt_trans),
              initializer=init_vals)
          mem_loss = tf.reduce_mean(mem_losses)
          outputs = tf.layers.dense(dec_outputs, nmaps, name="out_proj")
          # Final convolution to get logits, list outputs.
          outputs = tf.matmul(tf.reshape(outputs, [-1, nmaps]), output_w)
          outputs = tf.reshape(outputs, [length, batch_size, noclass])
          gpu_out_idx.append(tf.argmax(outputs, 2))
      else:  # Here we go with the Neural GPU.
        # Encoder.
        enc_length = length
        step = enc_step(step)  # First step hard-coded.
        # pylint: disable=cell-var-from-loop
        i = tf.constant(1)
        c = lambda i, _s: tf.less(i, enc_length)

        def enc_step_lambda(i, step):
          with tf.variable_scope(tf.get_variable_scope(), reuse=True):
            new_step = enc_step(step)
          return (i + 1, new_step)

        _, step = tf.while_loop(c, enc_step_lambda, [i, step],
                                parallel_iterations=1, swap_memory=True)
        # pylint: enable=cell-var-from-loop

        self.after_enc_step.append(step)

        # Decoder.
        if beam_size > 0:
          output_ta = tf.TensorArray(
              dtype=tf.float32, size=length, dynamic_size=False,
              infer_shape=False, name="outputs")
          out_idx = tf.zeros([beam_size * batch_size, length, 1],
                             dtype=tf.int32)
          decided_t = tf.zeros([beam_size * batch_size, length,
                                height, vec_size])

          # Prepare for beam search.
          tgts = tf.concat(axis=1, values=[embedded_targets_tn] * beam_size)
          beam_cost = tf.zeros([batch_size, beam_size])
          step = tf.concat(axis=0, values=[step] * beam_size)
          # First step hard-coded.
          step, decided_t, output_ta, mem_loss, nupd, oi, bc = dec_step(
              step, 0, 0, decided_t, output_ta, tgts, 0.0, 0,
              out_idx, beam_cost)
          tf.get_variable_scope().reuse_variables()

          # pylint: disable=cell-var-from-loop
          def step_lambda(i, step, dec_t, out_ta, ml, nu, oi, bc):
            with tf.variable_scope(tf.get_variable_scope(), reuse=True):
              s, d, t, nml, nu, oi, bc = dec_step(
                  step, i, 1, dec_t, out_ta, tgts, ml, nu, oi, bc)
            return (i + 1, s, d, t, nml, nu, oi, bc)

          i = tf.constant(1)
          c = lambda i, _s, _d, _o, _ml, _nu, _oi, _bc: tf.less(i, length)
          _, step, _, output_ta, mem_loss, nupd, out_idx, _ = tf.while_loop(
              c, step_lambda,
              [i, step, decided_t, output_ta, mem_loss, nupd, oi, bc],
              parallel_iterations=1, swap_memory=True)
          # pylint: enable=cell-var-from-loop

          gpu_out_idx.append(tf.squeeze(out_idx, [2]))
          outputs = output_ta.stack()
          outputs = tf.squeeze(outputs, [2, 3])  # Now l x b x nmaps
        else:
          # If beam_size is 0 or less, we don't have a decoder.
          mem_loss = 0.0
          outputs = tf.transpose(step[:, :, 1, :], [1, 0, 2])
          gpu_out_idx.append(tf.argmax(outputs, 2))

        # Final convolution to get logits, list outputs.
        outputs = tf.matmul(tf.reshape(outputs, [-1, nmaps]), output_w)
        outputs = tf.reshape(outputs, [length, batch_size, noclass])
      gpu_outputs[gpu] = tf.nn.softmax(outputs)

      # Calculate cross-entropy loss and normalize it.
      targets_soft = make_dense(tf.squeeze(gpu_target[gpu], [1]),
                                noclass, 0.1)
      targets_soft = tf.reshape(targets_soft, [-1, noclass])
      targets_hard = make_dense(tf.squeeze(gpu_target[gpu], [1]),
                                noclass, 0.0)
      targets_hard = tf.reshape(targets_hard, [-1, noclass])
      output = tf.transpose(outputs, [1, 0, 2])
      xent_soft = tf.reshape(
          tf.nn.softmax_cross_entropy_with_logits(
              logits=tf.reshape(output, [-1, noclass]),
              labels=targets_soft), [batch_size, length])
      xent_hard = tf.reshape(
          tf.nn.softmax_cross_entropy_with_logits(
              logits=tf.reshape(output, [-1, noclass]),
              labels=targets_hard), [batch_size, length])
      low, high = 0.1 / float(noclass - 1), 0.9
      const = high * tf.log(high) + float(noclass - 1) * low * tf.log(low)
      weight_sum = tf.reduce_sum(weights) + 1e-20
      true_perp = tf.reduce_sum(xent_hard * weights) / weight_sum
      soft_loss = tf.reduce_sum(xent_soft * weights) / weight_sum
      perp_loss = soft_loss + const

      # Final loss: cross-entropy + shared parameter relaxation part + extra.
      mem_loss = 0.5 * tf.reduce_mean(mem_loss) / length_float
      total_loss = perp_loss + mem_loss
      gpu_losses[gpu].append(true_perp)

      # Gradients.
      if backward:
        data.print_out("Creating backward pass for the model.")
        grads = tf.gradients(total_loss, tf.trainable_variables(),
                             colocate_gradients_with_ops=True)
        for g_i, g in enumerate(grads):
          if isinstance(g, tf.IndexedSlices):
            grads[g_i] = tf.convert_to_tensor(g)
        grads, norm = tf.clip_by_global_norm(grads, max_grad_norm)
        gpu_grad_norms[gpu].append(norm)
        for g in grads:
          if grad_noise_scale > 0.001:
            g += tf.truncated_normal(tf.shape(g)) * self.noise_param
        grads_list.append(grads)
      else:
        gpu_grad_norms[gpu].append(0.0)
      data.print_out("Created model for gpu %d in %.2f s."
                     % (gpu, time.time() - start_time))

  self.updates = []
  self.after_enc_step = tf.concat(
      axis=0, values=self.after_enc_step)  # Concat GPUs.
  if backward:
    tf.get_variable_scope()._reuse = False
    tf.get_variable_scope().set_caching_device(None)
    grads = [gpu_avg([grads_list[g][i] for g in range(num_gpus)])
             for i in range(len(grads_list[0]))]
    update = adam_update(grads)
    self.updates.append(update)
  else:
    self.updates.append(tf.no_op())

  self.losses = [gpu_avg([gpu_losses[g][i] for g in range(num_gpus)])
                 for i in range(len(gpu_losses[0]))]
  self.out_idx = tf.concat(axis=0, values=gpu_out_idx)
  self.grad_norms = [gpu_avg([gpu_grad_norms[g][i] for g in range(num_gpus)])
                     for i in range(len(gpu_grad_norms[0]))]
  self.outputs = [tf.concat(axis=1, values=[gpu_outputs[g]
                                            for g in range(num_gpus)])]
  self.quantize_op = quantize_weights_op(512, 8)
  if backward:
    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)
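# A minimal sketch (not from the original) of the scheduled-sampling anneal
# built in __init__ above: the decay term halves every 100k steps but is
# floored at 0.05, so the rate rises from 5% of sampling_rate at step 0 to
# the full sampling_rate once the floor is reached (around step 430k).
def _sampling_rate_example(global_step, sampling_rate):
  decay = max(0.05, 0.5 ** (global_step / 100000.0))
  return sampling_rate * 0.05 / decay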
def __init__(self, nmaps, vec_size, niclass, noclass, dropout, rx_step,
             max_grad_norm, cutoff, nconvs, kw, kh, height, mode,
             learning_rate, pull, pull_incr, min_length, act_noise=0.0):
  # Feeds for parameters and ops to update them.
  self.global_step = tf.Variable(0, trainable=False)
  self.cur_length = tf.Variable(min_length, trainable=False)
  self.cur_length_incr_op = self.cur_length.assign_add(1)
  self.lr = tf.Variable(float(learning_rate), trainable=False)
  self.lr_decay_op = self.lr.assign(self.lr * 0.98)
  self.pull = tf.Variable(float(pull), trainable=False)
  self.pull_incr_op = self.pull.assign(self.pull * pull_incr)
  self.do_training = tf.placeholder(tf.float32, name="do_training")
  self.noise_param = tf.placeholder(tf.float32, name="noise_param")

  # Feeds for inputs, targets, outputs, losses, etc.
  self.input = []
  self.target = []
  for l in xrange(data_utils.forward_max + 1):
    self.input.append(tf.placeholder(tf.int32, name="inp{0}".format(l)))
    self.target.append(tf.placeholder(tf.int32, name="tgt{0}".format(l)))
  self.outputs = []
  self.losses = []
  self.grad_norms = []
  self.updates = []

  # Computation.
  inp0_shape = tf.shape(self.input[0])
  batch_size = inp0_shape[0]
  with tf.device("/cpu:0"):
    emb_weights = tf.get_variable(
        "embedding", [niclass, vec_size],
        initializer=tf.random_uniform_initializer(-1.7, 1.7))
    e0 = tf.scatter_update(emb_weights,
                           tf.constant(0, dtype=tf.int32, shape=[1]),
                           tf.zeros([1, vec_size]))

  adam = tf.train.AdamOptimizer(self.lr, epsilon=1e-4)

  # Main graph creation loop, for every bin in data_utils.
  self.steps = []
  for length in sorted(list(set(data_utils.bins + [data_utils.forward_max]))):
    data_utils.print_out("Creating model for bin of length %d." % length)
    start_time = time.time()
    if length > data_utils.bins[0]:
      tf.get_variable_scope().reuse_variables()

    # Embed inputs and calculate mask.
    with tf.device("/cpu:0"):
      with tf.control_dependencies([e0]):
        embedded = [tf.nn.embedding_lookup(emb_weights, self.input[l])
                    for l in xrange(length)]
      # Mask to 0-out padding space in each step.
      imask = [check_for_zero(self.input[l]) for l in xrange(length)]
      omask = [check_for_zero(self.target[l]) for l in xrange(length)]
      mask = [1.0 - (imask[i] * omask[i]) for i in xrange(length)]
      mask = [tf.reshape(m, [-1, 1]) for m in mask]
      # Use a shifted mask for step scaling and concatenated for weights.
      shifted_mask = mask + [tf.zeros_like(mask[0])]
      scales = [shifted_mask[i] * (1.0 - shifted_mask[i + 1])
                for i in xrange(length)]
      scales = [tf.reshape(s, [-1, 1, 1, 1]) for s in scales]
      mask = tf.concat(1, mask[0:length])  # batch x length
      weights = mask
      # Add a height dimension to mask to use later for masking.
      mask = tf.reshape(mask, [-1, length, 1, 1])
      mask = tf.concat(2, [mask for _ in xrange(height)]) + tf.zeros(
          tf.pack([batch_size, length, height, nmaps]), dtype=tf.float32)

    # Start is a length-list of batch-by-nmaps tensors, reshape and concat.
    start = [tf.tanh(embedded[l]) for l in xrange(length)]
    start = [tf.reshape(start[l], [-1, 1, nmaps]) for l in xrange(length)]
    start = tf.reshape(tf.concat(1, start), [-1, length, 1, nmaps])

    # First image comes from start by applying one convolution and adding 0s.
    first = conv_linear(start, 1, 1, vec_size, nmaps, True, 0.0, "input")
    first = [first] + [tf.zeros(tf.pack([batch_size, length, 1, nmaps]),
                                dtype=tf.float32)
                       for _ in xrange(height - 1)]
    first = tf.concat(2, first)

    # Computation steps.
    keep_prob = 1.0 - self.do_training * (dropout * 8.0 / float(length))
    step = [tf.nn.dropout(first, keep_prob) * mask]
    act_noise_scale = act_noise * self.do_training * self.pull
    outputs = []
    for it in xrange(length):
      with tf.variable_scope("RX%d" % (it % rx_step)) as vs:
        if it >= rx_step:
          vs.reuse_variables()
        cur = step[it]
        # Do nconvs-many CGRU steps.
        for layer in xrange(nconvs):
          cur = conv_gru([], cur, kw, kh, nmaps, cutoff, "cgru_%d" % layer)
          cur *= mask
        outputs.append(tf.slice(cur, [0, 0, 0, 0], [-1, -1, 1, -1]))
        cur = tf.nn.dropout(cur, keep_prob)
        if act_noise > 0.00001:
          cur += tf.truncated_normal(tf.shape(cur)) * act_noise_scale
        step.append(cur * mask)

    self.steps.append([tf.reshape(s, [-1, length, height * nmaps])
                       for s in step])
    # Output is the n-th step output; n = current length, as in scales.
    output = tf.add_n([outputs[i] * scales[i] for i in xrange(length)])
    # Final convolution to get logits, list outputs.
    output = conv_linear(output, 1, 1, nmaps, noclass, True, 0.0, "output")
    output = tf.reshape(output, [-1, length, noclass])
    external_output = [tf.reshape(o, [-1, noclass])
                       for o in list(tf.split(1, length, output))]
    external_output = [tf.nn.softmax(o) for o in external_output]
    self.outputs.append(external_output)

    # Calculate cross-entropy loss and normalize it.
    targets = tf.concat(1, [make_dense(self.target[l], noclass)
                            for l in xrange(length)])
    targets = tf.reshape(targets, [-1, noclass])
    xent = tf.reshape(tf.nn.softmax_cross_entropy_with_logits(
        tf.reshape(output, [-1, noclass]), targets), [-1, length])
    perp_loss = tf.reduce_sum(xent * weights)
    perp_loss /= tf.cast(batch_size, dtype=tf.float32)
    perp_loss /= length

    # Final loss: cross-entropy + shared parameter relaxation part.
    relax_dist, self.avg_op = relaxed_distance(rx_step)
    total_loss = perp_loss + relax_dist * self.pull
    self.losses.append(perp_loss)

    # Gradients and Adam update operation.
    if length == data_utils.bins[0] or (mode == 0 and
                                        length < data_utils.bins[-1] + 1):
      data_utils.print_out("Creating backward for bin of length %d." % length)
      params = tf.trainable_variables()
      grads = tf.gradients(total_loss, params)
      grads, norm = tf.clip_by_global_norm(grads, max_grad_norm)
      self.grad_norms.append(norm)
      for grad in grads:
        if isinstance(grad, tf.Tensor):
          grad += tf.truncated_normal(tf.shape(grad)) * self.noise_param
      update = adam.apply_gradients(zip(grads, params),
                                    global_step=self.global_step)
      self.updates.append(update)
    data_utils.print_out("Created model for bin of length %d in"
                         " %.2f s." % (length, time.time() - start_time))

  self.saver = tf.train.Saver(tf.all_variables())
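# A small numeric illustration (not from the original) of the step-selection
# trick above: with mask m_i = 1 while position i is inside the sequence and
# a zero appended, scales_i = m_i * (1 - m_{i+1}) is 1 exactly at the last
# in-sequence position, so tf.add_n(outputs[i] * scales[i]) picks the output
# produced after "sequence-length"-many computation steps.
_m = [1.0, 1.0, 1.0, 0.0]  # a length-3 sequence padded into a bin of 4
_shifted = _m + [0.0]
_scales = [_shifted[i] * (1.0 - _shifted[i + 1]) for i in range(4)]
assert _scales == [0.0, 0.0, 1.0, 0.0]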
def __init__(self, nmaps, vec_size, niclass, noclass, dropout, max_grad_norm,
             cutoff, nconvs, kw, kh, height, mem_size, learning_rate,
             min_length, num_gpus, num_replicas, grad_noise_scale,
             sampling_rate, act_noise=0.0, do_rnn=False, atrous=False,
             beam_size=1, backward=True, do_layer_norm=False,
             autoenc_decay=1.0):
  # Feeds for parameters and ops to update them.
  self.nmaps = nmaps
  if backward:
    self.global_step = tf.Variable(0, trainable=False, name="global_step")
    self.cur_length = tf.Variable(min_length, trainable=False)
    self.cur_length_incr_op = self.cur_length.assign_add(1)
    self.lr = tf.Variable(learning_rate, trainable=False)
    self.lr_decay_op = self.lr.assign(self.lr * 0.995)
  self.do_training = tf.placeholder(tf.float32, name="do_training")
  self.update_mem = tf.placeholder(tf.int32, name="update_mem")
  self.noise_param = tf.placeholder(tf.float32, name="noise_param")

  # Feeds for inputs, targets, outputs, losses, etc.
  self.input = tf.placeholder(tf.int32, name="inp")
  self.target = tf.placeholder(tf.int32, name="tgt")
  self.prev_step = tf.placeholder(tf.float32, name="prev_step")
  gpu_input = tf.split(axis=0, num_or_size_splits=num_gpus, value=self.input)
  gpu_target = tf.split(
      axis=0, num_or_size_splits=num_gpus, value=self.target)
  gpu_prev_step = tf.split(
      axis=0, num_or_size_splits=num_gpus, value=self.prev_step)
  batch_size = tf.shape(gpu_input[0])[0]

  if backward:
    adam_lr = 0.005 * self.lr
    adam = tf.train.AdamOptimizer(adam_lr, epsilon=1e-3)

    def adam_update(grads):
      return adam.apply_gradients(
          zip(grads, tf.trainable_variables()),
          global_step=self.global_step, name="adam_update")

  # When switching from Adam to SGD we perform reverse-decay.
  if backward:
    global_step_float = tf.cast(self.global_step, tf.float32)
    sampling_decay_exponent = global_step_float / 100000.0
    sampling_decay = tf.maximum(0.05, tf.pow(0.5, sampling_decay_exponent))
    self.sampling = sampling_rate * 0.05 / sampling_decay
  else:
    self.sampling = tf.constant(0.0)

  # Cache variables on cpu if needed.
  if num_replicas > 1 or num_gpus > 1:
    with tf.device("/cpu:0"):
      caching_const = tf.constant(0)
    tf.get_variable_scope().set_caching_device(caching_const.op.device)
    # partitioner = tf.variable_axis_size_partitioner(1024*256*4)
    # tf.get_variable_scope().set_partitioner(partitioner)

  def gpu_avg(l):
    if l[0] is None:
      for elem in l:
        assert elem is None
      return 0.0
    if len(l) < 2:
      return l[0]
    return sum(l) / float(num_gpus)

  self.length_tensor = tf.placeholder(tf.int32, name="length")

  with tf.device("/cpu:0"):
    emb_weights = tf.get_variable(
        "embedding", [niclass, vec_size],
        initializer=tf.random_uniform_initializer(-1.7, 1.7))
    if beam_size > 0:
      target_emb_weights = tf.get_variable(
          "target_embedding", [noclass, nmaps],
          initializer=tf.random_uniform_initializer(-1.7, 1.7))
    e0 = tf.scatter_update(emb_weights,
                           tf.constant(0, dtype=tf.int32, shape=[1]),
                           tf.zeros([1, vec_size]))
    output_w = tf.get_variable("output_w", [nmaps, noclass], tf.float32)

  def conv_rate(layer):
    if atrous:
      return 2**layer
    return 1

  # pylint: disable=cell-var-from-loop
  def enc_step(step):
    """Encoder step."""
    if autoenc_decay < 1.0:
      quant_step = autoenc_quantize(step, 16, nmaps, self.do_training)
      if backward:
        exp_glob = tf.train.exponential_decay(1.0, self.global_step - 10000,
                                              1000, autoenc_decay)
        dec_factor = 1.0 - exp_glob  # * self.do_training
        dec_factor = tf.cond(tf.less(self.global_step, 10500),
                             lambda: tf.constant(0.05), lambda: dec_factor)
      else:
        dec_factor = 1.0
      cur = tf.cond(tf.less(tf.random_uniform([]), dec_factor),
                    lambda: quant_step, lambda: step)
    else:
      cur = step
    if dropout > 0.0001:
      cur = tf.nn.dropout(cur, keep_prob)
    if act_noise > 0.00001:
      cur += tf.truncated_normal(tf.shape(cur)) * act_noise_scale
    # Do nconvs-many CGRU steps.
    if do_jit and tf.get_variable_scope().reuse:
      with jit_scope():
        for layer in xrange(nconvs):
          cur = conv_gru([], cur, kw, kh, nmaps, conv_rate(layer), cutoff,
                         "ecgru_%d" % layer, do_layer_norm)
    else:
      for layer in xrange(nconvs):
        cur = conv_gru([], cur, kw, kh, nmaps, conv_rate(layer), cutoff,
                       "ecgru_%d" % layer, do_layer_norm)
    return cur

  zero_tgt = tf.zeros([batch_size, nmaps, 1])
  zero_tgt.set_shape([None, nmaps, 1])

  def dec_substep(step, decided):
    """Decoder sub-step."""
    cur = step
    if dropout > 0.0001:
      cur = tf.nn.dropout(cur, keep_prob)
    if act_noise > 0.00001:
      cur += tf.truncated_normal(tf.shape(cur)) * act_noise_scale
    # Do nconvs-many CGRU steps.
    if do_jit and tf.get_variable_scope().reuse:
      with jit_scope():
        for layer in xrange(nconvs):
          cur = conv_gru([decided], cur, kw, kh, nmaps, conv_rate(layer),
                         cutoff, "dcgru_%d" % layer, do_layer_norm)
    else:
      for layer in xrange(nconvs):
        cur = conv_gru([decided], cur, kw, kh, nmaps, conv_rate(layer),
                       cutoff, "dcgru_%d" % layer, do_layer_norm)
    return cur
  # pylint: enable=cell-var-from-loop

  def dec_step(step, it, it_int, decided, output_ta, tgts,
               mloss, nupd_in, out_idx, beam_cost):
    """Decoder step."""
    nupd, mem_loss = 0, 0.0
    if mem_size > 0:
      it_incr = tf.minimum(it + 1, length - 1)
      mem, mem_loss, nupd = memory_run(
          step, nmaps, mem_size, batch_size, noclass, self.global_step,
          self.do_training, self.update_mem, 10, num_gpus,
          target_emb_weights, output_w, gpu_targets_tn, it_incr)
    step = dec_substep(step, decided)
    output_l = tf.expand_dims(tf.expand_dims(step[:, it, 0, :], 1), 1)
    # Calculate argmax output.
    output = tf.reshape(output_l, [-1, nmaps])
    # pylint: disable=cell-var-from-loop
    output = tf.matmul(output, output_w)
    if beam_size > 1:
      beam_cost, output, out, reordered = reorder_beam(
          beam_size, batch_size, beam_cost, output, it_int == 0,
          [output_l, out_idx, step, decided])
      [output_l, out_idx, step, decided] = reordered
    else:
      # Scheduled sampling.
      out = tf.multinomial(tf.stop_gradient(output), 1)
      out = tf.to_int32(tf.squeeze(out, [1]))
    out_write = output_ta.write(it, output_l[:batch_size, :, :, :])
    output = tf.gather(target_emb_weights, out)
    output = tf.reshape(output, [-1, 1, nmaps])
    output = tf.concat(axis=1, values=[output] * height)
    tgt = tgts[it, :, :, :]
    selected = tf.cond(tf.less(tf.random_uniform([]), self.sampling),
                       lambda: output, lambda: tgt)
    # pylint: enable=cell-var-from-loop
    dec_write = place_at14(decided, tf.expand_dims(selected, 1), it)
    out_idx = place_at13(
        out_idx, tf.reshape(out, [beam_size * batch_size, 1, 1]), it)
    if mem_size > 0:
      mem = tf.concat(axis=2, values=[mem] * height)
      dec_write = place_at14(dec_write, mem, it_incr)
    return (step, dec_write, out_write, mloss + mem_loss,
            nupd_in + nupd, out_idx, beam_cost)

  # Main model construction.
  gpu_outputs = []
  gpu_losses = []
  gpu_grad_norms = []
  grads_list = []
  gpu_out_idx = []
  self.after_enc_step = []
  for gpu in xrange(num_gpus):  # Multi-GPU towers, average gradients later.
    length = self.length_tensor
    length_float = tf.cast(length, tf.float32)
    if gpu > 0:
      tf.get_variable_scope().reuse_variables()
    gpu_outputs.append([])
    gpu_losses.append([])
    gpu_grad_norms.append([])
    with tf.name_scope("gpu%d" % gpu), tf.device("/gpu:%d" % gpu):
      # Main graph creation loop.
      data.print_out("Creating model.")
      start_time = time.time()

      # Embed inputs and calculate mask.
      with tf.device("/cpu:0"):
        tgt_shape = tf.shape(tf.squeeze(gpu_target[gpu], [1]))
        weights = tf.where(tf.squeeze(gpu_target[gpu], [1]) > 0,
                           tf.ones(tgt_shape), tf.zeros(tgt_shape))
        # Embed inputs and targets.
        with tf.control_dependencies([e0]):
          start = tf.gather(emb_weights, gpu_input[gpu])  # b x h x l x nmaps
          gpu_targets_tn = gpu_target[gpu]  # b x 1 x len
          if beam_size > 0:
            embedded_targets_tn = tf.gather(target_emb_weights,
                                            gpu_targets_tn)
            embedded_targets_tn = tf.transpose(
                embedded_targets_tn, [2, 0, 1, 3])  # len x b x 1 x nmaps
            embedded_targets_tn = tf.concat(
                axis=2, values=[embedded_targets_tn] * height)

      # First image comes from start by applying convolution and adding 0s.
      start = tf.transpose(start, [0, 2, 1, 3])  # Now b x len x h x vec_s
      first = conv_linear(start, 1, 1, vec_size, nmaps, 1, True, 0.0, "input")
      first = layer_norm(first, nmaps, "input")

      # Computation steps.
      keep_prob = dropout * 3.0 / tf.sqrt(length_float)
      keep_prob = 1.0 - self.do_training * keep_prob
      act_noise_scale = act_noise * self.do_training

      # Start with a convolutional gate merging previous step.
      step = conv_gru([gpu_prev_step[gpu]], first, kw, kh,
                      nmaps, 1, cutoff, "first", do_layer_norm)

      # This is just for running a baseline RNN seq2seq model.
      if do_rnn:
        self.after_enc_step.append(step)  # Not meaningful here, but needed.
        lstm_cell = tf.contrib.rnn.BasicLSTMCell(height * nmaps)
        cell = tf.contrib.rnn.MultiRNNCell([lstm_cell] * nconvs)
        with tf.variable_scope("encoder"):
          encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
              cell, tf.reshape(step, [batch_size, length, height * nmaps]),
              dtype=tf.float32, time_major=False)

        # Attention.
        attn = tf.layers.dense(encoder_outputs, height * nmaps, name="attn1")

        # pylint: disable=cell-var-from-loop
        @function.Defun(noinline=True)
        def attention_query(query, attn_v):
          vecs = tf.tanh(attn + tf.expand_dims(query, 1))
          mask = tf.reduce_sum(vecs * tf.reshape(attn_v, [1, 1, -1]), 2)
          mask = tf.nn.softmax(mask)
          return tf.reduce_sum(encoder_outputs * tf.expand_dims(mask, 2), 1)

        with tf.variable_scope("decoder"):

          def decoder_loop_fn((state, prev_cell_out, _),
                              (cell_inp, cur_tgt)):
            """Decoder loop function."""
            attn_q = tf.layers.dense(prev_cell_out, height * nmaps,
                                     name="attn_query")
            attn_res = attention_query(attn_q, tf.get_variable(
                "attn_v", [height * nmaps],
                initializer=tf.random_uniform_initializer(-0.1, 0.1)))
            concatenated = tf.reshape(
                tf.concat(axis=1, values=[cell_inp, attn_res]),
                [batch_size, 2 * height * nmaps])
            cell_inp = tf.layers.dense(concatenated, height * nmaps,
                                       name="attn_merge")
            output, new_state = cell(cell_inp, state)
            mem_loss = 0.0
            if mem_size > 0:
              res, mask, mem_loss = memory_call(
                  output, cur_tgt, height * nmaps, mem_size, noclass,
                  num_gpus, self.update_mem)
              res = tf.gather(target_emb_weights, res)
              res *= tf.expand_dims(mask[:, 0], 1)
              output = tf.layers.dense(
                  tf.concat(axis=1, values=[output, res]),
                  height * nmaps, name="rnnmem")
            return new_state, output, mem_loss
          # pylint: enable=cell-var-from-loop

          gpu_targets = tf.squeeze(gpu_target[gpu], [1])  # b x len
          gpu_tgt_trans = tf.transpose(gpu_targets, [1, 0])
          dec_zero = tf.zeros([batch_size, 1], dtype=tf.int32)
          dec_inp = tf.concat(axis=1, values=[dec_zero, gpu_targets])
          dec_inp = dec_inp[:, :length]
          embedded_dec_inp = tf.gather(target_emb_weights, dec_inp)
          embedded_dec_inp_proj = tf.layers.dense(
              embedded_dec_inp, height * nmaps, name="dec_proj")
          embedded_dec_inp_proj = tf.transpose(embedded_dec_inp_proj,
                                               [1, 0, 2])
          init_vals = (encoder_state,
                       tf.zeros([batch_size, height * nmaps]), 0.0)
          _, dec_outputs, mem_losses = tf.scan(
              decoder_loop_fn, (embedded_dec_inp_proj, gpu_tgt_trans),
              initializer=init_vals)
          mem_loss = tf.reduce_mean(mem_losses)
          outputs = tf.layers.dense(dec_outputs, nmaps, name="out_proj")
          # Final convolution to get logits, list outputs.
          outputs = tf.matmul(tf.reshape(outputs, [-1, nmaps]), output_w)
          outputs = tf.reshape(outputs, [length, batch_size, noclass])
          gpu_out_idx.append(tf.argmax(outputs, 2))
      else:  # Here we go with the Neural GPU.
        # Encoder.
        enc_length = length
        step = enc_step(step)  # First step hard-coded.
        # pylint: disable=cell-var-from-loop
        i = tf.constant(1)
        c = lambda i, _s: tf.less(i, enc_length)

        def enc_step_lambda(i, step):
          with tf.variable_scope(tf.get_variable_scope(), reuse=True):
            new_step = enc_step(step)
          return (i + 1, new_step)

        _, step = tf.while_loop(c, enc_step_lambda, [i, step],
                                parallel_iterations=1, swap_memory=True)
        # pylint: enable=cell-var-from-loop

        self.after_enc_step.append(step)

        # Decoder.
        if beam_size > 0:
          output_ta = tf.TensorArray(
              dtype=tf.float32, size=length, dynamic_size=False,
              infer_shape=False, name="outputs")
          out_idx = tf.zeros([beam_size * batch_size, length, 1],
                             dtype=tf.int32)
          decided_t = tf.zeros([beam_size * batch_size, length,
                                height, vec_size])

          # Prepare for beam search.
          tgts = tf.concat(axis=1, values=[embedded_targets_tn] * beam_size)
          beam_cost = tf.zeros([batch_size, beam_size])
          step = tf.concat(axis=0, values=[step] * beam_size)
          # First step hard-coded.
          step, decided_t, output_ta, mem_loss, nupd, oi, bc = dec_step(
              step, 0, 0, decided_t, output_ta, tgts, 0.0, 0,
              out_idx, beam_cost)
          tf.get_variable_scope().reuse_variables()

          # pylint: disable=cell-var-from-loop
          def step_lambda(i, step, dec_t, out_ta, ml, nu, oi, bc):
            with tf.variable_scope(tf.get_variable_scope(), reuse=True):
              s, d, t, nml, nu, oi, bc = dec_step(
                  step, i, 1, dec_t, out_ta, tgts, ml, nu, oi, bc)
            return (i + 1, s, d, t, nml, nu, oi, bc)

          i = tf.constant(1)
          c = lambda i, _s, _d, _o, _ml, _nu, _oi, _bc: tf.less(i, length)
          _, step, _, output_ta, mem_loss, nupd, out_idx, _ = tf.while_loop(
              c, step_lambda,
              [i, step, decided_t, output_ta, mem_loss, nupd, oi, bc],
              parallel_iterations=1, swap_memory=True)
          # pylint: enable=cell-var-from-loop

          gpu_out_idx.append(tf.squeeze(out_idx, [2]))
          outputs = output_ta.stack()
          outputs = tf.squeeze(outputs, [2, 3])  # Now l x b x nmaps
        else:
          # If beam_size is 0 or less, we don't have a decoder.
          mem_loss = 0.0
          outputs = tf.transpose(step[:, :, 1, :], [1, 0, 2])
          gpu_out_idx.append(tf.argmax(outputs, 2))

        # Final convolution to get logits, list outputs.
        outputs = tf.matmul(tf.reshape(outputs, [-1, nmaps]), output_w)
        outputs = tf.reshape(outputs, [length, batch_size, noclass])
      gpu_outputs[gpu] = tf.nn.softmax(outputs)

      # Calculate cross-entropy loss and normalize it.
      targets_soft = make_dense(tf.squeeze(gpu_target[gpu], [1]),
                                noclass, 0.1)
      targets_soft = tf.reshape(targets_soft, [-1, noclass])
      targets_hard = make_dense(tf.squeeze(gpu_target[gpu], [1]),
                                noclass, 0.0)
      targets_hard = tf.reshape(targets_hard, [-1, noclass])
      output = tf.transpose(outputs, [1, 0, 2])
      xent_soft = tf.reshape(tf.nn.softmax_cross_entropy_with_logits(
          logits=tf.reshape(output, [-1, noclass]),
          labels=targets_soft), [batch_size, length])
      xent_hard = tf.reshape(tf.nn.softmax_cross_entropy_with_logits(
          logits=tf.reshape(output, [-1, noclass]),
          labels=targets_hard), [batch_size, length])
      low, high = 0.1 / float(noclass - 1), 0.9
      const = high * tf.log(high) + float(noclass - 1) * low * tf.log(low)
      weight_sum = tf.reduce_sum(weights) + 1e-20
      true_perp = tf.reduce_sum(xent_hard * weights) / weight_sum
      soft_loss = tf.reduce_sum(xent_soft * weights) / weight_sum
      perp_loss = soft_loss + const

      # Final loss: cross-entropy + shared parameter relaxation part + extra.
      mem_loss = 0.5 * tf.reduce_mean(mem_loss) / length_float
      total_loss = perp_loss + mem_loss
      gpu_losses[gpu].append(true_perp)

      # Gradients.
      if backward:
        data.print_out("Creating backward pass for the model.")
        grads = tf.gradients(total_loss, tf.trainable_variables(),
                             colocate_gradients_with_ops=True)
        for g_i, g in enumerate(grads):
          if isinstance(g, tf.IndexedSlices):
            grads[g_i] = tf.convert_to_tensor(g)
        grads, norm = tf.clip_by_global_norm(grads, max_grad_norm)
        gpu_grad_norms[gpu].append(norm)
        for g in grads:
          if grad_noise_scale > 0.001:
            g += tf.truncated_normal(tf.shape(g)) * self.noise_param
        grads_list.append(grads)
      else:
        gpu_grad_norms[gpu].append(0.0)
      data.print_out("Created model for gpu %d in %.2f s."
                     % (gpu, time.time() - start_time))
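# A quick numeric check (not from the original) of the label-smoothing
# constant above, using math from the module top. With noclass = 5 we get
# high = 0.9 and low = 0.1 / 4 = 0.025, and const is the negative entropy of
# the smoothed target, so perp_loss = soft_loss + const is ~0 when the model
# predicts the smoothed distribution exactly.
_noclass = 5
_low, _high = 0.1 / float(_noclass - 1), 0.9
_const = _high * math.log(_high) + float(_noclass - 1) * _low * math.log(_low)
# _const ~= -0.4637, matching soft_loss ~= 0.4637 for a perfect fit.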
def train():
  """Train the model."""
  batch_size = FLAGS.batch_size
  tasks = FLAGS.task.split("-")
  with tf.Session() as sess:
    (model, min_length, max_length, checkpoint_dir,
     curriculum, _) = initialize(sess)
    quant_op = neural_gpu.quantize_weights_op(512, 8)
    max_cur_length = min(min_length + 3, max_length)
    prev_acc_perp = [1000000 for _ in xrange(3)]
    prev_seq_err = 1.0

    # Main training loop.
    while True:
      global_step, pull, max_cur_length, learning_rate = sess.run(
          [model.global_step, model.pull, model.cur_length, model.lr])
      acc_loss, acc_total, acc_errors, acc_seq_err = 0.0, 0, 0, 0
      acc_grad_norm, step_count, step_time = 0.0, 0, 0.0
      for _ in xrange(FLAGS.steps_per_checkpoint):
        global_step += 1
        task = random.choice(tasks)

        # Select the length for curriculum learning.
        l = np.random.randint(max_cur_length - min_length + 1) + min_length
        # Prefer longer stuff 60% of time.
        if np.random.randint(100) < 60:
          l1 = np.random.randint(max_cur_length - min_length + 1) + min_length
          l = max(l, l1)
        # Mixed curriculum learning: in 25% of cases go to any larger length.
        if np.random.randint(100) < 25:
          l1 = np.random.randint(max_length - min_length + 1) + min_length
          l = max(l, l1)

        # Run a step and time it.
        start_time = time.time()
        inp, target = data.get_batch(l, batch_size, True, task)
        noise_param = math.sqrt(math.pow(global_step, -0.55) *
                                prev_seq_err) * FLAGS.grad_noise_scale
        loss, res, gnorm, _ = model.step(sess, inp, target, True, noise_param)
        step_time += time.time() - start_time
        acc_grad_norm += float(gnorm)

        # Accumulate statistics only if we did not exceed curriculum length.
        if l < max_cur_length + 1:
          step_count += 1
          acc_loss += loss
          errors, total, seq_err = data.accuracy(inp, res, target,
                                                 batch_size, 0)
          acc_total += total
          acc_errors += errors
          acc_seq_err += seq_err

      # Normalize and print out accumulated statistics.
      acc_loss /= step_count
      step_time /= FLAGS.steps_per_checkpoint
      acc_seq_err = float(acc_seq_err) / (step_count * batch_size)
      prev_seq_err = max(0.0, acc_seq_err - 0.02)  # No noise at error < 2%.
      acc_errors = float(acc_errors) / acc_total if acc_total > 0 else 1.0
      msg1 = "step %d step-time %.2f" % (global_step, step_time)
      msg2 = "lr %.8f pull %.3f" % (learning_rate, pull)
      msg3 = ("%s %s grad-norm %.8f" %
              (msg1, msg2, acc_grad_norm / FLAGS.steps_per_checkpoint))
      data.print_out("%s len %d ppx %.8f errors %.2f sequence-errors %.2f" %
                     (msg3, max_cur_length, data.safe_exp(acc_loss),
                      100 * acc_errors, 100 * acc_seq_err))

      # If errors are below the curriculum threshold, move curriculum forward.
      if curriculum > acc_seq_err:
        if FLAGS.quantize:
          # Quantize weights.
          data.print_out("  Quantizing parameters.")
          sess.run([quant_op])
        # Increase current length (until the next with training data).
        do_incr = True
        while do_incr and max_cur_length < max_length:
          sess.run(model.cur_length_incr_op)
          for t in tasks:
            if data.train_set[t]:
              do_incr = False
        # Forget last perplexities if we're not yet at the end.
        if max_cur_length < max_length:
          prev_acc_perp.append(1000000)
        # Either increase pull or, if it's large, average parameters.
        if pull < 0.1:
          sess.run(model.pull_incr_op)
        else:
          data.print_out("  Averaging parameters.")
          sess.run(model.avg_op)
          if acc_seq_err < (curriculum / 3.0):
            sess.run(model.lr_decay_op)

      # Lower learning rate if we're worse than the last 3 checkpoints.
      acc_perp = data.safe_exp(acc_loss)
      if acc_perp > max(prev_acc_perp[-3:]):
        sess.run(model.lr_decay_op)
      prev_acc_perp.append(acc_perp)

      # Save checkpoint.
      checkpoint_path = os.path.join(checkpoint_dir, "neural_gpu.ckpt")
      model.saver.save(sess, checkpoint_path,
                       global_step=model.global_step)

      # Run evaluation.
      bound = data.bins[-1] + 1
      for t in tasks:
        l = min_length
        while l < max_length + EXTRA_EVAL and l < bound:
          _, seq_err, _ = single_test(l, model, sess, t, FLAGS.nprint,
                                      batch_size)
          l += 1
          while l < bound + 1 and not data.test_set[t][l]:
            l += 1
        if seq_err < 0.05:  # Run larger test if we're good enough.
          _, seq_err = multi_test(data.forward_max, model, sess, t,
                                  FLAGS.nprint, batch_size * 4)
      if seq_err < 0.01:  # Super-large test on 1-task large-forward models.
        if data.forward_max > 4000 and len(tasks) == 1:
          multi_test(data.forward_max, model, sess, tasks[0], FLAGS.nprint,
                     batch_size * 16, 0)
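# A minimal sketch (not from the original) of the evaluation ladder coded in
# train() above: every bin-sized length is tested, sequence error under 5%
# unlocks the data.forward_max test at 4x batch size, and under 1% the 16x
# "super-large" test, which only applies to single-task models with
# data.forward_max > 4000.
def _eval_ladder_example(seq_err, forward_max, num_tasks):
  run_large = seq_err < 0.05
  run_super = (run_large and seq_err < 0.01 and
               forward_max > 4000 and num_tasks == 1)
  return run_large, run_super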
def initialize(sess):
  """Initialize data and model."""
  if FLAGS.jobid >= 0:
    data.log_filename = os.path.join(FLAGS.train_dir, "log%d" % FLAGS.jobid)
  data.print_out("NN ", newline=False)

  # Set random seed.
  seed = FLAGS.random_seed + max(0, FLAGS.jobid)
  tf.set_random_seed(seed)
  random.seed(seed)
  np.random.seed(seed)

  # Check data sizes.
  assert data.bins
  min_length = 3
  max_length = min(FLAGS.max_length, data.bins[-1])
  assert max_length + 1 > min_length
  while len(data.bins) > 1 and data.bins[-2] > max_length + EXTRA_EVAL:
    data.bins = data.bins[:-1]
  assert data.bins[0] > FLAGS.rx_step
  data.forward_max = max(FLAGS.forward_max, data.bins[-1])
  nclass = min(FLAGS.niclass, FLAGS.noclass)
  data_size = FLAGS.train_data_size if FLAGS.mode == 0 else 1000

  # Initialize data for each task.
  tasks = FLAGS.task.split("-")
  for t in tasks:
    for l in xrange(max_length + EXTRA_EVAL - 1):
      data.init_data(t, l, data_size, nclass)
    data.init_data(t, data.bins[-2], data_size, nclass)
    data.init_data(t, data.bins[-1], data_size, nclass)
    end_size = 4 * 1024 if FLAGS.mode > 0 else 1024
    data.init_data(t, data.forward_max, end_size, nclass)

  # Print out parameters.
  curriculum = FLAGS.curriculum_bound
  msg1 = ("layers %d kw %d h %d kh %d relax %d batch %d noise %.2f task %s" %
          (FLAGS.nconvs, FLAGS.kw, FLAGS.height, FLAGS.kh, FLAGS.rx_step,
           FLAGS.batch_size, FLAGS.grad_noise_scale, FLAGS.task))
  msg2 = "data %d %s" % (FLAGS.train_data_size, msg1)
  msg3 = ("cut %.2f pull %.3f lr %.2f iw %.2f cr %.2f nm %d d%.4f gn %.2f %s" %
          (FLAGS.cutoff, FLAGS.pull_incr, FLAGS.lr, FLAGS.init_weight,
           curriculum, FLAGS.nmaps, FLAGS.dropout, FLAGS.max_grad_norm, msg2))
  data.print_out(msg3)

  # Create checkpoint directory if it does not exist.
  checkpoint_dir = os.path.join(
      FLAGS.train_dir,
      "neural_gpu%s" % ("" if FLAGS.jobid < 0 else str(FLAGS.jobid)))
  if not gfile.IsDirectory(checkpoint_dir):
    data.print_out("Creating checkpoint directory %s." % checkpoint_dir)
    gfile.MkDir(checkpoint_dir)

  # Create model and initialize it.
  tf.get_variable_scope().set_initializer(
      tf.uniform_unit_scaling_initializer(factor=1.8 * FLAGS.init_weight))
  model = neural_gpu.NeuralGPU(
      FLAGS.nmaps, FLAGS.nmaps, FLAGS.niclass, FLAGS.noclass, FLAGS.dropout,
      FLAGS.rx_step, FLAGS.max_grad_norm, FLAGS.cutoff, FLAGS.nconvs,
      FLAGS.kw, FLAGS.kh, FLAGS.height, FLAGS.mode, FLAGS.lr,
      FLAGS.pull, FLAGS.pull_incr, min_length + 3)
  data.print_out("Created model.")
  sess.run(tf.initialize_all_variables())
  data.print_out("Initialized variables.")

  # Load model from parameters if a checkpoint exists.
  ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
  if ckpt and gfile.Exists(ckpt.model_checkpoint_path):
    data.print_out("Reading model parameters from %s" %
                   ckpt.model_checkpoint_path)
    model.saver.restore(sess, ckpt.model_checkpoint_path)

  # Check if there are ensemble models and get their checkpoints.
  ensemble = []
  ensemble_dir_list = [d for d in FLAGS.ensemble.split(",") if d]
  for ensemble_dir in ensemble_dir_list:
    ckpt = tf.train.get_checkpoint_state(ensemble_dir)
    if ckpt and gfile.Exists(ckpt.model_checkpoint_path):
      data.print_out("Found ensemble model %s" % ckpt.model_checkpoint_path)
      ensemble.append(ckpt.model_checkpoint_path)

  # Return the model and needed variables.
  return (model, min_length, max_length, checkpoint_dir, curriculum, ensemble)
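# Editor's note: a tiny (non-original) illustration of the bin trimming in
# initialize() above. Bins past max_length + EXTRA_EVAL are dropped from the
# end, leaving at most one oversized bin for generalization tests. The
# concrete numbers below are made up for the example.
bins_example = [8, 16, 32, 64, 128]
limit = 22  # Stands in for max_length + EXTRA_EVAL.
while len(bins_example) > 1 and bins_example[-2] > limit:
  bins_example = bins_example[:-1]
assert bins_example == [8, 16, 32]  # 32 is the first bin exceeding the limit.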
def initialize(sess=None):
  """Initialize data and model."""
  global MAXLEN_F
  # Create training directory if it does not exist.
  if not tf.gfile.IsDirectory(FLAGS.train_dir):
    data.print_out("Creating training directory %s." % FLAGS.train_dir)
    tf.gfile.MkDir(FLAGS.train_dir)
  decode_suffix = "beam%dln%d" % (FLAGS.beam_size,
                                  int(100 * FLAGS.length_norm))
  if FLAGS.mode == 0:
    decode_suffix = ""
  if FLAGS.task >= 0:
    data.log_filename = os.path.join(FLAGS.train_dir,
                                     "log%d%s" % (FLAGS.task, decode_suffix))
  else:
    data.log_filename = os.path.join(FLAGS.train_dir, "neural_gpu/log")

  # Set random seed.
  if FLAGS.random_seed > 0:
    seed = FLAGS.random_seed + max(0, FLAGS.task)
    tf.set_random_seed(seed)
    random.seed(seed)
    np.random.seed(seed)

  # Check data sizes.
  assert data.bins
  max_length = min(FLAGS.max_length, data.bins[-1])
  while len(data.bins) > 1 and data.bins[-2] >= max_length + EXTRA_EVAL:
    data.bins = data.bins[:-1]
  if sess is None and FLAGS.task == 0 and FLAGS.num_replicas > 1:
    if max_length > 60:
      max_length = max_length * 1 / 2  # Save memory on chief.
  min_length = min(14, max_length - 3) if FLAGS.problem == "wmt" else 3
  for p in FLAGS.problem.split("-"):
    if p in ["progeval", "progsynth"]:
      min_length = max(26, min_length)
  assert max_length + 1 > min_length
  while len(data.bins) > 1 and data.bins[-2] >= max_length + EXTRA_EVAL:
    data.bins = data.bins[:-1]

  # Create checkpoint directory if it does not exist.
  if FLAGS.mode == 0 or FLAGS.task < 0:
    checkpoint_dir = os.path.join(
        FLAGS.train_dir,
        "neural_gpu%s" % ("" if FLAGS.task < 0 else str(FLAGS.task)))
  else:
    checkpoint_dir = FLAGS.train_dir
  if not tf.gfile.IsDirectory(checkpoint_dir):
    data.print_out("Creating checkpoint directory %s." % checkpoint_dir)
    tf.gfile.MkDir(checkpoint_dir)

  # Prepare data.
  if FLAGS.problem == "wmt":
    # Prepare WMT data.
    data.print_out("Preparing WMT data in %s" % FLAGS.data_dir)
    if FLAGS.simple_tokenizer:
      MAXLEN_F = 3.5
      (en_train, fr_train, en_dev, fr_dev,
       en_path, fr_path) = wmt.prepare_wmt_data(
           FLAGS.data_dir, FLAGS.vocab_size,
           tokenizer=wmt.space_tokenizer,
           normalize_digits=FLAGS.normalize_digits)
    else:
      (en_train, fr_train, en_dev, fr_dev,
       en_path, fr_path) = wmt.prepare_wmt_data(
           FLAGS.data_dir, FLAGS.vocab_size)

    # Read data into buckets and compute their sizes.
    fr_vocab, rev_fr_vocab = wmt.initialize_vocabulary(fr_path)
    data.vocab = fr_vocab
    data.rev_vocab = rev_fr_vocab
    data.print_out("Reading development and training data (limit: %d)." %
                   FLAGS.max_train_data_size)
    dev_set = {}
    dev_set["wmt"] = read_data(en_dev, fr_dev, data.bins)
    def data_read(size, print_out):
      read_data_into_global(en_train, fr_train, data.bins, size, print_out)
    data_read(50000, False)
    read_thread_small = threading.Thread(
        name="reading-data-small", target=lambda: data_read(900000, False))
    read_thread_small.start()
    read_thread_full = threading.Thread(
        name="reading-data-full",
        target=lambda: data_read(FLAGS.max_train_data_size, True))
    read_thread_full.start()
    data.print_out("Data reading set up.")
  else:
    # Prepare algorithmic data.
    en_path, fr_path = None, None
    tasks = FLAGS.problem.split("-")
    data_size = FLAGS.train_data_size
    for t in tasks:
      data.print_out("Generating data for %s." % t)
      if t in ["progeval", "progsynth"]:
        data.init_data(t, data.bins[-1], 20 * data_size, FLAGS.vocab_size)
        if len(program_utils.prog_vocab) > FLAGS.vocab_size - 2:
          raise ValueError("Increase vocab_size to %d for prog-tasks."
                           % (len(program_utils.prog_vocab) + 2))
        data.rev_vocab = program_utils.prog_vocab
        data.vocab = program_utils.prog_rev_vocab
      else:
        for l in xrange(max_length + EXTRA_EVAL - 1):
          data.init_data(t, l, data_size, FLAGS.vocab_size)
        data.init_data(t, data.bins[-2], data_size, FLAGS.vocab_size)
        data.init_data(t, data.bins[-1], data_size, FLAGS.vocab_size)
      if t not in global_train_set:
        global_train_set[t] = []
      global_train_set[t].append(data.train_set[t])
      calculate_buckets_scale(data.train_set[t], data.bins, t)
    dev_set = data.test_set

  # Grid-search parameters.
  lr = FLAGS.lr
  init_weight = FLAGS.init_weight
  max_grad_norm = FLAGS.max_grad_norm
  if sess is not None and FLAGS.task > -1:
    def job_id_factor(step):
      """If jobid / step mod 3 is 0, 1, 2: say 0, 1, -1."""
      return ((((FLAGS.task / step) % 3) + 1) % 3) - 1
    lr *= math.pow(2, job_id_factor(1))
    init_weight *= math.pow(1.5, job_id_factor(3))
    max_grad_norm *= math.pow(2, job_id_factor(9))

  # Print out parameters.
  curriculum = FLAGS.curriculum_seq
  msg1 = ("layers %d kw %d h %d kh %d batch %d noise %.2f" %
          (FLAGS.nconvs, FLAGS.kw, FLAGS.height, FLAGS.kh,
           FLAGS.batch_size, FLAGS.grad_noise_scale))
  msg2 = ("cut %.2f lr %.3f iw %.2f cr %.2f nm %d d%.4f gn %.2f %s" %
          (FLAGS.cutoff, lr, init_weight, curriculum,
           FLAGS.nmaps, FLAGS.dropout, max_grad_norm, msg1))
  data.print_out(msg2)

  # Create model and initialize it.
  tf.get_variable_scope().set_initializer(
      tf.orthogonal_initializer(gain=1.8 * init_weight))
  max_sampling_rate = FLAGS.max_sampling_rate if FLAGS.mode == 0 else 0.0
  o = FLAGS.vocab_size if FLAGS.max_target_vocab < 1 else FLAGS.max_target_vocab
  ngpu.CHOOSE_K = FLAGS.soft_mem_size
  do_beam_model = FLAGS.train_beam_freq > 0.0001 and FLAGS.beam_size > 1
  beam_size = FLAGS.beam_size if FLAGS.mode > 0 and not do_beam_model else 1
  beam_size = min(beam_size, FLAGS.beam_size)
  beam_model = None
  def make_ngpu(cur_beam_size, back):
    return ngpu.NeuralGPU(
        FLAGS.nmaps, FLAGS.vec_size, FLAGS.vocab_size, o,
        FLAGS.dropout, max_grad_norm, FLAGS.cutoff, FLAGS.nconvs,
        FLAGS.kw, FLAGS.kh, FLAGS.height, FLAGS.mem_size,
        lr / math.sqrt(FLAGS.num_replicas), min_length + 3,
        FLAGS.num_gpus, FLAGS.num_replicas, FLAGS.grad_noise_scale,
        max_sampling_rate, atrous=FLAGS.atrous, do_rnn=FLAGS.rnn_baseline,
        do_layer_norm=FLAGS.layer_norm, beam_size=cur_beam_size,
        backward=back)
  if sess is None:
    with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks)):
      model = make_ngpu(beam_size, True)
      if do_beam_model:
        tf.get_variable_scope().reuse_variables()
        beam_model = make_ngpu(FLAGS.beam_size, False)
  else:
    model = make_ngpu(beam_size, True)
    if do_beam_model:
      tf.get_variable_scope().reuse_variables()
      beam_model = make_ngpu(FLAGS.beam_size, False)

  sv = None
  if sess is None:
    # The supervisor configuration has a few overridden options.
    sv = tf.train.Supervisor(logdir=checkpoint_dir,
                             is_chief=(FLAGS.task < 1),
                             saver=model.saver,
                             summary_op=None,
                             save_summaries_secs=60,
                             save_model_secs=15 * 60,
                             global_step=model.global_step)
    config = tf.ConfigProto(allow_soft_placement=True)
    sess = sv.PrepareSession(FLAGS.master, config=config)
  data.print_out("Created model. Checkpoint dir %s" % checkpoint_dir)

  # Load model from parameters if a checkpoint exists.
  ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
  if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path + ".index"):
    data.print_out("Reading model parameters from %s" %
                   ckpt.model_checkpoint_path)
    model.saver.restore(sess, ckpt.model_checkpoint_path)
  elif sv is None:
    sess.run(tf.global_variables_initializer())
    data.print_out("Initialized variables (no supervisor mode).")
  elif FLAGS.task < 1 and FLAGS.mem_size > 0:
    # sess.run(model.mem_norm_op)
    data.print_out("Created new model and normalized mem (on chief).")

  # Return the model and needed variables.
  return (model, beam_model, min_length, max_length, checkpoint_dir,
          (global_train_set, dev_set, en_path, fr_path), sv, sess)
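# Editor's note: an illustrative (non-original) sketch of the grid search
# driven by job_id_factor above. Because the factor maps task / step mod 3
# to 0, 1, -1 for steps 1, 3 and 9, task ids 0..26 enumerate a 3x3x3 grid of
# (lr, init_weight, max_grad_norm) multipliers exactly once. grid_point is a
# hypothetical helper, not part of the original code; integer division is
# assumed (Python 2 semantics, as in the rest of this file).
def grid_point(task_id, lr, init_weight, max_grad_norm):
  """Return the hyperparameters this replica's task id selects."""
  def job_id_factor(step):
    return ((((task_id / step) % 3) + 1) % 3) - 1  # In {0, 1, -1}.
  return (lr * math.pow(2, job_id_factor(1)),
          init_weight * math.pow(1.5, job_id_factor(3)),
          max_grad_norm * math.pow(2, job_id_factor(9)))

# Example usage: tasks 0..26 cover all 27 combinations.
# for tid in xrange(27):
#   print(grid_point(tid, 0.1, 1.0, 4.0))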
def evaluate():
  """Evaluate an existing model."""
  batch_size = FLAGS.batch_size * FLAGS.num_gpus
  with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
    (model, beam_model, _, _, _,
     (_, dev_set, en_vocab_path, fr_vocab_path), _, sess) = initialize(sess)
    for p in FLAGS.problem.split("-"):
      for bin_id in xrange(len(data.bins)):
        if (FLAGS.task >= 0 and bin_id > 4) or (FLAGS.nprint == 0 and
                                                bin_id > 8 and p == "wmt"):
          break
        single_test(bin_id, model, sess, FLAGS.nprint, batch_size, dev_set, p,
                    beam_model=beam_model)
    path = FLAGS.test_file_prefix
    xid = "" if FLAGS.task < 0 else ("%.4d" %
                                     (FLAGS.task + FLAGS.decode_offset))
    en_path, fr_path = path + ".en" + xid, path + ".fr" + xid
    # Evaluate on the test files if they exist.
    if path and tf.gfile.Exists(en_path) and tf.gfile.Exists(fr_path):
      data.print_out("Translating test set %s" % en_path)
      # Read lines.
      en_lines, fr_lines = [], []
      with tf.gfile.GFile(en_path, mode="r") as f:
        for line in f:
          en_lines.append(line.strip())
      with tf.gfile.GFile(fr_path, mode="r") as f:
        for line in f:
          fr_lines.append(line.strip())
      # Tokenize and convert to ids.
      en_vocab, _ = wmt.initialize_vocabulary(en_vocab_path)
      _, rev_fr_vocab = wmt.initialize_vocabulary(fr_vocab_path)
      if FLAGS.simple_tokenizer:
        en_ids = [wmt.sentence_to_token_ids(
            l, en_vocab, tokenizer=wmt.space_tokenizer,
            normalize_digits=FLAGS.normalize_digits) for l in en_lines]
      else:
        en_ids = [wmt.sentence_to_token_ids(l, en_vocab) for l in en_lines]
      # Translate.
      results = []
      for idx, token_ids in enumerate(en_ids):
        if idx % 5 == 0:
          data.print_out("Translating example %d of %d." % (idx, len(en_ids)))
        # Which buckets does it belong to?
        buckets = [b for b in xrange(len(data.bins))
                   if data.bins[b] >= len(token_ids)]
        if buckets:
          result, result_cost = [], 100000000.0
          for bucket_id in buckets:
            if data.bins[bucket_id] > MAXLEN_F * len(token_ids) + EVAL_LEN_INCR:
              break
            # Get a 1-element batch to feed the sentence to the model.
            used_batch_size = 1  # batch_size
            inp, target = data.get_batch(
                bucket_id, used_batch_size, None, FLAGS.height,
                preset=([token_ids], [[]]))
            loss, output_logits, _, _ = model.step(
                sess, inp, target, None, beam_size=FLAGS.beam_size)
            outputs = [int(o[0]) for o in output_logits]
            loss = loss[0] - (data.bins[bucket_id] * FLAGS.length_norm)
            if FLAGS.simple_tokenizer:
              cur_out = outputs
              if wmt.EOS_ID in cur_out:
                cur_out = cur_out[:cur_out.index(wmt.EOS_ID)]
              res_tags = [rev_fr_vocab[o] for o in cur_out]
              bad_words, bad_brack = wmt.parse_constraints(token_ids, res_tags)
              loss += 1000.0 * bad_words + 100.0 * bad_brack
            # print (bucket_id, loss)
            if loss < result_cost:
              result = outputs
              result_cost = loss
          final = linearize(result, rev_fr_vocab)
          results.append("%s\t%s\n" % (final, fr_lines[idx]))
          # print result_cost
          sys.stderr.write(results[-1])
          sys.stderr.flush()
        else:
          sys.stderr.write("TOOO_LONG\t%s\n" % fr_lines[idx])
          sys.stderr.flush()
      if xid:
        decode_suffix = "beam%dln%dn" % (FLAGS.beam_size,
                                         int(100 * FLAGS.length_norm))
        with tf.gfile.GFile(path + ".res" + decode_suffix + xid,
                            mode="w") as f:
          for line in results:
            f.write(line)
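# Editor's note: a small (non-original) sketch of the bucket rescoring used
# in evaluate() above: every bucket large enough for the input is decoded,
# each candidate's loss is reduced by a length bonus proportional to the
# bucket size (FLAGS.length_norm), and the cheapest candidate wins. decode_fn
# is a hypothetical stand-in for the model.step call; maxlen_f and len_incr
# stand in for MAXLEN_F and EVAL_LEN_INCR.
def rescore_buckets(token_ids, bins, length_norm, maxlen_f, len_incr,
                    decode_fn):
  """Decode the input in each eligible bucket and keep the cheapest output."""
  best, best_cost = [], float("inf")
  for bucket_id in [b for b in xrange(len(bins)) if bins[b] >= len(token_ids)]:
    if bins[bucket_id] > maxlen_f * len(token_ids) + len_incr:
      break  # Stop once buckets get much longer than the input.
    loss, outputs = decode_fn(bucket_id, token_ids)
    cost = loss - bins[bucket_id] * length_norm  # Longer buckets get a bonus.
    if cost < best_cost:
      best, best_cost = outputs, cost
  return best, best_cost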