def decode():
    # Prepare NLC data.
    global reverse_vocab, vocab, lm

    if FLAGS.lmfile is not None:
        print("Loading Language model from %s" % FLAGS.lmfile)
        lm = kenlm.Model(FLAGS.lmfile)
    else:
        print("No lmfile provided; consider supplying a kenlm ARPA language model file")

    print("Preparing NLC data in %s" % FLAGS.data_dir)
    x_train, y_train, x_dev, y_dev, vocab_path = prepare_nlc_data(
        FLAGS.data_dir, FLAGS.max_vocab_size, tokenizer=get_tokenizer())

    vocab, reverse_vocab = initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    with tf.Session() as sess:
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)

        while True:
            sent = input("Enter a sentence: ")
            output_sent = fix_sent(model, sess, sent)
            print("Candidate: ", output_sent)
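# A minimal sketch of how the loaded kenlm model can be queried, assuming the
# standard kenlm Python bindings; fix_sent presumably uses the language model to
# rescore candidate corrections. Kept as a comment so the module's behaviour is
# unchanged, and the ARPA path below is purely illustrative:
#
#     lm = kenlm.Model("lm.arpa")
#     log10_prob = lm.score("this is a test", bos=True, eos=True)
#     ppl = lm.perplexity("this is a test")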
def tokenize(sent, vocab, depth=FLAGS.num_layers):
    # Pad the token sequence to a multiple of 2^(depth - 1) and build a matching
    # 0/1 mask (1 for real tokens, 0 for padding).
    align = pow(2, depth - 1)
    token_ids = sentence_to_token_ids(sent, vocab, get_tokenizer())
    ones = [1] * len(token_ids)
    pad = (align - len(token_ids)) % align
    token_ids += [PAD_ID] * pad
    ones += [0] * pad
    source = np.array(token_ids).reshape([-1, 1])
    mask = np.array(ones).reshape([-1, 1])
    return source, mask
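# Illustration of the padding arithmetic above (a sketch, not part of the module's
# API): with depth = 3, align = 2^(3-1) = 4, so a 6-token sentence gets
# pad = (4 - 6) % 4 = 2 PAD_ID tokens appended and a mask of
# [1, 1, 1, 1, 1, 1, 0, 0]; source and mask both come back with shape
# (len(token_ids) + pad, 1).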
def setup_batch_decode(sess):
    # decode for dev-sets, in batches
    global reverse_vocab, vocab, lm

    if FLAGS.lmfile is not None:
        print("Loading Language model from %s" % FLAGS.lmfile)
        lm = kenlm.LanguageModel(FLAGS.lmfile)

    print("Preparing NLC data in %s" % FLAGS.data_dir)
    x_train, y_train, x_dev, y_dev, vocab_path = prepare_nlc_data(
        FLAGS.data_dir, FLAGS.max_vocab_size, tokenizer=get_tokenizer())
    # , other_dev_path="/deep/group/nlp_data/nlc_data/ourdev/bpe")

    vocab, reverse_vocab = initialize_vocabulary(vocab_path,
                                                 bpe=(FLAGS.tokenizer.lower() == "bpe"))
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
    model = create_model(sess, vocab_size, False)
    return model, x_dev, y_dev
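# Hedged usage sketch for setup_batch_decode (the batch decoding loop itself is
# not defined in this section, so the iteration below is only indicative):
#
#     with tf.Session() as sess:
#         model, x_dev, y_dev = setup_batch_decode(sess)
#         # ... iterate over (x_dev, y_dev) in batches and decode each batch ...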
def train():
    """Train a translation model using NLC data."""
    # Prepare NLC data.
    logging.info("Preparing NLC data in %s" % FLAGS.data_dir)
    x_train, y_train, x_dev, y_dev, vocab_path = prepare_nlc_data(
        FLAGS.data_dir, FLAGS.max_vocab_size, tokenizer=get_tokenizer())
    vocab, _ = initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    logging.info("Vocabulary size: %d" % vocab_size)

    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    file_handler = logging.FileHandler("{0}/log.txt".format(FLAGS.train_dir))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.train_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        logging.info("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)

        logging.info('Initial validation cost: %f' % validate(model, sess, x_dev, y_dev))

        epoch = 0
        previous_losses = []
        exp_cost = None
        exp_length = None
        exp_norm = None
        total_iters = 0
        start_time = time.time()

        while FLAGS.epochs == 0 or epoch < FLAGS.epochs:
            epoch += 1
            current_step = 0

            # Train
            epoch_tic = time.time()
            for source_tokens, source_mask, target_tokens, target_mask in pair_iter(
                    x_train, y_train, FLAGS.batch_size, FLAGS.num_layers):
                # Get a batch and make a step.
                grad_norm, cost, param_norm = model.train(
                    sess, source_tokens, source_mask, target_tokens, target_mask)

                total_iters += np.sum(target_mask)
                tps = total_iters / (time.time() - start_time)
                current_step += 1

                lengths = np.sum(target_mask, axis=0)
                mean_length = np.mean(lengths)
                std_length = np.std(lengths)

                if not exp_cost:
                    exp_cost = cost
                    exp_length = mean_length
                    exp_norm = grad_norm
                else:
                    exp_cost = 0.99 * exp_cost + 0.01 * cost
                    exp_length = 0.99 * exp_length + 0.01 * mean_length
                    exp_norm = 0.99 * exp_norm + 0.01 * grad_norm

                cost = cost / mean_length

                if current_step % FLAGS.print_every == 0:
                    logging.info(
                        'epoch %d, iter %d, cost %f, exp_cost %f, grad norm %f, param norm %f, '
                        'tps %f, length mean/std %f/%f' %
                        (epoch, current_step, cost, exp_cost / exp_length, grad_norm,
                         param_norm, tps, mean_length, std_length))
            epoch_toc = time.time()

            # Checkpoint
            checkpoint_path = os.path.join(FLAGS.train_dir, "best.ckpt")

            # Validate
            valid_cost = validate(model, sess, x_dev, y_dev)

            logging.info("Epoch %d Validation cost: %f time: %f" %
                         (epoch, valid_cost, epoch_toc - epoch_tic))

            if len(previous_losses) > 2 and valid_cost > previous_losses[-1]:
                logging.info("Annealing learning rate by %f" % FLAGS.learning_rate_decay_factor)
                sess.run(model.learning_rate_decay_op)
                model.saver.restore(sess, checkpoint_path)
            else:
                previous_losses.append(valid_cost)
                model.saver.save(sess, checkpoint_path)
            sys.stdout.flush()