def decode():
    # Prepare NLC data.
    global reverse_vocab, vocab, lm

    if FLAGS.lmfile is not None:
        print("Loading Language model from %s" % FLAGS.lmfile)
        lm = kenlm.LanguageModel(FLAGS.lmfile)

    print("Preparing NLC data in %s" % FLAGS.data_dir)

    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size,
        tokenizer=get_tokenizer(FLAGS))
    vocab, reverse_vocab = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    with tf.Session() as sess:
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)

        while True:
            sent = raw_input("Enter a sentence: ")
            output_sent = fix_sent(model, sess, sent)
            print("Candidate: ", output_sent)
def decode():
    # Prepare NLC data.
    global reverse_vocab, vocab, lm

    if FLAGS.lmfile is not None:
        print("Loading Language model from %s" % FLAGS.lmfile)
        lm = kenlm.LanguageModel(FLAGS.lmfile)

    print("Preparing NLC data in %s" % FLAGS.data_dir)

    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size,
        tokenizer=get_tokenizer(FLAGS))
    vocab, reverse_vocab = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    # Run decoding on CPU only (hide all GPUs from the session).
    config = tf.ConfigProto(device_count={'GPU': 0})

    with tf.Session(config=config) as sess:
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)

        while True:
            sent = input("Enter a sentence: ")
            output_sent = fix_sent(model, sess, sent)
            print("Candidate: ", output_sent)
def minimize_language(args):
    tokenizer = util.get_tokenizer(args.tokenizer_name)
    if not args.splits:
        minimize_partition(args.filename, "conll", args, tokenizer)
    else:
        minimize_partition('dev', 'v4_gold_conll', args, tokenizer)
        minimize_partition('test', 'v4_gold_conll', args, tokenizer)
        minimize_partition('train', 'v4_gold_conll', args, tokenizer)
def __init__(self, config, language='english'):
    self.config = config
    self.language = language

    self.max_seg_len = config['max_segment_len']
    self.max_training_seg = config['max_training_sentences']
    self.data_dir = config['data_dir']

    self.tokenizer = util.get_tokenizer(config['bert_tokenizer_name'])
    self.tensor_samples, self.stored_info = None, None  # For dataset samples; lazy loading
def tokenize(sent, vocab, depth=FLAGS.num_layers):
    align = pow(2, depth - 1)
    token_ids = nlc_data.sentence_to_token_ids(sent, vocab, get_tokenizer(FLAGS))
    ones = [1] * len(token_ids)
    pad = (align - len(token_ids)) % align
    token_ids += [nlc_data.PAD_ID] * pad
    ones += [0] * pad
    source = np.array(token_ids).reshape([-1, 1])
    mask = np.array(ones).reshape([-1, 1])
    return source, mask
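# Illustrative sketch (not part of the original code) of the alignment padding
# that tokenize() performs: with a pyramid encoder of `depth` layers the time
# dimension is halved depth-1 times, so the sequence is padded with PAD_ID up
# to a multiple of 2**(depth-1). All values below are made up for the example.
depth = 3
align = pow(2, depth - 1)                  # 4: sequence length must align to 4
token_ids = [12, 7, 55, 9, 3]              # 5 hypothetical token ids
pad = (align - len(token_ids)) % align     # 3 padding positions needed
token_ids += [0] * pad                     # 0 standing in for nlc_data.PAD_ID
assert len(token_ids) % align == 0         # now 8 tokens, divisible by 4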
def decode():
    # Prepare NLC data.
    global reverse_vocab, vocab, lm

    if FLAGS.lmfile is not None:
        print("Loading Language model from %s" % FLAGS.lmfile)
        lm = kenlm.LanguageModel(FLAGS.lmfile)

    print("Preparing NLC data in %s" % FLAGS.data_dir)

    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size,
        tokenizer=get_tokenizer(FLAGS))
    vocab, reverse_vocab = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    with tf.Session() as sess:
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)

        # import codecs
        # outfile = open('predictions.txt', 'w')
        # outfile2 = open('predictions_all.txt', 'w')
        # outfile = open('lambda_train.txt', 'w')
        # infile = open(FLAGS.data_dir + '/' + FLAGS.tokenizer.lower() + '/test.y.txt')
        # lines = infile.readlines()
        # index = 0
        # with codecs.open(FLAGS.data_dir + '/' + FLAGS.tokenizer.lower() + '/test.x.txt', encoding='utf-8') as fr:
        # with codecs.open('ytc_test.txt', encoding='utf-8') as fr:
        #     for sent in fr:
        #         print("Original: ", sent.strip().encode('utf-8'))
        #         output_sent, all_sents, all_prob, all_lmscore = fix_sent(model, sess, sent.encode('utf-8'))
        #         print("Revised: ", output_sent)
        #         print('*' * 30)
        #         outfile.write(output_sent + '\n')
        #         outfile2.write('\t'.join(all_sents) + '\n')
        #         correct_sent = lines[index].strip('\n').strip('\r')
        #         for i in range(len(all_sents)):
        #             if all_sents[i] == correct_sent:
        #                 outfile.write('10 qid:' + str(index) + ' 1:' + str(all_prob[i]) + ' 2:' + str(all_lmscore[i]) + ' #' + all_sents[i] + '\n')
        #             else:
        #                 outfile.write('0 qid:' + str(index) + ' 1:' + str(all_prob[i]) + ' 2:' + str(all_lmscore[i]) + ' #' + all_sents[i] + '\n')
        #         index += 1

        while True:
            sent = raw_input("Enter a sentence: ")
            output_sent, _, _, _ = fix_sent(model, sess, sent)
            print("Candidate: ", output_sent)
def setup_batch_decode(sess):
    # decode for dev-sets, in batches
    global reverse_vocab, vocab, lm

    if FLAGS.lmfile is not None:
        print("Loading Language model from %s" % FLAGS.lmfile)
        lm = kenlm.LanguageModel(FLAGS.lmfile)

    print("Preparing NLC data in %s" % FLAGS.data_dir)

    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size,
        tokenizer=get_tokenizer(FLAGS),
        other_dev_path="/deep/group/nlp_data/nlc_data/ourdev/bpe")
    vocab, reverse_vocab = nlc_data.initialize_vocabulary(
        vocab_path, bpe=(FLAGS.tokenizer.lower() == "bpe"))
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
    model = create_model(sess, vocab_size, False)

    return model, x_dev, y_dev
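# Hypothetical driver (not in the source) showing one way setup_batch_decode()
# might be called. It assumes prepare_nlc_data() returns file paths for the dev
# split and that fix_sent() takes (model, sess, sentence) and returns a single
# corrected string, as in the decode() variants above.
if __name__ == '__main__':
    with tf.Session() as sess:
        model, x_dev, y_dev = setup_batch_decode(sess)
        with codecs.open(x_dev, encoding='utf-8') as fr:
            for sent in fr:
                print("Candidate: ", fix_sent(model, sess, sent.strip()))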
def decode():
    # Prepare NLC data.
    global reverse_vocab, vocab, lm

    if FLAGS.lmfile is not None:
        print("Loading Language model from %s" % FLAGS.lmfile)
        lm = kenlm.LanguageModel(FLAGS.lmfile)

    print("Preparing NLC data in %s" % FLAGS.data_dir)

    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size,
        tokenizer=get_tokenizer(FLAGS))
    vocab, reverse_vocab = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    with tf.Session() as sess:
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)

        if FLAGS.interactive:
            while True:
                sent = raw_input("Enter a sentence: ")
                if sent == 'exit':
                    exit(0)
                output_sent = fix_sent(model, sess, sent.decode('utf-8'))
                print("Candidate: ", output_sent)
        else:
            test_x_data = os.path.join(FLAGS.data_dir, FLAGS.tokenizer.lower() + '/test.x.txt')
            if not os.path.exists(test_x_data):
                print("Please provide {} to test.".format(test_x_data))
                exit(-1)
            with codecs.open(test_x_data, encoding='utf-8') as fr:
                for sent in fr:
                    print("Original: ", sent.strip().encode('utf-8'))
                    output_sent = fix_sent(model, sess, sent)  # was the undefined name `sent1`
                    print("Revised: ", output_sent.encode('utf-8'))
                    print('*' * 30)
def minimize_language(args):
    tokenizer = util.get_tokenizer(args.tokenizer_name)

    minimize_partition('dev', 'v4_gold_conll', args, tokenizer)
    minimize_partition('test', 'v4_gold_conll', args, tokenizer)
    minimize_partition('train', 'v4_gold_conll', args, tokenizer)
                    nargs='?', default=256)
parser.add_argument('-ep', '--epochs', type=int, nargs='?', default=7)
args = parser.parse_args()
batch_size = args.batch_size
epochs = args.epochs

# 1. data process
logger.info("start train...")
x_train, y_train, x_dev, y_dev, x_test, y_test = get_x_and_y_for_train_dev_test()
tokenizer = util.get_tokenizer(num_words=constant.MAX_NUM_WORDS,
                               texts=list(x_train) + list(x_dev))
train_sequences, dev_sequences, test_sequences = util.get_train_dev_test_sequences(
    tokenizer, x_train, x_dev, x_test)
padded_train_sequences, padded_dev_sequences, padded_test_sequences = util.get_train_dev_test_padded_sequences(
    maxlen=constant.MAX_LEN,
    train_sequences=train_sequences,
    dev_sequences=dev_sequences,
    test_sequences=test_sequences)

# 2. embedding_matrix
from config import glove_embedding_data_path
num_words = constant.MAX_NUM_WORDS
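# A minimal sketch (an assumption, not the project's actual util module) of what
# the util.get_tokenizer / get_train_dev_test_sequences / padded_sequences helpers
# above presumably wrap: the standard Keras Tokenizer plus pad_sequences.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def build_tokenizer(num_words, texts):
    # Fit a word-index tokenizer on the training + dev texts.
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(texts)
    return tokenizer

def to_padded(tokenizer, texts, maxlen):
    # Convert raw texts to integer sequences, then pad/truncate to a fixed length.
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxlen)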
def train(): """Train a translation model using NLC data.""" # Prepare NLC data. logging.info("Preparing NLC data in %s" % FLAGS.data_dir) x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data( FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size, tokenizer=get_tokenizer(FLAGS)) vocab, _ = nlc_data.initialize_vocabulary(vocab_path) vocab_size = len(vocab) logging.info("Vocabulary size: %d" % vocab_size) if not os.path.exists(FLAGS.train_dir): os.makedirs(FLAGS.train_dir) file_handler = logging.FileHandler("{0}/log.txt".format(FLAGS.train_dir)) logging.getLogger().addHandler(file_handler) print(vars(FLAGS)) with open(os.path.join(FLAGS.train_dir, "flags.json"), 'w') as fout: json.dump(FLAGS.__flags, fout) with tf.Session() as sess: logging.info("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size)) model = create_model(sess, vocab_size, False) logging.info('Initial validation cost: %f' % validate(model, sess, x_dev, y_dev)) if False: tic = time.time() params = tf.trainable_variables() num_params = sum( map(lambda t: np.prod(tf.shape(t.value()).eval()), params)) toc = time.time() print("Number of params: %d (retreival took %f secs)" % (num_params, toc - tic)) epoch = 0 best_epoch = 0 previous_losses = [] exp_cost = None exp_length = None exp_norm = None total_iters = 0 start_time = time.time() while (FLAGS.epochs == 0 or epoch < FLAGS.epochs): epoch += 1 current_step = 0 ## Train epoch_tic = time.time() for source_tokens, source_mask, target_tokens, target_mask in pair_iter( x_train, y_train, FLAGS.batch_size, FLAGS.num_layers): # Get a batch and make a step. tic = time.time() grad_norm, cost, param_norm = model.train( sess, source_tokens, source_mask, target_tokens, target_mask) toc = time.time() iter_time = toc - tic total_iters += np.sum(target_mask) tps = total_iters / (time.time() - start_time) current_step += 1 lengths = np.sum(target_mask, axis=0) mean_length = np.mean(lengths) std_length = np.std(lengths) if not exp_cost: exp_cost = cost exp_length = mean_length exp_norm = grad_norm else: exp_cost = 0.99 * exp_cost + 0.01 * cost exp_length = 0.99 * exp_length + 0.01 * mean_length exp_norm = 0.99 * exp_norm + 0.01 * grad_norm cost = cost / mean_length if current_step % FLAGS.print_every == 0: logging.info( 'epoch %d, iter %d, cost %f, exp_cost %f, grad norm %f, param norm %f, tps %f, length mean/std %f/%f' % (epoch, current_step, cost, exp_cost / exp_length, grad_norm, param_norm, tps, mean_length, std_length)) epoch_toc = time.time() ## Checkpoint checkpoint_path = os.path.join(FLAGS.train_dir, "best.ckpt") ## Validate valid_cost = validate(model, sess, x_dev, y_dev) logging.info("Epoch %d Validation cost: %f time: %f" % (epoch, valid_cost, epoch_toc - epoch_tic)) if len(previous_losses) > 2 and valid_cost > previous_losses[-1]: logging.info("Annealing learning rate by %f" % FLAGS.learning_rate_decay_factor) sess.run(model.learning_rate_decay_op) model.saver.restore(sess, checkpoint_path + ("-%d" % best_epoch)) else: previous_losses.append(valid_cost) best_epoch = epoch model.saver.save(sess, checkpoint_path, global_step=epoch) sys.stdout.flush()
def train(): """Train a translation model using NLC data.""" # Prepare NLC data. logging.info("Preparing NLC data in %s" % FLAGS.data_dir) x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data( FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size, tokenizer=get_tokenizer(FLAGS)) vocab, _ = nlc_data.initialize_vocabulary(vocab_path) vocab_size = len(vocab) logging.info("Vocabulary size: %d" % vocab_size) if not os.path.exists(FLAGS.train_dir): os.makedirs(FLAGS.train_dir) file_handler = logging.FileHandler("{0}/log.txt".format(FLAGS.train_dir)) logging.getLogger().addHandler(file_handler) print(vars(FLAGS)) with open(os.path.join(FLAGS.train_dir, "flags.json"), 'w') as fout: json.dump(FLAGS.__flags, fout) with tf.Session() as sess: logging.info("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size)) model = create_model(sess, vocab_size, False) logging.info('Initial validation cost: %f' % validate(model, sess, x_dev, y_dev)) if False: tic = time.time() params = tf.trainable_variables() num_params = sum(map(lambda t: np.prod(tf.shape(t.value()).eval()), params)) toc = time.time() print ("Number of params: %d (retreival took %f secs)" % (num_params, toc - tic)) epoch = 0 best_epoch = 0 previous_losses = [] exp_cost = None exp_length = None exp_norm = None while (FLAGS.epochs == 0 or epoch < FLAGS.epochs): epoch += 1 current_step = 0 ## Train epoch_tic = time.time() for source_tokens, source_mask, target_tokens, target_mask in pair_iter(x_train, y_train, FLAGS.batch_size, FLAGS.num_layers): # Get a batch and make a step. tic = time.time() grad_norm, cost, param_norm = model.train(sess, source_tokens, source_mask, target_tokens, target_mask) toc = time.time() iter_time = toc - tic current_step += 1 lengths = np.sum(target_mask, axis=0) mean_length = np.mean(lengths) std_length = np.std(lengths) if not exp_cost: exp_cost = cost exp_length = mean_length exp_norm = grad_norm else: exp_cost = 0.99*exp_cost + 0.01*cost exp_length = 0.99*exp_length + 0.01*mean_length exp_norm = 0.99*exp_norm + 0.01*grad_norm cost = cost / mean_length if current_step % FLAGS.print_every == 0: logging.info('epoch %d, iter %d, cost %f, exp_cost %f, grad norm %f, param norm %f, batch time %f, length mean/std %f/%f' % (epoch, current_step, cost, exp_cost / exp_length, grad_norm, param_norm, iter_time, mean_length, std_length)) epoch_toc = time.time() ## Checkpoint checkpoint_path = os.path.join(FLAGS.train_dir, "best.ckpt") ## Validate valid_cost = validate(model, sess, x_dev, y_dev) logging.info("Epoch %d Validation cost: %f time: %f" % (epoch, valid_cost, epoch_toc - epoch_tic)) if len(previous_losses) > 2 and valid_cost > previous_losses[-1]: logging.info("Annealing learning rate by %f" % FLAGS.learning_rate_decay_factor) sess.run(model.learning_rate_decay_op) model.saver.restore(sess, checkpoint_path + ("-%d" % best_epoch)) else: previous_losses.append(valid_cost) best_epoch = epoch model.saver.save(sess, checkpoint_path, global_step=epoch) sys.stdout.flush()
def train(): """Train a translation model using NLC data.""" # Prepare NLC data. logging.info("Preparing NLC data in %s" % FLAGS.data_dir) x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data( FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size, tokenizer=get_tokenizer(FLAGS)) vocab, _ = nlc_data.initialize_vocabulary(vocab_path) vocab_size = len(vocab) logging.info("Vocabulary size: %d" % vocab_size) if not os.path.exists(FLAGS.train_dir): os.makedirs(FLAGS.train_dir) file_handler = logging.FileHandler("{0}/log.txt".format(FLAGS.train_dir)) logging.getLogger().addHandler(file_handler) print(vars(FLAGS)) with open(os.path.join(FLAGS.train_dir, "flags.json"), 'w') as fout: json.dump(FLAGS.__flags, fout) with tf.Session() as sess: logging.info("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size)) model = create_model(sess, vocab_size, False) tic = time.time() params = tf.trainable_variables() num_params = sum( map(lambda t: np.prod(tf.shape(t.value()).eval()), params)) toc = time.time() print("Number of params: %d (retreival took %f secs)" % (num_params, toc - tic)) epoch = 0 best_epoch = 0 train_costs = [] valid_costs = [] previous_valid_losses = [] while (FLAGS.epochs == 0 or epoch < FLAGS.epochs): epoch += 1 current_step = 0 epoch_cost = 0 epoch_tic = time.time() for source_tokens, source_mask, target_tokens, target_mask in pair_iter( x_train, y_train, FLAGS.batch_size, FLAGS.num_layers): # Get a batch and make a step. grad_norm, cost, param_norm = model.train( sess, source_tokens, source_mask, target_tokens, target_mask) lengths = np.sum(target_mask, axis=0) mean_length = np.mean(lengths) std_length = np.std(lengths) cost = cost / mean_length epoch_cost += cost current_step += 1 if current_step % FLAGS.print_every == 0: logging.info( 'epoch %d, iter %d, cost %f, length mean/std %f/%f' % (epoch, current_step, cost, mean_length, std_length)) # One epoch average train cost train_costs.append(epoch_cost / current_step) # After one epoch average validate cost epoch_toc = time.time() epoch_time = epoch_toc - epoch_tic valid_cost = validate(model, sess, x_dev, y_dev) valid_costs.append(valid_cost) logging.info("Epoch %d Validation cost: %f time: %2fs" % (epoch, valid_cost, epoch_time)) # Checkpoint checkpoint_path = os.path.join(FLAGS.train_dir, "best.ckpt") if len(previous_valid_losses ) > 2 and valid_cost > previous_valid_losses[-1]: logging.info("Annealing learning rate by %f" % FLAGS.learning_rate_decay_factor) sess.run(model.learning_rate_decay_op) model.saver.restore(sess, checkpoint_path + ("-%d" % best_epoch)) else: previous_valid_losses.append(valid_cost) best_epoch = epoch model.saver.save(sess, checkpoint_path, global_step=epoch) import cPickle as pickle pickle.dump([train_costs, valid_costs], open('costs_data.pkl', 'wb'))