def run_eval(dataset, hps, logdir, mode, num_eval_steps):
    with tf.variable_scope('model'):
        hps.num_sampled = 0  # Always using full softmax at evaluation.
        hps.keep_prob = 1.0
        model = LM(hps, 'eval', '/cpu:0')

    if hps.average_params:
        print('Averaging parameters for evaluation.')
        saver = tf.train.Saver(model.avg_dict)
    else:
        saver = tf.train.Saver()

    # Use only 4 threads for the evaluation.
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=4,
                            inter_op_parallelism_threads=1)
    sess = tf.Session(config=config)
    sw = tf.summary.FileWriter(logdir + '/' + mode, sess.graph)
    ckpt_loader = CheckpointLoader(saver, model.global_step, logdir + '/train')

    with sess.as_default():
        while ckpt_loader.load_checkpoint():
            global_step = ckpt_loader.last_global_step
            data_iterator = dataset.iterate_once(
                hps.batch_size * hps.num_gpus, hps.num_steps)
            tf.initialize_local_variables().run()
            for v in tf.get_collection('initial_state'):
                sess.run(v.initializer,
                         feed_dict={model.batch_size: hps.batch_size})
            loss_nom = 0.0
            loss_den = 0.0
            for i, (x, y, w) in enumerate(data_iterator):
                if i >= num_eval_steps:
                    break
                loss = sess.run(model.loss, {
                    model.x: x,
                    model.y: y,
                    model.w: w,
                    model.batch_size: hps.batch_size})
                loss_nom += loss
                loss_den += w.mean()
                loss = loss_nom / loss_den
                sys.stdout.write('%d: %.3f (%.3f) ... ' % (
                    i, loss, np.exp(loss)))
                sys.stdout.flush()
            sys.stdout.write('\n')

            log_perplexity = loss_nom / loss_den
            print('Results at %d: log_perplexity = %.3f perplexity = %.3f' % (
                global_step, log_perplexity, np.exp(log_perplexity)))

            summary = tf.Summary()
            summary.value.add(
                tag='eval/log_perplexity', simple_value=log_perplexity)
            summary.value.add(
                tag='eval/perplexity', simple_value=np.exp(log_perplexity))
            sw.add_summary(summary, global_step)
            sw.flush()
def run_eval(dataset, hps, logdir, mode, num_eval_steps): with tf.variable_scope("model"): hps.num_sampled = 0 # Always using full softmax at evaluation. hps.keep_prob = 1.0 #model = LM(hps, "eval", "/cpu:0") model = LM(hps, "eval", "/gpu:0") if hps.average_params: print("Averaging parameters for evaluation.") saver = tf.train.Saver(model.avg_dict) else: saver = tf.train.Saver() # Use only 4 threads for the evaluation. #config = tf.ConfigProto(allow_soft_placement=True, # intra_op_parallelism_threads=20, # inter_op_parallelism_threads=1) config = tf.ConfigProto(allow_soft_placement=True) sess = tf.Session(config=config) sw = tf.summary.FileWriter(logdir + "/" + mode, sess.graph) ckpt_loader = CheckpointLoader(saver, model.global_step, logdir + "/train") with sess.as_default(): while ckpt_loader.load_checkpoint(): global_step = ckpt_loader.last_global_step data_iterator = dataset.iterate_once(hps.batch_size * hps.num_gpus, hps.num_steps) #tf.initialize_local_variables().run() tf.local_variables_initializer().run() loss_nom = 0.0 loss_den = 0.0 #for i, (x, y, w) in enumerate(data_iterator): for i, (x, y) in enumerate(data_iterator): if i >= num_eval_steps and mode!="eval_full": break #loss = sess.run(model.loss, {model.x: x, model.y: y, model.w: w}) loss = sess.run(model.loss, {model.x: x, model.y: y}) loss_nom += loss loss_den += 1 # ??? #loss_den += w.mean() loss = loss_nom / loss_den sys.stdout.write("%d: %.3f (%.3f) ... " % (i, loss, np.exp(loss))) sys.stdout.flush() sys.stdout.write("\n") log_perplexity = loss_nom / loss_den print("Results at %d: log_perplexity = %.3f perplexity = %.3f" % ( global_step, log_perplexity, np.exp(log_perplexity))) summary = tf.Summary() summary.value.add(tag='eval/log_perplexity', simple_value=log_perplexity) summary.value.add(tag='eval/perplexity', simple_value=np.exp(log_perplexity)) sw.add_summary(summary, global_step) sw.flush() if mode == "eval_full": break #we don't need to wait for other checkpoints in this mode
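# The two run_eval variants above differ in how the perplexity denominator is
# accumulated: the first adds w.mean() per batch (presumably the fraction of
# real, non-padding target tokens), the second adds 1 per batch. A minimal
# NumPy-only sketch of the weighted bookkeeping, with hypothetical
# batch_losses / batch_weights inputs:
import numpy as np

def corpus_log_perplexity(batch_losses, batch_weights):
    # batch_losses: per-batch mean losses; batch_weights: per-batch 0/1
    # weight arrays marking real target tokens.
    loss_nom = sum(batch_losses)
    loss_den = sum(w.mean() for w in batch_weights)
    log_ppl = loss_nom / loss_den
    return log_ppl, np.exp(log_ppl)  # (log-perplexity, perplexity)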
def main(_):
    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps.num_gpus = FLAGS.num_gpus

    vocab = Vocabulary.from_file("small_voca.txt")  # ("1b_word_vocab.txt")

    if FLAGS.mode == "train":
        hps.batch_size = 256
        dataset = Dataset(
            vocab,
            FLAGS.datadir + "/training-monolingual.tokenized.shuffled/*")
        run_train(dataset, hps, FLAGS.logdir + "/train", ps_device="/gpu:0")
    elif FLAGS.mode.startswith("eval_"):
        hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
        hps.num_gpus = FLAGS.num_gpus
        if FLAGS.mode.startswith("eval_train"):
            data_dir = FLAGS.datadir + "/training-monolingual.tokenized.shuffled/*"
        else:
            data_dir = (FLAGS.datadir +
                        "/heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050")
        dataset = Dataset(vocab, data_dir, deterministic=True)
        run_eval(dataset, hps, FLAGS.logdir, FLAGS.mode, FLAGS.eval_steps)
def main(_): """ Start either train or eval. Note hardcoded parts of path for training and eval data """ hps = LM.get_default_hparams().parse(FLAGS.hpconfig) hps._set("num_gpus", FLAGS.num_gpus) print('*****HYPER PARAMETERS*****') print(hps) print('**************************') print_debug('our training DataSetDir=%s , LogDir=%s' % (FLAGS.datadir, FLAGS.logdir)) #vocab = Vocabulary.from_file(os.path.join(FLAGS.datadir, "1b_word_vocab.txt")) vocab = Vocabulary.from_file(os.path.join(FLAGS.datadir, "vocabulary.txt")) FLAGS.mode = "train" for i in range(10): print("Iteration ", i, " phase: ", FLAGS.mode) if FLAGS.mode == "train": #hps.batch_size = 256 # dataset = Dataset(vocab, os.path.join(FLAGS.datadir, # "training-monolingual.tokenized.shuffled/*")) dataset = Dataset(vocab, os.path.join(FLAGS.datadir, "ptb.train.txt")) trainlogdir = ( FLAGS.logdir + str("/") + "train" ) #(FLAGS.logdir+str("\\")+"train")#os.path.join(FLAGS.logdir, "train") print_debug('train log dir=%s' % (trainlogdir)) run_train(dataset, hps, trainlogdir, ps_device="/gpu:0") print_debug('Finished run_train !!!!!!!!!!!') elif FLAGS.mode.startswith("eval"): print_debug('eval mode') # if FLAGS.mode.startswith("eval_train"): # data_dir = os.path.join(FLAGS.datadir, "training-monolingual.tokenized.shuffled/*") # elif FLAGS.mode.startswith("eval_full"): # data_dir = os.path.join(FLAGS.datadir, "heldout-monolingual.tokenized.shuffled/*") # else: # data_dir = os.path.join(FLAGS.datadir, "heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050") dataset = Dataset(vocab, os.path.join(FLAGS.datadir, "ptb.test.txt"), deterministic=True) run_eval(dataset, hps, FLAGS.logdir, FLAGS.mode, FLAGS.eval_steps) print_debug('Finished run_eval !!!!!!!!!!!') if FLAGS.mode == "train": FLAGS.mode = "eval_full" else: FLAGS.mode = "train"
def main(_): """ Start either train or eval. Note hardcoded parts of path for training and eval data """ hps = LM.get_default_hparams().parse(FLAGS.hpconfig) hps._set("num_gpus", FLAGS.num_gpus) print('*****HYPER PARAMETERS*****') print(hps) print('**************************') vocab = Vocabulary.from_file( os.path.join(FLAGS.datadir, "1b_word_vocab.txt")) if FLAGS.mode == "train": #hps.batch_size = 256 dataset = Dataset( vocab, os.path.join(FLAGS.datadir, "training-monolingual.tokenized.shuffled/*")) run_train(dataset, hps, os.path.join(FLAGS.logdir, "train"), ps_device="/gpu:0") elif FLAGS.mode.startswith("eval_"): if FLAGS.mode.startswith("eval_train"): data_dir = os.path.join( FLAGS.datadir, "training-monolingual.tokenized.shuffled/*") elif FLAGS.mode.startswith("eval_full"): data_dir = os.path.join( FLAGS.datadir, "heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050" ) else: data_dir = os.path.join( FLAGS.datadir, "heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050" ) dataset = Dataset(vocab, data_dir, deterministic=True) run_eval(dataset, hps, FLAGS.logdir, FLAGS.mode, FLAGS.eval_steps) elif FLAGS.mode.startswith("infer"): data_dir = os.path.join( FLAGS.datadir, "heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050" ) dataset = Dataset(vocab, data_dir, deterministic=True) run_infer(dataset, hps, FLAGS.logdir, FLAGS.mode, vocab)
def run():
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    hps = LM.get_default_hparams().parse(
        'num_steps=20,num_shards=8,num_layers=2,emb_size=12,projected_size=12,'
        'state_size=80,num_sampled=0,batch_size=1,vocab_size=102')
    hps._set("num_gpus", 1)
    #arg('model')
    #arg('vocab')
    arg('--port', type=int, default=8000)
    arg('--host', default='localhost')
    arg('--debug', action='store_true')
    args = parser.parse_args()
    global model
    #model = Model(args.model, args.vocab, hps)
    model = Model('/Users/ruiyangwang/Desktop/f-lm/logs/test/train/model.ckpt-0',
                  '/Users/ruiyangwang/Desktop/examples/word_language_model/data/penn/vocabulary.txt',
                  hps)
    app.run(port=args.port, host=args.host, debug=args.debug)
def test(config, dataset, model_dir, summary_dir):
    logger = logging.getLogger('lm_zh')
    config.keep_prob = 1.0
    config.num_sampled = 0

    logger.info('Build graph ...')
    initializer = tf.random_uniform_initializer(-config.init_scale,
                                                config.init_scale)
    with tf.variable_scope('model', initializer=initializer):
        model = LM(dataset, config, model_dir, summary_dir)

    logger.info('Restore model ...')
    model.restore()

    logger.info('Start test model ...')
    model.test()
    logger.info('Test done')
def run():
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    hps = LM.get_default_hparams().parse(
        'num_steps=20,num_shards=6,num_layers=2,learning_rate=0.2,'
        'max_grad_norm=1,keep_prob=0.9,emb_size=1024,projected_size=1024,'
        'state_size=8192,num_sampled=8192,batch_size=512,vocab_size=11859,'
        'num_of_groups=4')
    hps._set("num_gpus", 1)
    #arg('model')
    #arg('vocab')
    arg('--port', type=int, default=8000)
    arg('--host', default='localhost')
    arg('--debug', action='store_true')
    args = parser.parse_args()
    global model
    #model = Model(args.model, args.vocab, hps)
    model = Model('/Users/ruiyangwang/Desktop/model/model.ckpt-44260',
                  '/Users/ruiyangwang/Desktop/vocabulary2.txt',
                  hps)
    app.run(port=args.port, host=args.host, debug=args.debug)
def __init__(self, hps, logdir, datadir, mode='eval'):
    with tf.variable_scope("model"):
        hps.num_sampled = 0
        hps.keep_prob = 1.0
        self.model = LM(hps, "eval", "/gpu:0")

    if hps.average_params:
        print("Averaging parameters for evaluation.")
        saver = tf.train.Saver(self.model.avg_dict)
    else:
        saver = tf.train.Saver()

    config = tf.ConfigProto(allow_soft_placement=True)
    self.sess = tf.Session(config=config)
    sw = tf.summary.FileWriter(logdir + "/" + mode, self.sess.graph)
    self.hps = hps
    self.num_steps = self.hps.num_steps
    vocab_path = os.path.join(datadir, "vocabulary.txt")

    with self.sess.as_default():
        success = common.load_from_checkpoint(saver, logdir + "/train")
        if not success:
            raise Exception('Loading Checkpoint failed')

    self.vocabulary = Vocabulary.from_file(vocab_path)
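# Hypothetical usage of the evaluator above. The enclosing class name is not
# shown in this snippet; `Evaluator` and the FLAGS values are assumptions.
hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
evaluator = Evaluator(hps, logdir=FLAGS.logdir, datadir=FLAGS.datadir)
# evaluator.model, evaluator.sess and evaluator.vocabulary are now ready for
# inference against the restored "train" checkpoint.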
def main(_):
    hvd.init()

    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps.num_gpus = FLAGS.num_gpus

    vocab = Vocabulary.from_file(FLAGS.vocab)
    hps.vocab_size = vocab.num_tokens

    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank())

    if FLAGS.logdir is None:
        FLAGS.logdir = os.path.join('/tmp',
                                    'lm-run-{}'.format(int(time.time())))
    print('logdir: {}'.format(FLAGS.logdir))
    hps.batch_size = 256
    dataset = Dataset(vocab, FLAGS.datadir)
    run_train(dataset, hps, FLAGS.logdir + '/train',
              ps_device='/gpu:' + str(hvd.local_rank()))
def main(_):
    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps.num_gpus = FLAGS.num_gpus

    vocab = Vocabulary.from_file(FLAGS.datadir + "/lm_vocab.txt",
                                 hps.vocab_size)

    if FLAGS.mode == "train":
        hps.batch_size = 256  # Reset batch size.
        dataset = Dataset(vocab, FLAGS.datadir + "/train/*")
        run_train(dataset, hps, FLAGS.logdir + "/train", ps_device="/gpu:0")
    elif FLAGS.mode.startswith("eval_"):
        if FLAGS.mode.startswith("eval_train"):
            data_dir = FLAGS.datadir + "/train/*"
        elif FLAGS.mode.startswith("eval_test"):
            data_dir = FLAGS.datadir + "/heldout/*"
        print("data_dir:", data_dir)
        dataset = Dataset(vocab, data_dir, deterministic=True)
        run_eval(dataset, hps, FLAGS.logdir, FLAGS.mode, FLAGS.eval_steps)
    elif FLAGS.mode.startswith("predict_next"):
        data_dir = "data/news.en.heldout-00001-of-00050"
        dataset = Dataset(vocab, data_dir)
        predict_next(dataset, hps, FLAGS.logdir, FLAGS.mode,
                     FLAGS.eval_steps, vocab)
def topkwords(prefix_words, dataset, hps, logdir, mode, top=10):
    inputs, targets = process_sentence(prefix_words, dataset._vocab, hps)
    with tf.variable_scope("model"):
        hps.num_sampled = 0  # Always using full softmax at evaluation.
        hps.keep_prob = 1.0
        #model = LM(hps, "eval", "/cpu:0")
        model = LM(hps, "eval", "/gpu:0")

    if hps.average_params:
        print("Averaging parameters for evaluation.")
        saver = tf.train.Saver(model.avg_dict)
    else:
        saver = tf.train.Saver()

    config = tf.ConfigProto(allow_soft_placement=True)
    sess = tf.Session(config=config)
    sw = tf.summary.FileWriter(logdir + "/" + mode, sess.graph)
    ckpt_loader = CheckpointLoader(saver, model.global_step, logdir + "/train")

    with sess.as_default():
        while ckpt_loader.load_checkpoint():
            tf.local_variables_initializer().run()
            ppl = sess.run(model.loss, {model.x: inputs, model.y: targets})
def test_lm(self):
    hps = get_test_hparams()
    with tf.variable_scope("model"):
        model = LM(hps)

    with self.test_session() as sess:
        tf.initialize_all_variables().run()
        tf.initialize_local_variables().run()

        loss = 1e5
        for i in range(50):
            x, y, w = simple_data_generator(hps.batch_size, hps.num_steps)
            loss, _ = sess.run([model.loss, model.train_op],
                               {model.x: x, model.y: y, model.w: w})
            print("%d: %.3f %.3f" % (i, loss, np.exp(loss)))
            if np.isnan(loss):
                print("NaN detected")
                break

        self.assertLess(loss, 1.0)
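# `simple_data_generator` is not shown in this snippet. A plausible stand-in
# that makes the test learnable (targets are a deterministic function of the
# inputs, so 50 training steps can plausibly drive the loss below 1.0); the
# vocabulary size is an assumption:
import numpy as np

def simple_data_generator(batch_size, num_steps, vocab_size=100):
    # Each row is a ramp of token ids; the target is the input shifted by one.
    offsets = np.random.randint(0, vocab_size, size=(batch_size, 1))
    x = (offsets + np.arange(num_steps)) % vocab_size
    y = (x + 1) % vocab_size
    w = np.ones((batch_size, num_steps), dtype=np.float32)  # every token counts
    return x, y, w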
def run_train(dataset, hps, logdir, ps_device, task=0, master=""):
    t0 = time.time()
    # Named loss_log rather than `f`: the profiler block below opens its own file.
    loss_log = open('loss-log.txt', 'w')
    with tf.variable_scope("model"):
        model = LM(hps, "train", ps_device)
    stime = time.time()
    print("Current time: %s" % stime)

    print("ALL VARIABLES")
    for v in tf.all_variables():
        print("%s %s %s %s" % (v.name, v.get_shape(), v.dtype, v.device))
    print("TRAINABLE VARIABLES")
    for v in tf.trainable_variables():
        print("%s %s %s %s" % (v.name, v.get_shape(), v.dtype, v.device))
    print("LOCAL VARIABLES")
    for v in tf.local_variables():
        print("%s %s %s %s" % (v.name, v.get_shape(), v.dtype, v.device))

    sv = tf.train.Supervisor(
        is_chief=(task == 0),
        logdir=logdir,
        summary_op=None,  # Automatic summaries don't work with placeholders.
        global_step=model.global_step,
        save_summaries_secs=60 * hps.save_summary_every_min,
        save_model_secs=60 * hps.save_model_every_min)
        #save_summaries_secs=30,
        #save_model_secs=120 * 5)

    #config = tf.ConfigProto(allow_soft_placement=True,
    #                        intra_op_parallelism_threads=2,
    #                        inter_op_parallelism_threads=20)
    config = tf.ConfigProto(allow_soft_placement=True)

    with sv.managed_session(master, config=config) as sess:
        # Slowly increase the number of workers during beginning of the training.
        #while not sv.should_stop() and (time.time() - stime) < hps.max_time:
        #    step = int(sess.run(model.global_step))
        #    waiting_until_step = task * hps.num_delayed_steps
        #    if step >= waiting_until_step:
        #        break
        #    else:
        #        print("Current step is %d. Waiting until: %d" % (step, waiting_until_step))
        #        time.sleep(20.0)

        local_step = 0
        prev_global_step = sess.run(model.global_step)
        cur_global_step = 0
        prev_time = time.time()
        data_iterator = dataset.iterate_forever(hps.batch_size * hps.num_gpus,
                                                hps.num_steps)
        while not sv.should_stop() and (time.time() - stime) < hps.max_time:
            fetches = [model.global_step, model.loss, model.train_op]

            # Chief worker computes summaries every 100 steps.
            should_compute_summary = (task == 0 and local_step % 100 == 0)
            if should_compute_summary:
                fetches += [model.summary_op]

            #x, y, w = next(data_iterator)
            x, y = next(data_iterator)

            should_run_profiler = (hps.run_profiler and task == 0 and
                                   local_step % 1000 == 13)
            if should_run_profiler:
                run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
                #fetched = sess.run(fetches, {model.x: x, model.y: y, model.w: w},
                fetched = sess.run(fetches, {model.x: x, model.y: y},
                                   options=run_options,
                                   run_metadata=run_metadata)
                # Create the Timeline object and write it to a json file.
                tl = timeline.Timeline(run_metadata.step_stats)
                ctf = tl.generate_chrome_trace_format()
                print("Running profiler")
                with open(logdir + "/timeline.json", 'w') as trace_file:
                    trace_file.write(ctf)
                print("Finished profiling!")
            else:
                #fetched = sess.run(fetches, {model.x: x, model.y: y, model.w: w})
                fetched = sess.run(fetches, {model.x: x, model.y: y})

            cur_global_step = fetched[0]
            local_step += 1
            if should_compute_summary:
                sv.summary_computed(sess, fetched[-1])

            if local_step < 10 or local_step % 20 == 0:
                cur_time = time.time()
                num_words = hps.batch_size * hps.num_gpus * hps.num_steps
                wps = ((cur_global_step - prev_global_step) * num_words /
                       (cur_time - prev_time))
                prev_global_step = cur_global_step
                print("Iteration %d, time = %.2fs, wps = %.0f, train loss = %.4f" % (
                    cur_global_step, cur_time - prev_time, wps, fetched[1]))
                loss_log.write("%s,%s,%s,%s,%s\r\n" % (
                    cur_global_step, cur_time - t0, cur_time - prev_time,
                    wps, fetched[1]))
                loss_log.flush()
                prev_time = cur_time

        # Save the last model.
        sv._saver.save(sess, sv.save_path, cur_global_step)
    sv.stop()
    loss_log.close()
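# The throughput bookkeeping above reduces to: global steps advanced since
# the last report, times tokens consumed per global step, over elapsed wall
# time. A minimal sketch of that formula:
def words_per_second(cur_step, prev_step, batch_size, num_gpus, num_steps,
                     elapsed_secs):
    num_words = batch_size * num_gpus * num_steps  # tokens per global step
    return (cur_step - prev_step) * num_words / elapsed_secs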
def run_train(dataset, hps, logdir, ps_device, task=0, master=''):
    # Pin this Horovod worker to its local GPU and allow soft placement.
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=2,
                            inter_op_parallelism_threads=20)
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    with tf.variable_scope('model'):
        model = LM(hps, 'train', ps_device)

    print('ALL VARIABLES')
    for v in tf.all_variables():
        print('%s %s %s' % (v.name, v.get_shape(), v.device))
    print('TRAINABLE VARIABLES')
    for v in tf.trainable_variables():
        print('%s %s %s' % (v.name, v.get_shape(), v.device))
    print('LOCAL VARIABLES')
    for v in tf.local_variables():
        print('%s %s %s' % (v.name, v.get_shape(), v.device))

    #sv = tf.train.Supervisor(
    #    is_chief=(task == 0),
    #    logdir=logdir,
    #    summary_op=None,  # Automatic summaries don't work with placeholders.
    #    global_step=model.global_step,
    #    save_summaries_secs=30,
    #    save_model_secs=120 * 5)

    hooks = [hvd.BroadcastGlobalVariablesHook(0)]

    total_step = 0
    with tf.train.MonitoredTrainingSession(config=config, hooks=hooks) as sess:
        for v in tf.get_collection('initial_state'):
            sess.run(v.initializer,
                     feed_dict={model.batch_size: hps.batch_size})

        # Slowly increase the number of workers during beginning of the training.
        while not sess.should_stop():
            step = int(sess.run(model.global_step))
            waiting_until_step = task * hps.num_delayed_steps
            if step >= waiting_until_step:
                break
            else:
                print('Current step is %d. Waiting until: %d' %
                      (step, waiting_until_step))
                time.sleep(10.0)

        local_step = 0
        prev_global_step = sess.run(model.global_step)
        prev_time = time.time()
        data_iterator = dataset.iterate_forever(
            hps.batch_size * hps.num_gpus, hps.num_steps)
        while not sess.should_stop():
            fetches = [model.global_step, model.loss, model.train_op]

            # Chief worker computes summaries every 20 steps.
            should_compute_summary = (
                hvd.rank() == 0 and local_step > 0 and local_step % 20 == 0)
            if should_compute_summary:
                fetches += [model.summary_op]

            x, y, w = next(data_iterator)
            fetched = sess.run(fetches, {model.x: x, model.y: y, model.w: w})

            local_step += 1
            #if should_compute_summary:
            #    sess.summary_computed(sess, fetched[-1])

            if hvd.rank() == 0:
                if local_step < 10 or local_step % 200 == 0:
                    cur_time = time.time()
                    num_words = hps.batch_size * hps.num_gpus * hps.num_steps
                    sps = (hps.batch_size * hps.num_gpus *
                           (fetched[0] - prev_global_step) /
                           (cur_time - prev_time))
                    wps = ((fetched[0] - prev_global_step) * num_words /
                           (cur_time - prev_time))
                    prev_global_step = fetched[0]
                    print('Iteration %d, time = %.2fs, wps = %.0f, sps = %.0f '
                          'train loss = %.4f' % (
                              fetched[0], cur_time - prev_time,
                              wps * hvd.size(), sps * hvd.size(), fetched[1]))
                    prev_time = cur_time
import json
import time

import numpy as np
import tensorflow as tf

from data_utils import Vocabulary, Dataset
from language_model import LM
from common import CheckpointLoader

BATCH_SIZE = 1
NUM_TIMESTEPS = 1
MAX_WORD_LEN = 50

UPLOAD_FOLDER = '/data/ngramTest/uploads'
UPLOAD_FOLDER = './'  # Overrides the path above for local runs.

hps = LM.get_default_hparams()
vocab = Vocabulary.from_file("1b_word_vocab.txt")

with tf.variable_scope("model"):
    hps.num_sampled = 0  # Always use full softmax at evaluation; sampling runs out of memory.
    hps.keep_prob = 1.0
    hps.num_gpus = 1
    model = LM(hps, "predict_next", "/cpu:0")

if hps.average_params:
    print("Averaging parameters for evaluation.")
    saver = tf.train.Saver(model.avg_dict)
else:
    saver = tf.train.Saver()

# Use only 4 threads for the evaluation. The snippet was cut off mid-call;
# the thread settings below are assumed from the identical run_eval above.
config = tf.ConfigProto(allow_soft_placement=True,
                        intra_op_parallelism_threads=4,
                        inter_op_parallelism_threads=1)
    config = p.parse_args()

    return config


if __name__ == '__main__':
    config = define_argparser()

    loader = DataLoader(config.train,
                        config.valid,
                        batch_size=config.batch_size,
                        device=config.gpu_id,
                        max_length=config.max_length)
    model = LM(len(loader.text.vocab),
               word_vec_dim=config.word_vec_dim,
               hidden_size=config.hidden_size,
               n_layers=config.n_layers,
               dropout_p=config.dropout,
               max_length=config.max_length)

    # Give PAD a zero loss weight so the criterion cannot count PAD as a
    # correct prediction; PAD is trivially easy to predict.
    loss_weight = torch.ones(len(loader.text.vocab))
    loss_weight[data_loader.PAD] = 0
    criterion = nn.NLLLoss(weight=loss_weight, size_average=False)

    print(model)
    print(criterion)

    if config.gpu_id >= 0:
        model.cuda(config.gpu_id)
        criterion.cuda(config.gpu_id)
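# A minimal, self-contained sketch of the PAD masking shown above: targets at
# the PAD index contribute nothing to the summed loss. The vocabulary size
# and PAD index here are hypothetical.
import torch
import torch.nn as nn

vocab_size, pad_idx = 10, 1
weight = torch.ones(vocab_size)
weight[pad_idx] = 0
criterion = nn.NLLLoss(weight=weight, reduction='sum')

log_probs = torch.log_softmax(torch.randn(4, vocab_size), dim=-1)
targets = torch.tensor([3, pad_idx, 7, pad_idx])
loss = criterion(log_probs, targets)  # only the two non-PAD targets count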
def predict_next(dataset, hps, logdir, mode, num_eval_steps, vocab):
    with tf.variable_scope("model"):
        hps.num_sampled = 0  # Always use full softmax at evaluation; sampling runs out of memory.
        hps.keep_prob = 1.0
        model = LM(hps, "predict_next", "/cpu:0")

    if hps.average_params:
        print("Averaging parameters for evaluation.")
        saver = tf.train.Saver(model.avg_dict)
    else:
        saver = tf.train.Saver()

    # Limit the number of threads for the evaluation.
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=20,
                            inter_op_parallelism_threads=1)
    sess = tf.Session(config=config)
    sw = tf.summary.FileWriter(logdir + "/" + mode, sess.graph)
    ckpt_loader = CheckpointLoader(saver, model.global_step, logdir + "/train")

    with sess.as_default():
        ckpt_loader.load_checkpoint()  # Load only one checkpoint.
        global_step = ckpt_loader.last_global_step
        data_iterator = dataset.iterate_once(hps.batch_size * hps.num_gpus,
                                             hps.num_steps)
        sess.run(tf.local_variables_initializer())
        print("global_step:", global_step)

        loss_nom = 0.0
        loss_den = 0.0
        cur_time = time.time()
        savedKey = 0
        totalKey = 0
        '''
        text = open("data/news.en.heldout-00001-of-00050", "r")
        for kk, line in enumerate(text):
            totalKey += len(line.strip())
            if kk == 0:
                print(len(line))
        print("totalKey:", totalKey)
        '''
        predicted_words = []
        for i, (x, y, w) in enumerate(data_iterator):
            #if i >= num_eval_steps:
            #    break
            '''
            print("i", i)
            print("x", x)
            for j in x[:]:
                print(j)
                for jj in j:
                    print(vocab.get_token(jj))
            '''
            #print("x:", [vocab.get_token(ix) for ix in x[0]])
            #print("y:", [vocab.get_token(ix) for ix in y[0]])
            inputs = [vocab.get_token(ix) for ix in x[0]]
            labels = [vocab.get_token(ix) for ix in y[0]]
            loss, logits, indexes = sess.run(
                [model.loss, model.logits, model.index],
                {model.x: x, model.y: y, model.w: w})
            #print(logits.shape, indexes)
            #print(indexes[0])
            tmpKS = 0
            tmpAllKey = 0
            for step in range(hps.num_steps):
                words = []
                totalKey += len(inputs[step])
                tmpAllKey += len(inputs[step])
                if step > 0:
                    totalKey += 1  # For the space between two keys.
                    tmpAllKey += 1
                for j in range(hps.arg_max):
                    word = vocab.get_token(indexes[0][step][j])
                    words += [word]
                    if word == labels[step]:
                        predicted_words += [word]
                        tmpKS += len(labels[step])
                        savedKey += len(labels[step])
                #print("predict: ", words)
            print("i:%6d, savedKey:%d , totalKey:%d, ksr : %.3f " % (
                i, tmpKS, tmpAllKey, tmpKS * 1.0 / tmpAllKey))

        print("savedKey:%d , totalKey:%d, ksr : %.3f " % (
            savedKey, totalKey, savedKey * 1.0 / totalKey))
        print("predicted_words:")
        print(predicted_words)
        now = time.time()
        print("time:", now - cur_time)
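# The loop above measures keystroke savings rate (KSR): the fraction of
# characters a user would not have to type because one of the top-k predicted
# words matched the true next word. A minimal sketch of the same bookkeeping,
# with hypothetical list inputs:
def keystroke_savings_rate(inputs, labels, topk_predictions):
    saved, total = 0, 0
    for step, token in enumerate(inputs):
        total += len(token)
        if step > 0:
            total += 1  # space between words
        if labels[step] in topk_predictions[step]:
            saved += len(labels[step])
    return saved * 1.0 / total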
class HomoNoiserScript:
    def __init__(self, generator, **kwargs):
        self.verbose = kwargs.get("verbose", False)

        # Error params
        self.max_M = kwargs.get("max_m", 2)
        self.max_N = kwargs.get("max_n", 2)
        self.sampling_m = kwargs.get("sampling_m", "weighted")
        self.sampling_n = kwargs.get("sampling_n", "weighted")
        self.error_rate = kwargs.get("error_rate", 0.3)
        self.error_model = kwargs.get("error_model", 'phoneme')
        self.cnt_error_samples = kwargs.get("error_samples", 5)
        self.sampling_error_samples = kwargs.get("sampling_error_samples",
                                                 "weighted")

        # Sentence error parameters
        self.min_wer = kwargs.get("min_wer", 0.1)
        self.max_wer = kwargs.get("max_wer", 0.6)
        self.cnt_sentence_samples = kwargs.get("sentence_samples", 10)
        self.sampling_sentence_samples = kwargs.get(
            "sampling_sentence_samples", "weighted_lm")
        self.use_lm = kwargs.get("use_lm", True)
        self.lm_name = kwargs.get("bert_lm", None)

        # Phoneme model parameters
        self.g2p_model = kwargs.get("g2p_model", None)
        self.p2g_model = kwargs.get("p2g_model", None)
        self.lexicon = kwargs.get("lexicon", None)

        # Dictionary model parameters
        self.dictionary_filename_list = kwargs.get("dictionary_filename_list",
                                                   None)
        self.jaro_winkler_threshold = kwargs.get("jaro_winkler_threshold", 0.8)

        # Embedding model parameters
        # ---

        # Target parameters
        self.base_target_dir = kwargs.get("base_target_dir", None)

        # Set logger
        self.logger = logger.BasicLogger.setupLogger(verbose=self.verbose)

        # Check everything is OK
        self.checkConfig()

        # Set generator
        self.generator = generator
        noise_generator = self.get_noise_generator()
        self.sentence_graph = SamplingGraph(
            noise_generator=noise_generator,
            error_prob=self.error_rate,
            max_M=self.max_M,
            sampling_M=self.sampling_m,
            sampling_N=self.sampling_n,
            sampling_error_samples=self.sampling_error_samples)

        # Check if target directory is empty
        self.input_filename_list = kwargs.get("input_filename_list", None)
        self.input_source_dir = kwargs.get("input_source_dir", None)
        self.check_target_directory(self.base_target_dir,
                                    self.input_filename_list)

        # Set LM
        if self.use_lm:
            self.logger.info('loading {} model'.format(self.lm_name))
            self.bert_lm = LM(self.lm_name)
        else:
            self.bert_lm = None
            self.logger.info("Language model not used")

    def delete_files(self, list_of_filenames):
        import shutil
        for file_path in list_of_filenames:  # os.listdir(folder):
            # file_path = os.path.join(folder, filename)
            try:
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                print('Failed to delete %s. Reason: %s' % (file_path, e))

    def check_target_directory(self, base_target_dir, file_list):
        """
        https://stackoverflow.com/questions/185936/how-to-delete-the-contents-of-a-folder
        """
        with open(file_list) as f:
            files_list = [line.rstrip() for line in f]
        list_of_filenames = [
            os.path.join(base_target_dir, f) for f in files_list
        ]

        existing_files = []
        for file_path in list_of_filenames:
            if os.path.isfile(file_path):
                existing_files.append(file_path)

        if len(existing_files) > 0:
            self.logger.error(
                "!!!!! Target directory already contains target files !!!!! "
                "----- Should the files be deleted ????? [Y/N]")
            time.sleep(0.5)
            print("delete files? [Y/N]: ", end="")
            while True:
                input1 = input()
                input1 = input1.lower()
                if input1 == "y":
                    self.logger.warning("Deleting files ...")
                    self.delete_files(existing_files)
                    break
                if input1 == "n":
                    self.logger.error(
                        "Move files somewhere else ... exiting now!")
                    exit(1)
                print("delete files? [Y/N]: ", end="")
                #self.logger.error("????? Should the files be deleted ????? [Y/N]")
    def get_noise_generator(self):
        if self.error_model == "phoneme":
            return NoiseFromP2G(g2p_model_path=self.g2p_model,
                                p2g_model_path=self.p2g_model,
                                pronounc_dict_path=self.lexicon,
                                cnt_error_samples=self.cnt_error_samples,
                                max_N=self.max_N)
        elif self.error_model == "dictionary":
            return NoiseFromDict(db_file_list=self.dictionary_filename_list,
                                 threshold=self.jaro_winkler_threshold,
                                 cnt_error_samples=self.cnt_error_samples,
                                 max_N=self.max_N)
        else:
            self.logger.error("Error model not implemented: {}".format(
                self.error_model))
            raise NotImplementedError

    def which(self, program):
        """Basic 'which' implementation for python.

        Basic 'which' implementation for python from stackoverflow:
        * https://stackoverflow.com/a/377028/6739158
        """
        def is_exe(fpath):
            return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

        fpath, fname = os.path.split(program)
        if fpath:
            if is_exe(program):
                return program
        else:
            for path in os.environ["PATH"].split(os.pathsep):
                path = path.strip('"')
                exe_file = os.path.join(path, program)
                if is_exe(exe_file):
                    return exe_file
        return None

    def validateLexicon(self):
        validator_pattern = u"[\\}\\|_]"  # python2: unicode, python3: str
        validator = re.compile(validator_pattern)
        with open(self.lexicon, "r") as ifp:
            for line in ifp:
                if validator.search(line):
                    error = "Bad line contains reserved character:\n\t{0}"
                    error = error.format(line)
                    raise ValueError(error)
        return

    def checkConfig(self):
        self.logger.info("Checking command configuration...")
        for program in [
                "phonetisaurus-g2pfst", "phonetisaurus-align",
                "phonetisaurus-arpa2wfst"
        ]:
            if not self.which(program):
                raise EnvironmentError(", ".join(
                    ["Phonetisaurus command, '{0}'",
                     "not found in path."]).format(program))

        # Create target_meta directory if it does not exist
        if not os.path.isdir(self.base_target_dir):
            self.logger.debug("Directory does not exist. Trying to create.")
            os.makedirs(self.base_target_dir)

        if self.error_model == 'phoneme':
            self.logger.info(
                "Checking lexicon for reserved characters: '}', '|', '_'...")
            self.validateLexicon()

        # Basic assertions
        if self.max_M < 1:
            self.logger.error("max_M must be >= 1, but {} given".format(
                self.max_M))
            raise ValueError
        if self.max_N < 1:
            self.logger.error("max_N must be >= 1, but {} given".format(
                self.max_N))
            raise ValueError
        if self.cnt_error_samples < 1:
            self.logger.error(
                "cnt_error_samples must be >= 1, but {} given".format(
                    self.cnt_error_samples))
            raise ValueError
        if not 0.0 <= self.error_rate <= 1.0:
            self.logger.error(
                "error_rate must be in [0,1], but: {} was given".format(
                    self.error_rate))
            raise ValueError
        if not 0.0 <= self.min_wer <= 1.0:
            self.logger.error(
                "min_wer must be in [0,1], but: {} was given".format(
                    self.min_wer))
            raise ValueError
        if not 0.0 <= self.max_wer <= 1.0:
            self.logger.error(
                "max_wer must be in [0,1], but: {} was given".format(
                    self.max_wer))
            raise ValueError
        if self.cnt_sentence_samples < 1:
            self.logger.error(
                "cnt_sentence_samples must be >= 1, but {} given".format(
                    self.cnt_sentence_samples))
            raise ValueError

        # Other basic assertions
        if self.sampling_m not in ['uniform', 'weighted']:
            self.logger.error("sampling_m options are {}".format(
                ['uniform', 'weighted']))
            raise ValueError
        if self.sampling_n not in ['uniform', 'weighted']:
            self.logger.error("sampling_n options are {}".format(
                ['uniform', 'weighted']))
            raise ValueError
        if self.error_model not in ['phoneme', 'dictionary', 'embedding']:
            self.logger.error("error_model options are {}".format(
                ['phoneme', 'dictionary', 'embedding']))
            raise ValueError
        if self.sampling_error_samples not in ['weighted', 'uniform']:
            self.logger.error("sampling_error_samples options are {}".format(
                ['weighted', 'uniform']))
            raise ValueError
        if self.sampling_sentence_samples not in [
                'uniform', 'weighted_lm', 'max_lm'
        ]:
            self.logger.error(
                "sampling_sentence_samples options are {}".format(
                    ['uniform', 'weighted_lm', 'max_lm']))
            raise ValueError

        if self.error_model == 'phoneme':
            if not os.path.isfile(self.p2g_model):
                self.logger.error("p2g_model not found: {}".format(
                    self.p2g_model))
                raise FileNotFoundError
            if not os.path.isfile(self.g2p_model):
                self.logger.error("g2p_model not found: {}".format(
                    self.g2p_model))
                raise FileNotFoundError
            if not os.path.isfile(self.lexicon):
                self.logger.error("lexicon not found: {}".format(self.lexicon))
                raise FileNotFoundError
        if self.error_model == 'dictionary':
            if not os.path.isfile(self.dictionary_filename_list):
                self.logger.error(
                    "dictionary_filename_list not found: {}".format(
                        self.dictionary_filename_list))
                raise FileNotFoundError
        if self.error_model == 'embedding':
            self.logger.error("error model: {} not yet implemented".format(
                self.error_model))
            raise NotImplementedError

        items = vars(self).items()
        for key, val in sorted(items):
            self.logger.debug(u"{0}: {1}".format(key, val))
        return

    def run(self):
        # For all sentences in the dataset: self.generator yields
        # (source_doc_path, sentence_id, sentence) tuples.
        for s in self.generator:
            source_doc_path, sentence_id, sentence = s

            # Add the sentence to the graph.
            self.sentence_graph.set_sentence(sentence)

            # Score of the original sentence (only for debug purposes ...)
            if self.verbose:
                if self.bert_lm is not None:
                    score = self.bert_lm.get_score(sentence)
                    self.logger.debug("LM[{:.2f}]{}".format(score, sentence))
                else:
                    self.logger.debug("LM[ - ]{}".format(sentence))

            # Now we generate multiple (cnt_sentence_samples) "noisified"
            # variants of the current sentence.
            avg_wer = 0
            samples_list = []

            # A. Create samples
            tries = -1
            while True:
                tries += 1
                # All samples collected (or we do not want to wait too long ...)
                if (len(samples_list) == self.cnt_sentence_samples or
                        tries * 2 > self.cnt_sentence_samples):
                    break
                debug, sample = self.sentence_graph.sample_sentence()
                if self.bert_lm is not None:
                    score = self.bert_lm.get_score(sample)
                else:
                    score = 1.
                error = wer(sentence, sample)
                if self.min_wer <= error <= self.max_wer:
                    avg_wer += error
                    samples_list.append((sample, score, error))

            # DEBUG: print all the sentence variants.
            if self.verbose:
                for sam in samples_list:
                    sample, score, error = sam
                    if self.bert_lm is not None:
                        self.logger.debug("    LM[{:.2f}] WER[{:.2f}]{}".format(
                            score, error, sample))
                    else:
                        self.logger.debug("    LM[ - ] WER[{:.2f}]{}".format(
                            error, sample))
                self.logger.debug("avg WER: {:.2f}".format(
                    avg_wer / self.cnt_sentence_samples))

            # B. Finally we choose one sentence.
            if len(samples_list) == 0:
                selected_sentence = sentence
            else:
                sentences = []
                lm_weights = []
                for s in samples_list:  # (sentence, LM, WER)
                    sentences.append(s[0])
                    lm_weights.append(s[1])
                selected_sentence = utils.choice(sentences, lm_weights)

            # And we write it to file.
            source_doc = ntpath.basename(source_doc_path)
            target_doc_path = os.path.join(self.base_target_dir, source_doc)
            if os.path.isfile(target_doc_path):
                newline = True
            else:
                newline = False
            with open(target_doc_path, "a") as file:
                if newline:
                    file.write("\n")
                file.write(selected_sentence)

        self.logger.info("All files successfully processed")
        self.logger.info("Calculating WER on files...")
        time.sleep(0.5)
        return wer_over_files(self.input_source_dir, self.base_target_dir,
                              self.input_filename_list)
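# `utils.choice` is not shown in this snippet. A plausible stand-in using only
# the standard library: pick one sample, weighted by its LM score.
import random

def choice(items, weights):
    return random.choices(items, weights=weights, k=1)[0]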
    print("You're not in the cluster spec! exiting!")
    exit(-1)
else:
    print("ROLE: %s" % role)
    print("INDEX: %s" % task_index)

cluster = tf.train.ClusterSpec(cluster_spec)
server = tf.train.Server(cluster, job_name=role, task_index=task_index)

if role == "ps":
    server.join()
else:
    ps_device = '/job:ps/task:0'
    """Start either train or eval. Note the hardcoded parts of the paths
    for the training and eval data."""
    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps._set("num_gpus", FLAGS.num_gpus)
    print('*****HYPER PARAMETERS*****')
    print(hps)
    print('**************************')

    vocab = Vocabulary.from_file(
        os.path.join(FLAGS.datadir, "1b_word_vocab.txt"))

    if FLAGS.mode == "train":
        #hps.batch_size = 256
        dataset = Dataset(
            vocab,
            os.path.join(FLAGS.datadir,
                         "training-monolingual.tokenized.shuffled/*"))
        # The snippet was cut off mid-call; the remaining arguments below are
        # assumed from the identical main() above.
        run_train(dataset, hps, os.path.join(FLAGS.logdir, "train"),
                  ps_device=ps_device)
import re
import urllib2

import numpy as np
import tensorflow as tf
import thriftpy
from thriftpy.rpc import make_server

from data_utils import Vocabulary, Dataset
from language_model import LM, inference_graph
from common import CheckpointLoader

interface_thrift = thriftpy.load("interface.thrift",
                                 module_name="interface_thrift")

#import pdb
#pdb.set_trace()

top_k = 3
pattern = re.compile('[\w+]')
p_punc = re.compile('(\.|\"|,|\?|\!)')

hps = LM.get_default_hparams()
vocab = Vocabulary.from_file("1b_word_vocab.txt")
st = hps.num_steps

with tf.variable_scope("model"):
    hps.vocab_size = 793470
    hps.num_sampled = 0  # Always use full softmax at evaluation; sampling runs out of memory.
    hps.keep_prob = 1.0
    hps.num_gpus = 1
    model = inference_graph(hps)

if hps.average_params:
    print("Averaging parameters for evaluation.")
    saver = tf.train.Saver(model.avg_dict)
else:
    saver = tf.train.Saver()