Example #1
def run_eval(dataset, hps, logdir, mode, num_eval_steps):
    with tf.variable_scope('model'):
        hps.num_sampled = 0  # Always using full softmax at evaluation.
        hps.keep_prob = 1.0
        model = LM(hps, 'eval', '/cpu:0')

    if hps.average_params:
        print('Averaging parameters for evaluation.')
        saver = tf.train.Saver(model.avg_dict)
    else:
        saver = tf.train.Saver()

    # Use only 4 threads for the evaluation.
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=4,
                            inter_op_parallelism_threads=1)
    sess = tf.Session(config=config)
    sw = tf.summary.FileWriter(logdir + '/' + mode, sess.graph)
    ckpt_loader = CheckpointLoader(saver, model.global_step, logdir + '/train')

    with sess.as_default():
        while ckpt_loader.load_checkpoint():
            global_step = ckpt_loader.last_global_step
            data_iterator = dataset.iterate_once(
                hps.batch_size * hps.num_gpus, hps.num_steps)
            tf.initialize_local_variables().run()
            for v in tf.get_collection('initial_state'):
                sess.run(v.initializer,
                         feed_dict={model.batch_size: hps.batch_size})
            loss_nom = 0.0
            loss_den = 0.0
            for i, (x, y, w) in enumerate(data_iterator):
                if i >= num_eval_steps:
                    break

                loss = sess.run(model.loss, {
                    model.x: x, model.y: y, model.w: w,
                    model.batch_size: hps.batch_size})
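                # loss is the mean per-token loss for the batch; weighting the
                # running sums by w.mean() discounts padded target positions.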
                loss_nom += loss
                loss_den += w.mean()
                loss = loss_nom / loss_den
                sys.stdout.write('%d: %.3f (%.3f) ... ' % (
                    i, loss, np.exp(loss)))
                sys.stdout.flush()
            sys.stdout.write('\n')

            log_perplexity = loss_nom / loss_den
            print('Results at %d: log_perplexity = %.3f perplexity = %.3f' % (
                global_step, log_perplexity, np.exp(log_perplexity)))

            summary = tf.Summary()
            summary.value.add(
                tag='eval/log_perplexity', simple_value=log_perplexity)
            summary.value.add(
                tag='eval/perplexity', simple_value=np.exp(log_perplexity))
            sw.add_summary(summary, global_step)
            sw.flush()
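
Note: CheckpointLoader, imported from common throughout these examples, polls the training directory and restores each newly written checkpoint before an evaluation pass. The following is a minimal sketch of that behavior, assuming TF1's tf.train.latest_checkpoint API; the class name, polling interval, and stop condition are illustrative, not the actual common.CheckpointLoader:

import time
import tensorflow as tf

class CheckpointLoaderSketch(object):
    """Illustrative stand-in for common.CheckpointLoader (hypothetical)."""

    def __init__(self, saver, global_step, train_dir, poll_secs=30):
        self._saver = saver
        self._global_step = global_step
        self._train_dir = train_dir
        self._poll_secs = poll_secs
        self._last_ckpt = None
        self.last_global_step = 0

    def load_checkpoint(self):
        # Wait for a checkpoint newer than the last one restored, then restore
        # it into the default session and record its global step.
        # (The real loader presumably returns False once training has ended.)
        while True:
            ckpt = tf.train.latest_checkpoint(self._train_dir)
            if ckpt is not None and ckpt != self._last_ckpt:
                sess = tf.get_default_session()
                self._saver.restore(sess, ckpt)
                self._last_ckpt = ckpt
                self.last_global_step = int(sess.run(self._global_step))
                return True
            time.sleep(self._poll_secs)
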
Example #2
def run_eval(dataset, hps, logdir, mode, num_eval_steps):
    with tf.variable_scope("model"):
        hps.num_sampled = 0  # Always using full softmax at evaluation.
        hps.keep_prob = 1.0
        #model = LM(hps, "eval", "/cpu:0")
        model = LM(hps, "eval", "/gpu:0")

    if hps.average_params:
        print("Averaging parameters for evaluation.")
        saver = tf.train.Saver(model.avg_dict)
    else:
        saver = tf.train.Saver()

    # Use only 4 threads for the evaluation.
    #config = tf.ConfigProto(allow_soft_placement=True,
    #                        intra_op_parallelism_threads=20,
    #                        inter_op_parallelism_threads=1)
    config = tf.ConfigProto(allow_soft_placement=True)
    sess = tf.Session(config=config)
    sw = tf.summary.FileWriter(logdir + "/" + mode, sess.graph)
    ckpt_loader = CheckpointLoader(saver, model.global_step, logdir + "/train")

    with sess.as_default():
        while ckpt_loader.load_checkpoint():
            global_step = ckpt_loader.last_global_step
            data_iterator = dataset.iterate_once(hps.batch_size * hps.num_gpus, hps.num_steps)
            #tf.initialize_local_variables().run()
            tf.local_variables_initializer().run()
            loss_nom = 0.0
            loss_den = 0.0
            #for i, (x, y, w) in enumerate(data_iterator):
            for i, (x, y) in enumerate(data_iterator):
            if i >= num_eval_steps and mode != "eval_full":
                    break

                #loss = sess.run(model.loss, {model.x: x, model.y: y, model.w: w})
                loss = sess.run(model.loss, {model.x: x, model.y: y})
                loss_nom += loss
                loss_den += 1 # ???
                #loss_den += w.mean()
                loss = loss_nom / loss_den
                sys.stdout.write("%d: %.3f (%.3f) ... " % (i, loss, np.exp(loss)))
                sys.stdout.flush()
            sys.stdout.write("\n")

            log_perplexity = loss_nom / loss_den
            print("Results at %d: log_perplexity = %.3f perplexity = %.3f" % (
                global_step, log_perplexity, np.exp(log_perplexity)))

            summary = tf.Summary()
            summary.value.add(tag='eval/log_perplexity', simple_value=log_perplexity)
            summary.value.add(tag='eval/perplexity', simple_value=np.exp(log_perplexity))
            sw.add_summary(summary, global_step)
            sw.flush()
            if mode == "eval_full":
                break #we don't need to wait for other checkpoints in this mode
Example #3
def main(_):
    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps.num_gpus = FLAGS.num_gpus

    vocab = Vocabulary.from_file("small_voca.txt")  # ("1b_word_vocab.txt")

    if FLAGS.mode == "train":
        hps.batch_size = 256
        dataset = Dataset(
            vocab,
            FLAGS.datadir + "/training-monolingual.tokenized.shuffled/*")
        run_train(dataset, hps, FLAGS.logdir + "/train", ps_device="/gpu:0")
    elif FLAGS.mode.startswith("eval_"):
        hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
        hps.num_gpus = FLAGS.num_gpus

        if FLAGS.mode.startswith("eval_train"):
            data_dir = FLAGS.datadir + "/training-monolingual.tokenized.shuffled/*"
        else:
            data_dir = FLAGS.datadir + "/heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050"
        dataset = Dataset(vocab, data_dir, deterministic=True)
        run_eval(dataset, hps, FLAGS.logdir, FLAGS.mode, FLAGS.eval_steps)
Example #4
def main(_):
    """
    Start either train or eval. Note the hardcoded parts of the paths for training and eval data.
    """
    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps._set("num_gpus", FLAGS.num_gpus)
    print('*****HYPER PARAMETERS*****')
    print(hps)
    print('**************************')

    print_debug('our training DataSetDir=%s, LogDir=%s' %
                (FLAGS.datadir, FLAGS.logdir))

    #vocab = Vocabulary.from_file(os.path.join(FLAGS.datadir, "1b_word_vocab.txt"))
    vocab = Vocabulary.from_file(os.path.join(FLAGS.datadir, "vocabulary.txt"))
    FLAGS.mode = "train"
    for i in range(10):
        print("Iteration ", i, " phase: ", FLAGS.mode)
        if FLAGS.mode == "train":
            #hps.batch_size = 256
            # dataset = Dataset(vocab, os.path.join(FLAGS.datadir,
            #                                       "training-monolingual.tokenized.shuffled/*"))
            dataset = Dataset(vocab,
                              os.path.join(FLAGS.datadir, "ptb.train.txt"))

            trainlogdir = FLAGS.logdir + "/train"  # os.path.join(FLAGS.logdir, "train")
            print_debug('train log dir=%s' % (trainlogdir))

            run_train(dataset, hps, trainlogdir, ps_device="/gpu:0")
            print_debug('Finished run_train !!!!!!!!!!!')
        elif FLAGS.mode.startswith("eval"):
            print_debug('eval mode')

            # if FLAGS.mode.startswith("eval_train"):
            #     data_dir = os.path.join(FLAGS.datadir, "training-monolingual.tokenized.shuffled/*")
            # elif FLAGS.mode.startswith("eval_full"):
            #     data_dir = os.path.join(FLAGS.datadir, "heldout-monolingual.tokenized.shuffled/*")
            # else:
            #     data_dir = os.path.join(FLAGS.datadir, "heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050")
            dataset = Dataset(vocab,
                              os.path.join(FLAGS.datadir, "ptb.test.txt"),
                              deterministic=True)
            run_eval(dataset, hps, FLAGS.logdir, FLAGS.mode, FLAGS.eval_steps)
            print_debug('Finished run_eval !!!!!!!!!!!')

        if FLAGS.mode == "train":
            FLAGS.mode = "eval_full"
        else:
            FLAGS.mode = "train"
Example #5
def main(_):
    """
    Start either train or eval. Note the hardcoded parts of the paths for training and eval data.
    """
    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps._set("num_gpus", FLAGS.num_gpus)
    print('*****HYPER PARAMETERS*****')
    print(hps)
    print('**************************')

    vocab = Vocabulary.from_file(
        os.path.join(FLAGS.datadir, "1b_word_vocab.txt"))

    if FLAGS.mode == "train":
        #hps.batch_size = 256
        dataset = Dataset(
            vocab,
            os.path.join(FLAGS.datadir,
                         "training-monolingual.tokenized.shuffled/*"))
        run_train(dataset,
                  hps,
                  os.path.join(FLAGS.logdir, "train"),
                  ps_device="/gpu:0")
    elif FLAGS.mode.startswith("eval_"):
        if FLAGS.mode.startswith("eval_train"):
            data_dir = os.path.join(
                FLAGS.datadir, "training-monolingual.tokenized.shuffled/*")
        elif FLAGS.mode.startswith("eval_full"):
            data_dir = os.path.join(
                FLAGS.datadir,
                "heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050"
            )
        else:
            data_dir = os.path.join(
                FLAGS.datadir,
                "heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050"
            )
        dataset = Dataset(vocab, data_dir, deterministic=True)
        run_eval(dataset, hps, FLAGS.logdir, FLAGS.mode, FLAGS.eval_steps)
    elif FLAGS.mode.startswith("infer"):
        data_dir = os.path.join(
            FLAGS.datadir,
            "heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050"
        )
        dataset = Dataset(vocab, data_dir, deterministic=True)
        run_infer(dataset, hps, FLAGS.logdir, FLAGS.mode, vocab)
Example #6
def run():

    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    hps = LM.get_default_hparams().parse('num_steps=20,num_shards=8,num_layers=2,emb_size=12,projected_size=12,state_size=80,num_sampled=0,batch_size=1,vocab_size=102')
    hps._set("num_gpus", 1)
    #arg('model')
    #arg('vocab')
    arg('--port', type=int, default=8000)
    arg('--host', default='localhost')
    arg('--debug', action='store_true')
    args = parser.parse_args()

    global model
    #model = Model(args.model, args.vocab, hps)
    model = Model('/Users/ruiyangwang/Desktop/f-lm/logs/test/train/model.ckpt-0','/Users/ruiyangwang/Desktop/examples/word_language_model/data/penn/vocabulary.txt', hps)
    app.run(port=args.port, host=args.host, debug=args.debug)
Example #7
def test(config, dataset, model_dir, summary_dir):
    logger = logging.getLogger('lm_zh')
    config.keep_prob = 1.0
    config.num_sampled = 0
    logger.info('Build graph ...')
    initializer = tf.random_uniform_initializer(-config.init_scale,
                                                config.init_scale)
    with tf.variable_scope('model', initializer=initializer):
        model = LM(dataset, config, model_dir, summary_dir)
        logger.info('Restore model ...')
        model.restore()
        logger.info('Start test model ...')
        model.test()
    logger.info('Test done')
Example #8
def run():

    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    hps = LM.get_default_hparams().parse(
        'num_steps=20,num_shards=6,num_layers=2,learning_rate=0.2,max_grad_norm=1,keep_prob=0.9,emb_size=1024,projected_size=1024,state_size=8192,num_sampled=8192,batch_size=512,vocab_size=11859,num_of_groups=4'
    )
    hps._set("num_gpus", 1)
    #arg('model')
    #arg('vocab')
    arg('--port', type=int, default=8000)
    arg('--host', default='localhost')
    arg('--debug', action='store_true')
    args = parser.parse_args()

    global model
    #model = Model(args.model, args.vocab, hps)
    model = Model('/Users/ruiyangwang/Desktop/model/model.ckpt-44260',
                  '/Users/ruiyangwang/Desktop/vocabulary2.txt', hps)
    app.run(port=args.port, host=args.host, debug=args.debug)
Example #9
    def __init__(self, hps, logdir, datadir, mode='eval'):
        with tf.variable_scope("model"):
            hps.num_sampled = 0
            hps.keep_prob = 1.0
            self.model = LM(hps, "eval", "/gpu:0")
        if hps.average_params:
            print("Averaging parameters for evaluation.")
            saver = tf.train.Saver(self.model.avg_dict)
        else:
            saver = tf.train.Saver()
        config = tf.ConfigProto(allow_soft_placement=True)
        self.sess = tf.Session(config=config)
        sw = tf.summary.FileWriter(logdir + "/" + mode, self.sess.graph)
        self.hps = hps
        self.num_steps = self.hps.num_steps
        vocab_path = os.path.join(datadir, "vocabulary.txt")
        with self.sess.as_default():
            success = common.load_from_checkpoint(saver, logdir + "/train")
        if not success:
            raise Exception('Loading Checkpoint failed')
        self.vocabulary = Vocabulary.from_file(vocab_path)
Example #10
def main(_):
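    # Initialize Horovod; hvd.rank()/hvd.local_rank() become available below.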
    hvd.init()
    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps.num_gpus = FLAGS.num_gpus

    vocab = Vocabulary.from_file(FLAGS.vocab)
    hps.vocab_size = vocab.num_tokens

    config = tf.ConfigProto()
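    # Pin this process to a single GPU chosen by its Horovod local rank.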
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank())

    if FLAGS.logdir is None:
        FLAGS.logdir = os.path.join('/tmp',
                                    'lm-run-{}'.format(int(time.time())))
        print('logdir: {}'.format(FLAGS.logdir))
    hps.batch_size = 256
    dataset = Dataset(vocab, FLAGS.datadir)
    run_train(dataset,
              hps,
              FLAGS.logdir + '/train',
              ps_device='/gpu:' + str(hvd.local_rank()))
Example #11
def main(_):
    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps.num_gpus = FLAGS.num_gpus
    
    vocab = Vocabulary.from_file(FLAGS.datadir + "/lm_vocab.txt", hps.vocab_size)

    if FLAGS.mode == "train":
        hps.batch_size = 256  # reset batchsize
        dataset = Dataset(vocab, FLAGS.datadir + "/train/*")
        run_train(dataset, hps, FLAGS.logdir + "/train", ps_device="/gpu:0")
    elif FLAGS.mode.startswith("eval_"):
        if FLAGS.mode.startswith("eval_train"):
            data_dir = FLAGS.datadir + "/train/*"
        elif FLAGS.mode.startswith("eval_test"):
            data_dir = FLAGS.datadir + "/heldout/*"
        print("data_dir:",data_dir)
        dataset = Dataset(vocab, data_dir, deterministic=True)
        run_eval(dataset, hps, FLAGS.logdir, FLAGS.mode, FLAGS.eval_steps)
    elif FLAGS.mode.startswith("predict_next"):
        data_dir = "data/news.en.heldout-00001-of-00050"
        dataset = Dataset(vocab, data_dir)
        predict_next(dataset, hps, FLAGS.logdir, FLAGS.mode, FLAGS.eval_steps, vocab)
Example #12
def topkwords(prefix_words, dataset, hps, logdir, mode, top=10):
    inputs, targets = process_sentence(prefix_words, dataset._vocab, hps)
    with tf.variable_scope("model"):
        hps.num_sampled = 0  # Always using full softmax at evaluation.
        hps.keep_prob = 1.0
        # model = LM(hps, "eval", "/cpu:0")
        model = LM(hps, "eval", "/gpu:0")

    if hps.average_params:
        print("Averaging parameters for evaluation.")
        saver = tf.train.Saver(model.avg_dict)
    else:
        saver = tf.train.Saver()

    config = tf.ConfigProto(allow_soft_placement=True)
    sess = tf.Session(config=config)
    sw = tf.summary.FileWriter(logdir + "/" + mode, sess.graph)
    ckpt_loader = CheckpointLoader(saver, model.global_step, logdir + "/train")

    with sess.as_default():
        while ckpt_loader.load_checkpoint():
            tf.local_variables_initializer().run()
            ppl = sess.run(model.loss, {model.x: inputs, model.y: targets})
Example #13
    def test_lm(self):
        hps = get_test_hparams()

        with tf.variable_scope("model"):
            model = LM(hps)

        with self.test_session() as sess:
            tf.initialize_all_variables().run()
            tf.initialize_local_variables().run()

            loss = 1e5
            for i in range(50):
                x, y, w = simple_data_generator(hps.batch_size, hps.num_steps)
                loss, _ = sess.run([model.loss, model.train_op], {
                    model.x: x,
                    model.y: y,
                    model.w: w
                })
                print("%d: %.3f %.3f" % (i, loss, np.exp(loss)))
                if np.isnan(loss):
                    print("NaN detected")
                    break

            self.assertLess(loss, 1.0)
Example #14
def run_train(dataset, hps, logdir, ps_device, task=0, master=""):
    t0 = time.time()
    f = open('loss-log.txt', 'w')
    with tf.variable_scope("model"):
        model = LM(hps, "train", ps_device)
    stime = time.time()
    print("Current time: %s" % stime)
    print("ALL VARIABLES")
    for v in tf.all_variables():
        print("%s %s %s %s" % (v.name, v.get_shape(), v.dtype, v.device))
    print("TRAINABLE VARIABLES")
    for v in tf.trainable_variables():
        print("%s %s %s %s" % (v.name, v.get_shape(), v.dtype, v.device))
    print("LOCAL VARIABLES")
    for v in tf.local_variables():
        print("%s %s %s %s" % (v.name, v.get_shape(), v.dtype, v.device))

    sv = tf.train.Supervisor(is_chief=(task == 0),
                             logdir=logdir,
                             summary_op=None,  # Automatic summaries don't work with placeholders.
                             global_step=model.global_step,
                             save_summaries_secs=60*hps.save_summary_every_min,
                             save_model_secs=60*hps.save_model_every_min)
                             #save_summaries_secs=30,
                             #save_model_secs=120 * 5)

    #config = tf.ConfigProto(allow_soft_placement=True,
    #                        intra_op_parallelism_threads=2,
    #                        inter_op_parallelism_threads=20)
    config = tf.ConfigProto(allow_soft_placement=True)
    with sv.managed_session(master, config=config) as sess:
        # Slowly increase the number of workers during beginning of the training.
        #while not sv.should_stop() and (time.time() - stime) < hps.max_time:
        #    step = int(sess.run(model.global_step))
        #    waiting_until_step = task * hps.num_delayed_steps
        #    if step >= waiting_until_step:
        #        break
        #    else:
        #        print("Current step is %d. Waiting until: %d" % (step, waiting_until_step))
        #    time.sleep(20.0)

        local_step = 0
        prev_global_step = sess.run(model.global_step)
        cur_global_step = 0
        prev_time = time.time()
        data_iterator = dataset.iterate_forever(hps.batch_size * hps.num_gpus, hps.num_steps)
        while not sv.should_stop() and (time.time() - stime) < hps.max_time:
            fetches = [model.global_step, model.loss, model.train_op]
            # Chief worker computes summaries every 100 steps.
            should_compute_summary = (task == 0 and local_step % 100 == 0)
            if should_compute_summary:
                fetches += [model.summary_op]

            #x, y, w = next(data_iterator)
            x, y = next(data_iterator)
            should_run_profiler = (hps.run_profiler and task == 0 and local_step % 1000 == 13)
            if should_run_profiler:
                run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
                #fetched = sess.run(fetches, {model.x: x, model.y: y, model.w: w},
                fetched = sess.run(fetches, {model.x: x, model.y: y},
                                   options=run_options, run_metadata=run_metadata)
                # Create the Timeline object, and write it to a json
                tl = timeline.Timeline(run_metadata.step_stats)
                ctf = tl.generate_chrome_trace_format()
                print("Running profiler")
                with open(logdir + "/timeline.json", 'w') as f:
                    f.write(ctf)
                print("Finished profiling!")
            else:
                #fetched = sess.run(fetches, {model.x: x, model.y: y, model.w: w})
                fetched = sess.run(fetches, {model.x: x, model.y: y})
            
            cur_global_step = fetched[0]

            local_step += 1
            if should_compute_summary:
                sv.summary_computed(sess, fetched[-1])

            if local_step < 10 or local_step % 20 == 0:
                cur_time = time.time()
                num_words = hps.batch_size * hps.num_gpus * hps.num_steps
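                # wps: words processed per second since the last report.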
                wps = (cur_global_step - prev_global_step) * num_words / (cur_time - prev_time)
                prev_global_step = cur_global_step
                print("Iteration %d, time = %.2fs, wps = %.0f, train loss = %.4f" % (
                    cur_global_step, cur_time - prev_time, wps, fetched[1]))
                f.write("%s,%s,%s,%s,%s\r\n" % (cur_global_step, cur_time-t0, cur_time-prev_time, wps, fetched[1]))
                f.flush()
                prev_time = cur_time
        #save last model
        sv._saver.save(sess, sv.save_path, cur_global_step)
    sv.stop()
    f.close()
Example #15
def run_train(dataset, hps, logdir, ps_device, task=0, master=''):
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=2,
                            inter_op_parallelism_threads=20)
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    
    with tf.variable_scope('model'):
        model = LM(hps, 'train', ps_device)

    print('ALL VARIABLES')
    for v in tf.all_variables():
        print('%s %s %s' % (v.name, v.get_shape(), v.device))
    print('TRAINABLE VARIABLES')
    for v in tf.trainable_variables():
        print('%s %s %s' % (v.name, v.get_shape(), v.device))
    print('LOCAL VARIABLES')
    for v in tf.local_variables():
        print('%s %s %s' % (v.name, v.get_shape(), v.device))

    #sv = tf.train.Supervisor(
    #    is_chief=(task == 0),
    #    logdir=logdir,
    #    summary_op=None,  # Automatic summaries don't work with placeholders.
    #    global_step=model.global_step,
    #    save_summaries_secs=30,
    #    save_model_secs=120 * 5)
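    # Ensure all workers start from identical weights by broadcasting from rank 0.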
    hooks = [hvd.BroadcastGlobalVariablesHook(0)]
    total_step = 0
    with tf.train.MonitoredTrainingSession(config=config, hooks=hooks) as sess:
        for v in tf.get_collection('initial_state'):
            sess.run(v.initializer, feed_dict={model.batch_size: hps.batch_size})
        # Slowly increase the number of workers during
        # beginning of the training.
        while not sess.should_stop():
            step = int(sess.run(model.global_step))
            waiting_until_step = task * hps.num_delayed_steps
            if step >= waiting_until_step:
                break
            else:
                print('Current step is %d. Waiting until: %d' %
                      (step, waiting_until_step))
            time.sleep(10.0)

        local_step = 0
        prev_global_step = sess.run(model.global_step)
        prev_time = time.time()
        data_iterator = dataset.iterate_forever(
            hps.batch_size * hps.num_gpus, hps.num_steps)
        while not sess.should_stop():
            fetches = [model.global_step, model.loss, model.train_op]
            # Chief worker computes summaries every 20 steps.
            should_compute_summary = (
                hvd.rank() == 0 and local_step > 0 and local_step % 20 == 0)
            if should_compute_summary:
                fetches += [model.summary_op]

            x, y, w = next(data_iterator)
            fetched = sess.run(fetches, {model.x: x, model.y: y, model.w: w})

            local_step += 1
            #if should_compute_summary:
            #    sess.summary_computed(sess, fetched[-1])
            if hvd.rank() == 0:
                if local_step < 10 or local_step % 200 == 0:
                    cur_time = time.time()
                    num_words = hps.batch_size * hps.num_gpus * hps.num_steps
                    sps = hps.batch_size * hps.num_gpus * (fetched[0] - prev_global_step) / (cur_time - prev_time)
                    wps = ((fetched[0] - prev_global_step) * num_words /
                           (cur_time - prev_time))
                    prev_global_step = fetched[0]
                    print('Iteration %d, time = %.2fs, wps = %.0f, sps = %.0f '
                          'train loss = %.4f' % (
                            fetched[0], cur_time - prev_time, wps * hvd.size(), sps * hvd.size(), fetched[1]))
                    prev_time = cur_time
Example #16
import json
import numpy as np
import time
import tensorflow as tf
from data_utils import Vocabulary, Dataset
from language_model import LM
from common import CheckpointLoader

BATCH_SIZE = 1
NUM_TIMESTEPS = 1
MAX_WORD_LEN = 50

UPLOAD_FOLDER = '/data/ngramTest/uploads'
UPLOAD_FOLDER = './'

hps = LM.get_default_hparams()
vocab = Vocabulary.from_file("1b_word_vocab.txt")
with tf.variable_scope("model"):
    hps.num_sampled = 0  # Always use full softmax at evaluation; sampled softmax can run out of memory.
    hps.keep_prob = 1.0
    hps.num_gpus = 1
    model = LM(hps, "predict_next", "/cpu:0")

if hps.average_params:
    print("Averaging parameters for evaluation.")
    saver = tf.train.Saver(model.avg_dict)
else:
    saver = tf.train.Saver()

# Use only 4 threads for the evaluation.
config = tf.ConfigProto(allow_soft_placement=True,
Example #17
    config = p.parse_args()

    return config


if __name__ == '__main__':
    config = define_argparser()

    loader = DataLoader(config.train,
                        config.valid,
                        batch_size=config.batch_size,
                        device=config.gpu_id,
                        max_length=config.max_length)
    model = LM(len(loader.text.vocab),
               word_vec_dim=config.word_vec_dim,
               hidden_size=config.hidden_size,
               n_layers=config.n_layers,
               dropout_p=config.dropout,
               max_length=config.max_length)

    # Let criterion cannot count PAD as right prediction, because PAD is easy to predict.
    loss_weight = torch.ones(len(loader.text.vocab))
    loss_weight[data_loader.PAD] = 0
    criterion = nn.NLLLoss(weight=loss_weight, size_average=False)

    print(model)
    print(criterion)

    if config.gpu_id >= 0:
        model.cuda(config.gpu_id)
        criterion.cuda(config.gpu_id)
Example #18
def predict_next(dataset, hps, logdir, mode, num_eval_steps, vocab):
    with tf.variable_scope("model"):
        hps.num_sampled = 0  # Always use full softmax at evaluation; sampled softmax can run out of memory.
        hps.keep_prob = 1.0
        model = LM(hps, "predict_next", "/cpu:0")

    if hps.average_params:
        print("Averaging parameters for evaluation.")
        saver = tf.train.Saver(model.avg_dict)
    else:
        saver = tf.train.Saver()

    # Limit thread usage for the evaluation.
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=20,
                            inter_op_parallelism_threads=1)
    sess = tf.Session(config=config)
    sw = tf.summary.FileWriter(logdir + "/" + mode, sess.graph)
    ckpt_loader = CheckpointLoader(saver, model.global_step, logdir + "/train")
    with sess.as_default():
        ckpt_loader.load_checkpoint()  #  FOR ONLY ONE CHECKPOINT
        global_step = ckpt_loader.last_global_step
        data_iterator = dataset.iterate_once(hps.batch_size * hps.num_gpus,
                                             hps.num_steps)
        sess.run(tf.local_variables_initializer())
        print("global_step:", global_step)
        loss_nom = 0.0
        loss_den = 0.0
        cur_time = time.time()
        savedKey = 0
        totalKey = 0
        '''
            text = open("data/news.en.heldout-00001-of-00050","r")
            for kk,line in enumerate(text):
                totalKey += len(line.strip())
                if kk==0:
                    print len(line)
            print "totalKey:",totalKey
            '''
        predicted_words = []
        for i, (x, y, w) in enumerate(data_iterator):
            #if i >= num_eval_steps:
            #    break
            '''
                print "i",i
                print "x",x
                
                for j in x[:]:
                    print j
                    for jj in j:
                        print vocab.get_token(jj)
                '''
            #print "x:",[vocab.get_token(ix) for ix in x[0]]
            #print "y:",[vocab.get_token(ix) for ix in y[0]]
            inputs = [vocab.get_token(ix) for ix in x[0]]
            labels = [vocab.get_token(ix) for ix in y[0]]
            loss, logits, indexes = sess.run(
                [model.loss, model.logits, model.index], {
                    model.x: x,
                    model.y: y,
                    model.w: w
                })
            #print logits.shape,indexes
            #print indexes[0]
            tmpKS = 0
            tmpAllKey = 0

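            # Keystroke-savings (KSR) accounting: characters of every correctly
            # predicted word (within the top hps.arg_max candidates) count as saved.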
            for step in range(hps.num_steps):
                words = []
                totalKey += len(inputs[step])
                tmpAllKey += len(inputs[step])
                if step > 0:
                    totalKey += 1  # for space between two keys
                    tmpAllKey += 1
                for j in range(hps.arg_max):
                    word = vocab.get_token(indexes[0][step][j])
                    words += [word]
                    if word == labels[step]:
                        predicted_words += [word]
                        tmpKS += len(labels[step])
                        savedKey += len(labels[step])
                #print "predict: ", words
            # print "x:",x
            print("i:%6d,  savedKey:%d , totalKey:%d,  ksr : %.3f " %
                  (i, tmpKS, tmpAllKey, tmpKS * 1.0 / tmpAllKey))
        print("savedKey:%d , totalKey:%d,  ksr : %.3f " %
              (savedKey, totalKey, savedKey * 1.0 / totalKey))
        print("predicted_words:")
        print(predicted_words)
        now = time.time()
        print "time:", now - cur_time
Example #19
class HomoNoiserScript:
    def __init__(self, generator, **kwargs):
        self.verbose = kwargs.get("verbose", False)

        # Error params
        self.max_M = kwargs.get("max_m", 2)
        self.max_N = kwargs.get("max_n", 2)
        self.sampling_m = kwargs.get("sampling_m", "weighted")
        self.sampling_n = kwargs.get("sampling_n", "weighted")
        self.error_rate = kwargs.get("error_rate", 0.3)
        self.error_model = kwargs.get("error_model", 'phoneme')
        self.cnt_error_samples = kwargs.get("error_samples", 5)
        self.sampling_error_samples = kwargs.get("sampling_error_samples",
                                                 "weighted")

        # Sentence error parameters
        self.min_wer = kwargs.get("min_wer", 0.1)
        self.max_wer = kwargs.get("max_wer", 0.6)
        self.cnt_sentence_samples = kwargs.get("sentence_samples", 10)
        self.sampling_sentence_samples = kwargs.get(
            "sampling_sentence_samples", "weighted_lm")
        self.use_lm = kwargs.get("use_lm", True)
        self.lm_name = kwargs.get("bert_lm", None)

        # Phoneme model parameters
        self.g2p_model = kwargs.get("g2p_model", None)
        self.p2g_model = kwargs.get("p2g_model", None)
        self.lexicon = kwargs.get("lexicon", None)

        # Dictionary model parameters
        self.dictionary_filename_list = kwargs.get("dictionary_filename_list",
                                                   None)
        self.jaro_winkler_threshold = kwargs.get("jaro_winkler_threshold", 0.8)
        # Embedding model parameters
        # ---

        # Target parameters
        self.base_target_dir = kwargs.get("base_target_dir", None)

        # Set logger
        self.logger = logger.BasicLogger.setupLogger(verbose=self.verbose)

        # Check everything is OK
        self.checkConfig()

        # Set Generator
        self.generator = generator

        noise_generator = self.get_noise_generator()

        self.sentence_graph = SamplingGraph(
            noise_generator=noise_generator,
            error_prob=self.error_rate,
            max_M=self.max_M,
            sampling_M=self.sampling_m,
            sampling_N=self.sampling_n,
            sampling_error_samples=self.sampling_error_samples)

        # Check if target directory is empty
        self.input_filename_list = kwargs.get("input_filename_list", None)
        self.input_source_dir = kwargs.get("input_source_dir", None)

        self.check_target_directory(self.base_target_dir,
                                    self.input_filename_list)

        # Set LM
        if self.use_lm:
            self.logger.info('loading {} model'.format(self.lm_name))
            self.bert_lm = LM(self.lm_name)
        else:
            self.bert_lm = None
            self.logger.info("Language model not used")

    def delete_files(self, list_of_filenames):
        import shutil
        for file_path in list_of_filenames:  # os.listdir(folder):
            # file_path = os.path.join(folder, filename)
            try:
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                print('Failed to delete %s. Reason: %s' % (file_path, e))

    def check_target_directory(self, base_target_dir, file_list):
        """
        https://stackoverflow.com/questions/185936/how-to-delete-the-contents-of-a-folder
        """

        with open(file_list) as f:
            files_list = [line.rstrip() for line in f]
        list_of_filenames = [
            os.path.join(base_target_dir, f) for f in files_list
        ]

        existing_files = []
        for file_path in list_of_filenames:
            if os.path.isfile(file_path):
                existing_files.append(file_path)

        if len(existing_files) > 0:
            self.logger.error(
                "!!!!! Target directory already contains target files !!!!! ----- Should the files be deleted ????? [Y/N]"
            )
            time.sleep(0.5)
            print("delete files? [Y/N]: ", end="")
            while True:
                input1 = input()
                input1 = input1.lower()
                if input1 == "y":
                    self.logger.warning("Deleting files ...")
                    self.delete_files(existing_files)
                    break
                if input1 == "n":
                    self.logger.error(
                        "Move files somewhere else ... exiting now!")
                    exit(1)
                print("delete files? [Y/N]: ", end="")
                # self.logger.error("????? Should the files be deleted ????? [Y/N]")

    def get_noise_generator(self):
        if self.error_model == "phoneme":
            return NoiseFromP2G(g2p_model_path=self.g2p_model,
                                p2g_model_path=self.p2g_model,
                                pronounc_dict_path=self.lexicon,
                                cnt_error_samples=self.cnt_error_samples,
                                max_N=self.max_N)

        elif self.error_model == "dictionary":
            return NoiseFromDict(db_file_list=self.dictionary_filename_list,
                                 threshold=self.jaro_winkler_threshold,
                                 cnt_error_samples=self.cnt_error_samples,
                                 max_N=self.max_N)
        else:
            self.logger.error("Error model not implemented: {}".format(
                self.error_model))
            raise NotImplementedError

    def which(self, program):
        """Basic 'which' implementation for python.

        Basic 'which' implementation for python from stackoverflow:
          * https://stackoverflow.com/a/377028/6739158
        """
        def is_exe(fpath):
            return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

        fpath, fname = os.path.split(program)
        if fpath:
            if is_exe(program):
                return program
        else:
            for path in os.environ["PATH"].split(os.pathsep):
                path = path.strip('"')
                exe_file = os.path.join(path, program)
                if is_exe(exe_file):
                    return exe_file

        return None

    def validateLexicon(self):
        validator_pattern = u"[\\}\\|_]"  # python2: unicode, python3: str
        validator = re.compile(validator_pattern)

        with open(self.lexicon, "r") as ifp:
            for line in ifp:
                if validator.search(line):
                    error = "Bad line contains reservered character:\n\t{0}"
                    error = error.format(line)
                    raise ValueError(error)

        return

    def checkConfig(self):
        self.logger.info("Checking command configuration...")
        for program in [
                "phonetisaurus-g2pfst", "phonetisaurus-align",
                "phonetisaurus-arpa2wfst"
        ]:
            if not self.which(program):
                raise EnvironmentError(", ".join(
                    ["Phonetisaurus command, '{0}'",
                     "not found in path."]).format(program))

        # Create target_meta directory if not exists
        if not os.path.isdir(self.base_target_dir):
            self.logger.debug("Directory does not exist.  Trying to create.")
            os.makedirs(self.base_target_dir)

        if self.error_model == 'phoneme':
            self.logger.info(
                "Checking lexicon for reserved characters: '}', '|', '_'...")
            self.validateLexicon()

        # Basic assertions
        if self.max_M < 1:
            self.logger.error("max_M must be >= 1, but {} given".format(
                self.max_M))
            raise ValueError
        if self.max_N < 1:
            self.logger.error("max_N must be >= 1, but {} given".format(
                self.max_N))
            raise ValueError
        if self.cnt_error_samples < 1:
            self.logger.error(
                "cnt_error_samples must be >= 1, but {} given".format(
                    self.cnt_error_samples))
            raise ValueError
        if not 0.0 <= self.error_rate <= 1.0:
            self.logger.error(
                "error_rate must be in [0,1], but: {} was given".format(
                    self.error_rate))
            raise ValueError
        if not 0.0 <= self.min_wer <= 1.0:
            self.logger.error(
                "min_wer must be in [0,1], but: {} was given".format(
                    self.min_wer))
            raise ValueError
        if not 0.0 <= self.max_wer <= 1.0:
            self.logger.error(
                "max_wer must be in [0,1], but: {} was given".format(
                    self.max_wer))
            raise ValueError
        if self.cnt_sentence_samples < 1:
            self.logger.error(
                "cnt_sentence_samples must be >= 1, but {} given".format(
                    self.cnt_sentence_samples))
            raise ValueError

        # Other basic assertions
        if self.sampling_m not in ['uniform', 'weighted']:
            self.logger.error("sampling_m options are {}".format(
                ['uniform', 'weighted']))
            raise ValueError
        if self.sampling_n not in ['uniform', 'weighted']:
            self.logger.error("sampling_n options are {}".format(
                ['uniform', 'weighted']))
            raise ValueError
        if self.error_model not in ['phoneme', 'dictionary', 'embedding']:
            self.logger.error("sampling_n options are {}".format(
                ['phoneme', 'dictionary', 'embedding']))
            raise ValueError
        if self.sampling_error_samples not in ['weighted', 'uniform']:
            self.logger.error("sampling_error_samples options are {}".format(
                ['weighted', 'uniform']))
            raise ValueError
        if self.sampling_sentence_samples not in [
                'uniform', 'weighted_lm', 'max_lm'
        ]:
            self.logger.error(
                "sampling_sentence_samples options are {}".format(
                    ['uniform', 'weighted_lm', 'max_lm']))
            raise ValueError

        if self.error_model == 'phoneme':
            if not os.path.isfile(self.p2g_model):
                self.logger.error("p2g_model not found: {}".format(
                    self.p2g_model))
                raise FileNotFoundError
            if not os.path.isfile(self.g2p_model):
                self.logger.error("g2p_model not found: {}".format(
                    self.g2p_model))
                raise FileNotFoundError
            if not os.path.isfile(self.lexicon):
                self.logger.error("lexicon not found: {}".format(self.lexicon))
                raise FileNotFoundError

        if self.error_model == 'dictionary':
            if not os.path.isfile(self.dictionary_filename_list):
                self.logger.error(
                    "dictionary_filename_list not found: {}".format(
                        self.dictionary_filename_list))
                raise FileNotFoundError

        if self.error_model == 'embedding':
            self.logger.error("error model: {} not yet implemented".format(
                self.error_model))
            raise NotImplementedError

        items = vars(self).items()
        for key, val in sorted(items):
            self.logger.debug(u"{0}:  {1}".format(key, val))
        return

    def run(self):
        # For all sentences in the dataset ...
        for s in self.generator:  # the sentence generator passed in at construction
            source_doc_path, sentence_id, sentence = s

            # We must add sentence to the graph
            self.sentence_graph.set_sentence(sentence)

            # Score of the original sentence (only for debug purposes ...)
            if self.verbose:
                if self.bert_lm is not None:
                    score = self.bert_lm.get_score(sentence)
                    self.logger.debug("LM[{:.2f}]{}".format(score, sentence))
                else:
                    self.logger.debug("LM[ - ]{}".format(sentence))

            # Now we generate multiple (cnt_sentence_samples) "noisified variants" from current sentence
            avg_wer = 0
            samples_list = []

            # A. Create samples
            tries = -1
            while True:
                tries += 1

                # all samples collected (or we do not want to wait too long ...)
                if (len(samples_list) == self.cnt_sentence_samples
                        or tries * 2 > self.cnt_sentence_samples):
                    break

                debug, sample = self.sentence_graph.sample_sentence()
                if self.bert_lm is not None:
                    score = self.bert_lm.get_score(sample)
                else:
                    score = 1.
                error = wer(sentence, sample)

                if self.min_wer <= error <= self.max_wer:
                    avg_wer += error
                    samples_list.append((sample, score, error))

            # DEBUG: print all the sentence variants
            if self.verbose:
                for sam in samples_list:
                    sample, score, error = sam
                    if self.bert_lm is not None:
                        self.logger.debug(" LM[{:.2f}] WER[{:.2f}]{}".format(
                            score, error, sample))
                    else:
                        self.logger.debug(" LM[ - ] WER[{:.2f}]{}".format(
                            error, sample))

                self.logger.debug("avg WER: {:.2f}".format(
                    avg_wer / self.cnt_sentence_samples))

            # B. Finally we choose one sentence .....
            if len(samples_list) == 0:
                selected_sentence = sentence
            else:
                sentences = []
                lm_weights = []
                for s in samples_list:  # (sentence, LM, WER)
                    sentences.append(s[0])
                    lm_weights.append(s[1])
                selected_sentence = utils.choice(sentences, lm_weights)

            # And we write it to file
            source_doc = ntpath.basename(source_doc_path)
            target_doc_path = os.path.join(self.base_target_dir, source_doc)
            if os.path.isfile(target_doc_path):
                newline = True
            else:
                newline = False
            with open(target_doc_path, "a") as file:
                if newline:
                    file.write("\n")
                file.write(selected_sentence)
        self.logger.info("All files successfully processed")
        self.logger.info("Calculating WER on files...")
        time.sleep(0.5)
        return wer_over_files(self.input_source_dir, self.base_target_dir,
                              self.input_filename_list)
Example #20
    def __init__(self, generator, **kwargs):
        self.verbose = kwargs.get("verbose", False)

        # Error params
        self.max_M = kwargs.get("max_m", 2)
        self.max_N = kwargs.get("max_n", 2)
        self.sampling_m = kwargs.get("sampling_m", "weighted")
        self.sampling_n = kwargs.get("sampling_n", "weighted")
        self.error_rate = kwargs.get("error_rate", 0.3)
        self.error_model = kwargs.get("error_model", 'phoneme')
        self.cnt_error_samples = kwargs.get("error_samples", 5)
        self.sampling_error_samples = kwargs.get("sampling_error_samples",
                                                 "weighted")

        # Sentence error parameters
        self.min_wer = kwargs.get("min_wer", 0.1)
        self.max_wer = kwargs.get("max_wer", 0.6)
        self.cnt_sentence_samples = kwargs.get("sentence_samples", 10)
        self.sampling_sentence_samples = kwargs.get(
            "sampling_sentence_samples", "weighted_lm")
        self.use_lm = kwargs.get("use_lm", True)
        self.lm_name = kwargs.get("bert_lm", None)

        # Phoneme model parameters
        self.g2p_model = kwargs.get("g2p_model", None)
        self.p2g_model = kwargs.get("p2g_model", None)
        self.lexicon = kwargs.get("lexicon", None)

        # Dictionary model parameters
        self.dictionary_filename_list = kwargs.get("dictionary_filename_list",
                                                   None)
        self.jaro_winkler_threshold = kwargs.get("jaro_winkler_threshold", 0.8)
        # Embedding model parameters
        # ---

        # Target parameters
        self.base_target_dir = kwargs.get("base_target_dir", None)

        # Set logger
        self.logger = logger.BasicLogger.setupLogger(verbose=self.verbose)

        # Check everything is OK
        self.checkConfig()

        # Set Generator
        self.generator = generator

        noise_generator = self.get_noise_generator()

        self.sentence_graph = SamplingGraph(
            noise_generator=noise_generator,
            error_prob=self.error_rate,
            max_M=self.max_M,
            sampling_M=self.sampling_m,
            sampling_N=self.sampling_n,
            sampling_error_samples=self.sampling_error_samples)

        # Check if target directory is empty
        self.input_filename_list = kwargs.get("input_filename_list", None)
        self.input_source_dir = kwargs.get("input_source_dir", None)

        self.check_target_directory(self.base_target_dir,
                                    self.input_filename_list)

        # Set LM
        if self.use_lm:
            self.logger.info('loading {} model'.format(self.lm_name))
            self.bert_lm = LM(self.lm_name)
        else:
            self.bert_lm = None
            self.logger.info("Language model not used")
Example #21
    print("You're not in the cluster spec!  exiting!")
    exit(-1)
else:
    print("ROLE: %s" % role)
    print("INDEX: %s" % task_index)

cluster = tf.train.ClusterSpec(cluster_spec)
server = tf.train.Server(cluster, job_name=role, task_index=task_index)
if role == "ps":
    server.join()
else:
    ps_device = '/job:ps/task:0'
    """
    Start either train or eval. Note the hardcoded parts of the paths for training and eval data.
    """
    hps = LM.get_default_hparams().parse(FLAGS.hpconfig)
    hps._set("num_gpus", FLAGS.num_gpus)
    print('*****HYPER PARAMETERS*****')
    print(hps)
    print('**************************')

    vocab = Vocabulary.from_file(
        os.path.join(FLAGS.datadir, "1b_word_vocab.txt"))

    if FLAGS.mode == "train":
        #hps.batch_size = 256
        dataset = Dataset(
            vocab,
            os.path.join(FLAGS.datadir,
                         "training-monolingual.tokenized.shuffled/*"))
        run_train(dataset,
Example #22
import re
import urllib2
import thriftpy
from thriftpy.rpc import make_server
import tensorflow as tf
from data_utils import Vocabulary, Dataset
from language_model import LM, inference_graph
from common import CheckpointLoader
import numpy as np

interface_thrift = thriftpy.load("interface.thrift",
                                 module_name="interface_thrift")
#import pdb
#pdb.set_trace()
top_k = 3
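# Regexes: pattern picks out word characters, p_punc common sentence punctuation.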
pattern = re.compile('[\w+]')
p_punc = re.compile('(\.|\"|,|\?|\!)')
hps = LM.get_default_hparams()
vocab = Vocabulary.from_file("1b_word_vocab.txt")
st = hps.num_steps

with tf.variable_scope("model"):
    hps.vocab_size = 793470
    hps.num_sampled = 0  # Always use full softmax at evaluation; sampled softmax can run out of memory.
    hps.keep_prob = 1.0
    hps.num_gpus = 1
    model = inference_graph(hps)
if hps.average_params:
    print("Averaging parameters for evaluation.")
    saver = tf.train.Saver(model.avg_dict)
else:
    saver = tf.train.Saver()