Example #1
def test_read_train_dev_test():
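    # Sanity check for data_util.read_train_dev / read_test on the PTB data:
    # build (or reuse) the cache, then print the bucket layout and bucket sizes.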
    data_dir = os.path.join(root_dir, "data/ptb")
    train_path = os.path.join(data_dir, "train")
    dev_path = os.path.join(data_dir, "valid")
    test_path = os.path.join(data_dir, "test")
    cache_dir = os.path.join(root_dir, "data/ptb/cache")
    vocab_size = 20000
    if not os.path.exists(cache_dir):
        os.mkdir(cache_dir)
    train_data_bucket, dev_data_bucket, _buckets, vocab_path = data_util.read_train_dev(
        cache_dir, train_path, dev_path, vocab_size, 100, 10)
    test_data_bucket, _buckets_test = data_util.read_test(
        cache_dir, test_path, vocab_path, vocab_size, 100, 10)

    def print_bucket_data(data):
        l = [len(x) for x in data]
        print l

    print "_buckets: {}\n".format(_buckets)
    print_bucket_data(train_data_bucket)
    print_bucket_data(dev_data_bucket)
    print "_buckets_test: {}\n".format(_buckets_test)
    print_bucket_data(test_data_bucket)
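
A minimal sketch of running this check directly, assuming os, data_util and root_dir come from the surrounding module (the root_dir value below is only illustrative):

if __name__ == "__main__":
    # hypothetical: point root_dir at the project root before running the check
    root_dir = os.path.dirname(os.path.abspath(__file__))
    test_read_train_dev_test()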
Example #2
def train():
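    # Train the bucketed language model: read and bucket the data, then run
    # total_steps of optimization with a dev-perplexity checkpoint every half epoch.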
    # 1. Read in the train and dev data
    mylog_section('READ DATA')
    train_data_bucket, dev_data_bucket, _buckets, vocab_path = read_train_dev(
        FLAGS.data_cache_dir, FLAGS.train_path, FLAGS.dev_path,
        FLAGS.vocab_size, FLAGS.L, FLAGS.n_bucket)
    ########## print the needed report info: start #####################
    real_vocab_size = get_real_vocab_size(vocab_path)

    FLAGS._buckets = _buckets
    FLAGS.real_vocab_size = real_vocab_size

    # count the total number of tokens to be processed
    train_n_tokens = np.sum([
        np.sum([len(sentence) for sentence in bucket])
        for bucket in train_data_bucket
    ])

    # train_data_bucket
    train_bucket_sizes = [
        len(train_data_bucket[index]) for index in xrange(len(_buckets))
    ]
    train_total_size = float(sum(train_bucket_sizes))
    # Compute cumulative fractions used for bucket selection: data_iterator draws a random
    # number in [0, 1), and train_buckets_scale splits that range into intervals weighted by
    # each bucket's sentence count, e.g. [0.1, 0.3, 0.5, 0.8, 1].
    # Whichever interval the random number falls into decides which bucket is chosen.
    train_buckets_scale = [
        sum(train_bucket_sizes[:i + 1]) / train_total_size
        for i in xrange(len(train_bucket_sizes))
    ]

    dev_bucket_sizes = [
        len(dev_data_bucket[index]) for index in xrange(len(_buckets))
    ]
    dev_total_size = int(sum(dev_bucket_sizes))

    mylog_section("REPORT")
    # steps
    batch_size = FLAGS.batch_size
    n_epoch = FLAGS.n_epoch
    steps_per_epoch = int(train_total_size / batch_size)
    steps_per_checkpoint = int(steps_per_epoch / 2)  # validate the model every half epoch
    total_steps = steps_per_epoch * n_epoch
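    # Worked example with hypothetical numbers: train_total_size = 50000 and
    # batch_size = 64 give steps_per_epoch = 781 and steps_per_checkpoint = 390
    # (two validations per epoch); with n_epoch = 10, total_steps = 7810.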

    # reports
    mylog("real_vocab_size: {}".format(FLAGS.real_vocab_size))
    mylog("_buckets: {}".format(FLAGS._buckets))
    mylog("Train:")
    mylog("total: {}".format(train_total_size))
    mylog("bucket sizes: {}".format(train_bucket_sizes))
    mylog("Dev:")
    mylog("total: {}".format(dev_total_size))
    mylog("bucket sizes: {}".format(dev_bucket_sizes))
    mylog("Steps_per_epoch: {}".format(steps_per_epoch))
    mylog("Total_steps:{}".format(total_steps))
    mylog("Steps_per_checkpoint: {}".format(steps_per_checkpoint))
    ########## print the needed report info: end #####################

    mylog_section("IN TENSORFLOW")

    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = FLAGS.allow_growth
    with tf.Session(config=config) as sess:
        # runtime profile
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog_section("MODEL/SUMMARY/WRITER")
        mylog("Creating Model.. (this can take a few minutes)")
        model = create_model(sess, run_options, run_metadata)

        mylog_section("All Variables")
        show_all_variables()

        # Data Iterators
        mylog_section("Data Iterators")
        dite = DataIterator(model, train_data_bucket, len(train_buckets_scale),
                            batch_size, train_buckets_scale)
        iteType = 0
        if iteType == 0:
            mylog("Itetype: withRandom")
            ite = dite.next_random()
        elif iteType == 1:
            mylog("Itetype: withSequence")
            ite = dite.next_sequence()

        # statistics during training
        step_time, loss = 0.0, 0.0
        current_step = 0
        low_ppx = float("inf")
        steps_per_report = 30
        n_targets_report = 0
        report_time = 0
        n_valid_sents = 0
        n_valid_words = 0
        patience = FLAGS.patience

        mylog_section("TRAIN")
        while current_step < total_steps:
            # start
            start_time = time.time()
            # data and train
            inputs, outputs, weights, bucket_id = ite.next()  # fetch a training batch

            L = model.step(sess, inputs, outputs, weights, bucket_id)

            # loss and time
            step_time += (time.time() - start_time) / steps_per_checkpoint
            loss += L
            current_step += 1
            # Here len(weights) == sentence length (number of time steps)
            # and len(weights[0]) == batch size
            n_valid_sents += np.sum(np.sign(weights[0]))
            n_valid_words += np.sum(weights)
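            # Illustration (hypothetical shapes): for a batch of 64 sentences padded to
            # length 35, weights has 35 rows of 64 entries each; np.sign(weights[0]) flags
            # the sentences whose first target is non-padding, and np.sum(weights) counts
            # every non-padding target token in the batch.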
            # for report
            report_time += (time.time() - start_time)
            n_targets_report += np.sum(weights)

            # report progress
            if current_step % steps_per_report == 0:
                sect_name = "STEP {}".format(current_step)
                msg = "StepTime: {:.2f} sec Speed: {:.2f} targets/s Total_targets: {}".format(
                    report_time / steps_per_report,
                    n_targets_report * 1.0 / report_time, train_n_tokens)
                mylog_line(sect_name, msg)

                report_time = 0
                n_targets_report = 0

                # Create the Timeline object, and write it to a json
                if FLAGS.profile:
                    tl = timeline.Timeline(run_metadata.step_stats)
                    ctf = tl.generate_chrome_trace_format()
                    with open('timeline.json', 'w') as f:
                        f.write(ctf)
                    exit()

            # every half epoch, compute the dev perplexity
            if current_step % steps_per_checkpoint == 0:
                i_checkpoint = int(current_step / steps_per_checkpoint)
                # train_ppx
                loss = loss / n_valid_words
                train_ppx = math.exp(
                    float(loss)) if loss < 300 else float("inf")
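                # loss has been normalized by the number of valid target tokens, and
                # exp(loss) is reported as the training perplexity; the 300 cutoff just
                # reports inf instead of an astronomically large value.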
                learning_rate = model.learning_rate.eval()

                # dev_ppx
                dev_loss, dev_ppx = evaluate(sess, model, dev_data_bucket)

                # report
                sect_name = "CHECKPOINT {} STEP {}".format(
                    i_checkpoint, current_step)
                msg = "Learning_rate: {:.4f} Dev_ppx: {:.2f} Train_ppx: {:.2f}".format(
                    learning_rate, dev_ppx, train_ppx)
                mylog_line(sect_name, msg)

                # save model per checkpoint
                if FLAGS.saveCheckpoint:
                    checkpoint_path = os.path.join(FLAGS.saved_model_dir,
                                                   "model")
                    s = time.time()
                    model.saver.save(sess,
                                     checkpoint_path,
                                     global_step=i_checkpoint,
                                     write_meta_graph=False)
                    msg = "Model saved using {:.2f} sec at {}".format(
                        time.time() - s, checkpoint_path)
                    mylog_line(sect_name, msg)

                # save best model
                if dev_ppx < low_ppx:
                    patience = FLAGS.patience
                    low_ppx = dev_ppx
                    checkpoint_path = os.path.join(FLAGS.saved_model_dir,
                                                   "best")
                    s = time.time()
                    model.best_saver.save(sess,
                                          checkpoint_path,
                                          global_step=0,
                                          write_meta_graph=False)
                    msg = "Model saved using {:.2f} sec at {}".format(
                        time.time() - s, checkpoint_path)
                    mylog_line(sect_name, msg)
                else:
                    patience -= 1
                    # whenever dev_ppx >= low_ppx, halve the learning rate
                    sess.run(model.learning_rate_decay_op)
                    msg = 'dev_ppx:{}, low_ppx:{}'.format(
                        str(dev_ppx), str(low_ppx))
                    mylog_line(sect_name, msg)
                    msg = 'dev_ppx >= low_ppx, patience = {}, learning_rate = {}'.format(
                        str(patience), str(model.learning_rate.eval()))
                    mylog_line(sect_name, msg)

                if patience <= 0:
                    mylog("Training finished. Running out of patience.")
                    break

                # Save checkpoint and zero timer and loss.
                step_time, loss, n_valid_sents, n_valid_words = 0.0, 0.0, 0, 0
Example #3
def train():
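    # Same training loop as Example #2, with TensorBoard summaries (ModelSummary /
    # tf.summary.FileWriter) written at every checkpoint.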
    # Read Data
    mylog_section("READ DATA")
    train_data_bucket, dev_data_bucket, _buckets, vocab_path = read_train_dev(
        FLAGS.data_cache_dir, FLAGS.train_path, FLAGS.dev_path,
        FLAGS.vocab_size, FLAGS.L, FLAGS.n_bucket)
    # At this point train_data_bucket, dev_data_bucket and _buckets all have the same length.
    # train_data_bucket and dev_data_bucket each look like [b1, b2, b3, ..., bn],
    # where every bi holds sentences already converted to token ids.
    # _buckets is something like [2, 4, 5]: the sentence-length cut-offs used for bucketing.
    real_vocab_size = get_real_vocab_size(vocab_path)

    FLAGS._buckets = _buckets
    FLAGS.real_vocab_size = real_vocab_size
    # count the total number of tokens to be processed
    train_n_tokens = np.sum(
        [np.sum([len(items) for items in x]) for x in train_data_bucket])
    # train_data_bucket
    train_bucket_sizes = [
        len(train_data_bucket[b]) for b in range(len(_buckets))
    ]
    train_total_size = float(sum(train_bucket_sizes))
    # compute cumulative fractions (used for bucket sampling)
    train_buckets_scale = [
        sum(train_bucket_sizes[:i + 1]) / train_total_size
        for i in range(len(train_bucket_sizes))
    ]
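    # Illustration (hypothetical numbers): bucket sizes [100, 300, 600] give
    # train_buckets_scale = [0.1, 0.4, 1.0]; the data iterator draws a random number
    # r in [0, 1) and picks the first bucket whose cumulative fraction exceeds r,
    # so bigger buckets are sampled proportionally more often.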
    dev_bucket_sizes = [len(dev_data_bucket[b]) for b in range(len(_buckets))]
    dev_total_size = int(sum(dev_bucket_sizes))

    mylog_section("REPORT")
    # steps
    batch_size = FLAGS.batch_size
    n_epoch = FLAGS.n_epoch
    steps_per_epoch = int(train_total_size / batch_size)
    steps_per_dev = int(dev_total_size / batch_size)
    steps_per_checkpoint = int(steps_per_epoch / 2)
    total_steps = steps_per_epoch * n_epoch

    # reports
    mylog("real_vocab_size: {}".format(FLAGS.real_vocab_size))
    mylog("_buckets: {}".format(FLAGS._buckets))
    mylog("Train:")
    mylog("total: {}".format(train_total_size))
    mylog("bucket sizes: {}".format(train_bucket_sizes))
    mylog("Dev:")
    mylog("total: {}".format(dev_total_size))
    mylog("bucket sizes: {}".format(dev_bucket_sizes))
    mylog("Steps_per_epoch: {}".format(steps_per_epoch))
    mylog("Total_steps:{}".format(total_steps))
    mylog("Steps_per_checkpoint: {}".format(steps_per_checkpoint))

    mylog_section("IN TENSORFLOW")

    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = FLAGS.allow_growth
    with tf.Session(config=config) as sess:

        # runtime profile
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog_section("MODEL/SUMMARY/WRITER")

        mylog("Creating Model.. (this can take a few minutes)")
        model = create_model(sess, run_options, run_metadata)

        mylog("Creating ModelSummary")
        modelSummary = ModelSummary()

        mylog("Creating tf.summary.FileWriter")
        summaryWriter = tf.summary.FileWriter(
            os.path.join(FLAGS.summary_dir, "train.summary"), sess.graph)

        mylog_section("All Variables")
        show_all_variables()

        # Data Iterators
        mylog_section("Data Iterators")

        dite = DataIterator(model, train_data_bucket, len(train_buckets_scale),
                            batch_size, train_buckets_scale)

        iteType = 0
        if iteType == 0:
            mylog("Itetype: withRandom")
            ite = dite.next_random()
        elif iteType == 1:
            mylog("Itetype: withSequence")
            ite = dite.next_sequence()

        # statistics during training
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        low_ppx = float("inf")
        low_ppx_step = 0
        steps_per_report = 30
        n_targets_report = 0
        report_time = 0
        n_valid_sents = 0
        n_valid_words = 0
        patience = FLAGS.patience

        mylog_section("TRAIN")

        while current_step < total_steps:

            # start
            start_time = time.time()

            # data and train
            inputs, outputs, weights, bucket_id = next(ite)

            L = model.step(sess, inputs, outputs, weights, bucket_id)

            # loss and time
            step_time += (time.time() - start_time) / steps_per_checkpoint

            loss += L
            current_step += 1
            # Here len(weights) == sentence length (number of time steps)
            # and len(weights[0]) == batch size
            n_valid_sents += np.sum(np.sign(weights[0]))
            n_valid_words += np.sum(weights)

            # for report
            report_time += (time.time() - start_time)
            n_targets_report += np.sum(weights)

            if current_step % steps_per_report == 0:
                sect_name = "STEP {}".format(current_step)
                msg = "StepTime: {:.2f} sec Speed: {:.2f} targets/s Total_targets: {}".format(
                    report_time / steps_per_report,
                    n_targets_report * 1.0 / report_time, train_n_tokens)
                mylog_line(sect_name, msg)

                report_time = 0
                n_targets_report = 0

                # Create the Timeline object, and write it to a json
                if FLAGS.profile:
                    tl = timeline.Timeline(run_metadata.step_stats)
                    ctf = tl.generate_chrome_trace_format()
                    with open('timeline.json', 'w') as f:
                        f.write(ctf)
                    exit()

            if current_step % steps_per_checkpoint == 0:

                i_checkpoint = int(current_step / steps_per_checkpoint)

                # train_ppx
                loss = loss / n_valid_words
                train_ppx = math.exp(
                    float(loss)) if loss < 300 else float("inf")
                learning_rate = model.learning_rate.eval()

                # dev_ppx
                dev_loss, dev_ppx = evaluate(sess, model, dev_data_bucket)

                # report
                sect_name = "CHECKPOINT {} STEP {}".format(
                    i_checkpoint, current_step)
                msg = "Learning_rate: {:.4f} Dev_ppx: {:.2f} Train_ppx: {:.2f}".format(
                    learning_rate, dev_ppx, train_ppx)
                mylog_line(sect_name, msg)

                # save summary
                _summaries = modelSummary.step_record(sess, train_ppx, dev_ppx)
                for _summary in _summaries:
                    summaryWriter.add_summary(_summary, i_checkpoint)
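                # step_record produces the train/dev perplexity summaries; logging them
                # against i_checkpoint gives one TensorBoard point per half-epoch checkpoint.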

                # save model per checkpoint
                if FLAGS.saveCheckpoint:
                    checkpoint_path = os.path.join(FLAGS.saved_model_dir,
                                                   "model")
                    s = time.time()
                    model.saver.save(sess,
                                     checkpoint_path,
                                     global_step=i_checkpoint,
                                     write_meta_graph=False)
                    msg = "Model saved using {:.2f} sec at {}".format(
                        time.time() - s, checkpoint_path)
                    mylog_line(sect_name, msg)

                # save best model
                if dev_ppx < low_ppx:
                    patience = FLAGS.patience
                    low_ppx = dev_ppx
                    low_ppx_step = current_step
                    checkpoint_path = os.path.join(FLAGS.saved_model_dir,
                                                   "best")
                    s = time.time()
                    model.best_saver.save(sess,
                                          checkpoint_path,
                                          global_step=0,
                                          write_meta_graph=False)
                    msg = "Model saved using {:.2f} sec at {}".format(
                        time.time() - s, checkpoint_path)
                    mylog_line(sect_name, msg)
                else:
                    patience -= 1

                if patience <= 0:
                    mylog("Training finished. Running out of patience.")
                    break

                # Save checkpoint and zero timer and loss.
                step_time, loss, n_valid_sents, n_valid_words = 0.0, 0.0, 0, 0
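
A minimal sketch of an entry point for this train(), assuming the FLAGS used above are defined elsewhere in the module via tf.app.flags:

def main(_):
    train()

if __name__ == "__main__":
    tf.app.run()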