Example #1
def gen_barrage_wechcat(raw_input, model):
    raw_input = pro_sentpiece(raw_input, hp.bpe_model)
    bar_input = []
    bar_input.append(raw_input)

    test_batches = input_fn(bar_input, bar_input, hp.barrages_vocab, 1, shuffle=False)

    iter = tf.data.Iterator.from_structure(test_batches.output_types, test_batches.output_shapes)
    xs, ys = iter.get_next()

    test_init_op = iter.make_initializer(test_batches)
    y_hat, _, random_predict = model.eval_gen(xs, ys)

    with tf.Session() as sess:
        ckpt_ = tf.train.latest_checkpoint(hp.ckpt)
        ckpt = hp.ckpt if ckpt_ is None else ckpt_  # None means hp.ckpt points to a checkpoint file; otherwise it is a directory
        saver = tf.train.Saver()
        saver.restore(sess, ckpt)

        sess.run(test_init_op)

        logging.info("# get hypotheses")
        hypotheses, yy = get_hypotheses(1, 1, sess, y_hat, model.idx2token)

        logging.info("# write results")
        logging.info(hypotheses)

        logging.info("# Done")

    return "".join(hypotheses)
Example #2
def test(hp):
    # Loading hyper params
    load_hparams(hp, hp.ckpt)

    logging.info("# Prepare test batches")
    test_batches, num_test_batches, num_test_samples = get_batch(
        hp.test1,
        hp.test1,
        100000,
        100000,
        hp.vocab,
        hp.test_batch_size,
        shuffle=False)
    iter = tf.data.Iterator.from_structure(test_batches.output_types,
                                           test_batches.output_shapes)
    xs, ys = iter.get_next()

    test_init_op = iter.make_initializer(test_batches)

    logging.info("# Load model")
    model = Transformer(hp)

    logging.info("# Session")
    with tf.Session() as sess:
        ckpt_ = tf.train.latest_checkpoint(hp.ckpt)
        ckpt = ckpt_ if ckpt_ else hp.ckpt
        saver = tf.train.Saver()

        saver.restore(sess, ckpt)

        y_hat, mean_loss = model.eval(sess, test_init_op, xs, ys,
                                      num_test_batches)

        logging.info("# get hypotheses")
        hypotheses = get_hypotheses(num_test_samples, y_hat, model.idx2token)

        logging.info("# write results")
        model_output = os.path.split(ckpt)[-1]
        if not os.path.exists(hp.testdir):
            os.makedirs(hp.testdir)
        translation = os.path.join(hp.testdir, model_output)
        with open(translation, 'w', encoding="utf-8") as fout:
            fout.write("\n".join(hypotheses))

        logging.info("# calc bleu score and append it to translation")
        calc_bleu_nltk(hp.test2, translation)
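calc_bleu_nltk itself is not included in this snippet. A minimal sketch of what such a helper could look like with NLTK's corpus_bleu, reading the reference file (hp.test2 above) and the translation file; the file handling and smoothing choice are assumptions:

from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

def calc_bleu_nltk(ref_path, trans_path):
    # Tokenize the reference and hypothesis files line by line.
    with open(ref_path, encoding="utf-8") as f:
        references = [[line.strip().split()] for line in f]
    with open(trans_path, encoding="utf-8") as f:
        hypotheses = [line.strip().split() for line in f]
    # Corpus-level BLEU with smoothing so short outputs do not score zero.
    score = corpus_bleu(references, hypotheses,
                        smoothing_function=SmoothingFunction().method3)
    # Append the score to the translation file, as the log message suggests.
    with open(trans_path, "a", encoding="utf-8") as f:
        f.write("\nBLEU = {:.2f}".format(score * 100))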
Example #3
    sess.run(train_init_op)
    for i in tqdm(range(_gs, total_steps+1)):
        _, _gs, _summary = sess.run([train_op, global_step, train_summaries])
        epoch = _gs // train_num_batches
        summary_writer.add_summary(_summary, _gs)

        if _gs and _gs % train_num_batches == 0:
            logging.info("# Epoch {} is done".format(epoch))
            _loss = sess.run(loss)

            logging.info("# Test evaluation")
            _, _eval_summary = sess.run([eval_init_op, eval_summaries])
            summary_writer.add_summary(_eval_summary, epoch)

            logging.info("# Get hypotheses")
            hypotheses = get_hypotheses(num_eval_batches, num_eval_samples, sess, y_hat, m.idx2token)

            logging.info("# Write results")
            model_output = "iwslt2016_E%02dL%.2f" % (epoch, _loss)
            if not os.path.exists(hp.evaldir): os.makedirs(hp.evaldir)
            translation = os.path.join(hp.evaldir, model_output)
            with open(translation, "w") as f:
                f.write("\n".join(hypotheses))

            logging.info("# Calc bleu score and append it to translation")
            calc_bleu(hp.eval3, translation)

            logging.info("# Save model")
            ckpt_name = os.path.join(hp.logdir, model_output)
            saver.save(sess, ckpt_name, global_step=_gs)
            logging.info("After training of {} epochs, {} has been saved.".format(epoch, ckpt_name))
Example #4
def calculate_average(dict_accumulator, number_of_iterations):
    for key in dict_accumulator:
        dict_accumulator[key] = dict_accumulator[key] / number_of_iterations
    return dict_accumulator


#enddef

if __name__ == "__main__":

    final_dict_accumulator = {}
    number_of_iterations = 10
    exp_iter = 0
    #input all hypotheses
    hypotheses = utils.get_hypotheses(hypotheses_file)
    #get all examples
    examples = utils.input_examples(ds)
    # initial distribution
    Q_0 = np.ones(len(hypotheses)) / len(hypotheses)

    for iteration in range(0, number_of_iterations):

        #Define deltas
        deltas_for_noise_feature = np.arange(0, 0.201, 0.02)
        teacher_type = "noise_feature"  #"noise_feature"  #limited_ground_truth

        accumulator = {}
        for delta in deltas_for_noise_feature:
            delta = np.round(delta, 2)
            teachers_examples = utils.get_teachers_examples(
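calculate_average at the top of this example simply divides every accumulated value in place by the number of iterations. A tiny usage sketch with made-up numbers:

# Accumulate a per-delta metric over two runs, then average it.
accumulator = {0.0: 0.0, 0.02: 0.0}
for run_metrics in [{0.0: 1.0, 0.02: 0.5}, {0.0: 0.5, 0.02: 0.25}]:
    for delta, value in run_metrics.items():
        accumulator[delta] += value
averaged = calculate_average(accumulator, 2)
print(averaged)  # {0.0: 0.75, 0.02: 0.375}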
Example #5
logging.info("# Load model")
m = Transformer(hp)
y_hat, _, refs = m.eval(xs, ys)

logging.info("# Session")
with tf.Session() as sess:
    ckpt = tf.train.latest_checkpoint(hp.modeldir)
    saver = tf.train.Saver()

    saver.restore(sess, ckpt)

    sess.run(test_init_op)

    logging.info("# get hypotheses")
    hypotheses, refs_result = get_hypotheses(num_test_batches,
                                             num_test_samples, sess, y_hat,
                                             refs, m.idx2token)

    # write the raw reference results to a local file
    logging.info("write references")
    result_output = "refs"
    if not os.path.exists(hp.test_result): os.makedirs(hp.test_result)
    ref_path = os.path.join(hp.test_result, result_output)
    with open(ref_path, 'w', encoding='utf-8') as fout:
        _refs = []
        for r in refs_result:
            words = r.decode('utf-8').split()
            s = [word.replace("▁", " ")
                 for word in words]  # remove bpe symbols
            sent = ''.join(s)
            _refs.append(sent.strip())
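The write that the open(ref_path, ...) block is building toward is not shown here; the natural continuation, assuming _refs holds the cleaned references collected above, would be:

        # still inside the open(ref_path, 'w', ...) block, after the loop
        fout.write("\n".join(_refs))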
Example #6
from utils import get_hypotheses
from collections import Counter
import nltk
from nltk.tag import pos_tag, map_tag
from nltk.corpus import wordnet as wn
import numpy, pickle

HYP_FILE = "./data/train-test.hyp1-hyp2-ref"
GOLD_FILE = "./data/train.gold"
OUTPUT_FILE = "./output-exp.pred"
TRADEOFF_PARAM = 0.35  # between 0 and 1

[labeled_instances, unlabeled_instances] = get_hypotheses(HYP_FILE, GOLD_FILE)

def memoize(func):
    memodict = {}
    def inner(stuff):
        if stuff not in memodict:
            memodict[stuff] = func(stuff)
        return memodict[stuff]
    return inner

# nltk can't pos tag unicode
def preprocess(text):
    return ''.join(character for character in text if ord(character)<128)

wntags = {'NOUN':'n','VERB':'v','ADJ':'a','ADV':'r'}

@memoize
def get_word_count_dict(sentence):
    wcount_dict = Counter()
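The memoize decorator above caches results keyed by its single (hashable) argument. A quick illustration with a hypothetical tagging function, reusing the imports and preprocess helper from this example (the usual NLTK data for tokenization and tagging must be downloaded):

@memoize
def tag_sentence(sentence):
    # Expensive call that benefits from caching when sentences repeat.
    return pos_tag(nltk.word_tokenize(preprocess(sentence)))

first = tag_sentence("The cat sat on the mat.")
second = tag_sentence("The cat sat on the mat.")
assert first is second  # second call is served straight from memodict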
Example #7
        xs_output, ys_output, handle_output, _, _gs, _summary = sess.run(
            [xs, ys, handle, train_op, global_step, train_summaries],
            feed_dict={handle: training_handle})
        summary_writer.add_summary(_summary, _gs)
        if _gs % (hp.gpu_nums * 5000) == 0 and _gs != 0:
            logging.info("steps {} is done".format(_gs))

            logging.info("# test evaluation")
            sess.run(val_iter.initializer)  # initial val dataset
            _eval_summaries = sess.run(eval_summaries,
                                       feed_dict={handle: val_handle})
            summary_writer.add_summary(_eval_summaries, _gs)

            logging.info("# beam search")
            hypotheses, all_targets = get_hypotheses(num_eval_batches,
                                                     num_eval_samples, sess, m,
                                                     bs, [xs[0], ys[2]],
                                                     handle, val_handle)

            logging.info("# calc rouge score ")
            if not os.path.exists(hp.evaldir): os.makedirs(hp.evaldir)
            rouge_l = calc_rouge(rouge, all_targets, hypotheses, _gs,
                                 hp.evaldir)

            model_output = "trans_pointer%02dL%.2f" % (_gs, rouge_l)

            logging.info('# write hypotheses')
            with open(os.path.join(hp.evaldir, model_output),
                      'w',
                      encoding='utf-8') as f:
                for target, hypothes in zip(all_targets, hypotheses):
                    f.write('{}-{} \n'.format(target, ' '.join(hypothes)))
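calc_rouge is not part of this snippet. A sketch of what it could look like, assuming rouge is a Rouge() scorer from the rouge package and that all_targets and hypotheses are lists of token sequences (all assumptions):

import os

def calc_rouge(rouge, targets, hypotheses, global_step, evaldir):
    # Join token sequences back into strings for the scorer.
    refs = [" ".join(t) for t in targets]
    hyps = [" ".join(h) for h in hypotheses]
    scores = rouge.get_scores(hyps, refs, avg=True)
    rouge_l = scores["rouge-l"]["f"]
    # Keep the full report next to the other evaluation artifacts.
    with open(os.path.join(evaldir, "rouge_{}.txt".format(global_step)), "w") as f:
        f.write(str(scores))
    return rouge_l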
Example #8
logging.info("# Session")
saver = tf.train.Saver()
with tf.Session() as sess:
    #ckpt_ = tf.train.latest_checkpoint(hp.ckpt)
    #ckpt = hp.ckpt if ckpt_ is None else ckpt_ # None: ckpt is a file. otherwise dir.
    ckpt = hp.ckpt

    #saver = tf.train.import_meta_graph('/home/shuangzhao/transformer_1/mt_log/g213_2/NMT_E6_L2.444_lr7e-05_-73488.meta')

    saver.restore(sess, ckpt)

    sess.run(test_init_op)

    logging.info("# get hypotheses")
    t1 = time.time()
    hypotheses = get_hypotheses(num_test_batches, num_test_samples, sess,
                                y_hat, m.en_idx2token)
    t2 = time.time()
    print("-" * 80)
    print(f"Time for getting results: {t2 - t1}")

    logging.info("# write results")
    model_output = ckpt.split("/")[-1]
    if not os.path.exists(hp.testdir): os.makedirs(hp.testdir)
    translation = os.path.join(hp.testdir, model_output)
    with open(translation, 'w') as fout:
        fout.write("\n".join(hypotheses))

    logging.info("# calc bleu score and append it to translation")
    #calc_bleu(hp.test2, translation)
    print(f"Time rest: {time.time() - t2}")
Example #9
    def train_(self, epochs):
        train_batches, num_train_batches, num_train_samples = get_batch(
            '../data/iwslt2016/segmented/train.de.bpe',
            '../data/iwslt2016/segmented/train.en.bpe',
            self.sequence_length,
            self.sequence_length,
            self.vocab_file,
            self.batch_size,
            shuffle=True)
        eval_batches, num_eval_batches, num_eval_samples = get_batch(
            '../data/iwslt2016/segmented/eval.de.bpe',
            '../data/iwslt2016/segmented/eval.en.bpe',
            100000,
            100000,
            self.vocab_file,
            self.batch_size,
            shuffle=False)
        iter = tf.data.Iterator.from_structure(train_batches.output_types,
                                               train_batches.output_shapes)
        xs, ys = iter.get_next()

        train_init_op = iter.make_initializer(train_batches)
        eval_init_op = iter.make_initializer(eval_batches)
        loss, train_op, global_step, train_summaries = self.model.train(xs, ys)
        y_hat, eval_summaries = self.model.eval(xs, ys)

        logging.info("# Session")
        with tf.Session() as sess:
            ckpt = tf.train.latest_checkpoint(self.model_dir)
            if ckpt is None:
                logging.info("Initializing from scratch")
                sess.run(tf.global_variables_initializer())
                save_variable_specs(os.path.join('../data/log/1', "specs"))
            else:
                self.saver.restore(sess, ckpt)

            summary_writer = tf.summary.FileWriter(self.model_dir, sess.graph)

            sess.run(train_init_op)
            total_steps = epochs * num_train_batches
            _gs = sess.run(global_step)
            for i in tqdm(range(_gs, total_steps + 1)):
                _, _gs, _summary = sess.run(
                    [train_op, global_step, train_summaries])
                epoch = math.ceil(_gs / num_train_batches)
                summary_writer.add_summary(_summary, _gs)

                if _gs and _gs % num_train_batches == 0:
                    logging.info("epoch {} is done".format(epoch))
                    _loss = sess.run(loss)  # train loss

                    logging.info("# test evaluation")
                    _, _eval_summaries = sess.run(
                        [eval_init_op, eval_summaries])
                    summary_writer.add_summary(_eval_summaries, _gs)

                    logging.info("# get hypotheses")
                    hypotheses = get_hypotheses(num_eval_batches,
                                                num_eval_samples, sess, y_hat,
                                                self.model.index_char)
                    logging.info("# write results")
                    model_output = "iwslt2016_E%02dL%.2f" % (epoch, _loss)
                    if not os.path.exists('../data/eval/1'):
                        os.makedirs('../data/eval/1')
                    translation = os.path.join('../data/eval/1', model_output)
                    with open(translation, 'w') as fout:
                        fout.write("\n".join(hypotheses))

                    logging.info(
                        "# calc bleu score and append it to translation")
                    calc_bleu('../data/iwslt2016/prepro/eval.en', translation)

                    logging.info("# save models")
                    self.saver.save(sess,
                                    os.path.join(self.model_dir,
                                                 'transformer.dat'),
                                    global_step=_gs)
                    sess.run(train_init_op)
            summary_writer.close()

        logging.info("Done")
Example #10
def train(hp):
    save_hparams(hp, hp.checkpoints_dir)
    # Data generator
    logging.info("Prepare Train/Eval batches...")
    train_batches, num_train_batches, num_train_samples = get_batch(
        hp.train1,
        hp.train2,
        hp.maxlen1,
        hp.maxlen2,
        hp.vocab,
        hp.batch_size,
        shuffle=True)
    eval_batches, num_eval_batches, num_eval_samples = get_batch(hp.eval1,
                                                                 hp.eval2,
                                                                 10000,
                                                                 10000,
                                                                 hp.vocab,
                                                                 hp.batch_size,
                                                                 shuffle=False)

    # Batch iterator
    iter = tf.data.Iterator.from_structure(train_batches.output_types,
                                           train_batches.output_shapes)
    xs, ys = iter.get_next()

    train_init_op = iter.make_initializer(train_batches)
    eval_init_op = iter.make_initializer(eval_batches)

    # Build model
    logging.info("Build model...")
    model = Transformer(hp)
    logging.info("Model is built!")

    # Session
    logging.info("Session initialize")
    saver = tf.train.Saver(max_to_keep=5)

    with tf.Session() as sess:
        # Check & Load latest version model checkpoint
        ckpt = tf.train.latest_checkpoint(hp.checkpoints_dir)
        if ckpt is None:
            logging.info("Initializing from scratch")
            sess.run(tf.global_variables_initializer())
            save_variable_specs(os.path.join(hp.checkpoints_dir, "specs"))
        else:
            saver.restore(sess, ckpt)

        summary_writer = tf.summary.FileWriter(hp.checkpoints_dir, sess.graph)

        sess.run(train_init_op)
        total_steps = hp.num_epochs * num_train_batches
        _gs = sess.run(model.global_step)

        k = 5
        min_dev_loss = 0
        stop_alpha = 20.0
        eval_losses = []
        # Start training
        for i in tqdm(range(_gs, total_steps + 1)):
            _input_x, _decoder_input, _target = sess.run([xs[0], ys[0], ys[1]])
            _, _gs, _summary = sess.run(
                [model.train_op, model.global_step, model.summaries],
                feed_dict={
                    model.input_x: _input_x,
                    model.decoder_input: _decoder_input,
                    model.target: _target,
                    model.is_training: True
                })
            epoch = math.ceil(_gs / num_train_batches)
            summary_writer.add_summary(_summary, _gs)

            # Evaluation
            if _gs and _gs % num_train_batches == 0:
                logging.info("Epoch {} is done".format(epoch))
                _loss = sess.run(model.loss,
                                 feed_dict={
                                     model.input_x: _input_x,
                                     model.decoder_input: _decoder_input,
                                     model.target: _target,
                                     model.is_training: False
                                 })

                # evaluation
                y_hat, mean_loss = model.eval(sess, eval_init_op, xs, ys,
                                              num_eval_batches)

                # id to token
                logging.info("# Get hypotheses")
                hypotheses = get_hypotheses(num_eval_samples, y_hat,
                                            model.idx2token)

                # save translation results
                if not os.path.exists(hp.evaldir):
                    os.makedirs(hp.evaldir)
                logging.info("# Write results")
                model_output = "translation_E{:02d}L{:.2f}EL{:.2f}".format(
                    epoch, _loss, mean_loss)
                translation = os.path.join(hp.evaldir, model_output)
                with open(translation, 'w', encoding="utf-8") as fout:
                    fout.write("\n".join(hypotheses))
                logging.info(
                    "# Calculate bleu score and append it to translation")

                # bleu
                calc_bleu_nltk(hp.eval2, translation)

                # save model
                logging.info("# Save models")
                ckpt_name = os.path.join(hp.checkpoints_dir, model_output)
                saver.save(sess, ckpt_name, global_step=_gs)
                logging.info(
                    "After training of {} epochs, {} has been saved.".format(
                        epoch, ckpt_name))

                # calculate early stop
                if len(eval_losses) == 0:
                    min_dev_loss = mean_loss
                eval_losses.append(mean_loss)
                gl, p_k, pq_alpha = calculate_earlystop_baseline(
                    mean_loss, min_dev_loss, eval_losses, k)
                min_dev_loss = mean_loss if mean_loss < min_dev_loss else min_dev_loss
                eval_losses = eval_losses[-k:]
                logging.info(
                    "GL(t): {:.4f}, P_k: {:.4f}, PQ_alpha: {:.4f}".format(
                        gl, p_k, pq_alpha))
                if gl > stop_alpha:
                    logging.info(
                        "No optimization for a long time, auto-stopping...")
                    break

                # change data iterator back to train iterator
                sess.run(train_init_op)

        summary_writer.close()

    logging.info("Done")