Example #1
def separate_shuffle(data_files, suffix):
    assert isinstance(data_files, list)
    start_time = time.time()
    with Pool(2) as p:
        p.starmap(shuffle_single_train, [(data_file, suffix) for data_file in data_files])
    utils.print_out(
        "  Shuffled monolingual training datasets separately, time %.2fs" % (time.time() - start_time))
Example #2
def run_external_eval(infer_model, infer_sess, model_dir, params, summary_writer):
    with infer_model.graph.as_default():
        loaded_infer_model, global_step = trainer_utils.create_or_load_model(
            infer_model.model, model_dir, infer_sess, "infer")

    out_dir = params["model_dir"]
    misc_utils.print_out("# External BLEU evaluation, global step %d" % global_step)

    infer_sess.run(infer_model.iterator.initializer)

    output = os.path.join(out_dir, "output_eval")
    tags = ["%s2%s" % (params["lang1"], params["lang2"]),
            "%s2%s" % (params["lang2"], params["lang1"])]
    pred_files = ["%s_%s" % (output, tag) for tag in tags]
    ref_files = [params["lang1to2_ref"], params["lang2to1_ref"]]

    scores = trainer_utils.decode_and_evaluate(
        tags,
        loaded_infer_model,
        infer_sess,
        pred_files,
        ref_files,
        bleu_script_path=params["moses_bleu_script"])

    for tag in scores:
        add_summary(summary_writer, global_step, "%s_BLEU" % tag, scores[tag])

    return scores, global_step
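The add_summary helper used above is not shown. A minimal sketch, assuming it follows the usual TF-1.x pattern of writing a single scalar to TensorBoard:
import tensorflow as tf  # TF 1.x, matching the snippets in this listing

def add_summary(summary_writer, global_step, tag, value):
    # Hypothetical helper: record one scalar value under `tag` at `global_step`.
    summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)])
    summary_writer.add_summary(summary, global_step)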
Example #3
def dual_inference(params):
    misc_utils.print_out("# lang1_valid_data and lang2_valid_data are used for inference.")

    infer_model = trainer_utils.create_infer_model(TrainerMT, params)

    config_proto = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    config_proto.gpu_options.allow_growth = True

    ckpt_path = tf.train.latest_checkpoint(params["model_dir"])
    with tf.Session(graph=infer_model.graph, config=config_proto) as sess:
        loaded_infer_model = trainer_utils.load_model(
            infer_model.model, ckpt_path, sess, "infer")

        with infer_model.graph.as_default():
            sess.run(infer_model.iterator.initializer)

            output = os.path.join(params["model_dir"], "output_pred")
            tags = ["%s2%s" % (params["lang1"], params["lang2"]),
                    "%s2%s" % (params["lang2"], params["lang1"])]
            pred_files = ["%s_%s" % (output, tag) for tag in tags]
            ref_files = []

            trainer_utils.decode_and_evaluate(
                tags,
                loaded_infer_model,
                sess,
                pred_files,
                ref_files)  # empty, so decode_and_evaluate skips BLEU scoring.
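A hypothetical invocation of dual_inference; only the keys referenced in this snippet are shown, and create_infer_model will require more (vocab files, batch size, etc.):
params = {
    "model_dir": "/tmp/unmt_model",  # must already contain a checkpoint
    "lang1": "en",
    "lang2": "fr",
    # ... plus whatever create_infer_model and the iterator expect
}
dual_inference(params)
# Predictions are written to <model_dir>/output_pred_en2fr and
# <model_dir>/output_pred_fr2en; no BLEU is computed in this mode.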
Example #4
def eval_moses_bleu(ref, hyp, bleu_script_path):
    """
    Given a file of hypothesis and reference files,
    evaluate the BLEU score using Moses scripts.
    """
    assert os.path.isfile(ref) and os.path.isfile(hyp)
    command = bleu_script_path + ' %s < %s'
    p = subprocess.Popen(command % (ref, hyp), stdout=subprocess.PIPE, shell=True)
    result = p.communicate()[0].decode("utf-8")
    if result.startswith('BLEU'):
        return float(result[7:result.index(',')])
    else:
        utils.print_out('Impossible to parse BLEU score! "%s"' % result)
        return -1
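A usage sketch with placeholder paths, and the Moses output format the parser above relies on:
# multi-bleu.perl prints e.g. "BLEU = 23.45, 58.1/30.2/17.9/11.0 (BP=..., ratio=...)",
# so result[7:result.index(',')] extracts the string "23.45".
bleu = eval_moses_bleu(
    ref="data/valid.fr",                                  # placeholder reference file
    hyp="/tmp/unmt_model/output_eval_en2fr",              # placeholder hypothesis file
    bleu_script_path="~/mosesdecoder/scripts/generic/multi-bleu.perl")
print("BLEU: %.2f" % bleu)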
Example #5
def process_stats(stats, info, global_step, steps_per_stats, log_f):
    """Update info and check for overflow."""
    # Update info
    info["avg_step_time"] = stats["step_time"] / steps_per_stats
    info["avg_train_ae_loss"] = stats["ae_loss"] / steps_per_stats
    info["avg_train_bt_loss"] = stats["bt_loss"] / steps_per_stats

    is_overflow = False
    for avg_loss in [info["avg_train_ae_loss"], info["avg_train_bt_loss"]]:
        if math.isnan(avg_loss) or math.isinf(avg_loss) or avg_loss > 1e20:
            misc_utils.print_out("  step %d overflow loss, stop early" % global_step, log_f)
            is_overflow = True
            break
    return is_overflow
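init_stats is referenced here and in the training loop but not shown. Given the keys read above, a plausible sketch (an assumption, not the repo's code):
def init_stats():
    # Accumulators that update_stats presumably adds to after every step
    # and process_stats averages over steps_per_stats.
    return {"step_time": 0.0, "ae_loss": 0.0, "bt_loss": 0.0}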
Example #6
def before_train(loaded_train_model, train_model, train_sess, global_step, log_f):
    """Misc tasks to do before training."""
    stats = init_stats()
    info = {"avg_step_time": 0.0,
            "avg_train_ae_loss": 0.0,
            "avg_train_bt_loss": 0.0,
            "learning_rate": loaded_train_model.learning_rate.eval(
                session=train_sess)}
    start_train_time = time.time()
    misc_utils.print_out("# Start step %d, lr %g, %s" %
                         (global_step, info["learning_rate"], time.ctime()), log_f)

    # Initialize all of the iterators
    train_sess.run(train_model.iterator.initializer)

    return stats, info, start_train_time
Example #7
def create_or_load_model(model, model_dir, session, name):
    """Create translation model and initialize or load parameters in session."""
    latest_ckpt = tf.train.latest_checkpoint(model_dir)
    if latest_ckpt:
        model = load_model(model, latest_ckpt, session, name)
    else:
        start_time = time.time()
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        utils.print_out("  created %s model with fresh parameters, time %.2fs" %
                        (name, time.time() - start_time))
        model.saver.save(
            session,
            os.path.join(model_dir, "model.ckpt"),
            global_step=0)
        utils.print_out("# Save model at global step 0, for initial eval/infer.")

    global_step = model.global_step.eval(session=session)
    return model, global_step
Example #8
def decode_and_evaluate(tags,
                        model,
                        sess,
                        pred_files,
                        ref_files,
                        tgt_eos="</s>",
                        bleu_script_path="~/mosesdecoder/scripts/generic/multi-bleu.perl"):
    start_time = time.time()
    num_sentences = 0
    if tgt_eos:
        tgt_eos = tgt_eos.encode("utf-8")

    pred_file_1to2, pred_file_2to1 = pred_files
    with codecs.getwriter("utf-8")(tf.gfile.GFile(pred_file_1to2, mode="w")) as pred_f_1to2:
        with codecs.getwriter("utf-8")(tf.gfile.GFile(pred_file_2to1, mode="w")) as pred_f_2to1:
            pred_f_1to2.write("")
            pred_f_2to1.write("")
            while True:
                try:
                    sample_results = model.infer(sess)
                    batch_size = sample_results[0].shape[0]
                    for sample_words, pred_f in zip(sample_results, (pred_f_1to2, pred_f_2to1)):
                        for sent_id in range(batch_size):
                            output = sample_words[sent_id].tolist()
                            if tgt_eos and tgt_eos in output:
                                output = output[:output.index(tgt_eos)]
                            # pred_f.write((b" ".join(output) + b"\n").decode("utf-8"))
                            pred_f.write(
                                (b" ".join(output).replace(b"@@ ", b"").replace(b"@@", b"") + b"\n").decode("utf-8"))
                    num_sentences += batch_size
                except tf.errors.OutOfRangeError:
                    utils.print_out("  done, num sentences 2 * %d, time %ds" % (num_sentences, time.time() - start_time))
                    break

    # Evaluation
    scores = {}
    if len(ref_files) == len(pred_files):
        for ref_file, pred_file, tag in zip(ref_files, pred_files, tags):
            bleu = eval_moses_bleu(ref_file, pred_file, bleu_script_path)
            scores[tag] = bleu
            utils.print_out(" %s BLEU: %.2f" % (tag, bleu))
    return scores
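The two replace calls above undo BPE segmentation ("@@ " joiners) before the predictions are written. A small standalone illustration (not from the repo):
tokens = [b"un", b"super@@", b"vised", b"transl@@", b"ation"]
line = b" ".join(tokens).replace(b"@@ ", b"").replace(b"@@", b"")
print(line.decode("utf-8"))  # -> "un supervised translation"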
Example #9
def _create_pretrained_emb_from_txt(vocab_file,
                                    embed_file,
                                    num_notpretrained_tokens=4,
                                    dtype=tf.float32):
    """Load pretrain embeding from embed_file, and return an embedding matrix.
      Args:
        embed_file: Path to a Glove formated embedding txt file.
        num_notpretrained_tokens: Make the first n tokens in the vocab file as not
          pretrained variables. Default is 4, which is "</s>, <s>, <unk>, <mask>".
    """
    vocab, vocab_size = misc_utils.load_vocab(vocab_file)
    # notpretrained_tokens = vocab[:num_notpretrained_tokens]
    # TODO: add an hparam to control this.
    notpretrained_tokens = vocab[:1] + vocab[
        2:num_notpretrained_tokens]  # the token at id=1 (</s>) is already pretrained.

    misc_utils.print_out("# Using pre-trained embedding: %s." % embed_file)
    misc_utils.print_out("  Analyzing not pre-trained tokens: ")

    emb_dict, emb_size = misc_utils.load_embed_txt(embed_file)
    assert len(emb_dict) == vocab_size - num_notpretrained_tokens + 1
    for token in notpretrained_tokens:
        misc_utils.print_out("    %s" % token)
        if token == notpretrained_tokens[0]:
            emb_dict[token] = [0.0] * emb_size
        elif token not in emb_dict:
            emb_dict[token] = emb_size**-0.5 * np.random.randn(emb_size)

    emb_np = np.array([emb_dict[token] for token in vocab],
                      dtype=dtype.as_numpy_dtype())
    return emb_np
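misc_utils.load_embed_txt is not shown. A minimal sketch, assuming the embedding file is GloVe-formatted with one "token v1 v2 ... vd" entry per line:
import codecs
import tensorflow as tf  # TF 1.x, matching the snippets in this listing

def load_embed_txt(embed_file):
    # Hypothetical loader: returns ({token: [floats]}, embedding_size).
    emb_dict, emb_size = {}, None
    with codecs.getreader("utf-8")(tf.gfile.GFile(embed_file, "rb")) as f:
        for line in f:
            pieces = line.rstrip().split(" ")
            emb_dict[pieces[0]] = [float(x) for x in pieces[1:]]
            emb_size = emb_size or len(pieces) - 1
    return emb_dict, emb_size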
Example #10
    def _set_train_or_infer(self, res, reverse_vocab_tables, params):
        if self.mode == tf.estimator.ModeKeys.TRAIN:
            self.ae_loss, self.bt_loss, _ = res
        else:
            _, _, sample_ids = res
            self.sample_ids_1to2, self.sample_ids_2to1 = sample_ids

        if self.mode == tf.estimator.ModeKeys.PREDICT:
            self.sample_words_1to2 = reverse_vocab_tables[params["lang2"]].lookup(tf.to_int64(self.sample_ids_1to2))
            self.sample_words_2to1 = reverse_vocab_tables[params["lang1"]].lookup(tf.to_int64(self.sample_ids_2to1))

        # start to optimize
        tvars = tf.trainable_variables()

        if self.mode == tf.estimator.ModeKeys.TRAIN:
            self.learning_rate = trainer_utils.get_learning_rate(
                learning_rate=params["learning_rate"],
                step=self.global_step,
                hidden_size=params["hidden_size"],
                learning_rate_warmup_steps=params["learning_rate_warmup_steps"],
                noam_decay=params["noam_decay"])

            optimizer = tf.contrib.opt.LazyAdamOptimizer(
                self.learning_rate,
                beta1=params["optimizer_adam_beta1"],
                beta2=params["optimizer_adam_beta2"],
                epsilon=params["optimizer_adam_epsilon"])

            self.ae_train_op = tf.contrib.layers.optimize_loss(
                self.lambda_xe * self.ae_loss,
                self.global_step,
                learning_rate=None,
                optimizer=optimizer,
                variables=tvars,
                clip_gradients=params["clip_grad_norm"],
                colocate_gradients_with_ops=True,
                increment_global_step=False)
            
            self.bt_train_op = tf.contrib.layers.optimize_loss(
                self.lambda_xe * self.bt_loss,
                self.global_step,
                learning_rate=None,
                optimizer=optimizer,
                variables=tvars,
                clip_gradients=params["clip_grad_norm"],
                colocate_gradients_with_ops=True,
                increment_global_step=True)
 
            self.train_ae_summary = tf.summary.merge([tf.summary.scalar("lr", self.learning_rate),
                                                      tf.summary.scalar("ae_loss", self.ae_loss)])
            self.train_bt_summary = tf.summary.merge([tf.summary.scalar("lr", self.learning_rate),
                                                      tf.summary.scalar("bt_loss", self.bt_loss)])

            misc_utils.print_out("# Trainable variables")
            misc_utils.print_out("Format: <name>, <shape>, <(soft) device placement>")
            for tvar in tvars:
                misc_utils.print_out("  %s, %s, %s" % (tvar.name, str(tvar.get_shape()),
                                                       tvar.op.device))
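trainer_utils.get_learning_rate is not shown. Given the hidden_size, warmup, and noam_decay parameters it receives, it presumably implements the Noam schedule from the Transformer setup; a plain-Python sketch of that schedule (an assumption, not the repo's code):
def noam_learning_rate(base_lr, step, hidden_size, warmup_steps):
    # lr = base_lr * hidden_size^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
    step = max(step, 1)
    return base_lr * hidden_size ** -0.5 * min(
        step ** -0.5, step * warmup_steps ** -1.5)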
Example #11
def load_model(model, ckpt_path, session, name):
    """Load model from a checkpoint."""
    start_time = time.time()
    try:
        model.saver.restore(session, ckpt_path)
    except tf.errors.NotFoundError as e:
        utils.print_out("Can't load checkpoint")
        utils.print_out("%s" % str(e))

    session.run(tf.tables_initializer())
    utils.print_out(
        "  loaded %s model parameters from %s, time %.2fs" %
        (name, ckpt_path, time.time() - start_time))
    return model
Example #12
                        default=False,
                        help="Only inference from saved model dir.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    add_arguments(parser)
    params = vars(parser.parse_args())

    params["lang1_vocab_size"] = count_lines(params["lang1_vocab_file"])
    if params["lang1_vocab_file"] == params["lang2_vocab_file"]:
        params["lang2_vocab_size"] = params["lang1_vocab_size"]
    else:
        params["lang2_vocab_size"] = count_lines(params["lang2_vocab_file"])

    misc_utils.print_out("# All hyperparameters:")
    for key in params:
        misc_utils.print_out("%s=%s" % (key, str(params[key])))

    if params["batch_size"] >= 1024:
        misc_utils.print_out(
            "# batch_size >= 1024 indicates token level batch size for training."
        )

    if params["only_infer"]:
        if not tf.gfile.Exists(params["model_dir"]):
            raise ValueError("No checkpoint saved in %s" % params["model_dir"])
        dual_inference(params)
    else:
        if params["model_dir"] and not tf.gfile.Exists(params["model_dir"]):
            misc_utils.print_out("# Creating saved model directory %s ..." %
Example #13
def train_and_eval(params, target_session=""):
    out_dir = params["model_dir"]
    steps_per_stats = params["steps_per_stats"]
    steps_per_eval = 10 * steps_per_stats

    # Log and output files
    log_file = os.path.join(out_dir, "log_%d" % time.time())
    log_f = tf.gfile.GFile(log_file, mode="a")
    misc_utils.print_out("# log_file=%s" % log_file, log_f)

    # create models
    model_creator = TrainerMT
    train_model = trainer_utils.create_train_model(model_creator, params)
    eval_model = trainer_utils.create_eval_model(model_creator, params)
    infer_model = trainer_utils.create_infer_model(model_creator, params)

    # TensorFlow models
    config_proto = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    config_proto.gpu_options.allow_growth = True

    train_sess = tf.Session(target=target_session, config=config_proto, graph=train_model.graph)
    eval_sess = tf.Session(target=target_session, config=config_proto, graph=eval_model.graph)
    infer_sess = tf.Session(target=target_session, config=config_proto, graph=infer_model.graph)

    with train_model.graph.as_default():
        loaded_train_model, global_step = trainer_utils.create_or_load_model(
            train_model.model, params["model_dir"], train_sess, "train")

    # Summary writer
    summary_writer = tf.summary.FileWriter(
        os.path.join(out_dir, "train_log"), train_model.graph)

    # First evaluation without training yet
    # run_external_eval(infer_model, infer_sess, params["model_dir"], params, summary_writer)

    last_stats_step = global_step
    last_eval_step = global_step

    # This is the train loop.
    trainer_utils.separate_shuffle(
        [params["lang1_train_data"], params["lang2_train_data"]], params["train_data_suffix"])
    stats, info, start_train_time = before_train(
        loaded_train_model, train_model, train_sess, global_step, log_f)
    lambda_xe_mono_config = trainer_utils.parse_lambda_config(params["lambda_xe_mono"])

    loaded_eval_model = sync_eval_model(eval_model, eval_sess, params["model_dir"])

    while global_step < params["num_train_steps"]:
        # Run a step
        start_time = time.time()
        lambda_xe_mono = trainer_utils.get_lambda_xe_mono(lambda_xe_mono_config, global_step)
        try:
            ae_step_result = loaded_train_model.ae_updates(train_sess, lambda_xe_mono)

            new_clean_inputs = ae_step_result[-1]
            ii1, ii2 = new_clean_inputs[params["lang1"]], new_clean_inputs[params["lang2"]]
            ids1to2, ids2to1 = loaded_eval_model.otfb(eval_sess, ii1, ii2)

            bt_step_result = loaded_train_model.bt_updates(
                train_sess, params["lambda_xe_otfb"], ids2to1, ids1to2, ii1, ii2)

            step_result = [ae_step_result[:-1], bt_step_result]
        except tf.errors.OutOfRangeError:
            misc_utils.print_out("# Finished Training of One Epochs.")

            trainer_utils.separate_shuffle(
                [params["lang1_train_data"], params["lang2_train_data"]], params["train_data_suffix"])
            train_sess.run(train_model.iterator.initializer)
            continue

        global_step, info["learning_rate"], step_summary = update_stats(stats, start_time, step_result)
        summary_writer.add_summary(step_summary, global_step)

        if global_step - last_stats_step >= steps_per_stats:
            last_stats_step = global_step
            is_overflow = process_stats(stats, info, global_step, steps_per_stats, log_f)
            print_step_info("  ", global_step, info, log_f)
            if is_overflow:
                break
            # Reset statistics
            stats = init_stats()

        if global_step - last_eval_step >= steps_per_eval:
            last_eval_step = global_step

            misc_utils.print_out("# Save eval, global step %d" % global_step)
            loaded_train_model.saver.save(
                train_sess,
                os.path.join(params["model_dir"], "model.ckpt"),
                global_step=global_step)

            loaded_eval_model = sync_eval_model(eval_model, eval_sess, params["model_dir"])

            run_external_eval(infer_model, infer_sess, params["model_dir"], params, summary_writer)

    # Done training
    loaded_train_model.saver.save(
        train_sess,
        os.path.join(params["model_dir"], "model.ckpt"),
        global_step=global_step)

    misc_utils.print_out("# Done training, time %ds!" % (time.time() - start_train_time))

    summary_writer.close()
    return global_step
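parse_lambda_config and get_lambda_xe_mono are not shown. In the unsupervised-NMT recipes this loop follows, lambda_xe_mono is typically either a constant or a "step:value" schedule such as "0:1,100000:0.1" that is linearly interpolated over training steps; a hedged sketch of that behavior, not the repo's implementation:
def parse_lambda_config(config_str):
    # "0:1,100000:0.1" -> [(0, 1.0), (100000, 0.1)]; a bare number -> constant.
    if ":" not in config_str:
        return [(0, float(config_str))]
    return [(int(s), float(v)) for s, v in
            (item.split(":") for item in config_str.split(","))]

def get_lambda_xe_mono(config, step):
    # Piecewise-linear interpolation between the configured (step, value) points.
    for (s0, v0), (s1, v1) in zip(config[:-1], config[1:]):
        if s0 <= step < s1:
            return v0 + (v1 - v0) * (step - s0) / float(s1 - s0)
    return config[-1][1] if step >= config[-1][0] else config[0][1]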
Example #14
def print_step_info(prefix, global_step, info, log_f):
    """Print all info at the current global step."""
    misc_utils.print_out("%sstep %d lr %g step-time %.2fs ae_loss %.4f bt_loss %.4f, %s" %
                         (prefix, global_step, info["learning_rate"], info["avg_step_time"],
                          info["avg_train_ae_loss"], info["avg_train_bt_loss"], time.ctime()), log_f)
def get_all_embeddings(params, dtype=tf.float32, scope=None):

    if params["lang1_partitions"] <= 1:
        lang1_partitioner = None
    else:
        lang1_partitioner = tf.fixed_size_partitioner(
            params["lang1_partitions"])

    if params["lang2_partitions"] <= 1:
        lang2_partitioner = None
    else:
        lang2_partitioner = tf.fixed_size_partitioner(
            params["lang2_partitions"])

    encoder_embeddings = {}
    decoder_embeddings = {}

    lang1_emb_np, lang2_emb_np = None, None
    if params["lang1_embed_file"] and params["lang2_embed_file"]:
        lang1_emb_np = _create_pretrained_emb_from_txt(
            params["lang1_vocab_file"], params["lang1_embed_file"])
        if params["lang1_embed_file"] == params["lang2_embed_file"]:
            lang2_emb_np = lang1_emb_np
        else:
            lang2_emb_np = _create_pretrained_emb_from_txt(
                params["lang2_vocab_file"], params["lang2_embed_file"])

    if params["share_decpro_emb"]:
        if params["share_lang_emb"]:
            assert params["share_output_emb"]
            share_bias = tf.get_variable('share_projection/bias', [
                params["lang1_vocab_size"],
            ],
                                         initializer=tf.zeros_initializer())
            pro_embs = {
                params["lang1"]: share_bias,
                params["lang2"]: share_bias
            }
        else:
            pro_embs = {
                params["lang1"]:
                tf.get_variable('bias', [
                    params["lang1_vocab_size"],
                ],
                                initializer=tf.zeros_initializer()),
                params["lang2"]:
                tf.get_variable('bias', [
                    params["lang2_vocab_size"],
                ],
                                initializer=tf.zeros_initializer())
            }
    else:
        if params["share_output_emb"]:
            assert params["share_lang_emb"]
            if params["pretrained_out"]:
                assert params["lang1_embed_file"] == params["lang2_embed_file"]
                misc_utils.print_out(
                    "# Using pre-trained embedding to initialize shared projection kernel."
                )
                share_proj_layer = tf.layers.Dense(
                    params["lang1_vocab_size"],
                    use_bias=True,
                    kernel_initializer=tf.constant_initializer(
                        lang1_emb_np.transpose()),
                    name="share_projection")
            else:
                share_proj_layer = tf.layers.Dense(params["lang1_vocab_size"],
                                                   use_bias=True,
                                                   name="share_projection")
            pro_embs = {
                params["lang1"]: share_proj_layer,
                params["lang2"]: share_proj_layer
            }
        else:
            if params["pretrained_out"]:
                misc_utils.print_out(
                    "# Using pre-trained embedding to initialize two projection kernels."
                )
                pro_embs = {
                    params["lang1"]:
                    tf.layers.Dense(params["lang1_vocab_size"],
                                    use_bias=True,
                                    kernel_initializer=tf.constant_initializer(
                                        lang1_emb_np.transpose()),
                                    name="%s_projection" % params["lang1"]),
                    params["lang2"]:
                    tf.layers.Dense(params["lang2_vocab_size"],
                                    use_bias=True,
                                    kernel_initializer=tf.constant_initializer(
                                        lang2_emb_np.transpose()),
                                    name="%s_projection" % params["lang2"])
                }
            else:
                pro_embs = {
                    params["lang1"]:
                    tf.layers.Dense(params["lang1_vocab_size"],
                                    use_bias=True,
                                    name="%s_projection" % params["lang1"]),
                    params["lang2"]:
                    tf.layers.Dense(params["lang2_vocab_size"],
                                    use_bias=True,
                                    name="%s_projection" % params["lang2"])
                }

    with tf.variable_scope(scope or "all_embeddings", dtype=dtype) as scope:

        # encoder embeddings
        with tf.variable_scope("encoder", partitioner=lang1_partitioner):
            lang = "share" if params["share_lang_emb"] else params["lang1"]
            lang1_enc_embedding = _create_embed("%s_embedding" % lang,
                                                params["lang1_vocab_size"],
                                                params["hidden_size"], dtype,
                                                lang1_emb_np)

        if params["share_lang_emb"]:
            if params["lang1_vocab_size"] != params["lang2_vocab_size"]:
                raise ValueError(
                    "Share embedding but different vocab sizes"
                    " %d vs. %d" %
                    (params["lang1_vocab_size"], params["lang2_vocab_size"]))
            assert params["lang1_vocab_size"] == params["lang2_vocab_size"]

            misc_utils.print_out(
                "# Use the same encoder embedding for both languages.")
            lang2_enc_embedding = lang1_enc_embedding

        else:
            with tf.variable_scope("encoder", partitioner=lang2_partitioner):
                lang2_enc_embedding = _create_embed(
                    "%s_embedding" % params["lang2"],
                    params["lang2_vocab_size"], params["hidden_size"], dtype,
                    lang2_emb_np)

        encoder_embeddings[params["lang1"]] = lang1_enc_embedding
        encoder_embeddings[params["lang2"]] = lang2_enc_embedding

        # decoder embeddings
        if params["share_encdec_emb"]:
            misc_utils.print_out(
                "# Use the same embedding for encoder and decoder of each language."
            )
            decoder_embeddings = encoder_embeddings

        else:
            with tf.variable_scope("decoder", partitioner=lang1_partitioner):
                lang = "share" if params["share_lang_emb"] else params["lang1"]
                lang1_dec_embedding = _create_embed("%s_embedding" % lang,
                                                    params["lang1_vocab_size"],
                                                    params["hidden_size"],
                                                    dtype, lang1_emb_np)

                if params["share_lang_emb"]:
                    misc_utils.print_out(
                        "# Use the same decoder embedding for both languages.")
                    lang2_dec_embedding = lang1_dec_embedding

                else:
                    lang2_dec_embedding = _create_embed(
                        "%s_embedding" % params["lang2"],
                        params["lang2_vocab_size"], params["hidden_size"],
                        dtype, lang2_emb_np)

                decoder_embeddings[params["lang1"]] = lang1_dec_embedding
                decoder_embeddings[params["lang2"]] = lang2_dec_embedding

    return encoder_embeddings, decoder_embeddings, pro_embs
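_create_embed is used throughout get_all_embeddings but not shown. A sketch under the assumption that it creates a [vocab_size, embed_size] variable, optionally initialized from the pretrained matrix loaded above:
import tensorflow as tf  # TF 1.x

def _create_embed(embed_name, vocab_size, embed_size, dtype, emb_np=None):
    # Hypothetical helper: one embedding variable per language (or shared),
    # initialized from emb_np when a pretrained matrix is available.
    initializer = tf.constant_initializer(emb_np) if emb_np is not None else None
    return tf.get_variable(
        embed_name, [vocab_size, embed_size], dtype=dtype, initializer=initializer)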