Example #1
    def testCheckVocab(self):
        # Create a vocab file
        vocab_dir = os.path.join(tf.test.get_temp_dir(), 'vocab_dir')
        os.makedirs(vocab_dir)
        vocab_file = os.path.join(vocab_dir, 'vocab_file')
        vocab = ["alpha", "beta", "charli", 'delta']
        with codecs.getwriter('utf-8')(tf.gfile.GFile(vocab_file, 'wb')) as f:
            for word in vocab:
                f.write('%s\n' % word)

        # Call vocab_utils
        out_dir = os.path.join(tf.test.get_temp_dir(), 'out_dir')
        os.makedirs(out_dir)
        vocab_size, new_vocab_file = vocab_utils.check_vocab(
            vocab_file, out_dir)

        # Assert: we expect the code to add <unk>, <s>, </s> and
        # create a new vocab file
        self.assertEqual(len(vocab) + 3, vocab_size)
        self.assertEqual(os.path.join(out_dir, 'vocab_file'), new_vocab_file)
        new_vocab = []
        with codecs.getreader('utf-8')(tf.gfile.GFile(new_vocab_file,
                                                      'rb')) as f:
            for line in f:
                new_vocab.append(line.strip())
        self.assertEqual([vocab_utils.UNK, vocab_utils.SOS, vocab_utils.EOS] +
                         vocab, new_vocab)
Example #2
  def testCheckVocab(self):
    # Create a vocab file
    vocab_dir = os.path.join(tf.test.get_temp_dir(), "vocab_dir")
    os.makedirs(vocab_dir)
    vocab_file = os.path.join(vocab_dir, "vocab_file")
    vocab = ["a", "b", "c"]
    with codecs.getwriter("utf-8")(tf.gfile.GFile(vocab_file, "wb")) as f:
      for word in vocab:
        f.write("%s\n" % word)

    # Call vocab_utils
    out_dir = os.path.join(tf.test.get_temp_dir(), "out_dir")
    os.makedirs(out_dir)
    vocab_size, new_vocab_file = vocab_utils.check_vocab(
        vocab_file, out_dir)

    # Assert: we expect the code to add <unk>, <s>, </s> and
    # create a new vocab file
    self.assertEqual(len(vocab) + 3, vocab_size)
    self.assertEqual(os.path.join(out_dir, "vocab_file"), new_vocab_file)
    new_vocab, _ = vocab_utils.load_vocab(new_vocab_file)
    self.assertEqual(
        [vocab_utils.UNK, vocab_utils.SOS, vocab_utils.EOS] + vocab, new_vocab)
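Both tests above assert the same contract for vocab_utils.check_vocab: it returns the (possibly extended) vocabulary size and the path of a vocab file under out_dir whose first three entries are the special tokens. The following is a minimal sketch of that behavior, written only to illustrate what the tests expect; check_vocab_sketch and its hard-coded default tokens are assumptions, not the library's actual implementation.

import codecs
import os

import tensorflow as tf

UNK, SOS, EOS = "<unk>", "<s>", "</s>"


def check_vocab_sketch(vocab_file, out_dir, unk=UNK, sos=SOS, eos=EOS):
    """Hypothetical re-implementation of the behavior the tests assert."""
    with codecs.getreader("utf-8")(tf.gfile.GFile(vocab_file, "rb")) as f:
        vocab = [line.strip() for line in f]

    # Prepend <unk>, <s>, </s> if the file doesn't already start with them,
    # and write the extended vocab to out_dir under the same file name.
    if vocab[:3] != [unk, sos, eos]:
        vocab = [unk, sos, eos] + vocab
        new_vocab_file = os.path.join(out_dir, os.path.basename(vocab_file))
        with codecs.getwriter("utf-8")(tf.gfile.GFile(new_vocab_file, "wb")) as f:
            for word in vocab:
                f.write("%s\n" % word)
        vocab_file = new_vocab_file

    return len(vocab), vocab_file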
Example #3
def create_standard_hparams(data_path, out_dir):
    """Build a standard set of HParams for a Vietnamese-English NMT model."""
    hparams = tf.contrib.training.HParams(
        # Data
        src="vi",
        tgt="en",
        train_prefix=os.path.join(data_path, "train"),
        dev_prefix=os.path.join(data_path, "tst2012"),
        test_prefix=os.path.join(data_path, "tst2013"),
        vocab_prefix="",
        embed_prefix="",
        out_dir=out_dir,
        src_vocab_file=os.path.join(data_path, "vocab.vi"),
        tgt_vocab_file=os.path.join(data_path, "vocab.en"),
        src_embed_file="",
        tgt_embed_file="",
        src_file=os.path.join(data_path, "train.vi"),
        tgt_file=os.path.join(data_path, "train.en"),
        dev_src_file=os.path.join(data_path, "tst2012.vi"),
        dev_tgt_file=os.path.join(data_path, "tst2012.en"),
        test_src_file=os.path.join(data_path, "tst2013.vi"),
        test_tgt_file=os.path.join(data_path, "tst2013.en"),

        # Networks
        num_units=512,
        num_layers=1,
        num_encoder_layers=1,
        num_decoder_layers=1,
        num_encoder_residual_layers=0,
        num_decoder_residual_layers=0,
        dropout=0.2,
        encoder_type="uni",
        residual=False,
        time_major=True,
        num_embeddings_partitions=0,
        unit_type="custom",
        custom_cell=SkipLSTMCell,

        # Train
        optimizer="sgd",
        batch_size=128,
        init_op="uniform",
        init_weight=0.1,
        max_gradient_norm=100.0,
        learning_rate=1.0,
        warmup_steps=0,
        warmup_scheme="t2t",
        decay_scheme="luong234",
        colocate_gradients_with_ops=True,
        num_train_steps=20000,

        # Data constraints
        num_buckets=5,
        max_train=0,
        src_max_len=25,
        tgt_max_len=25,
        src_max_len_infer=0,
        tgt_max_len_infer=0,

        # Data format
        sos="<s>",
        eos="</s>",
        subword_option="",
        check_special_token=True,

        # Misc
        forget_bias=1.0,
        num_gpus=1,
        epoch_step=0,  # record where we were within an epoch.
        steps_per_stats=100,
        steps_per_eval=1000,
        steps_per_external_eval=500,
        share_vocab=False,
        metrics=["bleu"],
        log_device_placement=False,
        random_seed=None,
        # only enable beam search during inference when beam_width > 0.
        beam_width=0,
        length_penalty_weight=0.0,
        override_loaded_hparams=True,
        num_keep_ckpts=5,
        avg_ckpts=False,
        num_intra_threads=1,
        num_inter_threads=8,

        # For inference
        inference_indices=None,
        infer_batch_size=32,
        sampling_temperature=0.0,
        num_translations_per_input=1,
        
    )

    # Make sure out_dir exists before check_vocab may write a new vocab file there.
    if not tf.gfile.Exists(out_dir):
        tf.gfile.MakeDirs(out_dir)

    src_vocab_size, _ = vocab_utils.check_vocab(
        hparams.src_vocab_file, hparams.out_dir)
    tgt_vocab_size, _ = vocab_utils.check_vocab(
        hparams.tgt_vocab_file, hparams.out_dir)
    hparams.add_hparam("src_vocab_size", src_vocab_size)
    hparams.add_hparam("tgt_vocab_size", tgt_vocab_size)
         
    for metric in hparams.metrics:
        hparams.add_hparam("best_" + metric, 0)  # larger is better
        best_metric_dir = os.path.join(hparams.out_dir, "best_" + metric)
        hparams.add_hparam("best_" + metric + "_dir", best_metric_dir)
        tf.gfile.MakeDirs(best_metric_dir)

        if hparams.avg_ckpts:
            hparams.add_hparam("avg_best_" + metric, 0)  # larger is better
            best_metric_dir = os.path.join(hparams.out_dir, "avg_best_" + metric)
            hparams.add_hparam("avg_best_" + metric + "_dir", best_metric_dir)
            tf.gfile.MakeDirs(best_metric_dir)

    return hparams
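A possible call site for create_standard_hparams; the paths below are placeholders, and SkipLSTMCell must already be importable in the surrounding module because the hparams reference it directly.

# Hypothetical invocation with placeholder paths; expects train.vi/train.en,
# tst2012.*, tst2013.* and vocab.vi/vocab.en under data_path.
hparams = create_standard_hparams("/tmp/nmt_data", "/tmp/nmt_model")
print(hparams.src_vocab_size, hparams.tgt_vocab_size)
print(hparams.best_bleu_dir)  # created because metrics=["bleu"]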
Example #4
def extend_hparams(hparams):
    """Extend training hparams."""
    assert hparams.num_encoder_layers and hparams.num_decoder_layers
    if hparams.num_encoder_layers != hparams.num_decoder_layers:
        hparams.pass_hidden_state = False
        utils.print_out(
            "Num encoder layer %d is different from num decoder layer"
            " %d, so set pass_hidden_state to False" %
            (hparams.num_encoder_layers, hparams.num_decoder_layers))

    # Sanity checks
    if hparams.encoder_type == "bi" and hparams.num_encoder_layers % 2 != 0:
        raise ValueError("For bi, num_encoder_layers %d should be even" %
                         hparams.num_encoder_layers)
    if (hparams.attention_architecture in ["gnmt"]
            and hparams.num_encoder_layers < 2):
        raise ValueError("For gnmt attention architecture, "
                         "num_encoder_layers %d should be >= 2" %
                         hparams.num_encoder_layers)

    # Set residual layers
    num_encoder_residual_layers = 0
    num_decoder_residual_layers = 0
    if hparams.residual:
        if hparams.num_encoder_layers > 1:
            num_encoder_residual_layers = hparams.num_encoder_layers - 1
        if hparams.num_decoder_layers > 1:
            num_decoder_residual_layers = hparams.num_decoder_layers - 1

        if hparams.encoder_type == "gnmt":
            # The first unidirectional layer (after the bi-directional layer) in
            # the GNMT encoder can't have a residual connection, because its
            # input is the concatenation of the fw_cell and bw_cell outputs.
            num_encoder_residual_layers = hparams.num_encoder_layers - 2

            # Keep decoder residual layers compatible with GNMT models.
            if hparams.num_encoder_layers == hparams.num_decoder_layers:
                num_decoder_residual_layers = num_encoder_residual_layers
    hparams.add_hparam("num_encoder_residual_layers",
                       num_encoder_residual_layers)
    hparams.add_hparam("num_decoder_residual_layers",
                       num_decoder_residual_layers)

    if hparams.subword_option and hparams.subword_option not in ["spm", "bpe"]:
        raise ValueError("subword_option must be either spm or bpe")

    # Flags
    utils.print_out("# hparams:")
    utils.print_out("  src=%s" % hparams.src)
    utils.print_out("  tgt=%s" % hparams.tgt)
    utils.print_out("  train_prefix=%s" % hparams.train_prefix)
    utils.print_out("  dev_prefix=%s" % hparams.dev_prefix)
    utils.print_out("  test_prefix=%s" % hparams.test_prefix)
    utils.print_out("  out_dir=%s" % hparams.out_dir)

    ## Vocab
    # Get vocab file names first
    if hparams.vocab_prefix:
        src_vocab_file = hparams.vocab_prefix + "." + hparams.src
        tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt
    else:
        raise ValueError("hparams.vocab_prefix must be provided.")

    # Source vocab
    src_vocab_size, src_vocab_file = vocab_utils.check_vocab(
        src_vocab_file,
        hparams.out_dir,
        check_special_token=hparams.check_special_token,
        sos=hparams.sos,
        eos=hparams.eos,
        unk=vocab_utils.UNK)

    # Target vocab
    if hparams.share_vocab:
        utils.print_out("  using source vocab for target")
        tgt_vocab_file = src_vocab_file
        tgt_vocab_size = src_vocab_size
    else:
        tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab(
            tgt_vocab_file,
            hparams.out_dir,
            check_special_token=hparams.check_special_token,
            sos=hparams.sos,
            eos=hparams.eos,
            unk=vocab_utils.UNK)
    hparams.add_hparam("src_vocab_size", src_vocab_size)
    hparams.add_hparam("tgt_vocab_size", tgt_vocab_size)
    hparams.add_hparam("src_vocab_file", src_vocab_file)
    hparams.add_hparam("tgt_vocab_file", tgt_vocab_file)

    # Pretrained Embeddings:
    hparams.add_hparam("src_embed_file", "")
    hparams.add_hparam("tgt_embed_file", "")
    if hparams.embed_prefix:
        src_embed_file = hparams.embed_prefix + "." + hparams.src
        tgt_embed_file = hparams.embed_prefix + "." + hparams.tgt

        if tf.gfile.Exists(src_embed_file):
            hparams.src_embed_file = src_embed_file

        if tf.gfile.Exists(tgt_embed_file):
            hparams.tgt_embed_file = tgt_embed_file

    # Check out_dir
    if not tf.gfile.Exists(hparams.out_dir):
        utils.print_out("# Creating output directory %s ..." % hparams.out_dir)
        tf.gfile.MakeDirs(hparams.out_dir)

    # Evaluation
    for metric in hparams.metrics:
        hparams.add_hparam("best_" + metric, 0)  # larger is better
        best_metric_dir = os.path.join(hparams.out_dir, "best_" + metric)
        hparams.add_hparam("best_" + metric + "_dir", best_metric_dir)
        tf.gfile.MakeDirs(best_metric_dir)

        if hparams.avg_ckpts:
            hparams.add_hparam("avg_best_" + metric, 0)  # larger is better
            best_metric_dir = os.path.join(hparams.out_dir,
                                           "avg_best_" + metric)
            hparams.add_hparam("avg_best_" + metric + "_dir", best_metric_dir)
            tf.gfile.MakeDirs(best_metric_dir)

    return hparams
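extend_hparams above assumes a fairly wide set of pre-existing fields. As a rough checklist, the sketch below constructs an HParams object with placeholder values covering everything the function reads; the values are illustrative only, not recommended defaults.

# Placeholder values; lists the fields extend_hparams reads or mutates.
hparams = tf.contrib.training.HParams(
    src="vi", tgt="en",
    train_prefix="data/train", dev_prefix="data/tst2012",
    test_prefix="data/tst2013",
    out_dir="/tmp/out", vocab_prefix="data/vocab", embed_prefix="",
    num_encoder_layers=2, num_decoder_layers=2,
    encoder_type="uni", attention_architecture="standard",
    residual=False, subword_option="",
    sos="<s>", eos="</s>", check_special_token=True,
    share_vocab=False, metrics=["bleu"], avg_ckpts=False,
    pass_hidden_state=True)
hparams = extend_hparams(hparams)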
Example #5
def extend_hparams(hparams):
    """Add new arguments to hparams."""
    # Sanity checks
    if hparams.encoder_type == "bi" and hparams.num_encoder_layers % 2 != 0:
        raise ValueError("For bi, num_encoder_layers %d should be even" %
                         hparams.num_encoder_layers)
    if (hparams.attention_architecture in ["gnmt"]
            and hparams.num_encoder_layers < 2):
        raise ValueError("For gnmt attention architecture, "
                         "num_encoder_layers %d should be >= 2" %
                         hparams.num_encoder_layers)
    if hparams.subword_option and hparams.subword_option not in ["spm", "bpe"]:
        raise ValueError("subword_option must be either spm or bpe")
    if hparams.infer_mode == "beam_search" and hparams.beam_width <= 0:
        raise ValueError(
            "beam_width must be greater than 0 when using the beam_search "
            "decoder.")
    if hparams.infer_mode == "sample" and hparams.sampling_temperature <= 0.0:
        raise ValueError(
            "sampling_temperature must be greater than 0.0 when using the "
            "sample decoder.")

    # Different number of encoder / decoder layers
    assert hparams.num_encoder_layers and hparams.num_decoder_layers
    if hparams.num_encoder_layers != hparams.num_decoder_layers:
        hparams.pass_hidden_state = False
        utils.print_out(
            "Num encoder layer %d is different from num decoder layer"
            " %d, so set pass_hidden_state to False" %
            (hparams.num_encoder_layers, hparams.num_decoder_layers))

    # Set residual layers
    num_encoder_residual_layers = 0
    num_decoder_residual_layers = 0
    if hparams.residual:
        if hparams.num_encoder_layers > 1:
            num_encoder_residual_layers = hparams.num_encoder_layers - 1
        if hparams.num_decoder_layers > 1:
            num_decoder_residual_layers = hparams.num_decoder_layers - 1

        if hparams.encoder_type == "gnmt":
            # The first unidirectional layer (after the bi-directional layer) in
            # the GNMT encoder can't have a residual connection, because its
            # input is the concatenation of the fw_cell and bw_cell outputs.
            num_encoder_residual_layers = hparams.num_encoder_layers - 2

            # Keep decoder residual layers compatible with GNMT models.
            if hparams.num_encoder_layers == hparams.num_decoder_layers:
                num_decoder_residual_layers = num_encoder_residual_layers
    _add_argument(hparams, "num_encoder_residual_layers",
                  num_encoder_residual_layers)
    _add_argument(hparams, "num_decoder_residual_layers",
                  num_decoder_residual_layers)

    # Language modeling
    if getattr(hparams, "language_model", None):
        hparams.attention = ""
        hparams.attention_architecture = ""
        hparams.pass_hidden_state = False
        hparams.share_vocab = True
        hparams.src = hparams.tgt
        utils.print_out(
            "For language modeling, we turn off attention and "
            "pass_hidden_state; turn on share_vocab; set src to tgt.")

    ## Vocab
    # Get vocab file names first
    if hparams.vocab_prefix:
        src_vocab_file = hparams.vocab_prefix + "." + hparams.src
        tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt
    else:
        raise ValueError("hparams.vocab_prefix must be provided.")

    # Source vocab
    check_special_token = getattr(hparams, "check_special_token", True)
    src_vocab_size, src_vocab_file = vocab_utils.check_vocab(
        src_vocab_file,
        hparams.out_dir,
        check_special_token=check_special_token,
        sos=hparams.sos,
        eos=hparams.eos,
        unk=vocab_utils.UNK)

    # Target vocab
    if hparams.share_vocab:
        utils.print_out("  using source vocab for target")
        tgt_vocab_file = src_vocab_file
        tgt_vocab_size = src_vocab_size
    else:
        tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab(
            tgt_vocab_file,
            hparams.out_dir,
            check_special_token=check_special_token,
            sos=hparams.sos,
            eos=hparams.eos,
            unk=vocab_utils.UNK)
    _add_argument(hparams, "src_vocab_size", src_vocab_size)
    _add_argument(hparams, "tgt_vocab_size", tgt_vocab_size)
    _add_argument(hparams, "src_vocab_file", src_vocab_file)
    _add_argument(hparams, "tgt_vocab_file", tgt_vocab_file)

    # Num embedding partitions
    num_embeddings_partitions = getattr(hparams, "num_embeddings_partitions",
                                        0)
    _add_argument(hparams, "num_enc_emb_partitions", num_embeddings_partitions)
    _add_argument(hparams, "num_dec_emb_partitions", num_embeddings_partitions)

    # Pretrained Embeddings
    _add_argument(hparams, "src_embed_file", "")
    _add_argument(hparams, "tgt_embed_file", "")
    if getattr(hparams, "embed_prefix", None):
        src_embed_file = hparams.embed_prefix + "." + hparams.src
        tgt_embed_file = hparams.embed_prefix + "." + hparams.tgt

        if tf.gfile.Exists(src_embed_file):
            utils.print_out("  src_embed_file %s exist" % src_embed_file)
            hparams.src_embed_file = src_embed_file

            utils.print_out(
                "For pretrained embeddings, set num_enc_emb_partitions to 1")
            hparams.num_enc_emb_partitions = 1
        else:
            utils.print_out("  src_embed_file %s doesn't exist" %
                            src_embed_file)

        if tf.gfile.Exists(tgt_embed_file):
            utils.print_out("  tgt_embed_file %s exist" % tgt_embed_file)
            hparams.tgt_embed_file = tgt_embed_file

            utils.print_out(
                "For pretrained embeddings, set num_dec_emb_partitions to 1")
            hparams.num_dec_emb_partitions = 1
        else:
            utils.print_out("  tgt_embed_file %s doesn't exist" %
                            tgt_embed_file)

    # Evaluation
    for metric in hparams.metrics:
        best_metric_dir = os.path.join(hparams.out_dir, "best_" + metric)
        tf.gfile.MakeDirs(best_metric_dir)
        _add_argument(hparams, "best_" + metric, 0, update=False)
        _add_argument(hparams, "best_" + metric + "_dir", best_metric_dir)

        if getattr(hparams, "avg_ckpts", None):
            best_metric_dir = os.path.join(hparams.out_dir,
                                           "avg_best_" + metric)
            tf.gfile.MakeDirs(best_metric_dir)
            _add_argument(hparams, "avg_best_" + metric, 0, update=False)
            _add_argument(hparams, "avg_best_" + metric + "_dir",
                          best_metric_dir)

    return hparams
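Example #5 relies on a helper _add_argument that is not shown. Judging from how it is called above (adds that are safe to repeat, plus an update=False flag for the best_* metrics), one plausible implementation is the following; treat it as an assumption about the helper, not the project's verified code.

def _add_argument(hparams, key, value, update=True):
    """Set hparams.key to value, adding the hparam first if it doesn't exist."""
    if hasattr(hparams, key):
        if update:
            setattr(hparams, key, value)
    else:
        hparams.add_hparam(key, value)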