Example #1
def create_emb_matrix(hparam):
    """Create the encoder/decoder embedding matrices.

    :param hparam: hyper-parameters with vocab_src, vocab_tgt, emb_size,
        share_vocab and dtype.
    :return: a (encoder_embedding, decoder_embedding) tuple; both entries are
        the same variable when the vocabulary is shared.
    """
    src_size, _ = check_vocab(hparam.vocab_src)
    tgt_size, _ = check_vocab(hparam.vocab_tgt)
    emb_size = hparam.emb_size
    share_vocab = hparam.share_vocab

    if share_vocab:
        if src_size != tgt_size:
            raise ValueError(
                'cannot share vocab because src vocab size != tgt vocab size')
        emb_matrix = tf.get_variable('embedding',
                                     shape=(src_size, emb_size),
                                     dtype=hparam.dtype)
        return (emb_matrix, emb_matrix)
    else:
        encoder_matrix = tf.get_variable('embedding/encoder',
                                         shape=(src_size, emb_size),
                                         dtype=hparam.dtype)
        decoder_matrix = tf.get_variable('embedding/decoder',
                                         shape=(tgt_size, emb_size),
                                         dtype=hparam.dtype)
        return (encoder_matrix, decoder_matrix)
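
A minimal sketch of how this helper might be driven, assuming `check_vocab` returns a `(vocab_size, vocab_file)` pair for a vocab path (as in the later examples) and that `hparam` is a plain attribute container; the names and values below are illustrative only:

from types import SimpleNamespace
import tensorflow as tf

# Hypothetical hparam values; vocab.en / vocab.de are assumed to exist on disk.
hparam = SimpleNamespace(vocab_src="vocab.en", vocab_tgt="vocab.de",
                         emb_size=512, share_vocab=False, dtype=tf.float32)
with tf.variable_scope("embeddings"):
    encoder_emb, decoder_emb = create_emb_matrix(hparam)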
Example #2
def extend_hparams(hparams):
    """Add new arguments to hparams."""
    # Sanity checks
    if hparams.subword_option and hparams.subword_option not in ["spm", "bpe"]:
        raise ValueError("subword option must be either spm, or bpe")
    if hparams.infer_mode == "beam_search" and hparams.beam_width <= 0:
        raise ValueError(
            "beam_width must greater than 0 when using beam_search"
            "decoder.")

    # Encoder and decoder must have the same number of layers in this setup
    assert hparams.num_encoder_layers == hparams.num_decoder_layers

    # The first unidirectional layer (after the bi-directional layer) in
    # the GNMT encoder can't have a residual connection because its input
    # is the concatenation of the fw_cell and bw_cell outputs.
    num_encoder_residual_layers = hparams.num_encoder_layers - 2
    num_decoder_residual_layers = num_encoder_residual_layers
    _add_argument(hparams, "num_encoder_residual_layers",
                  num_encoder_residual_layers)
    _add_argument(hparams, "num_decoder_residual_layers",
                  num_decoder_residual_layers)

    ## Vocab
    # Get vocab file names first
    if hparams.vocab_prefix:
        src_vocab_file = hparams.vocab_prefix + "." + hparams.src
        tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt
    else:
        raise ValueError("hparams.vocab_prefix must be provided.")

    # Source vocab
    src_vocab_size, src_vocab_file = vocab_utils.check_vocab(
        src_vocab_file,
        hparams.out_dir,
        check_special_token=hparams.check_special_token,
        sos=hparams.sos,
        eos=hparams.eos,
        unk=vocab_utils.UNK)

    # Target vocab
    utils.print_out("  using source vocab for target")
    tgt_vocab_file = src_vocab_file
    tgt_vocab_size = src_vocab_size
    _add_argument(hparams, "src_vocab_size", src_vocab_size)
    _add_argument(hparams, "tgt_vocab_size", tgt_vocab_size)
    _add_argument(hparams, "src_vocab_file", src_vocab_file)
    _add_argument(hparams, "tgt_vocab_file", tgt_vocab_file)

    # Num embedding partitions
    _add_argument(hparams, "num_enc_emb_partitions",
                  hparams.num_embeddings_partitions)
    _add_argument(hparams, "num_dec_emb_partitions",
                  hparams.num_embeddings_partitions)

    # Pretrained Embeddings
    _add_argument(hparams, "src_embed_file", "")
    _add_argument(hparams, "tgt_embed_file", "")

    return hparams
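
Several of the examples here call an `_add_argument` helper that is not shown. In the TensorFlow NMT codebase it behaves roughly like the sketch below (add the hparam if it is missing, otherwise overwrite it only when `update` is true); treat this as an assumption about the helper rather than its exact source:

def _add_argument(hparams, key, value, update=True):
    """Add an argument to hparams; if it exists, change the value only if update is True."""
    if hasattr(hparams, key):
        if update:
            setattr(hparams, key, value)
    else:
        hparams.add_hparam(key, value)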
Example #3
def extend_hparams(hparams):
    """Extend training hparams."""
    # Sanity checks
    if hparams.encoder_type == "bi" and hparams.num_layers % 2 != 0:
        raise ValueError("For bi, num_layers %d should be even" % hparams.num_layers)

    if hparams.attention_architecture in ["gnmt"] and hparams.num_layers < 2:
        raise ValueError("For gnmt attention architecture, num_layers %d should be >= 2" % hparams.num_layers)

    # Flags
    utils.print_out("# hparams:")
    utils.print_out("  src=%s" % hparams.src)
    utils.print_out("  tgt=%s" % hparams.tgt)
    utils.print_out("  train_prefix=%s" % hparams.train_prefix)
    utils.print_out("  dev_prefix=%s" % hparams.dev_prefix)
    utils.print_out("  test_prefix=%s" % hparams.test_prefix)
    utils.print_out("  out_dir=%s" % hparams.out_dir)

    # Set num_residual_layers
    if hparams.residual and hparams.num_layers > 1:
        if hparams.encoder_type == "gnmt":
            # The first unidirectional layer (after the bi-directional layer) in
            # the GNMT encoder can't have a residual connection because its input
            # is the concatenation of the fw_cell and bw_cell outputs.
            num_residual_layers = hparams.num_layers - 2
        else:
            num_residual_layers = hparams.num_layers - 1
    else:
        num_residual_layers = 0
    hparams.add_hparam("num_residual_layers", num_residual_layers)

    ## Vocab
    # Get vocab file names first
    if hparams.vocab_prefix:
        tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt
    else:
        raise ValueError("hparams.vocab_prefix must be provided.")

    # Target Vocab
    tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab(tgt_vocab_file,
                                                             hparams.out_dir,
                                                             sos=hparams.sos,
                                                             eos=hparams.eos,
                                                             unk=vocab_utils.UNK)
    hparams.add_hparam("tgt_vocab_size", tgt_vocab_size)
    hparams.add_hparam("tgt_vocab_file", tgt_vocab_file)

    # Check out_dir
    if not tf.gfile.Exists(hparams.out_dir):
        utils.print_out("# Creating output directory %s ..." % hparams.out_dir)
        tf.gfile.MakeDirs(hparams.out_dir)

    # Evaluation
    for metric in hparams.metrics:
        hparams.add_hparam("best_" + metric, 0)  # larger is better
        best_metric_dir = os.path.join(hparams.out_dir, "best_" + metric)
        hparams.add_hparam("best_bleu_dir", best_metric_dir)
        tf.gfile.MakeDirs(best_metric_dir)

    return hparams
Example #4
    def testCheckVocab(self):
        # Create a vocab file
        vocab_dir = os.path.join(tf.test.get_temp_dir(), "vocab_dir")
        os.makedirs(vocab_dir)
        vocab_file = os.path.join(vocab_dir, "vocab_file")
        vocab = ["a", "b", "c"]
        with codecs.getwriter("utf-8")(tf.gfile.GFile(vocab_file, "wb")) as f:
            for word in vocab:
                f.write("%s\n" % word)

        # Call vocab_utils
        out_dir = os.path.join(tf.test.get_temp_dir(), "out_dir")
        os.makedirs(out_dir)
        vocab_size, new_vocab_file = vocab_utils.check_vocab(
            vocab_file, out_dir)

        # Assert: we expect the code to add  <unk>, <s>, </s> and
        # create a new vocab file
        self.assertEqual(len(vocab) + 3, vocab_size)
        self.assertEqual(os.path.join(out_dir, "vocab_file"), new_vocab_file)
        new_vocab = []
        with codecs.getreader("utf-8")(tf.gfile.GFile(new_vocab_file,
                                                      "rb")) as f:
            for line in f:
                new_vocab.append(line.strip())
        self.assertEqual([vocab_utils.UNK, vocab_utils.SOS, vocab_utils.EOS] +
                         vocab, new_vocab)
Example #5
def run_main(unused_argv):
	"""Run main."""

	# Initialization, Vocab generation
	if not tf.gfile.Exists(params['out_dir']):
		utils.print_out("# Creating output directory %s ..." % params['out_dir'])
		tf.gfile.MakeDirs(params['out_dir'])

	char_vocab_file = params['enc_char_map_path']
	src_vocab_file = params['src_vocab_file']
	tgt_vocab_file = params['tgt_vocab_file']


	char_vocab_size, char_vocab_file = vocab_utils.check_char_vocab(char_vocab_file, params['out_dir'])

	src_vocab_size, src_vocab_file = vocab_utils.check_vocab(src_vocab_file, params['out_dir'], type = 'src',
	                                                        check_special_token=params['check_special_token'])
	tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab(tgt_vocab_file,
                                                            params['out_dir'], type = 'tgt',
                                                            check_special_token=params['check_special_token'])

	## Train / Decode
	if params['mode'] == 'infer':
		# Modification required: inference and evaluation are not wired up yet.
		# # Inference
		# trans_file = params['inference_output_file']
		# ckpt = params['ckpt']
		# if not ckpt:
		#     ckpt = tf.train.latest_checkpoint(out_dir)
		# inference_fn(ckpt, inference_input_file, trans_file, num_workers, jobid)
		#
		# # Evaluation
		# ref_file = params['inference_ref_file']
		# if ref_file and tf.gfile.Exists(trans_file):
		#     for metric in params['metrics']:
		#         score = evaluation_utils.evaluate(ref_file, trans_file, metric,
		#                                           params['subword_option'])
		#         utils.print_out("  %s: %.1f" % (metric, score))
		infer()
	elif params['mode'] == 'train':
		# Train
		train()
Example #6
def extend_hparams(hparams):
    """Extend training hparams."""
    assert hparams.num_encoder_layers and hparams.num_decoder_layers

    # Flags
    utils.print_out("# hparams:")
    utils.print_out("  src_file=%s" % hparams.src_file)
    utils.print_out("  tgt_file=%s" % hparams.tgt_file)
    utils.print_out("  out_dir=%s" % hparams.out_dir)

    # Source vocab
    src_vocab_size, src_vocab_file = vocab_utils.check_vocab(
        hparams.src_vocab_file,
        hparams.out_dir,
        check_special_token=hparams.check_special_token,
        sos=hparams.sos,
        eos=hparams.eos,
        unk=vocab_utils.UNK)

    # Target vocab
    if hparams.share_vocab:
        utils.print_out("  using source vocab for target")
        tgt_vocab_file = src_vocab_file
        tgt_vocab_size = src_vocab_size
    else:
        tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab(
            hparams.tgt_vocab_file,
            hparams.out_dir,
            check_special_token=hparams.check_special_token,
            sos=hparams.sos,
            eos=hparams.eos,
            unk=vocab_utils.UNK)
    hparams.src_vocab_file = src_vocab_file
    hparams.tgt_vocab_file = tgt_vocab_file
    hparams.add_hparam("src_vocab_size", src_vocab_size)
    hparams.add_hparam("tgt_vocab_size", tgt_vocab_size)

    # Check out_dir
    if not tf.gfile.Exists(hparams.out_dir):
        utils.print_out("# Creating output directory %s ..." % hparams.out_dir)
        tf.gfile.MakeDirs(hparams.out_dir)

    return hparams
Example #7
def load_config(filename):
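    """Load a YAML config file, resolve the vocab files, and create the output directory."""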
    with open(filename) as f:
        d = yaml.safe_load(f)
    c = namedtuple("config", d.keys())(**d)

    src_vocab_file = os.path.join(c.data_dir, c.vocab_prefix + "." + c.src)
    src_vocab_file, src_vocab_size = vocab_utils.check_vocab(
        src_vocab_file, c.data_dir, c.sos, c.eos, c.unk)
    c = c._replace(src_vocab_size=src_vocab_size)

    if not c.share_vocab:
        tgt_vocab_file = os.path.join(c.data_dir, c.vocab_prefix + "." + c.tgt)
        tgt_vocab_file, tgt_vocab_size = vocab_utils.check_vocab(
            tgt_vocab_file, c.data_dir, c.sos, c.eos, c.unk)
        c = c._replace(tgt_vocab_size=tgt_vocab_size)

    if not os.path.exists(c.out_dir):
        os.makedirs(c.out_dir)

    return c
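
For reference, a minimal config that satisfies the fields `load_config` reads might look like the sketch below. The keys mirror the attributes used above; the values are illustrative, and `src_vocab_size`/`tgt_vocab_size` must already be present as keys because `namedtuple._replace` can only overwrite existing fields:

import textwrap

# Illustrative only; data/vocab.en and data/vocab.de are assumed to exist on disk.
with open("config.yml", "w") as f:
    f.write(textwrap.dedent("""\
        data_dir: data/
        out_dir: out/
        vocab_prefix: vocab
        src: en
        tgt: de
        sos: <s>
        eos: </s>
        unk: <unk>
        share_vocab: false
        src_vocab_size: 0
        tgt_vocab_size: 0
    """))

c = load_config("config.yml")
print(c.src_vocab_size, c.tgt_vocab_size)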
Example #8
    def _set_common_param(self, hparam):
        self._batch = tf.shape(self._batchInput.src)[0]
        self.C, _ = check_vocab(hparam.vocab_tgt)
        self.SOS = hparam.SOS
        self.EOS = hparam.EOS
        self._subword = hparam.subword_option
        if self.mode != 'infer':
            self._predict_count = tf.reduce_sum(self._batchInput.tgt_seq_len)
            self._word_count = tf.reduce_sum(
                self._batchInput.src_seq_len) + tf.reduce_sum(
                    self._batchInput.tgt_seq_len)
Example #9
def main(unused_argv):
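    """Build source/target vocab files and encode the train/dev data into sharded TFRecords."""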
    make_dir(FLAGS.data_dir)

    train_src_file = FLAGS.train_prefix + "." + FLAGS.src
    train_tgt_file = FLAGS.train_prefix + "." + FLAGS.tgt
    dev_src_file = FLAGS.dev_prefix + "." + FLAGS.src
    dev_tgt_file = FLAGS.dev_prefix + "." + FLAGS.tgt

    if FLAGS.share_vocab:
        src_vocab_file = tgt_vocab_file = FLAGS.vocab_prefix
    else:
        src_vocab_file = FLAGS.vocab_prefix + "." + FLAGS.src
        tgt_vocab_file = FLAGS.vocab_prefix + "." + FLAGS.tgt

    # Source vocab
    src_vocab_size, src_vocab_file = vocab_utils.check_vocab(
        src_vocab_file, FLAGS.data_dir)

    # Target vocab
    if FLAGS.share_vocab:
        print("using source vocab for target")
        tgt_vocab_file = src_vocab_file
        tgt_vocab_size = src_vocab_size
    else:
        tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab(
            tgt_vocab_file, FLAGS.data_dir, unk=vocab_utils.UNK)

    tf.logging.info("Encoding files and saving data")
    vocab_helper = vocab_utils.VocabHelper(src_vocab_file, tgt_vocab_file,
                                           FLAGS.share_vocab)

    train_tfrecord_files = encode_and_save_files(vocab_helper, FLAGS.data_dir,
                                                 train_src_file,
                                                 train_tgt_file, _TRAIN_TAG,
                                                 _TRAIN_SHARDS)

    encode_and_save_files(vocab_helper, FLAGS.data_dir, dev_src_file,
                          dev_tgt_file, _DEV_TAG, _DEV_SHARDS)

    for fname in train_tfrecord_files:
        shuffle_records(fname)
Example #10
def prepare_dataset(flags):
    """Generate the preprocessed dataset."""
    src_file = "%s.%s" % (flags.data_dir + flags.train_prefix, flags.src)
    tgt_file = "%s.%s" % (flags.data_dir + flags.train_prefix, flags.tgt)
    vocab_file = flags.data_dir + flags.vocab_prefix
    _, vocab_file = vocab_utils.check_vocab(vocab_file, flags.out_dir)
    out_file = flags.out_dir + "preprocessed_dataset"
    src_vocab_table, tgt_vocab_table = vocab_utils.create_vocab_tables(
        vocab_file)
    src_dataset = tf.data.TextLineDataset(src_file)
    tgt_dataset = tf.data.TextLineDataset(tgt_file)
    iterator = iterator_utils.get_iterator(
        src_dataset,
        tgt_dataset,
        src_vocab_table,
        tgt_vocab_table,
        batch_size=1,
        global_batch_size=1,
        sos=vocab_utils.SOS,
        eos=vocab_utils.EOS,
        random_seed=1,
        num_buckets=flags.num_buckets,
        src_max_len=flags.src_max_len,
        tgt_max_len=flags.tgt_max_len,
        filter_oversized_sequences=True,
        return_raw=True).make_initializable_iterator()

    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        sess.run(iterator.initializer)
        try:
            i = 0
            while True:
                with open(out_file + "_%d" % i, "wb") as f:
                    i += 1
                    for _ in range(100):
                        for j in sess.run(iterator.get_next()):
                            tf.logging.info(j)
                            f.write(bytearray(j))
        except tf.errors.OutOfRangeError:
            pass
Example #11
def extend_hparams(hparams):
    """Extend training hparams."""
    hparams.add_hparam("input_emb_pretrain", hparams.input_emb_file
                       is not None)
    # Check if vocab has the unk and pad symbols as first words. If not, create a new vocab file with these symbols as
    # the first two words.
    vocab_size, vocab_path = vocab_utils.check_vocab(hparams.vocab_path,
                                                     hparams.out_dir,
                                                     unk=hparams.unk,
                                                     pad=hparams.pad)
    vocab, _ = vocab_utils.load_vocab(vocab_path)
    # Generate embeddings if the flag is set or the embedding file is missing
    if hparams.create_new_embeddings or not os.path.isfile(
            hparams.input_emb_file):
        embedding.save_embedding(vocab, hparams.embedding_path,
                                 hparams.input_emb_file)
    hparams.add_hparam("vocab_size", vocab_size)
    hparams.set_hparam("vocab_path", vocab_path)
    if not tf.gfile.Exists(hparams.out_dir):
        tf.gfile.MakeDirs(hparams.out_dir)
    return hparams
Example #12
def extend_hparams(hparams):
  """Add new arguments to hparams."""
  # Sanity checks
  if hparams.encoder_type == "bi" and hparams.num_encoder_layers % 2 != 0:
    raise ValueError("For bi, num_encoder_layers %d should be even" %
                     hparams.num_encoder_layers)
  if (hparams.attention_architecture in ["gnmt"] and
      hparams.num_encoder_layers < 2):
    raise ValueError("For gnmt attention architecture, "
                     "num_encoder_layers %d should be >= 2" %
                     hparams.num_encoder_layers)
  if hparams.subword_option and hparams.subword_option not in ["spm", "bpe"]:
    raise ValueError("subword option must be either spm, or bpe")
  if hparams.infer_mode == "beam_search" and hparams.beam_width <= 0:
    raise ValueError("beam_width must greater than 0 when using beam_search"
                     "decoder.")
  if hparams.mode == "translate" and not hparams.translate_file:
    raise ValueError("--translate_file flag must be specified in translate mode")

  # Different number of encoder / decoder layers
  assert hparams.num_encoder_layers and hparams.num_decoder_layers
  if hparams.num_encoder_layers != hparams.num_decoder_layers:
    hparams.pass_hidden_state = False
    utils.print_out("Num encoder layer %d is different from num decoder layer"
                    " %d, so set pass_hidden_state to False" % (
                        hparams.num_encoder_layers,
                        hparams.num_decoder_layers))

  # Set residual layers
  num_encoder_residual_layers = 0
  num_decoder_residual_layers = 0
  if hparams.residual:
    if hparams.num_encoder_layers > 1:
      num_encoder_residual_layers = hparams.num_encoder_layers - 1
    if hparams.num_decoder_layers > 1:
      num_decoder_residual_layers = hparams.num_decoder_layers - 1

    if hparams.encoder_type == "gnmt":
      # The first unidirectional layer (after the bi-directional layer) in
      # the GNMT encoder can't have a residual connection because its input
      # is the concatenation of the fw_cell and bw_cell outputs.
      num_encoder_residual_layers = hparams.num_encoder_layers - 2

      # Compatible for GNMT models
      if hparams.num_encoder_layers == hparams.num_decoder_layers:
        num_decoder_residual_layers = num_encoder_residual_layers
  _add_argument(hparams, "num_encoder_residual_layers",
                num_encoder_residual_layers)
  _add_argument(hparams, "num_decoder_residual_layers",
                num_decoder_residual_layers)

  # Language modeling
  if hparams.language_model:
    hparams.attention = ""
    hparams.attention_architecture = ""
    hparams.pass_hidden_state = False
    hparams.share_vocab = True
    hparams.src = hparams.tgt
    utils.print_out("For language modeling, we turn off attention and "
                    "pass_hidden_state; turn on share_vocab; set src to tgt.")

  ## Vocab
  # Get vocab file names first
  if hparams.vocab_prefix:
    src_vocab_file = hparams.vocab_prefix + "." + hparams.src
    tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt
  else:
    raise ValueError("hparams.vocab_prefix must be provided.")

  # Source vocab
  src_vocab_size, src_vocab_file = vocab_utils.check_vocab(
      src_vocab_file,
      hparams.output_dir,
      check_special_token=hparams.check_special_token,
      sos=hparams.sos,
      eos=hparams.eos,
      unk=vocab_utils.UNK,
      pad_vocab=True)

  # Target vocab
  if hparams.share_vocab:
    utils.print_out("  using source vocab for target")
    tgt_vocab_file = src_vocab_file
    tgt_vocab_size = src_vocab_size
  else:
    tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab(
        tgt_vocab_file,
        hparams.output_dir,
        check_special_token=hparams.check_special_token,
        sos=hparams.sos,
        eos=hparams.eos,
        unk=vocab_utils.UNK)
  _add_argument(hparams, "src_vocab_size", src_vocab_size)
  _add_argument(hparams, "tgt_vocab_size", tgt_vocab_size)
  _add_argument(hparams, "src_vocab_file", src_vocab_file)
  _add_argument(hparams, "tgt_vocab_file", tgt_vocab_file)

  # Num embedding partitions
  _add_argument(
      hparams, "num_enc_emb_partitions", hparams.num_embeddings_partitions)
  _add_argument(
      hparams, "num_dec_emb_partitions", hparams.num_embeddings_partitions)

  # Pretrained Embeddings
  _add_argument(hparams, "src_embed_file", "")
  _add_argument(hparams, "tgt_embed_file", "")
  if hparams.embed_prefix:
    src_embed_file = hparams.embed_prefix + "." + hparams.src
    tgt_embed_file = hparams.embed_prefix + "." + hparams.tgt

    if tf.gfile.Exists(src_embed_file):
      utils.print_out("  src_embed_file %s exist" % src_embed_file)
      hparams.src_embed_file = src_embed_file

      utils.print_out(
          "For pretrained embeddings, set num_enc_emb_partitions to 1")
      hparams.num_enc_emb_partitions = 1
    else:
      utils.print_out("  src_embed_file %s doesn't exist" % src_embed_file)

    if tf.gfile.Exists(tgt_embed_file):
      utils.print_out("  tgt_embed_file %s exist" % tgt_embed_file)
      hparams.tgt_embed_file = tgt_embed_file

      utils.print_out(
          "For pretrained embeddings, set num_dec_emb_partitions to 1")
      hparams.num_dec_emb_partitions = 1
    else:
      utils.print_out("  tgt_embed_file %s doesn't exist" % tgt_embed_file)

  # Evaluation
  metric = "bleu"
  best_metric_dir = os.path.join(hparams.output_dir, "best_" + metric)
  tf.gfile.MakeDirs(best_metric_dir)
  _add_argument(hparams, "best_" + metric, 0, update=False)
  _add_argument(hparams, "best_" + metric + "_dir", best_metric_dir)

  return hparams
Example #13
def extend_hparams(hparams):
    """Extend training hparams."""
    assert hparams.num_encoder_layers and hparams.num_decoder_layers
    if hparams.num_encoder_layers != hparams.num_decoder_layers:
        hparams.pass_hidden_state = False
        utils.print_out(
            "Num encoder layer %d is different from num decoder layer"
            " %d, so set pass_hidden_state to False" %
            (hparams.num_encoder_layers, hparams.num_decoder_layers))

    # Sanity checks
    if hparams.encoder_type == "bi" and hparams.num_encoder_layers % 2 != 0:
        raise ValueError("For bi, num_encoder_layers %d should be even" %
                         hparams.num_encoder_layers)
    if (hparams.attention_architecture in ["gnmt"]
            and hparams.num_encoder_layers < 2):
        raise ValueError("For gnmt attention architecture, "
                         "num_encoder_layers %d should be >= 2" %
                         hparams.num_encoder_layers)

    # Set residual layers
    num_encoder_residual_layers = 0
    num_decoder_residual_layers = 0
    if hparams.residual:
        if hparams.num_encoder_layers > 1:
            num_encoder_residual_layers = hparams.num_encoder_layers - 1
        if hparams.num_decoder_layers > 1:
            num_decoder_residual_layers = hparams.num_decoder_layers - 1

        if hparams.encoder_type == "gnmt":
            # The first unidirectional layer (after the bi-directional layer) in
            # the GNMT encoder can't have a residual connection because its input
            # is the concatenation of the fw_cell and bw_cell outputs.
            num_encoder_residual_layers = hparams.num_encoder_layers - 2

            # Compatible for GNMT models
            if hparams.num_encoder_layers == hparams.num_decoder_layers:
                num_decoder_residual_layers = num_encoder_residual_layers
    hparams.add_hparam("num_encoder_residual_layers",
                       num_encoder_residual_layers)
    hparams.add_hparam("num_decoder_residual_layers",
                       num_decoder_residual_layers)

    if hparams.subword_option and hparams.subword_option not in ["spm", "bpe"]:
        raise ValueError("subword option must be either spm, or bpe")

    # Flags
    utils.print_out("# hparams:")
    utils.print_out("  src=%s" % hparams.src)
    utils.print_out("  tgt=%s" % hparams.tgt)
    utils.print_out("  train_prefix=%s" % hparams.train_prefix)
    utils.print_out("  dev_prefix=%s" % hparams.dev_prefix)
    utils.print_out("  test_prefix=%s" % hparams.test_prefix)
    utils.print_out("  out_dir=%s" % hparams.out_dir)

    ## Vocab
    # Get vocab file names first
    if hparams.vocab_prefix:
        src_vocab_file = hparams.vocab_prefix + "." + hparams.src
        tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt
    else:
        raise ValueError("hparams.vocab_prefix must be provided.")

    # Source vocab
    src_vocab_size, src_vocab_file = vocab_utils.check_vocab(
        src_vocab_file,
        hparams.out_dir,
        check_special_token=hparams.check_special_token,
        sos=hparams.sos,
        eos=hparams.eos,
        unk=vocab_utils.UNK)

    # Target vocab
    if hparams.share_vocab:
        utils.print_out("  using source vocab for target")
        tgt_vocab_file = src_vocab_file
        tgt_vocab_size = src_vocab_size
    else:
        tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab(
            tgt_vocab_file,
            hparams.out_dir,
            check_special_token=hparams.check_special_token,
            sos=hparams.sos,
            eos=hparams.eos,
            unk=vocab_utils.UNK)
    hparams.add_hparam("src_vocab_size", src_vocab_size)
    hparams.add_hparam("tgt_vocab_size", tgt_vocab_size)
    hparams.add_hparam("src_vocab_file", src_vocab_file)
    hparams.add_hparam("tgt_vocab_file", tgt_vocab_file)

    # Pretrained Embeddings:
    hparams.add_hparam("src_embed_file", "")
    hparams.add_hparam("tgt_embed_file", "")
    if hparams.embed_prefix:
        src_embed_file = hparams.embed_prefix + "." + hparams.src
        tgt_embed_file = hparams.embed_prefix + "." + hparams.tgt

        if tf.gfile.Exists(src_embed_file):
            hparams.src_embed_file = src_embed_file

        if tf.gfile.Exists(tgt_embed_file):
            hparams.tgt_embed_file = tgt_embed_file

    # Check out_dir
    if not tf.gfile.Exists(hparams.out_dir):
        utils.print_out("# Creating output directory %s ..." % hparams.out_dir)
        tf.gfile.MakeDirs(hparams.out_dir)

    # Evaluation
    for metric in hparams.metrics:
        hparams.add_hparam("best_" + metric, 0)  # larger is better
        best_metric_dir = os.path.join(hparams.out_dir, "best_" + metric)
        hparams.add_hparam("best_" + metric + "_dir", best_metric_dir)
        tf.gfile.MakeDirs(best_metric_dir)

        if hparams.avg_ckpts:
            hparams.add_hparam("avg_best_" + metric, 0)  # larger is better
            best_metric_dir = os.path.join(hparams.out_dir,
                                           "avg_best_" + metric)
            hparams.add_hparam("avg_best_" + metric + "_dir", best_metric_dir)
            tf.gfile.MakeDirs(best_metric_dir)

    return hparams
Example #14
def extend_hparams(hparams):
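    """Extend training hparams."""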
    assert hparams.num_encoder_layers and hparams.num_decoder_layers
    if hparams.num_encoder_layers != hparams.num_decoder_layers:
        hparams.pass_hidden_state = False
        utils.print_out(
            "Num encoder layer %d is different from num decoder layer %d, so set pass_hidden_state to False"
            % (hparams.num_encoder_layers, hparams.num_decoder_layers))

    if hparams.encoder_type == "bi" and hparams.num_encoder_layers % 2 != 0:
        raise ValueError("For bi, num_encoder_layers %d should be even" %
                         hparams.num_encoder_layers)

    num_encoder_residual_layers = 0
    num_decoder_residual_layers = 0
    if hparams.residual:
        if hparams.num_encoder_layers > 1:
            num_encoder_residual_layers = hparams.num_encoder_layers - 1
        if hparams.num_decoder_layers > 1:
            num_decoder_residual_layers = hparams.num_decoder_layers - 1

    hparams.add_hparam("num_encoder_residual_layers",
                       num_encoder_residual_layers)
    hparams.add_hparam("num_decoder_residual_layers",
                       num_decoder_residual_layers)

    if hparams.subword_option and hparams.subword_option not in ["spm", "bpe"]:
        raise ValueError("subword option must be either spm, or bpe")

    utils.print_out("# hparams:")
    utils.print_out("src=%s" % hparams.src)
    utils.print_out("tgt=%s" % hparams.tgt)
    utils.print_out("train_prefix=%s" % hparams.train_prefix)
    utils.print_out("dev_prefix=%s" % hparams.dev_prefix)
    utils.print_out("test_prefix=%s" % hparams.test_prefix)
    utils.print_out("out_dir=%s" % hparams.out_dir)

    if hparams.vocab_prefix:
        src_vocab_file = hparams.vocab_prefix + "." + hparams.src
        tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt
    else:
        raise ValueError("hparams.vocab_prefix must be provided.")

    src_vocab_size, src_vocab_file = vocab_utils.check_vocab(
        src_vocab_file,
        hparams.out_dir,
        check_special_token=hparams.check_special_token,
        sos=hparams.sos,
        eos=hparams.eos,
        unk=vocab_utils.UNK)

    if hparams.share_vocab:
        utils.print_out("using source vocab for target")
        tgt_vocab_file = src_vocab_file
        tgt_vocab_size = src_vocab_size
    else:
        tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab(
            tgt_vocab_file,
            hparams.out_dir,
            check_special_token=hparams.check_special_token,
            sos=hparams.sos,
            eos=hparams.eos,
            unk=vocab_utils.UNK)
    hparams.add_hparam("src_vocab_size", src_vocab_size)
    hparams.add_hparam("tgt_vocab_size", tgt_vocab_size)
    hparams.add_hparam("src_vocab_file", src_vocab_file)
    hparams.add_hparam("tgt_vocab_file", tgt_vocab_file)

    hparams.add_hparam("src_embed_file", "")
    hparams.add_hparam("tgt_embed_file", "")
    if hparams.embed_prefix:
        src_embed_file = hparams.embed_prefix + "." + hparams.src
        tgt_embed_file = hparams.embed_prefix + "." + hparams.tgt

        if tf.gfile.Exists(src_embed_file):
            hparams.src_embed_file = src_embed_file

        if tf.gfile.Exists(tgt_embed_file):
            hparams.tgt_embed_file = tgt_embed_file

    if not tf.gfile.Exists(hparams.out_dir):
        utils.print_out("# Creating output directory %s ..." % hparams.out_dir)
        tf.gfile.MakeDirs(hparams.out_dir)

    for metric in hparams.metrics:
        hparams.add_hparam("best_" + metric, 0)
        best_metric_dir = os.path.join(hparams.out_dir, "best_" + metric)
        hparams.add_hparam("best_" + metric + "_dir", best_metric_dir)
        tf.gfile.MakeDirs(best_metric_dir)

        if hparams.avg_ckpts:
            hparams.add_hparam("avg_best_" + metric, 0)
            best_metric_dir = os.path.join(hparams.out_dir,
                                           "avg_best_" + metric)
            hparams.add_hparam("avg_best_" + metric + "_dir", best_metric_dir)
            tf.gfile.MakeDirs(best_metric_dir)

    return hparams
Example #15
if __name__ == "__main__":
    # Standard TF2 GPU setup: enable memory growth before the GPUs are initialized.
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
        except RuntimeError as e:
            # Memory growth must be set before GPUs have been initialized
            print(e)
    from utils.vocab_utils import create_tgt_vocab_table, check_vocab, UNK
    from utils.dataset import get_train_dataset, get_infer_dataset

    base_path = "/home/panxie/Documents/sign-language/nslt/Data"
    src_file = base_path + "/phoenix2014T.test.sign"
    tgt_file = base_path + "/phoenix2014T.test.gloss"
    tgt_vocab_file = base_path + "/phoenix2014T.vocab.gloss"
    # cnn_model_path = "/home/panxie/Documents/sign-language/nslt/BaseModel/ResNet_18.h5"
    cnn_model_path = "/home/panxie/Documents/sign-language/nslt/BaseModel/bvlc_alexnet.npy"
    tgt_vocab_size, tgt_vocab_file = check_vocab(tgt_vocab_file,
                                                 "./",
                                                 pad="<pad>",
                                                 sos="<s>",
                                                 eos="</s>",
                                                 unk=UNK)
    model = SFNet(input_shape=(227, 227), cnn_model_path=cnn_model_path, tgt_vocab_size=tgt_vocab_size,
                  rnn_units=256, cnn_arch="alexnet")
    tgt_vocab_table = create_tgt_vocab_table(base_path + "/phoenix2014T.vocab.gloss")
    dataset = get_train_dataset(src_file, tgt_file, tgt_vocab_table)
    cnt = 0
    for data in dataset.take(100):
        loss = model(data, training=True)
        print(loss)
Example #16
def extend_hparams(hparams):
    """Extend training hparams."""
    # Sanity checks
    if hparams.encoder_type == "bi" and hparams.num_layers % 2 != 0:
        raise ValueError("For bi, num_layers %d should be even" %
                         hparams.num_layers)
    if hparams.top_responses < 1:
        raise ValueError("We need to choose from the top responses. %s is not \
                         a valid value" % hparams.top_responses)

    # flags
    utils.print_out("# hparams:")
    utils.print_out("  src=%s" % hparams.src)
    utils.print_out("  tgt=%s" % hparams.tgt)
    utils.print_out("  train_prefix=%s" % hparams.train_prefix)
    utils.print_out("  dev_prefix=%s" % hparams.dev_prefix)
    utils.print_out("  test_prefix=%s" % hparams.test_prefix)
    utils.print_out("  out_dir=%s" % hparams.out_dir)

    # Set num_residual_layers
    if hparams.residual:
        if hparams.num_layers > 1:
            num_residual_layers = hparams.num_layers - 1
        else:
            num_residual_layers = 0
        if hparams.context_num_layers > 1:
            context_num_residual_layers = hparams.context_num_layers - 1
        else:
            context_num_residual_layers = 0
    else:
        num_residual_layers = 0
        context_num_residual_layers = 0

    hparams.add_hparam("num_residual_layers", num_residual_layers)
    hparams.add_hparam("context_num_residual_layers",
                       context_num_residual_layers)

    # Vocab
    if hparams.vocab_file:
        vocab_size, vocab_file = vocab_utils.check_vocab(
            hparams.vocab_file,
            out_dir=hparams.out_dir,
            sos=hparams.sos,
            eos=hparams.eos,
            unk=vocab_utils.UNK)
    else:
        raise ValueError(
            "A vocab_file must be provided by using --vocab_file=<vocab path>")
    # Add the vocab size and override the vocab_file
    hparams.add_hparam("vocab_size", vocab_size)
    hparams.parse("vocab_file=%s" % vocab_file)

    # Check out_dir
    if not tf.gfile.Exists(hparams.out_dir):
        utils.print_out("# Creating output directory %s ..." % hparams.out_dir)
        tf.gfile.MakeDirs(hparams.out_dir)

    # Evaluation
    for metric in hparams.metrics:
        hparams.add_hparam("best_" + metric, 0)  # larger is better
        best_metric_dir = os.path.join(hparams.out_dir, "best_" + metric)
        hparams.add_hparam("best_" + metric + "_dir", best_metric_dir)
        tf.gfile.MakeDirs(best_metric_dir)

    return hparams
Example #17
src_file = "%s.%s" % (args.train_prefix, args.src)
tgt_file = "%s.%s" % (args.train_prefix, args.tgt)

src_vocab_file = args.vocab_prefix + "." + args.src
tgt_vocab_file = args.vocab_prefix + "." + args.tgt

#src_embed_file = args.embed_prefix + "." + args.src
#tgt_embed_file = args.embed_prefix + "." + args.tgt
src_embed_file = ""
tgt_embed_file = ""

src_vocab_size, src_vocab_file = vocab_utils.check_vocab(
	src_vocab_file,
	args.out_dir,
	check_special_token=args.check_special_token,
	sos=args.sos,
	eos=args.eos,
	unk=vocab_utils.UNK)

tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab(
	tgt_vocab_file,
	args.out_dir,
	check_special_token=args.check_special_token,
	sos=args.sos,
	eos=args.eos,
	unk=vocab_utils.UNK)

#graph = tf.Graph()
scope="train"
#with graph.as_default(), tf.container(scope):
Example #18
def extend_hparams(hparams):
    """Add new arguments to hparams."""
    # Sanity checks
    if hparams.encoder_type == "bi" and hparams.num_encoder_layers % 2 != 0:
        raise ValueError("For bi, num_encoder_layers %d should be even" %
                         hparams.num_encoder_layers)

    if hparams.subword_option and hparams.subword_option not in ["spm", "bpe"]:
        raise ValueError("subword option must be either spm, or bpe")
    if hparams.infer_mode == "beam_search" and hparams.beam_width <= 0:
        raise ValueError(
            "beam_width must greater than 0 when using beam_search"
            "decoder.")
    if hparams.infer_mode == "sample" and hparams.sampling_temperature <= 0.0:
        raise ValueError(
            "sampling_temperature must greater than 0.0 when using"
            "sample decoder.")

    # Different number of encoder / decoder layers
    assert hparams.num_encoder_layers and hparams.num_decoder_layers
    if hparams.num_encoder_layers != hparams.num_decoder_layers:
        hparams.pass_hidden_state = False
        utils.print_out(
            "Num encoder layer %d is different from num decoder layer"
            " %d, so set pass_hidden_state to False" %
            (hparams.num_encoder_layers, hparams.num_decoder_layers))

    # Set residual layers
    num_encoder_residual_layers = 0
    num_decoder_residual_layers = 0
    if hparams.residual:
        if hparams.num_encoder_layers > 1:
            num_encoder_residual_layers = hparams.num_encoder_layers - 1
        if hparams.num_decoder_layers > 1:
            num_decoder_residual_layers = hparams.num_decoder_layers - 1

    _add_argument(hparams, "num_encoder_residual_layers",
                  num_encoder_residual_layers)
    _add_argument(hparams, "num_decoder_residual_layers",
                  num_decoder_residual_layers)

    # Language modeling
    if getattr(hparams, "language_model", None):
        hparams.attention = ""
        hparams.attention_architecture = ""
        hparams.pass_hidden_state = False
        hparams.share_vocab = True
        hparams.src = hparams.tgt
        utils.print_out(
            "For language modeling, we turn off attention and "
            "pass_hidden_state; turn on share_vocab; set src to tgt.")

    ## Vocab
    # Get vocab file names first
    if hparams.vocab_prefix:
        src_vocab_file = hparams.vocab_prefix + "." + hparams.src
        tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt
    else:
        raise ValueError("hparams.vocab_prefix must be provided.")

    # Source vocab
    check_special_token = getattr(hparams, "check_special_token", True)
    src_vocab_size, src_vocab_file = vocab_utils.check_vocab(
        src_vocab_file,
        hparams.out_dir,
        check_special_token=check_special_token,
        sos=hparams.sos,
        eos=hparams.eos,
        unk=vocab_utils.UNK)

    # Target vocab
    if hparams.share_vocab:
        utils.print_out("  using source vocab for target")
        tgt_vocab_file = src_vocab_file
        tgt_vocab_size = src_vocab_size
    else:
        tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab(
            tgt_vocab_file,
            hparams.out_dir,
            check_special_token=check_special_token,
            sos=hparams.sos,
            eos=hparams.eos,
            unk=vocab_utils.UNK)
    _add_argument(hparams, "src_vocab_size", src_vocab_size)
    _add_argument(hparams, "tgt_vocab_size", tgt_vocab_size)
    _add_argument(hparams, "src_vocab_file", src_vocab_file)
    _add_argument(hparams, "tgt_vocab_file", tgt_vocab_file)

    # Num embedding partitions
    num_embeddings_partitions = getattr(hparams, "num_embeddings_partitions",
                                        0)
    _add_argument(hparams, "num_enc_emb_partitions", num_embeddings_partitions)
    _add_argument(hparams, "num_dec_emb_partitions", num_embeddings_partitions)

    # Pretrained Embeddings
    _add_argument(hparams, "src_embed_file", "")
    _add_argument(hparams, "tgt_embed_file", "")
    if getattr(hparams, "embed_prefix", None):
        src_embed_file = hparams.embed_prefix + "." + hparams.src
        tgt_embed_file = hparams.embed_prefix + "." + hparams.tgt

        if tf.gfile.Exists(src_embed_file):
            utils.print_out("  src_embed_file %s exist" % src_embed_file)
            hparams.src_embed_file = src_embed_file

            utils.print_out(
                "For pretrained embeddings, set num_enc_emb_partitions to 1")
            hparams.num_enc_emb_partitions = 1
        else:
            utils.print_out("  src_embed_file %s doesn't exist" %
                            src_embed_file)

        if tf.gfile.Exists(tgt_embed_file):
            utils.print_out("  tgt_embed_file %s exist" % tgt_embed_file)
            hparams.tgt_embed_file = tgt_embed_file

            utils.print_out(
                "For pretrained embeddings, set num_dec_emb_partitions to 1")
            hparams.num_dec_emb_partitions = 1
        else:
            utils.print_out("  tgt_embed_file %s doesn't exist" %
                            tgt_embed_file)

    # Evaluation
    for metric in hparams.metrics:
        best_metric_dir = os.path.join(hparams.out_dir, "best_" + metric)
        tf.gfile.MakeDirs(best_metric_dir)
        _add_argument(hparams, "best_" + metric, 0, update=False)
        _add_argument(hparams, "best_" + metric + "_dir", best_metric_dir)

        if getattr(hparams, "avg_ckpts", None):
            best_metric_dir = os.path.join(hparams.out_dir,
                                           "avg_best_" + metric)
            tf.gfile.MakeDirs(best_metric_dir)
            _add_argument(hparams, "avg_best_" + metric, 0, update=False)
            _add_argument(hparams, "avg_best_" + metric + "_dir",
                          best_metric_dir)

    return hparams
Example #19
def create_or_load_hparams(out_dir, default_hparams):
    """
    Create hparams or load hparams from out_dir.
    """

    hparams = utils.load_hparams(out_dir)
    if not hparams:
        hparams = default_hparams

        hparams.add_hparam("best_bleu", 0)
        best_bleu_dir = os.path.join(out_dir, "best_bleu")
        hparams.add_hparam("best_bleu_dir", best_bleu_dir)
        os.makedirs(best_bleu_dir)
        hparams.add_hparam("avg_best_bleu", 0)
        best_bleu_dir = os.path.join(hparams.out_dir, "avg_best_bleu")
        hparams.add_hparam("avg_best_bleu_dir",
                           os.path.join(hparams.out_dir, "avg_best_bleu"))
        os.makedirs(best_bleu_dir)

        # Set num_train_steps
        train_src_file = "%s.%s" % (hparams.train_prefix, hparams.src)
        train_tgt_file = "%s.%s" % (hparams.train_prefix, hparams.tgt)
        with open(train_src_file, 'r', encoding='utf-8') as f:
            train_src_steps = len(f.readlines())
        with open(train_tgt_file, 'r', encoding='utf-8') as f:
            train_tgt_steps = len(f.readlines())
        hparams.add_hparam(
            "num_train_steps",
            min([train_src_steps, train_tgt_steps]) * hparams.epochs)

        # Set encoder/decoder layers
        hparams.add_hparam("num_encoder_layers", hparams.num_layers)
        hparams.add_hparam("num_decoder_layers", hparams.num_layers)

        # Set residual layers
        num_encoder_residual_layers = 0
        num_decoder_residual_layers = 0
        if hparams.num_encoder_layers > 1:
            num_encoder_residual_layers = hparams.num_encoder_layers - 1
        if hparams.num_decoder_layers > 1:
            num_decoder_residual_layers = hparams.num_decoder_layers - 1

        # The first unidirectional layer (after the bi-directional layer) in
        # the GNMT encoder can't have a residual connection because its input
        # is the concatenation of the fw_cell and bw_cell outputs.
        num_encoder_residual_layers = hparams.num_encoder_layers - 2

        # Compatible for GNMT models
        if hparams.num_encoder_layers == hparams.num_decoder_layers:
            num_decoder_residual_layers = num_encoder_residual_layers

        hparams.add_hparam("num_encoder_residual_layers",
                           num_encoder_residual_layers)
        hparams.add_hparam("num_decoder_residual_layers",
                           num_decoder_residual_layers)

        # Vocab
        # Get vocab file names first
        if hparams.vocab_prefix:
            src_vocab_file = hparams.vocab_prefix + "." + hparams.src
            tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt
        else:
            raise ValueError("hparams.vocab_prefix must be provided.")
        # Source vocab
        src_vocab_size, src_vocab_file = vocab_utils.check_vocab(
            src_vocab_file,
            hparams.out_dir,
            sos=hparams.sos,
            eos=hparams.eos,
            unk=vocab_utils.UNK)
        # Target vocab
        if hparams.share_vocab:
            utils.log("Using source vocab for target")
            tgt_vocab_file = src_vocab_file
            tgt_vocab_size = src_vocab_size
        else:
            tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab(
                tgt_vocab_file,
                hparams.out_dir,
                sos=hparams.sos,
                eos=hparams.eos,
                unk=vocab_utils.UNK)
        hparams.add_hparam("src_vocab_size", src_vocab_size)
        hparams.add_hparam("tgt_vocab_size", tgt_vocab_size)
        hparams.add_hparam("src_vocab_file", src_vocab_file)
        hparams.add_hparam("tgt_vocab_file", tgt_vocab_file)

        # Pretrained Embeddings:
        hparams.add_hparam("src_embed_file", "")
        hparams.add_hparam("tgt_embed_file", "")
        if hparams.embed_prefix:
            src_embed_file = hparams.embed_prefix + "." + hparams.src
            tgt_embed_file = hparams.embed_prefix + "." + hparams.tgt
            if os.path.exists(src_embed_file):
                hparams.src_embed_file = src_embed_file
            if os.path.exists(tgt_embed_file):
                hparams.tgt_embed_file = tgt_embed_file

    # Save HParams
    utils.save_hparams(out_dir, hparams)

    return hparams
Example #20
        learning_rate *= tf.minimum(1.0, step / warmup_steps)
        learning_rate *= tf.math.rsqrt(tf.maximum(step, warmup_steps))
        return learning_rate


config = FLAGS
FLAGS.output_dir = "./output_dir/checkpoints_alexnet_ctc"
FLAGS.best_output = "./output_dir/checkpoints_alexnet_ctc/best_bleu"

for arg in vars(FLAGS):
    logger.info("{}, {}".format(arg, getattr(FLAGS, arg)))


tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab(config.tgt_vocab_file,
                                                         "./",
                                                         sos="<s>",
                                                         eos="</s>",
                                                         unk=vocab_utils.UNK)

tgt_vocab_table = vocab_utils.create_tgt_vocab_table(config.tgt_vocab_file)
word2idx, idx2word = vocab_utils.create_tgt_dict(tgt_vocab_file)

# model = Model(rnn_units=config.rnn_units, tgt_vocab_size=tgt_vocab_size, tgt_emb_size=config.tgt_emb_size)
model = CTCModel(input_shape=config.input_shape, tgt_vocab_size=tgt_vocab_size, dropout=config.dropout,
                 rnn_units=FLAGS.rnn_units)


lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    config.learning_rate,
    decay_steps=config.decay_steps,
    decay_rate=0.96,