def create_emb_matric(hparam):
    """Create the encoder/decoder embedding matrices.

    :param hparam: hyperparameters carrying vocab paths, emb_size, share_vocab, dtype
    :return: (encoder_embedding, decoder_embedding); the same variable is
        returned twice when the vocabulary is shared.
    """
    src_size, _ = check_vocab(hparam.vocab_src)
    tgt_size, _ = check_vocab(hparam.vocab_tgt)
    emb_size = hparam.emb_size
    share_vocab = hparam.share_vocab
    if share_vocab:
        if src_size != tgt_size:
            raise ValueError(
                'cannot share vocab because src vocab size != tgt vocab size')
        emb_matric = tf.get_variable('embeding', shape=(src_size, emb_size),
                                     dtype=hparam.dtype)
        return (emb_matric, emb_matric)
    else:
        encode_matric = tf.get_variable('embeding/encoder',
                                        shape=(src_size, emb_size),
                                        dtype=hparam.dtype)
        decoder_matric = tf.get_variable('embeding/decoder',
                                         shape=(tgt_size, emb_size),
                                         dtype=hparam.dtype)
        return (encode_matric, decoder_matric)
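# A minimal usage sketch (not from the source): assuming the TF1-style graph API
# used above, the returned matrices are typically consumed with
# tf.nn.embedding_lookup. `hparam`, `src_ids` and `tgt_ids` are hypothetical
# placeholders for whatever the surrounding model provides (integer id tensors
# of shape [batch, time]).
encoder_matrix, decoder_matrix = create_emb_matric(hparam)
src_embedded = tf.nn.embedding_lookup(encoder_matrix, src_ids)  # [batch, time, emb_size]
tgt_embedded = tf.nn.embedding_lookup(decoder_matrix, tgt_ids)  # [batch, time, emb_size]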
def extend_hparams(hparams):
    """Add new arguments to hparams."""
    # Sanity checks
    if hparams.subword_option and hparams.subword_option not in ["spm", "bpe"]:
        raise ValueError("subword option must be either spm, or bpe")
    if hparams.infer_mode == "beam_search" and hparams.beam_width <= 0:
        raise ValueError("beam_width must be greater than 0 when using "
                         "beam_search decoder.")

    # Same number of encoder / decoder layers is assumed here
    assert hparams.num_encoder_layers == hparams.num_decoder_layers

    # The first unidirectional layer (after the bi-directional layer) in
    # the GNMT encoder can't have a residual connection because its input is
    # the concatenation of fw_cell and bw_cell's outputs.
    num_encoder_residual_layers = hparams.num_encoder_layers - 2
    num_decoder_residual_layers = num_encoder_residual_layers
    _add_argument(hparams, "num_encoder_residual_layers",
                  num_encoder_residual_layers)
    _add_argument(hparams, "num_decoder_residual_layers",
                  num_decoder_residual_layers)

    ## Vocab
    # Get vocab file names first
    if hparams.vocab_prefix:
        src_vocab_file = hparams.vocab_prefix + "." + hparams.src
        tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt
    else:
        raise ValueError("hparams.vocab_prefix must be provided.")

    # Source vocab
    src_vocab_size, src_vocab_file = vocab_utils.check_vocab(
        src_vocab_file,
        hparams.out_dir,
        check_special_token=hparams.check_special_token,
        sos=hparams.sos,
        eos=hparams.eos,
        unk=vocab_utils.UNK)

    # Target vocab (this variant always reuses the source vocab)
    utils.print_out(" using source vocab for target")
    tgt_vocab_file = src_vocab_file
    tgt_vocab_size = src_vocab_size

    _add_argument(hparams, "src_vocab_size", src_vocab_size)
    _add_argument(hparams, "tgt_vocab_size", tgt_vocab_size)
    _add_argument(hparams, "src_vocab_file", src_vocab_file)
    _add_argument(hparams, "tgt_vocab_file", tgt_vocab_file)

    # Num embedding partitions
    _add_argument(hparams, "num_enc_emb_partitions",
                  hparams.num_embeddings_partitions)
    _add_argument(hparams, "num_dec_emb_partitions",
                  hparams.num_embeddings_partitions)

    # Pretrained Embeddings
    _add_argument(hparams, "src_embed_file", "")
    _add_argument(hparams, "tgt_embed_file", "")

    return hparams
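# The _add_argument helper called above (and in the later extend_hparams
# variants) is not shown in this section. Below is a plausible sketch,
# consistent with how it is called here, including the optional `update` flag
# used further down; the actual helper in the source repo may differ.
def _add_argument(hparams, key, value, update=True):
    """Add `key` to hparams; overwrite an existing value only when update=True."""
    if hasattr(hparams, key):
        if update:
            setattr(hparams, key, value)
    else:
        hparams.add_hparam(key, value)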
def extend_hparams(hparams):
    """Extend training hparams."""
    # Sanity checks
    if hparams.encoder_type == "bi" and hparams.num_layers % 2 != 0:
        raise ValueError("For bi, num_layers %d should be even" %
                         hparams.num_layers)
    if hparams.attention_architecture in ["gnmt"] and hparams.num_layers < 2:
        raise ValueError("For gnmt attention architecture, "
                         "num_layers %d should be >= 2" % hparams.num_layers)

    # Flags
    utils.print_out("# hparams:")
    utils.print_out(" src=%s" % hparams.src)
    utils.print_out(" tgt=%s" % hparams.tgt)
    utils.print_out(" train_prefix=%s" % hparams.train_prefix)
    utils.print_out(" dev_prefix=%s" % hparams.dev_prefix)
    utils.print_out(" test_prefix=%s" % hparams.test_prefix)
    utils.print_out(" out_dir=%s" % hparams.out_dir)

    # Set num_residual_layers
    if hparams.residual and hparams.num_layers > 1:
        if hparams.encoder_type == "gnmt":
            # The first unidirectional layer (after the bi-directional layer) in
            # the GNMT encoder can't have a residual connection because its input
            # is the concatenation of fw_cell and bw_cell's outputs.
            num_residual_layers = hparams.num_layers - 2
        else:
            num_residual_layers = hparams.num_layers - 1
    else:
        num_residual_layers = 0
    hparams.add_hparam("num_residual_layers", num_residual_layers)

    ## Vocab
    # Get vocab file names first
    if hparams.vocab_prefix:
        tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt
    else:
        raise ValueError("hparams.vocab_prefix must be provided.")

    # Target Vocab
    tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab(
        tgt_vocab_file,
        hparams.out_dir,
        sos=hparams.sos,
        eos=hparams.eos,
        unk=vocab_utils.UNK)
    hparams.add_hparam("tgt_vocab_size", tgt_vocab_size)
    hparams.add_hparam("tgt_vocab_file", tgt_vocab_file)

    # Check out_dir
    if not tf.gfile.Exists(hparams.out_dir):
        utils.print_out("# Creating output directory %s ..." % hparams.out_dir)
        tf.gfile.MakeDirs(hparams.out_dir)

    # Evaluation
    for metric in hparams.metrics:
        hparams.add_hparam("best_" + metric, 0)  # larger is better
        best_metric_dir = os.path.join(hparams.out_dir, "best_" + metric)
        # Per-metric directory hparam
        hparams.add_hparam("best_" + metric + "_dir", best_metric_dir)
        tf.gfile.MakeDirs(best_metric_dir)
    return hparams
def testCheckVocab(self):
    # Create a vocab file
    vocab_dir = os.path.join(tf.test.get_temp_dir(), "vocab_dir")
    os.makedirs(vocab_dir)
    vocab_file = os.path.join(vocab_dir, "vocab_file")
    vocab = ["a", "b", "c"]
    with codecs.getwriter("utf-8")(tf.gfile.GFile(vocab_file, "wb")) as f:
        for word in vocab:
            f.write("%s\n" % word)

    # Call vocab_utils
    out_dir = os.path.join(tf.test.get_temp_dir(), "out_dir")
    os.makedirs(out_dir)
    vocab_size, new_vocab_file = vocab_utils.check_vocab(vocab_file, out_dir)

    # Assert: we expect the code to add <unk>, <s>, </s> and
    # create a new vocab file
    self.assertEqual(len(vocab) + 3, vocab_size)
    self.assertEqual(os.path.join(out_dir, "vocab_file"), new_vocab_file)

    new_vocab = []
    with codecs.getreader("utf-8")(tf.gfile.GFile(new_vocab_file, "rb")) as f:
        for line in f:
            new_vocab.append(line.strip())
    self.assertEqual(
        [vocab_utils.UNK, vocab_utils.SOS, vocab_utils.EOS] + vocab, new_vocab)
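# The test above pins down check_vocab's contract: prepend the <unk>/<s>/</s>
# tokens when they are not already the first entries, write the rewritten
# vocab into out_dir, and return (vocab_size, vocab_file). Below is a minimal
# illustrative sketch of that behavior using plain file I/O instead of the
# repo's tf.gfile/codecs helpers; check_vocab_sketch is a hypothetical name,
# not the library function itself.
import os

def check_vocab_sketch(vocab_file, out_dir, unk="<unk>", sos="<s>", eos="</s>"):
    with open(vocab_file, encoding="utf-8") as f:
        vocab = [line.strip() for line in f]
    if vocab[:3] != [unk, sos, eos]:
        # Prepend the special tokens and write the new file into out_dir.
        vocab = [unk, sos, eos] + vocab
        vocab_file = os.path.join(out_dir, os.path.basename(vocab_file))
        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write("\n".join(vocab) + "\n")
    return len(vocab), vocab_file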
def run_main(unused_argv):
    """Run main."""
    # Initialization, Vocab generation
    if not tf.gfile.Exists(params['out_dir']):
        utils.print_out("# Creating output directory %s ..." % params['out_dir'])
        tf.gfile.MakeDirs(params['out_dir'])

    char_vocab_file = params['enc_char_map_path']
    src_vocab_file = params['src_vocab_file']
    tgt_vocab_file = params['tgt_vocab_file']

    char_vocab_size, char_vocab_file = vocab_utils.check_char_vocab(
        char_vocab_file, params['out_dir'])
    src_vocab_size, src_vocab_file = vocab_utils.check_vocab(
        src_vocab_file,
        params['out_dir'],
        type='src',
        check_special_token=params['check_special_token'])
    tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab(
        tgt_vocab_file,
        params['out_dir'],
        type='tgt',
        check_special_token=params['check_special_token'])

    ## Train / Decode
    if params['mode'] == 'infer':
        # # Modification required #########
        # # Inference
        # trans_file = params['inference_output_file']
        # ckpt = params['ckpt']
        # if not ckpt:
        #     ckpt = tf.train.latest_checkpoint(out_dir)
        # inference_fn(ckpt, inference_input_file, trans_file, num_workers, jobid)
        #
        # # Evaluation
        # ref_file = params['inference_ref_file']
        # if ref_file and tf.gfile.Exists(trans_file):
        #     for metric in params['metrics']:
        #         score = evaluation_utils.evaluate(ref_file, trans_file, metric,
        #                                           params['subword_option'])
        #         utils.print_out(" %s: %.1f" % (metric, score))
        infer()
    elif params['mode'] == 'train':
        # Train
        train()
def extend_hparams(hparams):
    """Extend training hparams."""
    assert hparams.num_encoder_layers and hparams.num_decoder_layers

    # Flags
    utils.print_out("# hparams:")
    utils.print_out(" src_file=%s" % hparams.src_file)
    utils.print_out(" tgt_file=%s" % hparams.tgt_file)
    utils.print_out(" out_dir=%s" % hparams.out_dir)

    # Source vocab
    src_vocab_size, src_vocab_file = vocab_utils.check_vocab(
        hparams.src_vocab_file,
        hparams.out_dir,
        check_special_token=hparams.check_special_token,
        sos=hparams.sos,
        eos=hparams.eos,
        unk=vocab_utils.UNK)

    # Target vocab
    if hparams.share_vocab:
        utils.print_out(" using source vocab for target")
        tgt_vocab_file = src_vocab_file
        tgt_vocab_size = src_vocab_size
    else:
        tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab(
            hparams.tgt_vocab_file,
            hparams.out_dir,
            check_special_token=hparams.check_special_token,
            sos=hparams.sos,
            eos=hparams.eos,
            unk=vocab_utils.UNK)

    hparams.src_vocab_file = src_vocab_file
    hparams.tgt_vocab_file = tgt_vocab_file
    hparams.add_hparam("src_vocab_size", src_vocab_size)
    hparams.add_hparam("tgt_vocab_size", tgt_vocab_size)

    # Check out_dir
    if not tf.gfile.Exists(hparams.out_dir):
        utils.print_out("# Creating output directory %s ..." % hparams.out_dir)
        tf.gfile.MakeDirs(hparams.out_dir)

    return hparams
def load_config(filename):
    d = yaml.load(open(filename).read())
    c = namedtuple("config", d.keys())(**d)

    src_vocab_file = os.path.join(c.data_dir, c.vocab_prefix + "." + c.src)
    src_vocab_file, src_vocab_size = vocab_utils.check_vocab(
        src_vocab_file, c.data_dir, c.sos, c.eos, c.unk)
    c = c._replace(src_vocab_size=src_vocab_size)

    if not c.share_vocab:
        tgt_vocab_file = os.path.join(c.data_dir, c.vocab_prefix + "." + c.tgt)
        tgt_vocab_file, tgt_vocab_size = vocab_utils.check_vocab(
            tgt_vocab_file, c.data_dir, c.sos, c.eos, c.unk)
        c = c._replace(tgt_vocab_size=tgt_vocab_size)

    if not os.path.exists(c.out_dir):
        os.makedirs(c.out_dir)
    return c
def _set_commom_param(self, hparam):
    self._batch = tf.shape(self._batchInput.src)[0]
    self.C, _ = check_vocab(hparam.vocab_tgt)
    self.SOS = hparam.SOS
    self.EOS = hparam.EOS
    self._subword = hparam.subword_option
    if self.mode != 'infer':
        self._predict_count = tf.reduce_sum(self._batchInput.tgt_seq_len)
        self._word_count = tf.reduce_sum(
            self._batchInput.src_seq_len) + tf.reduce_sum(
                self._batchInput.tgt_seq_len)
def main(unused_argv):
    make_dir(FLAGS.data_dir)

    train_src_file = FLAGS.train_prefix + "." + FLAGS.src
    train_tgt_file = FLAGS.train_prefix + "." + FLAGS.tgt
    dev_src_file = FLAGS.dev_prefix + "." + FLAGS.src
    dev_tgt_file = FLAGS.dev_prefix + "." + FLAGS.tgt

    if FLAGS.share_vocab:
        src_vocab_file = tgt_vocab_file = FLAGS.vocab_prefix
    else:
        src_vocab_file = FLAGS.vocab_prefix + "." + FLAGS.src
        tgt_vocab_file = FLAGS.vocab_prefix + "." + FLAGS.tgt

    # Source vocab
    src_vocab_size, src_vocab_file = vocab_utils.check_vocab(
        src_vocab_file, FLAGS.data_dir)

    # Target vocab
    if FLAGS.share_vocab:
        print("using source vocab for target")
        tgt_vocab_file = src_vocab_file
        tgt_vocab_size = src_vocab_size
    else:
        tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab(
            tgt_vocab_file, FLAGS.data_dir, unk=vocab_utils.UNK)

    tf.logging.info("Encoding files and saving data")
    vocab_helper = vocab_utils.VocabHelper(
        src_vocab_file, tgt_vocab_file, FLAGS.share_vocab)
    train_tfrecord_files = encode_and_save_files(
        vocab_helper, FLAGS.data_dir, train_src_file, train_tgt_file,
        _TRAIN_TAG, _TRAIN_SHARDS)
    encode_and_save_files(vocab_helper, FLAGS.data_dir, dev_src_file,
                          dev_tgt_file, _DEV_TAG, _DEV_SHARDS)

    for fname in train_tfrecord_files:
        shuffle_records(fname)
def prepare_dataset(flags):
    """Generate the preprocessed dataset."""
    src_file = "%s.%s" % (flags.data_dir + flags.train_prefix, flags.src)
    tgt_file = "%s.%s" % (flags.data_dir + flags.train_prefix, flags.tgt)
    vocab_file = flags.data_dir + flags.vocab_prefix
    _, vocab_file = vocab_utils.check_vocab(vocab_file, flags.out_dir)
    out_file = flags.out_dir + "preprocessed_dataset"

    src_vocab_table, tgt_vocab_table = vocab_utils.create_vocab_tables(vocab_file)
    src_dataset = tf.data.TextLineDataset(src_file)
    tgt_dataset = tf.data.TextLineDataset(tgt_file)

    iterator = iterator_utils.get_iterator(
        src_dataset,
        tgt_dataset,
        src_vocab_table,
        tgt_vocab_table,
        batch_size=1,
        global_batch_size=1,
        sos=vocab_utils.SOS,
        eos=vocab_utils.EOS,
        random_seed=1,
        num_buckets=flags.num_buckets,
        src_max_len=flags.src_max_len,
        tgt_max_len=flags.tgt_max_len,
        filter_oversized_sequences=True,
        return_raw=True).make_initializable_iterator()

    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        sess.run(iterator.initializer)
        try:
            i = 0
            while True:
                with open(out_file + "_%d" % i, "wb") as f:
                    i += 1
                    for _ in range(100):
                        for j in sess.run(iterator.get_next()):
                            tf.logging.info(j)
                            f.write(bytearray(j))
        except tf.errors.OutOfRangeError:
            pass
def extend_hparams(hparams):
    """Extend training hparams."""
    hparams.add_hparam("input_emb_pretrain", hparams.input_emb_file is not None)

    # Check if vocab has the unk and pad symbols as first words. If not, create
    # a new vocab file with these symbols as the first two words.
    vocab_size, vocab_path = vocab_utils.check_vocab(
        hparams.vocab_path, hparams.out_dir, unk=hparams.unk, pad=hparams.pad)
    vocab, _ = vocab_utils.load_vocab(vocab_path)

    # Generate embeddings if the flag is set or the file is not present
    if hparams.create_new_embeddings or not os.path.isfile(hparams.input_emb_file):
        embedding.save_embedding(vocab, hparams.embedding_path,
                                 hparams.input_emb_file)

    hparams.add_hparam("vocab_size", vocab_size)
    hparams.set_hparam("vocab_path", vocab_path)

    if not tf.gfile.Exists(hparams.out_dir):
        tf.gfile.MakeDirs(hparams.out_dir)
    return hparams
def extend_hparams(hparams):
    """Add new arguments to hparams."""
    # Sanity checks
    if hparams.encoder_type == "bi" and hparams.num_encoder_layers % 2 != 0:
        raise ValueError("For bi, num_encoder_layers %d should be even" %
                         hparams.num_encoder_layers)
    if (hparams.attention_architecture in ["gnmt"] and
            hparams.num_encoder_layers < 2):
        raise ValueError("For gnmt attention architecture, "
                         "num_encoder_layers %d should be >= 2" %
                         hparams.num_encoder_layers)
    if hparams.subword_option and hparams.subword_option not in ["spm", "bpe"]:
        raise ValueError("subword option must be either spm, or bpe")
    if hparams.infer_mode == "beam_search" and hparams.beam_width <= 0:
        raise ValueError("beam_width must be greater than 0 when using "
                         "beam_search decoder.")
    if hparams.mode == "translate" and not hparams.translate_file:
        raise ValueError("--translate_file flag must be specified in translate mode")

    # Different number of encoder / decoder layers
    assert hparams.num_encoder_layers and hparams.num_decoder_layers
    if hparams.num_encoder_layers != hparams.num_decoder_layers:
        hparams.pass_hidden_state = False
        utils.print_out("Num encoder layer %d is different from num decoder layer"
                        " %d, so set pass_hidden_state to False" % (
                            hparams.num_encoder_layers,
                            hparams.num_decoder_layers))

    # Set residual layers
    num_encoder_residual_layers = 0
    num_decoder_residual_layers = 0
    if hparams.residual:
        if hparams.num_encoder_layers > 1:
            num_encoder_residual_layers = hparams.num_encoder_layers - 1
        if hparams.num_decoder_layers > 1:
            num_decoder_residual_layers = hparams.num_decoder_layers - 1
        if hparams.encoder_type == "gnmt":
            # The first unidirectional layer (after the bi-directional layer) in
            # the GNMT encoder can't have a residual connection because its input
            # is the concatenation of fw_cell and bw_cell's outputs.
            num_encoder_residual_layers = hparams.num_encoder_layers - 2
            # Compatible for GNMT models
            if hparams.num_encoder_layers == hparams.num_decoder_layers:
                num_decoder_residual_layers = num_encoder_residual_layers
    _add_argument(hparams, "num_encoder_residual_layers",
                  num_encoder_residual_layers)
    _add_argument(hparams, "num_decoder_residual_layers",
                  num_decoder_residual_layers)

    # Language modeling
    if hparams.language_model:
        hparams.attention = ""
        hparams.attention_architecture = ""
        hparams.pass_hidden_state = False
        hparams.share_vocab = True
        hparams.src = hparams.tgt
        utils.print_out("For language modeling, we turn off attention and "
                        "pass_hidden_state; turn on share_vocab; set src to tgt.")

    ## Vocab
    # Get vocab file names first
    if hparams.vocab_prefix:
        src_vocab_file = hparams.vocab_prefix + "." + hparams.src
        tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt
    else:
        raise ValueError("hparams.vocab_prefix must be provided.")

    # Source vocab
    src_vocab_size, src_vocab_file = vocab_utils.check_vocab(
        src_vocab_file,
        hparams.output_dir,
        check_special_token=hparams.check_special_token,
        sos=hparams.sos,
        eos=hparams.eos,
        unk=vocab_utils.UNK,
        pad_vocab=True)

    # Target vocab
    if hparams.share_vocab:
        utils.print_out(" using source vocab for target")
        tgt_vocab_file = src_vocab_file
        tgt_vocab_size = src_vocab_size
    else:
        tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab(
            tgt_vocab_file,
            hparams.output_dir,
            check_special_token=hparams.check_special_token,
            sos=hparams.sos,
            eos=hparams.eos,
            unk=vocab_utils.UNK)
    _add_argument(hparams, "src_vocab_size", src_vocab_size)
    _add_argument(hparams, "tgt_vocab_size", tgt_vocab_size)
    _add_argument(hparams, "src_vocab_file", src_vocab_file)
    _add_argument(hparams, "tgt_vocab_file", tgt_vocab_file)

    # Num embedding partitions
    _add_argument(hparams, "num_enc_emb_partitions",
                  hparams.num_embeddings_partitions)
    _add_argument(hparams, "num_dec_emb_partitions",
                  hparams.num_embeddings_partitions)

    # Pretrained Embeddings
    _add_argument(hparams, "src_embed_file", "")
    _add_argument(hparams, "tgt_embed_file", "")
    if hparams.embed_prefix:
        src_embed_file = hparams.embed_prefix + "." + hparams.src
        tgt_embed_file = hparams.embed_prefix + "." + hparams.tgt

        if tf.gfile.Exists(src_embed_file):
            utils.print_out(" src_embed_file %s exist" % src_embed_file)
            hparams.src_embed_file = src_embed_file
            utils.print_out(
                "For pretrained embeddings, set num_enc_emb_partitions to 1")
            hparams.num_enc_emb_partitions = 1
        else:
            utils.print_out(" src_embed_file %s doesn't exist" % src_embed_file)

        if tf.gfile.Exists(tgt_embed_file):
            utils.print_out(" tgt_embed_file %s exist" % tgt_embed_file)
            hparams.tgt_embed_file = tgt_embed_file
            utils.print_out(
                "For pretrained embeddings, set num_dec_emb_partitions to 1")
            hparams.num_dec_emb_partitions = 1
        else:
            utils.print_out(" tgt_embed_file %s doesn't exist" % tgt_embed_file)

    # Evaluation
    metric = "bleu"
    best_metric_dir = os.path.join(hparams.output_dir, "best_" + metric)
    tf.gfile.MakeDirs(best_metric_dir)
    _add_argument(hparams, "best_" + metric, 0, update=False)
    _add_argument(hparams, "best_" + metric + "_dir", best_metric_dir)

    return hparams
def extend_hparams(hparams):
    """Extend training hparams."""
    assert hparams.num_encoder_layers and hparams.num_decoder_layers
    if hparams.num_encoder_layers != hparams.num_decoder_layers:
        hparams.pass_hidden_state = False
        utils.print_out("Num encoder layer %d is different from num decoder layer"
                        " %d, so set pass_hidden_state to False" % (
                            hparams.num_encoder_layers,
                            hparams.num_decoder_layers))

    # Sanity checks
    if hparams.encoder_type == "bi" and hparams.num_encoder_layers % 2 != 0:
        raise ValueError("For bi, num_encoder_layers %d should be even" %
                         hparams.num_encoder_layers)
    if (hparams.attention_architecture in ["gnmt"] and
            hparams.num_encoder_layers < 2):
        raise ValueError("For gnmt attention architecture, "
                         "num_encoder_layers %d should be >= 2" %
                         hparams.num_encoder_layers)

    # Set residual layers
    num_encoder_residual_layers = 0
    num_decoder_residual_layers = 0
    if hparams.residual:
        if hparams.num_encoder_layers > 1:
            num_encoder_residual_layers = hparams.num_encoder_layers - 1
        if hparams.num_decoder_layers > 1:
            num_decoder_residual_layers = hparams.num_decoder_layers - 1
        if hparams.encoder_type == "gnmt":
            # The first unidirectional layer (after the bi-directional layer) in
            # the GNMT encoder can't have a residual connection because its input
            # is the concatenation of fw_cell and bw_cell's outputs.
            num_encoder_residual_layers = hparams.num_encoder_layers - 2
            # Compatible for GNMT models
            if hparams.num_encoder_layers == hparams.num_decoder_layers:
                num_decoder_residual_layers = num_encoder_residual_layers
    hparams.add_hparam("num_encoder_residual_layers", num_encoder_residual_layers)
    hparams.add_hparam("num_decoder_residual_layers", num_decoder_residual_layers)

    if hparams.subword_option and hparams.subword_option not in ["spm", "bpe"]:
        raise ValueError("subword option must be either spm, or bpe")

    # Flags
    utils.print_out("# hparams:")
    utils.print_out(" src=%s" % hparams.src)
    utils.print_out(" tgt=%s" % hparams.tgt)
    utils.print_out(" train_prefix=%s" % hparams.train_prefix)
    utils.print_out(" dev_prefix=%s" % hparams.dev_prefix)
    utils.print_out(" test_prefix=%s" % hparams.test_prefix)
    utils.print_out(" out_dir=%s" % hparams.out_dir)

    ## Vocab
    # Get vocab file names first
    if hparams.vocab_prefix:
        src_vocab_file = hparams.vocab_prefix + "." + hparams.src
        tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt
    else:
        raise ValueError("hparams.vocab_prefix must be provided.")

    # Source vocab
    src_vocab_size, src_vocab_file = vocab_utils.check_vocab(
        src_vocab_file,
        hparams.out_dir,
        check_special_token=hparams.check_special_token,
        sos=hparams.sos,
        eos=hparams.eos,
        unk=vocab_utils.UNK)

    # Target vocab
    if hparams.share_vocab:
        utils.print_out(" using source vocab for target")
        tgt_vocab_file = src_vocab_file
        tgt_vocab_size = src_vocab_size
    else:
        tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab(
            tgt_vocab_file,
            hparams.out_dir,
            check_special_token=hparams.check_special_token,
            sos=hparams.sos,
            eos=hparams.eos,
            unk=vocab_utils.UNK)
    hparams.add_hparam("src_vocab_size", src_vocab_size)
    hparams.add_hparam("tgt_vocab_size", tgt_vocab_size)
    hparams.add_hparam("src_vocab_file", src_vocab_file)
    hparams.add_hparam("tgt_vocab_file", tgt_vocab_file)

    # Pretrained Embeddings
    hparams.add_hparam("src_embed_file", "")
    hparams.add_hparam("tgt_embed_file", "")
    if hparams.embed_prefix:
        src_embed_file = hparams.embed_prefix + "." + hparams.src
        tgt_embed_file = hparams.embed_prefix + "." + hparams.tgt
        if tf.gfile.Exists(src_embed_file):
            hparams.src_embed_file = src_embed_file
        if tf.gfile.Exists(tgt_embed_file):
            hparams.tgt_embed_file = tgt_embed_file

    # Check out_dir
    if not tf.gfile.Exists(hparams.out_dir):
        utils.print_out("# Creating output directory %s ..." % hparams.out_dir)
        tf.gfile.MakeDirs(hparams.out_dir)

    # Evaluation
    for metric in hparams.metrics:
        hparams.add_hparam("best_" + metric, 0)  # larger is better
        best_metric_dir = os.path.join(hparams.out_dir, "best_" + metric)
        hparams.add_hparam("best_" + metric + "_dir", best_metric_dir)
        tf.gfile.MakeDirs(best_metric_dir)

        if hparams.avg_ckpts:
            hparams.add_hparam("avg_best_" + metric, 0)  # larger is better
            best_metric_dir = os.path.join(hparams.out_dir, "avg_best_" + metric)
            hparams.add_hparam("avg_best_" + metric + "_dir", best_metric_dir)
            tf.gfile.MakeDirs(best_metric_dir)

    return hparams
def extend_hparams(hparams):
    assert hparams.num_encoder_layers and hparams.num_decoder_layers
    if hparams.num_encoder_layers != hparams.num_decoder_layers:
        hparams.pass_hidden_state = False
        utils.print_out(
            "Num encoder layer %d is different from num decoder layer %d, "
            "so set pass_hidden_state to False" %
            (hparams.num_encoder_layers, hparams.num_decoder_layers))

    if hparams.encoder_type == "bi" and hparams.num_encoder_layers % 2 != 0:
        raise ValueError("For bi, num_encoder_layers %d should be even" %
                         hparams.num_encoder_layers)

    num_encoder_residual_layers = 0
    num_decoder_residual_layers = 0
    if hparams.residual:
        if hparams.num_encoder_layers > 1:
            num_encoder_residual_layers = hparams.num_encoder_layers - 1
        if hparams.num_decoder_layers > 1:
            num_decoder_residual_layers = hparams.num_decoder_layers - 1
    hparams.add_hparam("num_encoder_residual_layers", num_encoder_residual_layers)
    hparams.add_hparam("num_decoder_residual_layers", num_decoder_residual_layers)

    if hparams.subword_option and hparams.subword_option not in ["spm", "bpe"]:
        raise ValueError("subword option must be either spm, or bpe")

    utils.print_out("# hparams:")
    utils.print_out("src=%s" % hparams.src)
    utils.print_out("tgt=%s" % hparams.tgt)
    utils.print_out("train_prefix=%s" % hparams.train_prefix)
    utils.print_out("dev_prefix=%s" % hparams.dev_prefix)
    utils.print_out("test_prefix=%s" % hparams.test_prefix)
    utils.print_out("out_dir=%s" % hparams.out_dir)

    if hparams.vocab_prefix:
        src_vocab_file = hparams.vocab_prefix + "." + hparams.src
        tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt
    else:
        raise ValueError("hparams.vocab_prefix must be provided.")

    src_vocab_size, src_vocab_file = vocab_utils.check_vocab(
        src_vocab_file,
        hparams.out_dir,
        check_special_token=hparams.check_special_token,
        sos=hparams.sos,
        eos=hparams.eos,
        unk=vocab_utils.UNK)

    if hparams.share_vocab:
        utils.print_out("using source vocab for target")
        tgt_vocab_file = src_vocab_file
        tgt_vocab_size = src_vocab_size
    else:
        tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab(
            tgt_vocab_file,
            hparams.out_dir,
            check_special_token=hparams.check_special_token,
            sos=hparams.sos,
            eos=hparams.eos,
            unk=vocab_utils.UNK)
    hparams.add_hparam("src_vocab_size", src_vocab_size)
    hparams.add_hparam("tgt_vocab_size", tgt_vocab_size)
    hparams.add_hparam("src_vocab_file", src_vocab_file)
    hparams.add_hparam("tgt_vocab_file", tgt_vocab_file)

    hparams.add_hparam("src_embed_file", "")
    hparams.add_hparam("tgt_embed_file", "")
    if hparams.embed_prefix:
        src_embed_file = hparams.embed_prefix + "." + hparams.src
        tgt_embed_file = hparams.embed_prefix + "." + hparams.tgt
        if tf.gfile.Exists(src_embed_file):
            hparams.src_embed_file = src_embed_file
        if tf.gfile.Exists(tgt_embed_file):
            hparams.tgt_embed_file = tgt_embed_file

    if not tf.gfile.Exists(hparams.out_dir):
        utils.print_out("# Creating output directory %s ..." % hparams.out_dir)
        tf.gfile.MakeDirs(hparams.out_dir)

    for metric in hparams.metrics:
        hparams.add_hparam("best_" + metric, 0)
        best_metric_dir = os.path.join(hparams.out_dir, "best_" + metric)
        hparams.add_hparam("best_" + metric + "_dir", best_metric_dir)
        tf.gfile.MakeDirs(best_metric_dir)

        if hparams.avg_ckpts:
            hparams.add_hparam("avg_best_" + metric, 0)
            best_metric_dir = os.path.join(hparams.out_dir, "avg_best_" + metric)
            hparams.add_hparam("avg_best_" + metric + "_dir", best_metric_dir)
            tf.gfile.MakeDirs(best_metric_dir)

    return hparams
# Enable GPU memory growth before any GPUs are initialized.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

from utils.vocab_utils import create_tgt_vocab_table, check_vocab, UNK
from utils.dataset import get_train_dataset, get_infer_dataset

base_path = "/home/panxie/Documents/sign-language/nslt/Data"
src_file = base_path + "/phoenix2014T.test.sign"
tgt_file = base_path + "/phoenix2014T.test.gloss"
tgt_vocab_file = base_path + "/phoenix2014T.vocab.gloss"
# cnn_model_path = "/home/panxie/Documents/sign-language/nslt/BaseModel/ResNet_18.h5"
cnn_model_path = "/home/panxie/Documents/sign-language/nslt/BaseModel/bvlc_alexnet.npy"

tgt_vocab_size, tgt_vocab_file = check_vocab(tgt_vocab_file, "./",
                                             pad="<pad>", sos="<s>",
                                             eos="</s>", unk=UNK)

model = SFNet(input_shape=(227, 227), cnn_model_path=cnn_model_path,
              tgt_vocab_size=tgt_vocab_size, rnn_units=256,
              cnn_arch="alexnet")
tgt_vocab_table = create_tgt_vocab_table(base_path + "/phoenix2014T.vocab.gloss")
dataset = get_train_dataset(src_file, tgt_file, tgt_vocab_table)

cnt = 0
for data in dataset.take(100):
    loss = model(data, training=True)
    print(loss)
def extend_hparams(hparams):
    """Extend training hparams."""
    # Sanity checks
    if hparams.encoder_type == "bi" and hparams.num_layers % 2 != 0:
        raise ValueError("For bi, num_layers %d should be even" %
                         hparams.num_layers)
    if hparams.top_responses < 1:
        raise ValueError("We need to choose from the top responses. "
                         "%s is not a valid value" % hparams.top_responses)

    # Flags
    utils.print_out("# hparams:")
    utils.print_out(" src=%s" % hparams.src)
    utils.print_out(" tgt=%s" % hparams.tgt)
    utils.print_out(" train_prefix=%s" % hparams.train_prefix)
    utils.print_out(" dev_prefix=%s" % hparams.dev_prefix)
    utils.print_out(" test_prefix=%s" % hparams.test_prefix)
    utils.print_out(" out_dir=%s" % hparams.out_dir)

    # Set num_residual_layers
    if hparams.residual:
        if hparams.num_layers > 1:
            num_residual_layers = hparams.num_layers - 1
        else:
            num_residual_layers = 0
        if hparams.context_num_layers > 1:
            context_num_residual_layers = hparams.context_num_layers - 1
        else:
            context_num_residual_layers = 0
    else:
        num_residual_layers = 0
        context_num_residual_layers = 0
    hparams.add_hparam("num_residual_layers", num_residual_layers)
    hparams.add_hparam("context_num_residual_layers", context_num_residual_layers)

    # Vocab
    if hparams.vocab_file:
        vocab_size, vocab_file = vocab_utils.check_vocab(
            hparams.vocab_file,
            out_dir=hparams.out_dir,
            sos=hparams.sos,
            eos=hparams.eos,
            unk=vocab_utils.UNK)
    else:
        raise ValueError(
            "A vocab_file must be provided by using --vocab_file=<vocab path>")

    # Add the vocab size and override the vocab_file
    hparams.add_hparam("vocab_size", vocab_size)
    hparams.parse("vocab_file=%s" % vocab_file)

    # Check out_dir
    if not tf.gfile.Exists(hparams.out_dir):
        utils.print_out("# Creating output directory %s ..." % hparams.out_dir)
        tf.gfile.MakeDirs(hparams.out_dir)

    # Evaluation
    for metric in hparams.metrics:
        hparams.add_hparam("best_" + metric, 0)  # larger is better
        best_metric_dir = os.path.join(hparams.out_dir, "best_" + metric)
        hparams.add_hparam("best_" + metric + "_dir", best_metric_dir)
        tf.gfile.MakeDirs(best_metric_dir)
    return hparams
src_file = "%s.%s" % (args.train_prefix, args.src) tgt_file = "%s.%s" % (args.train_prefix, args.tgt) src_vocab_file = args.vocab_prefix + "." + args.src tgt_vocab_file = args.vocab_prefix + "." + args.tgt #src_embed_file = args.embed_prefix + "." + args.src #tgt_embed_file = args.embed_prefix + "." + args.tgt src_embed_file = "" tgt_embed_file = "" src_vocab_size, src_vocab_file = vocab_utils.check_vocab( src_vocab_file, args.out_dir, check_special_token=args.check_special_token, sos=args.sos, eos=args.eos, unk=vocab_utils.UNK) tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab( tgt_vocab_file, args.out_dir, check_special_token=args.check_special_token, sos=args.sos, eos=args.eos, unk=vocab_utils.UNK) #graph = tf.Graph() scope="train" #with graph.as_default(), tf.container(scope):
def extend_hparams(hparams):
    """Add new arguments to hparams."""
    # Sanity checks
    if hparams.encoder_type == "bi" and hparams.num_encoder_layers % 2 != 0:
        raise ValueError("For bi, num_encoder_layers %d should be even" %
                         hparams.num_encoder_layers)
    if hparams.subword_option and hparams.subword_option not in ["spm", "bpe"]:
        raise ValueError("subword option must be either spm, or bpe")
    if hparams.infer_mode == "beam_search" and hparams.beam_width <= 0:
        raise ValueError("beam_width must be greater than 0 when using "
                         "beam_search decoder.")
    if hparams.infer_mode == "sample" and hparams.sampling_temperature <= 0.0:
        raise ValueError("sampling_temperature must be greater than 0.0 when "
                         "using sample decoder.")

    # Different number of encoder / decoder layers
    assert hparams.num_encoder_layers and hparams.num_decoder_layers
    if hparams.num_encoder_layers != hparams.num_decoder_layers:
        hparams.pass_hidden_state = False
        utils.print_out("Num encoder layer %d is different from num decoder layer"
                        " %d, so set pass_hidden_state to False" % (
                            hparams.num_encoder_layers,
                            hparams.num_decoder_layers))

    # Set residual layers
    num_encoder_residual_layers = 0
    num_decoder_residual_layers = 0
    if hparams.residual:
        if hparams.num_encoder_layers > 1:
            num_encoder_residual_layers = hparams.num_encoder_layers - 1
        if hparams.num_decoder_layers > 1:
            num_decoder_residual_layers = hparams.num_decoder_layers - 1
    _add_argument(hparams, "num_encoder_residual_layers",
                  num_encoder_residual_layers)
    _add_argument(hparams, "num_decoder_residual_layers",
                  num_decoder_residual_layers)

    # Language modeling
    if getattr(hparams, "language_model", None):
        hparams.attention = ""
        hparams.attention_architecture = ""
        hparams.pass_hidden_state = False
        hparams.share_vocab = True
        hparams.src = hparams.tgt
        utils.print_out("For language modeling, we turn off attention and "
                        "pass_hidden_state; turn on share_vocab; set src to tgt.")

    ## Vocab
    # Get vocab file names first
    if hparams.vocab_prefix:
        src_vocab_file = hparams.vocab_prefix + "." + hparams.src
        tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt
    else:
        raise ValueError("hparams.vocab_prefix must be provided.")

    # Source vocab
    check_special_token = getattr(hparams, "check_special_token", True)
    src_vocab_size, src_vocab_file = vocab_utils.check_vocab(
        src_vocab_file,
        hparams.out_dir,
        check_special_token=check_special_token,
        sos=hparams.sos,
        eos=hparams.eos,
        unk=vocab_utils.UNK)

    # Target vocab
    if hparams.share_vocab:
        utils.print_out(" using source vocab for target")
        tgt_vocab_file = src_vocab_file
        tgt_vocab_size = src_vocab_size
    else:
        tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab(
            tgt_vocab_file,
            hparams.out_dir,
            check_special_token=check_special_token,
            sos=hparams.sos,
            eos=hparams.eos,
            unk=vocab_utils.UNK)
    _add_argument(hparams, "src_vocab_size", src_vocab_size)
    _add_argument(hparams, "tgt_vocab_size", tgt_vocab_size)
    _add_argument(hparams, "src_vocab_file", src_vocab_file)
    _add_argument(hparams, "tgt_vocab_file", tgt_vocab_file)

    # Num embedding partitions
    num_embeddings_partitions = getattr(hparams, "num_embeddings_partitions", 0)
    _add_argument(hparams, "num_enc_emb_partitions", num_embeddings_partitions)
    _add_argument(hparams, "num_dec_emb_partitions", num_embeddings_partitions)

    # Pretrained Embeddings
    _add_argument(hparams, "src_embed_file", "")
    _add_argument(hparams, "tgt_embed_file", "")
    if getattr(hparams, "embed_prefix", None):
        src_embed_file = hparams.embed_prefix + "." + hparams.src
        tgt_embed_file = hparams.embed_prefix + "." + hparams.tgt

        if tf.gfile.Exists(src_embed_file):
            utils.print_out(" src_embed_file %s exist" % src_embed_file)
            hparams.src_embed_file = src_embed_file
            utils.print_out(
                "For pretrained embeddings, set num_enc_emb_partitions to 1")
            hparams.num_enc_emb_partitions = 1
        else:
            utils.print_out(" src_embed_file %s doesn't exist" % src_embed_file)

        if tf.gfile.Exists(tgt_embed_file):
            utils.print_out(" tgt_embed_file %s exist" % tgt_embed_file)
            hparams.tgt_embed_file = tgt_embed_file
            utils.print_out(
                "For pretrained embeddings, set num_dec_emb_partitions to 1")
            hparams.num_dec_emb_partitions = 1
        else:
            utils.print_out(" tgt_embed_file %s doesn't exist" % tgt_embed_file)

    # Evaluation
    for metric in hparams.metrics:
        best_metric_dir = os.path.join(hparams.out_dir, "best_" + metric)
        tf.gfile.MakeDirs(best_metric_dir)
        _add_argument(hparams, "best_" + metric, 0, update=False)
        _add_argument(hparams, "best_" + metric + "_dir", best_metric_dir)

        if getattr(hparams, "avg_ckpts", None):
            best_metric_dir = os.path.join(hparams.out_dir, "avg_best_" + metric)
            tf.gfile.MakeDirs(best_metric_dir)
            _add_argument(hparams, "avg_best_" + metric, 0, update=False)
            _add_argument(hparams, "avg_best_" + metric + "_dir", best_metric_dir)

    return hparams
def create_or_load_hparams(out_dir, default_hparams):
    """Create hparams or load hparams from out_dir."""
    hparams = utils.load_hparams(out_dir)
    if not hparams:
        hparams = default_hparams

        hparams.add_hparam("best_bleu", 0)
        best_bleu_dir = os.path.join(out_dir, "best_bleu")
        hparams.add_hparam("best_bleu_dir", best_bleu_dir)
        os.makedirs(best_bleu_dir)

        hparams.add_hparam("avg_best_bleu", 0)
        best_bleu_dir = os.path.join(hparams.out_dir, "avg_best_bleu")
        hparams.add_hparam("avg_best_bleu_dir",
                           os.path.join(hparams.out_dir, "avg_best_bleu"))
        os.makedirs(best_bleu_dir)

        # Set num_train_steps
        train_src_file = "%s.%s" % (hparams.train_prefix, hparams.src)
        train_tgt_file = "%s.%s" % (hparams.train_prefix, hparams.tgt)
        with open(train_src_file, 'r', encoding='utf-8') as f:
            train_src_steps = len(f.readlines())
        with open(train_tgt_file, 'r', encoding='utf-8') as f:
            train_tgt_steps = len(f.readlines())
        hparams.add_hparam(
            "num_train_steps",
            min([train_src_steps, train_tgt_steps]) * hparams.epochs)

        # Set encoder/decoder layers
        hparams.add_hparam("num_encoder_layers", hparams.num_layers)
        hparams.add_hparam("num_decoder_layers", hparams.num_layers)

        # Set residual layers
        num_encoder_residual_layers = 0
        num_decoder_residual_layers = 0
        if hparams.num_encoder_layers > 1:
            num_encoder_residual_layers = hparams.num_encoder_layers - 1
        if hparams.num_decoder_layers > 1:
            num_decoder_residual_layers = hparams.num_decoder_layers - 1

        # The first unidirectional layer (after the bi-directional layer) in
        # the GNMT encoder can't have a residual connection because its input
        # is the concatenation of fw_cell and bw_cell's outputs.
        num_encoder_residual_layers = hparams.num_encoder_layers - 2

        # Compatible for GNMT models
        if hparams.num_encoder_layers == hparams.num_decoder_layers:
            num_decoder_residual_layers = num_encoder_residual_layers

        hparams.add_hparam("num_encoder_residual_layers",
                           num_encoder_residual_layers)
        hparams.add_hparam("num_decoder_residual_layers",
                           num_decoder_residual_layers)

        # Vocab
        # Get vocab file names first
        if hparams.vocab_prefix:
            src_vocab_file = hparams.vocab_prefix + "." + hparams.src
            tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt
        else:
            raise ValueError("hparams.vocab_prefix must be provided.")

        # Source vocab
        src_vocab_size, src_vocab_file = vocab_utils.check_vocab(
            src_vocab_file,
            hparams.out_dir,
            sos=hparams.sos,
            eos=hparams.eos,
            unk=vocab_utils.UNK)

        # Target vocab
        if hparams.share_vocab:
            utils.log("Using source vocab for target")
            tgt_vocab_file = src_vocab_file
            tgt_vocab_size = src_vocab_size
        else:
            tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab(
                tgt_vocab_file,
                hparams.out_dir,
                sos=hparams.sos,
                eos=hparams.eos,
                unk=vocab_utils.UNK)
        hparams.add_hparam("src_vocab_size", src_vocab_size)
        hparams.add_hparam("tgt_vocab_size", tgt_vocab_size)
        hparams.add_hparam("src_vocab_file", src_vocab_file)
        hparams.add_hparam("tgt_vocab_file", tgt_vocab_file)

        # Pretrained Embeddings
        hparams.add_hparam("src_embed_file", "")
        hparams.add_hparam("tgt_embed_file", "")
        if hparams.embed_prefix:
            src_embed_file = hparams.embed_prefix + "." + hparams.src
            tgt_embed_file = hparams.embed_prefix + "." + hparams.tgt
            if os.path.exists(src_embed_file):
                hparams.src_embed_file = src_embed_file
            if os.path.exists(tgt_embed_file):
                hparams.tgt_embed_file = tgt_embed_file

        # Save HParams
        utils.save_hparams(out_dir, hparams)

    return hparams
    learning_rate *= tf.minimum(1.0, step / warmup_steps)
    learning_rate *= tf.math.rsqrt(tf.maximum(step, warmup_steps))
    return learning_rate


config = FLAGS
FLAGS.output_dir = "./output_dir/checkpoints_alexnet_ctc"
FLAGS.best_output = "./output_dir/checkpoints_alexnet_ctc/best_bleu"
for arg in vars(FLAGS):
    logger.info("{}, {}".format(arg, getattr(FLAGS, arg)))

tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab(
    config.tgt_vocab_file, "./", sos="<s>", eos="</s>", unk=vocab_utils.UNK)
tgt_vocab_table = vocab_utils.create_tgt_vocab_table(config.tgt_vocab_file)
word2idx, idx2word = vocab_utils.create_tgt_dict(tgt_vocab_file)

# model = Model(rnn_units=config.rnn_units, tgt_vocab_size=tgt_vocab_size,
#               tgt_emb_size=config.tgt_emb_size)
model = CTCModel(input_shape=config.input_shape, tgt_vocab_size=tgt_vocab_size,
                 dropout=config.dropout, rnn_units=FLAGS.rnn_units)

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    config.learning_rate,
    decay_steps=config.decay_steps,
    decay_rate=0.96,