def extend_hparams(hparams): """Extend training hparams.""" # Sanity checks if hparams.encoder_type == "bi" and hparams.num_layers % 2 != 0: raise ValueError("For bi, num_layers %d should be even" % hparams.num_layers) if (hparams.attention_architecture in ["gnmt"] and hparams.num_layers < 2): raise ValueError("For gnmt attention architecture, " "num_layers %d should be >= 2" % hparams.num_layers) if hparams.subword_option and hparams.subword_option not in ["spm", "bpe"]: raise ValueError("subword option must be either spm, or bpe") # Flags utils.print_out("# hparams:") utils.print_out(" src=%s" % hparams.src) utils.print_out(" tgt=%s" % hparams.tgt) utils.print_out(" train_prefix=%s" % hparams.train_prefix) utils.print_out(" dev_prefix=%s" % hparams.dev_prefix) utils.print_out(" test_prefix=%s" % hparams.test_prefix) utils.print_out(" out_dir=%s" % hparams.out_dir) # Set num_residual_layers if hparams.residual and hparams.num_layers > 1: if hparams.encoder_type == "gnmt": # The first unidirectional layer (after the bi-directional layer) in # the GNMT encoder can't have residual connection due to the input is # the concatenation of fw_cell and bw_cell's outputs. num_residual_layers = hparams.num_layers - 2 else: num_residual_layers = hparams.num_layers - 1 else: num_residual_layers = 0 hparams.add_hparam("num_residual_layers", num_residual_layers) # Vocab vocab_size, vocab_file = vocab_utils.check_vocab( hparams.vocab_file, hparams.out_dir, check_special_token=hparams.check_special_token, sos=hparams.sos, eos=hparams.eos, unk=vocab_utils.UNK) hparams.add_hparam("vocab_size", vocab_size) # hparams.add_hparam("vocab_file", vocab_file) # Check out_dir if not tf.gfile.Exists(hparams.out_dir): utils.print_out("# Creating output directory %s ..." % hparams.out_dir) tf.gfile.MakeDirs(hparams.out_dir) # Evaluation for metric in hparams.metrics: hparams.add_hparam("best_" + metric, 0) # larger is better best_metric_dir = os.path.join(hparams.out_dir, "best_" + metric) hparams.add_hparam("best_" + metric + "_dir", best_metric_dir) tf.gfile.MakeDirs(best_metric_dir) return hparams
def create_train_model(hparams):
  train_file = hparams.train
  vocab_size, vocab_file = vocab_utils.check_vocab(hparams.vocab_file,
                                                   hparams.out_dir,
                                                   sos=hparams.sos,
                                                   eos=hparams.eos,
                                                   unk=vocab_utils.UNK)
  hparams.add_hparam("vocab_size", vocab_size)

  graph = tf.Graph()
  with graph.as_default(), tf.container("train"):
    vocab_table = lookup_ops.index_table_from_file(vocab_file, default_value=0)
    iterator = iterator_utils.get_iterator(train_file,
                                           vocab_table,
                                           batch_size=hparams.batch_size,
                                           sos=hparams.sos,
                                           eos=hparams.eos,
                                           src_max_len=hparams.src_max_len)
    model = rnn_model.Model(hparams,
                            mode=tf.contrib.learn.ModeKeys.TRAIN,
                            iterator=iterator,
                            vocab_table=vocab_table)
  return graph, model, iterator
def testCheckVocab(self):
  # Create a vocab file
  vocab_dir = os.path.join(tf.test.get_temp_dir(), "vocab_dir")
  os.makedirs(vocab_dir)
  vocab_file = os.path.join(vocab_dir, "vocab_file")
  vocab = ["a", "b", "c"]
  with codecs.getwriter("utf-8")(tf.gfile.GFile(vocab_file, "wb")) as f:
    for word in vocab:
      f.write("%s\n" % word)

  # Call vocab_utils
  out_dir = os.path.join(tf.test.get_temp_dir(), "out_dir")
  os.makedirs(out_dir)
  vocab_size, new_vocab_file = vocab_utils.check_vocab(
      vocab_file, out_dir)

  # Assert: we expect the code to add <unk>, <s>, </s> and
  # create a new vocab file
  self.assertEqual(len(vocab) + 3, vocab_size)
  self.assertEqual(os.path.join(out_dir, "vocab_file"), new_vocab_file)

  new_vocab = []
  with codecs.getreader("utf-8")(tf.gfile.GFile(new_vocab_file, "rb")) as f:
    for line in f:
      new_vocab.append(line.strip())
  self.assertEqual(
      [vocab_utils.UNK, vocab_utils.SOS, vocab_utils.EOS] + vocab, new_vocab)
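# A minimal sketch of the behavior the test above asserts for
# vocab_utils.check_vocab, written here only as an illustration (the real
# implementation lives in vocab_utils): if the vocab file does not already
# start with <unk>, <s>, </s>, a copy with those tokens prepended is written
# under out_dir, and the new size and path are returned. The constant names
# UNK/SOS/EOS mirror the ones referenced in the snippets; the function name
# check_vocab_sketch and the plain-file I/O are assumptions, not the
# library's actual code.
import codecs
import os

UNK, SOS, EOS = "<unk>", "<s>", "</s>"

def check_vocab_sketch(vocab_file, out_dir, sos=SOS, eos=EOS, unk=UNK):
  # Read the existing vocabulary, one token per line.
  with codecs.getreader("utf-8")(open(vocab_file, "rb")) as f:
    vocab = [line.strip() for line in f]
  # Prepend the special tokens if they are not already the first three entries,
  # and write the extended vocab to out_dir under the same file name.
  if vocab[:3] != [unk, sos, eos]:
    vocab = [unk, sos, eos] + vocab
    new_vocab_file = os.path.join(out_dir, os.path.basename(vocab_file))
    with codecs.getwriter("utf-8")(open(new_vocab_file, "wb")) as f:
      for word in vocab:
        f.write("%s\n" % word)
    vocab_file = new_vocab_file
  return len(vocab), vocab_file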
def extend_hparams(hparams): """Extend training hparams.""" # Sanity checks if hparams.encoder_type == "bi" and hparams.num_layers % 2 != 0: raise ValueError("For bi, num_layers %d should be even" % hparams.num_layers) if hparams.top_responses < 1: raise ValueError("We need to choose from the top responses. %s is not \ a valid value" % hparams.top_responses) # flags utils.print_out("# hparams:") utils.print_out(" src=%s" % hparams.src) utils.print_out(" tgt=%s" % hparams.tgt) utils.print_out(" train_prefix=%s" % hparams.train_prefix) utils.print_out(" dev_prefix=%s" % hparams.dev_prefix) utils.print_out(" test_prefix=%s" % hparams.test_prefix) utils.print_out(" out_dir=%s" % hparams.out_dir) # Set num_residual_layers if hparams.residual: if hparams.num_layers > 1: num_residual_layers = hparams.num_layers - 1 else: num_residual_layers = 0 if hparams.context_num_layers > 1: context_num_residual_layers = hparams.context_num_layers - 1 else: context_num_residual_layers = 0 else: num_residual_layers = 0 context_num_residual_layers = 0 hparams.add_hparam("num_residual_layers", num_residual_layers) hparams.add_hparam("context_num_residual_layers", context_num_residual_layers) # Vocab if hparams.vocab_file: vocab_size, vocab_file = vocab_utils.check_vocab(hparams.vocab_file, out_dir=hparams.out_dir, sos=hparams.sos, eos=hparams.eos, unk=vocab_utils.UNK) else: raise ValueError("A vocab_file must be provided by using --vocab_file=<vocab path>") # Add the vocab size and override the vocab_file hparams.add_hparam("vocab_size", vocab_size) hparams.parse("vocab_file=%s" % vocab_file) # Check out_dir if not tf.gfile.Exists(hparams.out_dir): utils.print_out("# Creating output directory %s ..." % hparams.out_dir) tf.gfile.MakeDirs(hparams.out_dir) # Evaluation for metric in hparams.metrics: hparams.add_hparam("best_" + metric, 0) # larger is better best_metric_dir = os.path.join(hparams.out_dir, "best_" + metric) hparams.add_hparam("best_" + metric + "_dir", best_metric_dir) tf.gfile.MakeDirs(best_metric_dir) return hparams
def main(_):
  ano_data_set = os.path.join(cfg.data_set, cfg.ano_data_set)
  vocab_file = os.path.join(ano_data_set, cfg.tgt_vocab_file)
  with tf.Graph().as_default():
    vocab_size, vocab_file = vocab_utils.check_vocab(vocab_file,
                                                     out_dir=cfg.out_dir,
                                                     sos=cfg.sos,
                                                     eos=cfg.eos,
                                                     unk=cfg.unk)
    tgt_vocab_table = vocab_utils.create_vocab_tables(vocab_file)
    reverse_tgt_vocab_table = vocab_utils.index_to_string_table_from_file(
        vocab_file, default_value=cfg.unk)
    tgt_sos_id = tf.cast(tgt_vocab_table.lookup(tf.constant(cfg.sos)), tf.int32)
    tgt_eos_id = tf.cast(tgt_vocab_table.lookup(tf.constant(cfg.eos)), tf.int32)
    iter, batch_input = get_iterator(cfg.vaild_tf_filename, tgt_vocab_table,
                                     tgt_sos_id, tgt_eos_id)
    lookUpTgt = reverse_tgt_vocab_table.lookup(
        tf.to_int64(batch_input.target_output))

    sess = tf.Session()
    sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
    sess.run(iter)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    step = 0
    try:
      while True:
        try:
          while not coord.should_stop():
            src, tgt_output, src_seq_len, tgt_seq_len = \
                sess.run([batch_input.source, lookUpTgt,
                          batch_input.source_sequence_length,
                          batch_input.target_sequence_length])
            if np.isnan(np.max(src)) or np.isnan(np.min(src)):
              print('got a NaN')
              exit(1)
            if np.any(np.less(src, 0.)):
              print('got a negative value')
              exit(1)
            print('run one')
            step += 1
        except tf.errors.OutOfRangeError:
          print('check finished')
          exit(1)
          sess.run(iter)
    except KeyboardInterrupt:
      print('interrupt')
    finally:
      coord.request_stop()
      coord.join(threads)
      sess.close()
def extend_hparams(hparams):
  hparams.add_hparam("input_emb_pretrain", hparams.input_emb_file is not None)

  vocab_size, vocab_path = vocab_utils.check_vocab(hparams.vocab_path,
                                                   hparams.out_dir,
                                                   unk=hparams.unk,
                                                   pad=hparams.pad)
  hparams.add_hparam("vocab_size", vocab_size)
  hparams.set_hparam("vocab_path", vocab_path)

  if not tf.gfile.Exists(hparams.out_dir):
    tf.gfile.MakeDirs(hparams.out_dir)

  return hparams
def extend_hparams(hparams): """Add new arguments to hparams.""" # Sanity checks if hparams.encoder_type == "bi" and hparams.num_encoder_layers % 2 != 0: raise ValueError("For bi, num_encoder_layers %d should be even" % hparams.num_encoder_layers) if (hparams.attention_architecture in ["gnmt"] and hparams.num_encoder_layers < 2): raise ValueError("For gnmt attention architecture, " "num_encoder_layers %d should be >= 2" % hparams.num_encoder_layers) if hparams.subword_option and hparams.subword_option not in ["spm", "bpe"]: raise ValueError("subword option must be either spm, or bpe") if hparams.infer_mode == "beam_search" and hparams.beam_width <= 0: raise ValueError("beam_width must greater than 0 when using beam_search" "decoder.") if hparams.infer_mode == "sample" and hparams.sampling_temperature <= 0.0: raise ValueError("sampling_temperature must greater than 0.0 when using" "sample decoder.") # Different number of encoder / decoder layers assert hparams.num_encoder_layers and hparams.num_decoder_layers if hparams.num_encoder_layers != hparams.num_decoder_layers: hparams.pass_hidden_state = False utils.print_out("Num encoder layer %d is different from num decoder layer" " %d, so set pass_hidden_state to False" % ( hparams.num_encoder_layers, hparams.num_decoder_layers)) ## Vocab # Get vocab file names first if hparams.vocab_prefix: src_vocab_file = hparams.vocab_prefix + "." + hparams.src tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt lbl_vocab_file = hparams.vocab_prefix + "." + hparams.lbl else: raise ValueError("hparams.vocab_prefix must be provided.") # Source vocab check_special_token = getattr(hparams, "check_special_token", True) src_vocab_size, src_vocab_file = vocab_utils.check_vocab( src_vocab_file, hparams.out_dir, check_special_token=check_special_token, sos=hparams.sos, eos=hparams.eos, unk=vocab_utils.UNK) # Target vocab if hparams.share_vocab: utils.print_out(" using source vocab for target") tgt_vocab_file = src_vocab_file tgt_vocab_size = src_vocab_size else: tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab( tgt_vocab_file, hparams.out_dir, check_special_token=check_special_token, sos=hparams.sos, eos=hparams.eos, unk=vocab_utils.UNK) # Label vocab lbl_vocab_size, lbl_vocab_file = vocab_utils.check_vocab( lbl_vocab_file, hparams.out_dir, check_special_token=check_special_token, sos=hparams.sos, eos=hparams.eos, unk=vocab_utils.UNK) _add_argument(hparams, "src_vocab_size", src_vocab_size) _add_argument(hparams, "tgt_vocab_size", tgt_vocab_size) _add_argument(hparams, "lbl_vocab_size", lbl_vocab_size) _add_argument(hparams, "src_vocab_file", src_vocab_file) _add_argument(hparams, "tgt_vocab_file", tgt_vocab_file) _add_argument(hparams, "lbl_vocab_file", lbl_vocab_file) # Pretrained Embeddings _add_argument(hparams, "src_embed_file", "") _add_argument(hparams, "tgt_embed_file", "") if getattr(hparams, "embed_prefix", None): src_embed_file = hparams.embed_prefix + "." + hparams.src tgt_embed_file = hparams.embed_prefix + "." 
+ hparams.tgt if tf.gfile.Exists(src_embed_file): utils.print_out(" src_embed_file %s exist" % src_embed_file) hparams.src_embed_file = src_embed_file utils.print_out( "For pretrained embeddings, set num_enc_emb_partitions to 1") hparams.num_enc_emb_partitions = 1 else: utils.print_out(" src_embed_file %s doesn't exist" % src_embed_file) if tf.gfile.Exists(tgt_embed_file): utils.print_out(" tgt_embed_file %s exist" % tgt_embed_file) hparams.tgt_embed_file = tgt_embed_file utils.print_out( "For pretrained embeddings, set num_dec_emb_partitions to 1") hparams.num_dec_emb_partitions = 1 else: utils.print_out(" tgt_embed_file %s doesn't exist" % tgt_embed_file) # Evaluation for metric in hparams.metrics: best_metric_dir = os.path.join(hparams.out_dir, "best_" + metric) tf.gfile.MakeDirs(best_metric_dir) _add_argument(hparams, "best_" + metric, 0, update=False) _add_argument(hparams, "best_" + metric + "_dir", best_metric_dir) if getattr(hparams, "avg_ckpts", None): best_metric_dir = os.path.join(hparams.out_dir, "avg_best_" + metric) tf.gfile.MakeDirs(best_metric_dir) _add_argument(hparams, "avg_best_" + metric, 0, update=False) _add_argument(hparams, "avg_best_" + metric + "_dir", best_metric_dir) return hparams
def __init__(self, is_training=True, checkPoint_path=None):
  self.graph = tf.Graph()
  self.is_training = is_training
  with self.graph.as_default():
    ano_data_set = os.path.join(cfg.data_set, cfg.ano_data_set)
    vocab_file = os.path.join(ano_data_set, cfg.tgt_vocab_file)
    vocab_size, vocab_file = vocab_utils.check_vocab(
        vocab_file, out_dir=cfg.out_dir, sos=cfg.sos, eos=cfg.eos, unk=cfg.unk)
    self.tgt_vocab_table = vocab_utils.create_vocab_tables(vocab_file)
    self.reverse_tgt_vocab_table = vocab_utils.index_to_string_table_from_file(
        vocab_file, default_value=cfg.unk)
    self.tgt_sos_id = tf.cast(
        self.tgt_vocab_table.lookup(tf.constant(cfg.sos)), tf.int32)
    self.tgt_eos_id = tf.cast(
        self.tgt_vocab_table.lookup(tf.constant(cfg.eos)), tf.int32)

    if is_training:
      # train_src_dataset = tf.contrib.data.TextLineDataset(os.path.join(ano_data_set, cfg.train_src_dataset))
      # train_tgt_dataset = tf.contrib.data.TextLineDataset(os.path.join(ano_data_set, cfg.train_tgt_dataset))
      self.init_iter_train, self.iterator_train = get_iterator(
          cfg.train_tf_filename,
          self.tgt_vocab_table,
          self.tgt_sos_id,
          self.tgt_eos_id,
          augment=True)
      # vaild_src_dataset = tf.contrib.data.TextLineDataset(os.path.join(ano_data_set, cfg.vaild_src_dataset))
      # vaild_tgt_dataset = tf.contrib.data.TextLineDataset(os.path.join(ano_data_set, cfg.vaild_tgt_dataset))
      self.init_iter_vaild, self.iterator_vaild = get_iterator(
          cfg.vaild_tf_filename,
          self.tgt_vocab_table,
          self.tgt_sos_id,
          self.tgt_eos_id)
    else:
      self.source = tf.placeholder(tf.float32, (None, None), name='source')
      batch_source = tf.expand_dims(tf.expand_dims(self.source, axis=0), axis=-1)
      iterator_source = normalize_input_img(batch_source)
      self.source_sequence_length = tf.constant(
          tf.shape(iterator_source)[2], tf.int32)
      self.iterator = BatchedInput(
          source=iterator_source,
          target_input=None,
          target_output=None,
          source_sequence_length=self.source_sequence_length,
          target_sequence_length=None)

    self.featureCNN = FeatureCNN()
    self.gru_att_cov = GRU_Att_Cov(vocab_size)  # vocabulary size

    if is_training:
      if cfg.outer_batch_size:
        outer_loss = 0
        with tf.variable_scope('outer_batch_size') as scope:
          for i in range(cfg.outer_batch_size):
            if i > 0:
              scope.reuse_variables()
            self.cnn_out_train = self.featureCNN(
                self.iterator_train.source, True, False)
            self.logits_train, _, self.attn_dists_train = self.gru_att_cov(
                self.cnn_out_train, self.iterator_train, True, self.tgt_sos_id)
            outer_loss += self._loss(self.logits_train, self.iterator_train)
        self.loss_train = outer_loss / cfg.outer_batch_size
      else:
        self.cnn_out_train = self.featureCNN(
            self.iterator_train.source, True, False)
        self.logits_train, _, self.attn_dists_train = self.gru_att_cov(
            self.cnn_out_train, self.iterator_train, True, self.tgt_sos_id)
        self.loss_train = self._loss(self.logits_train, self.iterator_train)

      self.global_step = tf.Variable(0, name='global_step', trainable=False)
      self.learning_rate = tf.train.exponential_decay(
          cfg.startLr, self.global_step, cfg.decay_steps, cfg.decay_rate)
      optimizer = tf.train.AdadeltaOptimizer(self.learning_rate)
      self.train_op = optimizer.minimize(
          self.loss_train, global_step=self.global_step)

      self.cnn_out_vaild = self.featureCNN(self.iterator_vaild.source, True)
      self.logits_vaild, _, _ = self.gru_att_cov(
          self.cnn_out_vaild, self.iterator_vaild, True, self.tgt_sos_id)
      self.loss_vaild = self._loss(self.logits_vaild, self.iterator_vaild)

      self.cnn_out_vaild_infer = self.featureCNN(
          self.iterator_vaild.source, False)
      _, self.infer_indes_vaild, self.infer_attn_dists_vaild = self.gru_att_cov(
          self.cnn_out_vaild_infer, self.iterator_vaild, False, self.tgt_sos_id)
      self.infer_lookUpTgt_vaild = self.reverse_tgt_vocab_table.lookup(
          tf.to_int64(self.infer_indes_vaild))
      self.accuracy_vaild = self._acc(
          self.infer_indes_vaild, self.iterator_vaild.target_output)
      self.train_lookUpTgt_vaild = self.reverse_tgt_vocab_table.lookup(
          tf.to_int64(self.iterator_vaild.target_output))

      self.train_summary, self.vaild_summary = self._summary()
    else:
      self.cnn_out = self.featureCNN(self.iterator.source, is_training)
      _, self.infer_indes, self.infer_attn_dists = self.gru_att_cov(
          self.cnn_out, self.iterator, False, self.tgt_sos_id)
      self.infer_lookUpTgt = self.reverse_tgt_vocab_table.lookup(
          tf.to_int64(self.infer_indes))

    self.init = [
        tf.global_variables_initializer(),
        tf.tables_initializer()
    ]
    self.saver = tf.train.Saver()
    self.sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
    if not is_training:
      self.sess.run(self.init)
      self.saver.restore(self.sess, checkPoint_path)
def extend_hparams(hparams):
  """Extend training hparams."""
  # Sanity checks
  if hparams.encoder_type == "bi" and hparams.num_layers % 2 != 0:
    raise ValueError("For bi, num_layers %d should be even" %
                     hparams.num_layers)
  if (hparams.attention_architecture in ["gnmt"] and
      hparams.num_layers < 2):
    raise ValueError("For gnmt attention architecture, "
                     "num_layers %d should be >= 2" % hparams.num_layers)

  # Flags
  utils.print_out("# hparams:")
  utils.print_out(" src=%s" % hparams.src)
  utils.print_out(" tgt=%s" % hparams.tgt)
  utils.print_out(" train_prefix=%s" % hparams.train_prefix)
  utils.print_out(" dev_prefix=%s" % hparams.dev_prefix)
  utils.print_out(" test_prefix=%s" % hparams.test_prefix)
  utils.print_out(" out_dir=%s" % hparams.out_dir)

  # Set num_residual_layers
  if hparams.residual and hparams.num_layers > 1:
    if hparams.encoder_type == "gnmt":
      # The first unidirectional layer (after the bi-directional layer) in
      # the GNMT encoder can't have a residual connection because its input
      # is the concatenation of fw_cell and bw_cell's outputs.
      num_residual_layers = hparams.num_layers - 2
    else:
      num_residual_layers = hparams.num_layers - 1
  else:
    num_residual_layers = 0
  hparams.add_hparam("num_residual_layers", num_residual_layers)

  ## Vocab
  # Get vocab file names first
  if hparams.vocab_prefix:
    src_vocab_file = hparams.vocab_prefix + "." + hparams.src
    tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt
    # src_vocab_file = hparams.vocab_prefix
    # tgt_vocab_file = hparams.vocab_prefix
  else:
    raise ValueError("hparams.vocab_prefix must be provided.")

  # Source vocab
  src_vocab_size, src_vocab_file = vocab_utils.check_vocab(
      src_vocab_file,
      hparams.out_dir,
      sos=hparams.sos,
      eos=hparams.eos,
      unk=vocab_utils.UNK)

  # Target vocab
  if hparams.share_vocab:
    utils.print_out(" using source vocab for target")
    tgt_vocab_file = src_vocab_file
    tgt_vocab_size = src_vocab_size
  else:
    tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab(
        tgt_vocab_file,
        hparams.out_dir,
        sos=hparams.sos,
        eos=hparams.eos,
        unk=vocab_utils.UNK)
  hparams.add_hparam("src_vocab_size", src_vocab_size)
  hparams.add_hparam("tgt_vocab_size", tgt_vocab_size)
  hparams.add_hparam("src_vocab_file", src_vocab_file)
  hparams.add_hparam("tgt_vocab_file", tgt_vocab_file)

  # Check out_dir
  if not tf.gfile.Exists(hparams.out_dir):
    utils.print_out("# Creating output directory %s ..." % hparams.out_dir)
    tf.gfile.MakeDirs(hparams.out_dir)

  # Evaluation
  for metric in hparams.metrics:
    hparams.add_hparam("best_" + metric, 0)  # larger is better
    best_metric_dir = os.path.join(hparams.out_dir, "best_" + metric)
    hparams.add_hparam("best_" + metric + "_dir", best_metric_dir)
    tf.gfile.MakeDirs(best_metric_dir)

  return hparams
def extend_hparams(hparams, source_vocab_path, target_vocab_path):
  """Extends the set of hyperparameters."""
  if hparams.encoder_type == "bi" and hparams.num_layers % 2 != 0:
    raise ValueError("For bi, num_layers %d should be even" %
                     hparams.num_layers)
  if (hparams.attention_architecture in ["gnmt"] and
      hparams.num_layers < 2):
    raise ValueError("For gnmt attention architecture, "
                     "num_layers %d should be >= 2" % hparams.num_layers)
  if hparams.subword_option not in [None, "spm", "bpe"]:
    raise ValueError("subword option must be either None, spm, or bpe")
  if hparams.bpe_delimiter is not None and hparams.bpe_delimiter != "@@":
    raise ValueError("BPE delimiter value must be '@@': %s" %
                     hparams.bpe_delimiter)
  if hparams.bpe_delimiter == "@@":
    # If bpe_delimiter is set, subword_option is automatically set to bpe.
    if hparams.subword_option == "spm":
      raise ValueError("Unable to set the subword option to spm "
                       "if bpe delimiter is set")
    else:
      hparams.subword_option = "bpe"

  # Flags
  utils.print_out("# hparams:")
  utils.print_out(" train_prefix=%s" % hparams.train_prefix)
  utils.print_out(" dev_prefix=%s" % hparams.dev_prefix)
  utils.print_out(" test_prefix=%s" % hparams.test_prefix)
  utils.print_out(" out_dir=%s" % hparams.out_dir)

  # Set num_residual_layers
  if hparams.residual and hparams.num_layers > 1:
    if hparams.encoder_type == "gnmt":
      # The first unidirectional layer (after the bi-directional layer) in
      # the GNMT encoder can't have a residual connection because its input
      # is the concatenation of fw_cell and bw_cell's outputs.
      num_residual_layers = hparams.num_layers - 2
    else:
      num_residual_layers = hparams.num_layers - 1
  else:
    num_residual_layers = 0
  hparams.add_hparam("num_residual_layers", num_residual_layers)

  src_vocab_file = source_vocab_path
  tgt_vocab_file = target_vocab_path

  # Source vocab
  src_vocab_size, src_vocab_file = vocab_utils.check_vocab(
      src_vocab_file,
      hparams.out_dir,
      check_special_token=hparams.check_special_token,
      sos=hparams.sos,
      eos=hparams.eos,
      unk=vocab_utils.UNK)

  # Target vocab
  if hparams.share_vocab:
    utils.print_out(" using source vocab for target")
    tgt_vocab_file = src_vocab_file
    tgt_vocab_size = src_vocab_size
  else:
    tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab(
        tgt_vocab_file,
        hparams.out_dir,
        check_special_token=hparams.check_special_token,
        sos=hparams.sos,
        eos=hparams.eos,
        unk=vocab_utils.UNK)
  hparams.add_hparam("src_vocab_size", src_vocab_size)
  hparams.add_hparam("tgt_vocab_size", tgt_vocab_size)
  hparams.add_hparam("src_vocab_file", src_vocab_file)
  hparams.add_hparam("tgt_vocab_file", tgt_vocab_file)

  return hparams