def run_transformer(flags_obj):
  """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.
  """
  # Add flag-defined parameters to params object
  params = PARAMS_MAP[flags_obj.param_set]
  params["data_dir"] = flags_obj.data_dir
  params["model_dir"] = flags_obj.model_dir
  params["num_parallel_calls"] = flags_obj.num_parallel_calls

  params["tpu"] = flags_obj.tpu
  params["use_tpu"] = bool(flags_obj.tpu)  # was a tpu specified.
  params["batch_size"] = flags_obj.batch_size or (
      params["default_batch_size_tpu"] if params["use_tpu"]
      else params["default_batch_size"])
  params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
  params["allow_ffn_pad"] = not params["use_tpu"]

  schedule_manager = schedule.Manager(
      train_steps=flags_obj.train_steps,
      steps_between_evals=flags_obj.steps_between_evals,
      train_epochs=flags_obj.train_epochs,
      epochs_between_evals=flags_obj.epochs_between_evals,
      default_train_epochs=DEFAULT_TRAIN_EPOCHS,
      batch_size=params["batch_size"],
      max_length=params["max_length"],
      use_tpu=params["use_tpu"],
      num_tpu_shards=flags_obj.num_tpu_shards)

  params["repeat_dataset"] = schedule_manager.repeat_dataset

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
      batch_size=schedule_manager.batch_size,  # for ExamplesPerSecondHook
      use_tpu=params["use_tpu"]  # Not all hooks can run with TPUs
  )
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="transformer",
      dataset_name="wmt_translate_ende",
      run_params=params,
      test_id=flags_obj.benchmark_test_id)

  # Train and evaluate transformer model
  estimator = construct_estimator(flags_obj, params, schedule_manager)
  run_loop(
      estimator=estimator,
      # Training arguments
      schedule_manager=schedule_manager,
      train_hooks=train_hooks,
      benchmark_logger=benchmark_logger,
      # BLEU calculation arguments
      bleu_source=flags_obj.bleu_source,
      bleu_ref=flags_obj.bleu_ref,
      bleu_threshold=flags_obj.stop_threshold,
      vocab_file_path=os.path.join(flags_obj.data_dir, flags_obj.vocab_file))
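# --- Usage sketch (not part of the snippet above) ---------------------------
# A minimal driver for run_transformer, assuming the surrounding module
# defines `define_transformer_flags()` and imports tensorflow as tf, as
# official/transformer/transformer_main.py in the TensorFlow models repo does.
# This wiring is a hedged sketch of that convention, not this fork's code.
from absl import app as absl_app
from absl import flags


def main(_):
  run_transformer(flags.FLAGS)


if __name__ == "__main__":
  tf.logging.set_verbosity(tf.logging.INFO)
  define_transformer_flags()  # assumed to register all flags read above
  absl_app.run(main)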
def test_mutual_exclusivity(self):
  with self.assertRaises(ValueError):
    schedule.Manager(
        train_steps=100, steps_between_evals=100, train_epochs=2,
        epochs_between_evals=1, default_train_epochs=None, batch_size=2048,
        max_length=256)
def test_step_basis_tpu(self):
  manager = schedule.Manager(
      train_steps=1000, steps_between_evals=100, train_epochs=None,
      epochs_between_evals=None, default_train_epochs=None, batch_size=2048,
      max_length=256, use_tpu=True)

  self.assertEqual(manager.single_iteration_train_steps, 100)
  # num_eval_examples / (batch_size / max_length) == 3000 / (2048 / 256)
  self.assertEqual(manager.single_iteration_eval_steps, 375)
  self.assertIsNone(manager.repeat_dataset)
def test_epoch_basis(self):
  manager = schedule.Manager(
      train_steps=None, steps_between_evals=None, train_epochs=10,
      epochs_between_evals=2, default_train_epochs=None, batch_size=2048,
      max_length=256)

  # For non-TPU runs, the estimator relies on dataset exhaustion
  self.assertIsNone(manager.single_iteration_train_steps)
  self.assertIsNone(manager.single_iteration_eval_steps)

  self.assertEqual(manager.repeat_dataset, 2)
def test_step_basis(self):
  manager = schedule.Manager(
      train_steps=1000, steps_between_evals=100, train_epochs=None,
      epochs_between_evals=None, default_train_epochs=None, batch_size=2048,
      max_length=256)

  self.assertEqual(manager.single_iteration_train_steps, 100)
  # Evaluation uses the full set
  self.assertIsNone(manager.single_iteration_eval_steps)
  self.assertIsNone(manager.repeat_dataset)
def test_epoch_basis_tpu(self):
  manager = schedule.Manager(
      train_steps=None, steps_between_evals=None, train_epochs=10,
      epochs_between_evals=2, default_train_epochs=None, batch_size=2048,
      max_length=256, use_tpu=True)

  self.assertEqual(
      manager.single_iteration_train_steps,
      schedule.NUM_EXAMPLES[tf.estimator.ModeKeys.TRAIN] * 2 // (2048 / 256))
  # num_eval_examples / (batch_size / max_length) == 3000 / (2048 / 256)
  self.assertEqual(manager.single_iteration_eval_steps, 375)
  self.assertEqual(manager.repeat_dataset, 2)
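# Worked check of the TPU schedule arithmetic asserted in the two TPU tests
# above. On TPU every batch is statically shaped, so a batch holds
# batch_size / max_length sequences; the numbers (3000 eval examples,
# batch_size=2048, max_length=256) are taken directly from the test comments.
sequences_per_batch = 2048 // 256          # batch_size / max_length == 8
eval_steps = 3000 // sequences_per_batch   # 3000 / 8 == 375
assert eval_steps == 375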
def run_transformer(flags_obj):
  """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.

  Returns:
    Dict of results of the run. Contains the keys `eval_results`,
    `train_hooks`, `bleu_cased`, and `bleu_uncased`. `train_hooks` is a list
    of the instances of hooks used during training.
  """
  num_gpus = flags_core.get_num_gpus(flags_obj)

  # Add flag-defined parameters to params object
  params = PARAMS_MAP[flags_obj.param_set]
  if num_gpus > 1:
    if flags_obj.param_set == "big":
      params = model_params.BIG_MULTI_GPU_PARAMS
    elif flags_obj.param_set == "base":
      params = model_params.BASE_MULTI_GPU_PARAMS

  params["data_dir"] = flags_obj.data_dir
  params["model_dir"] = flags_obj.model_dir
  params["num_parallel_calls"] = flags_obj.num_parallel_calls

  params["tpu"] = flags_obj.tpu
  params["vocab_file"] = flags_obj.vocab_file
  params["use_tpu"] = bool(flags_obj.tpu)  # was a tpu specified.
  params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
  params["allow_ffn_pad"] = not params["use_tpu"]

  params["max_length"] = flags_obj.max_length or params["max_length"]

  params["use_synthetic_data"] = flags_obj.use_synthetic_data

  # Set batch size parameter, which depends on the availability of
  # TPU and GPU, and distribution settings.
  params["batch_size"] = (flags_obj.batch_size or (
      params["default_batch_size_tpu"] if params["use_tpu"]
      else params["default_batch_size"]))

  total_batch_size = params["batch_size"]
  if not params["use_tpu"]:
    params["batch_size"] = distribution_utils.per_replica_batch_size(
        params["batch_size"], num_gpus)

  schedule_manager = schedule.Manager(
      train_steps=flags_obj.train_steps,
      steps_between_evals=flags_obj.steps_between_evals,
      train_epochs=flags_obj.train_epochs,
      epochs_between_evals=flags_obj.epochs_between_evals,
      default_train_epochs=DEFAULT_TRAIN_EPOCHS,
      batch_size=params["batch_size"],
      max_length=params["max_length"],
      use_tpu=params["use_tpu"],
      num_tpu_shards=flags_obj.num_tpu_shards)

  params["repeat_dataset"] = schedule_manager.repeat_dataset

  model_helpers.apply_clean(flags.FLAGS)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
      batch_size=total_batch_size,  # for ExamplesPerSecondHook
      use_tpu=params["use_tpu"]  # Not all hooks can run with TPUs
  )
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="transformer",
      dataset_name="wmt_translate_ende",
      run_params=params,
      test_id=flags_obj.benchmark_test_id)

  # Train and evaluate transformer model
  estimator = construct_estimator(flags_obj, params, schedule_manager)
  stats = run_loop(
      estimator=estimator,
      # Training arguments
      schedule_manager=schedule_manager,
      train_hooks=train_hooks,
      benchmark_logger=benchmark_logger,
      # BLEU calculation arguments
      bleu_source=flags_obj.bleu_source,
      bleu_ref=flags_obj.bleu_ref,
      bleu_threshold=flags_obj.stop_threshold,
      vocab_file=flags_obj.vocab_file)

  if flags_obj.export_dir and not params["use_tpu"]:
    serving_input_fn = export.build_tensor_serving_input_receiver_fn(
        shape=[None], dtype=tf.int64, batch_size=None)
    # Export saved model, and save the vocab file as an extra asset. The vocab
    # file is saved to allow consistent input encoding and output decoding.
    # (See the "Export trained model" section in the README for an example of
    # how to use the vocab file.)
    # Since the model itself does not use the vocab file, this file is saved
    # as an extra asset rather than a core asset.
    estimator.export_savedmodel(
        flags_obj.export_dir, serving_input_fn,
        assets_extra={"vocab.txt": flags_obj.vocab_file},
        strip_default_attrs=True)

  return stats
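# --- Consuming the export (hedged sketch, not part of the snippet above) ----
# One way to load the SavedModel written by export_savedmodel is the TF 1.x
# predictor API. export_savedmodel writes a timestamped subdirectory under
# export_dir; the path and the "input" feature key below are assumptions
# (the key depends on how export.build_tensor_serving_input_receiver_fn names
# its receiver tensor).
from tensorflow.contrib import predictor

predict_fn = predictor.from_saved_model(
    export_dir="/tmp/transformer_export/1543000000")  # hypothetical path
# One EOS-terminated sequence of subtoken ids (hypothetical values):
outputs = predict_fn({"input": [[6170, 25, 1]]})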
def run_transformer(flags_obj):
  """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.
  """
  num_gpus = flags_core.get_num_gpus(flags_obj)

  # Add flag-defined parameters to params object
  params = PARAMS_MAP[flags_obj.param_set]
  # Choose the network size variant: base or big.
  if num_gpus > 1:
    if flags_obj.param_set == "big":
      params = model_params.BIG_MULTI_GPU_PARAMS
    elif flags_obj.param_set == "base":
      params = model_params.BASE_MULTI_GPU_PARAMS

  params["data_dir"] = flags_obj.data_dir
  params["model_dir"] = flags_obj.model_dir
  params["num_parallel_calls"] = flags_obj.num_parallel_calls

  params["tpu"] = flags_obj.tpu
  params["use_tpu"] = bool(flags_obj.tpu)  # was a tpu specified.
  params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
  params["allow_ffn_pad"] = not params["use_tpu"]

  # "Synthetic data" means randomly generated stand-in data used for testing.
  params["use_synthetic_data"] = flags_obj.use_synthetic_data

  # Set batch size parameter, which depends on the availability of
  # TPU and GPU, and distribution settings.
  params["batch_size"] = (flags_obj.batch_size or (
      params["default_batch_size_tpu"] if params["use_tpu"]
      else params["default_batch_size"]))

  if not params["use_tpu"]:
    params["batch_size"] = distribution_utils.per_device_batch_size(
        params["batch_size"], num_gpus)

  # Instance that manages the training schedule: how many steps to train,
  # how many steps between evaluations, etc.
  schedule_manager = schedule.Manager(
      train_steps=flags_obj.train_steps,
      steps_between_evals=flags_obj.steps_between_evals,
      train_epochs=flags_obj.train_epochs,
      epochs_between_evals=flags_obj.epochs_between_evals,
      default_train_epochs=DEFAULT_TRAIN_EPOCHS,
      batch_size=params["batch_size"],
      max_length=params["max_length"],
      use_tpu=params["use_tpu"],
      num_tpu_shards=flags_obj.num_tpu_shards)

  params["repeat_dataset"] = schedule_manager.repeat_dataset

  # Cleans out data and previously saved models, but only if cleaning is
  # enabled via the flags.
  model_helpers.apply_clean(flags.FLAGS)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
      batch_size=schedule_manager.batch_size,  # for ExamplesPerSecondHook
      use_tpu=params["use_tpu"]  # Not all hooks can run with TPUs
  )
  benchmark_logger = logger.get_benchmark_logger()  # also used for logging
  benchmark_logger.log_run_info(
      model_name="transformer",
      dataset_name="wmt_translate_ende",
      run_params=params,
      test_id=flags_obj.benchmark_test_id)

  # Train and evaluate transformer model.
  # construct_estimator returns a tf.estimator.Estimator used to train and
  # evaluate the model.
  estimator = construct_estimator(flags_obj, params, schedule_manager)
  run_loop(
      estimator=estimator,  # the "estimator" driving training and evaluation
      # Training arguments
      schedule_manager=schedule_manager,  # how many steps, how often to eval
      train_hooks=train_hooks,  # for logging
      benchmark_logger=benchmark_logger,  # for logging
      # BLEU calculation arguments
      bleu_source=flags_obj.bleu_source,  # BLEU source and reference files,
      bleu_ref=flags_obj.bleu_ref,        # plus the stopping threshold
      bleu_threshold=flags_obj.stop_threshold,
      vocab_file=flags_obj.vocab_file)  # vocabulary file

  if flags_obj.export_dir:
    serving_input_fn = export.build_tensor_serving_input_receiver_fn(
        shape=[None], dtype=tf.int64, batch_size=None)
    # Export saved model, and save the vocab file as an extra asset. The vocab
    # file is saved to allow consistent input encoding and output decoding.
    # (See the "Export trained model" section in the README for an example of
    # how to use the vocab file.)
    # Since the model itself does not use the vocab file, this file is saved
    # as an extra asset rather than a core asset.
    estimator.export_savedmodel(
        flags_obj.export_dir, serving_input_fn,
        assets_extra={"vocab.txt": flags_obj.vocab_file})
def run_transformer(flags_obj):
  """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.
  """
  num_gpus = flags_core.get_num_gpus(flags_obj)

  # Add flag-defined parameters to params object
  params = PARAMS_MAP[flags_obj.param_set]
  if num_gpus > 1:
    if flags_obj.param_set == "big":
      params = model_params.BIG_MULTI_GPU_PARAMS
    elif flags_obj.param_set == "base":
      params = model_params.BASE_MULTI_GPU_PARAMS

  params["data_dir"] = flags_obj.data_dir
  params["model_dir"] = flags_obj.model_dir
  params["num_parallel_calls"] = flags_obj.num_parallel_calls

  params["tpu"] = flags_obj.tpu
  params["use_tpu"] = bool(flags_obj.tpu)  # was a tpu specified.
  params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
  params["allow_ffn_pad"] = not params["use_tpu"]

  params["use_synthetic_data"] = flags_obj.use_synthetic_data

  # Added below to override the learning rate, warmup steps, max_length, and
  # vocab_size from the parameter file; params now carries the flag values.
  params["learning_rate"] = flags_obj.learning_rate
  params["learning_rate_warmup_steps"] = flags_obj.learning_rate_warmup_steps
  params["max_length"] = flags_obj.max_length
  params["vocab_size"] = flags_obj.vocab_size

  # Added for selecting the learning rate scheme
  params["lr_scheme"] = flags_obj.lr_scheme
  params["warmup_init_lr"] = flags_obj.warmup_init_lr

  # Added for selecting the optimizer algorithm
  params["opt_alg"] = flags_obj.opt_alg

  # Added to provide optimizer parameters
  params["optimizer_sgd_momentum"] = flags_obj.optimizer_sgd_momentum
  params["optimizer_rms_decay"] = flags_obj.optimizer_rms_decay
  params["optimizer_rms_momentum"] = flags_obj.optimizer_rms_momentum
  params["optimizer_rms_epsilon"] = flags_obj.optimizer_rms_epsilon

  # Added to override the layer_postprocess_dropout value
  params["layer_postprocess_dropout"] = flags_obj.layer_postprocess_dropout

  # Set batch size parameter, which depends on the availability of
  # TPU and GPU, and distribution settings.
  params["batch_size"] = (flags_obj.batch_size or (
      params["default_batch_size_tpu"] if params["use_tpu"]
      else params["default_batch_size"]))

  # Commented out below to remove the distribution strategy
  """
  if not params["use_tpu"]:
    params["batch_size"] = distribution_utils.per_device_batch_size(
        params["batch_size"], num_gpus)
  """

  schedule_manager = schedule.Manager(
      train_steps=flags_obj.train_steps,
      steps_between_evals=flags_obj.steps_between_evals,
      train_epochs=flags_obj.train_epochs,
      epochs_between_evals=flags_obj.epochs_between_evals,
      default_train_epochs=DEFAULT_TRAIN_EPOCHS,
      batch_size=params["batch_size"],
      max_length=params["max_length"],
      use_tpu=params["use_tpu"],
      num_tpu_shards=flags_obj.num_tpu_shards)

  params["repeat_dataset"] = schedule_manager.repeat_dataset

  model_helpers.apply_clean(flags.FLAGS)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
      batch_size=schedule_manager.batch_size,  # for ExamplesPerSecondHook
      use_tpu=params["use_tpu"]  # Not all hooks can run with TPUs
  )

  # Added for Horovod: broadcast initial variables from rank 0 to all workers
  bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="transformer",
      dataset_name="wmt_translate_ende",
      run_params=params,
      test_id=flags_obj.benchmark_test_id)

  # Added for Horovod
  train_hooks.append(bcast_hook)

  # Train and evaluate transformer model
  estimator = construct_estimator(flags_obj, params, schedule_manager)
  run_loop(
      estimator=estimator,
      # Training arguments
      schedule_manager=schedule_manager,
      train_hooks=train_hooks,
      benchmark_logger=benchmark_logger,
      # BLEU calculation arguments
      bleu_source=flags_obj.bleu_source,
      bleu_ref=flags_obj.bleu_ref,
      bleu_threshold=flags_obj.stop_threshold,
      vocab_file=flags_obj.vocab_file)

  if flags_obj.export_dir and not params["use_tpu"]:
    serving_input_fn = export.build_tensor_serving_input_receiver_fn(
        shape=[None], dtype=tf.int64, batch_size=None)
    # Export saved model, and save the vocab file as an extra asset. The vocab
    # file is saved to allow consistent input encoding and output decoding.
    # (See the "Export trained model" section in the README for an example of
    # how to use the vocab file.)
    # Since the model itself does not use the vocab file, this file is saved
    # as an extra asset rather than a core asset.
    estimator.export_savedmodel(
        flags_obj.export_dir, serving_input_fn,
        assets_extra={"vocab.txt": flags_obj.vocab_file})
def run_transformer(flags_obj):
  """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.
  """
  print("run_transformer")
  num_gpus = flags_core.get_num_gpus(flags_obj)

  # Add flag-defined parameters to params object
  params = PARAMS_MAP[flags_obj.param_set]
  if num_gpus > 1:
    if flags_obj.param_set == "big":
      params = model_params.BIG_MULTI_GPU_PARAMS
    elif flags_obj.param_set == "base":
      params = model_params.BASE_MULTI_GPU_PARAMS

  params["data_dir"] = flags_obj.data_dir
  params["model_dir"] = flags_obj.model_dir
  params["num_parallel_calls"] = flags_obj.num_parallel_calls

  params["tpu"] = flags_obj.tpu
  params["static_batch"] = flags_obj.static_batch
  params["allow_ffn_pad"] = True
  params["use_synthetic_data"] = flags_obj.use_synthetic_data

  # Set batch size parameter, which depends on the availability of
  # TPU and GPU, and distribution settings.
  params["batch_size"] = (flags_obj.batch_size or
                          params["default_batch_size"])
  params["batch_size"] = distribution_utils.per_device_batch_size(
      params["batch_size"], num_gpus)

  schedule_manager = schedule.Manager(
      train_steps=flags_obj.train_steps,
      steps_between_evals=flags_obj.steps_between_evals,
      train_epochs=flags_obj.train_epochs,
      epochs_between_evals=flags_obj.epochs_between_evals,
      default_train_epochs=DEFAULT_TRAIN_EPOCHS,
      batch_size=params["batch_size"],
      max_length=params["max_length"],
      use_tpu=False,
      num_tpu_shards=flags_obj.num_tpu_shards)

  params["repeat_dataset"] = schedule_manager.repeat_dataset

  model_helpers.apply_clean(flags.FLAGS)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
      batch_size=schedule_manager.batch_size,  # for ExamplesPerSecondHook
      use_tpu=False  # Not all hooks can run with TPUs
  )
  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="transformer",
      dataset_name="wmt_translate_ende",
      run_params=params,
      test_id=flags_obj.benchmark_test_id)

  # Train and evaluate transformer model
  # estimator = construct_estimator(flags_obj, params, schedule_manager)

  os.environ['TF_SYNC_ON_FINISH'] = '0'
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

  sess_config = tf.ConfigProto(
      allow_soft_placement=True,
      log_device_placement=False,
      intra_op_parallelism_threads=0,
      gpu_options=tf.GPUOptions(force_gpu_compatible=True))
  print("SESS_CONFIG: ", sess_config)

  config = RunConfig(session_config=sess_config, model_dir=params["model_dir"])
  variable_strategy = 'GPU'
  use_distortion_for_training = True
  experiment_fn = get_experiment_fn(config.is_chief, flags_obj, params,
                                    schedule_manager, num_gpus,
                                    variable_strategy,
                                    use_distortion_for_training)
  # tf.contrib.learn.learn_runner.run(
  #     experiment_fn, run_config=config,
  #     hparams=tf.contrib.training.HParams(is_chief=config.is_chief,
  #                                         **hparams))
  tf.contrib.learn.learn_runner.run(
      experiment_fn,
      run_config=config,
      hparams=tf.contrib.training.HParams(is_chief=config.is_chief, **params))

  '''
  run_loop(
      estimator=estimator,
      # Training arguments
      schedule_manager=schedule_manager,
      train_hooks=train_hooks,
      benchmark_logger=benchmark_logger,
      # BLEU calculation arguments
      bleu_source=flags_obj.bleu_source,
      bleu_ref=flags_obj.bleu_ref,
      bleu_threshold=flags_obj.stop_threshold,
      vocab_file=flags_obj.vocab_file)
  '''

  if flags_obj.export_dir:
    # NOTE: `estimator` is commented out above, so this branch would raise a
    # NameError if export_dir is set.
    serving_input_fn = export.build_tensor_serving_input_receiver_fn(
        shape=[None], dtype=tf.int64, batch_size=None)
    # Export saved model, and save the vocab file as an extra asset. The vocab
    # file is saved to allow consistent input encoding and output decoding.
    # (See the "Export trained model" section in the README for an example of
    # how to use the vocab file.)
    # Since the model itself does not use the vocab file, this file is saved
    # as an extra asset rather than a core asset.
    estimator.export_savedmodel(
        flags_obj.export_dir, serving_input_fn,
        assets_extra={"vocab.txt": flags_obj.vocab_file},
        strip_default_attrs=True)
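# --- Contract assumed by learn_runner.run above (hedged sketch) --------------
# learn_runner.run with a run_config expects experiment_fn(run_config, hparams)
# returning a tf.contrib.learn.Experiment. get_experiment_fn is this fork's
# factory; its body is not shown, so the following is a hypothetical minimal
# shape of such a factory, not the fork's actual implementation.
def make_experiment_fn(estimator, train_input_fn, eval_input_fn):
  def experiment_fn(run_config, hparams):
    # run_config and hparams are supplied by learn_runner.run
    return tf.contrib.learn.Experiment(
        estimator=estimator,
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn)
  return experiment_fn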
def run_transformer(flags_obj):
  """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.
  """
  num_gpus = flags_core.get_num_gpus(flags_obj)

  # Add flag-defined parameters to params object
  params = PARAMS_MAP[flags_obj.param_set]
  ### yr: custom input/output vocabulary sizes
  params['vocab_size_in'] = 6100
  params['vocab_size_out'] = 25
  # params['vocab_size'] = 's'
  if num_gpus > 1:
    if flags_obj.param_set == "big":
      params = model_params.BIG_MULTI_GPU_PARAMS
    elif flags_obj.param_set == "base":
      params = model_params.BASE_MULTI_GPU_PARAMS

  params["data_dir"] = flags_obj.data_dir
  params["model_dir"] = flags_obj.model_dir
  params["num_parallel_calls"] = flags_obj.num_parallel_calls

  params["tpu"] = flags_obj.tpu
  params["use_tpu"] = bool(flags_obj.tpu)  # was a tpu specified.
  params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
  params["allow_ffn_pad"] = not params["use_tpu"]

  params["use_synthetic_data"] = flags_obj.use_synthetic_data

  # Set batch size parameter, which depends on the availability of
  # TPU and GPU, and distribution settings.
  params["batch_size"] = (flags_obj.batch_size or (
      params["default_batch_size_tpu"] if params["use_tpu"]
      else params["default_batch_size"]))

  if not params["use_tpu"]:
    params["batch_size"] = distribution_utils.per_device_batch_size(
        params["batch_size"], num_gpus)

  schedule_manager = schedule.Manager(
      train_steps=flags_obj.train_steps,
      steps_between_evals=flags_obj.steps_between_evals,
      train_epochs=flags_obj.train_epochs,
      epochs_between_evals=flags_obj.epochs_between_evals,
      default_train_epochs=DEFAULT_TRAIN_EPOCHS,
      batch_size=params["batch_size"],
      max_length=params["max_length"],
      use_tpu=params["use_tpu"],
      num_tpu_shards=flags_obj.num_tpu_shards)

  params["repeat_dataset"] = schedule_manager.repeat_dataset

  model_helpers.apply_clean(flags.FLAGS)

  mode = tf.estimator.ModeKeys.TRAIN

  # Build placeholders
  inputs_ph = tf.placeholder(tf.int32, shape=(None, None), name='inputs')
  targets_ph = tf.placeholder(tf.int32, shape=(None, None), name='targets')
  targets_ph_2 = tf.placeholder(tf.int32, shape=(None, None), name='targets2')
  loss_M, train_op_M = model_fn(inputs_ph, targets_ph, targets_ph_2, mode,
                                params)

  print('Using GPU in Decoder')
  gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
  sess = tf.Session(config=tf.ConfigProto(
      allow_soft_placement=True, log_device_placement=False,
      gpu_options=gpu_options))
  with sess.as_default():
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    if os.path.exists(os.path.join(FLAGS.model_dir, "checkpoint")):
      saver.restore(sess, tf.train.latest_checkpoint(FLAGS.model_dir))

    cnt = 0
    for ii in range(schedule_manager.train_eval_iterations):
      tf.logging.info("Starting iteration %d" % (ii + 1))
      # Get data (`datall` is assumed to be prepared elsewhere)
      random.shuffle(datall)
      for data in datall:
        feed = {inputs_ph: data['inputs'],
                targets_ph: data['targets'],
                targets_ph_2: data['targets2']}
        loss, train_op = sess.run([loss_M, train_op_M], feed_dict=feed)
        cnt += 1
        print(loss)
        if cnt % 100 == 0:
          print('loss at %d' % cnt, loss)
        if cnt % 2000 == 0:
          filename = os.path.join(FLAGS.model_dir,
                                  "model_{}.ckpt".format(cnt))
          saver.save(sess, filename)
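# --- Shape of one `datall` element (hypothetical) ----------------------------
# `datall` is never defined in the snippet above; the feed_dict implies each
# element is a dict of 2-D int arrays of token ids, [batch, length]. A
# hypothetical example, purely to document the assumed structure:
import numpy as np

example_batch = {
    'inputs':   np.array([[17, 42, 1]], dtype=np.int32),   # source ids + EOS
    'targets':  np.array([[23, 99, 1]], dtype=np.int32),   # target ids + EOS
    'targets2': np.array([[23, 99, 1]], dtype=np.int32),   # second target seq
}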
def run_transformer(flags_obj):
  """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.
  """
  num_gpus = flags_core.get_num_gpus(flags_obj)

  # Add flag-defined parameters to params object
  params = PARAMS_MAP[flags_obj.param_set]
  if num_gpus > 1:
    if flags_obj.param_set == "big":
      params = model_params.BIG_MULTI_GPU_PARAMS
    elif flags_obj.param_set == "base":
      params = model_params.BASE_MULTI_GPU_PARAMS

  params["data_dir"] = flags_obj.data_dir
  params["model_dir"] = flags_obj.model_dir
  params["num_parallel_calls"] = flags_obj.num_parallel_calls

  params["tpu"] = flags_obj.tpu
  params["use_tpu"] = bool(flags_obj.tpu)  # was a tpu specified.
  params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
  params["allow_ffn_pad"] = not params["use_tpu"]

  params["use_synthetic_data"] = flags_obj.use_synthetic_data

  # Set batch size parameter, which depends on the availability of
  # TPU and GPU, and distribution settings.
  params["batch_size"] = (flags_obj.batch_size or (
      params["default_batch_size_tpu"] if params["use_tpu"]
      else params["default_batch_size"]))

  # TC: set vocab_size as the number of tokens in vocab_file
  params["vocab_size"] = len(open(flags_obj.vocab_file).readlines())
  print('TC: vocab_size %d' % params["vocab_size"])

  if not params["use_tpu"]:
    params["batch_size"] = distribution_utils.per_device_batch_size(
        params["batch_size"], num_gpus)

  # debug
  if True:
    print('debug: train_steps', flags_obj.train_steps)
    print('debug: train_epochs', flags_obj.train_epochs)
    print('debug: epochs_between_evals', flags_obj.epochs_between_evals)
    print('debug: steps_between_evals', flags_obj.steps_between_evals)
    print('debug: flags_obj.batch_size', flags_obj.batch_size)
    print('debug: batch_size', params['batch_size'])
    # exit()

  schedule_manager = schedule.Manager(
      train_steps=flags_obj.train_steps,
      # train_steps=10000,
      steps_between_evals=flags_obj.steps_between_evals,
      # steps_between_evals=1,
      train_epochs=flags_obj.train_epochs,
      epochs_between_evals=flags_obj.epochs_between_evals,
      default_train_epochs=DEFAULT_TRAIN_EPOCHS,
      batch_size=params["batch_size"],
      max_length=params["max_length"],
      use_tpu=params["use_tpu"],
      num_tpu_shards=flags_obj.num_tpu_shards)

  params["repeat_dataset"] = schedule_manager.repeat_dataset

  model_helpers.apply_clean(flags.FLAGS)

  if not params["use_bow"]:
    TENSORS_TO_LOG.pop("bow_loss")

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
      batch_size=schedule_manager.batch_size,  # for ExamplesPerSecondHook
      use_tpu=params["use_tpu"]  # Not all hooks can run with TPUs
  )
  debug_hooks = [tf_debug.LocalCLIDebugHook()]

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="transformer",
      dataset_name="news_comments_generation",
      run_params=params,
      test_id=flags_obj.benchmark_test_id)

  # Train and evaluate transformer model
  estimator = construct_estimator(flags_obj, params, schedule_manager)
  run_loop(
      estimator=estimator,
      # Training arguments
      schedule_manager=schedule_manager,
      # train_hooks=debug_hooks,
      train_hooks=train_hooks,
      benchmark_logger=benchmark_logger,
      # BLEU calculation arguments
      bleu_source=flags_obj.bleu_source,
      bleu_ref=flags_obj.bleu_ref,
      bleu_threshold=flags_obj.stop_threshold,
      vocab_file=flags_obj.vocab_file)

  if flags_obj.export_dir:
    serving_input_fn = export.build_tensor_serving_input_receiver_fn(
        shape=[None], dtype=tf.int64, batch_size=None)
    # Export saved model, and save the vocab file as an extra asset. The vocab
    # file is saved to allow consistent input encoding and output decoding.
    # (See the "Export trained model" section in the README for an example of
    # how to use the vocab file.)
    # Since the model itself does not use the vocab file, this file is saved
    # as an extra asset rather than a core asset.
    estimator.export_savedmodel(
        flags_obj.export_dir, serving_input_fn,
        assets_extra={"vocab.txt": flags_obj.vocab_file})
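# --- Vocab counting note (sketch) --------------------------------------------
# The `len(open(...).readlines())` trick above counts every line, leaves the
# file handle open, and assumes one token per line. A slightly more careful
# variant (hypothetical helper, same one-token-per-line assumption):
def count_vocab_tokens(vocab_path):
  """Counts non-blank lines in a vocab file, closing the file afterwards."""
  with open(vocab_path) as f:
    return sum(1 for line in f if line.strip())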