def run_transformer(flags_obj):
    """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.
  """
    # Add flag-defined parameters to params object
    params = PARAMS_MAP[flags_obj.param_set]
    params["data_dir"] = flags_obj.data_dir
    params["model_dir"] = flags_obj.model_dir
    params["num_parallel_calls"] = flags_obj.num_parallel_calls

    params["tpu"] = flags_obj.tpu
    params["use_tpu"] = bool(flags_obj.tpu)  # Was a TPU specified?
    params["batch_size"] = flags_obj.batch_size or (
        params["default_batch_size_tpu"]
        if params["use_tpu"] else params["default_batch_size"])
    params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
    params["allow_ffn_pad"] = not params["use_tpu"]

    schedule_manager = schedule.Manager(
        train_steps=flags_obj.train_steps,
        steps_between_evals=flags_obj.steps_between_evals,
        train_epochs=flags_obj.train_epochs,
        epochs_between_evals=flags_obj.epochs_between_evals,
        default_train_epochs=DEFAULT_TRAIN_EPOCHS,
        batch_size=params["batch_size"],
        max_length=params["max_length"],
        use_tpu=params["use_tpu"],
        num_tpu_shards=flags_obj.num_tpu_shards)

    params["repeat_dataset"] = schedule_manager.repeat_dataset

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        flags_obj.hooks,
        tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
        batch_size=schedule_manager.batch_size,  # for ExamplesPerSecondHook
        use_tpu=params["use_tpu"]  # Not all hooks can run with TPUs
    )
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name="transformer",
                                  dataset_name="wmt_translate_ende",
                                  run_params=params,
                                  test_id=flags_obj.benchmark_test_id)

    # Train and evaluate transformer model
    estimator = construct_estimator(flags_obj, params, schedule_manager)
    run_loop(
        estimator=estimator,
        # Training arguments
        schedule_manager=schedule_manager,
        train_hooks=train_hooks,
        benchmark_logger=benchmark_logger,
        # BLEU calculation arguments
        bleu_source=flags_obj.bleu_source,
        bleu_ref=flags_obj.bleu_ref,
        bleu_threshold=flags_obj.stop_threshold,
        vocab_file_path=os.path.join(flags_obj.data_dir, flags_obj.vocab_file))
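A minimal standalone sketch of the batch-size fallback used above: an explicit flag value wins, otherwise the parameter set's TPU or non-TPU default applies (the helper name and default values below are illustrative, not part of the model code).

def resolve_batch_size(flag_value, params, use_tpu):
    # Explicit flag wins; otherwise fall back to the param set's default.
    if flag_value:
        return flag_value
    return (params["default_batch_size_tpu"] if use_tpu
            else params["default_batch_size"])

assert resolve_batch_size(None, {"default_batch_size": 2048,
                                 "default_batch_size_tpu": 32768},
                          use_tpu=False) == 2048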
Example 2
  def test_mutual_exclusivity(self):
    with self.assertRaises(ValueError):
      schedule.Manager(train_steps=100,
                       steps_between_evals=100,
                       train_epochs=2,
                       epochs_between_evals=1,
                       default_train_epochs=None,
                       batch_size=2048,
                       max_length=256)
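The test above only passes if schedule.Manager rejects schedules defined both in steps and in epochs. A minimal sketch of that validation, assuming the constructor arguments shown (not the module's actual code):

def _validate_schedule(train_steps, train_epochs):
    # A run must be driven by steps or by epochs, never both.
    if train_steps is not None and train_epochs is not None:
        raise ValueError("Both step-based and epoch-based schedules were "
                         "given; set only one of train_steps/train_epochs.")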
Example 3
  def test_step_basis_tpu(self):
    manager = schedule.Manager(
        train_steps=1000, steps_between_evals=100, train_epochs=None,
        epochs_between_evals=None, default_train_epochs=None, batch_size=2048,
        max_length=256, use_tpu=True)

    self.assertEqual(manager.single_iteration_train_steps, 100)
    # num_eval_examples / (batch_size / max_length) == 3000 / (2048 / 256)
    self.assertEqual(manager.single_iteration_eval_steps, 375)
    self.assertIsNone(manager.repeat_dataset)
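The asserted numbers follow from the TPU convention that batch_size counts tokens, so sequences per batch is batch_size / max_length. A quick check of the arithmetic:

batch_size, max_length, num_eval_examples = 2048, 256, 3000
sequences_per_batch = batch_size // max_length   # 2048 // 256 == 8
assert num_eval_examples // sequences_per_batch == 375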
Example 4
  def test_epoch_basis(self):
    manager = schedule.Manager(
        train_steps=None, steps_between_evals=None, train_epochs=10,
        epochs_between_evals=2, default_train_epochs=None, batch_size=2048,
        max_length=256)

    # For non-TPU, estimator relies on dataset exhaustion
    self.assertIsNone(manager.single_iteration_train_steps)
    self.assertIsNone(manager.single_iteration_eval_steps)

    self.assertEqual(manager.repeat_dataset, 2)
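On CPU/GPU the Manager leaves both step counts as None and instead sets repeat_dataset, so one train() call runs until the input pipeline is exhausted after that many passes. A sketch of how an input_fn might consume repeat_dataset (an assumed pattern, not the model's exact input pipeline):

import tensorflow as tf

def make_input_fn(repeat_dataset):
    def input_fn():
        ds = tf.data.Dataset.from_tensor_slices(list(range(6)))
        # Repeating N times makes a single estimator.train() call cover
        # N epochs, terminated by dataset exhaustion instead of a step count.
        return ds.repeat(repeat_dataset).batch(2)
    return input_fn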
Example 5
  def test_step_basis(self):
    manager = schedule.Manager(
        train_steps=1000, steps_between_evals=100, train_epochs=None,
        epochs_between_evals=None, default_train_epochs=None, batch_size=2048,
        max_length=256)

    self.assertEqual(manager.single_iteration_train_steps, 100)

    # Evaluation uses the full set
    self.assertIsNone(manager.single_iteration_eval_steps)

    self.assertIsNone(manager.repeat_dataset)
Example 6
  def test_epoch_basis_tpu(self):
    manager = schedule.Manager(
        train_steps=None, steps_between_evals=None, train_epochs=10,
        epochs_between_evals=2, default_train_epochs=None, batch_size=2048,
        max_length=256, use_tpu=True)

    self.assertEqual(
        manager.single_iteration_train_steps,
        schedule.NUM_EXAMPLES[tf.estimator.ModeKeys.TRAIN] * 2 // (2048 / 256)
    )

    # num_eval_examples / (batch_size / max_length) == 3000 / (2048 / 256)
    self.assertEqual(manager.single_iteration_eval_steps, 375)

    self.assertEqual(manager.repeat_dataset, 2)
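The expected train-step count expands to examples-per-eval-period divided by sequences per batch. A standalone version of that computation (num_train_examples stands in for schedule.NUM_EXAMPLES[tf.estimator.ModeKeys.TRAIN]; the 4.5M below is only an illustrative corpus size):

def epoch_basis_train_steps(num_train_examples, epochs_between_evals,
                            batch_size, max_length):
    # On TPU, batch_size counts tokens; divide by max_length for sequences.
    return int(num_train_examples * epochs_between_evals
               // (batch_size / max_length))

print(epoch_basis_train_steps(4500000, 2, 2048, 256))  # 1125000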
Example 7
def run_transformer(flags_obj):
    """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.

  Returns:
    Dict of results of the run. Contains the keys `eval_results`,
    `train_hooks`, `bleu_cased`, and `bleu_uncased`. `train_hooks` is a list
    of the hook instances used during training.
  """
    num_gpus = flags_core.get_num_gpus(flags_obj)

    # Add flag-defined parameters to params object
    params = PARAMS_MAP[flags_obj.param_set]
    if num_gpus > 1:
        if flags_obj.param_set == "big":
            params = model_params.BIG_MULTI_GPU_PARAMS
        elif flags_obj.param_set == "base":
            params = model_params.BASE_MULTI_GPU_PARAMS

    params["data_dir"] = flags_obj.data_dir
    params["model_dir"] = flags_obj.model_dir
    params["num_parallel_calls"] = flags_obj.num_parallel_calls

    params["tpu"] = flags_obj.tpu
    params["vocab_file"] = flags_obj.vocab_file
    params["use_tpu"] = bool(flags_obj.tpu)  # Was a TPU specified?
    params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
    params["allow_ffn_pad"] = not params["use_tpu"]

    params["max_length"] = flags_obj.max_length or params["max_length"]

    params["use_synthetic_data"] = flags_obj.use_synthetic_data

    # Set batch size parameter, which depends on the availability of
    # TPU and GPU, and distribution settings.
    params["batch_size"] = (
        flags_obj.batch_size
        or (params["default_batch_size_tpu"]
            if params["use_tpu"] else params["default_batch_size"]))

    total_batch_size = params["batch_size"]
    if not params["use_tpu"]:
        params["batch_size"] = distribution_utils.per_replica_batch_size(
            params["batch_size"], num_gpus)

    schedule_manager = schedule.Manager(
        train_steps=flags_obj.train_steps,
        steps_between_evals=flags_obj.steps_between_evals,
        train_epochs=flags_obj.train_epochs,
        epochs_between_evals=flags_obj.epochs_between_evals,
        default_train_epochs=DEFAULT_TRAIN_EPOCHS,
        batch_size=params["batch_size"],
        max_length=params["max_length"],
        use_tpu=params["use_tpu"],
        num_tpu_shards=flags_obj.num_tpu_shards)

    params["repeat_dataset"] = schedule_manager.repeat_dataset

    model_helpers.apply_clean(flags.FLAGS)

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        flags_obj.hooks,
        model_dir=flags_obj.model_dir,
        tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
        batch_size=total_batch_size,  # for ExamplesPerSecondHook
        use_tpu=params["use_tpu"]  # Not all hooks can run with TPUs
    )
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name="transformer",
                                  dataset_name="wmt_translate_ende",
                                  run_params=params,
                                  test_id=flags_obj.benchmark_test_id)

    # Train and evaluate transformer model
    estimator = construct_estimator(flags_obj, params, schedule_manager)
    stats = run_loop(
        estimator=estimator,
        # Training arguments
        schedule_manager=schedule_manager,
        train_hooks=train_hooks,
        benchmark_logger=benchmark_logger,
        # BLEU calculation arguments
        bleu_source=flags_obj.bleu_source,
        bleu_ref=flags_obj.bleu_ref,
        bleu_threshold=flags_obj.stop_threshold,
        vocab_file=flags_obj.vocab_file)

    if flags_obj.export_dir and not params["use_tpu"]:
        serving_input_fn = export.build_tensor_serving_input_receiver_fn(
            shape=[None], dtype=tf.int64, batch_size=None)
        # Export saved model, and save the vocab file as an extra asset. The vocab
        # file is saved to allow consistent input encoding and output decoding.
        # (See the "Export trained model" section in the README for an example of
        # how to use the vocab file.)
        # Since the model itself does not use the vocab file, this file is saved as
        # an extra asset rather than a core asset.
        estimator.export_savedmodel(
            flags_obj.export_dir,
            serving_input_fn,
            assets_extra={"vocab.txt": flags_obj.vocab_file},
            strip_default_attrs=True)
    return stats
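Once exported, the SavedModel (and its vocab.txt asset) can be loaded back for inference. A sketch using the TF 1.x predictor API; the timestamped path is a placeholder, and the feed/fetch keys should be inspected rather than assumed:

from tensorflow.contrib import predictor

# export_savedmodel() writes a timestamped subdirectory under export_dir.
predict_fn = predictor.from_saved_model("/path/to/export_dir/1538521391")
print(predict_fn.feed_tensors)   # expected input key(s)
print(predict_fn.fetch_tensors)  # available output key(s)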
Example 8
def run_transformer(flags_obj):
    """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.
  """
    num_gpus = flags_core.get_num_gpus(flags_obj)

    # Add flag-defined parameters to params object
    params = PARAMS_MAP[flags_obj.param_set]  # Select the model size: base or big
    if num_gpus > 1:
        if flags_obj.param_set == "big":
            params = model_params.BIG_MULTI_GPU_PARAMS
        elif flags_obj.param_set == "base":
            params = model_params.BASE_MULTI_GPU_PARAMS

    params["data_dir"] = flags_obj.data_dir
    params["model_dir"] = flags_obj.model_dir
    params["num_parallel_calls"] = flags_obj.num_parallel_calls

    params["tpu"] = flags_obj.tpu
    params["use_tpu"] = bool(flags_obj.tpu)  # Was a TPU specified?
    params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
    params["allow_ffn_pad"] = not params["use_tpu"]

    params["use_synthetic_data"] = flags_obj.use_synthetic_data  # What does "use synthetic data" mean?

    # Set batch size parameter, which depends on the availability of
    # TPU and GPU, and distribution settings.
    params["batch_size"] = (
        flags_obj.batch_size
        or (params["default_batch_size_tpu"]
            if params["use_tpu"] else params["default_batch_size"]))

    if not params["use_tpu"]:
        params["batch_size"] = distribution_utils.per_device_batch_size(
            params["batch_size"], num_gpus)

    schedule_manager = schedule.Manager(  # Instance that manages training progress, e.g. steps and eval intervals
        train_steps=flags_obj.train_steps,
        steps_between_evals=flags_obj.steps_between_evals,
        train_epochs=flags_obj.train_epochs,
        epochs_between_evals=flags_obj.epochs_between_evals,
        default_train_epochs=DEFAULT_TRAIN_EPOCHS,
        batch_size=params["batch_size"],
        max_length=params["max_length"],
        use_tpu=params["use_tpu"],
        num_tpu_shards=flags_obj.num_tpu_shards)

    params["repeat_dataset"] = schedule_manager.repeat_dataset

    model_helpers.apply_clean(flags.FLAGS)  # Clean old data and saved models, but only if cleaning is enabled via a flag

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(  # Apparently this instance is needed for logging
        flags_obj.hooks,
        model_dir=flags_obj.model_dir,
        tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
        batch_size=schedule_manager.batch_size,  # for ExamplesPerSecondHook
        use_tpu=params["use_tpu"]  # Not all hooks can run with TPUs
    )
    benchmark_logger = logger.get_benchmark_logger()  # Also used for logging
    benchmark_logger.log_run_info(model_name="transformer",
                                  dataset_name="wmt_translate_ende",
                                  run_params=params,
                                  test_id=flags_obj.benchmark_test_id)

    # Train and evaluate transformer model
    estimator = construct_estimator(
        flags_obj, params,
        schedule_manager)  # Returns a tf.estimator.Estimator for training and evaluation
    run_loop(
        estimator=estimator,  # The "estimator" that helps train and evaluate the model
        # Training arguments
        schedule_manager=schedule_manager,  # Manages the training process: how many steps, how often to evaluate, etc.
        train_hooks=train_hooks,  # For logging?
        benchmark_logger=benchmark_logger,  # For logging?
        # BLEU calculation arguments
        bleu_source=flags_obj.bleu_source,  # Three BLEU-related files
        bleu_ref=flags_obj.bleu_ref,
        bleu_threshold=flags_obj.stop_threshold,
        vocab_file=flags_obj.vocab_file)  # Vocabulary file

    if flags_obj.export_dir:
        serving_input_fn = export.build_tensor_serving_input_receiver_fn(
            shape=[None], dtype=tf.int64, batch_size=None)
        # Export saved model, and save the vocab file as an extra asset. The vocab
        # file is saved to allow consistent input encoding and output decoding.
        # (See the "Export trained model" section in the README for an example of
        # how to use the vocab file.)
        # Since the model itself does not use the vocab file, this file is saved as
        # an extra asset rather than a core asset.
        estimator.export_savedmodel(
            flags_obj.export_dir,
            serving_input_fn,
            assets_extra={"vocab.txt": flags_obj.vocab_file})
Example 9
def run_transformer(flags_obj):
  """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.
  """
  num_gpus = flags_core.get_num_gpus(flags_obj)

  # Add flag-defined parameters to params object
  params = PARAMS_MAP[flags_obj.param_set]
  if num_gpus > 1:
    if flags_obj.param_set == "big":
      params = model_params.BIG_MULTI_GPU_PARAMS
    elif flags_obj.param_set == "base":
      params = model_params.BASE_MULTI_GPU_PARAMS

  params["data_dir"] = flags_obj.data_dir
  params["model_dir"] = flags_obj.model_dir
  params["num_parallel_calls"] = flags_obj.num_parallel_calls

  params["tpu"] = flags_obj.tpu
  params["use_tpu"] = bool(flags_obj.tpu)  # Was a TPU specified?
  params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
  params["allow_ffn_pad"] = not params["use_tpu"]

  params["use_synthetic_data"] = flags_obj.use_synthetic_data

  # added below to override the learning rate, warmup steps, max_length, and
  # vocab_size from the parameter file; params now carries the flag values

  params["learning_rate"] = flags_obj.learning_rate
  params["learning_rate_warmup_steps"] = flags_obj.learning_rate_warmup_steps
  params["max_length"] = flags_obj.max_length
  params["vocab_size"] = flags_obj.vocab_size

  # added for selecting the learning rate scheme
  params["lr_scheme"] =  flags_obj.lr_scheme
  params["warmup_init_lr"] = flags_obj.warmup_init_lr

  # added for selecting the optimizer algorithm
  params["opt_alg"] = flags_obj.opt_alg

  # added to provide optimizer parameters
  params["optimizer_sgd_momentum"] = flags_obj.optimizer_sgd_momentum
  params["optimizer_rms_decay"] = flags_obj.optimizer_rms_decay
  params["optimizer_rms_momentum"] = flags_obj.optimizer_rms_momentum
  params["optimizer_rms_epsilon"] = flags_obj.optimizer_rms_epsilon
  
  # added to override the layer_postprocess_dropout value
  params["layer_postprocess_dropout"] = flags_obj.layer_postprocess_dropout

  # Set batch size parameter, which depends on the availability of
  # TPU and GPU, and distribution settings.
  params["batch_size"] = (flags_obj.batch_size or (
      params["default_batch_size_tpu"] if params["use_tpu"]
      else params["default_batch_size"]))

  # commented out below to remove the distribution strategy:
  # if not params["use_tpu"]:
  #   params["batch_size"] = distribution_utils.per_device_batch_size(
  #       params["batch_size"], num_gpus)

  schedule_manager = schedule.Manager(
      train_steps=flags_obj.train_steps,
      steps_between_evals=flags_obj.steps_between_evals,
      train_epochs=flags_obj.train_epochs,
      epochs_between_evals=flags_obj.epochs_between_evals,
      default_train_epochs=DEFAULT_TRAIN_EPOCHS,
      batch_size=params["batch_size"],
      max_length=params["max_length"],
      use_tpu=params["use_tpu"],
      num_tpu_shards=flags_obj.num_tpu_shards
  )

  params["repeat_dataset"] = schedule_manager.repeat_dataset

  model_helpers.apply_clean(flags.FLAGS)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
      batch_size=schedule_manager.batch_size,  # for ExamplesPerSecondHook
      use_tpu=params["use_tpu"]  # Not all hooks can run with TPUs
  )

  # added for horovod
  bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="transformer",
      dataset_name="wmt_translate_ende",
      run_params=params,
      test_id=flags_obj.benchmark_test_id)

  # added for horovod
  train_hooks.append(bcast_hook)

  # Train and evaluate transformer model
  estimator = construct_estimator(flags_obj, params, schedule_manager)

  run_loop(
      estimator=estimator,
      # Training arguments
      schedule_manager=schedule_manager,
      train_hooks=train_hooks,
      benchmark_logger=benchmark_logger,
      # BLEU calculation arguments
      bleu_source=flags_obj.bleu_source,
      bleu_ref=flags_obj.bleu_ref,
      bleu_threshold=flags_obj.stop_threshold,
      vocab_file=flags_obj.vocab_file)

  if flags_obj.export_dir and not params["use_tpu"]:
    serving_input_fn = export.build_tensor_serving_input_receiver_fn(
        shape=[None], dtype=tf.int64, batch_size=None)
    # Export saved model, and save the vocab file as an extra asset. The vocab
    # file is saved to allow consistent input encoding and output decoding.
    # (See the "Export trained model" section in the README for an example of
    # how to use the vocab file.)
    # Since the model itself does not use the vocab file, this file is saved as
    # an extra asset rather than a core asset.
    estimator.export_savedmodel(
        flags_obj.export_dir, serving_input_fn,
        assets_extra={"vocab.txt": flags_obj.vocab_file})
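The broadcast hook above only synchronizes variables from rank 0 at startup; a typical Horovod run also initializes the library, pins one GPU per process, and wraps the optimizer. A sketch of the usual companion setup (standard Horovod API, not shown in this snippet):

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()  # one process per GPU, launched with horovodrun/mpirun

# Pin each process to a single visible GPU.
config = tf.ConfigProto()
config.gpu_options.visible_device_list = str(hvd.local_rank())

# Scale the learning rate by the worker count and average gradients
# across workers with allreduce.
optimizer = tf.train.AdamOptimizer(1e-3 * hvd.size())
optimizer = hvd.DistributedOptimizer(optimizer)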
Example 10
def run_transformer(flags_obj):
    """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.
  """
    print("run_transformer")
    num_gpus = flags_core.get_num_gpus(flags_obj)

    # Add flag-defined parameters to params object
    params = PARAMS_MAP[flags_obj.param_set]
    if num_gpus > 1:
        if flags_obj.param_set == "big":
            params = model_params.BIG_MULTI_GPU_PARAMS
        elif flags_obj.param_set == "base":
            params = model_params.BASE_MULTI_GPU_PARAMS

    params["data_dir"] = flags_obj.data_dir
    params["model_dir"] = flags_obj.model_dir
    params["num_parallel_calls"] = flags_obj.num_parallel_calls

    params["tpu"] = flags_obj.tpu
    params["static_batch"] = flags_obj.static_batch
    params["allow_ffn_pad"] = True

    params["use_synthetic_data"] = flags_obj.use_synthetic_data

    # Set batch size parameter, which depends on the availability of
    # TPU and GPU, and distribution settings.
    params["batch_size"] = (flags_obj.batch_size
                            or params["default_batch_size"])

    params["batch_size"] = distribution_utils.per_device_batch_size(
        params["batch_size"], num_gpus)

    schedule_manager = schedule.Manager(
        train_steps=flags_obj.train_steps,
        steps_between_evals=flags_obj.steps_between_evals,
        train_epochs=flags_obj.train_epochs,
        epochs_between_evals=flags_obj.epochs_between_evals,
        default_train_epochs=DEFAULT_TRAIN_EPOCHS,
        batch_size=params["batch_size"],
        max_length=params["max_length"],
        use_tpu=False,
        num_tpu_shards=flags_obj.num_tpu_shards)

    params["repeat_dataset"] = schedule_manager.repeat_dataset

    model_helpers.apply_clean(flags.FLAGS)

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        flags_obj.hooks,
        model_dir=flags_obj.model_dir,
        tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
        batch_size=schedule_manager.batch_size,  # for ExamplesPerSecondHook
        use_tpu=False  # Not all hooks can run with TPUs
    )
    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name="transformer",
                                  dataset_name="wmt_translate_ende",
                                  run_params=params,
                                  test_id=flags_obj.benchmark_test_id)

    # Train and evaluate transformer model
    #  estimator = construct_estimator(flags_obj, params, schedule_manager)

    os.environ['TF_SYNC_ON_FINISH'] = '0'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        intra_op_parallelism_threads=0,
        gpu_options=tf.GPUOptions(force_gpu_compatible=True))

    print("SESS_CONFIG: ", sess_config)
    config = RunConfig(session_config=sess_config,
                       model_dir=params["model_dir"])
    variable_strategy = 'GPU'
    use_distortion_for_training = True
    experiment_fn = get_experiment_fn(config.is_chief, flags_obj, params,
                                      schedule_manager, num_gpus,
                                      variable_strategy,
                                      use_distortion_for_training)

    #tf.contrib.learn.learn_runner.run(experiment_fn, run_config=config, hparams=tf.contrib.training.HParams(is_chief=config.is_chief, **hparams))

    tf.contrib.learn.learn_runner.run(experiment_fn,
                                      run_config=config,
                                      hparams=tf.contrib.training.HParams(
                                          is_chief=config.is_chief, **params))
    # run_loop(
    #     estimator=estimator,
    #     # Training arguments
    #     schedule_manager=schedule_manager,
    #     train_hooks=train_hooks,
    #     benchmark_logger=benchmark_logger,
    #     # BLEU calculation arguments
    #     bleu_source=flags_obj.bleu_source,
    #     bleu_ref=flags_obj.bleu_ref,
    #     bleu_threshold=flags_obj.stop_threshold,
    #     vocab_file=flags_obj.vocab_file)

    # NOTE: `estimator` is never constructed in this variant (see the
    # commented-out construct_estimator call above), so this export block
    # would raise a NameError if reached.
    if flags_obj.export_dir:
        serving_input_fn = export.build_tensor_serving_input_receiver_fn(
            shape=[None], dtype=tf.int64, batch_size=None)
        # Export saved model, and save the vocab file as an extra asset. The vocab
        # file is saved to allow consistent input encoding and output decoding.
        # (See the "Export trained model" section in the README for an example of
        # how to use the vocab file.)
        # Since the model itself does not use the vocab file, this file is saved as
        # an extra asset rather than a core asset.
        estimator.export_savedmodel(
            flags_obj.export_dir,
            serving_input_fn,
            assets_extra={"vocab.txt": flags_obj.vocab_file},
            strip_default_attrs=True)
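learn_runner.run expects experiment_fn(run_config, hparams) to return a tf.contrib.learn.Experiment, so get_experiment_fn presumably closes over everything else. A skeleton of that contract (transformer_model_fn and the input functions are placeholders, not names from this code):

import tensorflow as tf

def get_experiment_fn(is_chief, flags_obj, params, schedule_manager,
                      num_gpus, variable_strategy, use_distortion):
    def experiment_fn(run_config, hparams):
        estimator = tf.estimator.Estimator(
            model_fn=transformer_model_fn,  # placeholder
            config=run_config,
            params=hparams.values())        # HParams -> plain dict
        return tf.contrib.learn.Experiment(
            estimator=estimator,
            train_input_fn=train_input_fn,  # placeholder
            eval_input_fn=eval_input_fn,    # placeholder
            train_steps=flags_obj.train_steps)
    return experiment_fn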
Example 11
def run_transformer(flags_obj):
  """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.
  """
  num_gpus = flags_core.get_num_gpus(flags_obj)

  # Add flag-defined parameters to params object
  params = PARAMS_MAP[flags_obj.param_set]
  ### yr
  # NOTE: these overrides are lost if params is reassigned to the multi-GPU
  # presets in the if-block below.
  params['vocab_size_in'] = 6100
  params['vocab_size_out'] = 25
  #params['vocab_size']='s'
  if num_gpus > 1:
    if flags_obj.param_set == "big":
      params = model_params.BIG_MULTI_GPU_PARAMS
    elif flags_obj.param_set == "base":
      params = model_params.BASE_MULTI_GPU_PARAMS

  params["data_dir"] = flags_obj.data_dir
  params["model_dir"] = flags_obj.model_dir
  params["num_parallel_calls"] = flags_obj.num_parallel_calls

  params["tpu"] = flags_obj.tpu
  params["use_tpu"] = bool(flags_obj.tpu)  # Was a TPU specified?
  params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
  params["allow_ffn_pad"] = not params["use_tpu"]

  params["use_synthetic_data"] = flags_obj.use_synthetic_data

  # Set batch size parameter, which depends on the availability of
  # TPU and GPU, and distribution settings.
  params["batch_size"] = (flags_obj.batch_size or (
      params["default_batch_size_tpu"] if params["use_tpu"]
      else params["default_batch_size"]))

  if not params["use_tpu"]:
    params["batch_size"] = distribution_utils.per_device_batch_size(
        params["batch_size"], num_gpus)

  schedule_manager = schedule.Manager(
      train_steps=flags_obj.train_steps,
      steps_between_evals=flags_obj.steps_between_evals,
      train_epochs=flags_obj.train_epochs,
      epochs_between_evals=flags_obj.epochs_between_evals,
      default_train_epochs=DEFAULT_TRAIN_EPOCHS,
      batch_size=params["batch_size"],
      max_length=params["max_length"],
      use_tpu=params["use_tpu"],
      num_tpu_shards=flags_obj.num_tpu_shards
  )

  params["repeat_dataset"] = schedule_manager.repeat_dataset

  model_helpers.apply_clean(flags.FLAGS)

  mode = tf.estimator.ModeKeys.TRAIN

  ## build ph
  inputs_ph = tf.placeholder(tf.int32, shape=(None, None), name='inputs')
  targets_ph = tf.placeholder(tf.int32, shape=(None, None), name='targets')
  targets_ph_2 = tf.placeholder(tf.int32, shape=(None, None), name='targets2')

  loss_M, train_op_M = model_fn(inputs_ph, targets_ph, targets_ph_2, mode, params)


  print('Using GPU in Decoder')
  gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
  sess = tf.Session(
          config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False, gpu_options=gpu_options))
  with sess.as_default():
      sess.run(tf.global_variables_initializer())
      saver = tf.train.Saver()

      if os.path.exists(os.path.join(FLAGS.model_dir, "checkpoint")):
          saver.restore(sess, tf.train.latest_checkpoint(FLAGS.model_dir))


      cnt = 0
      for ii in range(schedule_manager.train_eval_iterations):
          tf.logging.info("Starting iteration %d" % (ii + 1))
          # Get data (datall is assumed to be a pre-built list of feed batches).
          random.shuffle(datall)
          for data in datall:
              feed = {inputs_ph: data['inputs'],
                      targets_ph: data['targets'],
                      targets_ph_2: data['targets2']}

              loss, _ = sess.run([loss_M, train_op_M], feed_dict=feed)
              cnt += 1

              if cnt % 100 == 0:
                  print('loss at %d' % cnt, loss)

              if cnt % 2000 == 0:
                  filename = os.path.join(
                      FLAGS.model_dir, "model_{}.ckpt".format(cnt))
                  saver.save(sess, filename)
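The loop feeds int32 [batch, length] matrices into the three placeholders, so each entry of datall must already be padded to a rectangle. A minimal numpy padding helper matching those shapes (datall's actual construction is not shown in the snippet):

import numpy as np

def pad_batch(sequences, pad_id=0):
    # Pad a list of token-id lists to a dense [batch, max_len] int32 matrix.
    max_len = max(len(s) for s in sequences)
    batch = np.full((len(sequences), max_len), pad_id, dtype=np.int32)
    for i, seq in enumerate(sequences):
        batch[i, :len(seq)] = seq
    return batch

# pad_batch([[5, 6, 1], [7, 1]]) -> [[5, 6, 1], [7, 1, 0]]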
Example 12
def run_transformer(flags_obj):
    """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.
  """
    num_gpus = flags_core.get_num_gpus(flags_obj)

    # Add flag-defined parameters to params object
    params = PARAMS_MAP[flags_obj.param_set]
    if num_gpus > 1:
        if flags_obj.param_set == "big":
            params = model_params.BIG_MULTI_GPU_PARAMS
        elif flags_obj.param_set == "base":
            params = model_params.BASE_MULTI_GPU_PARAMS

    params["data_dir"] = flags_obj.data_dir
    params["model_dir"] = flags_obj.model_dir
    params["num_parallel_calls"] = flags_obj.num_parallel_calls

    params["tpu"] = flags_obj.tpu
    params["use_tpu"] = bool(flags_obj.tpu)  # Was a TPU specified?
    params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
    params["allow_ffn_pad"] = not params["use_tpu"]

    params["use_synthetic_data"] = flags_obj.use_synthetic_data

    # Set batch size parameter, which depends on the availability of
    # TPU and GPU, and distribution settings.
    params["batch_size"] = (
        flags_obj.batch_size
        or (params["default_batch_size_tpu"]
            if params["use_tpu"] else params["default_batch_size"]))

    # TC: set vocab_size to the number of tokens (one per line) in vocab_file
    with open(flags_obj.vocab_file) as f:
        params["vocab_size"] = len(f.readlines())
    print('TC: vocab_size %d' % params["vocab_size"])

    if not params["use_tpu"]:
        params["batch_size"] = distribution_utils.per_device_batch_size(
            params["batch_size"], num_gpus)

    # debug
    if True:
        print('debug: train_steps', flags_obj.train_steps)
        print('debug: train_epochs', flags_obj.train_epochs)
        print('debug: epochs_between_evals', flags_obj.epochs_between_evals)
        print('debug: steps_between_evals', flags_obj.steps_between_evals)
        print('debug: flags_obj.batch_size', flags_obj.batch_size)
        print('debug: batch_size', params['batch_size'])
        #exit()

    schedule_manager = schedule.Manager(
        train_steps=flags_obj.train_steps,
        #train_steps=10000,
        steps_between_evals=flags_obj.steps_between_evals,
        #steps_between_evals=1,
        train_epochs=flags_obj.train_epochs,
        epochs_between_evals=flags_obj.epochs_between_evals,
        default_train_epochs=DEFAULT_TRAIN_EPOCHS,
        batch_size=params["batch_size"],
        max_length=params["max_length"],
        use_tpu=params["use_tpu"],
        num_tpu_shards=flags_obj.num_tpu_shards)

    params["repeat_dataset"] = schedule_manager.repeat_dataset

    model_helpers.apply_clean(flags.FLAGS)

    if not params["use_bow"]:
        TENSORS_TO_LOG.pop("bow_loss")

    # Create hooks that log information about the training and metric values
    train_hooks = hooks_helper.get_train_hooks(
        flags_obj.hooks,
        model_dir=flags_obj.model_dir,
        tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
        batch_size=schedule_manager.batch_size,  # for ExamplesPerSecondHook
        use_tpu=params["use_tpu"]  # Not all hooks can run with TPUs
    )

    debug_hooks = [tf_debug.LocalCLIDebugHook()]

    benchmark_logger = logger.get_benchmark_logger()
    benchmark_logger.log_run_info(model_name="transformer",
                                  dataset_name="news_comments_generation",
                                  run_params=params,
                                  test_id=flags_obj.benchmark_test_id)

    # Train and evaluate transformer model
    estimator = construct_estimator(flags_obj, params, schedule_manager)
    run_loop(
        estimator=estimator,
        # Training arguments
        schedule_manager=schedule_manager,
        #train_hooks=debug_hooks,
        train_hooks=train_hooks,
        benchmark_logger=benchmark_logger,
        # BLEU calculation arguments
        bleu_source=flags_obj.bleu_source,
        bleu_ref=flags_obj.bleu_ref,
        bleu_threshold=flags_obj.stop_threshold,
        vocab_file=flags_obj.vocab_file)

    if flags_obj.export_dir:
        serving_input_fn = export.build_tensor_serving_input_receiver_fn(
            shape=[None], dtype=tf.int64, batch_size=None)
        # Export saved model, and save the vocab file as an extra asset. The vocab
        # file is saved to allow consistent input encoding and output decoding.
        # (See the "Export trained model" section in the README for an example of
        # how to use the vocab file.)
        # Since the model itself does not use the vocab file, this file is saved as
        # an extra asset rather than a core asset.
        estimator.export_savedmodel(
            flags_obj.export_dir,
            serving_input_fn,
            assets_extra={"vocab.txt": flags_obj.vocab_file})
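The vocab_size computation near the top of this example reads the whole vocab file into memory just to count lines. An equivalent constant-memory variant (one token per line assumed, as above):

def count_vocab_lines(vocab_path):
    with open(vocab_path) as f:
        return sum(1 for _ in f)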