def main(_):
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
  params = train_utils.parse_configuration(FLAGS)
  model_dir = FLAGS.model_dir
  if 'train' in FLAGS.mode:
    # Pure eval modes do not output yaml files. Otherwise continuous eval job
    # may race against the train job for writing the same file.
    train_utils.serialize_config(params, model_dir)

  # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
  # can have significant impact on model speeds by utilizing float16 in case of
  # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
  # dtype is float16
  if params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(
        params.runtime.mixed_precision_dtype)
  distribution_strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu,
      **params.runtime.model_parallelism())
  with distribution_strategy.scope():
    task = classification_example.ClassificationExampleTask(params.task)

  train_lib.run_experiment(
      distribution_strategy=distribution_strategy,
      task=task,
      mode=FLAGS.mode,
      params=params,
      model_dir=model_dir)

  train_utils.save_gin_config(FLAGS.mode, model_dir)
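# Contextual sketch (not part of the snippet above): these `main` functions are
# typically launched through absl's `app.run`, as in the final snippet below.
# The `official.common.flags` import path and the `tfm_flags` alias are
# assumptions; they stand in for whatever module defines the --experiment,
# --mode, and --model_dir flags referenced by the drivers.
from absl import app
from official.common import flags as tfm_flags

if __name__ == '__main__':
  tfm_flags.define_flags()
  app.run(main)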
def main(_):
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
  params = train_utils.parse_configuration(FLAGS)
  model_dir = FLAGS.model_dir
  if "train" in FLAGS.mode:
    train_utils.serialize_config(params, model_dir)

  if params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype)
  distribution_strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu,
      **params.runtime.model_parallelism())
  with distribution_strategy.scope():
    if params.task.use_crf:
      task = ap_parsing_task.APParsingTaskCRF(params.task)
    else:
      task = ap_parsing_task.APParsingTaskBase(params.task)

    ckpt_exporter = train_utils.maybe_create_best_ckpt_exporter(
        params, model_dir)
    trainer = train_utils.create_trainer(
        params,
        task,
        train="train" in FLAGS.mode,
        evaluate=("eval" in FLAGS.mode),
        checkpoint_exporter=ckpt_exporter)

  model, _ = train_lib.run_experiment(
      distribution_strategy=distribution_strategy,
      task=task,
      mode=FLAGS.mode,
      params=params,
      trainer=trainer,
      model_dir=model_dir)

  train_utils.save_gin_config(FLAGS.mode, model_dir)

  # Export saved model.
  if "train" in FLAGS.mode:
    saved_model_path = os.path.join(model_dir, "saved_models/latest")
    logging.info("Exporting SavedModel to %s", saved_model_path)
    tf.saved_model.save(model, saved_model_path)

    if ckpt_exporter:
      logging.info("Loading best checkpoint for export")
      trainer.checkpoint.restore(ckpt_exporter.best_ckpt_path)
      saved_model_path = os.path.join(model_dir, "saved_models/best")

      # Make sure restored and not re-initialized.
      if trainer.global_step > 0:
        logging.info(
            "Exporting best saved model by %s (from global step: %d) to %s",
            params.trainer.best_checkpoint_eval_metric,
            trainer.global_step.numpy(), saved_model_path)
        tf.saved_model.save(trainer.model, saved_model_path)
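# Illustrative follow-up (assumption, not in the original script): once the
# export above has run, the "best" SavedModel can be restored for inference
# with the standard tf.saved_model API. The model_dir value here is
# hypothetical; in practice it is the --model_dir used for training.
import os

import tensorflow as tf

model_dir = "/tmp/ap_parsing_experiment"
restored = tf.saved_model.load(os.path.join(model_dir, "saved_models/best"))
# Inspect which serving signatures were exported with the model.
print(list(restored.signatures.keys()))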
def main(_):
  # TODO(b/177863554): consolidate to nlp/train.py
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
  params = train_utils.parse_configuration(FLAGS)
  model_dir = FLAGS.model_dir
  train_utils.serialize_config(params, model_dir)
  continuous_finetune_lib.run_continuous_finetune(
      FLAGS.mode, params, model_dir, pretrain_steps=FLAGS.pretrain_steps)
  train_utils.save_gin_config(FLAGS.mode, model_dir)
def main(_):
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
  params = train_utils.parse_configuration(FLAGS)
  model_dir = FLAGS.model_dir
  if 'train' in FLAGS.mode:
    # Pure eval modes do not output yaml files. Otherwise continuous eval job
    # may race against the train job for writing the same file.
    train_utils.serialize_config(params, model_dir)

  if 'train_and_eval' in FLAGS.mode:
    assert (params.task.train_data.feature_shape ==
            params.task.validation_data.feature_shape), (
                f'train {params.task.train_data.feature_shape} != validate '
                f'{params.task.validation_data.feature_shape}')

  if 'assemblenet' in FLAGS.experiment:
    if 'eval' in FLAGS.mode:
      # Use the feature shape in validation_data for all jobs. The number of
      # frames in train_data will be used to construct the Assemblenet model.
      params.task.model.backbone.assemblenet.num_frames = (
          params.task.validation_data.feature_shape[0])
      shape = params.task.validation_data.feature_shape
    else:
      params.task.model.backbone.assemblenet.num_frames = (
          params.task.train_data.feature_shape[0])
      shape = params.task.train_data.feature_shape
    logging.info('mode %r num_frames %r feature shape %r', FLAGS.mode,
                 params.task.model.backbone.assemblenet.num_frames, shape)

  # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
  # can have significant impact on model speeds by utilizing float16 in case of
  # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
  # dtype is float16
  if params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(
        params.runtime.mixed_precision_dtype)
  distribution_strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu)
  with distribution_strategy.scope():
    task = task_factory.get_task(params.task, logging_dir=model_dir)

  train_lib.run_experiment(
      distribution_strategy=distribution_strategy,
      task=task,
      mode=FLAGS.mode,
      params=params,
      model_dir=model_dir)

  train_utils.save_gin_config(FLAGS.mode, model_dir)
def main(_):
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
  params = train_utils.parse_configuration(FLAGS)
  model_dir = FLAGS.model_dir
  if 'train' in FLAGS.mode:
    # Pure eval modes do not output yaml files. Otherwise continuous eval job
    # may race against the train job for writing the same file.
    train_utils.serialize_config(params, model_dir)

  # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
  # can have significant impact on model speeds by utilizing float16 in case of
  # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
  # dtype is float16
  if params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype)
  distribution_strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu)

  if isinstance(params, cfg.ExperimentConfig):
    with distribution_strategy.scope():
      task = task_factory.get_task(params.task, logging_dir=model_dir)
    train_lib.run_experiment(
        distribution_strategy=distribution_strategy,
        task=task,
        mode=FLAGS.mode,
        params=params,
        model_dir=model_dir)
  elif isinstance(params, multi_cfg.MultiTaskExperimentConfig):
    with distribution_strategy.scope():
      task = multitask.MultiTask.from_config(params.task, model_dir)
      model = multihead_model.build_model(params.task)
    train_lib_multitask.run_experiment(
        distribution_strategy=distribution_strategy,
        task=task,
        model=model,
        mode=FLAGS.mode,
        params=params,
        model_dir=model_dir)
  else:
    raise ValueError(
        'Expected config to be either type cfg.ExperimentConfig or '
        'multi_cfg.MultiTaskExperimentConfig, got %s' % type(params))

  train_utils.save_gin_config(FLAGS.mode, model_dir)
  performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype,
                                         params.runtime.loss_scale,
                                         use_experimental_api=True)
  distribution_strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu)
  with distribution_strategy.scope():
    task = task_factory.get_task(params.task, logging_dir=model_dir)

  train_lib.run_experiment(
      distribution_strategy=distribution_strategy,
      task=task,
      mode=FLAGS.mode,
      params=params,
      model_dir=model_dir)

  train_utils.save_gin_config(FLAGS.mode, model_dir)


if __name__ == '__main__':
  tfm_flags.define_flags()
  app.run(main)
def main(_):
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
  params = train_utils.parse_configuration(FLAGS)

  if params.runtime.num_hpus > 0:
    import os
    # TODO: remove when SW-49334 is fixed [SW-49404]
    os.environ["TF_DISABLE_EAGER_TO_FUNC_REWRITER"] = "1"
    from habana_frameworks.tensorflow import load_habana_module
    load_habana_module()

  if params.task.train_data.deterministic or params.task.validation_data.deterministic:
    import os
    os.environ['PYTHONHASHSEED'] = '0'
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    import numpy
    numpy.random.seed(0)
    import tensorflow as tf
    tf.random.set_seed(0)
    tf.compat.v1.set_random_seed(0)
    import random
    random.seed(0)

  if FLAGS.dtype == "bf16":
    print("Using bf16 config list {}".format(FLAGS.bf16_config_path))
    os.environ['TF_BF16_CONVERSION'] = FLAGS.bf16_config_path

  hls_addresses = str(os.environ.get("MULTI_HLS_IPS", "127.0.0.1")).split(",")
  TF_BASE_PORT = 2410
  mpi_rank = comm_rank()
  mpi_size = comm_size()

  if params.runtime.num_hpus > 1:
    model_dir = os.path.join(FLAGS.model_dir, "worker_" + str(mpi_rank))
  else:
    model_dir = FLAGS.model_dir

  # Prepare a comma-separated list of device addresses.
  worker_list = []
  for address in hls_addresses:
    for rank in range(mpi_size // len(hls_addresses)):
      worker_list.append(address + ':' + str(TF_BASE_PORT + rank))
  worker_hosts = ",".join(worker_list)
  task_index = mpi_rank

  # Configures cluster spec for distribution strategy.
  distribution_utils.configure_cluster(worker_hosts, task_index)

  if 'train' in FLAGS.mode:
    # Pure eval modes do not output yaml files. Otherwise continuous eval job
    # may race against the train job for writing the same file.
    train_utils.serialize_config(params, model_dir)

  # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
  # can have significant impact on model speeds by utilizing float16 in case of
  # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
  # dtype is float16
  if params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(
        params.runtime.mixed_precision_dtype)
  distribution_strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      num_hpus=params.runtime.num_hpus,
      tpu_address=params.runtime.tpu)
  with distribution_strategy.scope():
    task = task_factory.get_task(params.task, logging_dir=model_dir)

  train_lib.run_experiment(
      distribution_strategy=distribution_strategy,
      task=task,
      mode=FLAGS.mode,
      params=params,
      model_dir=model_dir)

  train_utils.save_gin_config(FLAGS.mode, model_dir)
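# Self-contained sketch (hypothetical addresses) of the worker-list logic
# above: with two HLS hosts in MULTI_HLS_IPS and four MPI ranks, each host
# contributes two consecutive ports starting at TF_BASE_PORT.
def build_worker_hosts(hls_addresses, mpi_size, base_port=2410):
  worker_list = []
  for address in hls_addresses:
    for rank in range(mpi_size // len(hls_addresses)):
      worker_list.append(address + ':' + str(base_port + rank))
  return ','.join(worker_list)

assert build_worker_hosts(['10.0.0.1', '10.0.0.2'], 4) == (
    '10.0.0.1:2410,10.0.0.1:2411,10.0.0.2:2410,10.0.0.2:2411')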