def train_or_eval(config: configure_pretraining.PretrainingConfig):
  """Run pre-training or evaluate the pre-trained model."""
  # initialize horovod for multi-GPU / multi-node data parallelism
  hvd.init()
  if config.do_train == config.do_eval:
    raise ValueError("Exactly one of `do_train` or `do_eval` must be True.")
  if config.debug:
    utils.rmkdir(config.model_dir)
  utils.heading("Config:")
  utils.log_config(config)

  # Only rank 0 writes to the main model dir; other ranks get a subdirectory.
  config.model_dir = config.model_dir if hvd.rank() == 0 else \
      os.path.join(config.model_dir, str(hvd.rank()))
  # Split the global batch sizes across the Horovod workers.
  config.train_batch_size = config.train_batch_size // hvd.size()
  config.eval_batch_size = config.eval_batch_size // hvd.size()

  is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
  tpu_cluster_resolver = None
  if config.use_tpu and config.tpu_name:
    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        config.tpu_name, zone=config.tpu_zone, project=config.gcp_project)
  tpu_config = tf.estimator.tpu.TPUConfig(
      iterations_per_loop=config.iterations_per_loop,
      num_shards=config.num_tpu_cores,
      tpu_job_name=config.tpu_job_name,
      per_host_input_for_training=is_per_host)

  # Pin each Horovod process to a single visible GPU.
  session_config = tf.ConfigProto()
  session_config.gpu_options.visible_device_list = str(hvd.local_rank())
  run_config = tf.estimator.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=config.model_dir,
      save_checkpoints_steps=config.save_checkpoints_steps,
      session_config=session_config,
      tpu_config=tpu_config)
  model_fn = model_fn_builder(config=config)
  estimator = tf.estimator.tpu.TPUEstimator(
      use_tpu=config.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=config.train_batch_size,
      eval_batch_size=config.eval_batch_size)

  if config.do_train:
    utils.heading("Running training")
    # Broadcast the initial variables from rank 0 so all workers start in sync.
    hooks = [hvd.BroadcastGlobalVariablesHook(0)]
    estimator.train(input_fn=pretrain_data.get_input_fn(config, True, hvd),
                    max_steps=config.num_train_steps, hooks=hooks)
  if config.do_eval:
    utils.heading("Running evaluation")
    result = estimator.evaluate(
        input_fn=pretrain_data.get_input_fn(config, False, hvd),
        steps=config.num_eval_steps)
    for key in sorted(result.keys()):
      utils.log(" {:} = {:}".format(key, str(result[key])))
    return result


def train_or_eval(config: configure_pretraining.PretrainingConfig):
  """Run pre-training or evaluate the pre-trained model."""
  if config.do_train == config.do_eval:
    raise ValueError("Exactly one of `do_train` or `do_eval` must be True.")
  if config.debug and config.do_train:
    utils.rmkdir(config.model_dir)
  utils.heading("Config:")
  utils.log_config(config)

  if config.use_tpu:
    is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
    tpu_cluster_resolver = None
    if config.use_tpu and config.tpu_name:
      tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
          config.tpu_name, zone=config.tpu_zone, project=config.gcp_project)
    tpu_config = tf.estimator.tpu.TPUConfig(
        iterations_per_loop=config.iterations_per_loop,
        num_shards=config.num_tpu_cores,
        tpu_job_name=config.tpu_job_name,
        per_host_input_for_training=is_per_host)
    run_config = tf.estimator.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=config.model_dir,
        save_checkpoints_steps=config.save_checkpoints_steps,
        keep_checkpoint_max=config.keep_checkpoint_max,
        tpu_config=tpu_config)
    model_fn = model_fn_builder(config=config)
    estimator = tf.estimator.tpu.TPUEstimator(
        use_tpu=config.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=config.train_batch_size,
        eval_batch_size=config.eval_batch_size)
  else:
    run_config = tf.estimator.RunConfig(
        model_dir=config.model_dir,
        save_checkpoints_steps=config.save_checkpoints_steps,
        keep_checkpoint_max=config.keep_checkpoint_max)
    model_fn = model_fn_builder(config=config)
    estimator = tf.estimator.Estimator(
        model_fn=tensorflow.contrib.estimator.replicate_model_fn(model_fn),
        config=run_config,
        params={"batch_size": config.train_batch_size})

  if config.do_train:
    utils.heading("Running training")
    estimator.train(input_fn=pretrain_data.get_input_fn(config, True),
                    max_steps=config.num_train_steps)
  if config.do_eval:
    utils.heading("Running evaluation")
    result = estimator.evaluate(
        input_fn=pretrain_data.get_input_fn(config, False),
        steps=config.num_eval_steps)
    for key in sorted(result.keys()):
      utils.log(" {:} = {:}".format(key, str(result[key])))
    return result


def train_or_eval(config: configure_pretraining.PretrainingConfig):
  """Run pre-training or evaluate the pre-trained model."""
  if config.do_train == config.do_eval:
    raise ValueError("Exactly one of `do_train` or `do_eval` must be True.")
  if config.debug and config.do_train:
    utils.rmkdir(config.model_dir)
  utils.heading("Config:")
  utils.log_config(config)

  # warm_start_settings = None
  # if config.init_checkpoint:
  #   from tensorflow.python.estimator.estimator import WarmStartSettings
  #   warm_start_settings = WarmStartSettings(
  #       ckpt_to_initialize_from=config.init_checkpoint,
  #       vars_to_warm_start=['^(?!.*global_step.*)(?!.*adam.*)(?!.*Adam.*).*$'])

  is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
  tpu_cluster_resolver = None
  if config.use_tpu and config.tpu_name:
    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        config.tpu_name, zone=config.tpu_zone, project=config.gcp_project)
  tpu_config = tf.estimator.tpu.TPUConfig(
      iterations_per_loop=config.iterations_per_loop,
      num_shards=config.num_tpu_cores,
      # tpu_job_name=config.tpu_job_name,
      per_host_input_for_training=is_per_host)
  run_config = tf.estimator.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=config.model_dir,
      save_checkpoints_steps=config.save_checkpoints_steps,
      tpu_config=tpu_config)
  model_fn = model_fn_builder(config=config)
  estimator = tf.estimator.tpu.TPUEstimator(
      use_tpu=config.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=config.train_batch_size,
      eval_batch_size=config.eval_batch_size)

  if config.do_train:
    utils.heading("Running training")
    estimator.train(input_fn=pretrain_data.get_input_fn(config, True),
                    max_steps=config.num_train_steps)
  if config.do_eval:
    utils.heading("Running evaluation")
    result = estimator.evaluate(
        input_fn=pretrain_data.get_input_fn(config, False),
        steps=config.num_eval_steps)
    for key in sorted(result.keys()):
      utils.log(" {:} = {:}".format(key, str(result[key])))
    return result


def train_or_eval(config: configure_pretraining.PretrainingConfig):
  """Run pre-training or evaluate the pre-trained model."""
  if config.do_train == config.do_eval:
    raise ValueError("Exactly one of `do_train` or `do_eval` must be True.")
  if config.debug and config.do_train:
    utils.rmkdir(config.model_dir)
  utils.heading("Config:")
  utils.log_config(config)

  is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
  tpu_cluster_resolver = None
  if config.use_tpu:
    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        zone=config.tpu_zone, project=config.gcp_project)
    print('Running on TPU ',
          tpu_cluster_resolver.cluster_spec().as_dict()['worker'])
  if tpu_cluster_resolver:
    tf.config.experimental_connect_to_cluster(tpu_cluster_resolver)
    tf.tpu.experimental.initialize_tpu_system(tpu_cluster_resolver)
  tpu_config = tf.estimator.tpu.TPUConfig(
      iterations_per_loop=config.iterations_per_loop,
      num_shards=config.num_tpu_cores,
      tpu_job_name=config.tpu_job_name,
      per_host_input_for_training=is_per_host)
  run_config = tf.estimator.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=config.model_dir,
      save_checkpoints_steps=config.save_checkpoints_steps,
      keep_checkpoint_max=config.keep_checkpoint_max,
      tpu_config=tpu_config)
  model_fn = model_fn_builder(config=config)
  estimator = tf.estimator.tpu.TPUEstimator(
      use_tpu=config.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=config.train_batch_size,
      eval_batch_size=config.eval_batch_size)

  if config.do_train:
    utils.heading("Running training")
    estimator.train(input_fn=pretrain_data.get_input_fn(config, True),
                    max_steps=config.num_train_steps)
  if config.do_eval:
    utils.heading("Running evaluation")
    result = estimator.evaluate(
        input_fn=pretrain_data.get_input_fn(config, False),
        steps=config.num_eval_steps)
    for key in sorted(result.keys()):
      utils.log(" {:} = {:}".format(key, str(result[key])))
    return result


def train_one_step(config: configure_pretraining.PretrainingConfig):
  """Builds an ELECTRA model and trains it for one step; useful for debugging."""
  train_input_fn = pretrain_data.get_input_fn(config, True)
  features = tf.data.make_one_shot_iterator(
      train_input_fn(dict(batch_size=config.train_batch_size))).get_next()
  model = PretrainingModel(config, features, True)
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    utils.log(sess.run(model.total_loss))


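# A hedged debugging sketch (not from the sources above): builds a config and runs
# a single training step via train_one_step. The model name and data directory are
# placeholders, and passing debug/do_train/do_eval as keyword overrides assumes the
# upstream ELECTRA PretrainingConfig(model_name, data_dir, **kwargs) constructor,
# which copies unknown kwargs onto config attributes.
def debug_single_step():
  debug_config = configure_pretraining.PretrainingConfig(
      "electra_debug", "/path/to/pretraining/data",
      debug=True, do_train=True, do_eval=False)
  train_one_step(debug_config)

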
def train_or_eval(config: configure_pretraining.PretrainingConfig):
  """Run pre-training or evaluate the pre-trained model."""
  if config.do_train == config.do_eval:
    raise ValueError("Exactly one of `do_train` or `do_eval` must be True.")
  if config.debug:
    utils.rmkdir(config.model_dir)
  utils.heading("Config:")
  utils.log_config(config)

  # get_available_gpus() returns the list of visible GPU devices.
  num_gpus = utils.get_available_gpus()
  utils.log("Found {} gpus".format(len(num_gpus)))

  if len(num_gpus) == 1:
    session_config = tf.ConfigProto(
        log_device_placement=True,
        allow_soft_placement=True,
        gpu_options=tf.GPUOptions(allow_growth=True))
    run_config = tf.estimator.RunConfig(
        model_dir=config.model_dir,
        save_checkpoints_steps=config.save_checkpoints_steps,
        # save_checkpoints_secs=3600,
        # tf_random_seed=FLAGS.seed,
        session_config=session_config,
        # keep_checkpoint_max=0,
        log_step_count_steps=100)
  else:
    # Mirror the model across all local GPUs and all-reduce gradients with NCCL.
    train_distribution_strategy = tf.distribute.MirroredStrategy(
        devices=None,
        cross_device_ops=tensorflow.contrib.distribute.AllReduceCrossDeviceOps(
            'nccl', num_packs=len(num_gpus)))
    eval_distribution_strategy = tf.distribute.MirroredStrategy(devices=None)
    session_config = tf.ConfigProto(
        # log_device_placement=True,
        inter_op_parallelism_threads=0,
        intra_op_parallelism_threads=0,
        allow_soft_placement=True,
        gpu_options=tf.GPUOptions(allow_growth=True))
    run_config = tf.estimator.RunConfig(
        model_dir=config.model_dir,
        save_checkpoints_steps=config.save_checkpoints_steps,
        train_distribute=train_distribution_strategy,
        eval_distribute=eval_distribution_strategy,
        # save_checkpoints_secs=3600,
        # tf_random_seed=FLAGS.seed,
        session_config=session_config,
        # keep_checkpoint_max=0,
        log_step_count_steps=100)

  model_fn = model_fn_builder(config=config)
  estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      config=run_config,
      params={'train_batch_size': config.train_batch_size,
              'eval_batch_size': config.eval_batch_size})

  if config.do_train:
    utils.heading("Running training")
    estimator.train(input_fn=pretrain_data.get_input_fn(config, True),
                    max_steps=config.num_train_steps)
  if config.do_eval:
    utils.heading("Running evaluation")
    result = estimator.evaluate(
        input_fn=pretrain_data.get_input_fn(config, False),
        steps=config.num_eval_steps)
    for key in sorted(result.keys()):
      utils.log(" {:} = {:}".format(key, str(result[key])))
    return result


def train_or_eval(config: PretrainingConfig):
  """Run pre-training or evaluate the pre-trained model."""
  if config.do_train == config.do_eval:
    raise ValueError("Exactly one of `do_train` or `do_eval` must be True.")
  if config.debug and config.do_train:
    utils.rmkdir(config.model_dir)
  utils.heading("Config:")
  utils.log_config(config)

  # session config
  session_config = tf.ConfigProto()
  session_config.gpu_options.allow_growth = True
  session_config.gpu_options.visible_device_list = str(
      hvd.local_rank())  # one gpu per process
  # session_config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1  # xla
  # session_config.graph_options.rewrite_options.memory_optimization = rewriter_config_pb2.RewriterConfig.NO_MEM_OPT  # xla

  # run config
  # following the example at https://gist.github.com/alsrgv/34a32f30292f4e2c1fa29ec0d65dea26
  # model_dir = config.model_dir if hvd.rank() == 0 else None
  # UPD: if model_dir == None, the Estimator saves checkpoints to /tmp by default,
  # which eats up the system disk
  run_config = tf.estimator.RunConfig(
      model_dir=config.model_dir,
      session_config=session_config,
      save_checkpoints_steps=config.save_checkpoints_steps if hvd.rank() == 0 else None,
      save_summary_steps=100 if hvd.rank() == 0 else 0,
      keep_checkpoint_max=config.keep_checkpoint_max,
      log_step_count_steps=10000)

  # model_fn
  model_fn = model_fn_builder(config=config)

  # training hooks
  training_hooks = []
  if hvd.size() > 1:
    training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))

  # estimator
  estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)

  if config.do_train:
    utils.heading("Running training")
    input_fn = pretrain_data.get_input_fn(
        pretrain_tfrecords=config.pretrain_tfrecords,
        max_seq_length=config.max_seq_length,
        batch_size=config.train_batch_size,
        is_training=True,
        hvd=hvd,
        num_cpu_threads=8)
    estimator.train(input_fn=input_fn,
                    hooks=training_hooks,
                    max_steps=config.num_train_steps)
  if config.do_eval:
    utils.heading("Running evaluation")
    input_fn = pretrain_data.get_input_fn(
        pretrain_tfrecords=config.pretrain_tfrecords,
        max_seq_length=config.max_seq_length,
        batch_size=config.eval_batch_size,
        is_training=False,
        hvd=hvd,
        num_cpu_threads=8)
    result = estimator.evaluate(input_fn=input_fn, steps=config.num_eval_steps)
    for key in sorted(result.keys()):
      utils.log(" {:} = {:}".format(key, str(result[key])))
    return result


def train_or_eval(config: configure_pretraining.PretrainingConfig):
  """Run pre-training or evaluate the pre-trained model."""
  if config.do_train == config.do_eval:
    raise ValueError("Exactly one of `do_train` or `do_eval` must be True.")
  if config.debug:
    utils.rmkdir(config.model_dir)
  utils.heading("Config:")
  utils.log_config(config)

  if config.use_tpu:
    is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
    tpu_cluster_resolver = None
    if config.use_tpu and config.tpu_name:
      tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
          config.tpu_name, zone=config.tpu_zone, project=config.gcp_project)
    tpu_config = tf.estimator.tpu.TPUConfig(
        iterations_per_loop=config.iterations_per_loop,
        num_shards=config.num_tpu_cores,
        tpu_job_name=config.tpu_job_name,
        per_host_input_for_training=is_per_host)
    run_config = tf.estimator.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=config.model_dir,
        save_checkpoints_steps=config.save_checkpoints_steps,
        tpu_config=tpu_config)
    model_fn = model_fn_builder(config=config)
    estimator = tf.estimator.tpu.TPUEstimator(
        use_tpu=config.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=config.train_batch_size,
        eval_batch_size=config.eval_batch_size)
  else:
    config_proto = tf.ConfigProto()
    config_proto.gpu_options.allow_growth = True
    run_config = tf.estimator.RunConfig(
        model_dir=config.model_dir,
        save_checkpoints_steps=config.save_checkpoints_steps,
        session_config=config_proto)
    model_fn = model_fn_builder(config=config)
    if config.saved_model:
      estimator = tf.estimator.Estimator(
          model_fn=model_fn,
          config=run_config,
          warm_start_from=config.saved_model)
    else:
      estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)

  if config.do_train:
    utils.heading("Running training")
    estimator.train(input_fn=pretrain_data.get_input_fn(config, True),
                    max_steps=config.num_train_steps)
  if config.do_eval:
    utils.heading("Running evaluation")
    result = estimator.evaluate(
        input_fn=pretrain_data.get_input_fn(config, False),
        steps=config.num_eval_steps)
    for key in sorted(result.keys()):
      utils.log(" {:} = {:}".format(key, str(result[key])))
    return result


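# A minimal command-line driver sketch for the train_or_eval variants above,
# modeled on the upstream ELECTRA run_pretraining.py entry point. It assumes the
# configure_pretraining.PretrainingConfig(model_name, data_dir, **hparams)
# constructor and the utils.load_json helper from that repo; argparse/json would
# normally be imported at the top of the module.
import argparse
import json


def main():
  parser = argparse.ArgumentParser(description=__doc__)
  parser.add_argument("--data-dir", required=True,
                      help="Location of data files (vocab file, corpus, etc.).")
  parser.add_argument("--model-name", required=True,
                      help="The name of the model being pre-trained.")
  parser.add_argument("--hparams", default="{}",
                      help="JSON dict of model hyperparameters.")
  args = parser.parse_args()
  # Hyperparameter overrides can be given inline as JSON or as a path to a .json file.
  if args.hparams.endswith(".json"):
    hparams = utils.load_json(args.hparams)
  else:
    hparams = json.loads(args.hparams)
  tf.logging.set_verbosity(tf.logging.ERROR)
  train_or_eval(configure_pretraining.PretrainingConfig(
      args.model_name, args.data_dir, **hparams))


if __name__ == "__main__":
  main()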