def __init__(self, config: configure_finetuning.FinetuningConfig, tasks,
             pretraining_config=None):
  self._config = config
  self._tasks = tasks
  self._preprocessor = preprocessing.Preprocessor(config, self._tasks)

  is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
  tpu_cluster_resolver = None
  if config.use_tpu:
    # With no TPU name given, the resolver falls back to the environment
    # (e.g. the TPU_NAME variable on Colab/Cloud).
    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        zone=config.tpu_zone, project=config.gcp_project)
    print("Running on TPU ",
          tpu_cluster_resolver.cluster_spec().as_dict()["worker"])
  if tpu_cluster_resolver:
    tf.config.experimental_connect_to_cluster(tpu_cluster_resolver)
    tf.tpu.experimental.initialize_tpu_system(tpu_cluster_resolver)
    strategy = tf.distribute.experimental.TPUStrategy(tpu_cluster_resolver)
  else:
    strategy = tf.distribute.get_strategy()
  # The strategy is only used to report the replica count here; the
  # TPUEstimator below handles distribution itself.
  print("REPLICAS: ", strategy.num_replicas_in_sync)

  tpu_config = tf.estimator.tpu.TPUConfig(
      iterations_per_loop=config.iterations_per_loop,
      num_shards=config.num_tpu_cores,
      per_host_input_for_training=is_per_host,
      tpu_job_name=config.tpu_job_name)
  run_config = tf.estimator.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=config.model_dir,
      save_checkpoints_steps=config.save_checkpoints_steps,
      save_checkpoints_secs=None,
      tpu_config=tpu_config)

  if self._config.do_train:
    (self._train_input_fn,
     self.train_steps) = self._preprocessor.prepare_train()
  else:
    self._train_input_fn, self.train_steps = None, 0

  model_fn = model_fn_builder(
      config=config, tasks=self._tasks,
      num_train_steps=self.train_steps,
      pretraining_config=pretraining_config)
  self._estimator = tf.estimator.tpu.TPUEstimator(
      use_tpu=config.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=config.train_batch_size,
      eval_batch_size=config.eval_batch_size,
      predict_batch_size=config.predict_batch_size)
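# A minimal usage sketch, assuming this constructor belongs to a ModelRunner
# class and that tasks are built with task_builder.get_tasks, as in the
# ELECTRA codebase; the model name and data path below are placeholder
# assumptions, not values from this snippet.
config = configure_finetuning.FinetuningConfig(
    "electra_small", "/path/to/data", use_tpu=True)
tasks = task_builder.get_tasks(config)
runner = ModelRunner(config, tasks)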
def __init__(self, config: configure_finetuning.FinetuningConfig, tasks,
             pretraining_config=None):
  self._config = config
  self._tasks = tasks
  self._preprocessor = preprocessing.Preprocessor(config, self._tasks)

  is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
  tpu_cluster_resolver = None
  if config.use_tpu and config.tpu_name:
    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        config.tpu_name, zone=config.tpu_zone, project=config.gcp_project)
  tpu_config = tf.estimator.tpu.TPUConfig(
      iterations_per_loop=config.iterations_per_loop,
      num_shards=config.num_tpu_cores,
      per_host_input_for_training=is_per_host,
      tpu_job_name=config.tpu_job_name)
  # Let ops without a TPU/GPU kernel fall back to CPU instead of failing.
  session_config = tf.ConfigProto(allow_soft_placement=True)
  run_config = tf.estimator.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=config.model_dir,
      save_checkpoints_steps=config.save_checkpoints_steps,
      save_checkpoints_secs=None,
      tpu_config=tpu_config,
      session_config=session_config,
      keep_checkpoint_max=config.max_save)

  if self._config.do_train:
    (self._train_input_fn,
     self.train_steps) = self._preprocessor.prepare_train()
  else:
    self._train_input_fn, self.train_steps = None, 0

  model_fn = model_fn_builder(
      config=config, tasks=self._tasks,
      num_train_steps=self.train_steps,
      pretraining_config=pretraining_config)
  self._estimator = tf.estimator.tpu.TPUEstimator(
      use_tpu=config.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=config.train_batch_size,
      eval_batch_size=config.eval_batch_size,
      predict_batch_size=config.predict_batch_size)
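# Sketch of how the constructed TPUEstimator is typically driven afterwards;
# a train() method along these lines is assumed to sit next to __init__ on
# the same class. estimator.train(input_fn, max_steps) is the standard
# tf.estimator API, and self._train_input_fn / self.train_steps come from
# the preprocessor above.
def train(self):
  utils.log("Training for {} steps".format(self.train_steps))
  self._estimator.train(
      input_fn=self._train_input_fn, max_steps=self.train_steps)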
def __init__(self, config: configure_finetuning.FinetuningConfig, tasks, hvd,
             pretraining_config=None):
  self._config = config
  self._tasks = tasks
  self._preprocessor = preprocessing.Preprocessor(config, self._tasks)
  # Sync initial variable values from rank 0 to all other Horovod workers.
  self._hooks = [hvd.BroadcastGlobalVariablesHook(0)]

  is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
  tpu_cluster_resolver = None
  if config.use_tpu and config.tpu_name:
    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        config.tpu_name, zone=config.tpu_zone, project=config.gcp_project)
  # Pin each Horovod process to a single, distinct local GPU.
  session_config = tf.ConfigProto()
  session_config.gpu_options.visible_device_list = str(hvd.local_rank())
  tpu_config = tf.estimator.tpu.TPUConfig(
      iterations_per_loop=config.iterations_per_loop,
      num_shards=config.num_tpu_cores,
      per_host_input_for_training=is_per_host,
      tpu_job_name=config.tpu_job_name)
  run_config = tf.estimator.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=config.model_dir,
      save_checkpoints_steps=config.save_checkpoints_steps,
      session_config=session_config,
      save_checkpoints_secs=None,
      tpu_config=tpu_config)

  if self._config.do_train:
    (self._train_input_fn,
     self.train_steps) = self._preprocessor.prepare_train()
  else:
    self._train_input_fn, self.train_steps = None, 0

  model_fn = model_fn_builder(
      config=config, tasks=self._tasks,
      num_train_steps=self.train_steps,
      pretraining_config=pretraining_config)
  self._estimator = tf.estimator.tpu.TPUEstimator(
      use_tpu=config.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=config.train_batch_size,
      eval_batch_size=config.eval_batch_size,
      predict_batch_size=config.predict_batch_size)
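# The Horovod variant above expects the usual Horovod-for-TF1 setup to have
# run in the launching script before the runner is constructed. A minimal
# sketch: hvd.init()/rank() are Horovod's documented API, while the rank-0
# checkpointing convention is an assumption about how the launcher is set up.
import horovod.tensorflow as hvd

hvd.init()  # one process per GPU, launched via horovodrun or mpirun
if hvd.rank() != 0:
  config.model_dir = None  # let only rank 0 write checkpoints
runner = ModelRunner(config, tasks, hvd)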
def __init__(self, config: configure_finetuning.FinetuningConfig, tasks,
             pretraining_config=None):
  self._config = config
  self._tasks = tasks
  self._preprocessor = preprocessing.Preprocessor(config, self._tasks)

  # get_available_gpus() returns a list of device names, so branch on its
  # length rather than comparing the list itself to an int.
  num_gpus = utils.get_available_gpus()
  utils.log("Found {} gpus".format(len(num_gpus)))

  if len(num_gpus) == 1:
    session_config = tf.ConfigProto(
        log_device_placement=True,
        allow_soft_placement=True,
        gpu_options=tf.GPUOptions(allow_growth=True))
    run_config = tf.estimator.RunConfig(
        model_dir=config.model_dir,
        save_checkpoints_steps=config.save_checkpoints_steps,
        # save_checkpoints_secs=3600,
        # tf_random_seed=FLAGS.seed,
        session_config=session_config,
        # keep_checkpoint_max=0,
        log_step_count_steps=100)
  else:
    # Mirror the model across all GPUs and all-reduce gradients with NCCL.
    train_distribution_strategy = tf.distribute.MirroredStrategy(
        devices=None,
        cross_device_ops=tf.contrib.distribute.AllReduceCrossDeviceOps(
            "nccl", num_packs=len(num_gpus)))
    eval_distribution_strategy = tf.distribute.MirroredStrategy(devices=None)
    session_config = tf.ConfigProto(
        # log_device_placement=True,
        inter_op_parallelism_threads=0,
        intra_op_parallelism_threads=0,
        allow_soft_placement=True,
        gpu_options=tf.GPUOptions(allow_growth=True))
    run_config = tf.estimator.RunConfig(
        model_dir=config.model_dir,
        save_checkpoints_steps=config.save_checkpoints_steps,
        train_distribute=train_distribution_strategy,
        eval_distribute=eval_distribution_strategy,
        # save_checkpoints_secs=3600,
        # tf_random_seed=FLAGS.seed,
        session_config=session_config,
        # keep_checkpoint_max=0,
        log_step_count_steps=100)

  if self._config.do_train:
    (self._train_input_fn,
     self.train_steps) = self._preprocessor.prepare_train()
  else:
    self._train_input_fn, self.train_steps = None, 0

  model_fn = model_fn_builder(
      config=config, tasks=self._tasks,
      num_train_steps=self.train_steps,
      pretraining_config=pretraining_config)
  self._estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      config=run_config,
      params={"train_batch_size": config.train_batch_size,
              "eval_batch_size": config.eval_batch_size})
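# utils.get_available_gpus() is not shown in this snippet; a plausible
# implementation is the common TF1 recipe below using device_lib, which
# returns the list whose length the code above checks. The actual helper in
# the surrounding codebase may differ.
from tensorflow.python.client import device_lib

def get_available_gpus():
  """Returns the names of all visible GPU devices, e.g. ['/device:GPU:0']."""
  return [d.name for d in device_lib.list_local_devices()
          if d.device_type == "GPU"]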