def main(unused_argv):
  """Entry point: run continuous evaluation, or distributed training.

  If --checkpoint_dir is given, evaluation mode is selected and each new
  checkpoint is evaluated as it appears; otherwise a distribution strategy
  is chosen (TPU / multi-worker / mirrored) and the train loop runs.
  """
  flags.mark_flag_as_required('model_dir')
  flags.mark_flag_as_required('pipeline_config_path')
  tf.config.set_soft_device_placement(True)

  if FLAGS.checkpoint_dir:
    # Evaluation mode: poll checkpoint_dir and evaluate new checkpoints.
    model_lib_v2.eval_continuously(
        pipeline_config_path=FLAGS.pipeline_config_path,
        model_dir=FLAGS.model_dir,
        train_steps=FLAGS.num_train_steps,
        sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
        sample_1_of_n_eval_on_train_examples=(
            FLAGS.sample_1_of_n_eval_on_train_examples),
        checkpoint_dir=FLAGS.checkpoint_dir,
        wait_interval=300,
        timeout=FLAGS.eval_timeout)
    return

  # Training mode: pick the distribution strategy first.
  if FLAGS.use_tpu:
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.experimental.TPUStrategy(resolver)
  elif FLAGS.num_workers > 1:
    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
  else:
    strategy = tf.compat.v2.distribute.MirroredStrategy()

  with strategy.scope():
    model_lib_v2.train_loop(
        pipeline_config_path=FLAGS.pipeline_config_path,
        model_dir=FLAGS.model_dir,
        train_steps=FLAGS.num_train_steps,
        use_tpu=FLAGS.use_tpu)
def test_train_loop_then_eval_loop(self):
  """Runs a short train loop, then continuous eval over its checkpoints.

  NOTE(review): the previous docstring mentioned an Estimator, but no
  Estimator is constructed here; this exercises model_lib_v2.train_loop
  followed by model_lib_v2.eval_continuously end to end.
  """
  model_dir = tf.test.get_temp_dir()
  pipeline_config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST)
  # Rewrite the config without a fine-tune checkpoint so training starts
  # from scratch inside the temp dir.
  new_pipeline_config_path = os.path.join(model_dir, 'new_pipeline.config')
  config_util.clear_fine_tune_checkpoint(pipeline_config_path,
                                         new_pipeline_config_path)
  config_kwarg_overrides = _get_config_kwarg_overrides()
  train_steps = 2
  # Two CPU replicas keep the distributed code path covered without GPUs.
  strategy = tf2.distribute.MirroredStrategy(['/cpu:0', '/cpu:1'])
  with strategy.scope():
    model_lib_v2.train_loop(
        new_pipeline_config_path,
        model_dir=model_dir,
        train_steps=train_steps,
        checkpoint_every_n=1,
        **config_kwarg_overrides)
  # Short wait_interval/timeout: checkpoints already exist, so eval should
  # finish quickly rather than block the test suite.
  model_lib_v2.eval_continuously(
      new_pipeline_config_path,
      model_dir=model_dir,
      checkpoint_dir=model_dir,
      train_steps=train_steps,
      wait_interval=1,
      timeout=10,
      **config_kwarg_overrides)
def test_train_loop_then_eval_loop(self):
  """Runs a short train loop, then evaluates the resulting checkpoints.

  NOTE(review): the previous docstring mentioned an Estimator, but no
  Estimator is constructed here; this is the hparams-based variant of the
  train_loop / eval_continuously round trip.
  """
  hparams = model_hparams.create_hparams(
      hparams_overrides='load_pretrained=false')
  pipeline_config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST)
  config_kwarg_overrides = _get_config_kwarg_overrides()
  model_dir = tf.test.get_temp_dir()
  train_steps = 2
  model_lib_v2.train_loop(
      hparams,
      pipeline_config_path,
      model_dir=model_dir,
      train_steps=train_steps,
      checkpoint_every_n=1,
      **config_kwarg_overrides)
  # No explicit timeout here — relies on eval_continuously's default;
  # presumably acceptable for this test's runtime (TODO confirm).
  model_lib_v2.eval_continuously(
      hparams,
      pipeline_config_path,
      model_dir=model_dir,
      checkpoint_dir=model_dir,
      train_steps=train_steps,
      wait_interval=10,
      **config_kwarg_overrides)
def evaluate(self):
    """Evaluate all training checkpoints found in the checkpoint path."""
    ckpt_dir = self.checkpoint_path
    model_lib_v2.eval_continuously(
        pipeline_config_path=self.config_path,
        model_dir=ckpt_dir,
        checkpoint_dir=ckpt_dir,
        # Post-processing on CPU keeps accelerator memory free during eval.
        postprocess_on_cpu=True)
def eval_continuously(self):
    """Run the continuous-evaluation loop under a MirroredStrategy."""
    print("Running evaluation loop...")
    pipeline_config = os.path.join(self._training_loop_path,
                                   "pipeline.config")
    with tf.distribute.MirroredStrategy().scope():
        model_lib_v2.eval_continuously(
            pipeline_config_path=pipeline_config,
            model_dir=self._training_loop_path,
            checkpoint_dir=self._training_loop_path)
def evaluate(_):
    """Continuously evaluate checkpoints written to FLAGS.model_dir."""
    model_dir = FLAGS.model_dir
    eval_continuously(
        pipeline_config_path=os.path.join(model_dir, 'pipeline.config'),
        train_steps=FLAGS.train_steps,
        model_dir=model_dir,
        checkpoint_dir=model_dir,
        wait_interval=FLAGS.wait_interval,
        timeout=FLAGS.timeout)
def main(unused_argv):
  """Entry point: restrict TF to one GPU, then run eval or training.

  Fix: the strategy selection and ``strategy.scope()`` around
  ``train_loop`` were commented out, so the ``TPUStrategy`` built in the
  ``use_tpu`` branch was created but never applied — TPU training would
  silently run without the strategy. The scope is restored, falling back
  to ``MirroredStrategy`` (stock model_main_tf2.py behaviour; with a
  single visible GPU this is effectively single-device training).
  NOTE(review): the commented-out ``FLAGS.num_workers`` branch is not
  restored because the flag's definition cannot be confirmed here.
  """
  # Set the GPU (device:GPU:0): expose only the first GPU and enable
  # memory growth so TF does not grab all device memory up front.
  gpus = tf.config.experimental.list_physical_devices('GPU')
  print("Num GPUs Available: ", len(gpus))
  if gpus:
    # Restrict TensorFlow to only use the first GPU.
    try:
      tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
      tf.config.experimental.set_memory_growth(gpus[0], True)
      logical_gpus = tf.config.experimental.list_logical_devices('GPU')
      print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
      # Visible devices must be set before GPUs have been initialized.
      print(e)

  flags.mark_flag_as_required('model_dir')
  flags.mark_flag_as_required('pipeline_config_path')
  tf.config.set_soft_device_placement(True)

  if FLAGS.checkpoint_dir:
    model_lib_v2.eval_continuously(
        pipeline_config_path=FLAGS.pipeline_config_path,
        model_dir=FLAGS.model_dir,
        train_steps=FLAGS.num_train_steps,
        sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
        sample_1_of_n_eval_on_train_examples=(
            FLAGS.sample_1_of_n_eval_on_train_examples),
        checkpoint_dir=FLAGS.checkpoint_dir,
        wait_interval=300,
        timeout=FLAGS.eval_timeout)
  else:
    if FLAGS.use_tpu:
      # TPU is automatically inferred if tpu_name is None and
      # we are running under cloud ai-platform.
      resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
          FLAGS.tpu_name)
      tf.config.experimental_connect_to_cluster(resolver)
      tf.tpu.experimental.initialize_tpu_system(resolver)
      strategy = tf.distribute.experimental.TPUStrategy(resolver)
    else:
      strategy = tf.compat.v2.distribute.MirroredStrategy()
    with strategy.scope():
      model_lib_v2.train_loop(
          pipeline_config_path=FLAGS.pipeline_config_path,
          model_dir=FLAGS.model_dir,
          train_steps=FLAGS.num_train_steps,
          use_tpu=FLAGS.use_tpu,
          checkpoint_every_n=FLAGS.checkpoint_every_n,
          record_summaries=FLAGS.record_summaries)
def evaluate_model(self, hyper_params: HyperParameterInformation) -> None:
    """Continuously evaluate checkpoints produced in the model directory.

    Polls every 180 s and gives up after one hour without new checkpoints.
    """
    model_dir = self.path.model_dir
    eval_continuously(
        pipeline_config_path=os.path.join(model_dir, 'pipeline.config'),
        train_steps=hyper_params.training_steps,
        model_dir=model_dir,
        checkpoint_dir=model_dir,
        override_eval_num_epochs=False,
        wait_interval=180,
        timeout=3600)
def main(unused_argv):
  """Entry point: evaluate checkpoints (all or continuously) or train.

  With --checkpoint_dir set, either every checkpoint is evaluated once
  (--eval_all_checkpoints) or new checkpoints are evaluated as they
  appear. Otherwise the train loop runs under the chosen strategy.
  """
  flags.mark_flag_as_required('model_dir')
  flags.mark_flag_as_required('pipeline_config_path')
  tf.config.set_soft_device_placement(True)

  if FLAGS.checkpoint_dir:
    # Keyword arguments shared by both evaluation entry points.
    eval_kwargs = dict(
        pipeline_config_path=FLAGS.pipeline_config_path,
        model_dir=FLAGS.model_dir,
        train_steps=FLAGS.num_train_steps,
        sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
        sample_1_of_n_eval_on_train_examples=(
            FLAGS.sample_1_of_n_eval_on_train_examples),
        checkpoint_dir=FLAGS.checkpoint_dir)
    if FLAGS.eval_all_checkpoints:
      model_lib_v2.eval_all_checkpoints(**eval_kwargs)
    else:
      model_lib_v2.eval_continuously(
          wait_interval=300, timeout=FLAGS.eval_timeout, **eval_kwargs)
    return

  # Training mode: select the distribution strategy.
  if FLAGS.use_tpu:
    # TPU is automatically inferred if tpu_name is None and
    # we are running under cloud ai-platform.
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name)
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.experimental.TPUStrategy(resolver)
  elif FLAGS.num_workers > 1:
    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
  else:
    strategy = tf.compat.v2.distribute.MirroredStrategy()

  with strategy.scope():
    model_lib_v2.train_loop(
        pipeline_config_path=FLAGS.pipeline_config_path,
        model_dir=FLAGS.model_dir,
        train_steps=FLAGS.num_train_steps,
        use_tpu=FLAGS.use_tpu,
        checkpoint_every_n=FLAGS.checkpoint_every_n,
        record_summaries=FLAGS.record_summaries)
def main(_):
  """Entry point driven by a JSON config file instead of command flags.

  Reads 'system_dict.json' and dispatches to continuous evaluation (when
  "checkpoint_dir" is set) or to the train loop under a strategy.
  """
  with open('system_dict.json') as json_file:
    cfg = json.load(json_file)
  tf.config.set_soft_device_placement(True)

  if cfg["checkpoint_dir"]:
    # Evaluation mode.
    model_lib_v2.eval_continuously(
        pipeline_config_path=cfg["pipeline_config_path"],
        model_dir=cfg["model_dir"],
        train_steps=cfg["num_train_steps"],
        sample_1_of_n_eval_examples=cfg["sample_1_of_n_eval_examples"],
        sample_1_of_n_eval_on_train_examples=(
            cfg["sample_1_of_n_eval_on_train_examples"]),
        checkpoint_dir=cfg["checkpoint_dir"],
        wait_interval=300,
        timeout=cfg["eval_timeout"])
    return

  # Training mode: choose the distribution strategy.
  if cfg["use_tpu"]:
    # TPU is automatically inferred if tpu_name is None and
    # we are running under cloud ai-platform.
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        cfg["tpu_name"])
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.experimental.TPUStrategy(resolver)
  elif cfg["num_workers"] > 1:
    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
  else:
    strategy = tf.compat.v2.distribute.MirroredStrategy()

  with strategy.scope():
    model_lib_v2.train_loop(
        pipeline_config_path=cfg["pipeline_config_path"],
        model_dir=cfg["model_dir"],
        train_steps=cfg["num_train_steps"],
        use_tpu=cfg["use_tpu"],
        checkpoint_every_n=cfg["checkpoint_every_n"],
        record_summaries=cfg["record_summaries"])
def main(unused_argv):
  """Entry point: enable GPU memory growth, then run eval or training.

  Fix: the training banner said "traingin!"; corrected to "training!".
  All other runtime strings are unchanged.
  """
  if FLAGS.checkpoint_dir:
    print("\n-------Running evaluation")
  else:
    print("\n-------Running training!")
  flags.mark_flag_as_required('model_dir')
  flags.mark_flag_as_required('pipeline_config_path')
  tf.config.set_soft_device_placement(True)
  print(
      "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
  )
  gpus = tf.config.experimental.list_physical_devices('GPU')
  if gpus:
    try:
      # Currently, memory growth needs to be the same across GPUs.
      for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
      logical_gpus = tf.config.experimental.list_logical_devices('GPU')
      print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
      # Memory growth must be set before GPUs have been initialized.
      print(e)
  print(
      "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
  )
  if FLAGS.checkpoint_dir:
    model_lib_v2.eval_continuously(
        pipeline_config_path=FLAGS.pipeline_config_path,
        model_dir=FLAGS.model_dir,
        train_steps=FLAGS.num_train_steps,
        sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
        sample_1_of_n_eval_on_train_examples=(
            FLAGS.sample_1_of_n_eval_on_train_examples),
        checkpoint_dir=FLAGS.checkpoint_dir,
        wait_interval=300,
        timeout=FLAGS.eval_timeout)
  else:
    if FLAGS.use_tpu:
      # TPU is automatically inferred if tpu_name is None and
      # we are running under cloud ai-platform.
      resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
          FLAGS.tpu_name)
      tf.config.experimental_connect_to_cluster(resolver)
      tf.tpu.experimental.initialize_tpu_system(resolver)
      strategy = tf.distribute.experimental.TPUStrategy(resolver)
    elif FLAGS.num_workers > 1:
      strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
    else:
      strategy = tf.compat.v2.distribute.MirroredStrategy()
    with strategy.scope():
      model_lib_v2.train_loop(
          pipeline_config_path=FLAGS.pipeline_config_path,
          model_dir=FLAGS.model_dir,
          save_final_config=True,
          train_steps=FLAGS.num_train_steps,
          use_tpu=FLAGS.use_tpu,
          checkpoint_every_n=FLAGS.checkpoint_every_n,
          record_summaries=FLAGS.record_summaries)