def __init__(self, hparams, train_iterations, eval_steps, per_host_v1=False):
  tf.logging.info("TrainLowLevelRunner: constructor")

  self.feature_structure = {}
  self.eval_feature_structure = {}
  self.loss = None
  self.infeed_queue = []
  self.eval_infeed_queue = []
  self.enqueue_ops = []
  self.eval_enqueue_ops = []
  self.dataset_initializer = []
  self.eval_dataset_initializer = []
  self.is_local = ((hparams.master == "") and (hparams.tpu_name is None))
  self.per_host_v1 = per_host_v1
  self.iterations = train_iterations
  self.eval_steps = eval_steps
  self.outfeed_tensors = []
  self.outfeed_names = []
  self.dequeue_ops = []
  self.predictions = {}
  self.sess = None
  self.graph = tf.Graph()
  self.hparams = hparams
  self.num_hosts = hparams.num_shards // hparams.num_shards_per_host
  with self.graph.as_default():
    self.tpu_init = [tpu.initialize_system()]
    self.tpu_shutdown = tpu.shutdown_system()
  self.resolver = get_resolver(hparams)
  session_config = tf.ConfigProto(
      allow_soft_placement=True,
      isolate_session_state=True,
      operation_timeout_in_ms=600 * 60 * 1000,
      graph_options=tf.GraphOptions(
          rewrite_options=rewriter_config_pb2.RewriterConfig(
              disable_meta_optimizer=True)))
  if self.hparams.tpu_name is None:
    master = self.hparams.master
  else:
    cluster_spec = self.resolver.cluster_spec()
    if cluster_spec:
      session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
    master = self.resolver.get_master()
  self.sess = tf.Session(master, graph=self.graph, config=session_config)
  self.sess.run(self.tpu_init)
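# A minimal sketch of the lifecycle this constructor sets up: build the
# initialize/shutdown ops inside the graph, then run them through the same
# session. The empty master string (local target) is an assumption; a real
# run passes the resolver's master. Import paths follow TF 1.x conventions
# and may differ across TF versions.
import tensorflow.compat.v1 as tf
from tensorflow.python.tpu import tpu

graph = tf.Graph()
with graph.as_default():
  tpu_init = tpu.initialize_system()    # init op, built in the graph
  tpu_shutdown = tpu.shutdown_system()  # matching shutdown op

sess = tf.Session("", graph=graph)
sess.run(tpu_init)      # must run before any TPU computation
# ... build and run infeed/train/outfeed ops here ...
sess.run(tpu_shutdown)  # release TPU resources when finished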
def run_inference(inputs,
                  pipeline_config_file,
                  ckpt_path,
                  input_type='encoded_image_string_tensor',
                  use_bfloat16=False,
                  repeat=1):
  """Runs inference on TPU.

  Args:
    inputs: Input image with the same type as `input_type`.
    pipeline_config_file: Pipeline config file name.
    ckpt_path: Training checkpoint path.
    input_type: One of
      'encoded_image_string_tensor': a 1d tensor with dtype=tf.string
      'image_tensor': a 4d tensor with dtype=tf.uint8
      'tf_example': a 1d tensor with dtype=tf.string
    use_bfloat16: If true, use tf.bfloat16 on TPU.
    repeat: Number of times to repeat running the provided input for
      profiling.

  Returns:
    A dict of resulting tensors.
  """
  pipeline_config, meta_arch = parse_pipeline_config(pipeline_config_file)

  shapes_info = model_map[meta_arch].get_prediction_tensor_shapes(
      pipeline_config)

  with tf.Graph().as_default(), tf.Session() as sess:
    placeholder_tensor, result_tensor_dict = model_map[meta_arch].build_graph(
        pipeline_config, shapes_info, input_type, use_bfloat16)

    saver = tf.train.Saver()
    init_op = tf.global_variables_initializer()

    sess.run(tpu.initialize_system())
    sess.run(init_op)
    if ckpt_path is not None:
      saver.restore(sess, ckpt_path)

    for _ in range(repeat):
      tensor_dict_out = sess.run(
          result_tensor_dict, feed_dict={placeholder_tensor: [inputs]})

    sess.run(tpu.shutdown_system())

    return tensor_dict_out
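# Hypothetical usage sketch for run_inference; the image, config, and
# checkpoint paths below are placeholders, not part of the original code.
with tf.gfile.GFile("/tmp/image.jpg", "rb") as f:
  image_bytes = f.read()

outputs = run_inference(
    inputs=image_bytes,
    pipeline_config_file="/tmp/pipeline.config",
    ckpt_path="/tmp/model.ckpt",
    input_type="encoded_image_string_tensor",
    use_bfloat16=True,
    repeat=10)  # repeat > 1 amortizes compilation time when profiling
print(list(outputs.keys()))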
def run_inference_from_saved_model(inputs,
                                   saved_model_dir,
                                   input_placeholder_name='placeholder_tensor',
                                   repeat=1):
  """Loads a saved model and runs inference on TPU.

  Args:
    inputs: Input image with the same type as `input_type`.
    saved_model_dir: The directory the SavedModel was exported to.
    input_placeholder_name: The input placeholder's name in the SavedModel
      signature.
    repeat: Number of times to repeat running the provided input for
      profiling.

  Returns:
    A dict of resulting tensors.
  """
  with tf.Graph().as_default(), tf.Session() as sess:
    meta_graph = loader.load(sess, [tag_constants.SERVING, tag_constants.TPU],
                             saved_model_dir)

    sess.run(tpu.initialize_system())

    key_prediction = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
    tensor_name_input = (
        meta_graph.signature_def[key_prediction].inputs[
            input_placeholder_name].name)
    tensor_name_output = {
        k: v.name
        for k, v in meta_graph.signature_def[key_prediction].outputs.items()
    }

    for _ in range(repeat):
      tensor_dict_out = sess.run(
          tensor_name_output, feed_dict={tensor_name_input: [inputs]})

    sess.run(tpu.shutdown_system())

    return tensor_dict_out
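# Hypothetical usage sketch for run_inference_from_saved_model; the model
# directory is a placeholder. The SavedModel must have been exported with
# both the SERVING and TPU tags, as loader.load above requires.
with tf.gfile.GFile("/tmp/image.jpg", "rb") as f:
  image_bytes = f.read()

outputs = run_inference_from_saved_model(
    inputs=image_bytes,
    saved_model_dir="/tmp/exported_tpu_model",
    input_placeholder_name="placeholder_tensor",
    repeat=10)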
def _tpu_shutdown_fn():
  tpu.shutdown_system(job=job)
def shutdown_tpu_system(cluster_resolver=None):
  """Shuts down the TPU devices.

  This will clear all caches, even those that are maintained through
  sequential calls to tf.tpu.experimental.initialize_tpu_system, such as the
  compilation cache.

  Args:
    cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver,
      which provides information about the TPU cluster.

  Raises:
    RuntimeError: If no TPU devices are found for eager execution or if run
      inside a tf.function.
  """
  job = None
  if cluster_resolver is None:
    # If no cluster resolver is specified, and running eagerly, execute the
    # shutdown ops in the current device scope.
    if context.executing_eagerly():
      curr_device = device.DeviceSpec.from_string(
          context.context().device_name)
      if curr_device.job is not None:
        job = "{}/replica:0/task:0".format(curr_device.job)

    cluster_resolver = TPUClusterResolver("")
  assert isinstance(cluster_resolver, TPUClusterResolver)

  tpu_name = compat.as_text(cluster_resolver._tpu)  # pylint: disable=protected-access
  if tpu_name not in _INITIALIZED_TPU_SYSTEMS:
    logging.warning(
        "You are shutting down a TPU system %s that has not been "
        "initialized." % tpu_name)

  logging.info("Shutting down the TPU system: %s", tpu_name)

  if context.executing_eagerly():
    # This function is structured this way for the following non-intuitive
    # reason: tpu.shutdown_system creates a dummy op whose sole purpose is to
    # trigger DistributedTPURewritePass. This pass actually adds the real ops
    # that shut down the TPU system. Thus, we can't simply run
    # tpu.shutdown_system eagerly; we need to wrap it in a defun and trigger
    # the rewrite passes on it.
    if tpu_name not in _LOCAL_MASTERS:
      # Explicitly place tpu.shutdown_system on the first worker to avoid an
      # "output node matches multiple devices" error.
      job = "{}/replica:0/task:0".format(cluster_resolver.get_job_name())

    @function.defun
    def _tpu_shutdown_fn():
      tpu.shutdown_system(job=job)

    # The TPU_SYSTEM device must match the device used in tpu.shutdown_system
    # exactly, otherwise you can get errors if there are multiple TPU_SYSTEM
    # devices available.
    with ops.device(tpu._tpu_system_device_name(job)):  # pylint: disable=protected-access
      _tpu_shutdown_fn()

    # Clear out the eager context caches since the memory is invalid now.
    logging.info("Clearing out eager caches")
    context.context()._clear_caches()  # pylint: disable=protected-access
  elif not ops.executing_eagerly_outside_functions():
    master = cluster_resolver.master()
    cluster_spec = cluster_resolver.cluster_spec()

    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
    if cluster_spec:
      session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())

    with ops.Graph().as_default():
      with session_lib.Session(config=session_config, target=master) as sess:
        sess.run(tpu.shutdown_system())
  else:
    raise RuntimeError("shutdown_tpu_system is not supported within "
                       "tf.functions.")

  logging.info("Finished shutting down TPU system.")
  if tpu_name in _INITIALIZED_TPU_SYSTEMS:
    del _INITIALIZED_TPU_SYSTEMS[tpu_name]
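# Eager-mode usage sketch: pair initialize_tpu_system with
# shutdown_tpu_system so compilation caches are cleared between runs.
# "my-tpu" is a placeholder TPU name, not from the original code.
import tensorflow as tf

resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="my-tpu")
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
# ... run TPU computations ...
shutdown_tpu_system(resolver)  # also clears the now-invalid eager caches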
def __init__(self, iterations, eval_steps):
  tf.logging.info("LowLevelRunner: constructor.")

  self.fake_feature_structure = {}
  self.feature_structure = {}
  self.fake_eval_feature_structure = {}
  self.eval_feature_structure = {}
  self.infeed_queue = []
  self.eval_infeed_queue = []
  self.fake_enqueue_ops = []
  self.enqueue_ops = []
  self.fake_eval_enqueue_ops = []
  self.eval_enqueue_ops = []
  self.fake_dataset_initializer = []
  self.dataset_initializer = []
  self.fake_eval_dataset_initializer = []
  self.eval_dataset_initializer = []
  self.outfeed_tensors = []
  self.outfeed_names = []
  self.dequeue_ops = []
  self.train_compile_op = None
  self.eval_compile_op = None
  self.loss = None
  self.eval_op = None
  self.iterations = iterations
  self.eval_steps = eval_steps
  self.num_hosts = FLAGS.tpu_num_shards // FLAGS.tpu_num_shards_per_host
  self.scaffold_fn = None
  self.tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.master or FLAGS.cloud_tpu_name)
  # Disable grappler for better performance.
  self.session_config = tf.ConfigProto(
      allow_soft_placement=True,
      graph_options=tf.GraphOptions(
          rewrite_options=rewriter_config_pb2.RewriterConfig(
              disable_meta_optimizer=True)),
      isolate_session_state=True,
      operation_timeout_in_ms=600 * 60 * 1000)  # 10 hours
  cluster_spec = self.tpu_cluster_resolver.cluster_spec()
  if cluster_spec:
    self.session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
  self.input_graph = tf.Graph()
  self.eval_input_graph = tf.Graph()
  # Train and eval share the same session and graph so that the weights
  # can be shared for in-memory eval.
  self.graph = tf.Graph()
  self.output_graph = tf.Graph()
  with self.graph.as_default():
    if FLAGS.random_seed:
      tf.random.set_random_seed(FLAGS.random_seed)
    self.num_epochs_tensor = tf.placeholder(
        tf.int32, shape=(), name="epochs")
    self.train_steps_tensor = tf.placeholder(
        tf.int32, shape=(), name="steps_per_train_loop")
    self.eval_steps_tensor = tf.placeholder(
        tf.int32, shape=(), name="steps_per_eval_loop")
    self.tpu_init = [tpu.initialize_system()]
    self.tpu_shutdown = tpu.shutdown_system()
  self.master = self.tpu_cluster_resolver.get_master()
  self.input_sess = tf.Session(
      self.master, graph=self.input_graph, config=self.session_config)
  self.eval_input_sess = tf.Session(
      self.master, graph=self.eval_input_graph, config=self.session_config)
  self.sess = tf.Session(
      self.master, graph=self.graph, config=self.session_config)
  self.output_sess = tf.Session(
      self.master, graph=self.output_graph, config=self.session_config)
  self.sess.run(self.tpu_init)
  self.infeed_thread = None
  self.train_eval_thread = None
def __init__(self,
             iterations,
             eval_steps,
             sleep_seconds=120,
             num_multiprocessing_workers=ssd_constants.WORKER_COUNT,
             num_cores_per_shard=1,
             input_partition_dims=None):
  tf.logging.info("TrainAndEvalLowLevelRunner: constructor")

  self.eval_steps = eval_steps
  self.feature_structure = {}
  self.eval_feature_structure = {}
  self.loss = None
  self.infeed_queue = []
  self.eval_infeed_queue = []
  self.enqueue_ops = []
  self.dequeue_ops = []
  self.predictions = {}
  self.eval_enqueue_ops = []
  self.train_eval_compile_op = None
  self.dataset_initializer = []
  self.eval_dataset_initializer = []
  self.iterations = iterations
  # TODO(wangtao): change FLAGS.num_shards_per_host to
  # FLAGS.num_cores_per_host once the other low-level APIs support spatial
  # partitioning. FLAGS.num_shards_per_host means the number of TPU cores
  # for each host.
  self.replicas_per_worker = FLAGS.num_shards_per_host // num_cores_per_shard
  self.num_hosts = (
      FLAGS.num_shards * num_cores_per_shard // FLAGS.num_shards_per_host)
  self.num_shards = FLAGS.num_shards
  self.scaffold_fn = None
  self.sess = None
  self.input_sess = None
  self.graph = tf.Graph()
  self.input_graph = tf.Graph()
  self.eval_op = None
  self.infeed_thread = None
  self.eval_epochs = []
  self.success_epoch = 1000
  self.log_epochs = {}
  self.params = {}
  self.train_loop = None
  self.tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu_name or FLAGS.master,
      zone=FLAGS.tpu_zone,
      project=FLAGS.gcp_project)
  # Disable grappler for better performance.
  self.session_config = tf.ConfigProto(
      allow_soft_placement=True,
      graph_options=tf.GraphOptions(
          rewrite_options=rewriter_config_pb2.RewriterConfig(
              disable_meta_optimizer=True)),
      isolate_session_state=True,
      operation_timeout_in_ms=600 * 60 * 1000)  # 10 hours
  cluster_spec = self.tpu_cluster_resolver.cluster_spec()
  if cluster_spec:
    self.session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
  self.tpu_init = tpu.initialize_system()
  self.tpu_shutdown = tpu.shutdown_system()
  self.master = self.tpu_cluster_resolver.get_master()
  self.init_sess = tf.Session(self.master, config=self.session_config)
  self.outfeed_tensors = []
  self.outfeed_names = []
  self.run_success = False
  self.log_run_success = False
  self.num_multiprocessing_workers = num_multiprocessing_workers

  # Figure out the steps and epochs to eval for MLPerf.
  self.eval_at_steps = np.cumsum(ssd_constants.EVAL_STEPS).tolist()
  self.eval_iterations = [steps // 20000 - 1 for steps in self.eval_at_steps]
  self.max_train_iterations = int(
      math.ceil(FLAGS.num_epochs * FLAGS.num_examples_per_epoch /
                (FLAGS.train_batch_size * self.iterations)))
  self.sleep_seconds = sleep_seconds

  tf.logging.info("eval_at_steps: %s", self.eval_at_steps)
  tf.logging.info("eval_iterations: %s", self.eval_iterations)

  # Init for spatial partitioning.
  self.device_topology = self.init_sess.run(self.tpu_init)
  self.input_partition_dims = [input_partition_dims, None]
  # Enable spatial partitioning only when a partitioning spec is given and it
  # actually splits the input across more than one core.
  self.use_spatial_partition = (
      input_partition_dims is not None and
      int(np.prod(FLAGS.input_partition_dims)) > 1)
  self.num_cores_per_shard = num_cores_per_shard
  if self.use_spatial_partition:
    computation_shape = _NUM_CORES_TO_COMPUTATION_SHAPE[
        self.num_cores_per_shard]
    self.device_assignment = tpu_device_assignment.device_assignment(
        topology=self.device_topology,
        computation_shape=computation_shape,
        num_replicas=self.num_shards)
    tf.logging.info("num_cores_per_shard: %d", self.num_cores_per_shard)
    tf.logging.info("num_hosts: %d", self.num_hosts)
    tf.logging.info("replicas_per_worker: %d", self.replicas_per_worker)
    tf.logging.info("computation_shape: %s", str(computation_shape))
    tf.logging.info("num_shards: %d", self.num_shards)
    tf.logging.info("device_assignment.topology.device_coordinates: %s",
                    str(self.device_assignment.topology.device_coordinates))
    tf.logging.info("device_assignment.core_assignment: %s",
                    str(self.device_assignment.core_assignment))
    eval_input_partition_dims = [{
        ssd_constants.BOXES: None,
        ssd_constants.CLASSES: None,
        ssd_constants.IMAGE: input_partition_dims,
        ssd_constants.RAW_SHAPE: None,
        ssd_constants.SOURCE_ID: None,
    }, None]
    if FLAGS.eval_batch_size * eval_steps > FLAGS.eval_samples:
      # The padded-marker feature is present only when eval data is padded.
      eval_input_partition_dims[0][ssd_constants.IS_PADDED] = None
    self.eval_input_dims_flattener = utils.InputDimsFlattener(
        eval_input_partition_dims)
  else:
    self.device_assignment = None
    self.eval_input_dims_flattener = None
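# Illustrative sketch (hypothetical values) of the device assignment built
# above: each model replica spans num_cores_per_shard cores, so e.g. 4
# replicas of 2 cores each on an 8-core slice. `device_topology` is the
# result of running tpu.initialize_system(), as in the constructor; the
# exact computation shape depends on the TPU topology.
computation_shape = _NUM_CORES_TO_COMPUTATION_SHAPE[2]
device_assignment = tpu_device_assignment.device_assignment(
    topology=device_topology,
    computation_shape=computation_shape,
    num_replicas=4)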