Example #1
 def _tpu_init_fn():
   if tpu_name in _LOCAL_MASTERS:
     job = None
   else:
     # Explicitly place the tpu.initialize_system in the first worker to
     # avoid the output node match multiple devices error.
     job = "worker/replica:0/task:0"
   return tpu.initialize_system(job=job)
Example #2
 def _tpu_init_fn():
   if tpu_name in _LOCAL_MASTERS:
     job = None
   else:
     # Explicitly place the tpu.initialize_system in the first worker to
     # avoid the output node match multiple devices error.
     job = "{}/replica:0/task:0".format(cluster_resolver.get_job_name())
   return tpu.initialize_system(job=job)
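The only difference between Example #1 and Example #2 is how the `job` prefix is built: #1 hard-codes "worker", while #2 asks the cluster resolver for the job name. A tiny illustrative sketch, assuming a resolver whose get_job_name() returns "worker" (the usual value for a Cloud TPU worker):

# Illustration only; `cluster_resolver` is assumed to be a TPUClusterResolver
# whose get_job_name() returns "worker".
job_name = "worker"  # stands in for cluster_resolver.get_job_name()
job = "{}/replica:0/task:0".format(job_name)
print(job)  # "worker/replica:0/task:0", the value hard-coded in Example #1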
Example #3
def initialize_tpu_system(cluster_resolver=None):
    """Initialize the TPU devices.

  Args:
    cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver,
        which provides information about the TPU cluster.
  Returns:
    The tf.tpu.Topology object for the topology of the TPU cluster.
  """
    if cluster_resolver is None:
        cluster_resolver = TPUClusterResolver("")
    assert isinstance(cluster_resolver, TPUClusterResolver)

    tpu_name = compat.as_text(cluster_resolver._tpu)  # pylint: disable=protected-access
    if tpu_name in _INITIALIZED_TPU_SYSTEMS:
        logging.warning("TPU system %s has already been initialized. "
                        "Reinitializing the TPU can cause previously created "
                        "variables on TPU to be lost.")

    logging.info("Initializing the TPU system.")

    if context.executing_eagerly():
        # This function looks as it is for the following non-intuitive reasons.
        # tpu.initialize_system creates a dummy op whose sole purpose is to trigger
        # DistributedTPURewritePass. This pass actually adds real ops that
        # initialize the TPU system. Thus, we can't simply run tpu.initialize_system
        # eagerly. We need to wrap it in defun and trigger the rewrite passes on it.
        # The easiest way to trigger a rewrite is to run the function with
        # TPUPartitionedCallOp.
        @function.defun
        def _tpu_init_fn():
            return tpu.initialize_system()

        # We can't call _tpu_init_fn normally (because it contains just a dummy op,
        # see above) but need to define it to get it added to eager context
        # and get its assigned name.
        # pylint: disable=protected-access
        graph_func = _tpu_init_fn._get_concrete_function_internal()
        func_name = compat.as_str(graph_func._inference_function.name)
        # pylint: enable=protected-access

        with ops.device(get_first_tpu_host_device(cluster_resolver)):
            output = tpu_functional_ops.TPUPartitionedCall(
                args=[], device_ordinal=0, Tout=[dtypes.string], f=func_name)
        serialized_topology = output[0].numpy()
    else:
        master = cluster_resolver.master()
        session_config = config_pb2.ConfigProto(allow_soft_placement=True)
        with ops.Graph().as_default():
            with session_lib.Session(config=session_config,
                                     target=master) as sess:
                serialized_topology = sess.run(tpu.initialize_system())

    logging.info("Finished initializing TPU system.")
    tpu_topology = topology.Topology(serialized=serialized_topology)
    _INITIALIZED_TPU_SYSTEMS[tpu_name] = tpu_topology

    return tpu_topology
Example #4
 def _tpu_init_fn():
   # In TF1, we usually close chips when compilation fails to clear the data
   # in infeed. In TF2, we don't need to do this because infeed is no longer
   # used, so user can recover from TPU compilation failures more smoothly.
   # Same for the cancellation of a TPU execution.
   return tpu.initialize_system(
       job=job,
       compilation_failure_closes_chips=False,
       tpu_cancellation_closes_chips=False)
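Example #4 only defines the init function; in the surrounding TF2 code it is wrapped in a defun and pinned to the first worker's TPU_SYSTEM device. A hedged sketch of that invocation, assuming an already-connected remote cluster and using the public tf.function in place of the internal defun; the job name and device string are spelled out for illustration:

import tensorflow as tf
from tensorflow.python.tpu import tpu

# Assumes tf.config.experimental_connect_to_cluster(...) has already been called
# for a remote TPU whose job is named "worker".
job = "worker/replica:0/task:0"

@tf.function
def _tpu_init_fn():
  return tpu.initialize_system(job=job, compilation_failure_closes_chips=False)

with tf.device("/job:worker/replica:0/task:0/device:TPU_SYSTEM:0"):
  serialized_topology = _tpu_init_fn()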
Example #5
 def run(self, fetches, feed_dict=None, options=None, run_metadata=None):
     from tensorflow.python.tpu import tpu  # pylint: disable=g-import-not-at-top
     if self.topology is None:
         self.topology = super().run(tpu.initialize_system())
         assert self.topology is not None
     fetch_mapper = session._FetchMapper.for_fetch(fetches)
     new_fetches = []
     for fetch in fetch_mapper.unique_fetches():
         if isinstance(fetch, ops.Operation):
             fetch = tpu.rewrite(lambda fetch=fetch: fetch)
         new_fetches.append(fetch)
     rewritten_fetches = fetch_mapper.build_results(new_fetches)
     return super().run(rewritten_fetches, feed_dict, options, run_metadata)
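The run() override above funnels every Operation fetch through tpu.rewrite so it executes on the TPU. A minimal graph-mode sketch of tpu.rewrite on its own; the computation and shapes below are made up for illustration:

import tensorflow.compat.v1 as tf
from tensorflow.python.tpu import tpu

def double(x):
  # Trivial computation that tpu.rewrite compiles for the TPU.
  return x * 2.0

with tf.Graph().as_default():
  inp = tf.placeholder(tf.float32, shape=[8], name="inp")
  tpu_out = tpu.rewrite(double, [inp])  # rewritten outputs, to be run in a TPU session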
Example #6
  def __init__(self,
               hparams,
               train_iterations,
               eval_steps,
               per_host_v1=False):
    tf.logging.info("TrainLowLevelRunner: constructor")

    self.feature_structure = {}
    self.eval_feature_structure = {}
    self.loss = None
    self.infeed_queue = []
    self.eval_infeed_queue = []
    self.enqueue_ops = []
    self.eval_enqueue_ops = []
    self.dataset_initializer = []
    self.eval_dataset_initializer = []
    self.is_local = ((hparams.master == "") and (hparams.tpu_name is None))
    self.per_host_v1 = per_host_v1
    self.iterations = train_iterations
    self.eval_steps = eval_steps
    self.outfeed_tensors = []
    self.outfeed_names = []
    self.dequeue_ops = []
    self.predictions = {}
    self.sess = None
    self.graph = tf.Graph()
    self.hparams = hparams
    self.num_hosts = hparams.num_shards // hparams.num_shards_per_host
    with self.graph.as_default():
      self.tpu_init = [tpu.initialize_system()]
      self.tpu_shutdown = tpu.shutdown_system()

    self.resolver = get_resolver(hparams)
    session_config = tf.ConfigProto(
        allow_soft_placement=True,
        isolate_session_state=True,
        operation_timeout_in_ms=600 * 60 * 1000,
        graph_options=tf.GraphOptions(
            rewrite_options=rewriter_config_pb2.RewriterConfig(
                disable_meta_optimizer=True)))

    if self.hparams.tpu_name is None:
      master = self.hparams.master
    else:
      cluster_spec = self.resolver.cluster_spec()
      if cluster_spec:
        session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
      master = self.resolver.get_master()
    self.sess = tf.Session(master, graph=self.graph, config=session_config)
    self.sess.run(self.tpu_init)
Example #7
def run_inference(inputs,
                  pipeline_config_file,
                  ckpt_path,
                  input_type='encoded_image_string_tensor',
                  use_bfloat16=False,
                  repeat=1):
    """Runs inference on TPU.

  Args:
    inputs: Input image with the same type as `input_type`
    pipeline_config_file: Pipeline config file name.
    ckpt_path: Training checkpoint path.
    input_type: One of
                'encoded_image_string_tensor': a 1d tensor with dtype=tf.string
                'image_tensor': a 4d tensor with dtype=tf.uint8
                'tf_example': a 1d tensor with dtype=tf.string
    use_bfloat16: If true, use tf.bfloat16 on TPU.
    repeat: Number of times to repeat running the provided input for profiling.

  Returns:
    A dict of resulting tensors.
  """

    pipeline_config, meta_arch = parse_pipeline_config(pipeline_config_file)

    shapes_info = model_map[meta_arch].get_prediction_tensor_shapes(
        pipeline_config)

    with tf.Graph().as_default(), tf.Session() as sess:
        placeholder_tensor, result_tensor_dict = model_map[
            meta_arch].build_graph(pipeline_config, shapes_info, input_type,
                                   use_bfloat16)

        saver = tf.train.Saver()
        init_op = tf.global_variables_initializer()

        sess.run(tpu.initialize_system())

        sess.run(init_op)
        if ckpt_path is not None:
            saver.restore(sess, ckpt_path)

        for _ in range(repeat):
            tensor_dict_out = sess.run(
                result_tensor_dict, feed_dict={placeholder_tensor: [inputs]})

        sess.run(tpu.shutdown_system())

        return tensor_dict_out
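run_inference above follows the standard TF1 session lifecycle: initialize the TPU system once, run the workload, then shut it down. The skeleton of that pattern, with a placeholder master address:

import tensorflow.compat.v1 as tf
from tensorflow.python.tpu import tpu

master = "grpc://<tpu-worker-address>"  # placeholder; use a resolver's master() in practice
with tf.Graph().as_default(), tf.Session(master) as sess:
  sess.run(tpu.initialize_system())  # bring the TPU system up
  # ... build and run the TPU computation here ...
  sess.run(tpu.shutdown_system())    # release the TPU when finished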
Example #8
def _obtain_topology(master_address, cluster_def):
    """Obtains TPU fabric topology."""
    try:
        logging.info(
            'Initializing TPU system (master: %s) to fetch topology '
            'for model parallelism. This might take a while.', master_address)
        with ops.Graph().as_default():
            session_config = get_session_config_with_timeout(
                _INITIAL_TPU_SYSTEM_TIMEOUT_IN_MS, cluster_def)
            with session_lib.Session(master_address,
                                     config=session_config) as sess:
                topology = sess.run(tpu.initialize_system())
                return topology
    except errors.DeadlineExceededError:
        raise ValueError('Fail to initialize TPU system with master (%s). '
                         'Please double check the TPU system is functional.' %
                         (master_address))
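The value returned by _obtain_topology is a serialized TopologyProto; the other examples wrap it in a Topology object to query the fabric. A short sketch, assuming serialized_topology holds the bytes returned above:

from tensorflow.python.tpu import topology as topology_lib

# Assumes `serialized_topology` holds the bytes returned by _obtain_topology() above.
tpu_topology = topology_lib.Topology(serialized=serialized_topology)
print(tpu_topology.num_tasks, tpu_topology.num_tpus_per_task, tpu_topology.mesh_shape)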
Example #9
    def __init__(self, sparse_features_key, embedding, **kwargs):
        """Initializes the runner."""
        super(DLRMEmbeddingRunner, self).__init__(**kwargs,
                                                  do_initialize=False)
        self.embedding = embedding
        self.embedding_config = embedding.config_proto
        self.features_key = sparse_features_key
        self.embed_vars_and_ops = None
        self.retrieve_ops = None
        self.enqueue_datas_list = {True: [], False: []}
        self.dummy_variables = None
        self.dummy_variables_init = None
        self.num_outfeeds = 1

        with self.graph.as_default():
            self.embed_vars_and_ops = self.embedding.create_variables_and_ops()
            self.dummy_variables, self.dummy_variables_init = (
                tpu_embedding_gradient.create_dummy_table_variables(
                    self.embedding))
        self.device_topology = tf.Session(self.master, config=self.config).run(
            tpu.initialize_system(embedding_config=self.embedding_config))
Example #10
def run_inference_from_saved_model(inputs,
                                   saved_model_dir,
                                   input_placeholder_name='placeholder_tensor',
                                   repeat=1):
    """Loads saved model and run inference on TPU.

  Args:
    inputs: Input image with the same type as `input_type`
    saved_model_dir: The directory SavedModel being exported to.
    input_placeholder_name: input placeholder's name in SavedModel signature.
    repeat: Number of times to repeat running the provided input for profiling.

  Returns:
    A dict of resulting tensors.
  """
    with tf.Graph().as_default(), tf.Session() as sess:
        meta_graph = loader.load(sess,
                                 [tag_constants.SERVING, tag_constants.TPU],
                                 saved_model_dir)

        sess.run(tpu.initialize_system())

        key_prediction = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY

        tensor_name_input = (meta_graph.signature_def[key_prediction].
                             inputs[input_placeholder_name].name)
        tensor_name_output = {
            k: v.name
            for k, v in (
                meta_graph.signature_def[key_prediction].outputs.items())
        }

        for _ in range(repeat):
            tensor_dict_out = sess.run(tensor_name_output,
                                       feed_dict={tensor_name_input: [inputs]})

        sess.run(tpu.shutdown_system())

        return tensor_dict_out
Example #11
    def __init__(self,
                 iterations,
                 eval_steps,
                 sleep_seconds=120,
                 num_multiprocessing_workers=ssd_constants.WORKER_COUNT,
                 num_cores_per_shard=1,
                 input_partition_dims=None):
        tf.logging.info("TrainAndEvalLowLevelRunner: constructor")

        self.eval_steps = eval_steps
        self.feature_structure = {}
        self.eval_feature_structure = {}
        self.loss = None
        self.infeed_queue = []
        self.eval_infeed_queue = []
        self.enqueue_ops = []
        self.dequeue_ops = []
        self.predictions = {}
        self.eval_enqueue_ops = []
        self.train_eval_compile_op = None
        self.dataset_initializer = []
        self.eval_dataset_initializer = []
        self.iterations = iterations
        # TODO(wangtao): change FLAGS.num_shards_per_host to
        # FLAGS.num_cores_per_host after other low level API
        # support spatial partition. FLAGS.num_shards_per_host means number of TPU
        # cores for each host.
        self.replicas_per_worker = FLAGS.num_shards_per_host // num_cores_per_shard
        self.num_hosts = FLAGS.num_shards * num_cores_per_shard // FLAGS.num_shards_per_host
        self.num_shards = FLAGS.num_shards
        self.scaffold_fn = None
        self.sess = None
        self.input_sess = None
        self.graph = tf.Graph()
        self.input_graph = tf.Graph()
        self.eval_op = None
        self.infeed_thread = None
        self.eval_epochs = []
        self.success_epoch = 1000
        self.log_epochs = {}
        self.params = {}
        self.train_loop = None
        self.tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name or FLAGS.master,
            zone=FLAGS.tpu_zone,
            project=FLAGS.gcp_project)
        # Disable grappler for better performance.
        self.session_config = tf.ConfigProto(
            allow_soft_placement=True,
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True)),
            isolate_session_state=True,
            operation_timeout_in_ms=600 * 60 * 1000)  # 10 hours
        cluster_spec = self.tpu_cluster_resolver.cluster_spec()
        if cluster_spec:
            self.session_config.cluster_def.CopyFrom(
                cluster_spec.as_cluster_def())
        self.tpu_init = tpu.initialize_system()
        self.tpu_shutdown = tpu.shutdown_system()
        self.master = self.tpu_cluster_resolver.get_master()
        self.init_sess = tf.Session(self.master, config=self.session_config)
        self.outfeed_tensors = []
        self.outfeed_names = []
        self.run_success = False
        self.log_run_success = False
        self.num_multiprocessing_workers = num_multiprocessing_workers

        # Figure out the steps and epochs to eval for MLPerf.
        self.eval_at_steps = np.cumsum(ssd_constants.EVAL_STEPS).tolist()
        self.eval_iterations = [
            steps // 20000 - 1 for steps in self.eval_at_steps
        ]
        self.max_train_iterations = int(
            math.ceil(FLAGS.num_epochs * FLAGS.num_examples_per_epoch /
                      (FLAGS.train_batch_size * self.iterations)))
        self.sleep_seconds = sleep_seconds

        tf.logging.info("eval_at_steps: %s", self.eval_at_steps)
        tf.logging.info("eval_iterations: %s", self.eval_iterations)

        # Init for spatial partitioning.
        self.device_topology = self.init_sess.run(self.tpu_init)
        self.input_partition_dims = [input_partition_dims, None]
        self.use_spatial_partition = (
            input_partition_dims is not None
            and int(np.prod(FLAGS.input_partition_dims)) > 1)
        self.use_spatial_partition = input_partition_dims is not None
        self.num_cores_per_shard = num_cores_per_shard
        if self.use_spatial_partition:
            computation_shape = _NUM_CORES_TO_COMPUTATION_SHAPE[
                self.num_cores_per_shard]
            self.device_assignment = tpu_device_assignment.device_assignment(
                topology=self.device_topology,
                computation_shape=computation_shape,
                num_replicas=self.num_shards)
            tf.logging.info("num_cores_per_shard: %d",
                            self.num_cores_per_shard)
            tf.logging.info("num_hosts: %d", self.num_hosts)
            tf.logging.info("replicas_per_worker: %d",
                            self.replicas_per_worker)
            tf.logging.info("computation_shape: %s", str(computation_shape))
            tf.logging.info("num_shards: %d", self.num_shards)
            tf.logging.info(
                "device_assignment.topology.device_coordinates: %s",
                str(self.device_assignment.topology.device_coordinates))
            tf.logging.info("device_assignment.core_assignment: %s",
                            str(self.device_assignment.core_assignment))
            eval_input_partition_dims = [{
                ssd_constants.BOXES: None,
                ssd_constants.CLASSES: None,
                ssd_constants.IMAGE: input_partition_dims,
                ssd_constants.RAW_SHAPE: None,
                ssd_constants.SOURCE_ID: None,
            }, None]
            if FLAGS.eval_batch_size * eval_steps > FLAGS.eval_samples:
                eval_input_partition_dims[0][ssd_constants.IS_PADDED] = None
            self.eval_input_dims_flattener = utils.InputDimsFlattener(
                eval_input_partition_dims)
        else:
            self.device_assignment = None
            self.eval_input_dims_flattener = None
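For spatial partitioning, the constructor above feeds the topology returned by tpu.initialize_system into a device assignment. A hedged sketch of the equivalent call through the public API; the computation shape and replica count are illustrative:

import tensorflow as tf

# `device_topology` is assumed to be the topology obtained from initializing the
# TPU system, as in the constructor above (a tf.tpu.experimental.Topology object).
device_assignment = tf.tpu.experimental.DeviceAssignment.build(
    device_topology,
    computation_shape=[1, 1, 1, 2],  # two cores per replica (illustrative)
    num_replicas=4)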
Example #12
 def _tpu_init_fn():
   return tpu.initialize_system()
Example #13
def initialize_tpu_system(cluster_resolver=None):
  """Initialize the TPU devices.

  Args:
    cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver,
        which provides information about the TPU cluster.
  Returns:
    The tf.tpu.Topology object for the topology of the TPU cluster.

  Raises:
    RuntimeError: If no TPU devices found for eager execution.
  """
  if cluster_resolver is None:
    cluster_resolver = TPUClusterResolver("")
  assert isinstance(cluster_resolver, TPUClusterResolver)

  tpu_name = compat.as_text(cluster_resolver._tpu)  # pylint: disable=protected-access
  if tpu_name in _INITIALIZED_TPU_SYSTEMS:
    logging.warning("TPU system %s has already been initialized. "
                    "Reinitializing the TPU can cause previously created "
                    "variables on TPU to be lost.")

  logging.info("Initializing the TPU system.")

  if context.executing_eagerly():
    # This function looks as it is for the following non-intuitive reasons.
    # tpu.initialize_system creates a dummy op whose sole purpose is to trigger
    # DistributedTPURewritePass. This pass actually adds real ops that
    # initialize the TPU system. Thus, we can't simply run tpu.initialize_system
    # eagerly. We need to wrap it in defun and trigger the rewrite passes on it.
    @function.defun
    def _tpu_init_fn():
      return tpu.initialize_system()

    tpu_devices = sorted(
        [x for x in context.list_devices() if "device:TPU:" in x])

    if not tpu_devices:
      raise RuntimeError("Could not find any TPU devices")

    # Replace the remote TPU device with the remote TPU_SYSTEM system device. As
    # in the remote TPU device case, we will try to compile it instead of
    # running through optimization passes and TF Executor, but TPU_SYSTEM should
    # work.
    tpu_system_device = tpu_devices[0].replace("TPU", "TPU_SYSTEM")

    with ops.device(tpu_system_device):
      output = _tpu_init_fn()
    serialized_topology = output.numpy()
  else:
    master = cluster_resolver.master()
    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
    with ops.Graph().as_default():
      with session_lib.Session(config=session_config, target=master) as sess:
        serialized_topology = sess.run(tpu.initialize_system())

  logging.info("Finished initializing TPU system.")
  tpu_topology = topology.Topology(serialized=serialized_topology)
  _INITIALIZED_TPU_SYSTEMS[tpu_name] = tpu_topology

  return tpu_topology
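The function in Example #13 is the internal implementation behind the public TF2 entry point. A hedged usage sketch of that public surface; the TPU name is a placeholder, and older releases expose the strategy as tf.distribute.experimental.TPUStrategy:

import tensorflow as tf

resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="my-tpu")  # placeholder name
tf.config.experimental_connect_to_cluster(resolver)
topology = tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.TPUStrategy(resolver)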
Example #14
def run_saved_model_with_feed_dict(saved_model_dir,
                                   tag_set,
                                   signature_def_key,
                                   input_tensor_key_feed_dict,
                                   outdir,
                                   overwrite_flag,
                                   worker=None,
                                   init_tpu=False,
                                   tf_debug=False):
    """Runs SavedModel and fetch all outputs.

  Runs the input dictionary through the MetaGraphDef within a SavedModel
  specified by the given tag_set and SignatureDef. Also save the outputs to file
  if outdir is not None.

  Args:
    saved_model_dir: Directory containing the SavedModel to execute.
    tag_set: Group of tag(s) of the MetaGraphDef with the SignatureDef map, in
        string format, separated by ','. For tag-set contains multiple tags, all
        tags must be passed in.
    signature_def_key: A SignatureDef key string.
    input_tensor_key_feed_dict: A dictionary maps input keys to numpy ndarrays.
    outdir: A directory to save the outputs to. If the directory doesn't exist,
        it will be created.
    overwrite_flag: A boolean flag to allow overwrite output file if file with
        the same name exists.
    worker: If provided, the session will be run on the worker.  Valid worker
        specification is a bns or gRPC path.
    init_tpu: If true, the TPU system will be initialized after the session
        is created.
    tf_debug: A boolean flag to use TensorFlow Debugger (TFDBG) to observe the
        intermediate Tensor values and runtime GraphDefs while running the
        SavedModel.

  Raises:
    ValueError: When any of the input tensor keys is not valid.
    RuntimeError: An error when output file already exists and overwrite is not
    enabled.
  """
    # Get a list of output tensor names.
    meta_graph_def = saved_model_utils.get_meta_graph_def(
        saved_model_dir, tag_set)

    # Re-create feed_dict based on input tensor name instead of key as session.run
    # uses tensor name.
    inputs_tensor_info = _get_inputs_tensor_info_from_meta_graph_def(
        meta_graph_def, signature_def_key)

    # Check if input tensor keys are valid.
    for input_key_name in input_tensor_key_feed_dict.keys():
        if input_key_name not in inputs_tensor_info:
            raise ValueError(
                '"%s" is not a valid input key. Please choose from %s, or use '
                '--show option.' %
                (input_key_name,
                 '"' + '", "'.join(inputs_tensor_info.keys()) + '"'))

    inputs_feed_dict = {
        inputs_tensor_info[key].name: tensor
        for key, tensor in input_tensor_key_feed_dict.items()
    }
    # Get outputs
    outputs_tensor_info = _get_outputs_tensor_info_from_meta_graph_def(
        meta_graph_def, signature_def_key)
    # Sort to preserve order because we need to go from value to key later.
    output_tensor_keys_sorted = sorted(outputs_tensor_info.keys())
    output_tensor_names_sorted = [
        outputs_tensor_info[tensor_key].name
        for tensor_key in output_tensor_keys_sorted
    ]

    with session.Session(worker, graph=ops_lib.Graph()) as sess:
        if init_tpu:
            print('Initializing TPU System ...')
            # This is needed for freshly started worker, or if the job
            # restarts after a preemption.
            sess.run(tpu.initialize_system())

        loader.load(sess, tag_set.split(','), saved_model_dir)

        if tf_debug:
            sess = local_cli_wrapper.LocalCLIDebugWrapperSession(sess)

        outputs = sess.run(output_tensor_names_sorted,
                           feed_dict=inputs_feed_dict)

        for i, output in enumerate(outputs):
            output_tensor_key = output_tensor_keys_sorted[i]
            print('Result for output key %s:\n%s' %
                  (output_tensor_key, output))

            # Only save if outdir is specified.
            if outdir:
                # Create directory if outdir does not exist
                if not os.path.isdir(outdir):
                    os.makedirs(outdir)
                output_full_path = os.path.join(outdir,
                                                output_tensor_key + '.npy')

                # If overwrite not enabled and file already exist, error out
                if not overwrite_flag and os.path.exists(output_full_path):
                    raise RuntimeError(
                        'Output file %s already exists. Add \"--overwrite\" to overwrite'
                        ' the existing output files.' % output_full_path)

                np.save(output_full_path, output)
                print('Output %s is saved to %s' %
                      (output_tensor_key, output_full_path))
Example #15
def initialize_tpu_system(cluster_resolver=None):
    """Initialize the TPU devices.

  Args:
    cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver,
        which provides information about the TPU cluster.
  Returns:
    The tf.tpu.Topology object for the topology of the TPU cluster. If called
    inside tf.function, it returns the serialized topology object instead.

  Raises:
    RuntimeError: If running inside a tf.function.
    NotFoundError: If no TPU devices found in eager mode.
  """
    job = None
    if cluster_resolver is None:
        # If no cluster resolver is specified, and running eagerly, execute the init
        # ops in the current device scope.
        if context.executing_eagerly():
            curr_device = device.DeviceSpec.from_string(
                context.context().device_name)
            if curr_device.job is not None:
                job = "{}/replica:0/task:0".format(curr_device.job)

        cluster_resolver = TPUClusterResolver("")
    assert isinstance(cluster_resolver, TPUClusterResolver)

    tpu_name = compat.as_text(cluster_resolver._tpu)  # pylint: disable=protected-access
    if tpu_name in _INITIALIZED_TPU_SYSTEMS:
        logging.warning(
            "TPU system %s has already been initialized. "
            "Reinitializing the TPU can cause previously created "
            "variables on TPU to be lost.", tpu_name)

    logging.info("Initializing the TPU system: %s", tpu_name)

    # This function looks as it is for the following non-intuitive reasons.
    # tpu.initialize_system creates a dummy op whose sole purpose is to trigger
    # DistributedTPURewritePass. This pass actually adds real ops that
    # initialize the TPU system. Thus, we can't simply run tpu.initialize_system
    # eagerly. We need to wrap it in defun and trigger the rewrite passes on it.
    if tpu_name not in _LOCAL_MASTERS:
        # Explicitly place the tpu.initialize_system in the first worker to
        # avoid the output node match multiple devices error.
        job = "{}/replica:0/task:0".format(cluster_resolver.get_job_name())

    if context.executing_eagerly():

        @function.defun
        def _tpu_init_fn():
            # In TF1, we usually close chips when compilation fails to clear the data
            # in infeed. In TF2, we don't need to do this because infeed is no longer
            # used, so user can recover from TPU compilation failures more smoothly.
            return tpu.initialize_system(
                job=job, compilation_failure_closes_chips=False)

        # The TPU_SYSTEM device must match the device used in tpu.initialize_system
        # exactly, otherwise you can get errors if there are multiple TPU_SYSTEM
        # devices available.
        try:
            with ops.device(tpu._tpu_system_device_name(job)):  # pylint: disable=protected-access
                output = _tpu_init_fn()
            context.async_wait()
        except errors.InvalidArgumentError as e:
            raise errors.NotFoundError(
                None, None,
                "TPUs not found in the cluster. Failed in initialization: " +
                str(e))

        # Clear out the eager context caches since the memory is invalid now.
        logging.info("Clearing out eager caches")
        context.context()._clear_caches()  # pylint: disable=protected-access

        serialized_topology = output.numpy()
    elif not ops.executing_eagerly_outside_functions():
        master = cluster_resolver.master()
        cluster_spec = cluster_resolver.cluster_spec()

        session_config = config_pb2.ConfigProto(allow_soft_placement=True)
        if cluster_spec:
            session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())

        with ops.Graph().as_default():
            with session_lib.Session(config=session_config,
                                     target=master) as sess:
                serialized_topology = sess.run(tpu.initialize_system())
    else:
        with ops.device(tpu._tpu_system_device_name(job)):  # pylint: disable=protected-access
            serialized_topology = tpu.initialize_system(
                job=job, compilation_failure_closes_chips=False)
            # If initialize_tpu_system is called inside tf.function, we only return
            # the serialized topology object as the tf.tpu.Topology object has to be
            # constructed in eager mode.
            return serialized_topology

    logging.info("Finished initializing TPU system.")
    tpu_topology = topology.Topology(serialized=serialized_topology)
    _INITIALIZED_TPU_SYSTEMS[tpu_name] = tpu_topology

    return tpu_topology
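Example #15 also has a public counterpart for tearing the system back down between runs. A minimal sketch, assuming the same resolver that was used for initialization:

import tensorflow as tf

resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="my-tpu")  # placeholder name
tf.tpu.experimental.initialize_tpu_system(resolver)
# ... TPU work ...
tf.tpu.experimental.shutdown_tpu_system(resolver)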
Example #16
    def __init__(self,
                 iterations_per_loop,
                 train_steps,
                 eval_steps,
                 num_replicas,
                 eval_dataset_repeats=True,
                 do_initialize=True):
        self.feature_structure = {}
        self.infeed_op = {}
        self.num_replicas = num_replicas
        self.eval_dataset_repeats = eval_dataset_repeats
        # Set number of input graphs to number of hosts up to a maximum of 32.
        self.num_input_graphs = min(
            32, self.num_replicas // FLAGS.replicas_per_host)
        # Following data has separated copies for training and eval, thus
        # represented as a map from is_train(boolean) to actual data
        self.dataset_initializer = {True: [], False: []}
        self.input_graph = {True: [], False: []}
        self.input_sess = {True: [], False: []}
        self.enqueue_ops = {True: [], False: []}
        for _ in range(self.num_input_graphs):
            self.input_graph[True].append(tf.Graph())
            self.input_graph[False].append(tf.Graph())
            self.dataset_initializer[True].append([])
            self.dataset_initializer[False].append([])
            self.enqueue_ops[True].append([])
            self.enqueue_ops[False].append([])
            self.input_sess[True].append([])
            self.input_sess[False].append([])
        # dequeue_ops is only for eval
        self.dequeue_ops = []
        self.iterations_per_loop = iterations_per_loop
        self.sess = None
        self.output_sess = None
        self.train_eval_thread = None
        self.graph = tf.Graph()
        if iterations_per_loop != 0 and train_steps % iterations_per_loop != 0:
            train_steps = iterations_per_loop * int(
                math.ceil(train_steps / iterations_per_loop))
        self.train_steps = train_steps
        if iterations_per_loop == 0:
            self.max_train_iterations = 1
        else:
            self.max_train_iterations = train_steps // iterations_per_loop
        self.eval_steps = int(eval_steps)
        self.train_batch_size = 0
        self.eval_batch_size = 0
        self.eval_has_labels = 0
        self.model_fn = None
        self.num_outfeeds = self.eval_steps
        self.config = tf.ConfigProto(
            operation_timeout_in_ms=600 * 60 * 1000,
            allow_soft_placement=True,
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True)),
            isolate_session_state=True)

        if FLAGS.enable_mlir_bridge:
            self.config.experimental.enable_mlir_bridge = True

        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.master,
            zone=FLAGS.tpu_zone,
            project=FLAGS.gcp_project,
            job_name="tpu_worker")
        self.master = tpu_cluster_resolver.get_master()
        self.job_name = tpu_cluster_resolver.get_job_name() or "tpu_worker"
        self.embedding_config = None
        self.device_topology = None
        if do_initialize:
            self.device_topology = tf.Session(
                self.master, config=self.config).run(tpu.initialize_system())
Example #17
def initialize_tpu_system(cluster_resolver=None):
  """Initialize the TPU devices.

  Args:
    cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver,
        which provides information about the TPU cluster.
  Returns:
    The tf.tpu.Topology object for the topology of the TPU cluster.

  Raises:
    RuntimeError: If no TPU devices found for eager execution.
  """
  if cluster_resolver is None:
    cluster_resolver = TPUClusterResolver("")
  assert isinstance(cluster_resolver, TPUClusterResolver)

  tpu_name = compat.as_text(cluster_resolver._tpu)  # pylint: disable=protected-access
  if tpu_name in _INITIALIZED_TPU_SYSTEMS:
    logging.warning("TPU system %s has already been initialized. "
                    "Reinitializing the TPU can cause previously created "
                    "variables on TPU to be lost.")

  logging.info("Initializing the TPU system.")

  if context.executing_eagerly():
    # This function looks as it is for the following non-intuitive reasons.
    # tpu.initialize_system creates a dummy op whose sole purpose is to trigger
    # DistributedTPURewritePass. This pass actually adds real ops that
    # initialize the TPU system. Thus, we can't simply run tpu.initialize_system
    # eagerly. We need to wrap it in defun and trigger the rewrite passes on it.
    # The easiest way to trigger a rewrite is to run the function with
    # TPUPartitionedCallOp.
    @function.defun
    def _tpu_init_fn():
      return tpu.initialize_system()

    # We can't call _tpu_init_fn normally (because it contains just a dummy op,
    # see above) but need to define it to get it added to eager context
    # and get its assigned name.
    # pylint: disable=protected-access
    graph_func = _tpu_init_fn._get_concrete_function_internal()
    func_name = compat.as_str(graph_func._inference_function.name)
    # pylint: enable=protected-access

    tpu_devices = sorted(
        [x for x in context.list_devices() if "device:TPU:" in x])

    if not tpu_devices:
      raise RuntimeError("Could not find any TPU devices")

    with ops.device(device_util.get_host_for_device(tpu_devices[0])):
      output = tpu_functional_ops.TPUPartitionedCall(
          args=[], device_ordinal=0, Tout=[dtypes.string], f=func_name)
    serialized_topology = output[0].numpy()
  else:
    master = cluster_resolver.master()
    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
    with ops.Graph().as_default():
      with session_lib.Session(config=session_config, target=master) as sess:
        serialized_topology = sess.run(tpu.initialize_system())

  logging.info("Finished initializing TPU system.")
  tpu_topology = topology.Topology(serialized=serialized_topology)
  _INITIALIZED_TPU_SYSTEMS[tpu_name] = tpu_topology

  return tpu_topology
Example #18
def initialize_tpu_system(cluster_resolver=None):
    """Initialize the TPU devices.

  Args:
    cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver,
        which provides information about the TPU cluster.
  Returns:
    The tf.tpu.Topology object for the topology of the TPU cluster.

  Raises:
    RuntimeError: If no TPU devices found for eager execution.
  """
    if cluster_resolver is None:
        cluster_resolver = TPUClusterResolver("")
    assert isinstance(cluster_resolver, TPUClusterResolver)

    tpu_name = compat.as_text(cluster_resolver._tpu)  # pylint: disable=protected-access
    if tpu_name in _INITIALIZED_TPU_SYSTEMS:
        logging.warning("TPU system %s has already been initialized. "
                        "Reinitializing the TPU can cause previously created "
                        "variables on TPU to be lost.")

    logging.info("Initializing the TPU system: %s", tpu_name)

    if context.executing_eagerly():
        # This function looks as it is for the following non-intuitive reasons.
        # tpu.initialize_system creates a dummy op whose sole purpose is to trigger
        # DistributedTPURewritePass. This pass actually adds real ops that
        # initialize the TPU system. Thus, we can't simply run tpu.initialize_system
        # eagerly. We need to wrap it in defun and trigger the rewrite passes on it.
        job = None
        if tpu_name not in _LOCAL_MASTERS:
            # Explicitly place the tpu.initialize_system in the first worker to
            # avoid the output node match multiple devices error.
            job = "{}/replica:0/task:0".format(cluster_resolver.get_job_name())

        @function.defun
        def _tpu_init_fn():
            return tpu.initialize_system(job=job)

        # The TPU_SYSTEM device must match the device used in tpu.initialize_system
        # exactly, otherwise you can get errors if there are multiple TPU_SYSTEM
        # devices available.
        with ops.device(tpu._tpu_system_device_name(job)):  # pylint: disable=protected-access
            output = _tpu_init_fn()

        # Clear out the eager context caches since the memory is invalid now.
        logging.info("Clearing out eager caches")
        context.context()._clear_caches()  # pylint: disable=protected-access

        serialized_topology = output.numpy()
    else:
        master = cluster_resolver.master()
        cluster_spec = cluster_resolver.cluster_spec()

        session_config = config_pb2.ConfigProto(allow_soft_placement=True)
        if cluster_spec:
            session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())

        with ops.Graph().as_default():
            with session_lib.Session(config=session_config,
                                     target=master) as sess:
                serialized_topology = sess.run(tpu.initialize_system())

    logging.info("Finished initializing TPU system.")
    tpu_topology = topology.Topology(serialized=serialized_topology)
    _INITIALIZED_TPU_SYSTEMS[tpu_name] = tpu_topology

    return tpu_topology
Example #19
 def _tpu_init_fn():
     return tpu.initialize_system(job=job)
Example #20
    def __init__(self, iterations, eval_steps):
        tf.logging.info("LowLevelRunner: constructor.")

        self.fake_feature_structure = {}
        self.feature_structure = {}
        self.fake_eval_feature_structure = {}
        self.eval_feature_structure = {}
        self.infeed_queue = []
        self.eval_infeed_queue = []
        self.fake_enqueue_ops = []
        self.enqueue_ops = []
        self.fake_eval_enqueue_ops = []
        self.eval_enqueue_ops = []
        self.fake_dataset_initializer = []
        self.dataset_initializer = []
        self.fake_eval_dataset_initializer = []
        self.eval_dataset_initializer = []
        self.outfeed_tensors = []
        self.outfeed_names = []
        self.dequeue_ops = []
        self.train_compile_op = None
        self.eval_compile_op = None
        self.loss = None
        self.eval_op = None
        self.iterations = iterations
        self.eval_steps = eval_steps
        self.num_hosts = FLAGS.tpu_num_shards // FLAGS.tpu_num_shards_per_host
        self.scaffold_fn = None
        self.tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.master or FLAGS.cloud_tpu_name)
        # Disable grappler for better performance.
        self.session_config = tf.ConfigProto(
            allow_soft_placement=True,
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True)),
            isolate_session_state=True,
            operation_timeout_in_ms=600 * 60 * 1000)  # 10 hours
        cluster_spec = self.tpu_cluster_resolver.cluster_spec()
        if cluster_spec:
            self.session_config.cluster_def.CopyFrom(
                cluster_spec.as_cluster_def())
        self.input_graph = tf.Graph()
        self.eval_input_graph = tf.Graph()
        # Train and eval share the same session and graph so that the weights
        # can be shared for in memory eval.
        self.graph = tf.Graph()
        self.output_graph = tf.Graph()
        with self.graph.as_default():
            if FLAGS.random_seed:
                tf.random.set_random_seed(FLAGS.random_seed)
            self.num_epochs_tensor = tf.placeholder(tf.int32,
                                                    shape=(),
                                                    name="epochs")
            self.train_steps_tensor = tf.placeholder(
                tf.int32, shape=(), name="steps_per_train_loop")
            self.eval_steps_tensor = tf.placeholder(tf.int32,
                                                    shape=(),
                                                    name="steps_per_eval_loop")
            self.tpu_init = [tpu.initialize_system()]
            self.tpu_shutdown = tpu.shutdown_system()
        self.master = self.tpu_cluster_resolver.get_master()
        self.input_sess = tf.Session(self.master,
                                     graph=self.input_graph,
                                     config=self.session_config)
        self.eval_input_sess = tf.Session(self.master,
                                          graph=self.eval_input_graph,
                                          config=self.session_config)
        self.sess = tf.Session(self.master,
                               graph=self.graph,
                               config=self.session_config)
        self.output_sess = tf.Session(self.master,
                                      graph=self.output_graph,
                                      config=self.session_config)
        self.sess.run(self.tpu_init)
        self.infeed_thead = None
        self.train_eval_thead = None