Code Example #1
File: tpu_context.py  Project: godyd2702/tensorflow
  def _get_device_assignment(self):
    """Gets the (maybe cached) TPU device assignment."""
    master = self._get_master_address()
    device_assignment = self._lazy_device_assignment_dict.get(master)
    if device_assignment is not None:
      return device_assignment

    tpu_system_metadata = self._get_tpu_system_metadata()

    device_assignment = tpu_device_assignment.device_assignment(
        tpu_system_metadata.topology,
        computation_shape=self._computation_shape,
        num_replicas=self.num_replicas)

    logging.info('num_cores_per_replica: %s',
                 str(self._config.tpu_config.num_cores_per_replica))
    logging.info('computation_shape: %s', str(self._computation_shape))
    logging.info('num_replicas: %d', self.num_replicas)
    logging.info('device_assignment.topology.device_coordinates: %s',
                 str(device_assignment.topology.device_coordinates))
    logging.info('device_assignment.core_assignment: %s',
                 str(device_assignment.core_assignment))

    self._lazy_device_assignment_dict[master] = device_assignment
    return device_assignment
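Once cached, the returned DeviceAssignment is usually queried for per-replica placements rather than just logged. A minimal illustrative sketch (not part of tpu_context.py; it only uses the standard DeviceAssignment accessors, whose argument conventions differ slightly across TF 1.x releases):

  # Illustrative only: inspect the assignment built above.
  for replica in range(device_assignment.num_replicas):
    # TPU device string for this replica, e.g. '/task:0/device:TPU:0'.
    tpu_dev = device_assignment.tpu_device(replica=replica)
    # CPU device of the host that serves this replica's infeed.
    host_dev = device_assignment.host_device(replica=replica)
    logging.info('replica %d -> %s (host %s)', replica, tpu_dev, host_dev)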
Code Example #2
File: model_executor.py  Project: bruinxiong/mesh-1
  def __init__(self, sess, use_tpu, mesh_shape, layout_rules):
    super(MeshContext, self).__init__()
    self._use_tpu = use_tpu
    self._mesh_shape = mtf.convert_to_shape(mesh_shape)
    self._layout_rules = layout_rules

    self._d_assignment = None
    self._num_hosts = None
    self._num_cores = None

    self._cpu_devices, self._gpu_devices = self._list_cpu_gpu_devices(sess)

    if self._use_tpu:
      topology = sess.run(tpu.initialize_system())
      topo_object = tpu.Topology(serialized=topology)
      self._num_cores = int(np.prod(topo_object.mesh_shape))
      self._num_hosts = int(topo_object.num_tasks)
      num_cores_per_host = int(self._num_cores // self._num_hosts)
      assert num_cores_per_host == int(topo_object.num_tpus_per_task)

      # Get a device_assignment object for mtf.
      self._d_assignment = device_assignment.device_assignment(
          topology, computation_shape=[1, 1, 1],
          num_replicas=self._num_cores)

      self._mesh_impl = mtf.simd_mesh_impl.SimdMeshImpl(
          self._mesh_shape, self._layout_rules, None, self._d_assignment)
    else:
      self._mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(
          self._mesh_shape, self._layout_rules, self._gpu_devices)
Code Example #3
 def _WaitTillInit():
     """Wait until the model is ready."""
     try:
         with self._GetSession() as sess:
             topology = sess.run(
                 tf.tpu.initialize_system(embedding_config=None,
                                          job=None))
             device_assignment = device_assignment_lib.device_assignment(
                 topology,
                 computation_shape=ComputationShape(
                     num_devices_per_split),
                 num_replicas=data_parallelism)
             py_utils.SetTpuDeviceAssignment(device_assignment)
             tf.logging.info('device_assignment.core_assignment: %s',
                             str(device_assignment.core_assignment))
             tf.logging.info(
                 'device_assignment.topology.device_coordinates: %s',
                 str(device_assignment.topology.device_coordinates))
     except py_utils.transient_tf_errors as e:
         tf.logging.info('TPU initialization failed: %s', e)
         raise
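Here the assignment is stored globally via py_utils.SetTpuDeviceAssignment and consumed later when the training graph is replicated. A hedged sketch of that downstream use (hypothetical computation and inputs; in TF 1.x the call lives at tf.contrib.tpu.replicate or tf.compat.v1.tpu.replicate depending on the release):

    # Sketch only: replicate a computation under the explicit assignment.
    def computation(x):
        return x * 2.0  # stand-in for the real per-replica training step

    per_replica_inputs = [[tf.constant(1.0)]
                          for _ in range(device_assignment.num_replicas)]
    outputs = tf.compat.v1.tpu.replicate(
        computation,
        inputs=per_replica_inputs,
        device_assignment=device_assignment)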
Code Example #4
def train_and_eval():
    """Trains and evaluates MeshTensorflow model without TPUEstimator.

  TODO(lehou): Pack everything nicely as a set of APIs.
  """
    tf.logging.info('FLAGS.master: {}'.format(FLAGS.master))

    # Open a session to get the list of CPU devices to hold master variables.
    with tf.Session(target=FLAGS.master,
                    config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        topology = sess.run(tpu.initialize_system())
        cpu_devices = _list_cpu_devices(sess)

    topo_object = tf.contrib.tpu.Topology(serialized=topology)
    num_cores = int(np.prod(topo_object.mesh_shape))
    num_hosts = int(topo_object.num_tasks)
    num_cores_per_host = int(num_cores // num_hosts)
    assert num_cores_per_host == int(topo_object.num_tpus_per_task)

    # Get a device_assignment object for mtf.
    d_assignment = device_assignment.device_assignment(
        topology, computation_shape=[1, 1, 1], num_replicas=num_cores)

    # Get mesh_impl.
    mesh_shape = mtf.convert_to_shape(FLAGS.mesh_shape)
    layout_rules = unet.get_layout()
    mesh_impl = mtf.simd_mesh_impl.SimdMeshImpl(mesh_shape, layout_rules, None,
                                                d_assignment)

    for _ in range(FLAGS.num_training_loops):
        _train_phase(mesh_impl, cpu_devices, d_assignment, num_hosts,
                     num_cores)
        _eval_phase(mesh_impl, cpu_devices, d_assignment, num_hosts, num_cores)

    _shutdown()

    tf.logging.info('finished.')
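_train_phase, _eval_phase, and _shutdown are project helpers not shown on this page. The shutdown step generally just runs tpu.shutdown_system() against the same master; a minimal sketch, assuming the session configuration used above (the project's actual helper may differ):

    # Hedged sketch of the shutdown step.
    with tf.Session(target=FLAGS.master,
                    config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        sess.run(tpu.shutdown_system())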
Code Example #5
    def __init__(self,
                 iterations,
                 num_cores_per_shard=1,
                 input_partition_dims=None):
        tf.logging.info("TrainLowLevelRunner: constructor")

        self.feature_structure = {}
        self.loss = None
        self.infeed_queue = []
        self.enqueue_ops = []
        self.dataset_initializer = []
        self.iterations = iterations
        # TODO(wangtao): change FLAGS.num_shards_per_host to
        # FLAGS.num_cores_per_host once other low-level APIs support
        # spatial partitioning. FLAGS.num_shards_per_host means the number
        # of TPU cores per host.
        self.replicas_per_worker = FLAGS.num_shards_per_host // num_cores_per_shard
        self.num_hosts = FLAGS.num_shards * num_cores_per_shard // FLAGS.num_shards_per_host
        self.num_shards = FLAGS.num_shards
        self.scaffold_fn = None
        # Having two separate sessions and graphs to make the initialization faster.
        self.input_sess = None
        self.train_sess = None
        self.input_graph = tf.Graph()
        self.train_graph = None
        self.tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
        # Disable grappler for better performance.
        self.session_config = tf.ConfigProto(
            allow_soft_placement=True,
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True)),
            isolate_session_state=True)
        cluster_spec = self.tpu_cluster_resolver.cluster_spec()
        if cluster_spec:
            self.session_config.cluster_def.CopyFrom(
                cluster_spec.as_cluster_def())
        self.tpu_init = tpu.initialize_system()
        self.tpu_shutdown = tpu.shutdown_system()
        self.init_sess = tf.Session(self.tpu_cluster_resolver.get_master(),
                                    config=self.session_config)
        self.queue = Queue.Queue()

        # Init for spatial partitioning.
        self.device_topology = self.init_sess.run(self.tpu_init)
        self.input_partition_dims = input_partition_dims
        self.use_spatial_partition = (
            input_partition_dims is not None
            and int(np.prod(FLAGS.input_partition_dims)) > 1)
        self.num_cores_per_shard = num_cores_per_shard
        if self.use_spatial_partition:
            computation_shape = _NUM_CORES_TO_COMPUTATION_SHAPE[
                self.num_cores_per_shard]
            self.device_assignment = tpu_device_assignment.device_assignment(
                topology=self.device_topology,
                computation_shape=computation_shape,
                num_replicas=self.num_shards)
            tf.logging.info("num_cores_per_shard: %d",
                            self.num_cores_per_shard)
            tf.logging.info("num_hosts: %d", self.num_hosts)
            tf.logging.info("replicas_per_worker: %d",
                            self.replicas_per_worker)
            tf.logging.info("computation_shape: %s", str(computation_shape))
            tf.logging.info("num_shards: %d", self.num_shards)
            tf.logging.info(
                "device_assignment.topology.device_coordinates: %s",
                str(self.device_assignment.topology.device_coordinates))
            tf.logging.info("device_assignment.core_assignment: %s",
                            str(self.device_assignment.core_assignment))
        else:
            self.device_assignment = None
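_NUM_CORES_TO_COMPUTATION_SHAPE is referenced here and in the later examples but never defined on this page. A typical definition (an assumption, not the project's verbatim table) maps a per-shard core count to a 3-D computation shape whose product equals that count:

# Assumed lookup table: prod(value) == key == cores per shard/replica.
_NUM_CORES_TO_COMPUTATION_SHAPE = {
    1: [1, 1, 1],
    2: [1, 1, 2],
    4: [1, 2, 2],
    8: [2, 2, 2],
    16: [4, 2, 2],
}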
Code Example #6
    def __init__(self,
                 tpu_cluster_resolver,
                 train_params,
                 eval_params,
                 eval_steps,
                 eval_metric,
                 input_partition_dims=None,
                 num_cores_per_replica=None,
                 tpu_job_name=None):
        tf.logging.info("TrainLowLevelRunner: constructor")

        self.tpu_cluster_resolver = tpu_cluster_resolver
        self.eval_metric = eval_metric
        self.train_params = train_params
        self.eval_params = eval_params
        self.train_params["batch_size"] = (train_params["train_batch_size"] //
                                           train_params["num_shards"])
        self.eval_params["batch_size"] = (eval_params["eval_batch_size"] //
                                          eval_params["num_shards"])
        self.tpu_job_name = tpu_job_name

        self.model_dir = train_params["model_dir"]
        self.iterations_per_loop = train_params["iterations_per_loop"]
        self.eval_steps = eval_steps
        self.num_shards = self.train_params["num_shards"]
        self.input_flattener = runner_utils.InputsFlattener()
        self.eval_input_flattener = runner_utils.InputsFlattener()
        self.num_hosts = None
        self.train_eval_compile_op = None
        self.train_eval_op = None
        self.infeed_queue = []
        self.eval_infeed_queue = []
        self.outfeed_names = []
        self.outfeed_tensors = []
        self.enqueue_ops = []
        self.eval_enqueue_ops = []
        self.dequeue_ops = []
        self.dataset_initializer = []
        self.eval_dataset_initializer = []
        self.scaffold_fn = None
        # Having two separate sessions and graphs to make the initialization faster.
        self.input_sess = None
        self.train_eval_sess = None
        self.input_graph = tf.Graph()
        self.train_eval_graph = tf.Graph()
        self.session_config = tf.ConfigProto(allow_soft_placement=True,
                                             isolate_session_state=True,
                                             operation_timeout_in_ms=600 * 60 *
                                             1000)  # 10 hours
        cluster_spec = self.tpu_cluster_resolver.cluster_spec()
        if cluster_spec:
            self.session_config.cluster_def.CopyFrom(
                cluster_spec.as_cluster_def())
        self.tpu_init = tf.contrib.tpu.initialize_system()
        self.tpu_shutdown = tf.contrib.tpu.shutdown_system()
        self.master = self.tpu_cluster_resolver.get_master()
        self.init_sess = tf.Session(self.master, config=self.session_config)
        self.device_topology = self.init_sess.run(self.tpu_init)
        self.input_partition_dims = input_partition_dims
        self.use_spatial_partition = input_partition_dims is not None
        self.num_cores_per_replica = num_cores_per_replica
        if self.use_spatial_partition:
            computation_shape = _NUM_CORES_TO_COMPUTATION_SHAPE[
                self.num_cores_per_replica]
            self.device_assignment = tpu_device_assignment.device_assignment(
                topology=self.device_topology,
                computation_shape=computation_shape,
                num_replicas=self.num_shards)
            tf.logging.info("num_cores_per_replica: %d",
                            self.num_cores_per_replica)
            tf.logging.info("computation_shape: %s", str(computation_shape))
            tf.logging.info("num_replicas: %d", self.num_shards)
            tf.logging.info(
                "device_assignment.topology.device_coordinates: %s",
                str(self.device_assignment.topology.device_coordinates))
            tf.logging.info("device_assignment.core_assignment: %s",
                            str(self.device_assignment.core_assignment))
            self.input_dims_flattener = runner_utils.InputDimsFlattener(
                self.input_partition_dims)
            eval_input_partition_dims = [
                dict(self.input_partition_dims[0]), None
            ]
            # don't need to partition the "is_padding" dimension
            if eval_params["eval_samples"] % eval_params[
                    "eval_batch_size"] != 0:
                eval_input_partition_dims[0][
                    mask_rcnn_params.IS_PADDING] = None
            self.eval_input_dims_flattener = runner_utils.InputDimsFlattener(
                eval_input_partition_dims)
        else:
            self.device_assignment = None
            self.input_dims_flattener = None
            self.eval_input_dims_flattener = None
        # Summary writer writes out train metrics.
        self.summary_writer = tf.summary.FileWriter(self.model_dir)
        # Summary writer writes out eval metrics.
        eval_output_dir = os.path.join(self.model_dir, "eval")
        tf.gfile.MakeDirs(eval_output_dir)
        self.eval_summary_writer = tf.summary.FileWriter(eval_output_dir)
        self.infeed_thread = None
        self.total_epoch = self.train_params[
            "total_steps"] // self.iterations_per_loop
Code Example #7
    def __init__(self,
                 iterations,
                 eval_steps,
                 sleep_seconds=120,
                 num_multiprocessing_workers=ssd_constants.WORKER_COUNT,
                 num_cores_per_shard=1,
                 input_partition_dims=None):
        tf.logging.info("TrainAndEvalLowLevelRunner: constructor")

        self.eval_steps = eval_steps
        self.feature_structure = {}
        self.eval_feature_structure = {}
        self.loss = None
        self.infeed_queue = []
        self.eval_infeed_queue = []
        self.enqueue_ops = []
        self.dequeue_ops = []
        self.predictions = {}
        self.eval_enqueue_ops = []
        self.train_eval_compile_op = None
        self.dataset_initializer = []
        self.eval_dataset_initializer = []
        self.iterations = iterations
        # TODO(wangtao): change FLAGS.num_shards_per_host to
        # FLAGS.num_cores_per_host once other low-level APIs support
        # spatial partitioning. FLAGS.num_shards_per_host means the number
        # of TPU cores per host.
        self.replicas_per_worker = FLAGS.num_shards_per_host // num_cores_per_shard
        self.num_hosts = FLAGS.num_shards * num_cores_per_shard // FLAGS.num_shards_per_host
        self.num_shards = FLAGS.num_shards
        self.scaffold_fn = None
        self.sess = None
        self.input_sess = None
        self.graph = tf.Graph()
        self.input_graph = tf.Graph()
        self.eval_op = None
        self.infeed_thread = None
        self.eval_epochs = []
        self.success_epoch = 1000
        self.log_epochs = {}
        self.params = {}
        self.train_loop = None
        self.tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name or FLAGS.master,
            zone=FLAGS.tpu_zone,
            project=FLAGS.gcp_project)
        # Disable grappler for better performance.
        self.session_config = tf.ConfigProto(
            allow_soft_placement=True,
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True)),
            isolate_session_state=True,
            operation_timeout_in_ms=600 * 60 * 1000)  # 10 hours
        cluster_spec = self.tpu_cluster_resolver.cluster_spec()
        if cluster_spec:
            self.session_config.cluster_def.CopyFrom(
                cluster_spec.as_cluster_def())
        self.tpu_init = tpu.initialize_system()
        self.tpu_shutdown = tpu.shutdown_system()
        self.master = self.tpu_cluster_resolver.get_master()
        self.init_sess = tf.Session(self.master, config=self.session_config)
        self.outfeed_tensors = []
        self.outfeed_names = []
        self.run_success = False
        self.log_run_success = False
        self.num_multiprocessing_workers = num_multiprocessing_workers

        # Figure out the steps and epochs to eval for MLPerf.
        self.eval_at_steps = np.cumsum(ssd_constants.EVAL_STEPS).tolist()
        self.eval_iterations = [
            steps // 20000 - 1 for steps in self.eval_at_steps
        ]
        self.max_train_iterations = int(
            math.ceil(FLAGS.num_epochs * FLAGS.num_examples_per_epoch /
                      (FLAGS.train_batch_size * self.iterations)))
        self.sleep_seconds = sleep_seconds

        tf.logging.info("eval_at_steps: %s", self.eval_at_steps)
        tf.logging.info("eval_iterations: %s", self.eval_iterations)

        # Init for spatial partitioning.
        self.device_topology = self.init_sess.run(self.tpu_init)
        self.input_partition_dims = [input_partition_dims, None]
        self.use_spatial_partition = (
            input_partition_dims is not None
            and int(np.prod(FLAGS.input_partition_dims)) > 1)
        self.use_spatial_partition = input_partition_dims is not None
        self.num_cores_per_shard = num_cores_per_shard
        if self.use_spatial_partition:
            computation_shape = _NUM_CORES_TO_COMPUTATION_SHAPE[
                self.num_cores_per_shard]
            self.device_assignment = tpu_device_assignment.device_assignment(
                topology=self.device_topology,
                computation_shape=computation_shape,
                num_replicas=self.num_shards)
            tf.logging.info("num_cores_per_shard: %d",
                            self.num_cores_per_shard)
            tf.logging.info("num_hosts: %d", self.num_hosts)
            tf.logging.info("replicas_per_worker: %d",
                            self.replicas_per_worker)
            tf.logging.info("computation_shape: %s", str(computation_shape))
            tf.logging.info("num_shards: %d", self.num_shards)
            tf.logging.info(
                "device_assignment.topology.device_coordinates: %s",
                str(self.device_assignment.topology.device_coordinates))
            tf.logging.info("device_assignment.core_assignment: %s",
                            str(self.device_assignment.core_assignment))
            eval_input_partition_dims = [{
                ssd_constants.BOXES: None,
                ssd_constants.CLASSES: None,
                ssd_constants.IMAGE: input_partition_dims,
                ssd_constants.RAW_SHAPE: None,
                ssd_constants.SOURCE_ID: None,
            }, None]
            if FLAGS.eval_batch_size * eval_steps > FLAGS.eval_samples:
                eval_input_partition_dims[0][ssd_constants.IS_PADDED] = None
            self.eval_input_dims_flattener = utils.InputDimsFlattener(
                eval_input_partition_dims)
        else:
            self.device_assignment = None
            self.eval_input_dims_flattener = None
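When spatial partitioning is enabled, every replica occupies num_cores_per_shard cores, so num_shards * num_cores_per_shard must not exceed the cores in the topology. A small hedged sanity check, not present in the project, that reuses the Topology accessors from the Mesh TensorFlow examples above and assumes the names defined in this constructor:

        # Sketch: consistency check before calling device_assignment().
        topo = tf.contrib.tpu.Topology(serialized=self.device_topology)
        total_cores = int(np.prod(topo.mesh_shape))
        assert int(np.prod(computation_shape)) == self.num_cores_per_shard
        assert self.num_shards * self.num_cores_per_shard <= total_cores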
Code Example #8
    def __init__(self,
                 tpu_cluster_resolver,
                 params,
                 input_partition_dims=None,
                 tpu_job_name=None):
        tf.logging.info("TrainLowLevelRunner: constructor")

        self.tpu_cluster_resolver = tpu_cluster_resolver
        self.params = params
        self.tpu_job_name = tpu_job_name

        self.model_dir = params["model_dir"]
        self.iterations_per_loop = params["iterations_per_loop"]
        self.num_shards = self.params["num_shards"]
        self.input_flattener = runner_utils.InputsFlattener()
        self.feature_structure = {}
        self.train_compile_op = None
        self.train_op = None
        self.infeed_queue = []
        self.enqueue_ops = []
        self.dataset_initializer = []
        self.scaffold_fn = None
        # Having two separate sessions and graphs to make the initialization faster.
        self.input_sess = None
        self.train_sess = None
        self.input_graph = tf.Graph()
        self.train_graph = None
        self.session_config = tf.ConfigProto(allow_soft_placement=True,
                                             isolate_session_state=True,
                                             operation_timeout_in_ms=600 * 60 *
                                             1000)  # 10 hours
        cluster_spec = self.tpu_cluster_resolver.cluster_spec()
        if cluster_spec:
            self.session_config.cluster_def.CopyFrom(
                cluster_spec.as_cluster_def())
        self.tpu_init = tf.contrib.tpu.initialize_system()
        self.tpu_shutdown = tf.contrib.tpu.shutdown_system()
        self.init_sess = tf.Session(self.tpu_cluster_resolver.get_master(),
                                    config=self.session_config)
        self.device_topology = self.init_sess.run(self.tpu_init)
        self.input_partition_dims = input_partition_dims
        self.use_spatial_partition = input_partition_dims is not None
        self.num_cores_per_replica = (self.params["num_cores_per_replica"]
                                      if self.params["num_cores_per_replica"]
                                      else 1)
        if self.use_spatial_partition:
            computation_shape = _NUM_CORES_TO_COMPUTATION_SHAPE[
                self.num_cores_per_replica]
            self.device_assignment = tpu_device_assignment.device_assignment(
                topology=self.device_topology,
                computation_shape=computation_shape,
                num_replicas=self.num_shards)
            tf.logging.info("num_cores_per_replica: %d",
                            self.num_cores_per_replica)
            tf.logging.info("computation_shape: %s", str(computation_shape))
            tf.logging.info("num_replicas: %d", self.num_shards)
            tf.logging.info(
                "device_assignment.topology.device_coordinates: %s",
                str(self.device_assignment.topology.device_coordinates))
            tf.logging.info("device_assignment.core_assignment: %s",
                            str(self.device_assignment.core_assignment))
            self.input_dims_flattener = runner_utils.InputDimsFlattener(
                self.input_partition_dims)
        else:
            self.device_assignment = None
            self.input_dims_flattener = None
        self.queue = Queue.Queue()
        # Summary writer writes out train metrics.
        self.summary_writer = tf.summary.FileWriter(self.model_dir)
        self.infeed_thread = None