def _get_device_assignment(self):
    """Returns the TPU device assignment for the current master, caching it.

    The assignment is looked up in `self._lazy_device_assignment_dict` under
    the master address. On a miss it is built from the TPU system topology
    with `self._computation_shape` and `self.num_replicas`, logged, stored
    in the cache and returned.
    """
    master_address = self._get_master_address()
    cached = self._lazy_device_assignment_dict.get(master_address)
    if cached is not None:
        return cached

    topology = self._get_tpu_system_metadata().topology
    assignment = tpu_device_assignment.device_assignment(
        topology,
        computation_shape=self._computation_shape,
        num_replicas=self.num_replicas)

    # Log the inputs and the resulting placement once per master.
    logging.info('num_cores_per_replica: %s',
                 str(self._config.tpu_config.num_cores_per_replica))
    logging.info('computation_shape: %s', str(self._computation_shape))
    logging.info('num_replicas: %d', self.num_replicas)
    logging.info('device_assignment.topology.device_coordinates: %s',
                 str(assignment.topology.device_coordinates))
    logging.info('device_assignment.core_assignment: %s',
                 str(assignment.core_assignment))

    self._lazy_device_assignment_dict[master_address] = assignment
    return assignment
def _get_device_assignment(self):
    """Fetches the TPU device assignment, computing it on first use.

    Results are memoized per master address in
    `self._lazy_device_assignment_dict`; a newly computed assignment is
    logged before it is cached.
    """
    key = self._get_master_address()
    assignment = self._lazy_device_assignment_dict.get(key)

    if assignment is None:
        metadata = self._get_tpu_system_metadata()
        assignment = tpu_device_assignment.device_assignment(
            metadata.topology,
            computation_shape=self._computation_shape,
            num_replicas=self.num_replicas)

        cores_per_replica = self._config.tpu_config.num_cores_per_replica
        logging.info('num_cores_per_replica: %s', str(cores_per_replica))
        logging.info('computation_shape: %s', str(self._computation_shape))
        logging.info('num_replicas: %d', self.num_replicas)
        coordinates = assignment.topology.device_coordinates
        logging.info('device_assignment.topology.device_coordinates: %s',
                     str(coordinates))
        logging.info('device_assignment.core_assignment: %s',
                     str(assignment.core_assignment))

        self._lazy_device_assignment_dict[key] = assignment

    return assignment
def __init__(self, sess, use_tpu, mesh_shape, layout_rules):
    """Builds the mesh implementation for TPU or for device placement.

    Args:
      sess: session used to list CPU/GPU devices and, when `use_tpu` is
        truthy, to run `tpu.initialize_system()`.
      use_tpu: when truthy a `SimdMeshImpl` is created from the TPU
        topology; otherwise a `PlacementMeshImpl` over the listed GPU
        devices is created.
      mesh_shape: converted with `mtf.convert_to_shape`.
      layout_rules: forwarded unchanged to the mesh implementation.
    """
    super(MeshContext, self).__init__()
    self._use_tpu = use_tpu
    self._mesh_shape = mtf.convert_to_shape(mesh_shape)
    self._layout_rules = layout_rules

    # TPU-only attributes; they stay None on the non-TPU path.
    self._d_assignment = None
    self._num_hosts = None
    self._num_cores = None

    self._cpu_devices, self._gpu_devices = self._list_cpu_gpu_devices(sess)

    if not self._use_tpu:
        self._mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(
            self._mesh_shape, self._layout_rules, self._gpu_devices)
        return

    serialized_topology = sess.run(tpu.initialize_system())
    topo = tpu.Topology(serialized=serialized_topology)
    self._num_cores = int(np.prod(topo.mesh_shape))
    self._num_hosts = int(topo.num_tasks)

    # Sanity check: the cores must divide evenly across the hosts.
    cores_per_host = int(self._num_cores // self._num_hosts)
    assert cores_per_host == int(topo.num_tpus_per_task)

    # Device assignment handed to mtf: computation shape [1, 1, 1] with as
    # many replicas as there are cores.
    self._d_assignment = device_assignment.device_assignment(
        serialized_topology,
        computation_shape=[1, 1, 1],
        num_replicas=self._num_cores)

    self._mesh_impl = mtf.simd_mesh_impl.SimdMeshImpl(
        self._mesh_shape, self._layout_rules, None, self._d_assignment)
def _WaitTillInit():
    """Initializes the TPU system and publishes its device assignment.

    Runs `tf.tpu.initialize_system` in a session from `self._GetSession()`,
    builds a device assignment from the returned topology and passes it to
    `py_utils.SetTpuDeviceAssignment`. Transient TensorFlow errors are
    logged and then re-raised.
    """
    try:
        with self._GetSession() as sess:
            init_op = tf.tpu.initialize_system(embedding_config=None, job=None)
            topology = sess.run(init_op)

            assignment = device_assignment_lib.device_assignment(
                topology,
                computation_shape=ComputationShape(num_devices_per_split),
                num_replicas=data_parallelism)
            py_utils.SetTpuDeviceAssignment(assignment)

            tf.logging.info('device_assignment.core_assignment: %s',
                            str(assignment.core_assignment))
            tf.logging.info(
                'device_assignment.topology.device_coordinates: %s',
                str(assignment.topology.device_coordinates))
    except py_utils.transient_tf_errors as e:
        tf.logging.info('TPU initialization failed: %s', e)
        raise
def train_and_eval():
    """Alternates train and eval phases of a MeshTensorflow model.

    This path does not go through TPUEstimator. The TPU system is
    initialized once, a SIMD mesh implementation is built from its topology,
    and `FLAGS.num_training_loops` rounds of train-then-eval are run before
    `_shutdown()` is called.

    TODO(lehou): Pack everything nicely as a set of APIs.
    """
    tf.logging.info('FLAGS.master: {}'.format(FLAGS.master))

    # A session is opened to initialize the TPU system and to get the list
    # of CPU devices that hold master variables.
    session_config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(target=FLAGS.master, config=session_config) as sess:
        topology = sess.run(tpu.initialize_system())
        cpu_devices = _list_cpu_devices(sess)

    topo = tf.contrib.tpu.Topology(serialized=topology)
    num_cores = int(np.prod(topo.mesh_shape))
    num_hosts = int(topo.num_tasks)
    cores_per_host = int(num_cores // num_hosts)
    assert cores_per_host == int(topo.num_tpus_per_task)

    # Device assignment for mtf: computation shape [1, 1, 1], one replica
    # per core.
    d_assignment = device_assignment.device_assignment(
        topology, computation_shape=[1, 1, 1], num_replicas=num_cores)

    # Mesh implementation shared by every train and eval phase.
    shape = mtf.convert_to_shape(FLAGS.mesh_shape)
    layout = unet.get_layout()
    mesh_impl = mtf.simd_mesh_impl.SimdMeshImpl(
        shape, layout, None, d_assignment)

    phase_args = (mesh_impl, cpu_devices, d_assignment, num_hosts, num_cores)
    for _ in range(FLAGS.num_training_loops):
        _train_phase(*phase_args)
        _eval_phase(*phase_args)

    _shutdown()

    tf.logging.info('finished.')
def __init__(self, iterations, num_cores_per_shard=1,
             input_partition_dims=None):
    """Sets up sessions, TPU initialization and optional spatial partition.

    Args:
      iterations: stored as `self.iterations`.
      num_cores_per_shard: cores used by one shard; used to derive
        `replicas_per_worker`, `num_hosts` and the computation shape.
      input_partition_dims: stored as `self.input_partition_dims`. Spatial
        partitioning is enabled only when this is not None and the product
        of `FLAGS.input_partition_dims` is greater than 1.
    """
    tf.logging.info("TrainLowLevelRunner: constructor")
    self.feature_structure = {}
    self.loss = None
    self.infeed_queue = []
    self.enqueue_ops = []
    self.dataset_initializer = []
    self.iterations = iterations

    # TODO(wangtao): change FLAGS.num_shards_per_host to
    # FLAGS.num_cores_per_host after other low level API
    # support spatial partition. FLAGS.num_shards_per_host means number of TPU
    # cores for each host.
    cores_per_host = FLAGS.num_shards_per_host
    self.replicas_per_worker = cores_per_host // num_cores_per_shard
    self.num_hosts = FLAGS.num_shards * num_cores_per_shard // cores_per_host
    self.num_shards = FLAGS.num_shards
    self.scaffold_fn = None

    # Input and training get their own session and graph so that
    # initialization is faster.
    self.input_sess = None
    self.train_sess = None
    self.input_graph = tf.Graph()
    self.train_graph = None

    self.tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    # Grappler's meta optimizer is switched off for better performance.
    rewrite_options = rewriter_config_pb2.RewriterConfig(
        disable_meta_optimizer=True)
    self.session_config = tf.ConfigProto(
        allow_soft_placement=True,
        graph_options=tf.GraphOptions(rewrite_options=rewrite_options),
        isolate_session_state=True)

    cluster_spec = self.tpu_cluster_resolver.cluster_spec()
    if cluster_spec:
        cluster_def = cluster_spec.as_cluster_def()
        self.session_config.cluster_def.CopyFrom(cluster_def)

    self.tpu_init = tpu.initialize_system()
    self.tpu_shutdown = tpu.shutdown_system()
    master = self.tpu_cluster_resolver.get_master()
    self.init_sess = tf.Session(master, config=self.session_config)
    self.queue = Queue.Queue()

    # Init for spatial partitioning.
    self.device_topology = self.init_sess.run(self.tpu_init)
    self.input_partition_dims = input_partition_dims
    # NOTE(review): the size check reads FLAGS.input_partition_dims rather
    # than the `input_partition_dims` argument -- confirm they always agree.
    if input_partition_dims is None:
        self.use_spatial_partition = False
    else:
        partition_count = int(np.prod(FLAGS.input_partition_dims))
        self.use_spatial_partition = partition_count > 1
    self.num_cores_per_shard = num_cores_per_shard

    if not self.use_spatial_partition:
        self.device_assignment = None
        return

    computation_shape = _NUM_CORES_TO_COMPUTATION_SHAPE[
        self.num_cores_per_shard]
    self.device_assignment = tpu_device_assignment.device_assignment(
        topology=self.device_topology,
        computation_shape=computation_shape,
        num_replicas=self.num_shards)

    tf.logging.info("num_cores_per_shard: %d", self.num_cores_per_shard)
    tf.logging.info("num_hosts: %d", self.num_hosts)
    tf.logging.info("replicas_per_worker: %d", self.replicas_per_worker)
    tf.logging.info("computation_shape: %s", str(computation_shape))
    tf.logging.info("num_shards: %d", self.num_shards)
    tf.logging.info(
        "device_assignment.topology.device_coordinates: %s",
        str(self.device_assignment.topology.device_coordinates))
    tf.logging.info("device_assignment.core_assignment: %s",
                    str(self.device_assignment.core_assignment))
def __init__(self, tpu_cluster_resolver, train_params, eval_params,
             eval_steps, eval_metric, input_partition_dims=None,
             num_cores_per_replica=None, tpu_job_name=None):
    """Prepares state, sessions and summary writers for train-and-eval.

    Args:
      tpu_cluster_resolver: provides `cluster_spec()` and `get_master()`.
      train_params: dict of training parameters. It is stored by reference
        and gains a per-shard "batch_size" entry.
      eval_params: dict of eval parameters. It is stored by reference and
        gains a per-shard "batch_size" entry.
      eval_steps: stored as `self.eval_steps`.
      eval_metric: stored as `self.eval_metric`.
      input_partition_dims: when not None, spatial partitioning is enabled
        and a device assignment plus input-dims flatteners are created.
      num_cores_per_replica: key into `_NUM_CORES_TO_COMPUTATION_SHAPE`
        when spatial partitioning is enabled.
      tpu_job_name: stored as `self.tpu_job_name`.
    """
    tf.logging.info("TrainLowLevelRunner: constructor")
    self.tpu_cluster_resolver = tpu_cluster_resolver
    self.eval_metric = eval_metric
    self.train_params = train_params
    self.eval_params = eval_params

    # Per-shard batch sizes are written back into the parameter dicts.
    train_batch_per_shard = (
        train_params["train_batch_size"] // train_params["num_shards"])
    self.train_params["batch_size"] = train_batch_per_shard
    eval_batch_per_shard = (
        eval_params["eval_batch_size"] // eval_params["num_shards"])
    self.eval_params["batch_size"] = eval_batch_per_shard

    self.tpu_job_name = tpu_job_name
    self.model_dir = train_params["model_dir"]
    self.iterations_per_loop = train_params["iterations_per_loop"]
    self.eval_steps = eval_steps
    self.num_shards = self.train_params["num_shards"]
    self.input_flattener = runner_utils.InputsFlattener()
    self.eval_input_flattener = runner_utils.InputsFlattener()
    self.num_hosts = None
    self.train_eval_compile_op = None
    self.train_eval_op = None
    self.infeed_queue = []
    self.eval_infeed_queue = []
    self.outfeed_names = []
    self.outfeed_tensors = []
    self.enqueue_ops = []
    self.eval_enqueue_ops = []
    self.dequeue_ops = []
    self.dataset_initializer = []
    self.eval_dataset_initializer = []
    self.scaffold_fn = None

    # Input and train/eval get their own session and graph so that
    # initialization is faster.
    self.input_sess = None
    self.train_eval_sess = None
    self.input_graph = tf.Graph()
    self.train_eval_graph = tf.Graph()

    self.session_config = tf.ConfigProto(
        allow_soft_placement=True,
        isolate_session_state=True,
        operation_timeout_in_ms=600 * 60 * 1000)  # 10 hours
    cluster_spec = self.tpu_cluster_resolver.cluster_spec()
    if cluster_spec:
        cluster_def = cluster_spec.as_cluster_def()
        self.session_config.cluster_def.CopyFrom(cluster_def)

    self.tpu_init = tf.contrib.tpu.initialize_system()
    self.tpu_shutdown = tf.contrib.tpu.shutdown_system()
    self.master = self.tpu_cluster_resolver.get_master()
    self.init_sess = tf.Session(self.master, config=self.session_config)
    self.device_topology = self.init_sess.run(self.tpu_init)

    self.input_partition_dims = input_partition_dims
    self.use_spatial_partition = input_partition_dims is not None
    self.num_cores_per_replica = num_cores_per_replica

    if self.use_spatial_partition:
        computation_shape = _NUM_CORES_TO_COMPUTATION_SHAPE[
            self.num_cores_per_replica]
        self.device_assignment = tpu_device_assignment.device_assignment(
            topology=self.device_topology,
            computation_shape=computation_shape,
            num_replicas=self.num_shards)

        tf.logging.info("num_cores_per_replica: %d",
                        self.num_cores_per_replica)
        tf.logging.info("computation_shape: %s", str(computation_shape))
        tf.logging.info("num_replicas: %d", self.num_shards)
        tf.logging.info(
            "device_assignment.topology.device_coordinates: %s",
            str(self.device_assignment.topology.device_coordinates))
        tf.logging.info("device_assignment.core_assignment: %s",
                        str(self.device_assignment.core_assignment))

        self.input_dims_flattener = runner_utils.InputDimsFlattener(
            self.input_partition_dims)

        # Eval uses a copy of the first partition-dims entry so the train
        # dims stay untouched.
        eval_feature_dims = dict(self.input_partition_dims[0])
        eval_has_partial_batch = (
            eval_params["eval_samples"] % eval_params["eval_batch_size"] != 0)
        if eval_has_partial_batch:
            # don't need to partition the "is_padding" dimension
            eval_feature_dims[mask_rcnn_params.IS_PADDING] = None
        self.eval_input_dims_flattener = runner_utils.InputDimsFlattener(
            [eval_feature_dims, None])
    else:
        self.device_assignment = None
        self.input_dims_flattener = None
        self.eval_input_dims_flattener = None

    # Train metrics are written directly under the model directory.
    self.summary_writer = tf.summary.FileWriter(self.model_dir)
    # Eval metrics go to an "eval" subdirectory.
    eval_output_dir = os.path.join(self.model_dir, "eval")
    tf.gfile.MakeDirs(eval_output_dir)
    self.eval_summary_writer = tf.summary.FileWriter(eval_output_dir)

    self.infeed_thread = None
    total_steps = self.train_params["total_steps"]
    self.total_epoch = total_steps // self.iterations_per_loop
def __init__(self, iterations, eval_steps, sleep_seconds=120,
             num_multiprocessing_workers=ssd_constants.WORKER_COUNT,
             num_cores_per_shard=1, input_partition_dims=None):
    """Prepares state, sessions and spatial partitioning for train-and-eval.

    Args:
      iterations: stored as `self.iterations`; also used to compute
        `self.max_train_iterations`.
      eval_steps: stored as `self.eval_steps`; also used to decide whether
        the eval input carries an `IS_PADDED` entry.
      sleep_seconds: stored as `self.sleep_seconds`.
      num_multiprocessing_workers: stored as
        `self.num_multiprocessing_workers`.
      num_cores_per_shard: cores used by one shard; used to derive
        `replicas_per_worker`, `num_hosts` and the computation shape.
      input_partition_dims: when not None, spatial partitioning is enabled.
        It is always stored as `[input_partition_dims, None]`.
    """
    tf.logging.info("TrainAndEvalLowLevelRunner: constructor")
    self.eval_steps = eval_steps
    self.feature_structure = {}
    self.eval_feature_structure = {}
    self.loss = None
    self.infeed_queue = []
    self.eval_infeed_queue = []
    self.enqueue_ops = []
    self.dequeue_ops = []
    self.predictions = {}
    self.eval_enqueue_ops = []
    self.train_eval_compile_op = None
    self.dataset_initializer = []
    self.eval_dataset_initializer = []
    self.iterations = iterations
    # TODO(wangtao): change FLAGS.num_shards_per_host to
    # FLAGS.num_cores_per_host after other low level API
    # support spatial partition. FLAGS.num_shards_per_host means number of TPU
    # cores for each host.
    self.replicas_per_worker = (
        FLAGS.num_shards_per_host // num_cores_per_shard)
    self.num_hosts = (
        FLAGS.num_shards * num_cores_per_shard // FLAGS.num_shards_per_host)
    self.num_shards = FLAGS.num_shards
    self.scaffold_fn = None
    self.sess = None
    self.input_sess = None
    self.graph = tf.Graph()
    self.input_graph = tf.Graph()
    self.eval_op = None
    self.infeed_thread = None
    self.eval_epochs = []
    self.success_epoch = 1000
    self.log_epochs = {}
    self.params = {}
    self.train_loop = None
    self.tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name or FLAGS.master,
        zone=FLAGS.tpu_zone,
        project=FLAGS.gcp_project)
    # Disable grappler for better performance.
    self.session_config = tf.ConfigProto(
        allow_soft_placement=True,
        graph_options=tf.GraphOptions(
            rewrite_options=rewriter_config_pb2.RewriterConfig(
                disable_meta_optimizer=True)),
        isolate_session_state=True,
        operation_timeout_in_ms=600 * 60 * 1000)  # 10 hours
    cluster_spec = self.tpu_cluster_resolver.cluster_spec()
    if cluster_spec:
        self.session_config.cluster_def.CopyFrom(
            cluster_spec.as_cluster_def())
    self.tpu_init = tpu.initialize_system()
    self.tpu_shutdown = tpu.shutdown_system()
    self.master = self.tpu_cluster_resolver.get_master()
    self.init_sess = tf.Session(self.master, config=self.session_config)
    self.outfeed_tensors = []
    self.outfeed_names = []
    self.run_success = False
    self.log_run_success = False
    self.num_multiprocessing_workers = num_multiprocessing_workers

    # Figure out the steps and epochs to eval for MLPerf.
    self.eval_at_steps = np.cumsum(ssd_constants.EVAL_STEPS).tolist()
    self.eval_iterations = [
        steps // 20000 - 1 for steps in self.eval_at_steps
    ]
    self.max_train_iterations = int(
        math.ceil(FLAGS.num_epochs * FLAGS.num_examples_per_epoch /
                  (FLAGS.train_batch_size * self.iterations)))
    self.sleep_seconds = sleep_seconds
    tf.logging.info("eval_at_steps: %s", self.eval_at_steps)
    tf.logging.info("eval_iterations: %s", self.eval_iterations)

    # Init for spatial partitioning.
    self.device_topology = self.init_sess.run(self.tpu_init)
    self.input_partition_dims = [input_partition_dims, None]
    # Spatial partitioning is on whenever partition dims are supplied. An
    # earlier assignment that also required
    # np.prod(FLAGS.input_partition_dims) > 1 was immediately overwritten by
    # this one, so it was a dead store and has been removed; the effective
    # behaviour is unchanged.
    self.use_spatial_partition = input_partition_dims is not None
    self.num_cores_per_shard = num_cores_per_shard
    if self.use_spatial_partition:
        computation_shape = _NUM_CORES_TO_COMPUTATION_SHAPE[
            self.num_cores_per_shard]
        self.device_assignment = tpu_device_assignment.device_assignment(
            topology=self.device_topology,
            computation_shape=computation_shape,
            num_replicas=self.num_shards)
        tf.logging.info("num_cores_per_shard: %d", self.num_cores_per_shard)
        tf.logging.info("num_hosts: %d", self.num_hosts)
        tf.logging.info("replicas_per_worker: %d", self.replicas_per_worker)
        tf.logging.info("computation_shape: %s", str(computation_shape))
        tf.logging.info("num_shards: %d", self.num_shards)
        tf.logging.info(
            "device_assignment.topology.device_coordinates: %s",
            str(self.device_assignment.topology.device_coordinates))
        tf.logging.info("device_assignment.core_assignment: %s",
                        str(self.device_assignment.core_assignment))
        # Only the image feature is given partition dims; every other eval
        # feature is mapped to None.
        eval_input_partition_dims = [{
            ssd_constants.BOXES: None,
            ssd_constants.CLASSES: None,
            ssd_constants.IMAGE: input_partition_dims,
            ssd_constants.RAW_SHAPE: None,
            ssd_constants.SOURCE_ID: None,
        }, None]
        # When the eval steps cover more examples than FLAGS.eval_samples,
        # an IS_PADDED entry is added, also mapped to None.
        if FLAGS.eval_batch_size * eval_steps > FLAGS.eval_samples:
            eval_input_partition_dims[0][ssd_constants.IS_PADDED] = None
        self.eval_input_dims_flattener = utils.InputDimsFlattener(
            eval_input_partition_dims)
    else:
        self.device_assignment = None
        self.eval_input_dims_flattener = None
def __init__(self, tpu_cluster_resolver, params, input_partition_dims=None,
             tpu_job_name=None):
    """Prepares state, sessions and optional spatial partitioning.

    Args:
      tpu_cluster_resolver: provides `cluster_spec()` and `get_master()`.
      params: dict read for "model_dir", "iterations_per_loop",
        "num_shards" and "num_cores_per_replica".
      input_partition_dims: when not None, spatial partitioning is enabled
        and a device assignment plus an input-dims flattener are created.
      tpu_job_name: stored as `self.tpu_job_name`.
    """
    tf.logging.info("TrainLowLevelRunner: constructor")
    self.tpu_cluster_resolver = tpu_cluster_resolver
    self.params = params
    self.tpu_job_name = tpu_job_name
    self.model_dir = params["model_dir"]
    self.iterations_per_loop = params["iterations_per_loop"]
    self.num_shards = self.params["num_shards"]
    self.input_flattener = runner_utils.InputsFlattener()
    self.feature_structure = {}
    self.train_compile_op = None
    self.train_op = None
    self.infeed_queue = []
    self.enqueue_ops = []
    self.dataset_initializer = []
    self.scaffold_fn = None

    # Input and training get their own session and graph so that
    # initialization is faster.
    self.input_sess = None
    self.train_sess = None
    self.input_graph = tf.Graph()
    self.train_graph = None

    self.session_config = tf.ConfigProto(
        allow_soft_placement=True,
        isolate_session_state=True,
        operation_timeout_in_ms=600 * 60 * 1000)  # 10 hours
    cluster_spec = self.tpu_cluster_resolver.cluster_spec()
    if cluster_spec:
        cluster_def = cluster_spec.as_cluster_def()
        self.session_config.cluster_def.CopyFrom(cluster_def)

    self.tpu_init = tf.contrib.tpu.initialize_system()
    self.tpu_shutdown = tf.contrib.tpu.shutdown_system()
    master = self.tpu_cluster_resolver.get_master()
    self.init_sess = tf.Session(master, config=self.session_config)
    self.device_topology = self.init_sess.run(self.tpu_init)

    self.input_partition_dims = input_partition_dims
    self.use_spatial_partition = input_partition_dims is not None
    # A falsy "num_cores_per_replica" entry falls back to one core.
    self.num_cores_per_replica = self.params["num_cores_per_replica"] or 1

    if not self.use_spatial_partition:
        self.device_assignment = None
        self.input_dims_flattener = None
    else:
        computation_shape = _NUM_CORES_TO_COMPUTATION_SHAPE[
            self.num_cores_per_replica]
        self.device_assignment = tpu_device_assignment.device_assignment(
            topology=self.device_topology,
            computation_shape=computation_shape,
            num_replicas=self.num_shards)

        tf.logging.info("num_cores_per_replica: %d",
                        self.num_cores_per_replica)
        tf.logging.info("computation_shape: %s", str(computation_shape))
        tf.logging.info("num_replicas: %d", self.num_shards)
        tf.logging.info(
            "device_assignment.topology.device_coordinates: %s",
            str(self.device_assignment.topology.device_coordinates))
        tf.logging.info("device_assignment.core_assignment: %s",
                        str(self.device_assignment.core_assignment))

        self.input_dims_flattener = runner_utils.InputDimsFlattener(
            self.input_partition_dims)

    self.queue = Queue.Queue()
    # Train metrics are written under the model directory.
    self.summary_writer = tf.summary.FileWriter(self.model_dir)
    self.infeed_thread = None