def initialize(self):
    """Return the ops that initialize the TPU system.

    Returns:
      A one-element list holding the result of `tpu.initialize_system()`.

    Raises:
      NotImplementedError: If called while executing eagerly.
    """
    if not context.executing_eagerly():
        return [tpu.initialize_system()]
    # TODO(priyag): Add appropriate call here when eager is supported for TPUs.
    raise NotImplementedError(
        'Eager mode not supported in TPUStrategy.')
def setup_tpu_session(master):
    """Create a session connected to the TPU `master` and register it with Keras.

    Args:
      master: Session target handed to `tf_session.Session`.

    Returns:
      The newly created session, after the TPU initialize op has been run.
    """
    isolated_config = config_pb2.ConfigProto(isolate_session_state=True)
    session = tf_session.Session(target=master, config=isolated_config)
    K.set_session(session)
    # The initialize op is run on whatever session Keras hands back.
    keras_session = K.get_session()
    keras_session.run(tpu.initialize_system())
    return session
def testTrainNetworkWithBatchNorm(self, distribution, optimizer_fn, momentum,
                                  renorm, is_tpu,
                                  update_ops_in_cross_tower_mode):
    """Verifies that moving mean updates are reduced across towers.

    Runs ten training steps and, after each one, compares the fetched
    `batchnorm.moving_mean` against a locally maintained expectation that
    is updated with the batch mean averaged over all towers.

    Args:
      distribution: Distribution strategy under test.
      optimizer_fn: Forwarded to `batchnorm_example`.
      momentum: Batch-norm momentum; also used for the expected update.
      renorm: Forwarded to `batchnorm_example`.
      is_tpu: When true, the TPU system is initialized before the steps and
        shut down afterwards.
      update_ops_in_cross_tower_mode: When true, the UPDATE_OPS collection
        is added to the fetches of each step instead of the updates running
        in tower mode.
    """
    with distribution.scope():
        num_towers = len(distribution.worker_devices)
        model_fn, dataset_fn, batchnorm = batchnorm_example(
            optimizer_fn,
            batch_per_epoch=num_towers,
            momentum=momentum,
            renorm=renorm,
            update_ops_in_tower_mode=not update_ops_in_cross_tower_mode)

        # Make sure prefetching is disabled since that makes the
        # specific input on each device to be non deterministic, and
        # this test relies on specific input being on each device.
        if isinstance(distribution, mirrored_strategy.MirroredStrategy):
            self.assertFalse(distribution._prefetch_on_device)
        iterator = distribution.distribute_dataset(
            dataset_fn).make_one_shot_iterator()

        def run_step():
            # Builds one training step across all towers and groups the
            # resulting fetches into a single op.
            fetches = distribution.unwrap(
                distribution.call_for_each_tower(
                    model_fn, iterator.get_next(),
                    run_concurrently=batchnorm.built))
            if update_ops_in_cross_tower_mode:
                fetches += ops.get_collection(ops.GraphKeys.UPDATE_OPS)
            return control_flow_ops.group(fetches)

        if not context.executing_eagerly():
            with self.test_session() as sess:
                if is_tpu:
                    sess.run(tpu.initialize_system())
                # In graph mode the step is built once; `run_step` is
                # rebound to a callable that executes it.
                run_step = sess.make_callable(run_step())
            self.evaluate(variables_lib.global_variables_initializer())

        expected_moving_means = [0.] * 8

        def averaged_batch_mean(i):
            # Each batch has shape [16, 8] where the ith element in jth list is
            # (8 * j + i + tower_id * 100). So the batch mean in each tower is
            # (60 + i + tower_id * 100). So here comes its batch mean over all
            # towers:
            return 60. + i + (num_towers - 1.) / 2. * 100.

        for _ in range(10):
            run_step()
            moving_means = self.evaluate(distribution.fetch(batchnorm.moving_mean))

            # We make sure that the moving_mean is updated as if the sample mean is
            # calculated over all towers.
            # The expectation list is updated in place, so each step builds
            # on the value expected after the previous step.
            for i, expected_moving_mean in enumerate(expected_moving_means):
                expected_moving_means[i] -= ((
                    expected_moving_mean - averaged_batch_mean(i)) * (1.0 - momentum))
                self.assertNear(expected_moving_means[i], moving_means[i], 0.0001)

        if is_tpu:
            with self.test_session() as sess:
                sess.run(tpu.shutdown_system())
def setup_tpu_session(master):
    """Open a Keras/TF session on the TPU `master` and initialize the TPU.

    Args:
      master: Target address for the session.

    Returns:
      The session that was created and installed via `K.set_session`.
    """
    session = tf_session.Session(
        target=master,
        config=config_pb2.ConfigProto(isolate_session_state=True),
    )
    K.set_session(session)
    # Run the TPU initialize op on the session Keras currently reports.
    active_session = K.get_session()
    active_session.run(tpu.initialize_system())
    return session
def _run_tpu_initialization(self):
    """Test TPU system initialization.

    Connects to the TPU at `self.tpu_ip` on port 8470, runs the initialize
    op followed by the shutdown op, and records 'Passed' in
    `self.tpu_initialization`.
    """
    tpu_address = 'grpc://{0}:8470'.format(self.tpu_ip)
    with tf.Session(tpu_address) as sess:
        # Each op is created immediately before it is run.
        for build_op in (tpu.initialize_system, tpu.shutdown_system):
            sess.run(build_op())
        logging.info('Successfully initialized and shutdown the tpu')
    self.tpu_initialization = 'Passed'
def __init__(self, cpu_model, tpu_name_or_address, strategy):
    """Set up a TPU-backed wrapper around `cpu_model`.

    Creates a dedicated graph and session for the TPU, runs the TPU
    initialize op, and compiles this model if `cpu_model` is already
    compiled.

    Args:
      cpu_model: Source model; its inputs, outputs and name are passed to
        the base-class constructor and its compile settings are reused.
      tpu_name_or_address: Passed to `TPUClusterResolver` to locate the TPU.
      strategy: Stored as `self._strategy`.
    """
    super(models.Model, self).__init__(  # pylint: disable=bad-super-call
        inputs=cpu_model.inputs,
        outputs=cpu_model.outputs,
        name=cpu_model.name,
    )

    # Create a mapping from numpy arrays to infeed managers.
    # Note: uses a list of tuples instead of a map because numpy arrays are
    # not hashable.
    self._numpy_to_infeed_manager_list = []

    self.predict_function = None
    self.test_function = None
    self.train_function = None
    self._strategy = strategy

    self._tpu_name_or_address = tpu_name_or_address
    self._cpu_model = cpu_model
    self._tpu_model = None
    self._tpu_weights_initialized = False
    self._graph = ops.Graph()

    self._cluster_resolver = tpu_cluster_resolver.TPUClusterResolver(
        tpu_name_or_address)
    master = self._cluster_resolver.master()
    cluster_spec = self._cluster_resolver.cluster_spec()

    # The cluster definition is a field of the session's ConfigProto, so it
    # must be filled in before the session is constructed. A Session object
    # has no `cluster_def` attribute to copy into afterwards.
    # TODO(saeta): Confirm the cluster_def handling below works in a
    # ClusterSpec propagation env.
    session_config = config_pb2.ConfigProto(isolate_session_state=True)
    if cluster_spec:
        session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())

    self._session = tf_session.Session(
        graph=self._graph, target=master, config=session_config)

    with self._graph.as_default():
        self._session.run(tpu.initialize_system())

    # If the input CPU model has already been compiled, compile our TPU model
    # immediately.
    if self._cpu_model.optimizer:
        self.compile(
            self._cpu_model.optimizer,
            self._cpu_model.loss,
            self._cpu_model.metrics,
            self._cpu_model.loss_weights,
            self._cpu_model.sample_weight_mode,
            self._cpu_model.weighted_metrics,
            self._cpu_model.target_tensors,
        )
def __init__(self, cpu_model, tpu_name_or_address, strategy):
    """Build the TPU counterpart of `cpu_model`.

    A private graph and session are created for the TPU, the TPU system is
    initialized in that session, and the model is compiled right away when
    `cpu_model` already carries an optimizer.

    Args:
      cpu_model: Model whose inputs, outputs, name and compile settings are
        reused.
      tpu_name_or_address: Handed to `TPUClusterResolver`.
      strategy: Stored as `self._strategy`.
    """
    super(models.Model, self).__init__(  # pylint: disable=bad-super-call
        inputs=cpu_model.inputs,
        outputs=cpu_model.outputs,
        name=cpu_model.name,
    )

    # Create a mapping from numpy arrays to infeed managers.
    # Note: uses a list of tuples instead of a map because numpy arrays are
    # not hashable.
    self._numpy_to_infeed_manager_list = []

    self.predict_function = None
    self.test_function = None
    self.train_function = None
    self._strategy = strategy

    self._tpu_name_or_address = tpu_name_or_address
    self._cpu_model = cpu_model
    self._tpu_model = None
    self._tpu_weights_initialized = False
    self._graph = ops.Graph()

    self._cluster_resolver = tpu_cluster_resolver.TPUClusterResolver(
        tpu_name_or_address)
    master = self._cluster_resolver.master()
    cluster_spec = self._cluster_resolver.cluster_spec()

    # `cluster_def` lives on the ConfigProto, not on the Session, so the
    # cluster spec has to be copied into the config before the session is
    # created.
    # TODO(saeta): Confirm the cluster_def handling below works in a
    # ClusterSpec propagation env.
    session_config = config_pb2.ConfigProto(isolate_session_state=True)
    if cluster_spec:
        session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())

    self._session = tf_session.Session(
        graph=self._graph, target=master, config=session_config)

    with self._graph.as_default():
        self._session.run(tpu.initialize_system())

    # If the input CPU model has already been compiled, compile our TPU model
    # immediately.
    if self._cpu_model.optimizer:
        self.compile(
            self._cpu_model.optimizer,
            self._cpu_model.loss,
            self._cpu_model.metrics,
            self._cpu_model.loss_weights,
            self._cpu_model.sample_weight_mode,
            self._cpu_model.weighted_metrics,
            self._cpu_model.target_tensors,
        )
def initialize(self):
    """Return the TPU initialize ops stored on the default graph.

    The initialize op is created at most once per graph and kept in the
    `_TPU_INITIALIZE_SYSTEM_COLLECTION` graph collection.

    Returns:
      The contents of that collection as a list.

    Raises:
      NotImplementedError: If called while executing eagerly.
    """
    if context.executing_eagerly():
        # TODO(priyag): Add appropriate call here when eager is supported for TPUs.
        raise NotImplementedError('Eager mode not supported in TPUStrategy.')

    # TODO(jhseu): We need this hack because DistributionStrategies must be
    # pickleable for copy.deepcopy(). Remove when initialize_system goes away.
    default_graph = ops.get_default_graph()
    if not default_graph.get_collection(_TPU_INITIALIZE_SYSTEM_COLLECTION):
        default_graph.add_to_collection(_TPU_INITIALIZE_SYSTEM_COLLECTION,
                                        tpu.initialize_system())
    return default_graph.get_collection(_TPU_INITIALIZE_SYSTEM_COLLECTION)
def initialize(self):
    """Fetch (creating on first use) the TPU initialize ops for this graph.

    Returns:
      The list held in the default graph's
      `_TPU_INITIALIZE_SYSTEM_COLLECTION` collection.

    Raises:
      NotImplementedError: If called while executing eagerly.
    """
    if context.executing_eagerly():
        # TODO(priyag): Add appropriate call here when eager is supported for TPUs.
        raise NotImplementedError("Eager mode not supported in TPUStrategy.")

    # TODO(jhseu): We need this hack because DistributionStrategies must be
    # pickleable for copy.deepcopy(). Remove when initialize_system goes away.
    graph = ops.get_default_graph()
    cached_init_ops = graph.get_collection(_TPU_INITIALIZE_SYSTEM_COLLECTION)
    if not cached_init_ops:
        graph.add_to_collection(_TPU_INITIALIZE_SYSTEM_COLLECTION,
                                tpu.initialize_system())
        cached_init_ops = graph.get_collection(
            _TPU_INITIALIZE_SYSTEM_COLLECTION)
    return cached_init_ops
def testTrainNetwork(self, distribution, optimizer_fn, use_callable_loss,
                     is_tpu):
    """Checks that |kernel + bias - 1| never increases over ten steps."""
    with distribution.scope():
        model_fn, dataset, layer = minimize_loss_example(
            optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)

        # TODO(isaprykin): Eliminate `is_tpu`. Probably add a
        # `DistributionStrategy.create_monitor` so that each DistributionStrategy
        # could influence its training loop. That method would return an instance
        # of Monitor. TPUMonitor would execute tpu.initialize_system() and
        # tpu.shutdown_system().
        if is_tpu:
            dataset = dataset.batch(2)

        iterator = distribution.distribute_dataset(dataset)

        def run_step():
            # TODO(isaprykin): Make iterator get_next() return a list of sub-
            # batches for each iteration. Pass iterator.get_next() and not iterator
            # to call_for_each_tower.
            tower_input = iterator if is_tpu else iterator.get_next()
            per_tower_ops = distribution.call_for_each_tower(
                model_fn, tower_input, run_concurrently=layer.built)
            return distribution.group(per_tower_ops)

        if not context.executing_eagerly():
            with self.test_session() as sess:
                if is_tpu:
                    sess.run(tpu.initialize_system())
                # Build the step once and rebind `run_step` to its callable.
                run_step = sess.make_callable(run_step())
            self.evaluate(variables_lib.global_variables_initializer())

        weights = []
        biases = []
        for _ in range(10):
            run_step()
            kernel_value = self.evaluate(distribution.fetch(layer.kernel))
            weights.append(kernel_value)
            bias_value = self.evaluate(distribution.fetch(layer.bias))
            biases.append(bias_value)

        if is_tpu:
            with self.test_session() as sess:
                sess.run(tpu.shutdown_system())

        total = numpy.add(numpy.squeeze(weights), numpy.squeeze(biases))
        error = abs(total - 1)
        is_not_increasing = all(
            later <= earlier for earlier, later in zip(error, error[1:]))
        self.assertTrue(is_not_increasing)
def _obtain_topology(master_address, run_config):
    """Initialize the TPU system at `master_address` and return its topology.

    Args:
      master_address: Session target for the TPU master.
      run_config: Forwarded to `_get_session_config_with_timeout`.

    Returns:
      The value produced by running `tpu.initialize_system()`.

    Raises:
      ValueError: If initialization raises `errors.DeadlineExceededError`.
    """
    try:
        logging.info(
            'Initializing TPU system (master: %s) to fetch topology '
            'for model parallelism. This might take a while.',
            master_address)
        with ops.Graph().as_default():
            timeout_config = _get_session_config_with_timeout(
                _INITIAL_TPU_SYSTEM_TIMEOUT_IN_MS, run_config)
            with session_lib.Session(
                    master_address, config=timeout_config) as sess:
                return sess.run(tpu.initialize_system())
    except errors.DeadlineExceededError:
        failure_message = (
            'Fail to initialize TPU system with master (%s). '
            'Please double check the TPU system is functional.' %
            (master_address))
        raise ValueError(failure_message)
def _obtain_topology(master_address, run_config):
    """Fetch the TPU topology by initializing the TPU system.

    Args:
      master_address: Session target for the TPU master.
      run_config: Forwarded to `get_session_config_with_timeout`.

    Returns:
      The result of running `tpu.initialize_system()` in a fresh graph.

    Raises:
      ValueError: If the run fails with `errors.DeadlineExceededError`.
    """
    try:
        logging.info('Initializing TPU system (master: %s) to fetch topology '
                     'for model parallelism. This might take a while.',
                     master_address)
        with ops.Graph().as_default():
            timeout_config = get_session_config_with_timeout(
                _INITIAL_TPU_SYSTEM_TIMEOUT_IN_MS, run_config)
            tpu_session_ctx = session_lib.Session(
                master_address, config=timeout_config)
            with tpu_session_ctx as sess:
                return sess.run(tpu.initialize_system())
    except errors.DeadlineExceededError:
        raise ValueError(
            'Fail to initialize TPU system with master (%s). '
            'Please double check the TPU system is functional.' % (
                master_address))
def initialize_tpu_system(cluster_resolver=None):
    """Initialize the TPU devices in a separate session and graph.

    Args:
      cluster_resolver: A tf.contrib.cluster_resolver.TPUClusterResolver,
        which provides information about the TPU cluster. When None, a
        resolver built from an empty TPU name is used.

    Returns:
      The tf.contrib.tpu.Topology object for the topology of the TPU cluster.
    """
    resolver = cluster_resolver
    if resolver is None:
        resolver = TPUClusterResolver("")
    master = resolver.master()

    logging.info("Initializing the TPU system.")

    if not context.executing_eagerly():
        # Graph mode: run the initialize op in a throwaway graph and session.
        soft_placement_config = config_pb2.ConfigProto(
            allow_soft_placement=True)
        with ops.Graph().as_default(), session_lib.Session(
                config=soft_placement_config, target=master) as sess:
            serialized_topology = sess.run(tpu.initialize_system())
    else:
        # This function looks as it is for the following non-intuitive reasons.
        # tpu.initialize_system creates a dummy op whose sole purpose is to trigger
        # DistributedTPURewritePass. This pass actually adds real ops that
        # initialize the TPU system. Thus, we can't simply run tpu.initialize_system
        # eagerly. We need to wrap it in defun and trigger the rewrite passes on it.
        # The easiest way to trigger a rewrite is to run the function with
        # TPUPartitionedCallOp.
        @function.defun
        def _tpu_init_fn():
            return tpu.initialize_system()

        # We can't call _tpu_init_fn normally (because it contains just a dummy op,
        # see above) but need to define it to get it added to eager context
        # and get its assigned name.
        # pylint: disable=protected-access
        concrete_fn = _tpu_init_fn._get_concrete_function_internal()
        registered_name = compat.as_str(concrete_fn._inference_function.name)
        # pylint: enable=protected-access

        call_outputs = tpu_functional_ops.TPUPartitionedCall(
            args=[],
            device_ordinal=0,
            Tout=[dtypes.string],
            f=registered_name)
        serialized_topology = call_outputs[0].numpy()

    logging.info("Finished initializing TPU system.")
    return topology.Topology(serialized=serialized_topology)
def tpu_session(cluster_resolver):
    """Construct or return a `tf.Session` connected to the given cluster.

    Sessions are cached in the module-level `_SESSIONS` dict, keyed by the
    resolver's master address. A new session gets its own graph and has the
    TPU initialize op run in it before being cached.
    """
    global _SESSIONS
    master = cluster_resolver.master()
    if master in _SESSIONS:
        return _SESSIONS[master]

    session_config = config_pb2.ConfigProto(isolate_session_state=True)
    cluster_spec = cluster_resolver.cluster_spec()
    if cluster_spec:
        session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())

    session_graph = ops.Graph()
    new_session = tf_session.Session(
        graph=session_graph, target=master, config=session_config)
    with session_graph.as_default():
        new_session.run(tpu.initialize_system())

    _SESSIONS[master] = new_session
    return _SESSIONS[master]
def initialize_tpu_system(cluster_resolver=None):
    """Run TPU system initialization in a dedicated graph and session.

    Args:
      cluster_resolver: A tf.contrib.cluster_resolver.TPUClusterResolver,
        which provides information about the TPU cluster. When None, a
        resolver built from an empty TPU name is used.
    """
    resolver = cluster_resolver
    if resolver is None:
        resolver = resolver_lib.TPUClusterResolver("")
    master = resolver.master()

    logging.info("Initializing the TPU system.")
    soft_placement_config = config_pb2.ConfigProto(allow_soft_placement=True)
    with ops.Graph().as_default(), session_lib.Session(
            config=soft_placement_config, target=master) as sess:
        sess.run([tpu.initialize_system()])
    logging.info("Finished initializing TPU system.")
def tpu_session(cluster_resolver):
    """Construct or return a `tf.Session` connected to the given cluster.

    The first call for a given master builds a session on a fresh graph,
    runs the TPU initialize op in it, and stores it in `_SESSIONS`. Later
    calls for the same master return the stored session.
    """
    global _SESSIONS
    master = cluster_resolver.master()

    already_cached = master in _SESSIONS
    if not already_cached:
        cluster_spec = cluster_resolver.cluster_spec()
        isolated_config = config_pb2.ConfigProto(isolate_session_state=True)
        if cluster_spec:
            cluster_def = cluster_spec.as_cluster_def()
            isolated_config.cluster_def.CopyFrom(cluster_def)

        tpu_graph = ops.Graph()
        created = tf_session.Session(
            graph=tpu_graph, target=master, config=isolated_config)
        with tpu_graph.as_default():
            created.run(tpu.initialize_system())
        _SESSIONS[master] = created

    return _SESSIONS[master]
def initialize_tpu_system(cluster_resolver=None):
    """Initialize the TPU devices in a separate session and graph.

    Args:
      cluster_resolver: A tf.contrib.cluster_resolver.TPUClusterResolver,
        which provides information about the TPU cluster. Defaults to a
        resolver constructed with an empty TPU name.
    """
    if cluster_resolver is None:
        cluster_resolver = resolver_lib.TPUClusterResolver("")
    target = cluster_resolver.master()

    logging.info("Initializing the TPU system.")
    init_config = config_pb2.ConfigProto(allow_soft_placement=True)
    scratch_graph = ops.Graph()
    with scratch_graph.as_default():
        init_session = session_lib.Session(config=init_config, target=target)
        with init_session as sess:
            sess.run([tpu.initialize_system()])
    logging.info("Finished initializing TPU system.")
def __init__(self, cpu_model, tpu_name_or_address, strategy):
    """Set up a TPU-backed wrapper around `cpu_model`.

    Creates a dedicated graph and session for the TPU, runs the TPU
    initialize op, and compiles this model if `cpu_model` is already
    compiled.

    Args:
      cpu_model: Source model; its inputs, outputs and name are passed to
        the base-class constructor and its compile settings are reused.
      tpu_name_or_address: Passed to `TPUClusterResolver` to locate the TPU.
      strategy: Stored as `self._strategy`.
    """
    super(models.Model, self).__init__(  # pylint: disable=bad-super-call
        inputs=cpu_model.inputs,
        outputs=cpu_model.outputs,
        name=cpu_model.name,
    )

    self.predict_function = None
    self.test_function = None
    self.train_function = None
    self._strategy = strategy

    self._tpu_name_or_address = tpu_name_or_address
    self._cpu_model = cpu_model
    self._tpu_model = None
    self._tpu_weights_initialized = False
    self._graph = ops.Graph()

    cluster_resolver = tpu_cluster_resolver.TPUClusterResolver(
        tpu_name_or_address)
    cluster_spec = cluster_resolver.cluster_spec()

    # The cluster definition is a field of the session's ConfigProto, so it
    # must be filled in before the session is constructed. A Session object
    # has no `cluster_def` attribute to copy into afterwards.
    session_config = config_pb2.ConfigProto(isolate_session_state=True)
    if cluster_spec:
        session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())

    self._session = tf_session.Session(
        graph=self._graph,
        target=cluster_resolver.master(),
        config=session_config)

    with self._graph.as_default():
        self._session.run(tpu.initialize_system())

    # If the input CPU model has already been compiled, compile our TPU model
    # immediately.
    if self._cpu_model.optimizer:
        self.compile(
            self._cpu_model.optimizer,
            self._cpu_model.loss,
            self._cpu_model.metrics,
            self._cpu_model.loss_weights,
            self._cpu_model.sample_weight_mode,
            self._cpu_model.weighted_metrics,
            self._cpu_model.target_tensors,
        )
def __init__(self, cpu_model, tpu_name_or_address, strategy):
    """Build the TPU counterpart of `cpu_model`.

    A private graph and session are created for the TPU, the TPU system is
    initialized in that session, and the model is compiled right away when
    `cpu_model` already carries an optimizer.

    Args:
      cpu_model: Model whose inputs, outputs, name and compile settings are
        reused.
      tpu_name_or_address: Handed to `TPUClusterResolver`.
      strategy: Stored as `self._strategy`.
    """
    super(models.Model, self).__init__(  # pylint: disable=bad-super-call
        inputs=cpu_model.inputs,
        outputs=cpu_model.outputs,
        name=cpu_model.name,
    )

    self.predict_function = None
    self.test_function = None
    self.train_function = None
    self._strategy = strategy

    self._tpu_name_or_address = tpu_name_or_address
    self._cpu_model = cpu_model
    self._tpu_model = None
    self._tpu_weights_initialized = False
    self._graph = ops.Graph()

    cluster_resolver = tpu_cluster_resolver.TPUClusterResolver(
        tpu_name_or_address)
    cluster_spec = cluster_resolver.cluster_spec()

    # `cluster_def` lives on the ConfigProto, not on the Session, so the
    # cluster spec has to be copied into the config before the session is
    # created.
    session_config = config_pb2.ConfigProto(isolate_session_state=True)
    if cluster_spec:
        session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())

    self._session = tf_session.Session(
        graph=self._graph,
        target=cluster_resolver.master(),
        config=session_config)

    with self._graph.as_default():
        self._session.run(tpu.initialize_system())

    # If the input CPU model has already been compiled, compile our TPU model
    # immediately.
    if self._cpu_model.optimizer:
        self.compile(
            self._cpu_model.optimizer,
            self._cpu_model.loss,
            self._cpu_model.metrics,
            self._cpu_model.loss_weights,
            self._cpu_model.sample_weight_mode,
            self._cpu_model.weighted_metrics,
            self._cpu_model.target_tensors,
        )
def testTrainNetwork(self, distribution, optimizer_fn, use_callable_loss,
                     is_tpu):
    """Checks that |kernel + bias - 1| never increases over ten steps.

    The test is skipped when `is_tpu` is true.
    """
    # TODO(priyag): Remove this once the step TPU Strategy is stable.
    if is_tpu:
        self.skipTest("TPU tests are WIP.")

    with distribution.scope():
        model_fn, dataset_fn, layer = minimize_loss_example(
            optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)

        # TODO(isaprykin): Eliminate `is_tpu`. Probably add a
        # `DistributionStrategy.create_monitor` so that each DistributionStrategy
        # could influence its training loop. That method would return an instance
        # of Monitor. TPUMonitor would execute tpu.initialize_system() and
        # tpu.shutdown_system().
        distributed_dataset = distribution.distribute_dataset(dataset_fn)
        iterator = distributed_dataset.make_one_shot_iterator()

        def run_step():
            per_tower_ops = distribution.call_for_each_tower(
                model_fn, iterator.get_next(), run_concurrently=layer.built)
            return distribution.group(per_tower_ops)

        if not context.executing_eagerly():
            with self.test_session() as sess:
                if is_tpu:
                    sess.run(tpu.initialize_system())
                # Build the step once and rebind `run_step` to its callable.
                run_step = sess.make_callable(run_step())
            self.evaluate(variables_lib.global_variables_initializer())

        weights = []
        biases = []
        for _ in range(10):
            run_step()
            weights.append(self.evaluate(layer.kernel))
            biases.append(self.evaluate(layer.bias))

        if is_tpu:
            with self.test_session() as sess:
                sess.run(tpu.shutdown_system())

        total = numpy.add(numpy.squeeze(weights), numpy.squeeze(biases))
        error = abs(total - 1)
        is_not_increasing = all(
            later <= earlier for earlier, later in zip(error, error[1:]))
        self.assertTrue(is_not_increasing)
def run_on_device(self, model_fn, model_inputs, device):
    """Runs `model_fn` on the given device.

    Raises an exception if no such device is available. `model_fn` should
    return one or more tensors as a list or tuple.

    Args:
      model_fn: Function returning one or more tensors.
      model_inputs: An iterable of Numpy arrays or scalars. These will be
        passed as arguments to `model_fn`.
      device: Device to run on. One of ("tpu", "gpu", "cpu").

    Returns:
      Output from the model function. For any other `device` value nothing
      is run and None is returned.
    """

    def _make_placeholders():
        # Maps each placeholder to the value it is fed with.
        return {
            gen_array_ops.placeholder_with_default(value, value.shape): value
            for value in model_inputs
        }

    if device == "tpu":
        with self.test_session(graph=ops.Graph()) as sess:
            feed = _make_placeholders()
            tpu_computation = tpu.rewrite(model_fn, feed.keys())
            sess.run(tpu.initialize_system())
            sess.run(variables.global_variables_initializer())
            result = sess.run(tpu_computation, feed)
            sess.run(tpu.shutdown_system())
            # TODO(b/36891278): supports non-flat returns lists in tpu.rewrite().
            return result[0] if len(result) == 1 else result

    if device == "gpu" or device == "cpu":
        # TODO(power) -- will this interact poorly with cached GPU sessions?
        # (The question above was raised for the "cpu" case.)
        wants_gpu = device == "gpu"
        with self.test_session(graph=ops.Graph(), use_gpu=wants_gpu) as sess:
            feed = _make_placeholders()
            sess.run(variables.global_variables_initializer())
            return sess.run(model_fn(feed.keys()), feed)

    return None
def run_on_device(self, model_fn, model_inputs, device):
    """Runs `model_fn` on the given device.

    Raises an exception if no such device is available. `model_fn` should
    return one or more tensors as a list or tuple.

    Args:
      model_fn: Function returning one or more tensors.
      model_inputs: An iterable of Numpy arrays or scalars. These will be
        passed as arguments to `model_fn`.
      device: Device to run on. One of ("tpu", "gpu", "cpu").

    Returns:
      Output from the model function. For any other `device` value nothing
      is run and None is returned.
    """

    def _make_placeholders():
        # Maps each placeholder to the value it is fed with.
        placeholder_to_value = {}
        for value in model_inputs:
            placeholder = gen_array_ops.placeholder_with_default(
                value, value.shape)
            placeholder_to_value[placeholder] = value
        return placeholder_to_value

    def _run_without_rewrite(use_gpu):
        # Shared path for the "gpu" and "cpu" devices.
        with self.test_session(graph=ops.Graph(), use_gpu=use_gpu) as sess:
            placeholders = _make_placeholders()
            sess.run(variables.global_variables_initializer())
            return sess.run(model_fn(placeholders.keys()), placeholders)

    if device == "tpu":
        with self.test_session(graph=ops.Graph()) as sess:
            placeholders = _make_placeholders()
            tpu_computation = tpu.rewrite(model_fn, placeholders.keys())
            sess.run(tpu.initialize_system())
            sess.run(variables.global_variables_initializer())
            result = sess.run(tpu_computation, placeholders)
            sess.run(tpu.shutdown_system())
            # TODO(b/36891278): supports non-flat returns lists in tpu.rewrite().
            if len(result) != 1:
                return result
            return result[0]
    elif device == "gpu":
        return _run_without_rewrite(True)
    elif device == "cpu":
        # TODO(power) -- will this interact poorly with cached GPU sessions?
        return _run_without_rewrite(False)
def _initialize_tpu(self):
    """Initialize the TPU devices in a separate session and graph.

    We keep track of all the TPU masters that have been initialized, since
    TPU initialization should only run once per master for the entire
    process. Masters are recorded in `TPUExtended._initialized_devices`.
    """
    master = self._tpu_cluster_resolver.master()
    # Verify TPU has not already been initialized in this process.
    if master in TPUExtended._initialized_devices:
        # Pass `master` as a logging argument rather than pre-formatting
        # the message with `%`.
        logging.info("TPU master %s has already been initialized.", master)
        return

    logging.info("Initializing the TPU system.")
    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
    self._configure(session_config)
    with ops.Graph().as_default():
        with session_lib.Session(config=session_config, target=master) as sess:
            sess.run([tpu.initialize_system()])
    logging.info("Finished initializing TPU system.")
    # Update Strategy state to make sure we can track device initialization.
    TPUExtended._initialized_devices.append(master)
def setup_tpu_session(tpu_name_or_address):
    """Initializes and returns a Keras/TF session connected to the TPU `master`.

    Args:
      tpu_name_or_address: A string that is either the name of the Cloud TPU,
        the grpc address of the Cloud TPU, or (Googlers only) the BNS name of
        the Cloud TPU. If tpu_name_or_address is None, the TPUClusterResolver
        will examine the environment to determine a potential Cloud TPU to
        use.

    Returns:
      A `tf.Session`.
    """
    cluster_resolver = tpu_cluster_resolver.TPUClusterResolver(
        tpu_name_or_address)
    cluster_spec = cluster_resolver.cluster_spec()

    # The cluster definition is a field of the session's ConfigProto, so it
    # must be filled in before the session is constructed. A Session object
    # has no `cluster_def` attribute to copy into afterwards.
    session_config = config_pb2.ConfigProto(isolate_session_state=True)
    if cluster_spec:
        session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())

    session = tf_session.Session(
        target=cluster_resolver.master(), config=session_config)
    K.set_session(session)
    K.get_session().run(tpu.initialize_system())
    return session
def setup_tpu_session(tpu_name_or_address):
    """Initializes and returns a Keras/TF session connected to the TPU `master`.

    Args:
      tpu_name_or_address: A string that is either the name of the Cloud TPU,
        the grpc address of the Cloud TPU, or (Googlers only) the BNS name of
        the Cloud TPU. If tpu_name_or_address is None, the TPUClusterResolver
        will examine the environment to determine a potential Cloud TPU to
        use.

    Returns:
      A `tf.Session`.
    """
    cluster_resolver = tpu_cluster_resolver.TPUClusterResolver(
        tpu_name_or_address)
    cluster_spec = cluster_resolver.cluster_spec()

    # `cluster_def` lives on the ConfigProto, not on the Session, so the
    # cluster spec has to be copied into the config before the session is
    # created.
    session_config = config_pb2.ConfigProto(isolate_session_state=True)
    if cluster_spec:
        session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())

    session = tf_session.Session(
        target=cluster_resolver.master(), config=session_config)
    K.set_session(session)
    K.get_session().run(tpu.initialize_system())
    return session
def load(self,
         model_path,
         model_output_dir,
         image_list_inmemory,
         params,
         batch_size=128,
         master="local",
         scenario="Offline",
         batch_timeout_micros=20 * 1000):
    """Build the TPU inference graph, initialize the TPU and load weights.

    Args:
      model_path: Checkpoint path handed to `init_from_checkpoint`. When
        `params["use_fused_bn"]` is set it is first replaced by the result
        of `convert_checkpoint.convert_checkpoint`.
      model_output_dir: Output directory given to the checkpoint converter.
      image_list_inmemory: Image data turned into an int32 constant; batches
        are gathered from it by index.
      params: Model parameters; `use_fused_bn` and `conv0_space_to_depth`
        are read here and the whole object is stored as `self.params`.
      batch_size: Either a sequence of allowed batch sizes, whose last entry
        is used as the maximum batch size, or a single int (treated as a
        one-element sequence).
      master: Session target.
      scenario: "Offline" builds a fixed-batch op via `offline_op`; any
        other value builds a batching op via `server_op`.
      batch_timeout_micros: Passed to `server_op`.

    Returns:
      `self`.
    """
    # The body indexes `batch_size[-1]`, which raises TypeError for a plain
    # int such as the default value; normalize to a list first.
    if isinstance(batch_size, int):
        batch_size = [batch_size]
    max_batch_size = batch_size[-1]

    if params["use_fused_bn"]:
        model_path = convert_checkpoint.convert_checkpoint(
            model_path, model_output_dir)

    tpu_graph = tf.Graph()
    tpu_config = tf.ConfigProto(
        operation_timeout_in_ms=600 * 1000,
        allow_soft_placement=True,
        graph_options=tf.GraphOptions(
            rewrite_options=rewriter_config_pb2.RewriterConfig(
                disable_meta_optimizer=True)),
        isolate_session_state=True)
    self.sess = tf.Session(master, graph=tpu_graph, config=tpu_config)
    self.params = params

    with tpu_graph.as_default():
        image_list = tf.constant(image_list_inmemory, dtype=tf.int32)
        if scenario == "Offline":
            self.indices = tf.placeholder(
                shape=(max_batch_size), dtype=tf.int32)
            self.source_id = tf.placeholder(
                shape=(max_batch_size), dtype=tf.int32)
            self.raw_shape = tf.placeholder(
                shape=(max_batch_size, 3), dtype=tf.int32)
            image = tf.gather(image_list, self.indices, axis=0)
            if not params["conv0_space_to_depth"]:
                # Transpose from [N, C, H, W] to [H, W, C, N]
                image = tf.transpose(image, [2, 3, 1, 0])
            self.predict_op = self.offline_op(
                (image, self.source_id, self.raw_shape))
        else:
            self.indices = tf.placeholder(dtype=tf.int32)
            self.source_id = tf.placeholder(dtype=tf.int32)
            self.raw_shape = tf.placeholder(dtype=tf.int32, shape=[None, 3])
            image = tf.gather(image_list, self.indices, axis=0)
            self.predict_op = self.server_op(
                [image, self.source_id, self.raw_shape],
                num_batch_threads=16,
                max_batch_size=max_batch_size,
                batch_timeout_micros=batch_timeout_micros,
                allowed_batch_sizes=batch_size,
                max_enqueued_batches=10000)

        self.sess.run(tpu.initialize_system())

        for param in tf.trainable_variables():
            tf.logging.info(
                "  %s, %s, %s" %
                (param.name, str(param.get_shape()), param.op.device))

        # Checkpoint's variable name: https://internal/6714143388205056
        tf.compat.v1.train.init_from_checkpoint(model_path, {
            "ssd1200/": "ssd1200/",
        })
        self.sess.run(tf.initializers.global_variables())

    return self
def begin(self):
    """Create the enqueue ops and the TPU initialize/shutdown ops.

    The results are stored on `self._enqueue_ops`, `self._init_op` and
    `self._finalize_op`; the latter two are one-element lists.
    """
    self._enqueue_ops = self._enqueue_fn()
    tpu_job = self._tpu_job
    logging.info('TPU job name %s', tpu_job)
    init_op = tpu.initialize_system(job=tpu_job)
    self._init_op = [init_op]
    shutdown_op = tpu.shutdown_system(job=tpu_job)
    self._finalize_op = [shutdown_op]
def _tpu_init_fn():
    """Return the result of `tpu.initialize_system()`."""
    init_result = tpu.initialize_system()
    return init_result
def initialize(self):
    """Build the list of ops needed to initialize the TPU system.

    Returns:
      A list with a single entry, the result of `tpu.initialize_system()`.

    Raises:
      NotImplementedError: If called while executing eagerly.
    """
    running_eagerly = context.executing_eagerly()
    if running_eagerly:
        # TODO(priyag): Add appropriate call here when eager is supported for TPUs.
        raise NotImplementedError('Eager mode not supported in TPUStrategy.')
    init_op = tpu.initialize_system()
    return [init_op]
def load(self,
         ckpt_path,
         hparams,
         master='local',
         batch_timeout_micros=80 * 1000,
         buckets=None):
    """Build the TPU inference graph, initialize the TPU and load weights.

    Args:
      ckpt_path: Checkpoint path handed to `tf.train.init_from_checkpoint`.
      hparams: Hyperparameters; `infer_batch_size`, `src_max_len_infer` and
        `tgt_vocab_size` are read here.
      master: TPU name/address given to `TPUClusterResolver`; the resolved
        master is used as the session target.
      batch_timeout_micros: Batch timeout for the per-bucket server ops.
      buckets: Iterated once per server op in the non-Offline scenario.

    Returns:
      `self`.
    """
    self.hparams = hparams
    self.buckets = buckets
    self.tpu_graph = tf.Graph()
    tpu_config = tf.ConfigProto(
        operation_timeout_in_ms=600 * 1000,
        allow_soft_placement=True,
        graph_options=tf.GraphOptions(
            rewrite_options=rewriter_config_pb2.RewriterConfig(
                disable_meta_optimizer=True)),
        isolate_session_state=True)
    # Find tpu master.
    print('master value set to:', master)
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        master, zone=None, project=None)
    master = tpu_cluster_resolver.get_master()
    self.sess = tf.Session(master, graph=self.tpu_graph, config=tpu_config)
    with self.tpu_graph.as_default():
        self.vocab_table = tf.contrib.lookup.index_to_string_table_from_file(
            self.vocab_prefix, default_value=vocab_utils.UNK)

    if self.scenario == 'Offline':
        # Offline: fixed-shape inputs and a single predict op.
        with self.tpu_graph.as_default():
            self.source = tf.placeholder(shape=(hparams.infer_batch_size,
                                                hparams.src_max_len_infer),
                                         dtype=tf.int32)
            self.source_sequence_length = tf.placeholder(
                shape=(hparams.infer_batch_size), dtype=tf.int32)

            inputs = [[self.source, self.source_sequence_length]]
            self.predict_ops.append(self.offline_op(inputs))
    else:
        with self.tpu_graph.as_default():
            self.source = tf.placeholder(
                shape=[None, hparams.src_max_len_infer], dtype=tf.int32)
            self.source_sequence_length = tf.placeholder(shape=[None],
                                                         dtype=tf.int32)

            inputs = [self.source, self.source_sequence_length]
            # NOTE(review): iterating `buckets` fails for the default
            # value None -- confirm callers always pass buckets in this
            # scenario.
            for _ in buckets:
                self.predict_ops.append(
                    self.server_op(
                        inputs,
                        num_batch_threads=16,
                        max_batch_size=hparams.infer_batch_size,
                        batch_timeout_micros=batch_timeout_micros,
                        allowed_batch_sizes=[hparams.infer_batch_size],
                        max_enqueued_batches=10000))
            # Add longest sequence predict op.
            # This op uses a fixed 5000 * 1000 microsecond batch timeout
            # instead of `batch_timeout_micros`.
            self.predict_ops.append(
                self.server_op(
                    inputs,
                    num_batch_threads=16,
                    max_batch_size=hparams.infer_batch_size,
                    batch_timeout_micros=5000 * 1000,
                    allowed_batch_sizes=[hparams.infer_batch_size],
                    max_enqueued_batches=10000))

    with self.tpu_graph.as_default():
        vs = tf.global_variables()

        # For every existing variable, create a float32 twin under the
        # 'f32' scope and an op assigning the (cast) twin to the original.
        assign_ops = []
        var_map = {}
        with tf.variable_scope('f32', dtype=tf.float32):
            for i in vs:
                # `i.name[:-2]` drops the last two characters of the
                # variable name (presumably the ":0" suffix -- TODO confirm).
                if 'output_projection' in i.name:
                    # The twin has `tgt_vocab_size` columns and is padded
                    # with zeros up to 128 * (tgt_vocab_size // 128 + 1)
                    # columns before being assigned.
                    new_var = tf.get_variable(
                        i.name[:-2], [i.shape[0], hparams.tgt_vocab_size])
                    assign_ops.append(
                        tf.assign(
                            i,
                            tf.pad(
                                tf.cast(new_var, i.dtype),
                                [[0, 0],
                                 [
                                     0,
                                     128 * (hparams.tgt_vocab_size // 128 + 1) -
                                     hparams.tgt_vocab_size
                                 ]])))
                else:
                    new_var = tf.get_variable(i.name[:-2], i.shape)
                    assign_ops.append(
                        tf.assign(i, tf.cast(new_var, i.dtype)))
                var_map[i.name[:-2]] = new_var.name[:-2]

        self.sess.run(tpu.initialize_system())
        tf.train.init_from_checkpoint(ckpt_path, var_map)
        self.sess.run(tf.initializers.global_variables())
        self.sess.run(tf.tables_initializer())
        # Run last, after the global variables initializer.
        self.sess.run(assign_ops)

    return self
def testMeanVsSum(self, distribution, optimizer_fn, loss_reduction,
                  use_callable_loss, is_tpu):
    """Checks the weight after one step under sum vs. mean loss reduction."""
    with distribution.scope():
        all_vars = []

        def model_fn(x, y):

            def loss_fn():
                # Use fixed initialization to make the steps deterministic.
                w = variable_scope.get_variable("w", initializer=[[2.]])
                all_vars.append(w)
                predict = math_ops.matmul(x, w)
                return losses_impl.mean_squared_error(
                    y, predict, reduction=loss_reduction)

            optimizer = optimizer_fn()  # GradientDescent with 0.2 learning rate

            # Either hand the optimizer the callable or the evaluated loss.
            loss = loss_fn if use_callable_loss else loss_fn()
            return optimizer.minimize(loss)

        def dataset_fn():
            features = dataset_ops.Dataset.from_tensors([[2.], [7.]])
            labels = dataset_ops.Dataset.from_tensors([[6.], [21.]])
            paired = dataset_ops.Dataset.zip((features, labels))
            return paired.repeat()

        distributed_dataset = distribution.distribute_dataset(dataset_fn)
        iterator = distributed_dataset.make_one_shot_iterator()

        def run_step():
            per_tower_ops = distribution.call_for_each_tower(
                model_fn, *iterator.get_next(), run_concurrently=False)
            return distribution.group(per_tower_ops)

        if not context.executing_eagerly():
            with self.test_session() as sess:
                if is_tpu:
                    sess.run(tpu.initialize_system())
                # Build the step once and rebind `run_step` to its callable.
                run_step = sess.make_callable(run_step())
            self.evaluate(variables_lib.global_variables_initializer())

        run_step()

        v = all_vars[0]
        # Every tower must have been handed the very same variable object.
        self.assertTrue(all(v is other for other in all_vars[1:]))
        weight = numpy.squeeze(self.evaluate(distribution.fetch(v)))
        # Our model is:
        #   predict = x * w
        #   loss = (predict - y)^2
        #   dloss/dpredict = 2*(predict - y)
        #   dloss/dw = 2 * x^T @ (predict - y)
        # For our batch size of 2, assuming sum loss reduction:
        #   x = [2, 7]
        #   y = [6, 21]
        #   w_initial = 2
        #   predict = [4, 14]
        #   predict - y = [-2, -7]
        #   dloss/dw = 2 <[2, 7], [-2, -7]> = - 2(4 + 49) = -106
        # So unreplicated the update to w with lr=0.2 is -0.2 * -106 = 21.2
        # with sum loss reduction, or 10.6 with mean.
        if loss_reduction == losses_impl.Reduction.SUM:
            # Note that the "distribution.num_towers" factor will go away once
            # we split the input across towers, instead of pulling a complete
            # batch of input per tower.
            expected_weight = 2 + 21.2 * distribution.num_towers
        else:
            # One of the mean loss reductions.
            expected_weight = 2 + 10.6
        self.assertNear(weight, expected_weight, 0.0001)

        if is_tpu:
            with self.test_session() as sess:
                sess.run(tpu.shutdown_system())
def _check():
    """Runs a TPU system initialize and then a shutdown in a new session."""
    with session.Session() as tpu_session:
        # Each op is created immediately before it is run, in this order.
        for build_op in (tpu.initialize_system, tpu.shutdown_system):
            tpu_session.run(build_op())
def _check():
    """Initializes and then shuts down the TPU system in a new session."""
    with tf_session.Session() as sess:
        init_op = tpu.initialize_system()
        sess.run(init_op)
        # The shutdown op is only created once initialization has been run.
        shutdown_op = tpu.shutdown_system()
        sess.run(shutdown_op)
def testOptimizerInsideModelFn(self, distribution, optimizer_fn, is_tpu):
    """Checks which variables exist after one step with an in-model optimizer.

    The example model is requested with `create_optimizer_inside_model_fn=True`
    and run for one step; the names of all variables created meanwhile must
    equal the set expected for the optimizer and number of parameter devices.

    Args:
      distribution: Strategy object providing `scope`, `distribute_dataset`,
        `call_for_each_tower`, `group` and `parameter_devices`.
      optimizer_fn: Zero-argument callable returning the optimizer; its
        `get_name()` must be "GradientDescent" or "Adam".
      is_tpu: If true, the TPU system is initialized before the step and
        shut down after it.
    """
    created_names = []
    trainable_names = []

    def recording_creator(next_creator, *args, **kwargs):
        # Delegates creation, then records the name of the new variable.
        variable = next_creator(*args, **kwargs)
        created_names.append(variable.name)
        if kwargs.get("trainable"):
            trainable_names.append(variable.name)
        return variable

    # Creator scope needs to be set before it's used inside
    # `distribution.scope`.
    creator_scope = variable_scope.variable_creator_scope(recording_creator)
    with creator_scope, distribution.scope():
        model_fn, dataset_fn, layer = minimize_loss_example(
            optimizer_fn,
            use_bias=True,
            use_callable_loss=True,
            create_optimizer_inside_model_fn=True)

        distributed_dataset = distribution.distribute_dataset(dataset_fn)
        iterator = distributed_dataset.make_one_shot_iterator()

        def run_step():
            per_tower_ops = distribution.call_for_each_tower(
                model_fn, iterator.get_next(), run_concurrently=layer.built)
            return distribution.group(per_tower_ops)

        # In eager mode the Python function is the step; in graph mode the
        # step is a session callable wrapping the ops built by `run_step`.
        step = run_step
        if not context.executing_eagerly():
            with self.test_session() as sess:
                if is_tpu:
                    sess.run(tpu.initialize_system())
                step = sess.make_callable(run_step())
            self.evaluate(variables_lib.global_variables_initializer())

        step()

        if is_tpu:
            with self.test_session() as sess:
                sess.run(tpu.shutdown_system())

        def expected_variable_names(optimizer_fn, num_parameter_devices):
            names_by_optimizer = {
                "GradientDescent": ["dense/kernel", "dense/bias"],
                "Adam": [
                    "dense/kernel", "dense/bias", "beta1_power",
                    "beta2_power", "dense/kernel/Adam",
                    "dense/kernel/Adam_1", "dense/bias/Adam",
                    "dense/bias/Adam_1"
                ]
            }
            base_names = names_by_optimizer[optimizer_fn().get_name()]
            # One extra copy of each name per parameter device after the
            # first.
            replica_names = [
                name + "/replica_{}".format(replica)
                for name in base_names
                for replica in range(1, num_parameter_devices)
            ]
            return {name + ":0" for name in base_names + replica_names}

        self.assertEqual(
            expected_variable_names(optimizer_fn,
                                    len(distribution.parameter_devices)),
            set(created_names))
def begin(self):
    """Builds the enqueue ops and the TPU init/shutdown ops for this job.

    Stores the result of `self._enqueue_fn()` in `self._enqueue_ops`, logs
    the TPU job name, and stores single-element lists with the TPU system
    initialize and shutdown ops in `self._init_op` and `self._finalize_op`.
    """
    self._enqueue_ops = self._enqueue_fn()

    job_name = self._tpu_job
    logging.info('TPU job name %s', job_name)

    initialize_op = tpu.initialize_system(job=job_name)
    self._init_op = [initialize_op]
    shutdown_op = tpu.shutdown_system(job=job_name)
    self._finalize_op = [shutdown_op]
def testMeanVsSum(self, distribution, optimizer_fn, loss_reduction,
                  use_callable_loss, is_tpu):
    """Compares the one-step weight update for SUM and mean loss reductions.

    Trains a single-weight linear model for one step on a fixed batch and
    asserts the resulting weight against a hand-derived value.

    Args:
      distribution: Strategy object providing `scope`, `distribute_dataset`,
        `call_for_each_tower`, `group`, `fetch` and `num_towers`.
      optimizer_fn: Zero-argument callable returning the optimizer. The
        expected values assume gradient descent with a 0.2 learning rate.
      loss_reduction: Reduction passed to `mean_squared_error`.
      use_callable_loss: If true, the loss function itself is given to
        `minimize`; otherwise the loss it returns is given.
      is_tpu: If true, the TPU system is initialized before the step and
        shut down after the assertions.
    """
    with distribution.scope():
        weights_seen = []

        def model_fn(x, y):

            def loss_fn():
                # Use fixed initialization to make the step deterministic.
                w = variable_scope.get_variable("w", initializer=[[2.]])
                weights_seen.append(w)
                prediction = math_ops.matmul(x, w)
                return losses_impl.mean_squared_error(
                    y, prediction, reduction=loss_reduction)

            # GradientDescent with 0.2 learning rate.
            optimizer = optimizer_fn()
            if use_callable_loss:
                loss = loss_fn
            else:
                loss = loss_fn()
            return optimizer.minimize(loss)

        def dataset_fn():
            feature_ds = dataset_ops.Dataset.from_tensors([[2.], [7.]])
            label_ds = dataset_ops.Dataset.from_tensors([[6.], [21.]])
            return dataset_ops.Dataset.zip((feature_ds, label_ds)).repeat()

        iterator = distribution.distribute_dataset(
            dataset_fn).make_one_shot_iterator()

        def run_step():
            tower_updates = distribution.call_for_each_tower(
                model_fn, *iterator.get_next(), run_concurrently=False)
            return distribution.group(tower_updates)

        # Graph mode swaps the Python function for a session callable built
        # from its ops; eager mode calls the function directly.
        train_once = run_step
        if not context.executing_eagerly():
            with self.test_session() as sess:
                if is_tpu:
                    sess.run(tpu.initialize_system())
                train_once = sess.make_callable(run_step())
            self.evaluate(variables_lib.global_variables_initializer())

        train_once()

        # All recorded variables must be the very same object.
        shared = weights_seen[0]
        self.assertTrue(all(shared is w for w in weights_seen[1:]))
        weight = numpy.squeeze(self.evaluate(distribution.fetch(shared)))

        # Our model is:
        #   predict = x * w
        #   loss = (predict - y)^2
        #   dloss/dpredict = 2*(predict - y)
        #   dloss/dw = 2 * x^T @ (predict - y)
        # For our batch size of 2, assuming sum loss reduction:
        #   x = [2, 7]
        #   y = [6, 21]
        #   w_initial = 2
        #   predict = [4, 14]
        #   predict - y = [-2, -7]
        #   dloss/dw = 2 <[2, 7], [-2, -7]> = - 2(4 + 49) = -106
        # So unreplicated the update to w with lr=0.2 is -0.2 * -106 = 21.2
        # with sum loss reduction, or 10.6 with mean.
        if loss_reduction == losses_impl.Reduction.SUM:
            # Note that the "distribution.num_towers" factor will go away
            # once we split the input across towers, instead of pulling a
            # complete batch of input per tower.
            target = 2 + 21.2 * distribution.num_towers
        else:
            # One of the mean loss reductions.
            target = 2 + 10.6
        self.assertNear(weight, target, 0.0001)

        if is_tpu:
            with self.test_session() as sess:
                sess.run(tpu.shutdown_system())
def testOptimizerInsideModelFn(self, distribution, optimizer_fn, is_tpu):
    """Asserts the variable set created when the optimizer lives in model_fn.

    Builds the example model with `create_optimizer_inside_model_fn=True`,
    runs one step, and compares the names of every variable created in the
    meantime with the names expected for the optimizer in use.

    Args:
      distribution: Strategy object providing `scope`, `distribute_dataset`,
        `call_for_each_tower`, `group` and `parameter_devices`.
      optimizer_fn: Zero-argument callable returning the optimizer; its
        `get_name()` must be "GradientDescent" or "Adam".
      is_tpu: If true, the TPU system is initialized before the step and
        shut down after it.
    """
    all_created = []
    trainable_created = []

    def tracking_creator(next_creator, *args, **kwargs):
        # Lets the next creator build the variable and notes its name.
        new_var = next_creator(*args, **kwargs)
        all_created.append(new_var.name)
        if kwargs.get("trainable"):
            trainable_created.append(new_var.name)
        return new_var

    # Creator scope needs to be set before it's used inside
    # `distribution.scope`.
    with variable_scope.variable_creator_scope(tracking_creator):
        with distribution.scope():
            model_fn, dataset_fn, layer = minimize_loss_example(
                optimizer_fn,
                use_bias=True,
                use_callable_loss=True,
                create_optimizer_inside_model_fn=True)

            iterator = distribution.distribute_dataset(
                dataset_fn).make_one_shot_iterator()

            def run_step():
                tower_updates = distribution.call_for_each_tower(
                    model_fn, iterator.get_next(),
                    run_concurrently=layer.built)
                return distribution.group(tower_updates)

            # Graph mode swaps the Python function for a session callable
            # built from its ops; eager mode calls the function directly.
            train_once = run_step
            if not context.executing_eagerly():
                with self.test_session() as sess:
                    if is_tpu:
                        sess.run(tpu.initialize_system())
                    train_once = sess.make_callable(run_step())
                self.evaluate(variables_lib.global_variables_initializer())

            train_once()

            if is_tpu:
                with self.test_session() as sess:
                    sess.run(tpu.shutdown_system())

            def get_expected_variables(optimizer_fn, num_parameter_devices):
                gradient_descent_names = ["dense/kernel", "dense/bias"]
                adam_names = [
                    "dense/kernel", "dense/bias", "beta1_power",
                    "beta2_power", "dense/kernel/Adam",
                    "dense/kernel/Adam_1", "dense/bias/Adam",
                    "dense/bias/Adam_1"
                ]
                variables_map = {
                    "GradientDescent": gradient_descent_names,
                    "Adam": adam_names,
                }
                primary = variables_map[optimizer_fn().get_name()]
                expected = set()
                for name in primary:
                    expected.add(name + ":0")
                    # Parameter devices after the first hold replica copies.
                    for replica in range(1, num_parameter_devices):
                        expected.add(
                            name + "/replica_{}".format(replica) + ":0")
                return expected

            self.assertEqual(
                get_expected_variables(optimizer_fn,
                                       len(distribution.parameter_devices)),
                set(all_created))
def get_initialization_ops(self):
    """Returns a one-element list holding the `tpu.initialize_system()` result."""
    initialize_op = tpu.initialize_system()
    return [initialize_op]