def finalize(self):
  if context.executing_eagerly():
    # TODO(priyag): Add appropriate call here when eager is supported for TPUs.
    raise NotImplementedError('Eager mode not supported in TPUStrategy.')
  else:
    return [tpu.shutdown_system()]
def _run_tpu_initialization(self):
  """Test TPU system initialization."""
  with tf.Session('grpc://{0}:8470'.format(self.tpu_ip)) as sess:
    sess.run(tpu.initialize_system())
    sess.run(tpu.shutdown_system())
    logging.info('Successfully initialized and shut down the TPU.')
  self.tpu_initialization = 'Passed'
def testTrainNetworkWithBatchNorm(self, distribution, optimizer_fn, momentum,
                                  renorm, is_tpu,
                                  update_ops_in_cross_tower_mode):
  """Verifies that moving mean updates are reduced across towers."""
  with distribution.scope():
    num_towers = len(distribution.worker_devices)
    model_fn, dataset_fn, batchnorm = batchnorm_example(
        optimizer_fn,
        batch_per_epoch=num_towers,
        momentum=momentum,
        renorm=renorm,
        update_ops_in_tower_mode=not update_ops_in_cross_tower_mode)
    # Make sure prefetching is disabled, since it makes the specific input on
    # each device non-deterministic, and this test relies on specific input
    # being on each device.
    if isinstance(distribution, mirrored_strategy.MirroredStrategy):
      self.assertFalse(distribution._prefetch_on_device)
    iterator = distribution.distribute_dataset(
        dataset_fn).make_one_shot_iterator()

    def run_step():
      fetches = distribution.unwrap(
          distribution.call_for_each_tower(
              model_fn, iterator.get_next(),
              run_concurrently=batchnorm.built))
      if update_ops_in_cross_tower_mode:
        fetches += ops.get_collection(ops.GraphKeys.UPDATE_OPS)
      return control_flow_ops.group(fetches)

    if not context.executing_eagerly():
      with self.test_session() as sess:
        if is_tpu:
          sess.run(tpu.initialize_system())
        run_step = sess.make_callable(run_step())
    self.evaluate(variables_lib.global_variables_initializer())

    expected_moving_means = [0.] * 8

    def averaged_batch_mean(i):
      # Each batch has shape [16, 8] where the ith element in the jth list is
      # (8 * j + i + tower_id * 100). So the batch mean in each tower is
      # (60 + i + tower_id * 100), and the batch mean over all towers is:
      return 60. + i + (num_towers - 1.) / 2. * 100.

    for _ in range(10):
      run_step()
      moving_means = self.evaluate(distribution.fetch(batchnorm.moving_mean))

      # We make sure that the moving_mean is updated as if the sample mean is
      # calculated over all towers.
      for i, expected_moving_mean in enumerate(expected_moving_means):
        expected_moving_means[i] -= ((
            expected_moving_mean - averaged_batch_mean(i)) * (1.0 - momentum))
        self.assertNear(expected_moving_means[i], moving_means[i], 0.0001)

    if is_tpu:
      with self.test_session() as sess:
        sess.run(tpu.shutdown_system())
def testTrainNetwork(self, distribution, optimizer_fn, use_callable_loss,
                     is_tpu):
  with distribution.scope():
    model_fn, dataset, layer = minimize_loss_example(
        optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)

    # TODO(isaprykin): Eliminate `is_tpu`. Probably add a
    # `DistributionStrategy.create_monitor` so that each DistributionStrategy
    # could influence its training loop. That method would return an instance
    # of Monitor. TPUMonitor would execute tpu.initialize_system() and
    # tpu.shutdown_system().
    if is_tpu:
      dataset = dataset.batch(2)

    iterator = distribution.distribute_dataset(dataset)

    def run_step():
      # TODO(isaprykin): Make iterator get_next() return a list of sub-
      # batches for each iteration. Pass iterator.get_next() and not iterator
      # to call_for_each_tower.
      return distribution.group(
          distribution.call_for_each_tower(
              model_fn,
              iterator.get_next() if not is_tpu else iterator,
              run_concurrently=layer.built))

    if not context.executing_eagerly():
      with self.test_session() as sess:
        if is_tpu:
          sess.run(tpu.initialize_system())
        run_step = sess.make_callable(run_step())
    self.evaluate(variables_lib.global_variables_initializer())

    weights, biases = [], []
    for _ in range(10):
      run_step()

      weights.append(self.evaluate(distribution.fetch(layer.kernel)))
      biases.append(self.evaluate(distribution.fetch(layer.bias)))

    if is_tpu:
      with self.test_session() as sess:
        sess.run(tpu.shutdown_system())

    error = abs(numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
    is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
    self.assertTrue(is_not_increasing)
def shutdown_tpu_session(session=None):
  """Shuts down the TPU attached to `session`.

  This should be called to cleanly shut down the TPU system before the client
  exits.

  Args:
    session: Session to shut down, or None to use the default session.
  """
  if session is None:
    session = K.get_session()
  session.run(tpu.shutdown_system())
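# Hedged usage sketch (not from the source): how shutdown_tpu_session above
# might be paired with an explicit tpu.initialize_system() call around a TPU
# workload. `K` and `tpu` are assumed to be the same aliases used in
# shutdown_tpu_session; the surrounding workflow is illustrative only.
sess = K.get_session()
sess.run(tpu.initialize_system())
# ... build and run the model against `sess` ...
shutdown_tpu_session(sess)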
def testTrainNetwork(self, distribution, optimizer_fn, use_callable_loss,
                     is_tpu):
  # TODO(priyag): Remove this once the step TPU Strategy is stable.
  if is_tpu:
    self.skipTest("TPU tests are WIP.")

  with distribution.scope():
    model_fn, dataset_fn, layer = minimize_loss_example(
        optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)

    # TODO(isaprykin): Eliminate `is_tpu`. Probably add a
    # `DistributionStrategy.create_monitor` so that each DistributionStrategy
    # could influence its training loop. That method would return an instance
    # of Monitor. TPUMonitor would execute tpu.initialize_system() and
    # tpu.shutdown_system().
    iterator = distribution.distribute_dataset(
        dataset_fn).make_one_shot_iterator()

    def run_step():
      return distribution.group(
          distribution.call_for_each_tower(
              model_fn, iterator.get_next(),
              run_concurrently=layer.built))

    if not context.executing_eagerly():
      with self.test_session() as sess:
        if is_tpu:
          sess.run(tpu.initialize_system())
        run_step = sess.make_callable(run_step())
    self.evaluate(variables_lib.global_variables_initializer())

    weights, biases = [], []
    for _ in range(10):
      run_step()

      weights.append(self.evaluate(layer.kernel))
      biases.append(self.evaluate(layer.bias))

    if is_tpu:
      with self.test_session() as sess:
        sess.run(tpu.shutdown_system())

    error = abs(numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
    is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
    self.assertTrue(is_not_increasing)
def run_on_device(self, model_fn, model_inputs, device):
  """Runs `model_fn` on the given device.

  Raises an exception if no such device is available. `model_fn` should
  return one or more tensors as a list or tuple.

  Args:
    model_fn: Function returning one or more tensors.
    model_inputs: An iterable of Numpy arrays or scalars. These will be
      passed as arguments to `model_fn`.
    device: Device to run on. One of ("tpu", "gpu", "cpu").

  Returns:
    Output from the model function.
  """

  def _make_placeholders():
    return dict(
        [(gen_array_ops.placeholder_with_default(v, v.shape), v)
         for v in model_inputs])

  if device == "tpu":
    with self.test_session(graph=ops.Graph()) as sess:
      placeholders = _make_placeholders()
      tpu_computation = tpu.rewrite(model_fn, placeholders.keys())
      sess.run(tpu.initialize_system())
      sess.run(variables.global_variables_initializer())
      result = sess.run(tpu_computation, placeholders)
      sess.run(tpu.shutdown_system())
      # TODO(b/36891278): support non-flat return lists in tpu.rewrite().
      if len(result) == 1:
        return result[0]
      return result
  elif device == "gpu":
    with self.test_session(graph=ops.Graph(), use_gpu=True) as sess:
      placeholders = _make_placeholders()
      sess.run(variables.global_variables_initializer())
      return sess.run(model_fn(placeholders.keys()), placeholders)
  elif device == "cpu":
    # TODO(power) -- will this interact poorly with cached GPU sessions?
    with self.test_session(graph=ops.Graph(), use_gpu=False) as sess:
      placeholders = _make_placeholders()
      sess.run(variables.global_variables_initializer())
      return sess.run(model_fn(placeholders.keys()), placeholders)
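# Hedged usage sketch (hypothetical call site, not part of the helper above):
# running a trivial model_fn on CPU via run_on_device. On the "cpu"/"gpu"
# paths the placeholders are handed to model_fn as a single iterable, so the
# sketch unpacks them explicitly; `math_ops` and `numpy` are assumed to be
# imported as in the surrounding test utilities.
def _sum_fn(inputs):
  a, b = inputs
  return [math_ops.add(a, b)]

# From inside a test case:
#   result = self.run_on_device(
#       _sum_fn, [numpy.float32(1.), numpy.float32(2.)], device="cpu")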
def _run_tpu_computation(self):
  """Attempts to run a computation graph directly on the TPU."""

  def _computation_fn(alpha, x, y):
    return alpha * x + y

  alpha = tf.Variable(3.0, name='alpha')
  x = tf.Variable(tf.ones([3, 3], tf.float32), name='x')
  y = tf.Variable(tf.ones([3, 3], tf.float32), name='y')
  result = contrib_tpu.rewrite(_computation_fn, [alpha, x, y])

  with tf.Session('grpc://{0}:8470'.format(self.tpu_ip)) as sess:
    sess.run(contrib_tpu.initialize_system())
    sess.run(tf.global_variables_initializer())
    logging.info(sess.run(result))
    sess.run(contrib_tpu.shutdown_system())
    logging.info('Output should be a 3x3 matrix with all 4s.')
  self.tpu_computation = 'Passed'
  logging.info('Successfully ran a computation on the TPU')
def shutdown(self):
  logging.info('Shutting down TPU session.')
  with self.tpu_session() as session:
    session.run(tpu.shutdown_system())
  self._session.close()
def testMeanVsSum(self, distribution, optimizer_fn, loss_reduction,
                  use_callable_loss, is_tpu):
  with distribution.scope():
    all_vars = []

    def model_fn(x, y):

      def loss_fn():
        # Use fixed initialization to make the steps deterministic.
        w = variable_scope.get_variable("w", initializer=[[2.]])
        all_vars.append(w)
        predict = math_ops.matmul(x, w)
        return losses_impl.mean_squared_error(
            y, predict, reduction=loss_reduction)

      optimizer = optimizer_fn()  # GradientDescent with 0.2 learning rate
      if use_callable_loss:
        return optimizer.minimize(loss_fn)
      else:
        return optimizer.minimize(loss_fn())

    def dataset_fn():
      features = dataset_ops.Dataset.from_tensors([[2.], [7.]])
      labels = dataset_ops.Dataset.from_tensors([[6.], [21.]])
      return dataset_ops.Dataset.zip((features, labels)).repeat()

    iterator = distribution.distribute_dataset(
        dataset_fn).make_one_shot_iterator()

    def run_step():
      return distribution.group(
          distribution.call_for_each_tower(
              model_fn, *iterator.get_next(), run_concurrently=False))

    if not context.executing_eagerly():
      with self.test_session() as sess:
        if is_tpu:
          sess.run(tpu.initialize_system())
        run_step = sess.make_callable(run_step())
    self.evaluate(variables_lib.global_variables_initializer())

    run_step()

    v = all_vars[0]
    self.assertTrue(all([v is vi for vi in all_vars[1:]]))
    weight = numpy.squeeze(self.evaluate(distribution.fetch(v)))
    # Our model is:
    #   predict = x * w
    #   loss = (predict - y)^2
    #   dloss/dpredict = 2*(predict - y)
    #   dloss/dw = 2 * x^T @ (predict - y)
    # For our batch size of 2, assuming sum loss reduction:
    #   x = [2, 7]
    #   y = [6, 21]
    #   w_initial = 2
    #   predict = [4, 14]
    #   predict - y = [-2, -7]
    #   dloss/dw = 2 <[2, 7], [-2, -7]> = -2(4 + 49) = -106
    # So, unreplicated, the update to w with lr=0.2 is -0.2 * -106 = 21.2
    # with sum loss reduction, or 10.6 with mean.
    if loss_reduction == losses_impl.Reduction.SUM:
      # Note that the "distribution.num_towers" factor will go away once
      # we split the input across towers, instead of pulling a complete
      # batch of input per tower.
      self.assertNear(weight, 2 + 21.2 * distribution.num_towers, 0.0001)
    else:
      # One of the mean loss reductions.
      self.assertNear(weight, 2 + 10.6, 0.0001)

    if is_tpu:
      with self.test_session() as sess:
        sess.run(tpu.shutdown_system())
def _check():
  with session.Session() as sess:
    sess.run(tpu.initialize_system())
    sess.run(tpu.shutdown_system())
def testOptimizerInsideModelFn(self, distribution, optimizer_fn, is_tpu):
  created_variables = []
  trainable_variables = []

  def appending_creator(next_creator, *args, **kwargs):
    v = next_creator(*args, **kwargs)
    created_variables.append(v.name)
    if "trainable" in kwargs and kwargs["trainable"]:
      trainable_variables.append(v.name)
    return v

  # Creator scope needs to be set before it's used inside
  # `distribution.scope`.
  with variable_scope.variable_creator_scope(
      appending_creator), distribution.scope():
    model_fn, dataset_fn, layer = minimize_loss_example(
        optimizer_fn,
        use_bias=True,
        use_callable_loss=True,
        create_optimizer_inside_model_fn=True)

    iterator = distribution.distribute_dataset(
        dataset_fn).make_one_shot_iterator()

    def run_step():
      return distribution.group(
          distribution.call_for_each_tower(
              model_fn, iterator.get_next(),
              run_concurrently=layer.built))

    if not context.executing_eagerly():
      with self.test_session() as sess:
        if is_tpu:
          sess.run(tpu.initialize_system())
        run_step = sess.make_callable(run_step())
    self.evaluate(variables_lib.global_variables_initializer())

    run_step()

    if is_tpu:
      with self.test_session() as sess:
        sess.run(tpu.shutdown_system())

    def get_expected_variables(optimizer_fn, num_parameter_devices):
      variables_map = {
          "GradientDescent": ["dense/kernel", "dense/bias"],
          "Adam": [
              "dense/kernel", "dense/bias", "beta1_power", "beta2_power",
              "dense/kernel/Adam", "dense/kernel/Adam_1", "dense/bias/Adam",
              "dense/bias/Adam_1"
          ]
      }
      variables = variables_map[optimizer_fn().get_name()]
      variables.extend([
          v + "/replica_{}".format(replica)
          for v in variables
          for replica in range(1, num_parameter_devices)
      ])
      return set([v + ":0" for v in variables])

    self.assertEqual(
        get_expected_variables(optimizer_fn,
                               len(distribution.parameter_devices)),
        set(created_variables))
def _check():
  with tf_session.Session() as sess:
    sess.run(tpu.initialize_system())
    sess.run(tpu.shutdown_system())
def get_finalize_ops(self):
  return [tpu.shutdown_system()]
def begin(self):
  self._enqueue_ops = self._enqueue_fn()
  logging.info('TPU job name %s', self._tpu_job)
  self._init_op = [tpu.initialize_system(job=self._tpu_job)]
  self._finalize_op = [tpu.shutdown_system(job=self._tpu_job)]
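# A minimal sketch, assuming the standard tf.train.SessionRunHook lifecycle,
# of how the ops built in begin() above might be executed; these method
# bodies are illustrative and are not taken from the source.
def after_create_session(self, session, coord):
  del coord
  # Run TPU system initialization once the session is available.
  session.run(self._init_op)

def end(self, session):
  # Cleanly shut down the TPU system when training ends.
  session.run(self._finalize_op)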