def testTrainReplicated(self):
  """Trains across 4 replicas and checks the per-loop averaged loss.

  The model's "loss" is a cross-replica sum of the (increasing) input
  features, so the values logged to the events file pin down both the
  replication factor and the averaging over `iterations_per_loop`.
  """
  if ipu_utils.running_on_ipu_model():
    self.skipTest("Replicated top level graphs are not supported on the "
                  "IPU_MODEL target")

  def my_model_fn(features, labels, mode):  # pylint: disable=unused-argument
    self.assertEqual(model_fn_lib.ModeKeys.TRAIN, mode)
    # "Loss" is just the feature summed across all replicas; the identity
    # train_op makes the graph a valid (if trivial) training graph.
    loss = ipu.ops.cross_replica_ops.cross_replica_sum(features, name="loss")
    train_op = array_ops.identity(loss)
    return model_fn_lib.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

  def my_input_fn():
    dataset = tu.create_dual_increasing_dataset(10,
                                                data_shape=[1],
                                                label_shape=[1])
    dataset = dataset.batch(batch_size=1, drop_remainder=True)
    return dataset

  ipu_options = ipu_utils.create_ipu_config()
  ipu_options = ipu_utils.auto_select_ipus(ipu_options, 4)
  config = ipu_run_config.RunConfig(
      ipu_run_config=ipu_run_config.IPURunConfig(iterations_per_loop=2,
                                                 num_replicas=4,
                                                 ipu_options=ipu_options),
      log_step_count_steps=1,
      save_summary_steps=1)

  estimator = ipu_estimator.IPUEstimator(model_fn=my_model_fn, config=config)

  session_run_counter = _SessionRunCounter()
  num_steps = 6
  estimator.train(input_fn=my_input_fn,
                  steps=num_steps,
                  hooks=[session_run_counter])

  # One session run is expected per loop of `iterations_per_loop` steps.
  self.assertEqual(session_run_counter.num_session_runs,
                   num_steps // config.ipu_run_config.iterations_per_loop)

  model_dir = estimator.model_dir
  events_file = glob.glob(model_dir + "/*tfevents*")
  # Fix: use a unittest assertion rather than a bare `assert`, which is
  # silently stripped when Python runs with optimizations (-O) and gives
  # no useful failure message.
  self.assertEqual(len(events_file), 1)
  events_file = events_file[0]
  loss_output = []
  for e in summary_iterator.summary_iterator(events_file):
    for v in e.summary.value:
      if "loss" in v.tag:
        loss_output.append(v.simple_value)

  # loss is averaged across iterations per loop
  self.assertEqual(loss_output, [14.0, 16.0, 18.0])
def testNumUniqueDevicesBelowNumShardsRange(self):
  """A device_mapping using fewer devices than `num_shards` must be rejected.

  The mapping [0, 1, 0] touches only 2 unique devices while the run config
  reserves 4 shards, so training is expected to raise a ValueError.
  """
  def model_fn_with_zero_stages(mode):
    def optimizer_function():
      pass

    return IPUPipelineEstimatorSpec(mode,
                                    computational_stages=[],
                                    gradient_accumulation_count=1,
                                    device_mapping=[0, 1, 0],
                                    optimizer_function=optimizer_function)

  def my_input_fn():
    return dataset_ops.Dataset.from_tensor_slices(([0], [0]))

  ipu_options = ipu_utils.create_ipu_config()
  ipu_options = ipu_utils.auto_select_ipus(ipu_options, num_ipus=4)
  run_config = ipu_run_config.RunConfig(
      ipu_run_config=ipu_run_config.IPURunConfig(num_shards=4,
                                                 iterations_per_loop=1,
                                                 ipu_options=ipu_options))

  estimator = IPUPipelineEstimator(model_fn=model_fn_with_zero_stages,
                                   config=run_config)

  expected_message = (r"This pipeline requires 2 devices, but "
                      "`IPURunConfig.num_shards` was set to 4")
  with self.assertRaisesRegex(ValueError, expected_message):
    estimator.train(input_fn=my_input_fn, steps=1)
def testTrainWithAutomaticSharding(self):
  """Trains a small regression model autosharded across 4 IPUs.

  Verifies only that training makes progress: the last logged loss must be
  lower than the first.
  """
  if ipu_utils.running_on_ipu_model():
    self.skipTest("Replicated top level graphs are not supported on the "
                  "IPU_MODEL target")

  def my_model_fn(features, labels, mode):
    self.assertEqual(model_fn_lib.ModeKeys.TRAIN, mode)

    with variable_scope.variable_scope("vs", use_resource=True):
      predictions = layers.Dense(units=1)(features)

    loss = losses.mean_squared_error(labels=labels, predictions=predictions)
    # Wrap the optimizer so the backward pass respects the sharding.
    sharded_optimizer_obj = sharded_optimizer.ShardedOptimizer(
        gradient_descent.GradientDescentOptimizer(0.1))
    train_op = sharded_optimizer_obj.minimize(loss)
    return model_fn_lib.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

  def my_input_fn():
    dataset = dataset_ops.Dataset.from_tensor_slices(
        _create_regression_dataset(num_samples=1000, num_features=5))
    dataset = dataset.batch(batch_size=2, drop_remainder=True).repeat()
    return dataset

  ipu_options = ipu_utils.create_ipu_config()
  ipu_options = ipu_utils.auto_select_ipus(ipu_options, 4)
  config = ipu_run_config.RunConfig(
      ipu_run_config=ipu_run_config.IPURunConfig(iterations_per_loop=2,
                                                 num_shards=4,
                                                 autosharding=True,
                                                 ipu_options=ipu_options),
      log_step_count_steps=1,
      save_summary_steps=1)

  estimator = ipu_estimator.IPUEstimator(model_fn=my_model_fn, config=config)
  estimator.train(input_fn=my_input_fn, steps=10)

  model_dir = estimator.model_dir
  events_file = glob.glob(model_dir + "/*tfevents*")
  # Fix: use a unittest assertion rather than a bare `assert`, which is
  # silently stripped when Python runs with optimizations (-O).
  self.assertEqual(len(events_file), 1)
  events_file = events_file[0]
  loss_output = []
  for e in summary_iterator.summary_iterator(events_file):
    for v in e.summary.value:
      if "loss" in v.tag:
        loss_output.append(v.simple_value)

  # Fix: assertLess gives a diagnostic message with both operand values
  # on failure, unlike assertTrue(a > b).
  self.assertLess(loss_output[-1], loss_output[0])
def testReplicatedEvaluationOnHost(self):
  """Evaluates host-side metrics over 4 replicas via `eval_metrics`.

  Features/labels are fixed so every metric has a hand-computable value,
  and the loss is the replication index so its mean across 4 replicas is
  (0 + 1 + 2 + 3) / 4 = 1.5.
  """
  if ipu_utils.running_on_ipu_model():
    self.skipTest("Replicated top level graphs are not supported on the "
                  "IPU_MODEL target")

  def my_input_fn():
    features = [0, 0, 0, 1, 0, 0, 0, 1]
    labels = [0, 1, 0, 1, 0, 1, 0, 1]
    dataset = dataset_ops.Dataset.from_tensor_slices((features, labels))
    return dataset.batch(2, drop_remainder=True)

  def my_metrics_fn(features, labels):
    # recall_at_k requires int64 labels.
    labels64 = math_ops.cast(labels, np.int64)
    return {
        "accuracy": metrics_impl.accuracy(labels, features),
        "precision": metrics_impl.precision(labels, features),
        "recall": metrics_impl.recall(labels, features),
        "recall_at_1": metrics_impl.recall_at_k(labels64, features, k=1),
        "recall_at_2": metrics_impl.recall_at_k(labels64, features, k=2),
        "mse": metrics_impl.mean_squared_error(labels, features),
        "rmse": metrics_impl.root_mean_squared_error(labels, features),
    }

  def my_model_fn(features, labels, mode):
    # Loss equals the replica's index, making the averaged loss predictable.
    loss = math_ops.cast(replication_ops.replication_index(), np.float32)
    eval_metrics = (my_metrics_fn, [features, labels])
    return ipu_estimator.IPUEstimatorSpec(mode,
                                          loss=loss,
                                          eval_metrics=eval_metrics)

  ipu_options = ipu_utils.create_ipu_config()
  ipu_options = ipu_utils.auto_select_ipus(ipu_options, num_ipus=4)
  config = ipu_run_config.RunConfig(
      ipu_run_config=ipu_run_config.IPURunConfig(iterations_per_loop=1,
                                                 num_replicas=4,
                                                 ipu_options=ipu_options))

  estimator = ipu_estimator.IPUEstimator(model_fn=my_model_fn, config=config)
  scores = estimator.evaluate(my_input_fn, steps=1)

  self.assertEqual(0.75, scores["accuracy"])
  self.assertEqual(1.0, scores["precision"])
  self.assertEqual(0.5, scores["recall"])
  self.assertEqual(0.5, scores["recall_at_1"])
  self.assertEqual(1.0, scores["recall_at_2"])
  self.assertEqual(0.25, scores["mse"])
  self.assertEqual(0.5, scores["rmse"])
  self.assertEqual(1.5, scores[model_fn_lib.LOSS_METRIC_KEY])
def _make_config(iterations_per_loop=1):
  """Builds a RunConfig for a two-IPU pipeline on the IPU model.

  Args:
    iterations_per_loop: number of on-device iterations per session run.

  Returns:
    An `ipu_run_config.RunConfig` with `num_shards` matching the pipeline.
  """
  num_ipus_in_pipeline = 2

  opts = ipu_utils.create_ipu_config()
  # Compile real IPU code on the simulator so the pipeline is exercised.
  opts = ipu_utils.set_ipu_model_options(opts,
                                         compile_ipu_code=True,
                                         tiles_per_ipu=128)
  opts = ipu_utils.auto_select_ipus(opts, num_ipus=num_ipus_in_pipeline)

  inner = ipu_run_config.IPURunConfig(num_shards=num_ipus_in_pipeline,
                                      iterations_per_loop=iterations_per_loop,
                                      ipu_options=opts)
  return ipu_run_config.RunConfig(ipu_run_config=inner)
def testReplicatedPrediction(self):
  """Predicts with 4 replicas; each replica sees one batch of two features.

  Each replica outputs the max of its batch, so the per-replica predictions
  are 3, 5, 7, 9. Also exercises the IPU logging hook across replicas.
  """
  if ipu_utils.running_on_ipu_model():
    self.skipTest("Replicated top level graphs are not supported on the "
                  "IPU_MODEL target")

  def my_input_fn():
    features = [
        [1.0],  # IPU0
        [3.0],  # IPU0
        [5.0],  # IPU1
        [3.0],  # IPU1
        [7.0],  # IPU2
        [3.0],  # IPU2
        [9.0],  # IPU3
        [3.0],  # IPU3
    ]
    dataset = dataset_ops.Dataset.from_tensor_slices(features)
    return dataset.batch(batch_size=2, drop_remainder=True)

  hook = ipu_session_run_hooks.IPULoggingTensorHook(every_n_iter=1,
                                                    replication_factor=4)

  def my_model_fn(features, mode):
    # Tie the prediction to the logging op so the hook always fires.
    logging_op = hook.log({"features": features})
    with ops.control_dependencies([logging_op]):
      predictions = math_ops.reduce_max(features)
    return model_fn_lib.EstimatorSpec(
        mode,
        predictions=predictions,
    )

  ipu_options = ipu_utils.create_ipu_config()
  ipu_options = ipu_utils.auto_select_ipus(ipu_options, num_ipus=4)
  config = ipu_run_config.RunConfig(
      ipu_run_config=ipu_run_config.IPURunConfig(iterations_per_loop=1,
                                                 num_replicas=4,
                                                 ipu_options=ipu_options))

  estimator = ipu_estimator.IPUEstimator(model_fn=my_model_fn, config=config)

  # Single-example mode yields one replica output at a time.
  outputs = estimator.predict(input_fn=my_input_fn, yield_single_examples=True)
  self.assertEqual(3.0, next(outputs))
  self.assertEqual(5.0, next(outputs))

  # Batched mode yields all replicas' outputs at once.
  outputs = estimator.predict(input_fn=my_input_fn,
                              yield_single_examples=False,
                              hooks=[hook])
  np.testing.assert_array_equal([3.0, 5.0, 7.0, 9.0], next(outputs))
def testReplicatedEvaluation(self):
  """Evaluates `eval_metric_ops` aggregated over 4 replicas.

  The mean feature over all 8 samples is 3, and the per-replica max losses
  (3, 5, 3, 5) average to 4.
  """
  if ipu_utils.running_on_ipu_model():
    self.skipTest("Replicated top level graphs are not supported on the "
                  "IPU_MODEL target")

  def my_input_fn():
    # IPU0 mean: 2, max: 3
    # IPU1 mean: 4, max: 5
    features = [
        [1.0],  # IPU0
        [3.0],  # IPU0
        [5.0],  # IPU1
        [3.0],  # IPU1
        [1.0],  # IPU2
        [3.0],  # IPU2
        [5.0],  # IPU3
        [3.0],  # IPU3
    ]
    dataset = dataset_ops.Dataset.from_tensor_slices(features)
    return dataset.batch(batch_size=2, drop_remainder=True)

  def my_model_fn(features, mode):
    loss = math_ops.reduce_max(features)
    eval_metric_ops = {
        "feature_mean": metrics_impl.mean(features),
    }
    return model_fn_lib.EstimatorSpec(mode,
                                      loss=loss,
                                      eval_metric_ops=eval_metric_ops)

  ipu_options = ipu_utils.create_ipu_config()
  ipu_options = ipu_utils.auto_select_ipus(ipu_options, num_ipus=4)
  config = ipu_run_config.RunConfig(
      ipu_run_config=ipu_run_config.IPURunConfig(iterations_per_loop=1,
                                                 num_replicas=4,
                                                 ipu_options=ipu_options))

  estimator = ipu_estimator.IPUEstimator(model_fn=my_model_fn, config=config)
  scores = estimator.evaluate(my_input_fn, steps=1)

  self.assertEqual(3., scores["feature_mean"])
  self.assertEqual(4., scores[model_fn_lib.LOSS_METRIC_KEY])
def testReplicatedTrainingWithoutCrossReplicaSumShouldThrow(self):
  """A replicated training graph without a cross-replica sum must fail.

  The model computes a loss that is never reduced across replicas, so the
  estimator is expected to reject the graph with a ValueError.
  """
  def my_input_fn():
    return dataset_ops.Dataset.from_tensor_slices([])

  def my_model_fn(features, mode):
    # No cross_replica_sum anywhere: this makes the graph invalid for
    # replicated training.
    loss = math_ops.reduce_sum(features)
    train_op = array_ops.identity(loss)
    return model_fn_lib.EstimatorSpec(mode, loss=loss, train_op=train_op)

  ipu_options = ipu_utils.create_ipu_config()
  ipu_options = ipu_utils.auto_select_ipus(ipu_options, num_ipus=4)
  run_config = ipu_run_config.RunConfig(
      ipu_run_config=ipu_run_config.IPURunConfig(iterations_per_loop=1,
                                                 num_replicas=4,
                                                 ipu_options=ipu_options))

  estimator = ipu_estimator.IPUEstimator(model_fn=my_model_fn,
                                         config=run_config)

  with self.assertRaisesRegex(ValueError,
                              "This is not a valid replicated training graph"):
    estimator.train(input_fn=my_input_fn, steps=1)