Exemple #1
0
    def testTrainReplicated(self):
        """Train with four replicas and check the logged loss summaries.

        The model's loss is the cross-replica sum of the features, and the
        train op is just an identity of that loss. After training for six
        steps with two iterations per loop, the test checks that

        * the number of session runs equals ``steps // iterations_per_loop``,
        * exactly one events file was written to the model directory, and
        * the "loss" summaries in that file are ``[14.0, 16.0, 18.0]``.

        Skipped on the IPU_MODEL target.
        """
        if ipu_utils.running_on_ipu_model():
            self.skipTest(
                "Replicated top level graphs are not supported on the "
                "IPU_MODEL target")

        def my_model_fn(features, labels, mode):  # pylint: disable=unused-argument
            self.assertEqual(model_fn_lib.ModeKeys.TRAIN, mode)

            loss = ipu.ops.cross_replica_ops.cross_replica_sum(features,
                                                               name="loss")

            # There is nothing to optimise; the identity only gives the
            # EstimatorSpec a train_op that depends on the loss.
            train_op = array_ops.identity(loss)

            return model_fn_lib.EstimatorSpec(mode=mode,
                                              loss=loss,
                                              train_op=train_op)

        def my_input_fn():
            dataset = tu.create_dual_increasing_dataset(10,
                                                        data_shape=[1],
                                                        label_shape=[1])
            dataset = dataset.batch(batch_size=1, drop_remainder=True)
            return dataset

        ipu_options = ipu_utils.create_ipu_config()
        ipu_options = ipu_utils.auto_select_ipus(ipu_options, 4)
        config = ipu_run_config.RunConfig(
            ipu_run_config=ipu_run_config.IPURunConfig(
                iterations_per_loop=2, num_replicas=4,
                ipu_options=ipu_options),
            log_step_count_steps=1,
            save_summary_steps=1)

        estimator = ipu_estimator.IPUEstimator(model_fn=my_model_fn,
                                               config=config)

        session_run_counter = _SessionRunCounter()

        num_steps = 6
        estimator.train(input_fn=my_input_fn,
                        steps=num_steps,
                        hooks=[session_run_counter])

        # Each session run executes `iterations_per_loop` training steps.
        self.assertEqual(
            session_run_counter.num_session_runs,
            num_steps // config.ipu_run_config.iterations_per_loop)

        model_dir = estimator.model_dir
        events_files = glob.glob(model_dir + "/*tfevents*")
        # Use a unittest assertion instead of a bare `assert`: it is not
        # stripped under `python -O` and it reports the actual count.
        self.assertEqual(len(events_files), 1)

        loss_output = []
        for event in summary_iterator.summary_iterator(events_files[0]):
            for value in event.summary.value:
                if "loss" in value.tag:
                    loss_output.append(value.simple_value)

        # loss is averaged across iterations per loop
        self.assertEqual(loss_output, [14.0, 16.0, 18.0])
Exemple #2
0
    def testTrainWithAutomaticSharding(self):
        """Train a one-unit dense regression model with autosharding enabled.

        The model is a single ``Dense(units=1)`` layer trained with a
        ``ShardedOptimizer`` wrapping gradient descent (learning rate 0.1) on
        a mean-squared-error loss. The run config requests four shards with
        ``autosharding=True``. After ten training steps the test reads the
        "loss" summaries from the single events file and checks that the
        first recorded loss is greater than the last one.

        Skipped on the IPU_MODEL target.
        NOTE(review): the skip message talks about replicated graphs although
        this test configures sharding rather than replication; the message is
        kept unchanged - confirm whether it is intentional.
        """
        if ipu_utils.running_on_ipu_model():
            self.skipTest(
                "Replicated top level graphs are not supported on the "
                "IPU_MODEL target")

        def my_model_fn(features, labels, mode):
            self.assertEqual(model_fn_lib.ModeKeys.TRAIN, mode)

            with variable_scope.variable_scope("vs", use_resource=True):
                predictions = layers.Dense(units=1)(features)

            loss = losses.mean_squared_error(labels=labels,
                                             predictions=predictions)
            sharded_optimizer_obj = sharded_optimizer.ShardedOptimizer(
                gradient_descent.GradientDescentOptimizer(0.1))
            train_op = sharded_optimizer_obj.minimize(loss)

            return model_fn_lib.EstimatorSpec(mode=mode,
                                              loss=loss,
                                              train_op=train_op)

        def my_input_fn():
            dataset = dataset_ops.Dataset.from_tensor_slices(
                _create_regression_dataset(num_samples=1000, num_features=5))
            dataset = dataset.batch(batch_size=2, drop_remainder=True).repeat()
            return dataset

        ipu_options = ipu_utils.create_ipu_config()
        ipu_options = ipu_utils.auto_select_ipus(ipu_options, 4)

        config = ipu_run_config.RunConfig(
            ipu_run_config=ipu_run_config.IPURunConfig(
                iterations_per_loop=2,
                num_shards=4,
                autosharding=True,
                ipu_options=ipu_options),
            log_step_count_steps=1,
            save_summary_steps=1)

        estimator = ipu_estimator.IPUEstimator(model_fn=my_model_fn,
                                               config=config)

        estimator.train(input_fn=my_input_fn, steps=10)

        model_dir = estimator.model_dir
        events_files = glob.glob(model_dir + "/*tfevents*")
        # Use a unittest assertion instead of a bare `assert`: it is not
        # stripped under `python -O` and it reports the actual count.
        self.assertEqual(len(events_files), 1)

        loss_output = []
        for event in summary_iterator.summary_iterator(events_files[0]):
            for value in event.summary.value:
                if "loss" in value.tag:
                    loss_output.append(value.simple_value)

        # Training should have reduced the loss; assertGreater prints both
        # values on failure, unlike assertTrue(a > b).
        self.assertGreater(loss_output[0], loss_output[-1])
Exemple #3
0
    def testReplicatedEvaluationOnHost(self):
        """Evaluate with four replicas using an ``eval_metrics`` callable.

        The model function returns an ``IPUEstimatorSpec`` whose
        ``eval_metrics`` is a ``(metrics_fn, [features, labels])`` pair, and
        whose loss is the replication index cast to float32. The features are
        used directly as the predictions when computing the metrics.

        With predictions ``[0, 0, 0, 1, 0, 0, 0, 1]`` and labels
        ``[0, 1, 0, 1, 0, 1, 0, 1]``, six of eight entries agree (accuracy
        0.75), both positive predictions are correct (precision 1.0) and two
        of the four positive labels are found (recall 0.5). The expected loss
        of 1.5 equals the mean of the replica indices 0..3.

        Skipped on the IPU_MODEL target.
        """
        if ipu_utils.running_on_ipu_model():
            self.skipTest(
                "Replicated top level graphs are not supported on the "
                "IPU_MODEL target")

        def my_input_fn():
            features = [0, 0, 0, 1, 0, 0, 0, 1]
            labels = [0, 1, 0, 1, 0, 1, 0, 1]
            pairs = dataset_ops.Dataset.from_tensor_slices((features, labels))
            return pairs.batch(2, drop_remainder=True)

        def my_metrics_fn(features, labels):
            # recall_at_k is fed int64 labels; the other metrics use the
            # labels as they arrive.
            labels64 = math_ops.cast(labels, np.int64)
            metric_ops = {}
            metric_ops["accuracy"] = metrics_impl.accuracy(labels, features)
            metric_ops["precision"] = metrics_impl.precision(labels, features)
            metric_ops["recall"] = metrics_impl.recall(labels, features)
            metric_ops["recall_at_1"] = metrics_impl.recall_at_k(
                labels64, features, k=1)
            metric_ops["recall_at_2"] = metrics_impl.recall_at_k(
                labels64, features, k=2)
            metric_ops["mse"] = metrics_impl.mean_squared_error(
                labels, features)
            metric_ops["rmse"] = metrics_impl.root_mean_squared_error(
                labels, features)
            return metric_ops

        def my_model_fn(features, labels, mode):
            replica_index = replication_ops.replication_index()
            loss = math_ops.cast(replica_index, np.float32)
            return ipu_estimator.IPUEstimatorSpec(
                mode,
                loss=loss,
                eval_metrics=(my_metrics_fn, [features, labels]))

        ipu_options = ipu_utils.auto_select_ipus(
            ipu_utils.create_ipu_config(), num_ipus=4)
        replicated_config = ipu_run_config.IPURunConfig(
            iterations_per_loop=1, num_replicas=4, ipu_options=ipu_options)
        config = ipu_run_config.RunConfig(ipu_run_config=replicated_config)

        estimator = ipu_estimator.IPUEstimator(model_fn=my_model_fn,
                                               config=config)
        scores = estimator.evaluate(my_input_fn, steps=1)

        # Checked in this order; the first mismatch fails the test.
        expected_scores = (
            ("accuracy", 0.75),
            ("precision", 1.0),
            ("recall", 0.5),
            ("recall_at_1", 0.5),
            ("recall_at_2", 1.0),
            ("mse", 0.25),
            ("rmse", 0.5),
            (model_fn_lib.LOSS_METRIC_KEY, 1.5),
        )
        for metric_name, expected_value in expected_scores:
            self.assertEqual(expected_value, scores[metric_name])
Exemple #4
0
    def testReplicatedPrediction(self):
        """Predict with four replicas, each reducing its own batch to a max.

        The input holds eight single-element rows batched in pairs. The
        model logs the features through an ``IPULoggingTensorHook`` and
        predicts ``reduce_max(features)``. Prediction is run twice:

        * with ``yield_single_examples=True`` the first two outputs are 3.0
          and 5.0;
        * with ``yield_single_examples=False`` and the logging hook attached,
          the first output is the array ``[3.0, 5.0, 7.0, 9.0]``.

        Skipped on the IPU_MODEL target.
        """
        if ipu_utils.running_on_ipu_model():
            self.skipTest(
                "Replicated top level graphs are not supported on the "
                "IPU_MODEL target")

        def my_input_fn():
            # Two rows per replica; the maxima are 3, 5, 7 and 9.
            features = [
                [1.0], [3.0],  # IPU0
                [5.0], [3.0],  # IPU1
                [7.0], [3.0],  # IPU2
                [9.0], [3.0],  # IPU3
            ]
            rows = dataset_ops.Dataset.from_tensor_slices(features)
            return rows.batch(batch_size=2, drop_remainder=True)

        hook = ipu_session_run_hooks.IPULoggingTensorHook(every_n_iter=1,
                                                          replication_factor=4)

        def my_model_fn(features, mode):
            log_features = hook.log({"features": features})
            # Tie the logging op to the prediction so it runs with it.
            with ops.control_dependencies([log_features]):
                batch_max = math_ops.reduce_max(features)

            return model_fn_lib.EstimatorSpec(mode, predictions=batch_max)

        ipu_options = ipu_utils.auto_select_ipus(
            ipu_utils.create_ipu_config(), num_ipus=4)
        replicated_config = ipu_run_config.IPURunConfig(
            iterations_per_loop=1, num_replicas=4, ipu_options=ipu_options)
        config = ipu_run_config.RunConfig(ipu_run_config=replicated_config)
        estimator = ipu_estimator.IPUEstimator(model_fn=my_model_fn,
                                               config=config)

        # First pass: one example at a time, without the hook attached.
        single_examples = estimator.predict(input_fn=my_input_fn,
                                            yield_single_examples=True)
        for expected_max in (3.0, 5.0):
            self.assertEqual(expected_max, next(single_examples))

        # Second pass: whole outputs, with the logging hook attached.
        whole_outputs = estimator.predict(input_fn=my_input_fn,
                                          yield_single_examples=False,
                                          hooks=[hook])
        first_output = next(whole_outputs)
        np.testing.assert_array_equal([3.0, 5.0, 7.0, 9.0], first_output)
Exemple #5
0
    def testReplicatedEvaluation(self):
        """Evaluate with four replicas using ``eval_metric_ops``.

        The loss is ``reduce_max(features)`` and the only metric is the mean
        of the features. The test expects ``feature_mean`` to be 3.0, the
        mean of all eight input values, and the reported loss to be 4.0, the
        mean of the per-pair maxima 3, 5, 3 and 5.

        Skipped on the IPU_MODEL target.
        """
        if ipu_utils.running_on_ipu_model():
            self.skipTest(
                "Replicated top level graphs are not supported on the "
                "IPU_MODEL target")

        def my_input_fn():
            # IPU0 and IPU2 mean: 2, max: 3
            # IPU1 and IPU3 mean: 4, max: 5
            features = [
                [1.0], [3.0],  # IPU0
                [5.0], [3.0],  # IPU1
                [1.0], [3.0],  # IPU2
                [5.0], [3.0],  # IPU3
            ]
            rows = dataset_ops.Dataset.from_tensor_slices(features)
            return rows.batch(batch_size=2, drop_remainder=True)

        def my_model_fn(features, mode):
            loss = math_ops.reduce_max(features)
            feature_mean = metrics_impl.mean(features)
            return model_fn_lib.EstimatorSpec(
                mode,
                loss=loss,
                eval_metric_ops={"feature_mean": feature_mean})

        ipu_options = ipu_utils.auto_select_ipus(
            ipu_utils.create_ipu_config(), num_ipus=4)
        replicated_config = ipu_run_config.IPURunConfig(
            iterations_per_loop=1, num_replicas=4, ipu_options=ipu_options)
        config = ipu_run_config.RunConfig(ipu_run_config=replicated_config)

        estimator = ipu_estimator.IPUEstimator(model_fn=my_model_fn,
                                               config=config)
        scores = estimator.evaluate(my_input_fn, steps=1)

        expected_scores = (
            ("feature_mean", 3.),
            (model_fn_lib.LOSS_METRIC_KEY, 4.),
        )
        for metric_name, expected_value in expected_scores:
            self.assertEqual(expected_value, scores[metric_name])
Exemple #6
0
    def testReplicatedTrainingWithoutCrossReplicaSumShouldThrow(self):
        """Replicated training must reject a graph with no cross-replica sum.

        The model's loss is a plain ``reduce_sum`` of the features and its
        train op is an identity of that loss, so no cross-replica op is ever
        created. Calling ``train`` with ``num_replicas=4`` is expected to
        raise ``ValueError`` matching "This is not a valid replicated
        training graph".
        """

        def my_input_fn():
            no_samples = []
            return dataset_ops.Dataset.from_tensor_slices(no_samples)

        def my_model_fn(features, mode):
            # Deliberately a local reduction only.
            loss = math_ops.reduce_sum(features)
            return model_fn_lib.EstimatorSpec(
                mode,
                loss=loss,
                train_op=array_ops.identity(loss))

        ipu_options = ipu_utils.auto_select_ipus(
            ipu_utils.create_ipu_config(), num_ipus=4)
        replicated_config = ipu_run_config.IPURunConfig(
            iterations_per_loop=1, num_replicas=4, ipu_options=ipu_options)
        config = ipu_run_config.RunConfig(ipu_run_config=replicated_config)
        estimator = ipu_estimator.IPUEstimator(model_fn=my_model_fn,
                                               config=config)

        expected_message = "This is not a valid replicated training graph"
        with self.assertRaisesRegex(ValueError, expected_message):
            estimator.train(input_fn=my_input_fn, steps=1)