Example #1
    def test_ipu_horovod_strategy(self):
        hvd_size = hvd.size()
        hvd_rank = hvd.rank()

        strategy = IPUHorovodStrategy()
        self.assertEqual(strategy.num_replicas_in_sync, hvd_size)

        cfg = ipu_utils.create_ipu_config()
        cfg = ipu_utils.auto_select_ipus(cfg, num_ipus=1)
        ipu_utils.configure_ipu_system(cfg)

        with strategy.scope():

            def per_replica_fn():
                w = variable_scope.get_variable(name="w",
                                                initializer=hvd_rank + 1.0)
                self.assertEqual("/replica:0/task:0/device:IPU:0", w.device)
                return w * w

            per_replica_val = strategy.experimental_run_v2(per_replica_fn)
            strategy_sum = strategy.reduce(ReduceOp.SUM, per_replica_val)
            strategy_mean = strategy.reduce(ReduceOp.MEAN, per_replica_val)

            with session.Session() as sess:
                sess.run(variables.global_variables_initializer())

                # All workers should have the initial value from the first worker.
                self.assertEqual([1.0], sess.run(variables.global_variables()))
                self.assertEqual(1.0 * hvd_size, strategy_sum.eval())
                self.assertEqual(1.0, strategy_mean.eval())
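This test presupposes the Graphcore TensorFlow 1.x port with Horovod support and a multi-process launch. A minimal sketch of the imports it relies on follows; the Graphcore-specific module paths are assumptions inferred from the identifiers above and may differ between SDK releases.

import numpy as np
from tensorflow.python.client import session
from tensorflow.python.distribute.reduce_util import ReduceOp
from tensorflow.python.ops import variable_scope, variables
# Assumed Graphcore-specific paths; these vary across SDK versions.
from tensorflow.python.ipu import utils as ipu_utils
from tensorflow.python.ipu import horovod as hvd
from tensorflow.python.ipu.horovod.ipu_horovod_strategy import IPUHorovodStrategy

hvd.init()  # must run before rank()/size(); launch with e.g. `mpirun -np 2 ...`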
Example #2
    def test_collectives(self):
        rank = constant_op.constant(hvd.rank(), dtype=np.float32)
        allreduced = hvd.allreduce(rank, op=hvd.Sum)
        allgathered = hvd.allgather(array_ops.expand_dims(rank, axis=0))
        broadcast = hvd.broadcast(rank, root_rank=0)

        with self.assertRaisesRegex(NotImplementedError,
                                    "The Adasum reduction is not implemented"):
            hvd.allreduce(rank, op=hvd.Adasum)

        cfg = ipu_utils.create_ipu_config()
        cfg = ipu_utils.auto_select_ipus(cfg, num_ipus=1)
        ipu_utils.configure_ipu_system(cfg)

        with session.Session() as sess:
            self.assertAllEqual(np.arange(hvd.size()), sess.run(allgathered))
            self.assertAllEqual(np.sum(np.arange(hvd.size())),
                                sess.run(allreduced))
            self.assertAllEqual(0.0, sess.run(broadcast))
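For a world size of N, the expected values follow directly from the rank numbers: allgather concatenates the per-rank scalars into [0, 1, ..., N-1], the Sum allreduce yields 0 + 1 + ... + (N-1) = N(N-1)/2, and the broadcast returns rank 0's value everywhere. A quick numeric check, using a hypothetical world size of N = 4:

import numpy as np

N = 4  # hypothetical world size for illustration
ranks = np.arange(N, dtype=np.float32)
assert list(ranks) == [0.0, 1.0, 2.0, 3.0]  # what allgather assembles
assert ranks.sum() == N * (N - 1) / 2       # what the Sum allreduce returns
assert ranks[0] == 0.0                      # what broadcast(root_rank=0) returns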
Example #3
    def update_ipu_config(self, config):
        """Update the given IPU configuration with the multi-replica
        distribution options.

        Args:
          config: The IpuOptions configuration protobuf to update.

        Returns:
          The updated IpuOptions configuration protobuf.
        """
        return ipu_utils.set_experimental_multi_replica_distribution_options(
            config, process_count=size(), process_index=rank())
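In practice the updated configuration is then handed to configure_ipu_system, exactly as Example #8 below exercises; a sketch of that flow, reusing only calls that appear elsewhere on this page:

strategy = ipu_multi_replica_strategy.IPUMultiReplicaStrategy()
config = ipu.utils.create_ipu_config()
config = strategy.update_ipu_config(config)  # injects process_count / process_index
ipu.utils.configure_ipu_system(config)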
Example #4
    def test_basics(self):
        self.assertTrue(hvd.mpi_built())
        self.assertTrue(hvd.mpi_enabled())

        self.assertFalse(hvd.nccl_built())
        self.assertFalse(hvd.ddl_built())
        self.assertFalse(hvd.mlsl_built())
        self.assertFalse(hvd.gloo_built())
        self.assertFalse(hvd.gloo_enabled())

        self.assertEqual(hvd.rank(), int(os.environ["OMPI_COMM_WORLD_RANK"]))

        self.assertEqual(hvd.local_rank(),
                         int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"]))

        self.assertEqual(hvd.size(), hvd.local_size())
        self.assertTrue(hvd.is_homogeneous())
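The OMPI_COMM_WORLD_* variables exist only when the process is launched by Open MPI, so these assertions presuppose something like `mpirun -np 2 python test.py`. A minimal sketch of the same lookup with a single-process fallback (the fallback default is an assumption, not part of the test):

import os

# Open MPI exports these for every launched rank; hvd.rank() and
# hvd.local_rank() mirror them.
world_rank = int(os.environ.get("OMPI_COMM_WORLD_RANK", "0"))
local_rank = int(os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK", "0"))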
Example #5
    def test_strategy(self):
        strategy = ipu_multi_replica_strategy.IPUMultiReplicaStrategy()

        with strategy.scope():

            v = variables.Variable(initial_value=hvd.rank() + 1,
                                   dtype=np.float32)
            self.assertEndsWith(v.device, "/device:IPU:0")

            def per_replica_fn(x):
                y = v * x

                replica_context = distribution_strategy_context.get_replica_context()

                # This reduction is done on IPU, and hence uses GCL. In this case,
                # since there is no replication in this test, it is an identity op.
                y_allreduced = replica_context.all_reduce(ReduceOp.SUM, y)
                self.assertEndsWith(y_allreduced.device, "/device:IPU:0")

                # Sanity check that the replication normalise op does not
                # support int inputs.
                with self.assertRaisesRegex(
                        TypeError, "int32 not in list of allowed values"):
                    replica_context.all_reduce(ReduceOp.MEAN, 1)

                return y_allreduced

            per_replica_value = strategy.experimental_run_v2(
                per_replica_fn, args=[constant_op.constant(2.0)])

            # This reduction is performed on CPU, and hence uses Horovod.
            value_allreduced = strategy.reduce(ReduceOp.SUM, per_replica_value)

            with session.Session() as sess:
                config = ipu.utils.create_ipu_config()
                config = ipu.utils.auto_select_ipus(config, 1)
                ipu.utils.configure_ipu_system(config)

                sess.run(v.initializer)

                # The initial value should be broadcast from rank 0.
                self.assertEqual(sess.run(v), 1.0)

                # There should be one allreduce sum of the values.
                self.assertEqual(sess.run(value_allreduced), hvd.size() * 2.0)
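A numeric walk-through under the test's own assumptions (v is broadcast to 1.0 from rank 0, x = 2.0, one replica per process) makes the two-level reduction concrete; the process count of 2 below is hypothetical:

num_processes = 2        # hypothetical hvd.size()
per_process = 1.0 * 2.0  # v * x inside per_replica_fn; the on-IPU GCL
                         # all_reduce is an identity with a single replica
total = num_processes * per_process  # host-side Horovod sum across processes
assert total == 4.0                  # matches hvd.size() * 2.0 in the test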
Example #6
def input_fn(mode):  # pylint: disable=unused-argument
    train_data, _ = tf.keras.datasets.mnist.load_data()

    def normalise(image, label):
        image = image.astype(np.float32) / 255.0
        image = np.expand_dims(image, axis=-1)
        label = label.astype(np.int32)
        return image, label

    x_train, y_train = normalise(*train_data)

    def generator():
        return zip(x_train, y_train)

    types = (x_train.dtype, y_train.dtype)
    shapes = (x_train.shape[1:], y_train.shape[1:])
    mnist_dataset = tf.data.Dataset.from_generator(generator, types, shapes)
    mnist_dataset = mnist_dataset.shard(hvd.size(), hvd.rank())
    mnist_dataset = mnist_dataset.shuffle(len(y_train)) \
        .cache().batch(BATCH_SIZE, drop_remainder=True).repeat()
    return mnist_dataset
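Sharding by hvd.size() and hvd.rank() gives each Horovod process a disjoint slice of MNIST before shuffling and batching. A hedged sketch of consuming this input_fn in a TF1-style loop; BATCH_SIZE and hvd.init() are assumed to be defined elsewhere in the surrounding file:

dataset = input_fn(mode=None)
iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
images, labels = iterator.get_next()

with tf.compat.v1.Session() as sess:
    batch_images, batch_labels = sess.run([images, labels])
    print(batch_images.shape)  # (BATCH_SIZE, 28, 28, 1)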
Example #7
    def __init__(self, container_strategy, cluster_resolver, ipu_device):
        # Keep variables on the IPU device rather than on the host.
        super().__init__(container_strategy,
                         cluster_resolver,
                         ipu_device,
                         variables_on_host=False)
        # One worker per Horovod process.
        self._num_workers = size()
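The size() and rank() helpers used here and in Example #3 are presumably imported at module level, roughly as sketched below; the exact re-export path is an assumption based on the package layout suggested by the other examples.

# Assumed module-level import backing size() and rank() in this class:
from tensorflow.python.ipu.horovod import size, rank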
Example #8
    def test_update_ipu_config(self):
        strategy = ipu_multi_replica_strategy.IPUMultiReplicaStrategy()
        config = ipu.utils.create_ipu_config()
        config = strategy.update_ipu_config(config)
        self.assertEqual(config.multi_replica_process_count, hvd.size())
        self.assertEqual(config.multi_replica_process_index, hvd.rank())
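Under a hypothetical two-process launch, the updated protobuf would carry the same process count on both ranks and a per-process index, which is exactly what the two assertions pin down:

# Hypothetical 2-process run:
#   rank 0: config.multi_replica_process_count == 2, ...process_index == 0
#   rank 1: config.multi_replica_process_count == 2, ...process_index == 1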