def test_ipu_horovod_strategy(self):
  hvd_size = hvd.size()
  hvd_rank = hvd.rank()

  strategy = IPUHorovodStrategy()
  self.assertEqual(strategy.num_replicas_in_sync, hvd_size)

  cfg = ipu_utils.create_ipu_config()
  cfg = ipu_utils.auto_select_ipus(cfg, num_ipus=1)
  ipu_utils.configure_ipu_system(cfg)

  with strategy.scope():

    def per_replica_fn():
      w = variable_scope.get_variable(name="w", initializer=hvd_rank + 1.0)
      self.assertEqual("/replica:0/task:0/device:IPU:0", w.device)
      return w * w

    per_replica_val = strategy.experimental_run_v2(per_replica_fn)
    strategy_sum = strategy.reduce(ReduceOp.SUM, per_replica_val)
    strategy_mean = strategy.reduce(ReduceOp.MEAN, per_replica_val)

    with session.Session() as sess:
      sess.run(variables.global_variables_initializer())

      # All workers should have the initial value from the first worker.
      self.assertEqual([1.0], sess.run(variables.global_variables()))
      self.assertEqual(1.0 * hvd_size, strategy_sum.eval())
      self.assertEqual(1.0, strategy_mean.eval())

def test_collectives(self):
  rank = constant_op.constant(hvd.rank(), dtype=np.float32)

  allreduced = hvd.allreduce(rank, op=hvd.Sum)
  allgathered = hvd.allgather(array_ops.expand_dims(rank, axis=0))
  broadcast = hvd.broadcast(rank, root_rank=0)

  with self.assertRaisesRegex(NotImplementedError,
                              "The Adasum reduction is not implemented"):
    hvd.allreduce(rank, op=hvd.Adasum)

  cfg = ipu_utils.create_ipu_config()
  cfg = ipu_utils.auto_select_ipus(cfg, num_ipus=1)
  ipu_utils.configure_ipu_system(cfg)

  with session.Session() as sess:
    self.assertAllEqual(np.arange(hvd.size()), sess.run(allgathered))
    self.assertAllEqual(np.sum(np.arange(hvd.size())), sess.run(allreduced))
    self.assertAllEqual(0.0, sess.run(broadcast))

def update_ipu_config(self, config):
  """Update the given IPU configuration with the multi-replica
  distribution options.

  Args:
    config: The IpuOptions configuration protobuf to update.

  Returns:
    The IpuOptions configuration protobuf.
  """
  return ipu_utils.set_experimental_multi_replica_distribution_options(
      config, process_count=size(), process_index=rank())

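# A minimal usage sketch for update_ipu_config, pieced together from the
# tests in this section (the exact driver code is an assumption, not a
# verbatim example from the source): the strategy injects the Horovod
# process count and index into the IpuOptions protobuf before the IPU
# system is configured.
strategy = ipu_multi_replica_strategy.IPUMultiReplicaStrategy()
config = ipu.utils.create_ipu_config()
config = ipu.utils.auto_select_ipus(config, 1)
config = strategy.update_ipu_config(config)
ipu.utils.configure_ipu_system(config)
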
def test_basics(self):
  self.assertTrue(hvd.mpi_built())
  self.assertTrue(hvd.mpi_enabled())

  self.assertFalse(hvd.nccl_built())
  self.assertFalse(hvd.ddl_built())
  self.assertFalse(hvd.mlsl_built())
  self.assertFalse(hvd.gloo_built())
  self.assertFalse(hvd.gloo_enabled())

  self.assertEqual(hvd.rank(), int(os.environ["OMPI_COMM_WORLD_RANK"]))
  self.assertEqual(hvd.local_rank(),
                   int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"]))
  self.assertEqual(hvd.size(), hvd.local_size())
  self.assertTrue(hvd.is_homogeneous())

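# The OMPI_COMM_WORLD_* variables checked above are set by Open MPI, so
# these tests are meant to be launched through mpirun. An illustrative
# invocation (script name and process count are placeholders):
#
#   mpirun -np 2 python horovod_test.py
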
def test_strategy(self):
  strategy = ipu_multi_replica_strategy.IPUMultiReplicaStrategy()

  with strategy.scope():
    v = variables.Variable(initial_value=hvd.rank() + 1, dtype=np.float32)
    self.assertEndsWith(v.device, "/device:IPU:0")

    def per_replica_fn(x):
      y = v * x

      replica_context = distribution_strategy_context.get_replica_context()

      # This reduction is done on IPU, and hence uses GCL. In this case,
      # since there is no replication in this test, it is an identity op.
      y_allreduced = replica_context.all_reduce(ReduceOp.SUM, y)
      self.assertEndsWith(y_allreduced.device, "/device:IPU:0")

      # Sanity check that replication normalise does not support int.
      with self.assertRaisesRegex(TypeError,
                                  "int32 not in list of allowed values"):
        replica_context.all_reduce(ReduceOp.MEAN, 1)

      return y_allreduced

    per_replica_value = strategy.experimental_run_v2(
        per_replica_fn, args=[constant_op.constant(2.0)])

    # This reduction is performed on CPU, and hence uses Horovod.
    value_allreduced = strategy.reduce(ReduceOp.SUM, per_replica_value)

    with session.Session() as sess:
      config = ipu.utils.create_ipu_config()
      config = ipu.utils.auto_select_ipus(config, 1)
      ipu.utils.configure_ipu_system(config)

      sess.run(v.initializer)
      # The initial value should be broadcast from rank 0.
      self.assertEqual(sess.run(v), 1.0)

      # There should be one allreduce sum of the values.
      self.assertEqual(sess.run(value_allreduced), hvd.size() * 2.0)

def input_fn(mode):  # pylint: disable=unused-argument
  train_data, _ = tf.keras.datasets.mnist.load_data()

  def normalise(image, label):
    # Scale the images to [0, 1] and add a trailing channel dimension.
    image = image.astype(np.float32) / 255.0
    image = np.expand_dims(image, axis=-1)
    label = label.astype(np.int32)
    return image, label

  x_train, y_train = normalise(*train_data)

  def generator():
    return zip(x_train, y_train)

  types = (x_train.dtype, y_train.dtype)
  shapes = (x_train.shape[1:], y_train.shape[1:])

  mnist_dataset = tf.data.Dataset.from_generator(generator, types, shapes)
  # Shard the dataset so that each Horovod process sees a distinct subset.
  mnist_dataset = mnist_dataset.shard(hvd.size(), hvd.rank())
  mnist_dataset = mnist_dataset.shuffle(len(y_train)) \
      .cache().batch(BATCH_SIZE, drop_remainder=True).repeat()
  return mnist_dataset

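# A minimal sketch of how input_fn might be consumed (illustrative only:
# model_fn and config are assumptions and are not defined in this section).
# Each Horovod process then trains on its own MNIST shard.
classifier = ipu.ipu_estimator.IPUEstimator(model_fn=model_fn, config=config)
classifier.train(input_fn=input_fn, max_steps=1000)
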
def __init__(self, container_strategy, cluster_resolver, ipu_device):
  # Place variables on the IPU device rather than on the host.
  super().__init__(container_strategy,
                   cluster_resolver,
                   ipu_device,
                   variables_on_host=False)
  # One distributed worker per Horovod process.
  self._num_workers = size()

def test_update_ipu_config(self):
  strategy = ipu_multi_replica_strategy.IPUMultiReplicaStrategy()

  config = ipu.utils.create_ipu_config()
  config = strategy.update_ipu_config(config)

  self.assertEqual(config.multi_replica_process_count, hvd.size())
  self.assertEqual(config.multi_replica_process_index, hvd.rank())