def test_ipu_horovod_strategy(self):
  hvd_size = hvd.size()
  hvd_rank = hvd.rank()

  strategy = IPUHorovodStrategy()
  self.assertEqual(strategy.num_replicas_in_sync, hvd_size)

  cfg = ipu_utils.create_ipu_config()
  cfg = ipu_utils.auto_select_ipus(cfg, num_ipus=1)
  ipu_utils.configure_ipu_system(cfg)

  with strategy.scope():

    def per_replica_fn():
      w = variable_scope.get_variable(name="w", initializer=hvd_rank + 1.0)
      self.assertEqual("/replica:0/task:0/device:IPU:0", w.device)
      return w * w

    per_replica_val = strategy.experimental_run_v2(per_replica_fn)
    strategy_sum = strategy.reduce(ReduceOp.SUM, per_replica_val)
    strategy_mean = strategy.reduce(ReduceOp.MEAN, per_replica_val)

    with session.Session() as sess:
      sess.run(variables.global_variables_initializer())

      # All workers should have the initial value from the first worker.
      self.assertEqual([1.0], sess.run(variables.global_variables()))
      self.assertEqual(1.0 * hvd_size, strategy_sum.eval())
      self.assertEqual(1.0, strategy_mean.eval())
def update_ipu_config(self, config):
  """Update the given IPU configuration with the multi-replica
  distribution options.

  Args:
    config: The IpuOptions configuration protobuf to update.

  Returns:
    The IpuOptions configuration protobuf.
  """
  return ipu_utils.set_experimental_multi_replica_distribution_options(
      config, process_count=size(), process_index=rank())
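# Illustrative sketch, not part of the original code: one way
# `update_ipu_config` could be combined with the configuration calls used in
# the tests below. The helper name and the single-IPU selection are
# assumptions made for the example.
def configure_ipu_for_multi_replica(strategy):
  config = ipu_utils.create_ipu_config()
  config = ipu_utils.auto_select_ipus(config, num_ipus=1)
  # Stamp the Horovod process count and index onto the config so the
  # multi-replica distribution options are set before the system is configured.
  config = strategy.update_ipu_config(config)
  ipu_utils.configure_ipu_system(config)
  return config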
def test_basics(self):
  self.assertTrue(hvd.mpi_built())
  self.assertTrue(hvd.mpi_enabled())

  self.assertFalse(hvd.nccl_built())
  self.assertFalse(hvd.ddl_built())
  self.assertFalse(hvd.mlsl_built())
  self.assertFalse(hvd.gloo_built())
  self.assertFalse(hvd.gloo_enabled())

  self.assertEqual(hvd.rank(), int(os.environ["OMPI_COMM_WORLD_RANK"]))
  self.assertEqual(hvd.local_rank(),
                   int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"]))
  self.assertEqual(hvd.size(), hvd.local_size())
  self.assertTrue(hvd.is_homogeneous())
def test_strategy(self):
  strategy = ipu_multi_replica_strategy.IPUMultiReplicaStrategy()

  with strategy.scope():
    v = variables.Variable(initial_value=hvd.rank() + 1, dtype=np.float32)
    self.assertEndsWith(v.device, "/device:IPU:0")

    def per_replica_fn(x):
      y = v * x

      replica_context = distribution_strategy_context.get_replica_context()

      # This reduction is done on IPU, and hence uses GCL. In this case,
      # since there is no replication in this test, it is an identity op.
      y_allreduced = replica_context.all_reduce(ReduceOp.SUM, y)
      self.assertEndsWith(y_allreduced.device, "/device:IPU:0")

      # Sanity check that replication normalise does not support int.
      with self.assertRaisesRegex(TypeError,
                                  "int32 not in list of allowed values"):
        replica_context.all_reduce(ReduceOp.MEAN, 1)

      return y_allreduced

    per_replica_value = strategy.experimental_run_v2(
        per_replica_fn, args=[constant_op.constant(2.0)])

    # This reduction is performed on CPU, and hence uses Horovod.
    value_allreduced = strategy.reduce(ReduceOp.SUM, per_replica_value)

    with session.Session() as sess:
      config = ipu.utils.create_ipu_config()
      config = ipu.utils.auto_select_ipus(config, 1)
      ipu.utils.configure_ipu_system(config)

      sess.run(v.initializer)
      # The initial value should be broadcast from rank 0.
      self.assertEqual(sess.run(v), 1.0)

      # There should be one allreduce sum of the values.
      self.assertEqual(sess.run(value_allreduced), hvd.size() * 2.0)
def test_collectives(self):
  rank = constant_op.constant(hvd.rank(), dtype=np.float32)

  allreduced = hvd.allreduce(rank, op=hvd.Sum)
  allgathered = hvd.allgather(array_ops.expand_dims(rank, axis=0))
  broadcast = hvd.broadcast(rank, root_rank=0)

  with self.assertRaisesRegex(NotImplementedError,
                              "The Adasum reduction is not implemented"):
    hvd.allreduce(rank, op=hvd.Adasum)

  cfg = ipu_utils.create_ipu_config()
  cfg = ipu_utils.auto_select_ipus(cfg, num_ipus=1)
  ipu_utils.configure_ipu_system(cfg)

  with session.Session() as sess:
    self.assertAllEqual(np.arange(hvd.size()), sess.run(allgathered))
    self.assertAllEqual(np.sum(np.arange(hvd.size())), sess.run(allreduced))
    self.assertAllEqual(0.0, sess.run(broadcast))
def input_fn(mode):  # pylint: disable=unused-argument
  train_data, _ = tf.keras.datasets.mnist.load_data()

  def normalise(image, label):
    image = image.astype(np.float32) / 255.0
    image = np.expand_dims(image, axis=-1)
    label = label.astype(np.int32)
    return image, label

  x_train, y_train = normalise(*train_data)

  def generator():
    return zip(x_train, y_train)

  types = (x_train.dtype, y_train.dtype)
  shapes = (x_train.shape[1:], y_train.shape[1:])

  mnist_dataset = tf.data.Dataset.from_generator(generator, types, shapes)
  mnist_dataset = mnist_dataset.shard(hvd.size(), hvd.rank())
  mnist_dataset = mnist_dataset.shuffle(len(y_train)) \
      .cache().batch(BATCH_SIZE, drop_remainder=True).repeat()
  return mnist_dataset
def test_update_ipu_config(self):
  strategy = ipu_multi_replica_strategy.IPUMultiReplicaStrategy()
  config = ipu.utils.create_ipu_config()
  config = strategy.update_ipu_config(config)
  self.assertEqual(config.multi_replica_process_count, hvd.size())
  self.assertEqual(config.multi_replica_process_index, hvd.rank())
def test_pipelining(self):
  gradient_accumulation_count = 4
  local_batch_size = 2

  features = np.ones((1, 20), dtype=np.float32) * hvd.rank()
  labels = np.ones(1, dtype=np.int32) * hvd.rank()
  dataset = dataset_ops.Dataset.from_tensor_slices((features, labels))
  dataset = dataset.repeat().batch(local_batch_size, drop_remainder=True)

  loss_vals = []

  strategy = IPUHorovodStrategy()

  with strategy.scope():

    infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, "infeed")
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("outfeed")

    def stage1(lr, images, labels):
      partial = keras.layers.Dense(32, activation="relu")(images)
      partial = keras.layers.Dense(16, activation="relu")(partial)
      return lr, partial, labels

    def stage2(lr, partial, labels):
      logits = keras.layers.Dense(10)(partial)
      per_example_loss = keras.losses.sparse_categorical_crossentropy(
          y_true=labels, y_pred=logits, from_logits=True)
      # In a custom training loop, the optimiser does an allreduce *sum*, not
      # average, of the gradients across the distributed workers. Therefore
      # we want to divide the loss here by the *global* batch size, which is
      # done by the `tf.nn.compute_average_loss()` function (see the
      # arithmetic sketch after this test).
      loss = nn.compute_average_loss(per_example_loss)
      return lr, loss

    def optimizer_function(lr, loss):
      optimizer = GradientDescentOptimizer(lr)
      return pipelining_ops.OptimizerFunctionOutput(optimizer, loss)

    def model(lr):
      pipeline_op = pipelining_ops.pipeline(
          computational_stages=[stage1, stage2],
          device_mapping=[0, 0],
          gradient_accumulation_count=gradient_accumulation_count,
          inputs=[lr],
          infeed_queue=infeed_queue,
          repeat_count=2,
          outfeed_queue=outfeed_queue,
          optimizer_function=optimizer_function,
          name="Pipeline")
      return pipeline_op

    def compiled_model(lr):
      with ipu_scope("/device:IPU:0"):
        return ipu_compiler.compile(model, inputs=[lr])

    with ops.device("cpu"):
      lr = array_ops.placeholder(np.float32, [])

    train_op = strategy.experimental_run_v2(compiled_model, args=[lr])

    _, per_worker_losses = outfeed_queue.dequeue()

    # Mean across the local `gradient_accumulation_count` batches:
    per_worker_loss = math_ops.reduce_mean(per_worker_losses)

    # Global mean across the distributed workers (since it is already
    # divided by the global batch size above, we do a sum here):
    global_loss = strategy.reduce(ReduceOp.SUM, per_worker_loss)

    config = ipu_utils.create_ipu_config()
    config = ipu_utils.auto_select_ipus(config, num_ipus=1)
    ipu_utils.configure_ipu_system(config)
    ipu_utils.move_variable_initialization_to_cpu()

    with session.Session() as sess:
      sess.run(infeed_queue.initializer)
      sess.run(variables.global_variables_initializer())

      for _ in range(10):
        sess.run(train_op, {lr: 0.01})
        global_loss_val = sess.run(global_loss)

        if loss_vals:
          # Check that the loss decreases monotonically.
          self.assertLess(global_loss_val, loss_vals[-1])
        loss_vals.append(global_loss_val)

      sess.run(infeed_queue.deleter)
      sess.run(outfeed_queue.deleter)

      # Check all variables are equal across workers.
      for variable in variables.global_variables():
        self.assertAllRanksEqual(variable.eval(), variable.name)
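# Illustrative sketch, not part of the original tests: the arithmetic behind
# dividing the per-example loss by the *global* batch size in stage2 above.
# Each worker contributes its local loss sum divided by the global batch size,
# so the allreduce *sum* across workers equals the mean over the global batch.
# The worker count and batch sizes below are made up for the example.
def _loss_scaling_sketch():
  num_workers = 4
  local_batch_size = 2
  global_batch_size = num_workers * local_batch_size

  per_example_losses = np.arange(global_batch_size, dtype=np.float32).reshape(
      num_workers, local_batch_size)

  # Each worker's contribution: local sum divided by the global batch size.
  per_worker_contribution = per_example_losses.sum(axis=1) / global_batch_size

  # The allreduce sum of those contributions equals the global mean loss.
  np.testing.assert_allclose(per_worker_contribution.sum(),
                             per_example_losses.mean())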