def test_ipu_horovod_strategy(self):
  hvd_size = hvd.size()
  hvd_rank = hvd.rank()

  strategy = IPUHorovodStrategy()
  self.assertEqual(strategy.num_replicas_in_sync, hvd_size)

  cfg = ipu_utils.create_ipu_config()
  cfg = ipu_utils.auto_select_ipus(cfg, num_ipus=1)
  ipu_utils.configure_ipu_system(cfg)

  with strategy.scope():

    def per_replica_fn():
      w = variable_scope.get_variable(name="w", initializer=hvd_rank + 1.0)
      self.assertEqual("/replica:0/task:0/device:IPU:0", w.device)
      return w * w

    per_replica_val = strategy.experimental_run_v2(per_replica_fn)
    strategy_sum = strategy.reduce(ReduceOp.SUM, per_replica_val)
    strategy_mean = strategy.reduce(ReduceOp.MEAN, per_replica_val)

    with session.Session() as sess:
      sess.run(variables.global_variables_initializer())

      # All workers should have the initial value from the first worker.
      self.assertEqual([1.0], sess.run(variables.global_variables()))
      self.assertEqual(1.0 * hvd_size, strategy_sum.eval())
      self.assertEqual(1.0, strategy_mean.eval())
def update_ipu_config(self, config):
  """Update the given IPU configuration with the multi-replica
  distribution options.

  Args:
    config: The IpuOptions configuration protobuf to update.

  Returns:
    The IpuOptions configuration protobuf.
  """
  return ipu_utils.set_experimental_multi_replica_distribution_options(
      config, process_count=size(), process_index=rank())
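# Illustrative sketch, not part of the original code: one way
# `update_ipu_config` could be combined with the configuration calls used in
# the tests below. The helper name and the single-IPU selection are
# assumptions made for the example.
def configure_ipu_for_multi_replica(strategy):
  config = ipu_utils.create_ipu_config()
  config = ipu_utils.auto_select_ipus(config, num_ipus=1)
  # Stamp the Horovod process count and index onto the config so the
  # multi-replica distribution options are set before the system is configured.
  config = strategy.update_ipu_config(config)
  ipu_utils.configure_ipu_system(config)
  return config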
def test_basics(self):
  self.assertTrue(hvd.mpi_built())
  self.assertTrue(hvd.mpi_enabled())

  self.assertFalse(hvd.nccl_built())
  self.assertFalse(hvd.ddl_built())
  self.assertFalse(hvd.mlsl_built())
  self.assertFalse(hvd.gloo_built())
  self.assertFalse(hvd.gloo_enabled())

  self.assertEqual(hvd.rank(), int(os.environ["OMPI_COMM_WORLD_RANK"]))
  self.assertEqual(hvd.local_rank(),
                   int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"]))
  self.assertEqual(hvd.size(), hvd.local_size())
  self.assertTrue(hvd.is_homogeneous())
def test_strategy(self):
  strategy = ipu_multi_replica_strategy.IPUMultiReplicaStrategy()

  with strategy.scope():
    v = variables.Variable(initial_value=hvd.rank() + 1, dtype=np.float32)
    self.assertEndsWith(v.device, "/device:IPU:0")

    def per_replica_fn(x):
      y = v * x

      replica_context = distribution_strategy_context.get_replica_context()

      # This reduction is done on IPU, and hence uses GCL. In this case,
      # since there is no replication in this test, it is an identity op.
      y_allreduced = replica_context.all_reduce(ReduceOp.SUM, y)
      self.assertEndsWith(y_allreduced.device, "/device:IPU:0")

      # Sanity check that replication normalise does not support int.
      with self.assertRaisesRegex(TypeError,
                                  "int32 not in list of allowed values"):
        replica_context.all_reduce(ReduceOp.MEAN, 1)

      return y_allreduced

    per_replica_value = strategy.experimental_run_v2(
        per_replica_fn, args=[constant_op.constant(2.0)])

    # This reduction is performed on CPU, and hence uses Horovod.
    value_allreduced = strategy.reduce(ReduceOp.SUM, per_replica_value)

    with session.Session() as sess:
      config = ipu.utils.create_ipu_config()
      config = ipu.utils.auto_select_ipus(config, 1)
      ipu.utils.configure_ipu_system(config)

      sess.run(v.initializer)
      # The initial value should be broadcast from rank 0.
      self.assertEqual(sess.run(v), 1.0)

      # There should be one allreduce sum of the values.
      self.assertEqual(sess.run(value_allreduced), hvd.size() * 2.0)
def test_collectives(self):
  rank = constant_op.constant(hvd.rank(), dtype=np.float32)

  allreduced = hvd.allreduce(rank, op=hvd.Sum)
  allgathered = hvd.allgather(array_ops.expand_dims(rank, axis=0))
  broadcast = hvd.broadcast(rank, root_rank=0)

  with self.assertRaisesRegex(NotImplementedError,
                              "The Adasum reduction is not implemented"):
    hvd.allreduce(rank, op=hvd.Adasum)

  cfg = ipu_utils.create_ipu_config()
  cfg = ipu_utils.auto_select_ipus(cfg, num_ipus=1)
  ipu_utils.configure_ipu_system(cfg)

  with session.Session() as sess:
    self.assertAllEqual(np.arange(hvd.size()), sess.run(allgathered))
    self.assertAllEqual(np.sum(np.arange(hvd.size())), sess.run(allreduced))
    self.assertAllEqual(0.0, sess.run(broadcast))
def input_fn(mode):  # pylint: disable=unused-argument
  train_data, _ = tf.keras.datasets.mnist.load_data()

  def normalise(image, label):
    image = image.astype(np.float32) / 255.0
    image = np.expand_dims(image, axis=-1)
    label = label.astype(np.int32)
    return image, label

  x_train, y_train = normalise(*train_data)

  def generator():
    return zip(x_train, y_train)

  types = (x_train.dtype, y_train.dtype)
  shapes = (x_train.shape[1:], y_train.shape[1:])

  mnist_dataset = tf.data.Dataset.from_generator(generator, types, shapes)
  mnist_dataset = mnist_dataset.shard(hvd.size(), hvd.rank())
  mnist_dataset = mnist_dataset.shuffle(len(y_train)) \
      .cache().batch(BATCH_SIZE, drop_remainder=True).repeat()
  return mnist_dataset
def test_update_ipu_config(self):
  strategy = ipu_multi_replica_strategy.IPUMultiReplicaStrategy()
  config = ipu.utils.create_ipu_config()
  config = strategy.update_ipu_config(config)
  self.assertEqual(config.multi_replica_process_count, hvd.size())
  self.assertEqual(config.multi_replica_process_index, hvd.rank())
def test_pipelining(self):
  gradient_accumulation_count = 4
  local_batch_size = 2

  features = np.ones((1, 20), dtype=np.float32) * hvd.rank()
  labels = np.ones(1, dtype=np.int32) * hvd.rank()
  dataset = dataset_ops.Dataset.from_tensor_slices((features, labels))
  dataset = dataset.repeat().batch(local_batch_size, drop_remainder=True)

  loss_vals = []

  strategy = IPUHorovodStrategy()

  with strategy.scope():

    infeed_queue = ipu_infeed_queue.IPUInfeedQueue(dataset, "infeed")
    outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue("outfeed")

    def stage1(lr, images, labels):
      partial = keras.layers.Dense(32, activation="relu")(images)
      partial = keras.layers.Dense(16, activation="relu")(partial)
      return lr, partial, labels

    def stage2(lr, partial, labels):
      logits = keras.layers.Dense(10)(partial)
      per_example_loss = keras.losses.sparse_categorical_crossentropy(
          y_true=labels, y_pred=logits, from_logits=True)
      # In a custom training loop, the optimiser does an allreduce *sum*, not
      # average, of the gradients across the distributed workers. Therefore
      # we want to divide the loss here by the *global* batch size, which is
      # done by the `tf.nn.compute_average_loss()` function (see the
      # arithmetic sketch after this test).
      loss = nn.compute_average_loss(per_example_loss)
      return lr, loss

    def optimizer_function(lr, loss):
      optimizer = GradientDescentOptimizer(lr)
      return pipelining_ops.OptimizerFunctionOutput(optimizer, loss)

    def model(lr):
      pipeline_op = pipelining_ops.pipeline(
          computational_stages=[stage1, stage2],
          device_mapping=[0, 0],
          gradient_accumulation_count=gradient_accumulation_count,
          inputs=[lr],
          infeed_queue=infeed_queue,
          repeat_count=2,
          outfeed_queue=outfeed_queue,
          optimizer_function=optimizer_function,
          name="Pipeline")
      return pipeline_op

    def compiled_model(lr):
      with ipu_scope("/device:IPU:0"):
        return ipu_compiler.compile(model, inputs=[lr])

    with ops.device("cpu"):
      lr = array_ops.placeholder(np.float32, [])

    train_op = strategy.experimental_run_v2(compiled_model, args=[lr])

    _, per_worker_losses = outfeed_queue.dequeue()

    # Mean across the local `gradient_accumulation_count` batches:
    per_worker_loss = math_ops.reduce_mean(per_worker_losses)

    # Global mean across the distributed workers (since it is already
    # divided by the global batch size above, we do a sum here):
    global_loss = strategy.reduce(ReduceOp.SUM, per_worker_loss)

    config = ipu_utils.create_ipu_config()
    config = ipu_utils.auto_select_ipus(config, num_ipus=1)
    ipu_utils.configure_ipu_system(config)
    ipu_utils.move_variable_initialization_to_cpu()

    with session.Session() as sess:
      sess.run(infeed_queue.initializer)
      sess.run(variables.global_variables_initializer())

      for _ in range(10):
        sess.run(train_op, {lr: 0.01})
        global_loss_val = sess.run(global_loss)

        if loss_vals:
          # Check that the loss decreases monotonically.
          self.assertLess(global_loss_val, loss_vals[-1])
        loss_vals.append(global_loss_val)

      sess.run(infeed_queue.deleter)
      sess.run(outfeed_queue.deleter)

      # Check all variables are equal across workers.
      for variable in variables.global_variables():
        self.assertAllRanksEqual(variable.eval(), variable.name)
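# Illustrative sketch, not part of the original tests: the arithmetic behind
# dividing the per-example loss by the *global* batch size in stage2 above.
# Each worker contributes its local loss sum divided by the global batch size,
# so the allreduce *sum* across workers equals the mean over the global batch.
# The worker count and batch sizes below are made up for the example.
def _loss_scaling_sketch():
  num_workers = 4
  local_batch_size = 2
  global_batch_size = num_workers * local_batch_size

  per_example_losses = np.arange(global_batch_size, dtype=np.float32).reshape(
      num_workers, local_batch_size)

  # Each worker's contribution: local sum divided by the global batch size.
  per_worker_contribution = per_example_losses.sum(axis=1) / global_batch_size

  # The allreduce sum of those contributions equals the global mean loss.
  np.testing.assert_allclose(per_worker_contribution.sum(),
                             per_example_losses.mean())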