Example 1
    def __init__(self, **kwargs):
        # `name`, `device_dense`, `device_sparse`, `compression`,
        # `sparse_as_dense`, `op`, `gradient_predivide_factor`,
        # `backward_passes_per_step` and `average_aggregated_gradients` are
        # captured from the enclosing factory function's closure, not passed
        # through **kwargs.
        self._name = name or "Distributed%s" % self.__class__.__base__.__name__
        self._aggregated_gradients = False

        self._allreduce_grads = hvd._make_allreduce_grads_fn(
            self._name, device_dense, device_sparse, compression,
            sparse_as_dense, op, gradient_predivide_factor)

        # When more than one backward pass is taken per step, accumulate
        # gradients locally and only allreduce them every N passes.
        self._agg_helper = None
        if backward_passes_per_step > 1:
            if hvd._executing_eagerly():
                self._agg_helper = LocalGradientAggregationHelperEager(
                    backward_passes_per_step=backward_passes_per_step,
                    allreduce_func=self._allreduce_grads,
                    sparse_as_dense=sparse_as_dense,
                    average_aggregated_gradients=average_aggregated_gradients,
                )
            else:
                self._agg_helper = LocalGradientAggregationHelper(
                    backward_passes_per_step=backward_passes_per_step,
                    allreduce_func=self._allreduce_grads,
                    sparse_as_dense=sparse_as_dense,
                    average_aggregated_gradients=average_aggregated_gradients,
                    rank=rank(),
                    optimizer_type=LocalGradientAggregationHelper._OPTIMIZER_TYPE_KERAS,
                )

        super(self.__class__, self).__init__(**kwargs)
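Example 1 is not self-contained: the free variables come from the enclosing factory function's closure, as in Horovod's Keras integration, which builds the optimizer subclass dynamically. In user code this path is normally reached through the public wrapper. A minimal, hypothetical usage sketch (the base optimizer and the learning-rate scaling are illustrative assumptions, not part of the snippet):

    import tensorflow as tf
    import horovod.tensorflow.keras as hvd

    hvd.init()

    base = tf.keras.optimizers.SGD(learning_rate=0.01 * hvd.size())
    # backward_passes_per_step > 1 activates the LocalGradientAggregationHelper
    # branch shown above: gradients are accumulated locally and allreduced
    # once every N backward passes.
    opt = hvd.DistributedOptimizer(base,
                                   backward_passes_per_step=2,
                                   average_aggregated_gradients=True)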
Example 2
    def __init__(self, name, device_dense, device_sparse, compression,
                 sparse_as_dense, config):
        if name is None:
            name = "Distributed%s" % self.__class__.__base__.__name__
        self._allreduce_grads = hvd._make_allreduce_grads_fn(
            name, device_dense, device_sparse, compression, sparse_as_dense)
        super(self.__class__, self).__init__(**config)
Example 3
    def __init__(self, **kwargs):
        # As in Example 1, the free variables (`name`, `device_dense`, ...)
        # come from the enclosing factory's closure; here the reduction op is
        # fixed to hvd.Average.
        self._name = name or "Distributed%s" % self.__class__.__base__.__name__
        self._aggregated_gradients = False
        self._allreduce_grads = hvd._make_allreduce_grads_fn(
            self._name, device_dense, device_sparse, compression,
            sparse_as_dense, hvd.Average, gradient_predivide_factor)
        super(self.__class__, self).__init__(**kwargs)
Example 4
    def __init__(self,
                 compression=None,
                 sparse_as_dense=False,
                 device_dense="",
                 device_sparse="",
                 hvd_backend="horovod",
                 **kwargs):
        super(self.__class__, self).__init__(**kwargs)
        # Use a revised dynamic loss scale (RevisedDynamicLossScale is defined
        # elsewhere in the surrounding code base).
        self._loss_scale = RevisedDynamicLossScale(initial_loss_scale=2**15,
                                                   growth_steps=2000,
                                                   multiplier=2)
        self._track_trackable(self._loss_scale, "loss_scale", overwrite=True)
        self._device_dense = device_dense
        self._device_sparse = device_sparse
        self._hvd_backend = hvd_backend
        self._compression = compression
        self._sparse_as_dense = sparse_as_dense
        self._aggregated_gradients = False
        if hvd_backend == "horovod":
            import horovod.tensorflow as hvd

            self._allreduce_grads = hvd._make_allreduce_grads_fn(
                "DistributedLossScaleOptimizer", self._device_dense,
                self._device_sparse, compression, sparse_as_dense, hvd.Average,
                1.0, 0)
        else:
            assert hvd_backend == "byteps", f"Unknown `hvd_backend`={hvd_backend}"
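The `compression` and `sparse_as_dense` values threaded through this constructor correspond to options on Horovod's public Keras wrapper. A short, hypothetical sketch of those two knobs (the base optimizer and learning rate are assumptions, not taken from the snippet):

    import tensorflow as tf
    import horovod.tensorflow.keras as hvd

    hvd.init()

    base = tf.keras.optimizers.Adam(learning_rate=0.001 * hvd.size())
    # fp16 compression reduces allreduce traffic; sparse_as_dense converts
    # IndexedSlices gradients (e.g. from embeddings) to dense tensors before
    # they are reduced.
    opt = hvd.DistributedOptimizer(base,
                                   compression=hvd.Compression.fp16,
                                   sparse_as_dense=True)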
Example 5
    def __init__(
        self,
        optimizer,
        name=None,
        use_locking=False,
        device_dense="",
        device_sparse="",
        compression=hvd.Compression.none,
        sparse_as_dense=False,
        op=hvd.Average,
        gradient_predivide_factor=1.0,
        backward_passes_per_step=1,
        average_aggregated_gradients=False,
        num_groups=0,
        global_batch_count_per_step=None,
    ):
        if name is None:
            name = "Distributed{}".format(type(optimizer).__name__)
        super(_DistributedOptimizer, self).__init__(name=name,
                                                    use_locking=use_locking)

        self._optimizer = optimizer
        self._allreduce_grads = hvd._make_allreduce_grads_fn(
            name,
            device_dense,
            device_sparse,
            compression,
            sparse_as_dense,
            op,
            gradient_predivide_factor,
            num_groups,
        )

        # *ElasticDL Update*: Always create LocalGradientAggregationHelper
        # for Elastic training with fixed global batch size.
        self._agg_helper = LocalGradientAggregationHelper(
            backward_passes_per_step=backward_passes_per_step,
            allreduce_func=self._allreduce_grads,
            sparse_as_dense=sparse_as_dense,
            average_aggregated_gradients=average_aggregated_gradients,
            rank=hvd.rank(),
            optimizer_type=LocalGradientAggregationHelper._OPTIMIZER_TYPE_LEGACY,
            global_batch_count_per_step=global_batch_count_per_step,
            op=op,
        )
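Example 5 is an ElasticDL variant of Horovod's wrapper for legacy (tf.compat.v1) optimizers; `global_batch_count_per_step` is the ElasticDL-specific addition. A minimal sketch of the stock pattern it corresponds to, under the assumption that the public hvd.DistributedOptimizer factory is used (the Adam optimizer and its hyperparameters are illustrative):

    import tensorflow as tf
    import horovod.tensorflow as hvd

    hvd.init()

    base = tf.compat.v1.train.AdamOptimizer(learning_rate=0.001 * hvd.size())
    # The public factory constructs a _DistributedOptimizer like the one above
    # (without the ElasticDL-specific global_batch_count_per_step argument).
    opt = hvd.DistributedOptimizer(base,
                                   op=hvd.Average,
                                   backward_passes_per_step=2,
                                   average_aggregated_gradients=True)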