Example #1
def DistributedOptimizer(optimizer,
                         name=None,
                         device_dense='',
                         device_sparse='',
                         compression=Compression.none,
                         sparse_as_dense=False,
                         gradient_predivide_factor=1.0,
                         op=Average,
                         num_groups=0):
    """
    An optimizer that wraps another keras.optimizers.Optimizer, using an allreduce to
    average gradient values before applying gradients to model weights.

    Args:
        optimizer: Optimizer to use for computing gradients and applying updates.
        name: Optional name prefix for the operations created when applying
              gradients. Defaults to "Distributed" followed by the provided
              optimizer type.
        device_dense: Device to be used for dense tensors. Uses GPU by default
                      if Horovod was built with HOROVOD_GPU_OPERATIONS.
        device_sparse: Device to be used for sparse tensors. Uses GPU by default
                       if Horovod was built with HOROVOD_GPU_OPERATIONS.
        compression: Compression algorithm used to reduce the amount of data
                     sent and received by each worker node.  Defaults to not
                     using compression.
        sparse_as_dense: Treat all sparse gradients as dense tensors.  This can
                         help improve performance and memory utilization if
                         the original sparse gradient has high density.
                         Defaults to false.
        gradient_predivide_factor: gradient_predivide_factor splits the averaging
                                   before and after the sum. Gradients are scaled by
                                   1.0 / gradient_predivide_factor before the sum and
                                   gradient_predivide_factor / size after the sum.
        op: The reduction operation to use when combining gradients across
            different ranks. Defaults to Average.
        num_groups: Number of groups to assign gradient allreduce ops to for explicit
                    grouping. Defaults to no explicit groups.
    """
    if gradient_predivide_factor != 1.0 and rocm_built():
        raise ValueError(
            'gradient_predivide_factor not supported yet with ROCm')

    if op != Average and op != Sum:
        raise ValueError('op currently only supports Average and Sum')

    return _impl.create_distributed_optimizer(
        keras=keras,
        optimizer=optimizer,
        name=name,
        device_dense=device_dense,
        device_sparse=device_sparse,
        compression=compression,
        sparse_as_dense=sparse_as_dense,
        gradient_predivide_factor=gradient_predivide_factor,
        op=op,
        num_groups=num_groups,
    )
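
A minimal usage sketch for this Keras wrapper, assuming Horovod is installed with TensorFlow support; the model, learning rate, and compression choice below are illustrative placeholders rather than part of the example above.

import tensorflow as tf
import horovod.tensorflow.keras as hvd

hvd.init()

# Scale the learning rate by the number of workers, the usual Horovod convention,
# since the default Average op divides the summed gradients by hvd.size().
base_opt = tf.keras.optimizers.SGD(learning_rate=0.01 * hvd.size())

# Wrap the optimizer; fp16 compression and a single allreduce group are
# illustrative choices, not requirements.
opt = hvd.DistributedOptimizer(base_opt,
                               compression=hvd.Compression.fp16,
                               num_groups=1)

model = tf.keras.Sequential([tf.keras.layers.Dense(10, input_shape=(20,))])
model.compile(optimizer=opt, loss='mse')

# Broadcast initial variable states from rank 0 so every worker starts identically.
callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
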
Example #2
def DistributedOptimizer(
    optimizer,
    name=None,
    use_locking=False,
    device_dense="",
    device_sparse="",
    compression=hvd.Compression.none,
    sparse_as_dense=False,
    backward_passes_per_step=1,
    op=hvd.Average,
    gradient_predivide_factor=1.0,
    average_aggregated_gradients=False,
    num_groups=0,
    fixed_global_batch_size=False,
    hvd_max_size=None,
):
    """Construct a new DistributedOptimizer, which uses another optimizer
    under the hood for computing single-process gradient values and
    applying gradient updates after the gradient values have been combined
    across all the Horovod ranks.

    Args:
      optimizer:
        Optimizer to use for computing gradients and applying updates.
      name:
        Optional name prefix for the operations created when applying
        gradients. Defaults to "Distributed" followed by the provided
        optimizer type.
      use_locking:
        Whether to use locking when updating variables.
        See Optimizer.__init__ for more info.
      device_dense:
        Device to be used for dense tensors. Uses GPU by default
        if Horovod was built with HOROVOD_GPU_OPERATIONS.
      device_sparse:
        Device to be used for sparse tensors. Uses GPU by default
        if Horovod was built with HOROVOD_GPU_OPERATIONS.
      compression:
        Compression algorithm used during allreduce to reduce the amount
        of data sent during each parameter update step.  Defaults to
        not using compression.
      sparse_as_dense:
        Treat all sparse gradients as dense tensors.  This can help improve
        performance and memory utilization if the original sparse gradient
        has high density.  Defaults to false.
      backward_passes_per_step:
        Number of backward passes to perform before calling hvd.allreduce.
        This allows accumulating updates over multiple mini-batches before
        reducing and applying them.
      op:
        The reduction operation to use when combining gradients across
        different ranks.
      gradient_predivide_factor:
        If op == Average, gradient_predivide_factor splits the averaging
        before and after the sum. Gradients are scaled by
        1.0 / gradient_predivide_factor before the sum and
        gradient_predivide_factor / size after the sum.
      average_aggregated_gradients:
        Whether to average the aggregated gradients that have been accumulated
        over multiple mini-batches. If true, divides gradient updates by
        backward_passes_per_step. Only applicable for
        backward_passes_per_step > 1.
      num_groups:
        Number of groups to assign gradient allreduce ops to for explicit
        grouping. Defaults to no explicit groups.
      fixed_global_batch_size:
        Whether to keep the global batch size fixed even though the number of
        workers changes during elastic execution.
      hvd_max_size:
        The maximum Horovod size for elastic training.
    """

    # *ElasticDL Update*: If `fixed_global_batch_size` == False,
    # just fallback to the native horovod DistributedOptimizer.
    if not fixed_global_batch_size:
        return hvd.DistributedOptimizer(
            optimizer=optimizer,
            name=name,
            use_locking=use_locking,
            device_dense=device_dense,
            device_sparse=device_sparse,
            compression=compression,
            sparse_as_dense=sparse_as_dense,
            backward_passes_per_step=backward_passes_per_step,
            op=op,
            gradient_predivide_factor=gradient_predivide_factor,
            average_aggregated_gradients=average_aggregated_gradients,
            num_groups=num_groups,
        )

    if gradient_predivide_factor != 1.0:
        if hvd.rocm_built():
            raise ValueError(
                "gradient_predivide_factor not supported yet with ROCm")
        if op != hvd.Average:
            raise ValueError(
                "gradient_predivide_factor not supported with op != Average")

    if op == hvd.Adasum and average_aggregated_gradients:
        raise ValueError(
            "Adasum does not support average_aggregated_gradients == True")

    if isinstance(optimizer, _LegacyOptimizer):
        if op == hvd.Adasum:
            raise ValueError(
                """op == Adasum and fixed_global_batch_size == True is
                not yet supported""")

        hvd_max_size = complement_value_from_env_if_none(
            hvd_max_size, "WORKER_NUM", int, 1)
        global_batch_count_per_step = hvd_max_size * backward_passes_per_step
        opt = _DistributedOptimizer(
            optimizer=optimizer,
            name=name,
            use_locking=use_locking,
            device_dense=device_dense,
            device_sparse=device_sparse,
            compression=compression,
            sparse_as_dense=sparse_as_dense,
            op=op,
            gradient_predivide_factor=gradient_predivide_factor,
            backward_passes_per_step=backward_passes_per_step,
            average_aggregated_gradients=average_aggregated_gradients,
            num_groups=num_groups,
            global_batch_count_per_step=global_batch_count_per_step,
        )
        optimizer_instances.append(opt)
        return opt
    elif isinstance(optimizer, tf.keras.optimizers.Optimizer):
        raise ValueError(
            "fixed_global_batch_size == True is not supported yet with Keras")
    else:
        raise ValueError(
            "Provided optimizer doesn't inherit from either legacy "
            "TensorFlow or Keras optimizer: %s" % optimizer)
Example #3
def DistributedOptimizer(optimizer,
                         name=None,
                         device_dense='',
                         device_sparse='',
                         compression=Compression.none,
                         sparse_as_dense=False,
                         gradient_predivide_factor=1.0,
                         op=Average,
                         backward_passes_per_step=1,
                         average_aggregated_gradients=False):
    """
    An optimizer that wraps another keras.optimizers.Optimizer, using an allreduce to
    average gradient values before applying gradients to model weights.

    Args:
        optimizer: Optimizer to use for computing gradients and applying updates.
        name: Optional name prefix for the operations created when applying
              gradients. Defaults to "Distributed" followed by the provided
              optimizer type.
        device_dense: Device to be used for dense tensors. Uses GPU by default
                      if Horovod was built with HOROVOD_GPU_OPERATIONS.
        device_sparse: Device to be used for sparse tensors. Uses GPU by default
                       if Horovod was built with HOROVOD_GPU_OPERATIONS.
        compression: Compression algorithm used to reduce the amount of data
                     sent and received by each worker node.  Defaults to not
                     using compression.
        sparse_as_dense: Treat all sparse gradients as dense tensors.  This can
                         help improve performance and memory utilization if
                         the original sparse gradient has high density.
                         Defaults to false.
        gradient_predivide_factor: gradient_predivide_factor splits the averaging
                                   before and after the sum. Gradients are scaled by
                                   1.0 / gradient_predivide_factor before the sum and
                                   gradient_predivide_factor / size after the sum.
        op: The reduction operation to use when combining gradients across
            different ranks. Defaults to Average.
        backward_passes_per_step: Number of backward passes to perform before calling
                                  hvd.allreduce. This allows accumulating updates over
                                  multiple mini-batches before reducing and applying them.
        average_aggregated_gradients: Whether to average the aggregated gradients that
                                      have been accumulated over multiple mini-batches.
                                      If true divides gradient updates by
                                      backward_passes_per_step.
                                      Only applicable for backward_passes_per_step > 1.
    """
    if gradient_predivide_factor != 1.0 and rocm_built():
        raise ValueError(
            'gradient_predivide_factor not supported yet with ROCm')

    if op != Average and op != Sum:
        raise ValueError('op currently only supports Average and Sum')

    return _impl.create_distributed_optimizer(
        keras=keras,
        optimizer=optimizer,
        name=name,
        device_dense=device_dense,
        device_sparse=device_sparse,
        compression=compression,
        sparse_as_dense=sparse_as_dense,
        gradient_predivide_factor=gradient_predivide_factor,
        op=op,
        backward_passes_per_step=backward_passes_per_step,
        average_aggregated_gradients=average_aggregated_gradients,
    )
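
A short sketch of gradient accumulation with this wrapper, assuming the standard horovod.tensorflow.keras entry point; the model and hyperparameter values are placeholders.

import tensorflow as tf
import horovod.tensorflow.keras as hvd

hvd.init()

opt = hvd.DistributedOptimizer(
    tf.keras.optimizers.Adam(learning_rate=0.001),
    backward_passes_per_step=4,          # allreduce only every fourth backward pass
    average_aggregated_gradients=True,   # divide the accumulated gradients by 4
)

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(8,))])
model.compile(optimizer=opt, loss='mse')
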
Example #4
def DistributedOptimizer(optimizer,
                         name=None,
                         device_dense='',
                         device_sparse='',
                         compression=Compression.none,
                         sparse_as_dense=False,
                         gradient_predivide_factor=1.0,
                         op=Average,
                         backward_passes_per_step=1,
                         average_aggregated_gradients=False,
                         num_groups=0,
                         groups=None,
                         process_set=global_process_set):
    """
    An optimizer that wraps another keras.optimizers.Optimizer, using an allreduce to
    average gradient values before applying gradients to model weights.

    Args:
        optimizer: Optimizer to use for computing gradients and applying updates.
        name: Optional name prefix for the operations created when applying
              gradients. Defaults to "Distributed" followed by the provided
              optimizer type.
        device_dense: Device to be used for dense tensors. Uses GPU by default
                      if Horovod was built with HOROVOD_GPU_OPERATIONS.
        device_sparse: Device to be used for sparse tensors. Uses GPU by default
                       if Horovod was built with HOROVOD_GPU_OPERATIONS.
        compression: Compression algorithm used to reduce the amount of data
                     sent and received by each worker node.  Defaults to not
                     using compression.
        sparse_as_dense: Treat all sparse gradients as dense tensors.  This can
                         help improve performance and memory utilization if
                         the original sparse gradient has high density.
                         Defaults to false.
        gradient_predivide_factor: gradient_predivide_factor splits the averaging
                                   before and after the sum. Gradients are scaled by
                                   1.0 / gradient_predivide_factor before the sum and
                                   gradient_predivide_factor / size after the sum.
        op: The reduction operation to use when combining gradients across
            different ranks. Defaults to Average.
        backward_passes_per_step: Number of backward passes to perform before calling
                                  hvd.allreduce. This allows accumulating updates over
                                  multiple mini-batches before reducing and applying them.
        average_aggregated_gradients: Whether to average the aggregated gradients that
                                      have been accumulated over multiple mini-batches.
                                      If true divides gradient updates by
                                      backward_passes_per_step.
                                      Only applicable for backward_passes_per_step > 1.
        num_groups: Number of groups to assign gradient allreduce ops to for explicit
                    grouping. Defaults to no explicit groups.
        groups: The parameter to group the gradient allreduce ops. Accepted values
                are a non-negative integer or a list of lists of tf.Variable.
                If groups is a non-negative integer, it is the number of groups to
                assign gradient allreduce ops to for explicit grouping.
                If groups is a list of lists of tf.Variable, variables in the same
                inner list will be assigned to the same group, while any parameter
                that does not appear in any list will form a group by itself.
                Defaults to None, which means no explicit groups.
        process_set: Gradients will only be reduced over Horovod processes belonging
                     to this process set. Defaults to the global process set.
    """
    if gradient_predivide_factor != 1.0 and rocm_built():
        raise ValueError(
            'gradient_predivide_factor not supported yet with ROCm')

    if op != Average and op != Sum:
        raise ValueError('op currently only supports Average and Sum')

    if num_groups != 0:
        warnings.warn(
            'Parameter `num_groups` has been replaced by `groups` '
            'and will be removed in v0.23.0.', DeprecationWarning)
        if groups is None:
            groups = num_groups

    if groups is not None:
        if not (isinstance(groups, list) or groups > 0):
            raise ValueError('groups should be a non-negative integer or '
                             'a list of list of tf.Variable.')

    return _impl.create_distributed_optimizer(
        keras=keras,
        optimizer=optimizer,
        name=name,
        device_dense=device_dense,
        device_sparse=device_sparse,
        compression=compression,
        sparse_as_dense=sparse_as_dense,
        gradient_predivide_factor=gradient_predivide_factor,
        op=op,
        backward_passes_per_step=backward_passes_per_step,
        average_aggregated_gradients=average_aggregated_gradients,
        groups=groups,
        process_set=process_set,
    )
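
A hedged sketch of explicit allreduce grouping via the groups argument; the two-layer model and the particular grouping are illustrative assumptions, not part of the example above.

import tensorflow as tf
import horovod.tensorflow.keras as hvd

hvd.init()

model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', name='hidden'),
    tf.keras.layers.Dense(10, name='logits'),
])
model.build(input_shape=(None, 32))

# Variables in the same inner list are fused into one allreduce group;
# passing a non-negative integer instead would request that many groups.
opt = hvd.DistributedOptimizer(
    tf.keras.optimizers.SGD(learning_rate=0.01 * hvd.size()),
    groups=[model.layers[0].trainable_weights,
            model.layers[1].trainable_weights],
)

model.compile(optimizer=opt, loss='sparse_categorical_crossentropy')
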