Example #1
def DistributedOptimizer(optimizer,
                         name=None,
                         device_dense='',
                         device_sparse='',
                         compression=Compression.none,
                         sparse_as_dense=False,
                         gradient_predivide_factor=1.0,
                         op=Average,
                         num_groups=0):
    """
    An optimizer that wraps another keras.optimizers.Optimizer, using an allreduce to
    average gradient values before applying gradients to model weights.

    Args:
        optimizer: Optimizer to use for computing gradients and applying updates.
        name: Optional name prefix for the operations created when applying
              gradients. Defaults to "Distributed" followed by the provided
              optimizer type.
        device_dense: Device to be used for dense tensors. Uses GPU by default
                      if Horovod was built with HOROVOD_GPU_OPERATIONS.
        device_sparse: Device to be used for sparse tensors. Uses GPU by default
                       if Horovod was built with HOROVOD_GPU_OPERATIONS.
        compression: Compression algorithm used to reduce the amount of data
                     sent and received by each worker node.  Defaults to not
                     using compression.
        sparse_as_dense: Treat all sparse gradients as dense tensors.  This can
                         help improve performance and memory utilization if
                         the original sparse gradient has high density.
                         Defaults to false.
        gradient_predivide_factor: gradient_predivide_factor splits the averaging
                                   before and after the sum. Gradients are scaled by
                                   1.0 / gradient_predivide_factor before the sum and
                                   gradient_predivide_factor / size after the sum.
        op: The reduction operation to use when combining gradients across
            different ranks. Defaults to Average.
        num_groups: Number of groups to assign gradient allreduce ops to for explicit
                    grouping. Defaults to no explicit groups.
    """
    if gradient_predivide_factor != 1.0 and rocm_built():
        raise ValueError(
            'gradient_predivide_factor not supported yet with ROCm')

    if op != Average and op != Sum:
        raise ValueError('op currently only supports Average and Sum')

    return _impl.create_distributed_optimizer(
        keras=keras,
        optimizer=optimizer,
        name=name,
        device_dense=device_dense,
        device_sparse=device_sparse,
        compression=compression,
        sparse_as_dense=sparse_as_dense,
        gradient_predivide_factor=gradient_predivide_factor,
        op=op,
        num_groups=num_groups,
    )
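
A minimal usage sketch for this Keras wrapper, assuming Horovod is installed with TensorFlow support; the model, learning rate, and compression choice below are illustrative placeholders rather than part of the example above.

import tensorflow as tf
import horovod.tensorflow.keras as hvd

hvd.init()

# Scale the learning rate by the number of workers, the usual Horovod convention,
# since the default Average op divides the summed gradients by hvd.size().
base_opt = tf.keras.optimizers.SGD(learning_rate=0.01 * hvd.size())

# Wrap the optimizer; fp16 compression and a single allreduce group are
# illustrative choices, not requirements.
opt = hvd.DistributedOptimizer(base_opt,
                               compression=hvd.Compression.fp16,
                               num_groups=1)

model = tf.keras.Sequential([tf.keras.layers.Dense(10, input_shape=(20,))])
model.compile(optimizer=opt, loss='mse')

# Broadcast initial variable states from rank 0 so every worker starts identically.
callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
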
Example #2
def DistributedOptimizer(
    optimizer,
    name=None,
    use_locking=False,
    device_dense="",
    device_sparse="",
    compression=hvd.Compression.none,
    sparse_as_dense=False,
    backward_passes_per_step=1,
    op=hvd.Average,
    gradient_predivide_factor=1.0,
    average_aggregated_gradients=False,
    num_groups=0,
    fixed_global_batch_size=False,
    hvd_max_size=None,
):
    """Construct a new DistributedOptimizer, which uses another optimizer
    under the hood for computing single-process gradient values and
    applying gradient updates after the gradient values have been combined
    across all the Horovod ranks.

    Args:
      optimizer:
        Optimizer to use for computing gradients and applying updates.
      name:
        Optional name prefix for the operations created when applying
        gradients. Defaults to "Distributed" followed by the provided
        optimizer type.
      use_locking:
        Whether to use locking when updating variables.
        See Optimizer.__init__ for more info.
      device_dense:
        Device to be used for dense tensors. Uses GPU by default
        if Horovod was built with HOROVOD_GPU_OPERATIONS.
      device_sparse:
        Device to be used for sparse tensors. Uses GPU by default
        if Horovod was built with HOROVOD_GPU_OPERATIONS.
      compression:
        Compression algorithm used during allreduce to reduce the amount
        of data sent during each parameter update step.  Defaults to
        not using compression.
      sparse_as_dense:
        Treat all sparse gradients as dense tensors.  This can help improve
        performance and memory utilization if the original sparse gradient
        has high density.  Defaults to false.
      backward_passes_per_step:
        Number of backward passes to perform before calling hvd.allreduce.
        This allows accumulating updates over multiple mini-batches before
        reducing and applying them.
      op:
        The reduction operation to use when combining gradients across
        different ranks.
      gradient_predivide_factor:
        If op == Average, gradient_predivide_factor splits the averaging
        before and after the sum. Gradients are scaled by
        1.0 / gradient_predivide_factor before the sum and
        gradient_predivide_factor / size after the sum.
      average_aggregated_gradients:
        Whether to average the aggregated gradients that have been accumulated
        over multiple mini-batches. If true, divides gradient updates by
        backward_passes_per_step. Only applicable for
        backward_passes_per_step > 1.
      num_groups:
        Number of groups to assign gradient allreduce ops to for explicit
        grouping. Defaults to no explicit groups.
      fixed_global_batch_size:
        Whether to keep the global batch size fixed even though the number of
        workers changes during elastic execution.
      hvd_max_size:
        The maximum Horovod size for elastic training.
    """

    # *ElasticDL Update*: If `fixed_global_batch_size` == False,
    # just fallback to the native horovod DistributedOptimizer.
    if not fixed_global_batch_size:
        return hvd.DistributedOptimizer(
            optimizer=optimizer,
            name=name,
            use_locking=use_locking,
            device_dense=device_dense,
            device_sparse=device_sparse,
            compression=compression,
            sparse_as_dense=sparse_as_dense,
            backward_passes_per_step=backward_passes_per_step,
            op=op,
            gradient_predivide_factor=gradient_predivide_factor,
            average_aggregated_gradients=average_aggregated_gradients,
            num_groups=num_groups,
        )

    if gradient_predivide_factor != 1.0:
        if hvd.rocm_built():
            raise ValueError(
                "gradient_predivide_factor not supported yet with ROCm")
        if op != hvd.Average:
            raise ValueError(
                "gradient_predivide_factor not supported with op != Average")

    if op == hvd.Adasum and average_aggregated_gradients:
        raise ValueError(
            "Adasum does not support average_aggregated_gradients == True")

    if isinstance(optimizer, _LegacyOptimizer):
        if op == hvd.Adasum:
            raise ValueError(
                """op == Adasum and fixed_global_batch_size == True is
                not yet supported""")

        hvd_max_size = complement_value_from_env_if_none(
            hvd_max_size, "WORKER_NUM", int, 1)
        global_batch_count_per_step = hvd_max_size * backward_passes_per_step
        opt = _DistributedOptimizer(
            optimizer=optimizer,
            name=name,
            use_locking=use_locking,
            device_dense=device_dense,
            device_sparse=device_sparse,
            compression=compression,
            sparse_as_dense=sparse_as_dense,
            op=op,
            gradient_predivide_factor=gradient_predivide_factor,
            backward_passes_per_step=backward_passes_per_step,
            average_aggregated_gradients=average_aggregated_gradients,
            num_groups=num_groups,
            global_batch_count_per_step=global_batch_count_per_step,
        )
        optimizer_instances.append(opt)
        return opt
    elif isinstance(optimizer, tf.keras.optimizers.Optimizer):
        raise ValueError(
            "fixed_global_batch_size == True is not supported yet with Keras")
    else:
        raise ValueError(
            "Provided optimizer doesn't inherit from either legacy "
            "TensorFlow or Keras optimizer: %s" % optimizer)
Example #3
def DistributedOptimizer(optimizer,
                         name=None,
                         device_dense='',
                         device_sparse='',
                         compression=Compression.none,
                         sparse_as_dense=False,
                         gradient_predivide_factor=1.0,
                         op=Average,
                         backward_passes_per_step=1,
                         average_aggregated_gradients=False):
    """
    An optimizer that wraps another keras.optimizers.Optimizer, using an allreduce to
    average gradient values before applying gradients to model weights.

    Args:
        optimizer: Optimizer to use for computing gradients and applying updates.
        name: Optional name prefix for the operations created when applying
              gradients. Defaults to "Distributed" followed by the provided
              optimizer type.
        device_dense: Device to be used for dense tensors. Uses GPU by default
                      if Horovod was built with HOROVOD_GPU_OPERATIONS.
        device_sparse: Device to be used for sparse tensors. Uses GPU by default
                       if Horovod was built with HOROVOD_GPU_OPERATIONS.
        compression: Compression algorithm used to reduce the amount of data
                     sent and received by each worker node.  Defaults to not
                     using compression.
        sparse_as_dense: Treat all sparse gradients as dense tensors.  This can
                         help improve performance and memory utilization if
                         the original sparse gradient has high density.
                         Defaults to false.
        gradient_predivide_factor: gradient_predivide_factor splits the averaging
                                   before and after the sum. Gradients are scaled by
                                   1.0 / gradient_predivide_factor before the sum and
                                   gradient_predivide_factor / size after the sum.
        op: The reduction operation to use when combining gradients across
            different ranks. Defaults to Average.
        backward_passes_per_step: Number of backward passes to perform before calling
                                  hvd.allreduce. This allows accumulating updates over
                                  multiple mini-batches before reducing and applying them.
        average_aggregated_gradients: Whether to average the aggregated gradients that
                                      have been accumulated over multiple mini-batches.
                                      If true divides gradient updates by
                                      backward_passes_per_step.
                                      Only applicable for backward_passes_per_step > 1.
    """
    if gradient_predivide_factor != 1.0 and rocm_built():
        raise ValueError(
            'gradient_predivide_factor not supported yet with ROCm')

    if op != Average and op != Sum:
        raise ValueError('op currently only supports Average and Sum')

    return _impl.create_distributed_optimizer(
        keras=keras,
        optimizer=optimizer,
        name=name,
        device_dense=device_dense,
        device_sparse=device_sparse,
        compression=compression,
        sparse_as_dense=sparse_as_dense,
        gradient_predivide_factor=gradient_predivide_factor,
        op=op,
        backward_passes_per_step=backward_passes_per_step,
        average_aggregated_gradients=average_aggregated_gradients,
    )
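
A short sketch of gradient accumulation with this wrapper, assuming the standard horovod.tensorflow.keras entry point; the model and hyperparameter values are placeholders.

import tensorflow as tf
import horovod.tensorflow.keras as hvd

hvd.init()

opt = hvd.DistributedOptimizer(
    tf.keras.optimizers.Adam(learning_rate=0.001),
    backward_passes_per_step=4,          # allreduce only every fourth backward pass
    average_aggregated_gradients=True,   # divide the accumulated gradients by 4
)

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(8,))])
model.compile(optimizer=opt, loss='mse')
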
Example #4
def DistributedOptimizer(optimizer,
                         name=None,
                         device_dense='',
                         device_sparse='',
                         compression=Compression.none,
                         sparse_as_dense=False,
                         gradient_predivide_factor=1.0,
                         op=Average,
                         backward_passes_per_step=1,
                         average_aggregated_gradients=False,
                         num_groups=0,
                         groups=None,
                         process_set=global_process_set):
    """
    An optimizer that wraps another keras.optimizers.Optimizer, using an allreduce to
    average gradient values before applying gradients to model weights.

    Args:
        optimizer: Optimizer to use for computing gradients and applying updates.
        name: Optional name prefix for the operations created when applying
              gradients. Defaults to "Distributed" followed by the provided
              optimizer type.
        device_dense: Device to be used for dense tensors. Uses GPU by default
                      if Horovod was built with HOROVOD_GPU_OPERATIONS.
        device_sparse: Device to be used for sparse tensors. Uses GPU by default
                       if Horovod was built with HOROVOD_GPU_OPERATIONS.
        compression: Compression algorithm used to reduce the amount of data
                     sent and received by each worker node.  Defaults to not
                     using compression.
        sparse_as_dense: Treat all sparse gradients as dense tensors.  This can
                         help improve performance and memory utilization if
                         the original sparse gradient has high density.
                         Defaults to false.
        gradient_predivide_factor: gradient_predivide_factor splits the averaging
                                   before and after the sum. Gradients are scaled by
                                   1.0 / gradient_predivide_factor before the sum and
                                   gradient_predivide_factor / size after the sum.
        op: The reduction operation to use when combining gradients across
            different ranks. Defaults to Average.
        backward_passes_per_step: Number of backward passes to perform before calling
                                  hvd.allreduce. This allows accumulating updates over
                                  multiple mini-batches before reducing and applying them.
        average_aggregated_gradients: Whether to average the aggregated gradients that
                                      have been accumulated over multiple mini-batches.
                                      If true divides gradient updates by
                                      backward_passes_per_step.
                                      Only applicable for backward_passes_per_step > 1.
        num_groups: Number of groups to assign gradient allreduce ops to for explicit
                    grouping. Defaults to no explicit groups.
        groups: The parameter to group the gradient allreduce ops. Accepted values
                are a non-negative integer or a list of lists of tf.Variable.
                If groups is a non-negative integer, it is the number of groups to
                assign gradient allreduce ops to for explicit grouping.
                If groups is a list of lists of tf.Variable, variables in the same
                inner list will be assigned to the same group, while any parameter
                that does not appear in any list will form a group by itself.
                Defaults to None, which means no explicit groups.
        process_set: Gradients will only be reduced over Horovod processes belonging
                     to this process set. Defaults to the global process set.
    """
    if gradient_predivide_factor != 1.0 and rocm_built():
        raise ValueError(
            'gradient_predivide_factor not supported yet with ROCm')

    if op != Average and op != Sum:
        raise ValueError('op currently only supports Average and Sum')

    if num_groups != 0:
        warnings.warn(
            'Parameter `num_groups` has been replaced by `groups` '
            'and will be removed in v0.23.0.', DeprecationWarning)
        if groups is None:
            groups = num_groups

    if groups is not None:
        if not (isinstance(groups, list) or groups > 0):
            raise ValueError('groups should be a non-negative integer or '
                             'a list of list of tf.Variable.')

    return _impl.create_distributed_optimizer(
        keras=keras,
        optimizer=optimizer,
        name=name,
        device_dense=device_dense,
        device_sparse=device_sparse,
        compression=compression,
        sparse_as_dense=sparse_as_dense,
        gradient_predivide_factor=gradient_predivide_factor,
        op=op,
        backward_passes_per_step=backward_passes_per_step,
        average_aggregated_gradients=average_aggregated_gradients,
        groups=groups,
        process_set=process_set,
    )
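
A hedged sketch of explicit allreduce grouping via the groups argument; the two-layer model and the particular grouping are illustrative assumptions, not part of the example above.

import tensorflow as tf
import horovod.tensorflow.keras as hvd

hvd.init()

model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', name='hidden'),
    tf.keras.layers.Dense(10, name='logits'),
])
model.build(input_shape=(None, 32))

# Variables in the same inner list are fused into one allreduce group;
# passing a non-negative integer instead would request that many groups.
opt = hvd.DistributedOptimizer(
    tf.keras.optimizers.SGD(learning_rate=0.01 * hvd.size()),
    groups=[model.layers[0].trainable_weights,
            model.layers[1].trainable_weights],
)

model.compile(optimizer=opt, loss='sparse_categorical_crossentropy')
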