Python create_distributed_optimizer Beispiele, horovod._keras.create_distributed_optimizer Python Beispiele

Beispiel #1

0

Datei anzeigen

def DistributedOptimizer(optimizer,
                         name=None,
                         device_dense='',
                         device_sparse='',
                         compression=Compression.none,
                         sparse_as_dense=False):
    """
    Wrap a keras.optimizers.Optimizer so that gradient values are averaged
    with an allreduce before they are applied to the model weights.

    Args:
        optimizer: The optimizer that computes gradients and applies updates.
        name: Optional prefix for the names of the operations created when
              gradients are applied. When omitted, "Distributed" followed by
              the type of the provided optimizer is used.
        device_dense: Device used for dense tensors. The GPU is used by
                      default if Horovod was built with
                      HOROVOD_GPU_ALLREDUCE.
        device_sparse: Device used for sparse tensors. The GPU is used by
                       default if Horovod was built with
                       HOROVOD_GPU_ALLGATHER.
        compression: Compression algorithm that reduces the amount of data
                     each worker node sends and receives. No compression is
                     applied by default.
        sparse_as_dense: When true, every sparse gradient is handled as a
                         dense tensor, which can improve performance and
                         memory utilization if the original sparse gradient
                         has high density. Defaults to false.

    Returns:
        The result of ``_impl.create_distributed_optimizer`` for the given
        arguments.
    """
    # The implementation helper receives the keras module first, followed by
    # the caller's arguments in declaration order.
    forwarded = (keras, optimizer, name, device_dense, device_sparse,
                 compression, sparse_as_dense)
    return _impl.create_distributed_optimizer(*forwarded)

Beispiel #2

0

Datei anzeigen

Datei: __init__.py Projekt: zuston/horovod

def DistributedOptimizer(optimizer,
                         name=None,
                         device_dense='',
                         device_sparse='',
                         compression=Compression.none,
                         sparse_as_dense=False,
                         gradient_predivide_factor=1.0,
                         op=Average,
                         num_groups=0):
    """
    Wrap a keras.optimizers.Optimizer so that gradient values are combined
    with an allreduce before they are applied to the model weights.

    Args:
        optimizer: The optimizer that computes gradients and applies updates.
        name: Optional prefix for the names of the operations created when
              gradients are applied. When omitted, "Distributed" followed by
              the type of the provided optimizer is used.
        device_dense: Device used for dense tensors. The GPU is used by
                      default if Horovod was built with
                      HOROVOD_GPU_OPERATIONS.
        device_sparse: Device used for sparse tensors. The GPU is used by
                       default if Horovod was built with
                       HOROVOD_GPU_OPERATIONS.
        compression: Compression algorithm that reduces the amount of data
                     each worker node sends and receives. No compression is
                     applied by default.
        sparse_as_dense: When true, every sparse gradient is handled as a
                         dense tensor, which can improve performance and
                         memory utilization if the original sparse gradient
                         has high density. Defaults to false.
        gradient_predivide_factor: Splits the averaging into a part before
                                   and a part after the sum: gradients are
                                   scaled by 1.0 / gradient_predivide_factor
                                   before the sum and by
                                   gradient_predivide_factor / size after it.
        op: Reduction operation used to combine gradients across ranks.
            Defaults to Average.
        num_groups: Number of groups that gradient allreduce ops are assigned
                    to for explicit grouping. Defaults to no explicit groups.

    Returns:
        The result of ``_impl.create_distributed_optimizer`` for the given
        arguments.

    Raises:
        ValueError: If gradient_predivide_factor differs from 1.0 while
            ``rocm_built()`` is true, or if op is neither Average nor Sum.
    """
    # rocm_built() is only consulted when a non-default factor was requested.
    custom_predivide = gradient_predivide_factor != 1.0
    if custom_predivide and rocm_built():
        raise ValueError(
            'gradient_predivide_factor not supported yet with ROCm')

    unsupported_op = op != Average and op != Sum
    if unsupported_op:
        raise ValueError('op currently only supports Average and Sum')

    forwarded = dict(
        keras=keras,
        optimizer=optimizer,
        name=name,
        device_dense=device_dense,
        device_sparse=device_sparse,
        compression=compression,
        sparse_as_dense=sparse_as_dense,
        gradient_predivide_factor=gradient_predivide_factor,
        op=op,
        num_groups=num_groups,
    )
    return _impl.create_distributed_optimizer(**forwarded)

Beispiel #3

0

Datei anzeigen

def DistributedOptimizer(optimizer,
                         name=None,
                         device_dense='',
                         device_sparse='',
                         compression=Compression.none,
                         sparse_as_dense=False,
                         aggregation_frequency=1,
                         grad_updated_sizes_dict=None,
                         profile_frequency=0,
                         profile_filename=None,
                         average_aggregated_gradients=False):
    """
    Wrap a keras.optimizers.Optimizer so that gradient values are averaged
    with an allreduce before they are applied to the model weights.

    Args:
        optimizer: The optimizer that computes gradients and applies updates.
        name: Optional prefix for the names of the operations created when
              gradients are applied. When omitted, "Distributed" followed by
              the type of the provided optimizer is used.
        device_dense: Device used for dense tensors. The GPU is used by
                      default if Horovod was built with
                      HOROVOD_GPU_ALLREDUCE.
        device_sparse: Device used for sparse tensors. The GPU is used by
                       default if Horovod was built with
                       HOROVOD_GPU_ALLGATHER.
        compression: Compression algorithm that reduces the amount of data
                     each worker node sends and receives. No compression is
                     applied by default.
        sparse_as_dense: When true, every sparse gradient is handled as a
                         dense tensor, which can improve performance and
                         memory utilization if the original sparse gradient
                         has high density. Defaults to false.
        aggregation_frequency: Number of batches over which gradients are
                               aggregated before they are averaged with
                               allreduce.
        grad_updated_sizes_dict: Dictionary holding the shape of each update
                                 grad.
        profile_frequency: How often, counted in batches, the communication
                           time of the batch is profiled.
        profile_filename: Name of the file that profiling logs are written to.
        average_aggregated_gradients: Whether the aggregated gradients are
                                      averaged across the iterations. Only
                                      possible for aggregation_frequency > 1.

    Returns:
        The result of ``_impl.create_distributed_optimizer`` for the given
        arguments.
    """
    forwarded = dict(
        keras=keras,
        optimizer=optimizer,
        name=name,
        device_dense=device_dense,
        device_sparse=device_sparse,
        compression=compression,
        sparse_as_dense=sparse_as_dense,
        aggregation_frequency=aggregation_frequency,
        grad_updated_sizes_dict=grad_updated_sizes_dict,
        profile_frequency=profile_frequency,
        profile_filename=profile_filename,
        average_aggregated_gradients=average_aggregated_gradients,
    )
    return _impl.create_distributed_optimizer(**forwarded)

Beispiel #4

0

Datei anzeigen

Datei: __init__.py Projekt: zuston/horovod

def DistributedOptimizer(optimizer,
                         name=None,
                         device_dense='',
                         device_sparse='',
                         compression=Compression.none,
                         sparse_as_dense=False,
                         gradient_predivide_factor=1.0,
                         op=Average,
                         backward_passes_per_step=1,
                         average_aggregated_gradients=False):
    """
    Wrap a keras.optimizers.Optimizer so that gradient values are combined
    with an allreduce before they are applied to the model weights.

    Args:
        optimizer: The optimizer that computes gradients and applies updates.
        name: Optional prefix for the names of the operations created when
              gradients are applied. When omitted, "Distributed" followed by
              the type of the provided optimizer is used.
        device_dense: Device used for dense tensors. The GPU is used by
                      default if Horovod was built with
                      HOROVOD_GPU_OPERATIONS.
        device_sparse: Device used for sparse tensors. The GPU is used by
                       default if Horovod was built with
                       HOROVOD_GPU_OPERATIONS.
        compression: Compression algorithm that reduces the amount of data
                     each worker node sends and receives. No compression is
                     applied by default.
        sparse_as_dense: When true, every sparse gradient is handled as a
                         dense tensor, which can improve performance and
                         memory utilization if the original sparse gradient
                         has high density. Defaults to false.
        gradient_predivide_factor: Splits the averaging into a part before
                                   and a part after the sum: gradients are
                                   scaled by 1.0 / gradient_predivide_factor
                                   before the sum and by
                                   gradient_predivide_factor / size after it.
        op: Reduction operation used to combine gradients across ranks.
            Defaults to Average.
        backward_passes_per_step: Number of backward passes performed before
                                  hvd.allreduce is called, so that updates
                                  accumulate over multiple mini-batches
                                  before being reduced and applied.
        average_aggregated_gradients: Whether gradients accumulated over
                                      multiple mini-batches are averaged. If
                                      true, gradient updates are divided by
                                      backward_passes_per_step. Only
                                      applicable for
                                      backward_passes_per_step > 1.

    Returns:
        The result of ``_impl.create_distributed_optimizer`` for the given
        arguments.

    Raises:
        ValueError: If gradient_predivide_factor differs from 1.0 while
            ``rocm_built()`` is true, or if op is neither Average nor Sum.
    """
    # rocm_built() is only consulted when a non-default factor was requested.
    custom_predivide = gradient_predivide_factor != 1.0
    if custom_predivide and rocm_built():
        raise ValueError(
            'gradient_predivide_factor not supported yet with ROCm')

    unsupported_op = op != Average and op != Sum
    if unsupported_op:
        raise ValueError('op currently only supports Average and Sum')

    forwarded = dict(
        keras=keras,
        optimizer=optimizer,
        name=name,
        device_dense=device_dense,
        device_sparse=device_sparse,
        compression=compression,
        sparse_as_dense=sparse_as_dense,
        gradient_predivide_factor=gradient_predivide_factor,
        op=op,
        backward_passes_per_step=backward_passes_per_step,
        average_aggregated_gradients=average_aggregated_gradients,
    )
    return _impl.create_distributed_optimizer(**forwarded)

Beispiel #5

0

Datei anzeigen

Datei: __init__.py Projekt: rongou/horovod

def DistributedOptimizer(optimizer,
                         name=None,
                         device_dense='',
                         device_sparse='',
                         compression=Compression.none,
                         sparse_as_dense=False,
                         gradient_predivide_factor=1.0,
                         op=Average,
                         backward_passes_per_step=1,
                         average_aggregated_gradients=False,
                         num_groups=0,
                         groups=None,
                         process_set=global_process_set):
    """
    Wrap a keras.optimizers.Optimizer so that gradient values are combined
    with an allreduce before they are applied to the model weights.

    Args:
        optimizer: The optimizer that computes gradients and applies updates.
        name: Optional prefix for the names of the operations created when
              gradients are applied. When omitted, "Distributed" followed by
              the type of the provided optimizer is used.
        device_dense: Device used for dense tensors. The GPU is used by
                      default if Horovod was built with
                      HOROVOD_GPU_OPERATIONS.
        device_sparse: Device used for sparse tensors. The GPU is used by
                       default if Horovod was built with
                       HOROVOD_GPU_OPERATIONS.
        compression: Compression algorithm that reduces the amount of data
                     each worker node sends and receives. No compression is
                     applied by default.
        sparse_as_dense: When true, every sparse gradient is handled as a
                         dense tensor, which can improve performance and
                         memory utilization if the original sparse gradient
                         has high density. Defaults to false.
        gradient_predivide_factor: Splits the averaging into a part before
                                   and a part after the sum: gradients are
                                   scaled by 1.0 / gradient_predivide_factor
                                   before the sum and by
                                   gradient_predivide_factor / size after it.
        op: Reduction operation used to combine gradients across ranks.
            Defaults to Average.
        backward_passes_per_step: Number of backward passes performed before
                                  hvd.allreduce is called, so that updates
                                  accumulate over multiple mini-batches
                                  before being reduced and applied.
        average_aggregated_gradients: Whether gradients accumulated over
                                      multiple mini-batches are averaged. If
                                      true, gradient updates are divided by
                                      backward_passes_per_step. Only
                                      applicable for
                                      backward_passes_per_step > 1.
        num_groups: Deprecated in favour of `groups`. Number of groups that
                    gradient allreduce ops are assigned to for explicit
                    grouping. Defaults to no explicit groups. A non-zero
                    value emits a DeprecationWarning and is used as `groups`
                    when `groups` itself is None.
        groups: Controls how gradient allreduce ops are grouped. Accepted
                values are an integer or a list of lists of tf.Variable.
                An integer is the number of groups that gradient allreduce
                ops are assigned to for explicit grouping. With a list of
                lists, variables in the same inner list are assigned to the
                same group, while a parameter that does not appear in any
                list forms a group by itself. Defaults to None, meaning no
                explicit groups.
        process_set: Gradients are only reduced over the Horovod processes
                     that belong to this process set. Defaults to the global
                     process set.

    Returns:
        The result of ``_impl.create_distributed_optimizer`` for the given
        arguments.

    Raises:
        ValueError: If gradient_predivide_factor differs from 1.0 while
            ``rocm_built()`` is true, if op is neither Average nor Sum, or
            if `groups` is neither a list nor a value greater than zero.
    """
    # rocm_built() is only consulted when a non-default factor was requested.
    custom_predivide = gradient_predivide_factor != 1.0
    if custom_predivide and rocm_built():
        raise ValueError(
            'gradient_predivide_factor not supported yet with ROCm')

    unsupported_op = op != Average and op != Sum
    if unsupported_op:
        raise ValueError('op currently only supports Average and Sum')

    if num_groups != 0:
        deprecation_text = ('Parameter `num_groups` has been replaced by `groups` '
                            'and will be removed in v0.23.0.')
        warnings.warn(deprecation_text, DeprecationWarning)
        # An explicitly supplied `groups` wins over the deprecated parameter.
        if groups is None:
            groups = num_groups

    # NOTE(review): the message below says "non-negative", yet an integer 0 is
    # rejected by the `groups > 0` test -- confirm which of the two is intended.
    if groups is not None and not isinstance(groups, list) and not groups > 0:
        raise ValueError('groups should be a non-negative integer or '
                         'a list of list of tf.Variable.')

    forwarded = dict(
        keras=keras,
        optimizer=optimizer,
        name=name,
        device_dense=device_dense,
        device_sparse=device_sparse,
        compression=compression,
        sparse_as_dense=sparse_as_dense,
        gradient_predivide_factor=gradient_predivide_factor,
        op=op,
        backward_passes_per_step=backward_passes_per_step,
        average_aggregated_gradients=average_aggregated_gradients,
        groups=groups,
        process_set=process_set,
    )
    return _impl.create_distributed_optimizer(**forwarded)