def DistributedOptimizer(optimizer, name=None,
                         device_dense='', device_sparse='',
                         compression=Compression.none,
                         sparse_as_dense=False):
    """Wrap a keras optimizer so gradients are averaged across workers.

    The returned optimizer wraps another keras.optimizers.Optimizer and
    uses an allreduce to average gradient values before the gradients are
    applied to the model weights.

    Args:
        optimizer: Optimizer to use for computing gradients and applying
                   updates.
        name: Optional name prefix for the operations created when
              applying gradients. Defaults to "Distributed" followed by
              the provided optimizer type.
        device_dense: Device to be used for dense tensors. Uses GPU by
                      default if Horovod was built with
                      HOROVOD_GPU_ALLREDUCE.
        device_sparse: Device to be used for sparse tensors. Uses GPU by
                       default if Horovod was built with
                       HOROVOD_GPU_ALLGATHER.
        compression: Compression algorithm used to reduce the amount of
                     data sent and received by each worker node.
                     Defaults to not using compression.
        sparse_as_dense: Treat all sparse gradients as dense tensors.
                         This can help improve performance and memory
                         utilization if the original sparse gradient has
                         high density. Defaults to false.

    Returns:
        The result of ``_impl.create_distributed_optimizer`` for the
        given settings.
    """
    # The shared implementation takes the keras module first, followed by
    # this function's own arguments in declaration order.
    impl_args = (
        keras,
        optimizer,
        name,
        device_dense,
        device_sparse,
        compression,
        sparse_as_dense,
    )
    return _impl.create_distributed_optimizer(*impl_args)
def DistributedOptimizer(optimizer, name=None,
                         device_dense='', device_sparse='',
                         compression=Compression.none,
                         sparse_as_dense=False,
                         gradient_predivide_factor=1.0,
                         op=Average,
                         num_groups=0):
    """Wrap a keras optimizer so gradients are reduced across workers.

    The returned optimizer wraps another keras.optimizers.Optimizer and
    uses an allreduce to average gradient values before the gradients are
    applied to the model weights.

    Args:
        optimizer: Optimizer to use for computing gradients and applying
                   updates.
        name: Optional name prefix for the operations created when
              applying gradients. Defaults to "Distributed" followed by
              the provided optimizer type.
        device_dense: Device to be used for dense tensors. Uses GPU by
                      default if Horovod was built with
                      HOROVOD_GPU_OPERATIONS.
        device_sparse: Device to be used for sparse tensors. Uses GPU by
                       default if Horovod was built with
                       HOROVOD_GPU_OPERATIONS.
        compression: Compression algorithm used to reduce the amount of
                     data sent and received by each worker node.
                     Defaults to not using compression.
        sparse_as_dense: Treat all sparse gradients as dense tensors.
                         This can help improve performance and memory
                         utilization if the original sparse gradient has
                         high density. Defaults to false.
        gradient_predivide_factor: Splits the averaging before and after
                                   the sum. Gradients are scaled by
                                   1.0 / gradient_predivide_factor before
                                   the sum and
                                   gradient_predivide_factor / size after
                                   the sum.
        op: The reduction operation to use when combining gradients
            across different ranks. Defaults to Average.
        num_groups: Number of groups to assign gradient allreduce ops to
                    for explicit grouping. Defaults to no explicit
                    groups.

    Returns:
        The result of ``_impl.create_distributed_optimizer`` for the
        given settings.

    Raises:
        ValueError: If a gradient_predivide_factor other than 1.0 is
            requested while ``rocm_built()`` is true, or if ``op`` is
            neither Average nor Sum.
    """
    # rocm_built() is only consulted when a non-default factor is given.
    if gradient_predivide_factor != 1.0:
        if rocm_built():
            raise ValueError(
                'gradient_predivide_factor not supported yet with ROCm')

    op_is_supported = op == Average or op == Sum
    if not op_is_supported:
        raise ValueError('op currently only supports Average and Sum')

    impl_kwargs = {
        'keras': keras,
        'optimizer': optimizer,
        'name': name,
        'device_dense': device_dense,
        'device_sparse': device_sparse,
        'compression': compression,
        'sparse_as_dense': sparse_as_dense,
        'gradient_predivide_factor': gradient_predivide_factor,
        'op': op,
        'num_groups': num_groups,
    }
    return _impl.create_distributed_optimizer(**impl_kwargs)
def DistributedOptimizer(optimizer, name=None,
                         device_dense='', device_sparse='',
                         compression=Compression.none,
                         sparse_as_dense=False,
                         aggregation_frequency=1,
                         grad_updated_sizes_dict=None,
                         profile_frequency=0,
                         profile_filename=None,
                         average_aggregated_gradients=False):
    """Wrap a keras optimizer so gradients are averaged across workers.

    The returned optimizer wraps another keras.optimizers.Optimizer and
    uses an allreduce to average gradient values before the gradients are
    applied to the model weights.

    Args:
        optimizer: Optimizer to use for computing gradients and applying
                   updates.
        name: Optional name prefix for the operations created when
              applying gradients. Defaults to "Distributed" followed by
              the provided optimizer type.
        device_dense: Device to be used for dense tensors. Uses GPU by
                      default if Horovod was built with
                      HOROVOD_GPU_ALLREDUCE.
        device_sparse: Device to be used for sparse tensors. Uses GPU by
                       default if Horovod was built with
                       HOROVOD_GPU_ALLGATHER.
        compression: Compression algorithm used to reduce the amount of
                     data sent and received by each worker node.
                     Defaults to not using compression.
        sparse_as_dense: Treat all sparse gradients as dense tensors.
                         This can help improve performance and memory
                         utilization if the original sparse gradient has
                         high density. Defaults to false.
        aggregation_frequency: How many batches to aggregate the
                               gradients before averaging the gradients
                               with allreduce.
        grad_updated_sizes_dict: A dictionary containing the shape of
                                 each update grad.
        profile_frequency: How often (in terms of number of batches) to
                           profile the communication time of the batch.
        profile_filename: Name of the file to write profiling logs to.
        average_aggregated_gradients: Whether to average the aggregated
                                      gradients across the iterations.
                                      Only possible for
                                      aggregation_frequency > 1.

    Returns:
        The result of ``_impl.create_distributed_optimizer`` for the
        given settings.
    """
    # Every argument is forwarded unchanged, by keyword, together with the
    # keras module the implementation should build on.
    impl_kwargs = {
        'keras': keras,
        'optimizer': optimizer,
        'name': name,
        'device_dense': device_dense,
        'device_sparse': device_sparse,
        'compression': compression,
        'sparse_as_dense': sparse_as_dense,
        'aggregation_frequency': aggregation_frequency,
        'grad_updated_sizes_dict': grad_updated_sizes_dict,
        'profile_frequency': profile_frequency,
        'profile_filename': profile_filename,
        'average_aggregated_gradients': average_aggregated_gradients,
    }
    return _impl.create_distributed_optimizer(**impl_kwargs)
def DistributedOptimizer(optimizer, name=None,
                         device_dense='', device_sparse='',
                         compression=Compression.none,
                         sparse_as_dense=False,
                         gradient_predivide_factor=1.0,
                         op=Average,
                         backward_passes_per_step=1,
                         average_aggregated_gradients=False):
    """Wrap a keras optimizer so gradients are reduced across workers.

    The returned optimizer wraps another keras.optimizers.Optimizer and
    uses an allreduce to average gradient values before the gradients are
    applied to the model weights.

    Args:
        optimizer: Optimizer to use for computing gradients and applying
                   updates.
        name: Optional name prefix for the operations created when
              applying gradients. Defaults to "Distributed" followed by
              the provided optimizer type.
        device_dense: Device to be used for dense tensors. Uses GPU by
                      default if Horovod was built with
                      HOROVOD_GPU_OPERATIONS.
        device_sparse: Device to be used for sparse tensors. Uses GPU by
                       default if Horovod was built with
                       HOROVOD_GPU_OPERATIONS.
        compression: Compression algorithm used to reduce the amount of
                     data sent and received by each worker node.
                     Defaults to not using compression.
        sparse_as_dense: Treat all sparse gradients as dense tensors.
                         This can help improve performance and memory
                         utilization if the original sparse gradient has
                         high density. Defaults to false.
        gradient_predivide_factor: Splits the averaging before and after
                                   the sum. Gradients are scaled by
                                   1.0 / gradient_predivide_factor before
                                   the sum and
                                   gradient_predivide_factor / size after
                                   the sum.
        op: The reduction operation to use when combining gradients
            across different ranks. Defaults to Average.
        backward_passes_per_step: Number of backward passes to perform
                                  before calling hvd.allreduce. This
                                  allows accumulating updates over
                                  multiple mini-batches before reducing
                                  and applying them.
        average_aggregated_gradients: Whether to average the aggregated
                                      gradients that have been
                                      accumulated over multiple
                                      mini-batches. If true divides
                                      gradient updates by
                                      backward_passes_per_step. Only
                                      applicable for
                                      backward_passes_per_step > 1.

    Returns:
        The result of ``_impl.create_distributed_optimizer`` for the
        given settings.

    Raises:
        ValueError: If a gradient_predivide_factor other than 1.0 is
            requested while ``rocm_built()`` is true, or if ``op`` is
            neither Average nor Sum.
    """
    # rocm_built() is only consulted when a non-default factor is given.
    if gradient_predivide_factor != 1.0:
        if rocm_built():
            raise ValueError(
                'gradient_predivide_factor not supported yet with ROCm')

    op_is_supported = op == Average or op == Sum
    if not op_is_supported:
        raise ValueError('op currently only supports Average and Sum')

    impl_kwargs = {
        'keras': keras,
        'optimizer': optimizer,
        'name': name,
        'device_dense': device_dense,
        'device_sparse': device_sparse,
        'compression': compression,
        'sparse_as_dense': sparse_as_dense,
        'gradient_predivide_factor': gradient_predivide_factor,
        'op': op,
        'backward_passes_per_step': backward_passes_per_step,
        'average_aggregated_gradients': average_aggregated_gradients,
    }
    return _impl.create_distributed_optimizer(**impl_kwargs)
def DistributedOptimizer(optimizer, name=None,
                         device_dense='', device_sparse='',
                         compression=Compression.none,
                         sparse_as_dense=False,
                         gradient_predivide_factor=1.0,
                         op=Average,
                         backward_passes_per_step=1,
                         average_aggregated_gradients=False,
                         num_groups=0,
                         groups=None,
                         process_set=global_process_set):
    """
    An optimizer that wraps another keras.optimizers.Optimizer, using an
    allreduce to average gradient values before applying gradients to
    model weights.

    Args:
        optimizer: Optimizer to use for computing gradients and applying
                   updates.
        name: Optional name prefix for the operations created when
              applying gradients. Defaults to "Distributed" followed by
              the provided optimizer type.
        device_dense: Device to be used for dense tensors. Uses GPU by
                      default if Horovod was built with
                      HOROVOD_GPU_OPERATIONS.
        device_sparse: Device to be used for sparse tensors. Uses GPU by
                       default if Horovod was built with
                       HOROVOD_GPU_OPERATIONS.
        compression: Compression algorithm used to reduce the amount of
                     data sent and received by each worker node.
                     Defaults to not using compression.
        sparse_as_dense: Treat all sparse gradients as dense tensors.
                         This can help improve performance and memory
                         utilization if the original sparse gradient has
                         high density. Defaults to false.
        gradient_predivide_factor: gradient_predivide_factor splits the
                                   averaging before and after the sum.
                                   Gradients are scaled by
                                   1.0 / gradient_predivide_factor before
                                   the sum and
                                   gradient_predivide_factor / size after
                                   the sum.
        op: The reduction operation to use when combining gradients
            across different ranks. Defaults to Average.
        backward_passes_per_step: Number of backward passes to perform
                                  before calling hvd.allreduce. This
                                  allows accumulating updates over
                                  multiple mini-batches before reducing
                                  and applying them.
        average_aggregated_gradients: Whether to average the aggregated
                                      gradients that have been
                                      accumulated over multiple
                                      mini-batches. If true divides
                                      gradient updates by
                                      backward_passes_per_step. Only
                                      applicable for
                                      backward_passes_per_step > 1.
        num_groups: Deprecated, use `groups` instead. Number of groups to
                    assign gradient allreduce ops to for explicit
                    grouping. Defaults to no explicit groups. A non-zero
                    value emits a DeprecationWarning and is used as
                    `groups` when `groups` is not given.
        groups: The parameter to group the gradient allreduce ops.
                Accepted values are a positive integer or a list of
                lists of tf.Variable. If groups is a positive integer,
                it is the number of groups to assign gradient allreduce
                ops to for explicit grouping. If groups is a list of
                lists of tf.Variable, variables in the same inner list
                will be assigned to the same group, while a parameter
                that does not appear in any list will form a group by
                itself. Defaults to None, which means no explicit
                groups; pass None (not 0) to disable grouping.
        process_set: Gradients will only be reduced over Horovod
                     processes belonging to this process set. Defaults
                     to the global process set.

    Returns:
        The result of ``_impl.create_distributed_optimizer`` for the
        given settings.

    Raises:
        ValueError: If a gradient_predivide_factor other than 1.0 is
            requested while ``rocm_built()`` is true, if ``op`` is
            neither Average nor Sum, or if ``groups`` is neither None, a
            list, nor a value greater than zero.
    """
    if gradient_predivide_factor != 1.0 and rocm_built():
        raise ValueError(
            'gradient_predivide_factor not supported yet with ROCm')

    if op != Average and op != Sum:
        raise ValueError('op currently only supports Average and Sum')

    if num_groups != 0:
        # stacklevel=2 attributes the warning to the caller's line, so it
        # is not reported against (and filtered with) this library module.
        warnings.warn(
            'Parameter `num_groups` has been replaced by `groups` '
            'and will be removed in v0.23.0.',
            DeprecationWarning,
            stacklevel=2)
        # Only fall back to the deprecated value when it was actually
        # supplied; doing this unconditionally would turn the default
        # call into groups=0 and fail the validation below.
        if groups is None:
            groups = num_groups

    if groups is not None and not isinstance(groups, list):
        # Values that cannot be ordered against an int (tuple, str, ...)
        # are reported with the same ValueError as out-of-range numbers
        # instead of leaking a bare TypeError from the comparison.
        try:
            groups_is_positive = groups > 0
        except TypeError:
            groups_is_positive = False
        if not groups_is_positive:
            # Zero is rejected here, so the message says "positive".
            raise ValueError('groups should be a positive integer or '
                             'a list of list of tf.Variable.')

    return _impl.create_distributed_optimizer(
        keras=keras,
        optimizer=optimizer,
        name=name,
        device_dense=device_dense,
        device_sparse=device_sparse,
        compression=compression,
        sparse_as_dense=sparse_as_dense,
        gradient_predivide_factor=gradient_predivide_factor,
        op=op,
        backward_passes_per_step=backward_passes_per_step,
        average_aggregated_gradients=average_aggregated_gradients,
        groups=groups,
        process_set=process_set,
    )