Example 1
def _make_allreduce_grads_fn(name, device_dense, device_sparse, compression,
                             sparse_as_dense, op, gradient_predivide_factor):
    if op == Average:
        # Split the averaging operation across the pre/postscale factors.
        # The C++ backend will apply an additional 1 / size() factor to postscale_factor for op == Average.
        prescale_factor = 1.0 / gradient_predivide_factor
        postscale_factor = gradient_predivide_factor
    else:
        prescale_factor = 1.0
        postscale_factor = 1.0

    def allreduce_grads(grads):
        with tf.name_scope(name + "_Allreduce"):
            if sparse_as_dense:
                # Optionally densify IndexedSlices (sparse) gradients before allreduce.
                grads = [
                    tf.convert_to_tensor(grad) if grad is not None
                    and isinstance(grad, tf.IndexedSlices) else grad
                    for grad in grads
                ]

            return [
                _allreduce_cond(grad,
                                device_dense=device_dense,
                                device_sparse=device_sparse,
                                compression=compression,
                                op=op,
                                prescale_factor=prescale_factor,
                                postscale_factor=postscale_factor)
                if grad is not None else grad for grad in grads
            ]

    if _executing_eagerly():
        return _make_subgraph(allreduce_grads)
    else:
        return allreduce_grads
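
Usage note: this factory is internal to Horovod's TensorFlow integration and is not called directly; public wrappers such as DistributedGradientTape build the allreduce_grads closure for you. A minimal sketch of that typical flow, assuming Horovod is installed and the script is launched with horovodrun (the model, optimizer, and train_step names are illustrative placeholders):

# Minimal usage sketch (assumes Horovod is installed and the script is
# launched with horovodrun). DistributedGradientTape allreduces gradients
# across workers, which is what the allreduce_grads closure above does.
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])  # placeholder model
opt = tf.optimizers.SGD(0.01 * hvd.size())               # scale LR by world size

@tf.function
def train_step(features, labels):
    with tf.GradientTape() as tape:
        loss = tf.reduce_mean(tf.square(model(features) - labels))
    # Average the gradients across all Horovod processes before applying them.
    tape = hvd.DistributedGradientTape(tape)
    grads = tape.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(grads, model.trainable_variables))
    return loss
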
Example 2
def _make_broadcast_group_fn():
    def broadcast_group(variables, root_rank):
        return [var.assign(broadcast(var, root_rank)) for var in variables]

    if _executing_eagerly():
        return _make_subgraph(broadcast_group)
    else:
        return broadcast_group
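
Note: this earlier revision returns the raw list of per-variable assign operations in both modes; the later revisions shown next wrap the graph-mode result in tf.group so callers get a single op to run.
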
Example 3
def _make_broadcast_group_fn():
    if _executing_eagerly():
        # Eager mode requires Tensor
        def broadcast_group(variables, root_rank):
            return [var.assign(broadcast(var, root_rank)) for var in variables]

        return _make_subgraph(broadcast_group)
    else:
        # Graph mode requires an Op
        def broadcast_group(variables, root_rank):
            return tf.group(
                *[var.assign(broadcast(var, root_rank)) for var in variables])

        return broadcast_group
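
Usage note: end-user code does not call this factory either; it backs Horovod's public broadcast helpers. A minimal sketch of the usual pattern, assuming Horovod is installed, is to broadcast rank 0's freshly initialized variables so every worker starts from identical weights:

# Minimal usage sketch (assumes Horovod is installed). broadcast_variables
# copies rank 0's variable values to all other workers, which is what the
# broadcast_group closure above implements underneath.
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])  # placeholder model
model.build(input_shape=(None, 4))

# Make every worker start training from rank 0's initial weights.
hvd.broadcast_variables(model.variables, root_rank=0)
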
Example 4
def _make_broadcast_group_fn():
    if _executing_eagerly():
        # Eager mode will parallelize independent control flow
        def broadcast_group(variables, root_rank):
            for var in variables:
                var.assign(broadcast(var, root_rank))

        return _make_subgraph(broadcast_group)
    else:
        # Graph mode requires an Op
        def broadcast_group(variables, root_rank):
            return tf.group(
                *[var.assign(broadcast(var, root_rank)) for var in variables])

        return broadcast_group
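
Note: compared to Example 3, the eager branch here no longer collects the assign results into a list; as its comment says, eager mode executes the independent assignments directly and can parallelize them, so only the graph branch still needs tf.group to return a single op.
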
Example 5
def _make_allreduce_grads_fn(name, device_dense, device_sparse,
                             compression, sparse_as_dense):
    def allreduce_grads(grads):
        with tf.name_scope(name + "_Allreduce"):
            if sparse_as_dense:
                grads = [tf.convert_to_tensor(grad)
                         if grad is not None and isinstance(grad, tf.IndexedSlices)
                         else grad for grad in grads]

            return [allreduce(grad,
                              device_dense=device_dense,
                              device_sparse=device_sparse,
                              compression=compression)
                    if grad is not None else grad
                    for grad in grads]

    if _executing_eagerly():
        return _make_subgraph(allreduce_grads)
    else:
        return allreduce_grads
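
Note: this appears to be an earlier revision of Example 1. It calls allreduce directly and lacks the op, gradient_predivide_factor, and prescale/postscale factor arguments added later.
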
Example 6
def _make_cached_allreduce_grads_fn(name, device_dense, device_sparse,
                                    compression, sparse_as_dense, op,
                                    gradient_predivide_factor, groups):
    groups = refs_to_vars(groups) if isinstance(groups, tuple) else groups
    if op == Average:
        # Split the averaging operation across the pre/postscale factors.
        # The C++ backend will apply an additional 1 / size() factor to postscale_factor for op == Average.
        prescale_factor = 1.0 / gradient_predivide_factor
        postscale_factor = gradient_predivide_factor
    else:
        prescale_factor = 1.0
        postscale_factor = 1.0

    def allreduce_grads(grads, vars=None):
        with tf.name_scope(name + "_Allreduce"):
            if sparse_as_dense:
                grads = [
                    tf.convert_to_tensor(grad) if grad is not None
                    and isinstance(grad, tf.IndexedSlices) else grad
                    for grad in grads
                ]

            if groups is not None:
                if isinstance(groups, list):
                    # Explicit variable groups: map each variable name to its
                    # (gradient index, gradient) pair, then bucket gradients
                    # according to the user-provided groups.
                    var_name2grad = {}
                    for i in range(len(vars)):
                        var = vars[i]
                        grad = grads[i]
                        if grad is not None:
                            var_name2grad[var.name] = (i, grad)
                    grads_split = []
                    for group in groups:
                        grad_group = []
                        for var in group:
                            if var.name in var_name2grad:
                                grad_group.append(var_name2grad[var.name])
                                del var_name2grad[var.name]
                        grads_split.append(grad_group)
                    # Any gradient not covered by an explicit group becomes its
                    # own singleton group.
                    for _, grad in var_name2grad.items():
                        grads_split.append([grad])
                elif groups > 0:
                    # Numeric `groups`: split the non-None gradients into that
                    # many roughly equal buckets.
                    grads_clean = [(i, grad) for i, grad in enumerate(grads)
                                   if grad is not None]
                    grads_split = split_list(grads_clean, groups)

                # Run one grouped allreduce per bucket and scatter the results
                # back to their original gradient positions.
                reduce_ops = [None] * len(vars)
                for group in grads_split:
                    index_group, grad_group = [list(t) for t in zip(*group)]
                    reduce_ops_group = _grouped_allreduce_cond(
                        grad_group,
                        device_dense=device_dense,
                        device_sparse=device_sparse,
                        compression=compression,
                        op=op,
                        prescale_factor=prescale_factor,
                        postscale_factor=postscale_factor)
                    for i in range(len(index_group)):
                        reduce_ops[index_group[i]] = reduce_ops_group[i]
                return reduce_ops

            return [
                _allreduce_cond(grad,
                                device_dense=device_dense,
                                device_sparse=device_sparse,
                                compression=compression,
                                op=op,
                                prescale_factor=prescale_factor,
                                postscale_factor=postscale_factor)
                if grad is not None else grad for grad in grads
            ]

    if _executing_eagerly():
        return _make_subgraph(allreduce_grads)
    else:
        return allreduce_grads
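
The split_list helper used above is not shown in this excerpt. A purely illustrative sketch (an assumption, not Horovod's actual implementation) of a helper that splits a list into n roughly equal contiguous chunks:

# Illustrative sketch of a split_list-style helper (assumed behavior: split a
# list into `n` roughly equal contiguous chunks; Horovod's real helper lives
# elsewhere in the codebase and may differ).
def split_list(items, n):
    quotient, remainder = divmod(len(items), n)
    chunks, start = [], 0
    for i in range(n):
        size = quotient + (1 if i < remainder else 0)
        chunks.append(items[start:start + size])
        start += size
    return [chunk for chunk in chunks if chunk]

# Example: 5 gradients split into 2 fusion groups.
print(split_list(["g0", "g1", "g2", "g3", "g4"], 2))
# [['g0', 'g1', 'g2'], ['g3', 'g4']]
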