def _make_allreduce_grads_fn(name, device_dense, device_sparse, compression,
                             sparse_as_dense, op, gradient_predivide_factor):
    """Return a callable that allreduces a list of gradients.

    When ``op == Average`` the predivide factor is split across the
    prescale/postscale factors; the C++ backend applies an additional
    ``1 / size()`` factor to ``postscale_factor`` for Average.
    Gradients that are ``None`` are passed through untouched.
    """
    if op == Average:
        # Split average operation across pre/postscale factors.
        pre = 1.0 / gradient_predivide_factor
        post = gradient_predivide_factor
    else:
        pre = 1.0
        post = 1.0

    def _densify(grad):
        # Optionally turn IndexedSlices into a dense tensor.
        if grad is not None and isinstance(grad, tf.IndexedSlices):
            return tf.convert_to_tensor(grad)
        return grad

    def _reduce(grad):
        if grad is None:
            return grad
        return _allreduce_cond(grad,
                               device_dense=device_dense,
                               device_sparse=device_sparse,
                               compression=compression,
                               op=op,
                               prescale_factor=pre,
                               postscale_factor=post)

    def allreduce_grads(grads):
        with tf.name_scope(name + "_Allreduce"):
            if sparse_as_dense:
                grads = [_densify(g) for g in grads]
            return [_reduce(g) for g in grads]

    # Eager execution wraps the callable in a cached subgraph.
    return _make_subgraph(allreduce_grads) if _executing_eagerly() else allreduce_grads
def _make_broadcast_group_fn():
    """Return a callable that broadcasts every variable from ``root_rank``.

    Fix: in graph mode the callable now returns a single ``tf.group`` op
    instead of a bare Python list of assign ops — graph-mode callers need
    one Op they can run, as the sibling implementations in this file note
    ("Graph mode requires an Op"). Eager behavior is unchanged.
    """
    if _executing_eagerly():
        # Eager mode: the assigns execute immediately; wrap in a subgraph.
        def broadcast_group(variables, root_rank):
            return [var.assign(broadcast(var, root_rank)) for var in variables]
        return _make_subgraph(broadcast_group)
    else:
        # Graph mode requires an Op.
        def broadcast_group(variables, root_rank):
            return tf.group(
                *[var.assign(broadcast(var, root_rank)) for var in variables])
        return broadcast_group
def _make_broadcast_group_fn():
    """Build the broadcast-group callable for the current execution mode."""
    if _executing_eagerly():
        # Eager mode requires a Tensor per variable.
        def broadcast_group(variables, root_rank):
            assigned = []
            for var in variables:
                assigned.append(var.assign(broadcast(var, root_rank)))
            return assigned

        return _make_subgraph(broadcast_group)

    # Graph mode requires an Op.
    def broadcast_group(variables, root_rank):
        assign_ops = [var.assign(broadcast(var, root_rank))
                      for var in variables]
        return tf.group(*assign_ops)

    return broadcast_group
def _make_broadcast_group_fn():
    """Build the broadcast-group callable for the current execution mode."""
    if _executing_eagerly():
        # Eager mode will parallelize independent control flow.
        def broadcast_group(variables, root_rank):
            for v in variables:
                v.assign(broadcast(v, root_rank))

        return _make_subgraph(broadcast_group)

    # Graph mode requires an Op.
    def broadcast_group(variables, root_rank):
        assign_ops = (v.assign(broadcast(v, root_rank)) for v in variables)
        return tf.group(*assign_ops)

    return broadcast_group
def _make_allreduce_grads_fn(name, device_dense, device_sparse, compression,
                             sparse_as_dense):
    """Return a callable that allreduces each non-None gradient in a list."""

    def allreduce_grads(grads):
        with tf.name_scope(name + "_Allreduce"):
            if sparse_as_dense:
                # Optionally densify IndexedSlices before the allreduce.
                grads = [
                    tf.convert_to_tensor(g)
                    if g is not None and isinstance(g, tf.IndexedSlices)
                    else g
                    for g in grads
                ]
            reduced = []
            for g in grads:
                if g is None:
                    reduced.append(g)
                else:
                    reduced.append(allreduce(g,
                                             device_dense=device_dense,
                                             device_sparse=device_sparse,
                                             compression=compression))
            return reduced

    if _executing_eagerly():
        return _make_subgraph(allreduce_grads)
    return allreduce_grads
def _make_cached_allreduce_grads_fn(name, device_dense, device_sparse,
                                    compression, sparse_as_dense, op,
                                    gradient_predivide_factor, groups):
    """Return a gradient-allreduce callable with optional gradient grouping.

    ``groups`` controls fusion of allreduces:
    - a tuple of variable refs is first resolved via ``refs_to_vars``;
    - a list of variable groups reduces each group with one grouped call;
    - a positive int splits the gradients into that many chunks;
    - ``None`` falls back to one ``_allreduce_cond`` per gradient.

    NOTE(review): when ``groups`` is not None the inner callable reads
    ``len(vars)``, so callers must pass ``vars`` in that case — TODO confirm
    against call sites.
    """
    # Tuples are variable refs; resolve them to the actual variables.
    groups = refs_to_vars(groups) if isinstance(groups, tuple) else groups
    if op == Average:
        # Split average operation across pre/postscale factors
        # C++ backend will apply additional 1 / size() factor to postscale_factor for op == Average.
        prescale_factor = 1.0 / gradient_predivide_factor
        postscale_factor = gradient_predivide_factor
    else:
        prescale_factor = 1.0
        postscale_factor = 1.0

    def allreduce_grads(grads, vars=None):
        with tf.name_scope(name + "_Allreduce"):
            if sparse_as_dense:
                # Optionally densify IndexedSlices before reducing.
                grads = [
                    tf.convert_to_tensor(grad)
                    if grad is not None and isinstance(grad, tf.IndexedSlices)
                    else grad
                    for grad in grads
                ]
            if groups is not None:
                if isinstance(groups, list):
                    # Map variable name -> (original index, grad) so grads can
                    # be regrouped by the caller-specified variable groups.
                    var_name2grad = {}
                    for i in range(len(vars)):
                        var = vars[i]
                        grad = grads[i]
                        if grad is not None:
                            var_name2grad[var.name] = (i, grad)
                    grads_split = []
                    for group in groups:
                        grad_group = []
                        for var in group:
                            if var.name in var_name2grad:
                                grad_group.append(var_name2grad[var.name])
                                del var_name2grad[var.name]
                        grads_split.append(grad_group)
                    # Any gradient not covered by a group becomes its own
                    # singleton group (values here are (index, grad) pairs).
                    for _, grad in var_name2grad.items():
                        grads_split.append([grad])
                elif groups > 0:
                    # Integer groups: split the non-None grads into N chunks,
                    # keeping their original indices for reassembly below.
                    grads_clean = [(i, grad) for i, grad in enumerate(grads) if grad is not None]
                    grads_split = split_list(grads_clean, groups)
                # Result list aligned with the input; entries stay None where
                # the corresponding gradient was None.
                reduce_ops = [None] * len(vars)
                for group in grads_split:
                    index_group, grad_group = [list(t) for t in zip(*group)]
                    reduce_ops_group = _grouped_allreduce_cond(grad_group,
                                                               device_dense=device_dense,
                                                               device_sparse=device_sparse,
                                                               compression=compression,
                                                               op=op,
                                                               prescale_factor=prescale_factor,
                                                               postscale_factor=postscale_factor)
                    # Scatter grouped results back to their original positions.
                    for i in range(len(index_group)):
                        reduce_ops[index_group[i]] = reduce_ops_group[i]
                return reduce_ops
            # No grouping: reduce each non-None gradient individually.
            return [
                _allreduce_cond(grad,
                                device_dense=device_dense,
                                device_sparse=device_sparse,
                                compression=compression,
                                op=op,
                                prescale_factor=prescale_factor,
                                postscale_factor=postscale_factor)
                if grad is not None else grad
                for grad in grads
            ]

    if _executing_eagerly():
        return _make_subgraph(allreduce_grads)
    else:
        return allreduce_grads