Example #1
def _init_allreduce_operators(length):
    """ initialize allreduce communication operators"""
    is_parallel_optimizer = context.get_auto_parallel_context(
        "enable_parallel_optimizer")
    split_indices = auto_parallel_context(
    ).get_all_reduce_fusion_split_indices()
    if is_parallel_optimizer and split_indices:
        group = 1
        fusion = ()
        for i in range(length):
            fusion = fusion + (group, )
            if split_indices[group - 1] <= i + 1:
                if group >= len(split_indices):
                    continue
                group = group + 1
        index = tuple(range(1, length + 1))
    else:
        fusion = (1, ) * length
        index = (0, ) * length
    opt_list = ()
    for i in range(length):
        opt = AllReduce('sum', GlobalComm.WORLD_COMM_GROUP)
        opt.add_prim_attr('fusion', fusion[i])
        opt.add_prim_attr('index', index[i])
        opt_list = opt_list + (opt, )
    return opt_list
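A minimal usage sketch for the helper above (assumptions: the job was started by a distributed launcher such as mpirun or msrun, and the fusion boundaries 20 and 35 are arbitrary illustration values):

from mindspore import context
from mindspore.communication import init

init()  # join the communication group created by the launcher
context.set_auto_parallel_context(enable_parallel_optimizer=True,
                                  all_reduce_fusion_config=[20, 35])
operators = _init_allreduce_operators(50)  # one AllReduce per parameter, grouped by the configured boundaries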
Example #2
 def __init__(self, parameters, mean=True, degree=None):
     super(DistributedGradReducer, self).__init__(auto_prefix=False)
     self.map_ = C.Map()
     if degree is None:
         self.degree = get_group_size()
     else:
         if not isinstance(degree, int) or degree <= 0:
             raise ValueError(
                 "Parameter 'degree' in DistributedGradReducer should be larger than 0 and be an int"
             )
         self.degree = degree
     self.mean = mean
     self.allreduce_filter = tuple(x.layerwise_parallel is False
                                   for x in parameters)
     is_parallel_optimizer = context.get_auto_parallel_context(
         "enable_parallel_optimizer")
     split_indices = auto_parallel_context(
     ).get_all_reduce_fusion_split_indices()
     if is_parallel_optimizer and split_indices:
         self.split_fusion = True
         self.op_list = _init_allreduce_operators(len(parameters),
                                                  split_indices)
     else:
         self.split_fusion = False
         self.allreduce = AllReduce().add_prim_attr('fusion', 1)
     self.allgather = AllGather(GlobalComm.WORLD_COMM_GROUP)
     ps_filter = lambda x: x.is_param_ps
     self.ps_parameters = tuple(ps_filter(x) for x in parameters)
     self.enable_parameter_server = any(self.ps_parameters)
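A sketch of how the reducer built above is typically wired into a training wrapper, following the usual MindSpore TrainOneStepCell pattern (network and optimizer here are placeholders, not part of the excerpt):

from mindspore import nn, ops

class TrainOneStep(nn.Cell):
    def __init__(self, network, optimizer, degree=None):
        super(TrainOneStep, self).__init__(auto_prefix=False)
        self.network = network
        self.optimizer = optimizer
        self.weights = optimizer.parameters
        self.grad = ops.GradOperation(get_by_list=True)
        # mean=True rescales the summed gradients by 1/degree after the allreduce
        self.grad_reducer = nn.DistributedGradReducer(optimizer.parameters,
                                                      mean=True, degree=degree)

    def construct(self, *inputs):
        loss = self.network(*inputs)
        grads = self.grad(self.network, self.weights)(*inputs)
        grads = self.grad_reducer(grads)  # allreduce gradients across devices
        self.optimizer(grads)
        return loss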
Example #3
def _init_optimizer_communication():
    global _all_reduce
    global _all_gather

    _all_reduce = AllReduce(ReduceOp.SUM, GlobalComm.WORLD_COMM_GROUP)
    _all_reduce.add_prim_attr('fusion', 1)
    _all_gather = AllGather(GlobalComm.WORLD_COMM_GROUP)
Example #4
 def __init__(self, axis=0, shape=None):
     super(Net, self).__init__()
     if shape is None:
         shape = [8, 8]
     self.all_reduce = AllReduce()
     self.gatherv2 = P.SparseGatherV2()
     self.index = Tensor(np.ones(shape), dtype=ms.int32)
     self.axis = axis
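The excerpt stops before construct; a plausible completion (an assumption for illustration, not necessarily the original test code) gathers rows with the stored index and then allreduces the result:

 def construct(self, x):
     out = self.gatherv2(x, self.index, self.axis)
     out = self.all_reduce(out)
     return out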
Example #5
def _init_allreduce_operators(length, split_indices):
    """ initialize allreduce communication operators"""
    group = 1
    fusion = ()
    for i in range(length):
        fusion = fusion + (group, )
        if split_indices[group - 1] <= i + 1:
            if group >= len(split_indices):
                continue
            group = group + 1
    index = tuple(range(1, length + 1))
    op_list = ()
    for i in range(length):
        op = AllReduce('sum', GlobalComm.WORLD_COMM_GROUP)
        op.add_prim_attr('fusion', fusion[i])
        op.add_prim_attr('index', index[i])
        op_list = op_list + (op, )
    return op_list
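To see what the loop above produces, here is the grouping logic isolated as plain Python (illustration only, no communication primitives), with length=5 and split_indices=[2, 4] as example values:

def fusion_groups(length, split_indices):
    group, fusion = 1, ()
    for i in range(length):
        fusion = fusion + (group, )
        if split_indices[group - 1] <= i + 1:
            if group >= len(split_indices):
                continue
            group = group + 1
    return fusion

print(fusion_groups(5, [2, 4]))  # (1, 1, 2, 2, 2): gradients 1-2 fuse as group 1, gradients 3-5 as group 2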
Example #6
def _init_allreduce_operators(length, split_indices):
    """ initialize allreduce communication operators"""
    indices = split_indices[0]
    fusion = split_indices[1]
    op_list = ()
    j = 0
    for i in range(length):
        if j <= len(indices) - 1:
            temp = indices[j]
        else:
            temp = length
        if i >= temp:
            j = j + 1
            fusion = fusion + 1
        op = AllReduce('sum', GlobalComm.WORLD_COMM_GROUP)
        op.add_prim_attr('fusion', fusion)
        op_list = op_list + (op, )
    return op_list
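This variant expects split_indices as a pair of (boundary indices, starting fusion id) and, unlike the version above, keeps opening new fusion groups past the last boundary. Isolating the id assignment (illustration only), ([2, 4], 1) over five gradients yields fusion ids 1, 1, 2, 2, 3:

def fusion_ids(length, split_indices):
    indices, fusion = split_indices
    ids, j = (), 0
    for i in range(length):
        temp = indices[j] if j <= len(indices) - 1 else length
        if i >= temp:
            j = j + 1
            fusion = fusion + 1
        ids = ids + (fusion, )
    return ids

print(fusion_ids(5, ([2, 4], 1)))  # (1, 1, 2, 2, 3)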
Example #7
 def __init__(self, input_channel, out_channel, op):
     super(AllReduceNet, self).__init__()
     self.dense = Dense(input_channel, out_channel)
     self.reduce = AllReduce(op)
     self.relu = ReLU()
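The construct method is not shown; a plausible completion (an assumption for illustration) reduces the dense output across devices before the activation:

 def construct(self, x):
     x = self.dense(x)
     x = self.reduce(x)  # e.g. ReduceOp.SUM over all devices in the group
     return self.relu(x)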
Example #8
def _init_optimizer_allreduce():
    global _all_reduce
    _all_reduce = AllReduce(ReduceOp.SUM, GlobalComm.WORLD_COMM_GROUP)
    _all_reduce.add_prim_attr('fusion', 1)
Example #9
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""grad reducer cell for distributed training"""
from mindspore.nn.cell import Cell
from mindspore.communication.management import GlobalComm, get_group_size
from mindspore.ops import functional as F, composite as C, operations as P
from mindspore.ops.operations.comm_ops import AllReduce, ReduceOp
import mindspore.common.dtype as mstype

reduce_opt = C.MultitypeFuncGraph("reduce_opt")

_all_reduce = AllReduce()


def _init_optimizer_allreduce():
    global _all_reduce
    _all_reduce = AllReduce(ReduceOp.SUM, GlobalComm.WORLD_COMM_GROUP)
    _all_reduce.add_prim_attr('fusion', 1)


@reduce_opt.register("Function", "Number", "Bool", "Tensor")
def _tensors_allreduce_mean(mul, degree, allreduce_filter, grad):
    """
    Apply mean and allreduce on gradient. Allreduce is a communication operation used for distributed deep learning.

    Args:
        mul (Primitive): Div operation.
        degree (int): The mean coefficient.
        allreduce_filter (bool): When it is True, allreduce is applied.
        grad (Tensor): The gradient tensor before operation.
    """
    # The excerpt breaks off after the Args section; the body below is a
    # reconstruction (assumption) following the unfiltered variants further down.
    if allreduce_filter:
        degree = F.scalar_cast(degree, F.dtype(grad))
        grad = _all_reduce(grad)
        cast_op = P.Cast()
        grad = mul(grad, cast_op(F.scalar_to_array(1.0 / degree), F.dtype(grad)))
    return grad
Example #10
"""grad_reducer_thor"""
import mindspore.common.dtype as mstype
from mindspore.communication.management import GlobalComm, get_group_size
from mindspore.nn.cell import Cell
from mindspore.ops import functional as F, composite as C, operations as P
from mindspore.ops.operations.comm_ops import AllReduce, ReduceOp

reduce_opt = C.MultitypeFuncGraph("reduce_opt")

_all_reduce_A = AllReduce()


def _init_optimizer_allreduce(group):
    global _all_reduce_A
    _all_reduce_A = AllReduce(ReduceOp.SUM, GlobalComm.WORLD_COMM_GROUP)
    _all_reduce_A.add_prim_attr('fusion', group)


@reduce_opt.register("Function", "Number", "Tensor")
def _tensors_allreduce_mean(mul, degree, grad):
    degree = F.scalar_cast(degree, F.dtype(grad))
    grad = _all_reduce_A(grad)
    cast_op = P.Cast()
    return mul(grad, cast_op(F.scalar_to_array(1.0 / degree), F.dtype(grad)))
Example #11
"""grad_reducer_thor"""
import mindspore.common.dtype as mstype
from mindspore.communication.management import GlobalComm, get_group_size
from mindspore.nn.cell import Cell
from mindspore.ops import functional as F, composite as C, operations as P
from mindspore.ops.operations.comm_ops import AllReduce, ReduceOp

reduce_opt = C.MultitypeFuncGraph("reduce_opt")

_all_reduce_G = AllReduce()


def _init_optimizer_allreduce(group):
    global _all_reduce_G
    _all_reduce_G = AllReduce(ReduceOp.SUM, GlobalComm.WORLD_COMM_GROUP)
    _all_reduce_G.add_prim_attr('fusion', group)


@reduce_opt.register("Function", "Number", "Tensor")
def _tensors_allreduce_mean(mul, degree, grad):
    degree = F.scalar_cast(degree, F.dtype(grad))
    grad = _all_reduce_G(grad)
    cast_op = P.Cast()
    return mul(grad, cast_op(F.scalar_to_array(1.0 / degree), F.dtype(grad)))
Example #12
def test_all_reduce(x):
    print('test_all_reduce with %s' % (x))
    all_reduce = AllReduce()
    y = all_reduce(x)
    print('y=%s' % (y))
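AllReduce only runs inside an initialized distributed job, so a minimal driver for the test above might look like the sketch below (assumptions: the script is launched with something like mpirun -n 8 python script.py, and the device target is configured elsewhere):

import numpy as np
from mindspore import Tensor, context
from mindspore.communication import init

context.set_context(mode=context.PYNATIVE_MODE)
init()  # join the NCCL/HCCL communication group created by the launcher
x = Tensor(np.ones([2, 3]).astype(np.float32))
test_all_reduce(x)  # each device prints the element-wise sum across all devices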