def __init__(self, network, optimizer, scale_update_cell=None):
    super(BertTrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
    self.network = network
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    self.grad = ops.GradOperation(get_by_list=True, sens_param=True)
    self.reducer_flag = False
    self.allreduce = ops.AllReduce()
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = ops.identity
    self.degree = 1
    if self.reducer_flag:
        # aggregate gradients across all devices in the data-parallel group
        self.degree = get_group_size()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
    self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
    self.cast = ops.Cast()
    # Ascend float-status ops used to detect NaN/Inf overflow in the gradients
    self.alloc_status = ops.NPUAllocFloatStatus()
    self.get_status = ops.NPUGetFloatStatus()
    self.clear_before_grad = ops.NPUClearFloatStatus()
    self.reduce_sum = ops.ReduceSum(keep_dims=False)
    # note: ControlDepend is deprecated in newer MindSpore in favor of ops.Depend
    self.depend_parameter_use = ops.ControlDepend(depend_mode=1)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = ops.LessEqual()
    self.hyper_map = ops.HyperMap()
    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                    name="loss_scale")
Example #2
def __init__(self,
             size=256,
             batch_size=16,
             image_size=(96,),
             num_classes=16,
             random_offset=0):
    """Initialize a synthetic dataset, sharded across the HCCL group."""
    self.size = size
    self.rank_batch_size = batch_size
    self.random_offset = random_offset
    self.image_size = image_size
    self.num_classes = num_classes
    self.num_epochs = -1
    self.batch_index = 0
    self.image_data_type = np.float32
    self.label_data_type = np.float32
    self.is_onehot = True
    # initialize the HCCL backend, then derive this process's shard and the
    # global batch geometry from the communication group
    init(backend_name='hccl')
    self.rank_size = get_group_size()
    self.rank_id = get_rank()
    self.total_batch_size = self.rank_batch_size * self.rank_size
    self.total_batch_data_size = (self.rank_size, self.rank_batch_size) + image_size
    self.do_copy = False
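
A hypothetical companion method (an assumption, not in the original class) showing how the configured shapes could synthesize one batch: the full cross-rank batch is generated deterministically, then this rank's shard is sliced out.

import numpy as np

def _next_batch(self):
    # seed from the offset and batch index so every rank generates the same data
    rng = np.random.RandomState(self.random_offset + self.batch_index)
    images = rng.rand(*self.total_batch_data_size).astype(self.image_data_type)
    labels = rng.randint(0, self.num_classes,
                         (self.rank_size, self.rank_batch_size))
    if self.is_onehot:
        labels = np.eye(self.num_classes)[labels].astype(self.label_data_type)
    self.batch_index += 1
    # keep only this rank's slice of the global batch
    return images[self.rank_id], labels[self.rank_id]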
Example #3
def get_bprop_all_gather(self):
    """Generate bprop for AllGather"""
    fusion = self.get_attr_dict()["fusion"]
    if fusion == 0:
        reduce_scatter = ReduceScatter(ReduceOp.SUM, self.group)
        if self.instance_name:
            instance_name = "grad_" + self.instance_name
            reduce_scatter.set_prim_instance_name(instance_name)
    else:
        all_reduce = AllReduce(ReduceOp.SUM,
                               self.group).add_prim_attr("fusion", fusion)
        if self.instance_name:
            instance_name = "grad_" + self.instance_name
            all_reduce.set_prim_instance_name(instance_name)
        rank = get_rank(self.group)
        dev_num = get_group_size(self.group)
        split = P.Split(output_num=dev_num)
        mean_flag = self.get_attr_dict()["mean_flag"]
        scale = 1 / self.rank_size

    def bprop(x, out, dout):
        if fusion == 0:
            dx = reduce_scatter(dout)
        else:
            grad = all_reduce(dout)
            dx = split(grad)[rank]
            if mean_flag:
                dx = F.tensor_mul(dx, scale)
        return (dx, )

    return bprop
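
Both branches compute the same mathematical gradient: AllGather concatenates each rank's slice, so its transpose sums the gradient slices each rank contributed, which is exactly ReduceScatter(SUM); the fused branch emulates it with AllReduce followed by Split. A small single-process numpy sketch (all names assumed) of the equivalence:

import numpy as np

dev_num, shard, rank = 4, 3, 1
xs = [np.random.rand(shard).astype(np.float32) for _ in range(dev_num)]
out = np.concatenate(xs)  # AllGather forward: identical result on every rank
# each rank holds its own gradient of `out`
douts = [np.random.rand(dev_num * shard).astype(np.float32) for _ in range(dev_num)]
# rank `rank`'s input occupies slice `rank` of `out` on *every* rank, so its
# gradient is the sum over ranks of that slice: ReduceScatter(SUM)
dx_rs = np.sum([np.split(d, dev_num)[rank] for d in douts], axis=0)
# the fused branch computes the same thing via AllReduce(SUM), then Split + index
dx_fused = np.split(np.sum(douts, axis=0), dev_num)[rank]
assert np.allclose(dx_rs, dx_fused)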
Example #4
def get_bprop_mini_step_all_gather(self):
    """Generate bprop for _MiniStepAllGather"""
    fusion = self.get_attr_dict()["fusion"]
    mean_flag = self.get_attr_dict()["mean_flag"]
    do_mirror = self.get_attr_dict()["do_mirror"]
    scale = 1 / self.rank_size
    all_reduce = AllReduce(ReduceOp.SUM,
                           self.group).add_prim_attr("fusion", fusion)
    if self.instance_name:
        instance_name = "grad_" + self.instance_name
        all_reduce.set_prim_instance_name(instance_name)
    rank = get_rank(self.group)
    dev_num = get_group_size(self.group)
    split = P.Split(output_num=dev_num)

    def bprop(x, z, out, dout):
        if do_mirror:
            if mean_flag:
                z = F.depend(z, F.assign_add(z, dout))
                grad = all_reduce(z)
                dx = split(grad)[rank]
                dx = F.tensor_mul(dx, scale)
            else:
                z = F.depend(z, F.assign_add(z, dout))
                grad = all_reduce(z)
                dx = split(grad)[rank]
        else:
            dx = dout
        return (dx, zeros_like(z))

    return bprop
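
The F.assign_add / F.depend pair folds the incoming gradient into the persistent buffer z before communicating, so under gradient accumulation the AllReduce sees the running sum rather than a single micro-batch gradient; do_mirror gates whether this step communicates at all. A plain-Python rendering (a single-process simulation; the AllReduce is faked by multiplying by the device count, which assumes identical shards):

import numpy as np

def mini_step_bprop(z, dout, do_mirror, rank=0, dev_num=4, mean_flag=True):
    if not do_mirror:
        return dout                      # intermediate micro-step: pass dout through
    z += dout                            # F.assign_add(z, dout)
    grad = dev_num * z                   # stand-in for all_reduce(z)
    dx = np.split(grad, dev_num)[rank]   # split(grad)[rank]
    if mean_flag:
        dx = dx / dev_num                # F.tensor_mul(dx, 1 / rank_size)
    return dx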
Example #5
def create_dataset(data_path, repeat_num=1, batch_size=32):
    """create dataset"""
    resize_height = 224
    resize_width = 224
    rescale = 1.0 / 255.0
    shift = 0.0

    # shard the dataset by this process's position in the communication group
    rank_id = get_rank()
    rank_size = get_group_size()
    data_set = ds.Cifar10Dataset(data_path,
                                 num_shards=rank_size,
                                 shard_id=rank_id)

    # define map operations
    random_crop_op = vision.RandomCrop((32, 32), (4, 4, 4, 4))
    random_horizontal_op = vision.RandomHorizontalFlip()
    resize_op = vision.Resize((resize_height, resize_width))
    rescale_op = vision.Rescale(rescale, shift)
    normalize_op = vision.Normalize((0.4465, 0.4822, 0.4914),
                                    (0.2010, 0.1994, 0.2023))
    changeswap_op = vision.HWC2CHW()
    type_cast_op = C.TypeCast(mstype.int32)

    c_trans = [random_crop_op, random_horizontal_op]
    c_trans += [resize_op, rescale_op, normalize_op, changeswap_op]

    # apply map operations on images
    data_set = data_set.map(operations=type_cast_op, input_columns="label")
    data_set = data_set.map(operations=c_trans, input_columns="image")

    # apply shuffle operations
    data_set = data_set.shuffle(buffer_size=10)

    # apply batch operations
    data_set = data_set.batch(batch_size=batch_size, drop_remainder=True)

    # apply repeat operations
    data_set = data_set.repeat(repeat_num)

    return data_set
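
A minimal driver sketch (the path and launch method are assumptions, not from the original): communication must be initialized before create_dataset runs, because it queries get_rank() and get_group_size().

from mindspore.communication import init

init()  # e.g. launched via mpirun or a rank-table script
data_set = create_dataset("/path/to/cifar-10-batches-bin", repeat_num=1, batch_size=32)
for item in data_set.create_dict_iterator(num_epochs=1):
    images, labels = item["image"], item["label"]
    break  # one batch: images are CHW float32 after HWC2CHW, labels int32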
Example #6
def __init__(self, network, optimizer, sens=1.0):
    super(BertTrainOneStepCell, self).__init__(auto_prefix=False)
    self.network = network
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    self.grad = ops.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens
    self.reducer_flag = False
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = None
    if self.reducer_flag:
        # mean (or sum) the gradients across devices before the update
        mean = context.get_auto_parallel_context("mirror_mean")
        degree = get_group_size()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)

    self.cast = ops.Cast()
    self.hyper_map = ops.HyperMap()
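
For context, the construct that usually accompanies this __init__ (a sketch based on the common MindSpore one-step-cell pattern, not taken from the original snippet) scales the sensitivity, differentiates, reduces, and applies the optimizer:

from mindspore.ops import functional as F
from mindspore.common import dtype as mstype

def construct(self, *inputs):
    weights = self.weights
    loss = self.network(*inputs)
    # seed backpropagation with a constant sensitivity of self.sens
    sens = self.cast(F.tuple_to_array((self.sens,)), mstype.float32)
    grads = self.grad(self.network, weights)(*inputs, sens)
    if self.reducer_flag:
        grads = self.grad_reducer(grads)  # aggregate gradients across devices
    self.optimizer(grads)
    return loss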
Example #7
def __init__(self, network, optimizer, scale_update_cell=None):
    super(BertPoetryCell, self).__init__(network, optimizer, scale_update_cell)
    self.network = network
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    self.grad = ops.GradOperation(get_by_list=True, sens_param=True)
    self.reducer_flag = False
    self.allreduce = ops.AllReduce()
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = None
    if self.reducer_flag:
        mean = context.get_auto_parallel_context("mirror_mean")
        degree = get_group_size()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
    self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
    self.cast = ops.Cast()
    self.gpu_target = False
    if context.get_context("device_target") == "GPU":
        # GPU has no NPU float-status registers; detect overflow by summing
        # FloatStatus over every gradient instead
        self.gpu_target = True
        self.float_status = ops.FloatStatus()
        self.addn = ops.AddN()
        self.reshape = ops.Reshape()
    else:
        # Ascend: dedicated float-status registers track NaN/Inf
        self.alloc_status = ops.NPUAllocFloatStatus()
        self.get_status = ops.NPUGetFloatStatus()
        self.clear_before_grad = ops.NPUClearFloatStatus()
    self.reduce_sum = ops.ReduceSum(keep_dims=False)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = ops.LessEqual()
    self.hyper_map = ops.HyperMap()
    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                    name="loss_scale")