def __init__(self, network, optimizer, scale_update_cell=None):
    """Build the loss-scaled one-step training cell.

    Args:
        network: the network whose trainable parameters are optimized;
            its forward output is what gradients are taken against.
        optimizer: optimizer applied to ``network.trainable_params()``.
        scale_update_cell: optional dynamic loss-scale manager; when given,
            its current loss scale seeds the ``loss_scale`` Parameter.
    """
    super(BertTrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
    self.network = network
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    # Gradients are taken w.r.t. the weight list, with an external sensitivity
    # (the loss-scale value) fed in via sens_param.
    self.grad = ops.GradOperation(get_by_list=True, sens_param=True)
    self.reducer_flag = False
    self.allreduce = ops.AllReduce()
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    # Gradient all-reduce is only needed in data/hybrid parallel modes.
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    # Defaults for the non-distributed case: identity reducer, degree 1.
    self.grad_reducer = ops.identity
    self.degree = 1
    if self.reducer_flag:
        self.degree = get_group_size()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
    self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
    self.cast = ops.Cast()
    # NPU float-status primitives used for overflow detection around the
    # backward pass (Ascend-only primitives).
    self.alloc_status = ops.NPUAllocFloatStatus()
    self.get_status = ops.NPUGetFloatStatus()
    self.clear_before_grad = ops.NPUClearFloatStatus()
    self.reduce_sum = ops.ReduceSum(keep_dims=False)
    self.depend_parameter_use = ops.ControlDepend(depend_mode=1)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = ops.LessEqual()
    self.hyper_map = ops.HyperMap()
    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        # Materialize the manager's current scale as a trainable-graph Parameter.
        self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(),
                                           dtype=mstype.float32),
                                    name="loss_scale")
def __init__(self, size=256, batch_size=16, image_size=(96, ), num_classes=16, random_offset=0):
    """Initialize a synthetic dataset source sized for the HCCL group.

    Args:
        size: total number of samples in the dataset.
        batch_size: per-rank batch size.
        image_size: tuple of per-sample image dimensions (appended to the
            (rank_size, rank_batch_size) prefix to form the batch shape).
        num_classes: number of label classes (labels are one-hot, float32).
        random_offset: seed/offset for sample generation — TODO confirm
            exact use at the generation site.
    """
    self.size = size
    self.rank_batch_size = batch_size
    self.total_batch_size = self.rank_batch_size
    self.random_offset = random_offset
    self.image_size = image_size
    self.num_classes = num_classes
    self.num_epochs = -1
    # Single-rank defaults; overwritten below once HCCL is initialized.
    self.rank_size = 1
    self.rank_id = 0
    self.batch_index = 0
    self.image_data_type = np.float32
    self.label_data_type = np.float32
    self.is_onehot = True
    # Side effect: initializes HCCL distributed communication unconditionally,
    # so this class requires a distributed (Ascend/HCCL) environment.
    init(backend_name='hccl')
    self.rank_size = get_group_size()
    self.rank_id = get_rank()
    # Global batch spans all ranks; batches are laid out rank-major.
    self.total_batch_size = self.rank_batch_size * self.rank_size
    self.total_batch_data_size = (self.rank_size, self.rank_batch_size) + image_size
    self.do_copy = False
def get_bprop_all_gather(self):
    """Generate bprop for AllGather.

    The gradient of an all-gather is a reduce-scatter: each rank keeps the
    slice of the summed gradient that corresponds to its own contribution.
    When ``fusion`` is nonzero, the same result is obtained via a fused
    AllReduce followed by a Split indexed at this rank.
    """
    # Attributes below are read once at bprop-construction (trace) time.
    fusion = self.get_attr_dict()["fusion"]
    if fusion == 0:
        reduce_scatter = ReduceScatter(ReduceOp.SUM, self.group)
        if self.instance_name:
            instance_name = "grad_" + self.instance_name
            reduce_scatter.set_prim_instance_name(instance_name)
    else:
        # Tag the AllReduce with the same fusion id so the framework can fuse it.
        all_reduce = AllReduce(ReduceOp.SUM, self.group).add_prim_attr("fusion", fusion)
        if self.instance_name:
            instance_name = "grad_" + self.instance_name
            all_reduce.set_prim_instance_name(instance_name)
        rank = get_rank(self.group)
        dev_num = get_group_size(self.group)
        split = P.Split(output_num=dev_num)
    mean_flag = self.get_attr_dict()["mean_flag"]
    # NOTE(review): scale uses self.rank_size while dev_num comes from
    # get_group_size(self.group) — presumably these agree; verify for subgroups.
    scale = 1 / self.rank_size

    def bprop(x, out, dout):
        if fusion == 0:
            dx = reduce_scatter(dout)
        else:
            grad = all_reduce(dout)
            dx = split(grad)[rank]
        if mean_flag:
            # Average (rather than sum) the gradient across the group.
            dx = F.tensor_mul(dx, scale)
        return (dx, )
    return bprop
def get_bprop_mini_step_all_gather(self):
    """Generate bprop for _MiniStepAllGather.

    When ``do_mirror`` is set, the incoming gradient is accumulated into the
    accumulator ``z``, all-reduced across the group, and this rank's slice is
    taken; with ``mean_flag`` the slice is additionally scaled by
    ``1 / rank_size`` to average instead of sum. When ``do_mirror`` is false,
    the gradient passes through unchanged.
    """
    # Attributes are read once at bprop-construction (trace) time; the
    # booleans below select the traced branch, not a runtime branch.
    fusion = self.get_attr_dict()["fusion"]
    mean_flag = self.get_attr_dict()["mean_flag"]
    do_mirror = self.get_attr_dict()["do_mirror"]
    scale = 1 / self.rank_size
    all_reduce = AllReduce(ReduceOp.SUM, self.group).add_prim_attr("fusion", fusion)
    if self.instance_name:
        instance_name = "grad_" + self.instance_name
        all_reduce.set_prim_instance_name(instance_name)
    rank = get_rank(self.group)
    dev_num = get_group_size(self.group)
    split = P.Split(output_num=dev_num)

    def bprop(x, z, out, dout):
        if do_mirror:
            # Accumulate dout into z (depend forces ordering), reduce across
            # the group, and keep this rank's slice. This sequence was
            # previously duplicated in both mean_flag branches; only the
            # final scaling differs.
            z = F.depend(z, F.assign_add(z, dout))
            grad = all_reduce(z)
            dx = split(grad)[rank]
            if mean_flag:
                dx = F.tensor_mul(dx, scale)
        else:
            dx = dout
        return (dx, zeros_like(z))
    return bprop
def create_dataset(data_path, repeat_num=1, batch_size=32, rank_id=0, rank_size=1):
    """Create a sharded, augmented CIFAR-10 training dataset.

    Args:
        data_path: path to the CIFAR-10 data files.
        repeat_num: number of dataset repeats.
        batch_size: batch size (batches drop the remainder).
        rank_id: unused — see NOTE below.
        rank_size: unused — see NOTE below.

    Returns:
        The configured ``ds.Cifar10Dataset`` pipeline.
    """
    resize_height = 224
    resize_width = 224
    rescale = 1.0 / 255.0
    shift = 0.0
    # get rank_id and rank_size
    # NOTE(review): the rank_id/rank_size parameters are dead — they are
    # immediately overwritten by the live group values, so callers passing
    # them have no effect. Confirm whether the parameters should be honored
    # or removed from the signature.
    rank_id = get_rank()
    rank_size = get_group_size()
    data_set = ds.Cifar10Dataset(data_path, num_shards=rank_size, shard_id=rank_id)

    # define map operations
    random_crop_op = vision.RandomCrop((32, 32), (4, 4, 4, 4))
    random_horizontal_op = vision.RandomHorizontalFlip()
    resize_op = vision.Resize((resize_height, resize_width))
    rescale_op = vision.Rescale(rescale, shift)
    normalize_op = vision.Normalize((0.4465, 0.4822, 0.4914), (0.2010, 0.1994, 0.2023))
    changeswap_op = vision.HWC2CHW()
    type_cast_op = C.TypeCast(mstype.int32)

    c_trans = [random_crop_op, random_horizontal_op]
    c_trans += [resize_op, rescale_op, normalize_op, changeswap_op]

    # apply map operations on images
    data_set = data_set.map(operations=type_cast_op, input_columns="label")
    data_set = data_set.map(operations=c_trans, input_columns="image")

    # apply shuffle operations
    data_set = data_set.shuffle(buffer_size=10)

    # apply batch operations
    data_set = data_set.batch(batch_size=batch_size, drop_remainder=True)

    # apply repeat operations
    data_set = data_set.repeat(repeat_num)

    return data_set
def __init__(self, network, optimizer, sens=1.0):
    """Build the (non-loss-scaled) one-step training cell.

    Args:
        network: the network whose trainable parameters are optimized.
        optimizer: optimizer applied to ``network.trainable_params()``.
        sens: fixed sensitivity (scale) fed to the gradient operation.
    """
    super(BertTrainOneStepCell, self).__init__(auto_prefix=False)
    self.network = network
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    # Gradients w.r.t. the weight list, with the constant `sens` supplied
    # externally via sens_param.
    self.grad = ops.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens
    self.reducer_flag = False
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    # Gradient all-reduce is only needed in data/hybrid parallel modes.
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = None
    if self.reducer_flag:
        mean = context.get_auto_parallel_context("mirror_mean")
        degree = get_group_size()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
    self.cast = ops.Cast()
    self.hyper_map = ops.HyperMap()
def __init__(self, network, optimizer, scale_update_cell=None):
    """Build the BertPoetry training cell with loss scaling.

    Args:
        network: the network whose trainable parameters are optimized.
        optimizer: optimizer applied to ``network.trainable_params()``.
        scale_update_cell: optional dynamic loss-scale manager; when given,
            its current loss scale seeds the ``loss_scale`` Parameter.
    """
    # NOTE(review): the parent __init__ receives the full argument set and
    # presumably performs similar setup; many attributes are then re-assigned
    # below. Confirm whether this duplication is intentional.
    super(BertPoetryCell, self).__init__(network, optimizer, scale_update_cell)
    self.network = network
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    # Gradients w.r.t. the weight list, with the loss-scale value fed in
    # externally via sens_param.
    self.grad = ops.GradOperation(get_by_list=True, sens_param=True)
    self.reducer_flag = False
    self.allreduce = ops.AllReduce()
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    # Gradient all-reduce is only needed in data/hybrid parallel modes.
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = None
    if self.reducer_flag:
        mean = context.get_auto_parallel_context("mirror_mean")
        degree = get_group_size()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
    self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
    self.cast = ops.Cast()
    self.gpu_target = False
    if context.get_context("device_target") == "GPU":
        # GPU overflow detection: FloatStatus + AddN over per-tensor statuses.
        self.gpu_target = True
        self.float_status = ops.FloatStatus()
        self.addn = ops.AddN()
        self.reshape = ops.Reshape()
    else:
        # Ascend overflow detection: NPU float-status primitives.
        self.alloc_status = ops.NPUAllocFloatStatus()
        self.get_status = ops.NPUGetFloatStatus()
        self.clear_before_grad = ops.NPUClearFloatStatus()
    self.reduce_sum = ops.ReduceSum(keep_dims=False)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = ops.LessEqual()
    self.hyper_map = ops.HyperMap()
    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        # Materialize the manager's current scale as a graph Parameter.
        self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(),
                                           dtype=mstype.float32),
                                    name="loss_scale")