Example #1
 def __init__(self, network, optimizer, scale_update_cell=None):
     super(BertSquadCell, self).__init__(auto_prefix=False)
     self.network = network
     self.weights = ParameterTuple(network.trainable_params())
     self.optimizer = optimizer
     self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
     self.reducer_flag = False
     self.allreduce = P.AllReduce()
     self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
     if self.parallel_mode in [
             ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
     ]:
         self.reducer_flag = True
     self.grad_reducer = None
     if self.reducer_flag:
         mean = context.get_auto_parallel_context("mirror_mean")
         degree = get_group_size()
         self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                    mean, degree)
     self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
     self.cast = P.Cast()
     self.alloc_status = P.NPUAllocFloatStatus()
     self.get_status = P.NPUGetFloatStatus()
     self.clear_before_grad = P.NPUClearFloatStatus()
     self.reduce_sum = P.ReduceSum(keep_dims=False)
     self.depend_parameter_use = P.ControlDepend(depend_mode=1)
     self.base = Tensor(1, mstype.float32)
     self.less_equal = P.LessEqual()
     self.hyper_map = C.HyperMap()
     self.loss_scale = None
     self.loss_scaling_manager = scale_update_cell
     if scale_update_cell:
         self.loss_scale = Parameter(Tensor(
             scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                     name="loss_scale")
Example #2
 def __init__(self, network, optimizer, scale_update_cell=None):
     super(BertTrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
     self.network = network
     self.network.set_grad()
     self.weights = optimizer.parameters
     self.optimizer = optimizer
     self.grad = C.GradOperation(get_by_list=True,
                                 sens_param=True)
     self.reducer_flag = False
     self.allreduce = P.AllReduce()
     self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
     if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
         self.reducer_flag = True
     self.grad_reducer = F.identity
     self.degree = 1
     if self.reducer_flag:
         self.degree = get_group_size()
         self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
     self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
     self.cast = P.Cast()
     self.alloc_status = P.NPUAllocFloatStatus()
     self.get_status = P.NPUGetFloatStatus()
     self.clear_status = P.NPUClearFloatStatus()
     self.reduce_sum = P.ReduceSum(keep_dims=False)
     self.base = Tensor(1, mstype.float32)
     self.less_equal = P.LessEqual()
     self.hyper_map = C.HyperMap()
     self.loss_scale = None
     self.loss_scaling_manager = scale_update_cell
     if scale_update_cell:
         self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32))
Example #3
    def __init__(self,
                 network,
                 optimizer,
                 scale_update_cell=None,
                 accumulation_steps=1,
                 enable_global_norm=False):
        super(BertTrainAccumulateStepsWithLossScaleCell,
              self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.accumulation_steps = accumulation_steps
        self.enable_global_norm = enable_global_norm
        self.one = Tensor(np.array([1]).astype(np.int32))
        self.zero = Tensor(np.array([0]).astype(np.int32))
        self.local_step = Parameter(initializer(0, [1], mstype.int32),
                                    name="local_step")
        self.accu_grads = self.weights.clone(prefix="accu_grads", init='zeros')
        self.accu_overflow = Parameter(initializer(0, [1], mstype.int32),
                                       name="accu_overflow")
        self.loss = Parameter(initializer(0, [1], mstype.float32),
                              name="accu_loss")

        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.reducer_flag = False
        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if self.parallel_mode in [
                ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]:
            self.reducer_flag = True
        self.grad_reducer = F.identity
        self.degree = 1
        if self.reducer_flag:
            self.degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       False, self.degree)
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
        self.overflow_reducer = F.identity
        if self.is_distributed:
            self.overflow_reducer = P.AllReduce()
        self.cast = P.Cast()
        self.alloc_status = P.NPUAllocFloatStatus()
        self.get_status = P.NPUGetFloatStatus()
        self.clear_before_grad = P.NPUClearFloatStatus()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = P.LessEqual()
        self.logical_or = P.LogicalOr()
        self.not_equal = P.NotEqual()
        self.select = P.Select()
        self.reshape = P.Reshape()
        self.hyper_map = C.HyperMap()
        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(Tensor(
                scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                        name="loss_scale")
Example #4
    def __init__(self, network, optimizer, scale_update_cell=None):
        super(BertTrainWithLossScaleCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.reducer_flag = False
        self.allreduce = P.AllReduce()
        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if self.parallel_mode in [
                ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]:
            self.reducer_flag = True
        self.grad_reducer = F.identity
        self.degree = 1
        if self.reducer_flag:
            self.degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       False, self.degree)
        self.clip_type = gradient_cfg.clip_type
        self.clip_value = gradient_cfg.clip_value
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
        self.cast = P.Cast()
        self.alloc_status = P.NPUAllocFloatStatus()
        self.get_status = P.NPUGetFloatStatus()
        self.clear_before_grad = P.NPUClearFloatStatus()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.depend_parameter_use = P.ControlDepend(depend_mode=1)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = P.LessEqual()
        self.hyper_map = C.HyperMap()
        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(
                Tensor(scale_update_cell.get_loss_scale(),
                       dtype=mstype.float32))

        self.saved_params = self.weights.clone(prefix='saved')
        self.length = len(self.weights)
        self.quant_embedding_list = []
        self.quant_weight_list = []
        for i, key in enumerate(self.saved_params):
            if 'embedding_lookup' in key.name:
                self.quant_embedding_list.append(i)
            elif 'weight' in key.name and 'dense_1' not in key.name:
                self.quant_weight_list.append(i)
        self.quant_embedding_list_length = len(self.quant_embedding_list)
        self.quant_weight_list_length = len(self.quant_weight_list)

        self.quantize_embedding = QuantizeWeightCell(
            num_bits=network.embedding_bits,
            compute_type=network.compute_type,
            clip_value=network.weight_clip_value)
        self.quantize_weight = QuantizeWeightCell(
            num_bits=network.weight_bits,
            compute_type=network.compute_type,
            clip_value=network.weight_clip_value)
Example #5
 def __init__(self):
     super().__init__()
     self.alloc_status = P.NPUAllocFloatStatus()
     self.get_status = P.NPUGetFloatStatus()
     self.clear_status = P.NPUClearFloatStatus()
     self.reduce_sum = P.ReduceSum(keep_dims=True)
     self.dtype = P.DType()
     self.sub = P.Sub()
     self.neg = P.Neg()
Example #6
    def __init__(self, network, optimizer, scale_update_cell=None):

        super(TransformerTrainOneStepWithLossScaleCell,
              self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.network.add_flags(defer_inline=True)
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.reducer_flag = False
        self.all_reduce = P.AllReduce()

        self.parallel_mode = _get_parallel_mode()
        if self.parallel_mode not in ParallelMode.MODE_LIST:
            raise ValueError("Parallel mode does not support: ",
                             self.parallel_mode)
        if self.parallel_mode in [
                ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]:
            self.reducer_flag = True
        self.grad_reducer = None
        if self.reducer_flag:
            mean = _get_gradients_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, degree)
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
        self.clip_gradients = ClipGradients()
        self.cast = P.Cast()
        if context.get_context("device_target") == "GPU":
            self.gpu_target = True
            self.float_status = P.FloatStatus()
            self.addn = P.AddN()
            self.reshape = P.Reshape()
        else:
            self.gpu_target = False
            self.alloc_status = P.NPUAllocFloatStatus()
            self.get_status = P.NPUGetFloatStatus()
            self.clear_status = P.NPUClearFloatStatus()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.depend_parameter_use = P.ControlDepend(depend_mode=1)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = P.LessEqual()
        self.hyper_map = C.HyperMap()

        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(
                Tensor(scale_update_cell.get_loss_scale(),
                       dtype=mstype.float32))
        self.add_flags(has_effect=True)
Example #7
 def __init__(self):
     super(NpuFloatNet, self).__init__()
     self.mul = P.Mul()
     self.alloc_status = P.NPUAllocFloatStatus()
     self.get_status = P.NPUGetFloatStatus()
     self.clear_status = P.NPUClearFloatStatus()
     self.fill = P.Fill()
     self.shape_op = P.Shape()
     self.select = P.Select()
     self.less = P.Less()
     self.cast = P.Cast()
     self.dtype = P.DType()
     self.reduce_sum = P.ReduceSum(keep_dims=True)
     self.sub = P.Sub()
     self.neg = P.Neg()
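Example #7 only shows the operator setup; below is a hedged sketch (not taken from the source) of the construct() that typically accompanies it. It uses only the attributes initialized above to zero out the result whenever the NPU flags report an inf/nan, and assumes F is mindspore.ops.functional.

 def construct(self, x):
     init = self.alloc_status()                    # [8] float32 status buffer
     clear_status = self.clear_status(init)
     x = F.depend(x, clear_status)                 # clear the flags before computing
     res = self.sub(x, self.neg(x))                # the computation being monitored
     init = F.depend(init, res)
     init = F.depend(init, self.get_status(init))  # flags are written back into init
     flag_sum = self.reduce_sum(init, (0,))        # keep_dims=True -> shape [1]
     zeros = self.fill(self.dtype(res), self.shape_op(res), 0.0)
     cond = self.less(self.cast(zeros, self.dtype(flag_sum)), flag_sum)
     # wherever a flag was raised, return zeros instead of the possibly inf/nan result
     return self.select(cond, zeros, res)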
Example #8
 def __init__(self,
              network,
              optimizer,
              scale_update_cell=None,
              enable_global_norm=True,
              config=None):
     super(PANGUALPHATrainOneStepWithLossScaleCell,
           self).__init__(auto_prefix=False)
     self.network = network
     self.config = config
     self.network.add_flags(defer_inline=True)
     self.weights = optimizer.parameters
     self.optimizer = optimizer
     self.enable_global_norm = enable_global_norm
     self.grad = C.GradOperation(get_by_list=True, sens_param=True)
     self.reducer_flag = False
     self.allreduce = P.AllReduce()
     self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
     if self.parallel_mode in [
         ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
     ]:
         self.reducer_flag = True
     self.grad_reducer = F.identity
     self.degree = 1
     if self.reducer_flag:
         self.degree = get_group_size()
         self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                    False, self.degree)
     self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
     self.cast = P.Cast()
     self.alloc_status = P.NPUAllocFloatStatus()
     self.get_status = P.NPUGetFloatStatus()
     self.clear_before_grad = P.NPUClearFloatStatus()
     self.reduce_sum = P.ReduceSum(keep_dims=False)
     self.depend_parameter_use = P.ControlDepend(depend_mode=1)
     self.base = Tensor(1, mstype.float32)
     self.less_equal = P.LessEqual()
     self.hyper_map = C.HyperMap()
     self.loss_scale = None
     self.loss_scaling_manager = scale_update_cell
     if scale_update_cell:
         self.loss_scale = Parameter(Tensor(
             scale_update_cell.get_loss_scale(), dtype=mstype.float32),
             name="loss_scale")
     self.clip = ClipByGlobalNorm(self.weights, self.config, pipeline=False)
Example #9
    def __init__(self, network, optimizer, scale_update_cell=None):

        super(BertFinetuneCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.reducer_flag = False
        self.allreduce = P.AllReduce()
        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if self.parallel_mode in [
                ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]:
            self.reducer_flag = True
        self.grad_reducer = None
        if self.reducer_flag:
            mean = context.get_auto_parallel_context("gradients_mean")
            degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, degree)
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
        self.cast = P.Cast()
        self.gpu_target = False
        if context.get_context("device_target") == "GPU":
            self.gpu_target = True
            self.float_status = P.FloatStatus()
            self.addn = P.AddN()
            self.reshape = P.Reshape()
        else:
            self.alloc_status = P.NPUAllocFloatStatus()
            self.get_status = P.NPUGetFloatStatus()
            self.clear_status = P.NPUClearFloatStatus()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = P.LessEqual()
        self.hyper_map = C.HyperMap()
        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(
                Tensor(scale_update_cell.get_loss_scale(),
                       dtype=mstype.float32))
Example #10
 def __init__(self, network, optimizer, scale_update_cell=None):
     super(TrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
     self.network = network
     self.weights = optimizer.parameters
     self.optimizer = optimizer
     self.grad = C.GradOperation(get_by_list=True, sens_param=True)
     self.reducer_flag = False
     self.grad_reducer = F.identity
     self.cast = P.Cast()
     self.alloc_status = P.NPUAllocFloatStatus()
     self.get_status = P.NPUGetFloatStatus()
     self.clear_status = P.NPUClearFloatStatus()
     self.reduce_sum = P.ReduceSum(keep_dims=False)
     self.base = Tensor(1, mstype.float32)
     self.less_equal = P.LessEqual()
     self.hyper_map = C.HyperMap()
     self.loss_scale = None
     self.loss_scaling_manager = scale_update_cell
     if scale_update_cell:
         self.loss_scale = Parameter(Tensor(
             scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                     name="loss_scale")
Example #11
 def __init__(self):
     super(Net, self).__init__()
     self.npu_get_float_status = P.NPUGetFloatStatus()
Example #12
 ('LogicalAnd', {
         'block': P.LogicalAnd(),
         'desc_inputs': [Tensor(np.zeros((2, 3, 4), np.bool_)), Tensor(np.ones((1), np.bool_))],
         'desc_bprop': [Tensor(np.zeros((2, 3, 4), np.bool_))]}),
 ('LogicalOr', {
         'block': P.LogicalOr(),
         'desc_inputs': [Tensor(np.zeros((3, 4, 5), np.bool_)), Tensor(np.ones((3, 1, 1), np.bool_))],
         'desc_bprop': [Tensor(np.zeros((3, 4, 5), np.bool_))]}),
 ('NpuAllocFloatStatus', {
     'block': P.NPUAllocFloatStatus(),
     'desc_inputs': [],
     'add_fack_input': True,
     'fack_input_type': np.float32,
     'desc_bprop': [Tensor(np.zeros([8]).astype(np.float32))],
     'skip': ['backward']}),
 ('NpuGetFloatStatus', {
     'block': P.NPUGetFloatStatus(),
     'desc_inputs': [Tensor(np.zeros([8]).astype(np.float32))],
     'desc_bprop': [Tensor(np.zeros([8]).astype(np.float32))],
     'skip': ['backward']}),
 ('NpuClearFloatStatus', {
     'block': P.NPUClearFloatStatus(),
     'desc_inputs': [Tensor(np.zeros([8]).astype(np.float32))],
     'desc_bprop': [Tensor(np.zeros([8]).astype(np.float32))],
     'skip': ['backward']}),
 ('CheckValid', {
     'block': P.CheckValid(),
     'desc_inputs': [[20000, 4], [3]],
     'desc_bprop': [[20000]],
     'skip': ['backward']}),
 ('NMSWithMask', {
     'block': P.NMSWithMask(0.5),
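The three NPU*FloatStatus entries above all encode the same contract: the operator takes and returns a 1-D float32 tensor of length 8, and 'skip': ['backward'] marks them as non-differentiable. A rough hand-run equivalent of what the table exercises, assuming PyNative mode on an Ascend device, is:

 from mindspore import context
 from mindspore.ops import operations as P

 context.set_context(mode=context.PYNATIVE_MODE, device_target="Ascend")

 alloc = P.NPUAllocFloatStatus()
 clear = P.NPUClearFloatStatus()
 get = P.NPUGetFloatStatus()

 status = alloc()           # allocates the [8] float32 status buffer
 status = clear(status)     # resets the hardware overflow flags
 # ... run the float16/float32 computation to be monitored here ...
 status = get(status)       # non-zero entries indicate an inf/nan occurred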
Example #13
 def __init__(self):
     super(Net_hyper, self).__init__()
     self.func = Func()
     self.alloc_status = P.NPUAllocFloatStatus()
     self.get_status = P.NPUGetFloatStatus()
     self.clear_status = P.NPUClearFloatStatus()
Example #14
        'skip': ['backward']}),

    # type of x and y not match
    ('LogicalOr1', {
        'block': (P.LogicalOr(), {'exception': TypeError, 'error_keywords': ['LogicalOr']}),
        'desc_inputs': [Tensor(np.ones([3, 4]).astype(np.int32)), Tensor(np.ones([3, 4]).astype(np.bool_))],
        'skip': ['backward']}),
    # shape of x and y not match
    ('LogicalOr2', {
        'block': (P.LogicalOr(), {'exception': ValueError, 'error_keywords': ['LogicalOr']}),
        'desc_inputs': [Tensor(np.ones([3, 4]).astype(np.bool_)), Tensor(np.ones([3, 2]).astype(np.bool_))],
        'skip': ['backward']}),

    # input is not tensor
    ('NPUGetFloatStatus0', {
        'block': (P.NPUGetFloatStatus(), {'exception': TypeError, 'error_keywords': ['NPUGetFloatStatus']}),
        'desc_inputs': [5.0],
        'skip': ['backward']}),
    # input is Tensor(int32), not Tensor(float32)
    ('NPUGetFloatStatus1', {
        'block': (P.NPUGetFloatStatus(), {'exception': TypeError, 'error_keywords': ['NPUGetFloatStatus']}),
        'desc_inputs': [Tensor(np.ones([3, 4]).astype(np.int32))],
        'skip': ['backward']}),
    # dims is not 1
    ('NPUGetFloatStatus2', {
        'block': (P.NPUGetFloatStatus(), {'exception': ValueError, 'error_keywords': ['NPUGetFloatStatus']}),
        'desc_inputs': [Tensor(np.ones([3, 4]).astype(np.float32))],
        'skip': ['backward']}),
    # shape[0] is not 8
    ('NPUGetFloatStatus3', {
        'block': (P.NPUGetFloatStatus(), {'exception': ValueError, 'error_keywords': ['NPUGetFloatStatus']}),
Example #15
    # shape of x and y not match
    ('LogicalOr2', {
        'block': (P.LogicalOr(), {
            'exception': ValueError,
            'error_keywords': ['LogicalOr']
        }),
        'desc_inputs': [
            Tensor(np.ones([3, 4]).astype(np.bool_)),
            Tensor(np.ones([3, 2]).astype(np.bool_))
        ],
        'skip': ['backward']
    }),

    # input is not tensor
    ('NPUGetFloatStatus0', {
        'block': (P.NPUGetFloatStatus(), {
            'exception': TypeError,
            'error_keywords': ['NPUGetFloatStatus']
        }),
        'desc_inputs': [5.0],
        'skip': ['backward']
    }),
    # input is Tensor(int32), not Tensor(float32)
    ('NPUGetFloatStatus1', {
        'block': (P.NPUGetFloatStatus(), {
            'exception': TypeError,
            'error_keywords': ['NPUGetFloatStatus']
        }),
        'desc_inputs': [Tensor(np.ones([3, 4]).astype(np.int32))],
        'skip': ['backward']
    }),