Example #1
    def __init__(self, network, optimizer, sens=1.0):
        super(TrainingWrapper, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.network.add_flags(defer_inline=True)
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.sens = sens
        self.reducer_flag = False
        self.grad_reducer = None

        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if self.parallel_mode in [
                ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]:
            self.reducer_flag = True
        if self.reducer_flag:
            mean = context.get_auto_parallel_context("gradients_mean")
            if auto_parallel_context().get_device_num_is_set():
                degree = context.get_auto_parallel_context("device_num")
            else:
                degree = get_group_size()
            self.grad_reducer = nn.DistributedGradReducer(
                optimizer.parameters, mean, degree)

        self.hyper_map = C.HyperMap()
        self.alloc_status = NPUAllocFloatStatus()
        self.get_status = NPUGetFloatStatus()
        self.clear_status = NPUClearFloatStatus()
        self.reduce_sum = ReduceSum(keep_dims=False)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = LessEqual()
        self.allreduce = P.AllReduce()
        self.is_distributed = self.parallel_mode != ParallelMode.STAND_ALONE
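A construct method pairing with this wrapper typically computes the loss, builds a sensitivity tensor, takes the gradients, reduces them, and applies the optimizer. A minimal sketch of that standard pattern, assuming P is mindspore.ops.operations and F is mindspore.ops.functional; the overflow-status ops set up above are omitted here:

    def construct(self, *args):
        weights = self.weights
        loss = self.network(*args)
        # sensitivity tensor with the loss shape/dtype, filled with self.sens
        sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens)
        grads = self.grad(self.network, weights)(*args, sens)
        if self.reducer_flag:
            # all-reduce gradients across devices in data/hybrid parallel mode
            grads = self.grad_reducer(grads)
        # apply the optimizer update while keeping the data dependency on loss
        return F.depend(loss, self.optimizer(grads))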
Example #2
 def __init__(self, G, generator, optimizer, sens=1.0):
     super(TrainOneStepG, self).__init__(auto_prefix=False)
     self.optimizer = optimizer
     self.G = G
     self.G.set_grad()
     self.G.set_train()
     self.G.D_A.set_grad(False)
     self.G.D_A.set_train(False)
     self.G.D_B.set_grad(False)
     self.G.D_B.set_train(False)
     self.grad = ops.GradOperation(get_by_list=True, sens_param=True)
     self.sens = sens
     self.weights = ms.ParameterTuple(generator.trainable_params())
     self.net = WithLossCell(G)
     self.reducer_flag = False
     self.grad_reducer = None
     self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
     if self.parallel_mode in [
             ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
     ]:
         self.reducer_flag = True
     if self.reducer_flag:
         mean = context.get_auto_parallel_context("gradients_mean")
         if auto_parallel_context().get_device_num_is_set():
             degree = context.get_auto_parallel_context("device_num")
         else:
             degree = get_group_size()
         self.grad_reducer = nn.DistributedGradReducer(
             optimizer.parameters, mean, degree)
Example #3
 def __init__(self, network, optimizer, sens=1.0, use_global_norm=False):
     super(TrainingWrapper, self).__init__(auto_prefix=False)
     self.network = network
     self.network.set_grad()
     self.weights = ms.ParameterTuple(network.trainable_params())
     self.optimizer = optimizer
     self.grad = C.GradOperation(get_by_list=True, sens_param=True)
     self.sens = sens
     self.reducer_flag = False
     self.grad_reducer = None
     self.use_global_norm = use_global_norm
     self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
     if self.parallel_mode in [
             ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
     ]:
         self.reducer_flag = True
     if self.reducer_flag:
         mean = context.get_auto_parallel_context("gradients_mean")
         if auto_parallel_context().get_device_num_is_set():
             degree = context.get_auto_parallel_context("device_num")
         else:
             degree = get_group_size()
         self.grad_reducer = nn.DistributedGradReducer(
             optimizer.parameters, mean, degree)
     self.hyper_map = C.HyperMap()
Example #4
 def __init__(self, network, optimizer, scale_update_cell=None):
     super(BertSquadCell, self).__init__(auto_prefix=False)
     self.network = network
     self.weights = ParameterTuple(network.trainable_params())
     self.optimizer = optimizer
     self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
     self.reducer_flag = False
     self.allreduce = P.AllReduce()
     self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
     if self.parallel_mode in [
             ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
     ]:
         self.reducer_flag = True
     self.grad_reducer = None
     if self.reducer_flag:
         mean = context.get_auto_parallel_context("mirror_mean")
         degree = get_group_size()
         self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                    mean, degree)
     self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
     self.cast = P.Cast()
     self.alloc_status = P.NPUAllocFloatStatus()
     self.get_status = P.NPUGetFloatStatus()
     self.clear_before_grad = P.NPUClearFloatStatus()
     self.reduce_sum = P.ReduceSum(keep_dims=False)
     self.depend_parameter_use = P.ControlDepend(depend_mode=1)
     self.base = Tensor(1, mstype.float32)
     self.less_equal = P.LessEqual()
     self.hyper_map = C.HyperMap()
     self.loss_scale = None
     self.loss_scaling_manager = scale_update_cell
     if scale_update_cell:
         self.loss_scale = Parameter(Tensor(
             scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                     name="loss_scale")
Example #5
    def __init__(self, network, optimizer, sens=1.0):
        super(TransformerTrainOneStepCell, self).__init__(auto_prefix=False)
        self.network = network
        self.weights = ParameterTuple(network.trainable_params())
        self.optimizer = optimizer
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.sens = sens
        self.reducer_flag = False
        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if self.parallel_mode not in ParallelMode.MODE_LIST:
            raise ValueError("Unsupported parallel mode: {}".format(
                self.parallel_mode))
        if self.parallel_mode in [
                ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]:
            self.reducer_flag = True
        self.grad_reducer = None
        if self.reducer_flag:
            mean = context.get_auto_parallel_context("gradients_mean")
            degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, degree)

        self.clip_gradients = ClipGradients()
        self.cast = P.Cast()
Example #6
 def __init__(self, network, optimizer, sens=1):
     super(TrainingWrapper, self).__init__(auto_prefix=False)
     self.network = network
     self.depend_network = Depend_network(network)
     # self.weights = ms.ParameterTuple(network.trainable_params())
     self.weights = optimizer.parameters
     self.optimizer = optimizer
     self.grad = C.GradOperation(get_by_list=True, sens_param=True)
     self.sens = sens
     self.reducer_flag = False
     self.grad_reducer = None
     self.print = P.Print()
     self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
     if self.parallel_mode in [
             ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
     ]:
         self.reducer_flag = True
     if self.reducer_flag:
         mean = context.get_auto_parallel_context("gradients_mean")
         degree = get_group_size()
         self.grad_reducer = nn.DistributedGradReducer(
             optimizer.parameters, mean, degree)
Example #7
def _parallel_predict_check():
    """validate parallel model prediction"""
    if _get_parallel_mode() in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL):
        if not context.get_auto_parallel_context("full_batch"):
            raise RuntimeError('Model prediction only supports full batch dataset. Please set "full_batch" to True.')
        if context.get_auto_parallel_context("enable_parallel_optimizer"):
            raise RuntimeError('Model prediction does not support parallel optimizer. Please set '
                               '"enable_parallel_optimizer" to False.')
Example #8
 def __init__(self, network):
     super(PredictWithSigmoid, self).__init__()
     self.network = network
     self.sigmoid = P.Sigmoid()
     parallel_mode = context.get_auto_parallel_context("parallel_mode")
     full_batch = context.get_auto_parallel_context("full_batch")
     is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL,
                                          ParallelMode.AUTO_PARALLEL)
     if is_auto_parallel and full_batch:
         self.sigmoid.shard(((1, 1), ))
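A minimal construct sketch for this prediction cell, assuming the wrapped network's output is the logits tensor that the sharded Sigmoid consumes:

 def construct(self, *inputs):
     logits = self.network(*inputs)
     return self.sigmoid(logits)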
Example #9
    def __init__(self, network, config, sens=1000.0):
        super(TrainStepWrap, self).__init__()
        self.network = network
        self.network.set_train()
        self.trainable_params = network.trainable_params()
        weights_w = []
        weights_d = []
        for params in self.trainable_params:
            if 'wide' in params.name:
                weights_w.append(params)
            else:
                weights_d.append(params)

        self.weights_w = ParameterTuple(weights_w)
        self.weights_d = ParameterTuple(weights_d)
        self.optimizer_w = FTRL(learning_rate=config.ftrl_lr,
                                params=self.weights_w,
                                l1=5e-4,
                                l2=5e-4,
                                initial_accum=0.1,
                                loss_scale=sens)

        self.optimizer_d = Adam(self.weights_d,
                                learning_rate=config.adam_lr,
                                eps=1e-6,
                                loss_scale=sens)

        self.hyper_map = C.HyperMap()

        self.grad_w = C.GradOperation('grad_w',
                                      get_by_list=True,
                                      sens_param=True)
        self.grad_d = C.GradOperation('grad_d',
                                      get_by_list=True,
                                      sens_param=True)

        self.sens = sens
        self.loss_net_w = IthOutputCell(network, output_index=0)
        self.loss_net_d = IthOutputCell(network, output_index=1)

        self.reducer_flag = False
        self.grad_reducer_w = None
        self.grad_reducer_d = None
        parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if parallel_mode in (ParallelMode.DATA_PARALLEL,
                             ParallelMode.HYBRID_PARALLEL):
            self.reducer_flag = True
        if self.reducer_flag:
            mean = context.get_auto_parallel_context("mirror_mean")
            degree = context.get_auto_parallel_context("device_num")
            self.grad_reducer_w = DistributedGradReducer(
                self.optimizer_w.parameters, mean, degree)
            self.grad_reducer_d = DistributedGradReducer(
                self.optimizer_d.parameters, mean, degree)
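The matching construct runs one backward pass per parameter group and applies each optimizer to its own gradients. A hedged sketch of that dual-optimizer pattern, with illustrative input names and assuming P/F are mindspore.ops.operations/functional:

    def construct(self, batch_ids, batch_wts, label):
        loss_w = self.loss_net_w(batch_ids, batch_wts, label)
        loss_d = self.loss_net_d(batch_ids, batch_wts, label)
        sens_w = P.Fill()(P.DType()(loss_w), P.Shape()(loss_w), self.sens)
        sens_d = P.Fill()(P.DType()(loss_d), P.Shape()(loss_d), self.sens)
        grads_w = self.grad_w(self.loss_net_w, self.weights_w)(
            batch_ids, batch_wts, label, sens_w)
        grads_d = self.grad_d(self.loss_net_d, self.weights_d)(
            batch_ids, batch_wts, label, sens_d)
        if self.reducer_flag:
            grads_w = self.grad_reducer_w(grads_w)
            grads_d = self.grad_reducer_d(grads_d)
        return (F.depend(loss_w, self.optimizer_w(grads_w)),
                F.depend(loss_d, self.optimizer_d(grads_d)))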
Example #10
    def __init__(self, network, sens=1024.0, host_device_mix=False, parameter_server=False, sparse=False):
        super(TrainStepWrap, self).__init__()
        parallel_mode = context.get_auto_parallel_context("parallel_mode")
        is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL)
        self.network = network
        self.network.set_train()
        self.trainable_params = network.trainable_params()
        weights_w = []
        weights_d = []
        for params in self.trainable_params:
            if 'wide' in params.name:
                weights_w.append(params)
            else:
                weights_d.append(params)
        self.weights_w = ParameterTuple(weights_w)
        self.weights_d = ParameterTuple(weights_d)

        if (sparse and is_auto_parallel) or parameter_server:
            self.optimizer_d = LazyAdam(
                self.weights_d, learning_rate=3.5e-4, eps=1e-8, loss_scale=sens)
            self.optimizer_w = FTRL(learning_rate=5e-2, params=self.weights_w,
                                    l1=1e-8, l2=1e-8, initial_accum=1.0, loss_scale=sens)
            if host_device_mix or parameter_server:
                self.optimizer_w.target = "CPU"
                self.optimizer_d.target = "CPU"
        else:
            self.optimizer_d = Adam(
                self.weights_d, learning_rate=3.5e-4, eps=1e-8, loss_scale=sens)
            self.optimizer_w = FTRL(learning_rate=5e-2, params=self.weights_w,
                                    l1=1e-8, l2=1e-8, initial_accum=1.0, loss_scale=sens)
        self.hyper_map = C.HyperMap()
        self.grad_w = C.GradOperation(get_by_list=True,
                                      sens_param=True)
        self.grad_d = C.GradOperation(get_by_list=True,
                                      sens_param=True)
        self.sens = sens
        self.loss_net_w = IthOutputCell(network, output_index=0)
        self.loss_net_d = IthOutputCell(network, output_index=1)
        self.loss_net_w.set_grad()
        self.loss_net_d.set_grad()

        self.reducer_flag = False
        self.grad_reducer_w = None
        self.grad_reducer_d = None
        self.reducer_flag = parallel_mode in (ParallelMode.DATA_PARALLEL,
                                              ParallelMode.HYBRID_PARALLEL)
        if self.reducer_flag:
            mean = context.get_auto_parallel_context("gradients_mean")
            degree = context.get_auto_parallel_context("device_num")
            self.grad_reducer_w = DistributedGradReducer(self.optimizer_w.parameters, mean, degree)
            self.grad_reducer_d = DistributedGradReducer(self.optimizer_d.parameters, mean, degree)
Example #11
 def __init__(self, network, optimizer, scale_update_cell=None):
     super(BertTrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
     self.network = network
     self.network.set_grad()
     self.weights = optimizer.parameters
     self.optimizer = optimizer
     self.grad = C.GradOperation(get_by_list=True,
                                 sens_param=True)
     self.reducer_flag = False
     self.allreduce = P.AllReduce()
     self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
     if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
         self.reducer_flag = True
     self.grad_reducer = F.identity
     self.degree = 1
     if self.reducer_flag:
         self.degree = get_group_size()
         self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
     self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
     self.cast = P.Cast()
     self.alloc_status = P.NPUAllocFloatStatus()
     self.get_status = P.NPUGetFloatStatus()
     self.clear_status = P.NPUClearFloatStatus()
     self.reduce_sum = P.ReduceSum(keep_dims=False)
     self.base = Tensor(1, mstype.float32)
     self.less_equal = P.LessEqual()
     self.hyper_map = C.HyperMap()
     self.loss_scale = None
     self.loss_scaling_manager = scale_update_cell
     if scale_update_cell:
         self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32))
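The NPU float-status ops above implement overflow detection for the dynamic loss scale. A condensed sketch of how the matching construct typically wires them together; the argument names are illustrative, the real BERT cell threads many more inputs, and the unscaling of gradients by scaling_sens is omitted:

 def construct(self, input_ids, labels, sens=None):
     loss = self.network(input_ids, labels)
     if sens is None:
         scaling_sens = self.loss_scale
     else:
         scaling_sens = sens
     # clear the float status register right before taking gradients
     init = self.alloc_status()
     init = F.depend(init, loss)
     clear_status = self.clear_status(init)
     scaling_sens = F.depend(scaling_sens, clear_status)
     grads = self.grad(self.network, self.weights)(
         input_ids, labels, self.cast(scaling_sens, mstype.float32))
     grads = self.grad_reducer(grads)
     # read the status register back to detect inf/nan during backprop
     init = F.depend(init, grads)
     get_status = self.get_status(init)
     init = F.depend(init, get_status)
     flag_sum = self.reduce_sum(init, (0,))
     if self.is_distributed:
         flag_sum = self.allreduce(flag_sum)
     cond = self.less_equal(self.base, flag_sum)
     overflow = cond
     if self.loss_scaling_manager is not None:
         overflow = self.loss_scaling_manager(scaling_sens, cond)
     if not overflow:
         self.optimizer(grads)
     return (loss, cond, scaling_sens)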
Example #12
    def _gpu_analyse(self):
        """Collect and analyse gpu performance data"""
        if context.get_auto_parallel_context(
                'device_num') > 1 and self._dev_id != get_rank():
            self._dev_id = get_rank()
            logger.error(
                'The Profiler object must be initialized after '
                'set_auto_parallel_context() and init(). Please check where '
                'the Profiler is created.')
        self._gpu_profiler.stop()
        timeline_generator = self._generate_timeline()

        # parse minddata pipeline operator and queue for GPU
        try:
            pipeline_parser = MinddataPipelineParser(self._output_path,
                                                     self._dev_id,
                                                     self._output_path)
            pipeline_parser.parse()
        except ProfilerException as err:
            logger.warning(err.message)

        # analyse step trace info
        try:
            self._analyse_step_trace(
                is_training_mode_flag=timeline_generator.check_op_name('Gradients'))
        except ProfilerException as err:
            logger.warning(err.message)

        os.environ['PROFILING_MODE'] = str("false")

        logger.warning(
            '\nMemory Usage is not supported on GPU currently.\n'
            'Please run on Ascend if you would like to see memory analysis; '
            'otherwise, this warning can be ignored.')
Example #13
 def __init__(self, network, optimizer, sens=1.0):
     super(TrainOneStepCell, self).__init__(auto_prefix=False)
     self.network = network
     self.network.set_grad()
     self.weights = optimizer.parameters
     self.optimizer = optimizer
     self.grad = C.GradOperation(get_by_list=True, sens_param=True)
     self.sens = sens
     self.reducer_flag = False
     self.grad_reducer = F.identity
     self.parallel_mode = _get_parallel_mode()
     if self.parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL):
         self.reducer_flag = True
     if self.reducer_flag:
         self.mean = _get_gradients_mean()
         self.degree = _get_device_num()
         self.grad_reducer = DistributedGradReducer(self.weights, self.mean, self.degree)
     self.use_grad_accumulation = False
     if self.parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.STAND_ALONE):
         self.use_grad_accumulation = True
     if self.use_grad_accumulation:
         self.max_accumulation_step = get_auto_parallel_context("grad_accumulation_step")
         if self.max_accumulation_step <= 1:
             self.max_accumulation_step = 1
             self.use_grad_accumulation = False
     if self.use_grad_accumulation:
         self.grad_accumulation = GradientAccumulation(self.max_accumulation_step, self.optimizer)
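A hedged wiring sketch for this cell; the backbone, loss function, optimizer, and data below are illustrative placeholders:

    net_with_loss = nn.WithLossCell(backbone, nn.SoftmaxCrossEntropyWithLogits(sparse=True))
    opt = nn.Momentum(backbone.trainable_params(), learning_rate=0.01, momentum=0.9)
    train_cell = TrainOneStepCell(net_with_loss, opt, sens=1.0)
    train_cell.set_train()
    loss = train_cell(data, label)  # one forward/backward/update step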
Example #14
    def __init__(self, **kwargs):
        # get device_id and device_target
        self._get_devid_and_devtarget()
        output_path = kwargs.pop("output_path", "./data")
        self._output_path = validate_and_normalize_path(output_path)
        self._output_path = os.path.join(self._output_path, "profiler")
        if not os.path.exists(self._output_path):
            os.makedirs(self._output_path, exist_ok=True)
            os.chmod(self._output_path,
                     stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
        else:
            logger.warning(
                "The target dir already exists. "
                "There may be some old profiling data, and it will be overwritten.")

        os.environ['PROFILING_MODE'] = 'true'
        os.environ['MINDDATA_PROFILING_DIR'] = self._output_path

        if self._device_target and self._device_target == "GPU":
            from mindspore._c_expression import GPUProfiler
            self._gpu_profiler = GPUProfiler.get_instance()
            self._gpu_profiler.init(self._output_path)
            self._gpu_profiler.step_profiling_enable(True)
            if context.get_auto_parallel_context('device_num') > 1:
                self._dev_id = get_rank()
            os.environ['DEVICE_ID'] = str(self._dev_id)

            if kwargs:
                logger.warning("Params not be supported yet on GPU.")
        elif self._device_target and self._device_target == "Ascend":
            optypes_not_deal = kwargs.pop("optypes_not_deal", "Variable")
            if not isinstance(optypes_not_deal, str):
                raise TypeError("The parameter optypes_not_deal must be str.")
            job_id = kwargs.pop("ascend_job_id", "")
            if kwargs:
                logger.warning("There are invalid params which don't work.")

            os.environ['DEVICE_ID'] = self._dev_id
            os.environ['AICPU_PROFILING_MODE'] = 'true'

            # use the context interface to enable profiling, for the new MindSpore version (after 2020.5.21)
            context.set_context(enable_profiling=True,
                                profiling_options="training_trace:task_trace")

            self._container_path = os.path.join(
                self._base_profiling_container_path, self._dev_id)
            data_path = os.path.join(self._container_path, "data")
            data_path = validate_and_normalize_path(data_path)
            if not os.path.exists(data_path):
                os.makedirs(data_path, exist_ok=True)

            self._filt_optype_names = optypes_not_deal.split(
                ",") if optypes_not_deal else []
            self._profiling_job_id = job_id
            # add job id env through user input later
            self._job_id_env = 0
            self._start_time = int(time.time() * 10000000)
            logger.info("Profiling: profiling start time: %d",
                        self._start_time)
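A hedged usage sketch for the profiler this __init__ sets up, following the public MindSpore Profiler API; the training call is a placeholder:

    from mindspore.profiler import Profiler

    profiler = Profiler(output_path='./data')
    model.train(epoch_size, dataset)  # the workload to be profiled
    profiler.analyse()  # dispatches to e.g. _gpu_analyse() on GPU targets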
Example #15
 def __init__(self, parameters, mean=True, degree=None):
     super(DistributedGradReducer, self).__init__(auto_prefix=False)
     self.map_ = C.Map()
     if degree is None:
         self.degree = get_group_size()
     else:
         if not isinstance(degree, int) or degree <= 0:
              raise ValueError(
                  "Parameter 'degree' in DistributedGradReducer must be a positive int.")
         self.degree = degree
     self.mean = mean
     self.allreduce_filter = tuple(x.layerwise_parallel is False
                                   for x in parameters)
     is_parallel_optimizer = context.get_auto_parallel_context(
         "enable_parallel_optimizer")
      split_indices = auto_parallel_context().get_all_reduce_fusion_split_indices()
     if is_parallel_optimizer and split_indices:
         self.split_fusion = True
         self.op_list = _init_allreduce_operators(len(parameters),
                                                  split_indices)
     else:
         self.split_fusion = False
         self.allreduce = AllReduce().add_prim_attr('fusion', 1)
     self.allgather = AllGather(GlobalComm.WORLD_COMM_GROUP)
     ps_filter = lambda x: x.is_param_ps
     self.ps_parameters = tuple(ps_filter(x) for x in parameters)
     self.enable_parameter_server = any(self.ps_parameters)
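A usage sketch for the reducer, assuming the communication backend has been initialized:

    from mindspore.communication.management import init, get_group_size

    init()  # must run before querying group size or building comm ops
    degree = get_group_size()
    grad_reducer = DistributedGradReducer(optimizer.parameters, mean=True, degree=degree)
    # inside a training cell's construct, after computing grads:
    #     grads = grad_reducer(grads)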
Example #16
    def step_end(self, run_context):
        """Monitor the loss in training."""
        cb_params = run_context.original_args()
        wide_loss = cb_params.net_outputs[0].asnumpy()
        deep_loss = cb_params.net_outputs[1].asnumpy()
        cur_step_in_epoch = (cb_params.cur_step_num -
                             1) % cb_params.batch_num + 1
        cur_num = cb_params.cur_step_num
        rank_id = 0
        parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL,
                             ParallelMode.AUTO_PARALLEL,
                             ParallelMode.DATA_PARALLEL):
            rank_id = get_rank()

        print("===loss===",
              rank_id,
              cb_params.cur_epoch_num,
              cur_step_in_epoch,
              wide_loss,
              deep_loss,
              flush=True)

        if self._per_print_times != 0 and cur_num % self._per_print_times == 0 and self.config is not None:
            loss_file = open(self.config.loss_file_name, "a+")
            loss_file.write(
                "epoch: %s, step: %s, wide_loss: %s, deep_loss: %s" %
                (cb_params.cur_epoch_num, cur_step_in_epoch, wide_loss,
                 deep_loss))
            loss_file.write("\n")
            loss_file.close()
            print("epoch: %s, step: %s, wide_loss: %s, deep_loss: %s" %
                  (cb_params.cur_epoch_num, cur_step_in_epoch, wide_loss,
                   deep_loss))
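A hedged wiring sketch; the enclosing callback class name and its constructor arguments are assumptions, since only step_end is shown:

    loss_cb = LossCallBack(config=cfg, per_print_times=100)
    model.train(epoch_size, train_dataset, callbacks=[loss_cb])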
Example #17
def _set_bert_all_reduce_split():
    """set bert all_reduce fusion split, support num_hidden_layers is 12 and 24."""
    device_target = context.get_context('device_target')
    enable_graph_kernel = context.get_context('enable_graph_kernel')
    device_num = context.get_auto_parallel_context('device_num')
    if bert_net_cfg.num_hidden_layers == 12:
        if bert_net_cfg.use_relative_positions:
            context.set_auto_parallel_context(
                all_reduce_fusion_config=[29, 58, 87, 116, 145, 174, 203, 217])
        else:
            context.set_auto_parallel_context(
                all_reduce_fusion_config=[28, 55, 82, 109, 136, 163, 190, 205])
            if device_target == 'GPU' and enable_graph_kernel and device_num == 8:
                context.set_auto_parallel_context(
                    all_reduce_fusion_config=[180, 205])
            elif device_target == 'GPU' and enable_graph_kernel and device_num == 16:
                context.set_auto_parallel_context(
                    all_reduce_fusion_config=[120, 205])
    elif bert_net_cfg.num_hidden_layers == 24:
        if bert_net_cfg.use_relative_positions:
            context.set_auto_parallel_context(all_reduce_fusion_config=[
                30, 90, 150, 210, 270, 330, 390, 421
            ])
        else:
            context.set_auto_parallel_context(all_reduce_fusion_config=[
                38, 93, 148, 203, 258, 313, 368, 397
            ])
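A hedged call-site sketch: the fusion split only takes effect if the parallel context is configured first (the device_num value is illustrative):

    context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                      gradients_mean=True, device_num=8)
    _set_bert_all_reduce_split()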
Example #18
 def __init__(self, network, optimizer, sens=1):
     super(CenterNetWithLossScaleCell, self).__init__(auto_prefix=False)
     self.image = ImagePreProcess()
     self.network = network
     self.network.set_grad()
     self.weights = optimizer.parameters
     self.optimizer = optimizer
     self.grad = ops.GradOperation(get_by_list=True, sens_param=True)
     self.reducer_flag = False
     self.allreduce = ops.AllReduce()
     self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
     if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
         self.reducer_flag = True
     self.grad_reducer = ops.identity
     self.degree = 1
     if self.reducer_flag:
         self.degree = get_group_size()
         self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
     self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
     self.cast = ops.Cast()
     self.alloc_status = ops.NPUAllocFloatStatus()
     self.get_status = ops.NPUGetFloatStatus()
     self.clear_before_grad = ops.NPUClearFloatStatus()
     self.reduce_sum = ops.ReduceSum(keep_dims=False)
     self.base = Tensor(1, mstype.float32)
     self.less_equal = ops.LessEqual()
     self.grad_scale = GradScale()
     self.loss_scale = sens
Example #19
 def _use_parallel_optimizer(self):
     """Indicates whether to use automatic parallelism."""
     if context.get_auto_parallel_context("enable_parallel_optimizer"):
         if _get_parallel_mode() == ParallelMode.DATA_PARALLEL and context.get_context("device_target") == "Ascend":
             self.use_parallel = True
         elif _get_parallel_mode() == ParallelMode.DATA_PARALLEL \
                 and context.get_context("device_target") != "Ascend":
             raise RuntimeError("Parallel optimizer only supports Ascend in data parallel mode.")
         elif _get_parallel_mode() in (ParallelMode.STAND_ALONE, ParallelMode.HYBRID_PARALLEL):
             raise RuntimeError("Parallel optimizer is not supported in {}.".format(_get_parallel_mode()))
         else:
             self.use_parallel = False
     else:
         self.use_parallel = False
     if self.use_parallel:
         if self.cls_name not in ["Lamb", "AdamWeightDecay"]:
             raise RuntimeError("Parallel optimizer does not support optimizer {}".format(self.cls_name))
         self.dev_num = _get_device_num()
         if self.dev_num > self.param_length:
             raise RuntimeError("Parallel optimizer can not be applied when the number of parameters {} is"
                                " less than the number of devices {}".format(self.param_length, self.dev_num))
         self.param_rank = self._get_parameter_group_id()
         self.optim_filter = tuple(map(lambda x: x == _get_global_rank(), self.param_rank))
         self.param_names = []
         for param in self.parameters:
             self.param_names.append(param.name)
     else:
         self.optim_filter = (True,) * self.param_length
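A hedged sketch of a configuration under which this method sets use_parallel to True; the optimizer must also be Lamb or AdamWeightDecay, with at least as many parameters as devices:

    context.set_context(device_target="Ascend")
    context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                      enable_parallel_optimizer=True)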
Example #20
 def __init__(self, network, optimizer, sens=1.0):
     super(TrainAccuStepsCell, self).__init__(network, optimizer, sens)
     self.accumulation = False
     self.accumulation_steps = context.get_auto_parallel_context(
         "grad_accumulation_step")
     self.accu_grads = self.weights.clone(prefix="accu_grads", init='zeros')
     self.hyper_map = ops.HyperMap()
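A hedged wiring sketch: the accumulation step count is read from the auto-parallel context, so it must be set before the cell is built; net_with_loss and opt are placeholders:

    context.set_auto_parallel_context(grad_accumulation_step=4)
    train_cell = TrainAccuStepsCell(net_with_loss, opt, sens=1.0)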
Example #21
def _init_allreduce_operators(length):
    """ initialize allreduce communication operators"""
    is_parallel_optimizer = context.get_auto_parallel_context(
        "enable_parallel_optimizer")
    split_indices = auto_parallel_context().get_all_reduce_fusion_split_indices()
    if is_parallel_optimizer and split_indices:
        group = 1
        fusion = ()
        for i in range(length):
            fusion = fusion + (group, )
            if split_indices[group - 1] <= i + 1:
                if group >= len(split_indices):
                    continue
                group = group + 1
        index = tuple(range(1, length + 1))
    else:
        fusion = (1, ) * length
        index = (0, ) * length
    opt_list = ()
    for i in range(length):
        opt = AllReduce('sum', GlobalComm.WORLD_COMM_GROUP)
        opt.add_prim_attr('fusion', fusion[i])
        opt.add_prim_attr('index', index[i])
        opt_list = opt_list + (opt, )
    return opt_list
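A worked sketch of the fusion grouping, derived by hand from the loop above; the context setup is an assumption:

    context.set_auto_parallel_context(enable_parallel_optimizer=True,
                                      all_reduce_fusion_config=[2, 4])
    op_list = _init_allreduce_operators(6)
    # fusion attrs become (1, 1, 2, 2, 2, 2) and index attrs (1, 2, 3, 4, 5, 6);
    # the group counter stops at len(split_indices), so trailing parameters
    # all share the last fusion group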
Example #22
    def epoch_end(self, run_context):
        """
        epoch end
        """
        self.aucMetric.clear()
        parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL,
                             ParallelMode.AUTO_PARALLEL):
            context.set_auto_parallel_context(
                strategy_ckpt_save_file="",
                strategy_ckpt_load_file="./strategy_train.ckpt")
        rank_id = 0
        if parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL,
                             ParallelMode.AUTO_PARALLEL,
                             ParallelMode.DATA_PARALLEL):
            rank_id = get_rank()
        start_time = time.time()
        out = self.model.eval(self.eval_dataset,
                              dataset_sink_mode=(not self.host_device_mix))
        end_time = time.time()
        eval_time = int(end_time - start_time)

        time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        out_str = "{} == Rank: {} == EvalCallBack model.eval(): {}; eval_time: {}s".\
            format(time_str, rank_id, out.values(), eval_time)
        print(out_str)
        self.eval_values = out.values()
        add_write(self.eval_file_name, out_str)
Example #23
def get_event_file_name(prefix, suffix, time_second):
    """
    Create file name: file_prefix + EVENT_FILE_NAME_MARK + time(seconds) + "." + Hostname + file_suffix.

    Args:
        prefix (str): The prefix of file name.
        suffix (str): The suffix of file name.
        time_second (str): The time stamp of file name.

    Returns:
        String, the name of event log file.
    """
    Validator.check_str_by_regular(prefix)
    Validator.check_str_by_regular(suffix)
    file_name = ""
    hostname = platform.node()

    device_num = context.get_auto_parallel_context('device_num')
    device_id = context.get_context('device_id')
    if device_num > 1 or GlobalComm.WORLD_COMM_GROUP == 'nccl_world_group':
        # Notice:
        # In GPU distributed training, get_context('device_id') does not work,
        # so we use get_rank() instead of get_context().
        device_id = get_rank()

    file_name = f'{file_name}{EVENT_FILE_NAME_MARK}{time_second}.{device_id}.{hostname}'

    if prefix is not None:
        file_name = prefix + file_name

    if suffix is not None:
        file_name = file_name + suffix

    return file_name
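A hedged usage sketch; EVENT_FILE_NAME_MARK, the device id, and the hostname fill in the middle of the name:

    name = get_event_file_name('events.', '.summary', '1596082114')
    # -> 'events.' + EVENT_FILE_NAME_MARK + '1596082114.<device_id>.<hostname>' + '.summary'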
Example #24
    def __init__(self, network, optimizer, sens=1.0):
        super(BertTrainCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.sens = sens
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.clip_type = gradient_cfg.clip_type
        self.clip_value = gradient_cfg.clip_value
        self.reducer_flag = False
        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if self.parallel_mode in [
                ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]:
            self.reducer_flag = True
        self.grad_reducer = F.identity
        self.degree = 1
        if self.reducer_flag:
            mean = context.get_auto_parallel_context("gradients_mean")
            self.degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, self.degree)
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
        self.cast = P.Cast()
        self.hyper_map = C.HyperMap()

        self.saved_params = self.weights.clone(prefix='saved')
        self.length = len(self.weights)
        self.quant_embedding_list = []
        self.quant_weight_list = []
        for i, key in enumerate(self.saved_params):
            if 'embedding_lookup' in key.name and 'min' not in key.name and 'max' not in key.name:
                self.quant_embedding_list.append(i)
            elif 'weight' in key.name and 'dense_1' not in key.name:
                self.quant_weight_list.append(i)
        self.quant_embedding_list_length = len(self.quant_embedding_list)
        self.quant_weight_list_length = len(self.quant_weight_list)

        self.quantize_embedding = QuantizeWeightCell(
            num_bits=network.embedding_bits,
            compute_type=network.compute_type,
            clip_value=network.weight_clip_value)
        self.quantize_weight = QuantizeWeightCell(
            num_bits=network.weight_bits,
            compute_type=network.compute_type,
            clip_value=network.weight_clip_value)
Example #25
    def __init__(self,
                 network,
                 optimizer,
                 scale_update_cell=None,
                 accumulation_steps=1,
                 enable_global_norm=False):
        super(BertTrainAccumulateStepsWithLossScaleCell,
              self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.accumulation_steps = accumulation_steps
        self.enable_global_norm = enable_global_norm
        self.one = Tensor(np.array([1]).astype(np.int32))
        self.zero = Tensor(np.array([0]).astype(np.int32))
        self.local_step = Parameter(initializer(0, [1], mstype.int32),
                                    name="local_step")
        self.accu_grads = self.weights.clone(prefix="accu_grads", init='zeros')
        self.accu_overflow = Parameter(initializer(0, [1], mstype.int32),
                                       name="accu_overflow")
        self.loss = Parameter(initializer(0, [1], mstype.float32),
                              name="accu_loss")

        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.reducer_flag = False
        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if self.parallel_mode in [
                ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]:
            self.reducer_flag = True
        self.grad_reducer = F.identity
        self.degree = 1
        if self.reducer_flag:
            self.degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       False, self.degree)
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
        self.overflow_reducer = F.identity
        if self.is_distributed:
            self.overflow_reducer = P.AllReduce()
        self.cast = P.Cast()
        self.alloc_status = P.NPUAllocFloatStatus()
        self.get_status = P.NPUGetFloatStatus()
        self.clear_before_grad = P.NPUClearFloatStatus()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = P.LessEqual()
        self.logical_or = P.LogicalOr()
        self.not_equal = P.NotEqual()
        self.select = P.Select()
        self.reshape = P.Reshape()
        self.hyper_map = C.HyperMap()
        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(Tensor(
                scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                        name="loss_scale")
Example #26
    def __init__(self, network, optimizer, scale_update_cell=None):
        super(BertTrainWithLossScaleCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.reducer_flag = False
        self.allreduce = P.AllReduce()
        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if self.parallel_mode in [
                ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]:
            self.reducer_flag = True
        self.grad_reducer = F.identity
        self.degree = 1
        if self.reducer_flag:
            self.degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       False, self.degree)
        self.clip_type = gradient_cfg.clip_type
        self.clip_value = gradient_cfg.clip_value
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
        self.cast = P.Cast()
        self.alloc_status = P.NPUAllocFloatStatus()
        self.get_status = P.NPUGetFloatStatus()
        self.clear_before_grad = P.NPUClearFloatStatus()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.depend_parameter_use = P.ControlDepend(depend_mode=1)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = P.LessEqual()
        self.hyper_map = C.HyperMap()
        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(
                Tensor(scale_update_cell.get_loss_scale(),
                       dtype=mstype.float32))

        self.saved_params = self.weights.clone(prefix='saved')
        self.length = len(self.weights)
        self.quant_embedding_list = []
        self.quant_weight_list = []
        for i, key in enumerate(self.saved_params):
            if 'embedding_lookup' in key.name:
                self.quant_embedding_list.append(i)
            elif 'weight' in key.name and 'dense_1' not in key.name:
                self.quant_weight_list.append(i)
        self.quant_embedding_list_length = len(self.quant_embedding_list)
        self.quant_weight_list_length = len(self.quant_weight_list)

        self.quantize_embedding = QuantizeWeightCell(
            num_bits=network.embedding_bits,
            compute_type=network.compute_type,
            clip_value=network.weight_clip_value)
        self.quantize_weight = QuantizeWeightCell(
            num_bits=network.weight_bits,
            compute_type=network.compute_type,
            clip_value=network.weight_clip_value)
Example #27
    def __init__(self, network, optimizer, scale_sense):
        super(DFCNNCTCTrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
        self.network = network
        self.optimizer = optimizer

        if isinstance(scale_sense, nn.Cell):
            self.loss_scaling_manager = scale_sense
            self.scale_sense = Parameter(Tensor(scale_sense.get_loss_scale(),
                                                dtype=mstype.float32), name="scale_sense")
        elif isinstance(scale_sense, Tensor):
            if scale_sense.shape == (1,) or scale_sense.shape == ():
                self.scale_sense = Parameter(scale_sense, name='scale_sense')
            else:
                raise ValueError("The shape of scale_sense must be (1,) or (), but got {}".format(
                    scale_sense.shape))
        else:
            raise TypeError("The scale_sense must be Cell or Tensor, but got {}".format(
                type(scale_sense)))

        self.network.set_grad()
        self.weights = ParameterTuple(network.trainable_params())

        self.grad = C.GradOperation(get_by_list=True,
                                    sens_param=True)

        self.reducer_flag = False
        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if self.parallel_mode not in ParallelMode.MODE_LIST:
            raise ValueError("Parallel mode does not support: ", self.parallel_mode)
        if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
            self.reducer_flag = True
        self.grad_reducer = None
        if self.reducer_flag:
            mean = context.get_auto_parallel_context("gradients_mean")
            degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)

        self.clip_gradients = ClipGradients()
        self.cast = P.Cast()
        self.addn = P.AddN()
        self.reshape = P.Reshape()
        self.hyper_map = C.HyperMap()
        self.less_equal = P.LessEqual()
        self.allreduce = P.AllReduce()
Example #28
def test_reset_auto_parallel_context():
    context.reset_auto_parallel_context()
    device_num = context.get_auto_parallel_context("device_num")
    global_rank = context.get_auto_parallel_context("global_rank")
    gradients_mean = context.get_auto_parallel_context("gradients_mean")
    gradient_fp32_sync = context.get_auto_parallel_context(
        "gradient_fp32_sync")
    parallel_mode = context.get_auto_parallel_context("parallel_mode")
    parameter_broadcast = context.get_auto_parallel_context(
        "parameter_broadcast")
    device_num_is_set = auto_parallel_context().get_device_num_is_set()
    parameter_broadcast_is_set = auto_parallel_context().get_parameter_broadcast_is_set()
    stage = auto_parallel_context().get_pipeline_stages()
    communi_parallel_mode = context.get_auto_parallel_context(
        "communi_parallel_mode")

    assert device_num == 1
    assert global_rank == 0
    assert not gradients_mean
    assert gradient_fp32_sync
    assert parallel_mode == "stand_alone"
    assert not parameter_broadcast
    assert not device_num_is_set
    assert not parameter_broadcast_is_set
    assert stage == 1
    assert communi_parallel_mode == "all_group_parallel"
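A hedged companion sketch showing the setter side that this reset undoes; the values are illustrative:

    def test_set_auto_parallel_context():
        context.set_auto_parallel_context(device_num=4, global_rank=3,
                                          gradients_mean=True)
        assert context.get_auto_parallel_context("device_num") == 4
        assert context.get_auto_parallel_context("global_rank") == 3
        assert context.get_auto_parallel_context("gradients_mean")
        context.reset_auto_parallel_context()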
Example #29
    def __init__(self, network, optimizer, sens=1.0):
        super(BertTrainOneStepCell, self).__init__(auto_prefix=False)
        self.network = network
        self.weights = ParameterTuple(network.trainable_params())
        self.optimizer = optimizer
        self.grad = ops.GradOperation(get_by_list=True, sens_param=True)
        self.sens = sens
        self.reducer_flag = False
        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
            self.reducer_flag = True
        self.grad_reducer = None
        if self.reducer_flag:
            mean = context.get_auto_parallel_context("mirror_mean")
            degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)

        self.cast = ops.Cast()
        self.hyper_map = ops.HyperMap()
Example #30
    def __init__(self, network, optimizer, scale_update_cell=None):
        super(BertFinetuneCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.reducer_flag = False
        self.allreduce = P.AllReduce()
        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if self.parallel_mode in [
                ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]:
            self.reducer_flag = True
        self.grad_reducer = None
        if self.reducer_flag:
            mean = context.get_auto_parallel_context("gradients_mean")
            degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, degree)
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
        self.cast = P.Cast()
        self.gpu_target = False
        if context.get_context("device_target") == "GPU":
            self.gpu_target = True
            self.float_status = P.FloatStatus()
            self.addn = P.AddN()
            self.reshape = P.Reshape()
        else:
            self.alloc_status = P.NPUAllocFloatStatus()
            self.get_status = P.NPUGetFloatStatus()
            self.clear_status = P.NPUClearFloatStatus()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = P.LessEqual()
        self.hyper_map = C.HyperMap()
        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(
                Tensor(scale_update_cell.get_loss_scale(),
                       dtype=mstype.float32))