def train(self, epoch, train_dataset, callbacks=None, dataset_sink_mode=True, sink_size=-1):
    """
    Training API where the iteration is controlled by python front-end.

    When setting pynative mode, the training process will be performed with dataset not sink.

    Note:
        CPU is not supported when dataset_sink_mode is True.
        If dataset_sink_mode is True, the epoch of training should be equal to the count of repeat
        operations in dataset processing. Otherwise, errors could occur since the amount of data
        is not what the training requires.
        If dataset_sink_mode is True, data will be sent to the device. If the device is Ascend,
        features of data will be transferred one by one. The limit of data transmission per time
        is 256M.

    Args:
        epoch (int): Total number of iterations on the data.
        train_dataset (Dataset): A training dataset iterator. If there is no loss_fn, a tuple with
            multiple data items (data1, data2, data3, ...) should be returned and passed to the
            network. Otherwise, a tuple (data, label) should be returned. The data and label would
            be passed to the network and loss function respectively.
        callbacks (list): List of callback objects which should be executed while training.
            Default: None.
        dataset_sink_mode (bool): Determines whether to pass the data through the dataset channel.
            Default: True. In pynative mode, the training process will be performed with dataset
            not sink.
        sink_size (int): Controls the amount of data in each sink.
            If sink_size=-1, sink the complete dataset for each epoch.
            If sink_size>0, sink sink_size data for each epoch.
            If dataset_sink_mode is False, sink_size is ignored. Default: -1.

    Examples:
        >>> dataset = get_dataset()
        >>> net = Net()
        >>> loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
        >>> loss_scale_manager = FixedLossScaleManager()
        >>> optim = Momentum(params=net.trainable_params(), learning_rate=0.1, momentum=0.9)
        >>> model = Model(net, loss_fn=loss, optimizer=optim, metrics=None,
        ...               loss_scale_manager=loss_scale_manager)
        >>> model.train(2, dataset)
    """
    check_bool(dataset_sink_mode)
    check_int(sink_size)
    if sink_size < -1 or sink_size == 0:
        raise ValueError("The sink_size must be -1 or positive, but got sink_size {}.".format(sink_size))

    _device_number_check(self._parallel_mode, self._device_number)
    _parameter_broadcast_check(self._parallel_mode, self._parameter_broadcast)

    self._train(epoch,
                train_dataset,
                callbacks=callbacks,
                dataset_sink_mode=dataset_sink_mode,
                sink_size=sink_size)
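# --- Usage sketch (illustrative, not part of the API above) ---
# A minimal, hedged example of `sink_size`: `MyNet` and `create_my_dataset` are
# hypothetical stand-ins for a real network and dataset pipeline. With
# sink_size=1000, each of the 10 epochs sinks 1000 pieces of data instead of
# the complete dataset.
from mindspore import nn
from mindspore.nn import Momentum
from mindspore.train import Model

net = MyNet()                     # hypothetical network
dataset = create_my_dataset()     # hypothetical dataset factory
loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
optim = Momentum(params=net.trainable_params(), learning_rate=0.1, momentum=0.9)
model = Model(net, loss_fn=loss, optimizer=optim)
model.train(10, dataset, dataset_sink_mode=True, sink_size=1000)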
def eval(self, valid_dataset, callbacks=None, dataset_sink_mode=True):
    """
    Evaluation API where the iteration is controlled by python front-end.

    When setting pynative mode, the evaluation will be performed with dataset non-sink mode.

    Note:
        CPU is not supported when dataset_sink_mode is True.
        If dataset_sink_mode is True, data will be sent to the device. If the device is Ascend,
        features of data will be transferred one by one. The limit of data transmission per time
        is 256M.

    Args:
        valid_dataset (Dataset): Dataset to evaluate the model.
        callbacks (list): List of callback objects which should be executed while evaluation.
            Default: None.
        dataset_sink_mode (bool): Determines whether to pass the data through the dataset channel.
            Default: True.

    Returns:
        Dict, the loss value and metrics values for the model in test mode.

    Examples:
        >>> dataset = get_dataset()
        >>> net = Net()
        >>> loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
        >>> model = Model(net, loss_fn=loss, optimizer=None, metrics={'acc'})
        >>> model.eval(dataset)
    """
    check_bool(dataset_sink_mode)
    _device_number_check(self._parallel_mode, self._device_number)
    if not self._metric_fns:
        raise ValueError("metric fn can not be None or empty.")

    cb_params = _InternalCallbackParam()
    cb_params.eval_network = self._eval_network
    cb_params.valid_dataset = valid_dataset
    cb_params.batch_num = valid_dataset.get_dataset_size()
    cb_params.mode = "eval"
    cb_params.cur_step_num = 0
    cb_params.list_callback = self._transform_callbacks(callbacks)
    cb_params.network = self._network

    self._eval_network.set_train(mode=False)
    self._eval_network.phase = 'eval'
    self._clear_metrics()

    with _CallbackManager(callbacks) as list_callback:
        if dataset_sink_mode:
            return self._eval_dataset_sink_process(valid_dataset, list_callback, cb_params)
        return self._eval_process(valid_dataset, list_callback, cb_params)
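# --- Usage sketch (illustrative, not part of the API above) ---
# A hedged example of consuming `eval`'s return value: the returned dict maps
# each configured metric name to its value. `MyNet` and `create_my_dataset`
# are hypothetical stand-ins.
from mindspore import nn
from mindspore.train import Model

net = MyNet()                     # hypothetical network
loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
model = Model(net, loss_fn=loss, metrics={'acc', 'loss'})
result = model.eval(create_my_dataset(), dataset_sink_mode=False)
print(result)   # e.g. {'acc': 0.91, 'loss': 0.31} -- illustrative values only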
def _eval_dataset_sink_process(self, valid_dataset, list_callback=None, cb_params=None):
    """
    Evaluation. The data would be passed to network through dataset channel.

    Args:
        valid_dataset (Dataset): Dataset to evaluate the model.
        list_callback (ListCallback): Executor of callback list. Default: None.
        cb_params (_InternalCallbackParam): Callback parameters. Default: None.

    Returns:
        Dict, the loss value and metrics values for the model in test mode.
    """
    _device_number_check(self._parallel_mode, self._device_number)

    run_context = RunContext(cb_params)

    # remove later to deal with loop sink
    need_wrap = False
    if not hasattr(valid_dataset, '__ME_INITED__') and context.get_context("enable_loop_sink") \
            and not context.get_context("enable_ge"):
        need_wrap = True
        valid_dataset.__loop_size__ = 1

    dataset_helper = DatasetHelper(valid_dataset)

    # remove later to deal with loop sink
    if need_wrap:
        self._eval_network = nn.DataWrapper(self._eval_network, *(dataset_helper.types_shapes()),
                                            valid_dataset.__ME_INITED__)
        self._eval_network.set_train(mode=False)
        self._eval_network.phase = 'eval'

    list_callback.begin(run_context)
    for inputs in dataset_helper:
        cb_params.cur_step_num += 1
        list_callback.step_begin(run_context)

        outputs = self._eval_network(*inputs)

        cb_params.net_outputs = outputs
        list_callback.step_end(run_context)
        self._update_metrics(outputs)

    metrics = self._get_metrics()
    cb_params.metrics = metrics
    list_callback.end(run_context)

    return metrics
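# --- Usage sketch (illustrative, not part of the API above) ---
# The sink-mode loop above drives callbacks through `RunContext`, whose
# `original_args()` exposes the `cb_params` fields populated in `eval`
# (e.g. `cur_step_num`, `net_outputs`). A minimal custom callback, assuming
# the public `Callback` base class from mindspore.train.callback:
from mindspore.train.callback import Callback

class EvalStepLogger(Callback):
    """Prints the step counter each time the evaluation loop finishes a step."""
    def step_end(self, run_context):
        cb_params = run_context.original_args()
        print("eval step", cb_params.cur_step_num)

# Passed as `model.eval(dataset, callbacks=[EvalStepLogger()])`, this fires at
# each `list_callback.step_end(run_context)` call in the loop above.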
def init(self, train_dataset=None, valid_dataset=None):
    """
    Initializes compute graphs and data graphs with sink mode.

    Note:
        Pre-init process only supports `GRAPH_MODE` and `Ascend` target currently.

    Args:
        train_dataset (Dataset): A training dataset iterator. If `train_dataset` is defined,
            training graphs will be initialized. Default: None.
        valid_dataset (Dataset): An evaluation dataset iterator. If `valid_dataset` is defined,
            evaluation graphs will be initialized, and `metrics` in `Model` can not be None.
            Default: None.

    Examples:
        >>> train_dataset = get_train_dataset()
        >>> valid_dataset = get_valid_dataset()
        >>> net = Net()
        >>> loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
        >>> optim = Momentum(params=net.trainable_params(), learning_rate=0.1, momentum=0.9)
        >>> model = Model(net, loss_fn=loss, optimizer=optim, metrics={'acc'})
        >>> model.init(train_dataset, valid_dataset)
        >>> model.train(2, train_dataset)
        >>> model.eval(valid_dataset)
    """
    if context.get_context("mode") != context.GRAPH_MODE or \
            context.get_context("device_target") != "Ascend":
        raise RuntimeError('Pre-init process only supports GRAPH MODE and Ascend target currently.')

    if not train_dataset and not valid_dataset:
        raise ValueError('Both train_dataset and valid_dataset can not be None or empty.')

    _device_number_check(self._parallel_mode, self._device_number)

    if train_dataset:
        _parameter_broadcast_check(self._parallel_mode, self._parameter_broadcast)
        self._train_network.set_train()
        self._train_network.phase = 'train'

        if self._parameter_broadcast:
            self._train_network.set_broadcast_flag()
        train_dataset.__no_send__ = True
        train_dataset_helper, train_network = self._exec_preprocess(self._train_network,
                                                                    is_train=True,
                                                                    phase='train',
                                                                    dataset=train_dataset,
                                                                    dataset_sink_mode=True)
        self._train_network = train_network
        for inputs in train_dataset_helper:
            self._train_network.compile(*inputs)
            break

    if valid_dataset:
        if not self._metric_fns:
            raise RuntimeError('If `valid_dataset` is defined, metric fn can not be None or empty.')

        self._eval_network.set_train(False)
        self._eval_network.phase = 'eval'
        valid_dataset.__no_send__ = True
        valid_dataset_helper, eval_network = self._exec_preprocess(self._eval_network,
                                                                   is_train=False,
                                                                   phase='eval',
                                                                   dataset=valid_dataset,
                                                                   dataset_sink_mode=True)
        self._eval_network = eval_network
        for inputs in valid_dataset_helper:
            self._eval_network.compile(*inputs)
            break
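# --- Usage sketch (illustrative, not part of the API above) ---
# `init` front-loads graph compilation, so the first `train`/`eval` call does
# not pay the compile cost. A hedged sketch that measures the split, with
# `MyNet` and `create_my_dataset` as hypothetical stand-ins:
import time

from mindspore import nn
from mindspore.nn import Momentum
from mindspore.train import Model

net = MyNet()                     # hypothetical network
loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
optim = Momentum(params=net.trainable_params(), learning_rate=0.1, momentum=0.9)
model = Model(net, loss_fn=loss, optimizer=optim, metrics={'acc'})

train_ds = create_my_dataset()    # hypothetical dataset factory
valid_ds = create_my_dataset()

start = time.time()
model.init(train_ds, valid_ds)    # compiles both graphs up front
print("compile time:", time.time() - start)

start = time.time()
model.train(2, train_ds)          # runs without recompiling
print("train time:", time.time() - start)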
def train(self, epoch, train_dataset, callbacks=None, dataset_sink_mode=True):
    """
    Training API where the iteration is controlled by python front-end.

    When setting pynative mode, the training process will be performed with dataset not sink.

    Note:
        CPU is not supported when dataset_sink_mode is True.
        If dataset_sink_mode is True, the epoch of training should be equal to the count of repeat
        operations in dataset processing. Otherwise, errors could occur since the amount of data
        is not what the training requires.
        If dataset_sink_mode is True, data will be sent to the device. If the device is Ascend,
        features of data will be transferred one by one. The limit of data transmission per time
        is 256M.

    Args:
        epoch (int): Total number of iterations on the data.
        train_dataset (Dataset): A training dataset iterator. If there is no loss_fn, a tuple with
            multiple data items (data1, data2, data3, ...) should be returned and passed to the
            network. Otherwise, a tuple (data, label) should be returned. The data and label would
            be passed to the network and loss function respectively.
        callbacks (list): List of callback objects which should be executed while training.
            Default: None.
        dataset_sink_mode (bool): Determines whether to pass the data through the dataset channel.
            Default: True. In pynative mode, the training process will be performed with dataset
            not sink.

    Examples:
        >>> dataset = get_dataset()
        >>> net = Net()
        >>> loss = nn.SoftmaxCrossEntropyWithLogits()
        >>> loss_scale_manager = FixedLossScaleManager()
        >>> optim = Momentum(params=net.trainable_params(), learning_rate=0.1, momentum=0.9)
        >>> model = Model(net, loss_fn=loss, optimizer=optim, metrics=None,
        ...               loss_scale_manager=loss_scale_manager)
        >>> model.train(2, dataset)
    """
    repeat_count = train_dataset.get_repeat_count()
    if epoch != repeat_count and dataset_sink_mode is True:
        logger.warning(f"The epoch_size {epoch} is not the same with dataset repeat_count {repeat_count}")
    check_bool(dataset_sink_mode)
    _device_number_check(self._parallel_mode, self._device_number)
    _parameter_broadcast_check(self._parallel_mode, self._parameter_broadcast)

    if context.get_context("device_target") in ["CPU", "GPU"] and \
            context.get_context("enable_loop_sink"):
        raise ValueError("CPU and GPU can't support loop sink, please set enable_loop_sink=False.")

    self._train(epoch,
                train_dataset,
                callbacks=callbacks,
                dataset_sink_mode=dataset_sink_mode)
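# --- Usage sketch (illustrative, not part of the API above) ---
# This variant warns when `epoch` differs from the dataset repeat count in
# sink mode. One way to keep the two aligned is to derive the repeat count
# from the intended epoch count; `create_my_dataset` is a hypothetical
# stand-in, and `repeat` is the standard MindSpore dataset operation.
# Reuses the `model` built in the sketch above.
EPOCHS = 2
dataset = create_my_dataset().repeat(EPOCHS)   # repeat_count now equals EPOCHS
model.train(EPOCHS, dataset, dataset_sink_mode=True)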