def test_step_end_save_graph():
    """Test saving the graph file at step end."""
    train_config = CheckpointConfig(
        save_checkpoint_steps=16,
        save_checkpoint_seconds=0,
        keep_checkpoint_max=5,
        keep_checkpoint_per_n_minutes=0)
    cb_params = _InternalCallbackParam()
    net = LossNet()
    input_data = Tensor(np.random.randint(0, 255, [1, 3, 224, 224]).astype(np.float32))
    input_label = Tensor(np.random.randint(0, 3, [1, 3]).astype(np.float32))
    net(input_data, input_label)
    cb_params.train_network = net
    cb_params.epoch_num = 10
    cb_params.cur_epoch_num = 5
    cb_params.cur_step_num = 0
    cb_params.batch_num = 32
    ckpoint_cb = ModelCheckpoint(prefix="test", directory='./test_files', config=train_config)
    run_context = RunContext(cb_params)
    ckpoint_cb.begin(run_context)
    ckpoint_cb.step_end(run_context)
    # The graph file is written on the first step_end only.
    assert os.path.exists('./test_files/test-graph.meta')
    if os.path.exists('./test_files/test-graph.meta'):
        os.chmod('./test_files/test-graph.meta', stat.S_IWRITE)
        os.remove('./test_files/test-graph.meta')
    ckpoint_cb.step_end(run_context)
    assert not os.path.exists('./test_files/test-graph.meta')
def test_save_checkpoint():
    """Test save checkpoint."""
    train_config = CheckpointConfig(
        save_checkpoint_steps=16,
        save_checkpoint_seconds=0,
        keep_checkpoint_max=5,
        keep_checkpoint_per_n_minutes=0)
    cb_params = _InternalCallbackParam()
    net = Net()
    loss = nn.SoftmaxCrossEntropyWithLogits()
    optim = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
    network_ = WithLossCell(net, loss)
    _train_network = TrainOneStepCell(network_, optim)
    cb_params.train_network = _train_network
    cb_params.epoch_num = 10
    cb_params.cur_epoch_num = 5
    cb_params.cur_step_num = 0
    cb_params.batch_num = 32
    ckpoint_cb = ModelCheckpoint(prefix="test_ckpt", directory='./test_files', config=train_config)
    run_context = RunContext(cb_params)
    ckpoint_cb.begin(run_context)
    ckpoint_cb.step_end(run_context)
    if os.path.exists('./test_files/test_ckpt-model.pkl'):
        os.chmod('./test_files/test_ckpt-model.pkl', stat.S_IWRITE)
        os.remove('./test_files/test_ckpt-model.pkl')
def test_checkpoint_save_ckpt_seconds():
    """Test saving checkpoints based on elapsed seconds."""
    train_config = CheckpointConfig(
        save_checkpoint_steps=16,
        save_checkpoint_seconds=100,
        keep_checkpoint_max=0,
        keep_checkpoint_per_n_minutes=1)
    ckpt_cb = ModelCheckpoint(config=train_config)
    cb_params = _InternalCallbackParam()
    net = Net()
    loss = nn.SoftmaxCrossEntropyWithLogits()
    optim = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
    network_ = WithLossCell(net, loss)
    _train_network = TrainOneStepCell(network_, optim)
    cb_params.train_network = _train_network
    cb_params.epoch_num = 10
    cb_params.cur_epoch_num = 4
    cb_params.cur_step_num = 128
    cb_params.batch_num = 32
    run_context = RunContext(cb_params)
    ckpt_cb.begin(run_context)
    ckpt_cb.step_end(run_context)
    ckpt_cb2 = ModelCheckpoint(config=train_config)
    cb_params.cur_epoch_num = 1
    cb_params.cur_step_num = 16
    ckpt_cb2.begin(run_context)
    ckpt_cb2.step_end(run_context)
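# A minimal usage sketch of how the checkpoint callbacks exercised above are
# typically wired into a real training run. `Model` is the standard mindspore
# high-level wrapper; `create_dataset` is a hypothetical dataset factory used
# only for illustration, not part of this test suite.
def example_train_with_checkpoint():
    net = Net()
    loss = nn.SoftmaxCrossEntropyWithLogits()
    optim = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
    model = Model(net, loss_fn=loss, optimizer=optim)
    config = CheckpointConfig(save_checkpoint_steps=16, keep_checkpoint_max=5)
    ckpt_cb = ModelCheckpoint(prefix="example", directory='./ckpt', config=config)
    dataset = create_dataset()  # hypothetical; replace with a real mindspore Dataset
    model.train(epoch=2, train_dataset=dataset, callbacks=[ckpt_cb])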
def _eval_process(self, valid_dataset, list_callback=None, cb_params=None):
    """
    Evaluation. The data is passed to the network directly.

    Args:
        valid_dataset (Dataset): Dataset to evaluate the model.
        list_callback (Callback): Executor of callback list. Default: None.
        cb_params (_InternalCallbackParam): Callback parameters. Default: None.

    Returns:
        Dict, the loss value and metrics values for the model in test mode.
    """
    run_context = RunContext(cb_params)
    list_callback.begin(run_context)

    dataset_helper, _ = self._exec_preprocess(self._eval_network,
                                              is_train=False,
                                              phase='eval',
                                              dataset=valid_dataset,
                                              dataset_sink_mode=False)
    for next_element in dataset_helper:
        cb_params.cur_step_num += 1
        list_callback.step_begin(run_context)
        outputs = self._eval_network(*next_element)
        cb_params.net_outputs = outputs
        list_callback.step_end(run_context)
        self._update_metrics(outputs)

    metrics = self._get_metrics()
    cb_params.metrics = metrics
    list_callback.end(run_context)

    return metrics
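# The per-step metric accumulation in _eval_process reduces to the clear /
# update / eval pattern sketched below. `Accuracy` is mindspore's stock metric
# (from mindspore.nn); the standalone loop and the assumption that the eval
# network returns plain logits are illustrative simplifications, not the exact
# internal code path.
def example_metric_accumulation(eval_network, dataset_helper):
    metric = Accuracy()
    metric.clear()                      # reset accumulated state
    for data, label in dataset_helper:
        logits = eval_network(data, label)
        metric.update(logits, label)    # accumulate per-batch statistics
    return {'Accuracy': metric.eval()}  # reduce to a final scalar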
def test_loss_monitor_normal_mode():
    """Test loss monitor in normal (non-sink) mode."""
    cb_params = _InternalCallbackParam()
    run_context = RunContext(cb_params)
    loss_cb = LossMonitor(1)
    cb_params.cur_epoch_num = 4
    cb_params.cur_step_num = 1
    cb_params.batch_num = 1
    cb_params.net_outputs = Tensor(2.0)
    loss_cb.begin(run_context)
    loss_cb.epoch_begin(run_context)
    loss_cb.step_begin(run_context)
    loss_cb.step_end(run_context)
    loss_cb.epoch_end(run_context)
    loss_cb.end(run_context)
def test_RunContext():
    """Test RunContext."""
    context_err = 666
    with pytest.raises(TypeError):
        RunContext(context_err)

    cb_params = _InternalCallbackParam()
    cb_params.member1 = 1
    cb_params.member2 = "abc"
    run_context = RunContext(cb_params)
    run_context.original_args()
    assert cb_params.member1 == 1
    assert cb_params.member2 == "abc"
    run_context.request_stop()
    should_stop = run_context.get_stop_requested()
    assert should_stop
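# RunContext.request_stop(), asserted above, is the hook a user-defined
# callback uses to end training early. A minimal sketch of such a callback;
# the class name and threshold are assumptions for illustration, and it
# assumes net_outputs is a scalar loss Tensor (in real runs it may be a tuple).
class StopAtLoss(Callback):
    """Request a stop once the reported loss drops below a threshold."""

    def __init__(self, threshold=0.1):
        super(StopAtLoss, self).__init__()
        self.threshold = threshold

    def step_end(self, run_context):
        cb_params = run_context.original_args()
        loss = float(cb_params.net_outputs.asnumpy())
        if loss < self.threshold:
            run_context.request_stop()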
def test_loss_monitor_sink_mode():
    """Test loss monitor sink mode."""
    cb_params = _InternalCallbackParam()
    cb_params.cur_epoch_num = 4
    cb_params.cur_step_num = 2
    cb_params.batch_num = 2
    cb_params.net_outputs = Tensor(2.0)
    run_context = RunContext(cb_params)
    loss_cb = LossMonitor(1)
    callbacks = [loss_cb]
    with _CallbackManager(callbacks) as callbacklist:
        callbacklist.begin(run_context)
        callbacklist.epoch_begin(run_context)
        callbacklist.step_begin(run_context)
        callbacklist.step_end(run_context)
        callbacklist.epoch_end(run_context)
        callbacklist.end(run_context)
def _train_process(self, epoch, train_dataset, list_callback=None, cb_params=None):
    """
    Training process. The data is passed to the network directly.

    Args:
        epoch (int): Total number of iterations on the data.
        train_dataset (Dataset): A training dataset iterator. If there is no
            loss_fn, a tuple with multiple data (data1, data2, data3, ...)
            should be returned and passed to the network. Otherwise, a tuple
            (data, label) should be returned. The data and label are passed to
            the network and loss function respectively.
        list_callback (Callback): Executor of callback list. Default: None.
        cb_params (_InternalCallbackParam): Callback parameters. Default: None.
    """
    dataset_helper, _ = self._exec_preprocess(self._train_network,
                                              is_train=True,
                                              phase='train',
                                              dataset=train_dataset,
                                              dataset_sink_mode=False)
    cb_params.cur_step_num = 0
    run_context = RunContext(cb_params)
    list_callback.begin(run_context)
    # Used for early stopping, e.g. by a stop-at-time or stop-at-step callback.
    should_stop = False

    for i in range(epoch):
        cb_params.cur_epoch_num = i + 1
        list_callback.epoch_begin(run_context)

        for next_element in dataset_helper:
            len_element = len(next_element)
            if self._loss_fn and len_element != 2:
                raise ValueError("when loss_fn is not None, train_dataset should "
                                 "return two elements, but got {}".format(len_element))
            cb_params.cur_step_num += 1
            list_callback.step_begin(run_context)

            overflow = False
            if self._loss_scale_manager and self._loss_scale_manager.get_drop_overflow_update():
                scaling_sens = self._get_scaling_sens()
                next_element = tuple(next_element) + (Tensor(scaling_sens, mstype.float32),)

            outputs = self._train_network(*next_element)
            cb_params.net_outputs = outputs
            if self._loss_scale_manager and self._loss_scale_manager.get_drop_overflow_update():
                _, overflow, _ = outputs
                overflow = np.all(overflow.asnumpy())
                self._loss_scale_manager.update_loss_scale(overflow)

            list_callback.step_end(run_context)
            should_stop = should_stop or run_context.get_stop_requested()
            if should_stop:
                break

        train_dataset.reset()
        list_callback.epoch_end(run_context)
        should_stop = should_stop or run_context.get_stop_requested()
        if should_stop:
            break

    list_callback.end(run_context)
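# The overflow handling in _train_process is driven by the loss scale manager
# passed to Model: with drop_overflow_update enabled, the train network returns
# a (loss, overflow_flag, scaling_sens) tuple and the manager adjusts the scale
# from the flag. A minimal configuration sketch; DynamicLossScaleManager is the
# stock mindspore manager, while the surrounding names are assumptions.
def example_loss_scale_setup(net, loss, optim, dataset):
    loss_scale_manager = DynamicLossScaleManager(init_loss_scale=2 ** 24,
                                                 scale_factor=2,
                                                 scale_window=2000)
    model = Model(net, loss_fn=loss, optimizer=optim,
                  loss_scale_manager=loss_scale_manager)
    model.train(epoch=1, train_dataset=dataset)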
def _train_dataset_sink_process(self, epoch, train_dataset, list_callback=None, cb_params=None):
    """
    Training process. The data is passed to the network through the dataset channel.

    Args:
        epoch (int): Total number of iterations on the data.
        train_dataset (Dataset): A training dataset iterator. If there is no
            loss_fn, a tuple with multiple data (data1, data2, data3, ...)
            should be returned and passed to the network. Otherwise, a tuple
            (data, label) should be returned. The data and label are passed to
            the network and loss function respectively.
        list_callback (Callback): Executor of callback list. Default: None.
        cb_params (_InternalCallbackParam): Callback parameters. Default: None.
    """
    iter_first_order = self._frequency - 1
    iter_second_order = 1
    train_dataset.__loop_size__ = iter_second_order
    dataset_helper, train_network = self._exec_preprocess(self._train_network,
                                                          is_train=True,
                                                          phase='train',
                                                          dataset=train_dataset,
                                                          dataset_sink_mode=True,
                                                          iter_first_order=iter_first_order)
    self._train_network = train_network
    cb_params.train_network = self._train_network
    cb_params.cur_step_num = 0
    loop_size = dataset_helper.loop_size()
    run_context = RunContext(cb_params)
    list_callback.begin(run_context)
    # Used for early stopping, e.g. by a stop-at-time or stop-at-step callback.
    should_stop = False
    has_do_dataset_init = False
    switch_branch_one = True

    for i in range(epoch):
        cb_params.cur_epoch_num = i + 1
        list_callback.epoch_begin(run_context)

        # In dataset sink mode, dataset_helper iterates only once per epoch;
        # otherwise it iterates epoch_size times.
        for inputs in dataset_helper:
            list_callback.step_begin(run_context)
            if switch_branch_one:
                cb_params.cur_step_num += loop_size
                self._train_network.add_flags_recursive(thor=True)
                self._train_network.phase = 'train0'
            else:
                cb_params.cur_step_num += iter_first_order
                self._train_network.add_flags_recursive(thor=False)
                self._train_network.phase = 'train1'
                if not has_do_dataset_init:
                    _exec_datagraph(train_dataset, iter_first_order, phase='train1_dataset')
                    has_do_dataset_init = True
            switch_branch_one = not switch_branch_one
            outputs = self._train_network(*inputs)
            cb_params.net_outputs = outputs
            list_callback.step_end(run_context)

        list_callback.epoch_end(run_context)
        should_stop = should_stop or run_context.get_stop_requested()
        if should_stop:
            break

    list_callback.end(run_context)
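# The branch switching above interleaves one second-order (THOR) step with
# (self._frequency - 1) first-order steps. A pure-Python sketch of that
# schedule, assuming a frequency of 4; the function and variable names here
# are illustrative, not part of the internal API.
def example_thor_schedule(total_steps, frequency=4):
    phases = []
    switch_branch_one = True
    step = 0
    while step < total_steps:
        if switch_branch_one:
            phases.append('train0')   # one second-order update
            step += 1
        else:
            phases.append('train1')   # frequency - 1 first-order updates
            step += frequency - 1
        switch_branch_one = not switch_branch_one
    return phases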