def test_to_full_tensor_sens_2():
    """Two local tensors for rank 2 of 4 devices are zero-padded to the full
    batch, and the scaling sens is appended as a scalar float32 tensor."""
    local_float = Tensor([[1, 2, 3], [4, 5, 6]], dtype=ms.float32)
    local_int = Tensor([[1], [4]], dtype=ms.int32)
    full_tensor = _to_full_tensor((local_float, local_int,), 4, 2, scaling_sens=0.1)

    zero_row = [0, 0, 0]
    expected_float = Tensor(
        [zero_row, zero_row, zero_row, zero_row,
         [1, 2, 3], [4, 5, 6], zero_row, zero_row],
        dtype=ms.float32)
    expected_int = Tensor(
        [[0], [0], [0], [0], [1], [4], [0], [0]],
        dtype=ms.int32)
    expected_sens = Tensor(0.1, dtype=ms.float32)

    # Compare every produced tensor against its expectation element-wise.
    for produced, expected in zip(full_tensor, (expected_float, expected_int, expected_sens)):
        assert np.all(produced.asnumpy() == expected.asnumpy())
def test_to_full_tensor_1():
    """A single local tensor for rank 2 of 4 devices is zero-padded into the
    full batch when no scaling sens is supplied."""
    local = Tensor([[1, 2, 3], [4, 5, 6]], dtype=ms.float32)
    full_tensor = _to_full_tensor(local, 4, 2, scaling_sens=None)

    pad = [0, 0, 0]
    expected = Tensor(
        [pad, pad, pad, pad, [1, 2, 3], [4, 5, 6], pad, pad],
        dtype=ms.float32)

    assert np.all(full_tensor[0].asnumpy() == expected.asnumpy())
def _train_dataset_sink_process(self, epoch, train_dataset, list_callback=None, cb_params=None, sink_size=-1):
    """
    Training process. The data would be passed to network through dataset channel.

    Alternates every sink between two compiled graphs: 'train0' (THOR
    second-order update, flag thor=True) and 'train1' (first-order steps,
    flag thor=False), driven by ``switch_branch_one``.

    Args:
        epoch (int): Total number of iterations on the data.
        train_dataset (Dataset): A training dataset iterator. If there is no
                                 loss_fn, a tuple with multiply data (data1, data2, data3, ...) should be
                                 returned and passed to the network. Otherwise, a tuple (data, label) should
                                 be returned, and the data and label are passed to the network and loss
                                 function respectively.
        list_callback (Callback): Executor of callback list. Default: None.
        cb_params (_InternalCallbackParam): Callback parameters. Default: None.
        sink_size (int): Control the amount of data each sink. Default: -1.
    """
    # With an explicit sink_size, translate "epoch" into the number of
    # dataset passes actually needed to cover epoch * sink_size steps.
    if sink_size == -1:
        epoch_num = epoch
    else:
        epoch_num = math.ceil(epoch * sink_size / train_dataset.get_dataset_size())
    # THOR schedule: one second-order sink followed by (frequency - 1)
    # first-order steps; the dataset loop size is set for the second-order leg.
    iter_first_order = self._frequency - 1
    iter_second_order = 1
    train_dataset.__loop_size__ = iter_second_order
    dataset_helper, train_network = self._exec_preprocess(self._train_network,
                                                          is_train=True,
                                                          phase='train',
                                                          dataset=train_dataset,
                                                          dataset_sink_mode=True,
                                                          sink_size=sink_size,
                                                          epoch_num=epoch_num,
                                                          iter_first_order=iter_first_order)
    self._train_network = train_network
    cb_params.train_network = self._train_network
    cb_params.cur_step_num = 0
    run_context = RunContext(cb_params)
    list_callback.begin(run_context)
    # used to stop training for early stop, such as stopAtTime or stopAtStep
    should_stop = False
    has_do_dataset_init = False     # 'train1' data graph is built lazily, once
    switch_branch_one = True        # True -> 'train0' leg, False -> 'train1' leg
    train_network_init_flag = True  # thor flags are set once per branch, then left alone
    for i in range(epoch):
        cb_params.cur_epoch_num = i + 1
        list_callback.epoch_begin(run_context)
        # for data sink dataset_helper only iter once, other wise iter epoch_size times.
        for inputs in dataset_helper:
            if _need_to_full():
                # Model-parallel: pad local shards into full-batch tensors.
                inputs = _to_full_tensor(inputs, self._device_number, self._global_rank)
            list_callback.step_begin(run_context)
            if switch_branch_one:
                # Second-order leg: advances by the helper's sink size.
                cb_params.cur_step_num += dataset_helper.sink_size()
                if train_network_init_flag:
                    self._train_network.add_flags_recursive(thor=True)
                self._train_network.phase = 'train0'
            else:
                # First-order leg: advances by iter_first_order steps.
                cb_params.cur_step_num += iter_first_order
                if train_network_init_flag:
                    self._train_network.add_flags_recursive(thor=False)
                    # Both branches have now been flagged; stop re-flagging.
                    train_network_init_flag = False
                self._train_network.phase = 'train1'
                if not has_do_dataset_init:
                    # Build the data graph for the first-order phase on first use.
                    _exec_datagraph(train_dataset, iter_first_order, phase='train1_dataset')
                    has_do_dataset_init = True
            # Alternate legs on every sink iteration.
            switch_branch_one = not switch_branch_one
            outputs = self._train_network(*inputs)
            cb_params.net_outputs = outputs
            list_callback.step_end(run_context)
        list_callback.epoch_end(run_context)
        # A callback may request an early stop via run_context.
        should_stop = should_stop or run_context.get_stop_requested()
        if should_stop:
            break
    dataset_helper.stop_send()
    list_callback.end(run_context)
def _train_dataset_sink_process(self, epoch, train_dataset, list_callback=None, cb_params=None, sink_size=-1):
    """
    Training process. The data would be passed to network through dataset channel.

    Dispatches each sink iteration to a device-specific step:
    ``_train_gpu_sink_step`` on GPU, ``_train_ascend_sink_step`` otherwise.

    Args:
        epoch (int): Total number of iterations on the data.
        train_dataset (Dataset): A training dataset iterator. If there is no
                                 loss_fn, a tuple with multiple data (data1, data2, data3, ...) should be
                                 returned and passed to the network. Otherwise, a tuple (data, label) should
                                 be returned. The data and label would be passed to the network and loss
                                 function respectively.
        list_callback (Callback): Executor of callback list. Default: None.
        cb_params (_InternalCallbackParam): Callback parameters. Default: None.
        sink_size (int): Control the amount of data in each sink. Default: -1.
    """
    # With an explicit sink_size, translate "epoch" into the number of
    # dataset passes needed to cover epoch * sink_size steps.
    if sink_size == -1:
        epoch_num = epoch
    else:
        epoch_num = math.ceil(epoch * sink_size / train_dataset.get_dataset_size())
    # THOR schedule: 1 update iteration per (frequency - 1) accumulation iterations.
    iter_update_order = 1
    iter_accu_order = self._frequency - 1
    # GPU sinks one step at a time; Ascend sinks a whole accumulation span.
    if context.get_context("device_target") == "GPU":
        train_dataset.__loop_size__ = 1
    else:
        train_dataset.__loop_size__ = iter_accu_order
    dataset_helper, train_network = self._exec_preprocess(self._train_network,
                                                          is_train=True,
                                                          phase='train',
                                                          dataset=train_dataset,
                                                          dataset_sink_mode=True,
                                                          sink_size=sink_size,
                                                          epoch_num=epoch_num,
                                                          iter_update_order=iter_update_order)
    self._train_network = train_network
    cb_params.train_network = self._train_network
    cb_params.cur_step_num = 0
    run_context = RunContext(cb_params)
    list_callback.begin(run_context)
    for i in range(epoch):
        cb_params.cur_epoch_num = i + 1
        list_callback.epoch_begin(run_context)
        # for data sink dataset_helper only iter once, other wise iter epoch_size times.
        for inputs in dataset_helper:
            # Model-parallel on GPU: pad local shards into full-batch tensors.
            if _need_to_full() and context.get_context("device_target") == "GPU":
                inputs = _to_full_tensor(inputs, self._device_number, self._global_rank)
            list_callback.step_begin(run_context)
            # Per-device step implementations handle the THOR phase switching.
            if context.get_context("device_target") == "GPU":
                self._train_gpu_sink_step(cb_params, inputs, list_callback, iter_accu_order, run_context)
            else:
                self._train_ascend_sink_step(cb_params, train_dataset, iter_accu_order, inputs, list_callback, run_context)
        list_callback.epoch_end(run_context)
        # A callback may request an early stop; the flag is kept on the
        # instance (shared with the sink-step helpers — note it is sticky).
        self.should_stop = self.should_stop or run_context.get_stop_requested()
        if self.should_stop:
            break
    dataset_helper.stop_send()
    list_callback.end(run_context)