Example 1
import pytest

from mindspore.train.callback import RunContext, _InternalCallbackParam


def test_RunContext():
    """Test RunContext: construction, original_args, and the stop-request flag."""
    # A non-callback-param argument should be rejected.
    context_err = 666
    with pytest.raises(TypeError):
        RunContext(context_err)

    cb_params = _InternalCallbackParam()
    cb_params.member1 = 1
    cb_params.member2 = "abc"

    run_context = RunContext(cb_params)
    # original_args() returns the cb_params the context was constructed with.
    original_args = run_context.original_args()
    assert original_args.member1 == 1
    assert original_args.member2 == "abc"

    # request_stop() sets the flag that training loops poll for early stopping.
    run_context.request_stop()
    should_stop = run_context.get_stop_requested()
    assert should_stop
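The stop-request mechanism exercised above is how user callbacks implement early stopping: a callback calls request_stop() on the RunContext, and the training loops in the later examples poll get_stop_requested() after each step or epoch. A minimal sketch of such a callback, assuming MindSpore's public Callback base class and the cur_step_num field that the loops below maintain:

from mindspore.train.callback import Callback


class StopAtStep(Callback):
    """Illustrative callback: request an early stop at a given global step."""

    def __init__(self, stop_step):
        super(StopAtStep, self).__init__()
        self._stop_step = stop_step

    def step_end(self, run_context):
        # original_args() returns the cb_params the context was built with.
        cb_params = run_context.original_args()
        if cb_params.cur_step_num >= self._stop_step:
            run_context.request_stop()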
Example 2
    def _train_process(self,
                       epoch,
                       train_dataset,
                       list_callback=None,
                       cb_params=None):
        """
        Training process. The data would be passed to network directly.

        Args:
            epoch (int): Total number of iterations on the data.
            train_dataset (Dataset): A training dataset iterator. If there is no
                                     loss_fn, a tuple with multiple data items (data1, data2, data3, ...) should be
                                     returned and passed to the network. Otherwise, a tuple (data, label) should
                                     be returned, and the data and label are passed to the network and loss
                                     function respectively.
            list_callback (Callback): Executor of callback list. Default: None.
            cb_params (_InternalCallbackParam): Callback parameters. Default: None.
        """
        dataset_helper, _ = self._exec_preprocess(self._train_network,
                                                  is_train=True,
                                                  phase='train',
                                                  dataset=train_dataset,
                                                  dataset_sink_mode=False)
        cb_params.cur_step_num = 0
        run_context = RunContext(cb_params)
        list_callback.begin(run_context)
        # Used for early stopping, e.g. by StopAtTime or StopAtStep callbacks.
        should_stop = False

        for i in range(epoch):
            cb_params.cur_epoch_num = i + 1

            list_callback.epoch_begin(run_context)

            for next_element in dataset_helper:
                len_element = len(next_element)
                if self._loss_fn and len_element != 2:
                    raise ValueError(
                        "when loss_fn is not None, train_dataset should"
                        "return two elements, but got {}".format(len_element))
                cb_params.cur_step_num += 1
                list_callback.step_begin(run_context)

                overflow = False
                if self._loss_scale_manager and self._loss_scale_manager.get_drop_overflow_update():
                    scaling_sens = self._get_scaling_sens()
                    next_element = tuple(next_element) + (Tensor(scaling_sens, mstype.float32),)

                cb_params.train_dataset_element = next_element
                outputs = self._train_network(*next_element)
                cb_params.net_outputs = outputs
                if self._loss_scale_manager and self._loss_scale_manager.get_drop_overflow_update():
                    _, overflow, _ = outputs
                    overflow = np.all(overflow.asnumpy())
                    self._loss_scale_manager.update_loss_scale(overflow)

                list_callback.step_end(run_context)
                should_stop = should_stop or run_context.get_stop_requested()
                if should_stop:
                    break

            train_dataset.reset()

            list_callback.epoch_end(run_context)
            should_stop = should_stop or run_context.get_stop_requested()
            if should_stop:
                break

        list_callback.end(run_context)
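The overflow branch above unpacks a per-element overflow flag from the network outputs and reduces it with np.all before updating the loss scale. A self-contained sketch of just that reduction, with a toy manager standing in for the real loss-scale manager (ToyLossScale and its halving rule are illustrative, not MindSpore's API):

import numpy as np


class ToyLossScale:
    """Illustrative stand-in: halve the scale on overflow, otherwise keep it."""

    def __init__(self, init_scale=2.0 ** 16):
        self.scale = init_scale

    def update_loss_scale(self, overflow):
        if overflow:
            self.scale = max(self.scale / 2.0, 1.0)


manager = ToyLossScale()
overflow_flag = np.array([0.0, 1.0, 0.0])  # e.g. one flag per parallel device
overflow = bool(np.all(overflow_flag))     # the same reduction as in _train_process
manager.update_loss_scale(overflow)
print(manager.scale)                       # 65536.0: not every flag was set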
Example 3
    def _train_dataset_sink_process(self,
                                    epoch,
                                    train_dataset,
                                    list_callback=None,
                                    cb_params=None,
                                    sink_size=-1):
        """
        Training process. The data would be passed to network through dataset channel.

        Args:
            epoch (int): Total number of iterations on the data.
            train_dataset (Dataset): A training dataset iterator. If there is no
                                     loss_fn, a tuple with multiple data items (data1, data2, data3, ...) should be
                                     returned and passed to the network. Otherwise, a tuple (data, label) should
                                     be returned, and the data and label are passed to the network and loss
                                     function respectively.
            list_callback (Callback): Executor of callback list. Default: None.
            cb_params (_InternalCallbackParam): Callback parameters. Default: None.
            sink_size (int): Control the amount of data in each sink. Default: -1.
        """
        if sink_size == -1:
            epoch_num = epoch
        else:
            epoch_num = math.ceil(epoch * sink_size /
                                  train_dataset.get_dataset_size())

        iter_first_order = self._frequency - 1
        iter_second_order = 1
        train_dataset.__loop_size__ = iter_second_order
        dataset_helper, train_network = self._exec_preprocess(
            self._train_network,
            is_train=True,
            phase='train',
            dataset=train_dataset,
            dataset_sink_mode=True,
            sink_size=sink_size,
            epoch_num=epoch_num,
            iter_first_order=iter_first_order)
        self._train_network = train_network
        cb_params.train_network = self._train_network
        cb_params.cur_step_num = 0

        run_context = RunContext(cb_params)
        list_callback.begin(run_context)

        # Used for early stopping, e.g. by StopAtTime or StopAtStep callbacks.
        should_stop = False
        has_do_dataset_init = False
        switch_branch_one = True
        train_network_init_flag = True
        for i in range(epoch):
            cb_params.cur_epoch_num = i + 1
            list_callback.epoch_begin(run_context)

            # In dataset sink mode dataset_helper iterates only once; otherwise it iterates epoch_size times.
            for inputs in dataset_helper:
                if _need_to_full():
                    inputs = _to_full_tensor(inputs, self._device_number,
                                             self._global_rank)
                list_callback.step_begin(run_context)
                if switch_branch_one:
                    cb_params.cur_step_num += dataset_helper.sink_size()
                    if train_network_init_flag:
                        self._train_network.add_flags_recursive(thor=True)
                    self._train_network.phase = 'train0'
                else:
                    cb_params.cur_step_num += iter_first_order
                    if train_network_init_flag:
                        self._train_network.add_flags_recursive(thor=False)
                        train_network_init_flag = False
                    self._train_network.phase = 'train1'
                    if not has_do_dataset_init:
                        _exec_datagraph(train_dataset,
                                        iter_first_order,
                                        phase='train1_dataset')
                        has_do_dataset_init = True
                switch_branch_one = not switch_branch_one
                outputs = self._train_network(*inputs)
                cb_params.net_outputs = outputs
                list_callback.step_end(run_context)

            list_callback.epoch_end(run_context)
            should_stop = should_stop or run_context.get_stop_requested()
            if should_stop:
                break
        dataset_helper.stop_send()

        list_callback.end(run_context)
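The epoch_num computed at the top of this method tells the data graph how many full passes over the dataset the requested training consumes when sink_size overrides the natural epoch length. A worked example with made-up sizes:

import math

epoch = 90           # training epochs requested; each one sinks sink_size steps
sink_size = 100      # steps pushed to the device per epoch
dataset_size = 1000  # steps in one full pass over the dataset

# 90 epochs * 100 steps = 9000 steps, i.e. 9 full dataset passes
epoch_num = math.ceil(epoch * sink_size / dataset_size)
print(epoch_num)  # 9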
Example 4
    def _train_dataset_sink_process(self,
                                    epoch,
                                    train_dataset,
                                    list_callback=None,
                                    cb_params=None):
        """
        Training process. The data would be passed to network through dataset channel.

        Args:
            epoch (int): Total number of iterations on the data.
            train_dataset (Dataset): A training dataset iterator. If there is no
                                     loss_fn, a tuple with multiple data items (data1, data2, data3, ...) should be
                                     returned and passed to the network. Otherwise, a tuple (data, label) should
                                     be returned, and the data and label are passed to the network and loss
                                     function respectively.
            list_callback (_ListCallback): Executor of callback list. Default: None.
            cb_params (_InternalCallbackParam): Callback parameters. Default: None.
        """
        # remove later to deal with loop sink
        iter_first_order = 277
        iter_second_order = 1
        train_dataset.__loop_size__ = iter_second_order
        need_wrap = False
        if not hasattr(train_dataset, '__ME_INITED__') and context.get_context("enable_loop_sink") \
                and not context.get_context("enable_ge"):
            need_wrap = True

        dataset_helper = DatasetHelper(train_dataset, iter_first_order)
        # remove later to deal with loop sink
        if need_wrap:
            self._train_network = nn.DataWrapper(
                self._train_network, *(dataset_helper.types_shapes()),
                train_dataset.__ME_INITED__)
            cb_params.train_network = self._train_network
            self._train_network.set_train()

        cb_params.cur_step_num = 0
        loop_size = dataset_helper.loop_size()
        run_context = RunContext(cb_params)
        list_callback.begin(run_context)

        # Used for early stopping, e.g. by StopAtTime or StopAtStep callbacks.
        should_stop = False
        has_do_train1_dataset = False
        checkpoint_branch_one = True
        for i in range(epoch):
            cb_params.cur_epoch_num = i + 1
            list_callback.epoch_begin(run_context)

            # In dataset sink mode dataset_helper iterates only once; otherwise it iterates epoch_size times.
            for inputs in dataset_helper:
                list_callback.step_begin(run_context)
                if checkpoint_branch_one:
                    cb_params.cur_step_num += loop_size
                    self._train_network.set_second_order(True)
                    self._train_network.phase = 'train0'
                else:
                    cb_params.cur_step_num += iter_first_order
                    self._train_network.set_second_order(False)
                    self._train_network.phase = 'train1'
                    if not has_do_train1_dataset:
                        _exec_datagraph(train_dataset,
                                        iter_first_order,
                                        phase='train1_dataset')
                        has_do_train1_dataset = True
                checkpoint_branch_one = not checkpoint_branch_one
                outputs = self._train_network(*inputs)
                cb_params.net_outputs = outputs
                list_callback.step_end(run_context)

            list_callback.epoch_end(run_context)
            should_stop = should_stop or run_context.get_stop_requested()
            if should_stop:
                break

        list_callback.end(run_context)
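Examples 3 and 4 alternate every sink iteration between a second-order graph ('train0', advancing cur_step_num by the sink/loop size) and a first-order graph ('train1', advancing it by iter_first_order). A standalone simulation of that counter, using the loop size of 1 and the iter_first_order of 277 from this example:

loop_size = 1           # steps per second-order iteration (train0)
iter_first_order = 277  # steps per first-order iteration (train1)
cur_step_num = 0
branch_one = True

for _ in range(6):      # six sink iterations, alternating branches
    cur_step_num += loop_size if branch_one else iter_first_order
    branch_one = not branch_one

print(cur_step_num)     # 3 * (1 + 277) = 834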
Example 5
    def _train_dataset_sink_process(self,
                                    epoch,
                                    train_dataset,
                                    list_callback=None,
                                    cb_params=None,
                                    sink_size=-1):
        """
        Training process. The data would be passed to network through dataset channel.

        Args:
            epoch (int): Total number of iterations on the data.
            train_dataset (Dataset): A training dataset iterator. If there is no
                                     loss_fn, a tuple with multiple data items (data1, data2, data3, ...) should be
                                     returned and passed to the network. Otherwise, a tuple (data, label) should
                                     be returned. The data and label would be passed to the network and loss
                                     function respectively.
            list_callback (Callback): Executor of callback list. Default: None.
            cb_params (_InternalCallbackParam): Callback parameters. Default: None.
            sink_size (int): Control the amount of data in each sink. Default: -1.
        """
        if sink_size == -1:
            epoch_num = epoch
        else:
            epoch_num = math.ceil(epoch * sink_size /
                                  train_dataset.get_dataset_size())

        iter_update_order = 1
        iter_accu_order = self._frequency - 1
        if context.get_context("device_target") == "GPU":
            train_dataset.__loop_size__ = 1
        else:
            train_dataset.__loop_size__ = iter_accu_order
        dataset_helper, train_network = self._exec_preprocess(
            self._train_network,
            is_train=True,
            phase='train',
            dataset=train_dataset,
            dataset_sink_mode=True,
            sink_size=sink_size,
            epoch_num=epoch_num,
            iter_update_order=iter_update_order)

        self._train_network = train_network
        cb_params.train_network = self._train_network
        cb_params.cur_step_num = 0

        run_context = RunContext(cb_params)
        list_callback.begin(run_context)

        for i in range(epoch):
            cb_params.cur_epoch_num = i + 1
            list_callback.epoch_begin(run_context)
            # In dataset sink mode dataset_helper iterates only once; otherwise it iterates epoch_size times.
            for inputs in dataset_helper:
                if _need_to_full() and context.get_context(
                        "device_target") == "GPU":
                    inputs = _to_full_tensor(inputs, self._device_number,
                                             self._global_rank)
                list_callback.step_begin(run_context)
                if context.get_context("device_target") == "GPU":
                    self._train_gpu_sink_step(cb_params, inputs, list_callback,
                                              iter_accu_order, run_context)
                else:
                    self._train_ascend_sink_step(cb_params, train_dataset,
                                                 iter_accu_order, inputs,
                                                 list_callback, run_context)
            list_callback.epoch_end(run_context)
            self.should_stop = self.should_stop or run_context.get_stop_requested()
            if self.should_stop:
                break
        dataset_helper.stop_send()

        list_callback.end(run_context)
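Here each update cycle is split into iter_accu_order = self._frequency - 1 accumulation steps plus iter_update_order = 1 second-order update step. Under an assumed frequency of 5, a quick sketch of which global steps would fall on the update slot (the layout illustrates the counters; the value 5 is not from the source):

frequency = 5                    # assumed second-order update frequency
iter_update_order = 1            # update steps per cycle
iter_accu_order = frequency - 1  # accumulation steps per cycle

# 1-based global steps that fall on the update slot of each cycle
second_order_steps = [s for s in range(1, 21)
                      if (s - 1) % frequency < iter_update_order]
print(second_order_steps)  # [1, 6, 11, 16]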
Example 6
    def _train_dataset_sink_process(self, epoch, train_dataset, list_callback=None, cb_params=None):
        """
        Training process. The data would be passed to network through dataset channel.

        Args:
            epoch (int): Total number of iterations on the data.
            train_dataset (Dataset): A training dataset iterator. If there is no
                                     loss_fn, a tuple with multiple data items (data1, data2, data3, ...) should be
                                     returned and passed to the network. Otherwise, a tuple (data, label) should
                                     be returned, and the data and label are passed to the network and loss
                                     function respectively.
            list_callback (_ListCallback): Executor of callback list. Default: None.
            cb_params (_InternalCallbackParam): Callback parameters. Default: None.
        """
        iter_first_order = self._frequency - 1
        iter_second_order = 1
        train_dataset.__loop_size__ = iter_second_order
        dataset_helper, train_network = self._exec_preprocess(self._train_network,
                                                              is_train=True,
                                                              phase='train',
                                                              dataset=train_dataset,
                                                              dataset_sink_mode=True,
                                                              iter_first_order=iter_first_order)
        self._train_network = train_network
        cb_params.train_network = self._train_network
        cb_params.cur_step_num = 0

        loop_size = dataset_helper.loop_size()
        run_context = RunContext(cb_params)
        list_callback.begin(run_context)

        # Used for early stopping, e.g. by StopAtTime or StopAtStep callbacks.
        should_stop = False
        switch_branch_one = True
        # has_do_dataset_init1 = False
        for i in range(epoch):
            # index_epoch is a module-level counter used to offset epoch numbering
            # across successive calls to this method.
            global index_epoch
            cb_params.cur_epoch_num = i + 1 + index_epoch
            list_callback.epoch_begin(run_context)

            index_epoch += 1
            # In dataset sink mode dataset_helper iterates only once; otherwise it iterates epoch_size times.
            for inputs in dataset_helper:
                # if epoch < self._stop_epoch:
                list_callback.step_begin(run_context)
                if switch_branch_one:
                    cb_params.cur_step_num += loop_size
                    self._train_network.add_flags_recursive(thor=True)
                    self._train_network.phase = 'train0'
                else:
                    cb_params.cur_step_num += iter_first_order
                    self._train_network.add_flags_recursive(thor=False)
                    self._train_network.phase = 'train1'
                    if not self._has_do_dataset_init:
                        _exec_datagraph(train_dataset, iter_first_order, phase='train1_dataset')
                        self._has_do_dataset_init = True
                switch_branch_one = not switch_branch_one
                outputs = self._train_network(*inputs)
                cb_params.net_outputs = outputs
                list_callback.step_end(run_context)
                # else:
                #     iter_first_order = 5004
                #     cb_params.cur_step_num += iter_first_order
                #     self._train_network.add_flags_recursive(thor=False)
                #     self._train_network.phase = 'train2'
                #     if not has_do_dataset_init1:
                #         _exec_datagraph(train_dataset, iter_first_order, phase='train2_dataset')
                #         has_do_dataset_init1 = True
                #     outputs = self._train_network(*inputs)
                #     cb_params.net_outputs = outputs
                #     list_callback.step_end(run_context)
                #     break

            list_callback.epoch_end(run_context)
            should_stop = should_stop or run_context.get_stop_requested()
            if should_stop:
                break

        list_callback.end(run_context)
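Example 6 differs from Example 3 mainly in the module-level index_epoch counter, apparently meant to keep cur_epoch_num increasing across repeated calls to _train_dataset_sink_process (note that the counter is incremented once per epoch inside the loop, so within a single call cur_epoch_num advances by two per epoch). The intended pattern in isolation, as a toy trainer:

index_epoch = 0  # module-level: survives across calls


def train_chunk(epochs):
    """Run `epochs` epochs, numbering them contiguously across calls."""
    global index_epoch
    for i in range(epochs):
        print("epoch", index_epoch + i + 1)
    index_epoch += epochs


train_chunk(2)  # epoch 1, epoch 2
train_chunk(2)  # epoch 3, epoch 4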