    def _init_train_strategy(self, train_loss):
        """
        Define the optimization strategy for training.
        :param train_loss: the loss tensor of the model
        :return: the optimizer and the learning-rate variable
        """
        learning_rate = lr_strategy.get_strategy(self.args)
        optimizer = model.optimizer.get_optimizer(learning_rate, self.args)

        if self.args['regularization'] == "L2":
            # L2正则
            param_list = dict()
            for param in self.train_main_prog.global_block().all_parameters():
                param_list[param.name] = param * 1.0
                param_list[param.name].stop_gradient = True

            _, param_grads = optimizer.minimize(train_loss)

            if self.args['regularization_coeff'] > 0:
                for param, grad in param_grads:
                    if self._exclude_from_weight_decay(param.name):
                        continue
                    with param.block.program._optimized_guard(
                            [param, grad]), fluid.framework.name_scope("weight_decay"):
                        updated_param = param - param_list[
                            param.name] * self.args['regularization_coeff'] * learning_rate
                        fluid.layers.assign(output=param, input=updated_param)
        else:
            optimizer.minimize(train_loss)
        return optimizer, learning_rate
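
    # `_exclude_from_weight_decay` is called above but not shown in this
    # section. A minimal sketch, assuming the common BERT/ERNIE convention of
    # exempting bias and layer-norm parameters from decay; the exact name
    # patterns are assumptions and must match the real parameter names.
    def _exclude_from_weight_decay(self, param_name):
        """Return True if the parameter should not receive L2 weight decay."""
        if "layer_norm" in param_name:
            return True
        # Bias parameters are conventionally excluded as well.
        return param_name.endswith("_bias") or param_name.endswith("_b_0")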
    def __init__(self, pretrain_data_reader, pretrain_vocab_size, args,
                 logger):
        """

        """
        self.args = args.get_config(args.TRAIN)
        self.logger = logger
        '''
        Build the training process
        '''
        self.logger.info("Initializing training process...")
        self.train_main_prog = fluid.Program()
        self.train_startup_prog = fluid.Program()
        with fluid.program_guard(self.train_main_prog,
                                 self.train_startup_prog):
            # Use fluid.unique_name.guard() to share parameters with the test program
            with fluid.unique_name.guard():
                self.logger.info("Initializing training neural network...")
                # train_data_loader, train_loss = network(self.args, train=True)  # generic network definition
                train_data_loader, qa_acc, mean_mask_lm_loss, loss = \
                    classifier.create_model_for_pretrain(args.get_config(args.MODEL_BUILD),
                                                         vocab_size=pretrain_vocab_size)

                self.logger.info("Training neural network initialized.")
                # Set up the training strategy
                self.logger.info("Setting training strategy...")
                learning_rate = lr_strategy.get_strategy(self.args)
                optimizer = model.optimizer.get_optimizer(
                    learning_rate, self.args)
                optimizer.minimize(loss)
                self.logger.info("Training strategy has been set.")

        # Attach the dataset to the training process
        train_data_loader.set_batch_generator(pretrain_data_reader,
                                              places=self.get_data_run_places(
                                                  self.args))
        self.train_data_loader = train_data_loader
        self.optimizer = optimizer
        self.train_loss = loss
        self.qa_acc = qa_acc
        self.logger.info("Training process initialized.")
        '''
        Parallelize the programs
        '''
        USE_PARALLEL = self.args["use_parallel"]

        # Back up the original program, since the CompiledProgram cannot be saved
        self.origin_train_prog = self.train_main_prog
        if USE_PARALLEL:
            self.logger.info("Initialize parallel processes...")
            # Configure the parallel-training strategy.
            # This could be driven by configuration, but that would require
            # many changes, so it is hard-coded for now.
            build_strategy = fluid.BuildStrategy()
            build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
            # Build the parallel program
            self.train_main_prog = fluid.CompiledProgram(
                self.train_main_prog).with_data_parallel(
                    loss_name=self.train_loss.name,
                    places=self.get_data_run_places(self.args),
                    build_strategy=build_strategy)
            self.logger.info("Parallel processes initialized.")

    def __init__(self, train_data_reader, valid_data_reader, args, logger):
        """
        Initialize the training process.
        :param train_data_reader: sample generator for the training set
        :param valid_data_reader: sample generator for the validation set
        :param args: configuration object
        :param logger: logger instance
        """
        self.args = args.get_config(args.TRAIN)
        self.logger = logger
        '''
        Build the training process
        '''
        self.logger.info("Initializing training process...")
        self.train_main_prog = fluid.Program()
        self.train_startup_prog = fluid.Program()
        with fluid.program_guard(self.train_main_prog,
                                 self.train_startup_prog):
            # Use fluid.unique_name.guard() to share parameters with the test program
            with fluid.unique_name.guard():
                self.logger.info("Initializing training neural network...")
                # train_data_loader, train_loss = network(self.args, train=True)  # generic network definition
                train_data_loader, train_loss = classifier.create_model_for_cls_merge(
                    args.get_config(args.MODEL_BUILD), is_prediction=False)[:2]
                self.logger.info("Training neural network initialized.")
                # Set up the training strategy
                self.logger.info("Setting training strategy...")

                learning_rate = lr_strategy.get_strategy(self.args)
                optimizer = model.optimizer.get_optimizer(
                    learning_rate, self.args)

                if self.args['regularization'] == "L2":
                    # L2正则
                    param_list = dict()
                    for param in self.train_main_prog.global_block(
                    ).all_parameters():
                        param_list[param.name] = param * 1.0
                        param_list[param.name].stop_gradient = True

                    _, param_grads = optimizer.minimize(train_loss)

                    if self.args['regularization_coeff'] > 0:
                        for param, grad in param_grads:
                            if self._exclude_from_weight_decay(param.name):
                                continue
                            with param.block.program._optimized_guard([
                                    param, grad
                            ]), fluid.framework.name_scope("weight_decay"):
                                updated_param = param - param_list[
                                    param.name] * self.args[
                                        'regularization_coeff'] * learning_rate
                                fluid.layers.assign(output=param,
                                                    input=updated_param)
                else:
                    optimizer.minimize(train_loss)
                self.logger.info("Training strategy has been set.")

        # Attach the dataset to the training process
        train_data_loader.set_sample_list_generator(
            train_data_reader, places=self.get_data_run_places(self.args))
        self.train_data_loader = train_data_loader
        self.optimizer = optimizer
        self.train_loss = train_loss
        self.logger.info("Training process initialized.")
        '''
        Build the validation process
        '''
        self.logger.info("Initializing validation process...")
        self.valid_main_prog = fluid.Program()
        self.valid_startup_prog = fluid.Program()
        with fluid.program_guard(self.valid_main_prog,
                                 self.valid_startup_prog):
            # Use fluid.unique_name.guard() to share parameters with the train program
            with fluid.unique_name.guard():
                self.logger.info("Initializing validation neural network...")
                # valid_data_loader, valid_loss = network(self.args, train=False)  # generic network definition
                valid_data_loader, valid_loss, _, accuracy, _ = classifier.create_model_for_cls_merge(
                    args.get_config(args.MODEL_BUILD), is_prediction=False)
                self.logger.info("Validation neural network initialized.")

        valid_data_loader.set_sample_list_generator(
            valid_data_reader, places=self.get_data_run_places(self.args))

        self.valid_data_loader = valid_data_loader
        self.valid_loss = valid_loss
        self.valid_accuracy = accuracy
        # Bookkeeping of the training state (early-stopping counters)
        self.pre_epoch_valid_loss = float("inf")
        self.standstill_count = 0
        self.logger.info("Validation process initialized.")
        '''
        Parallelize the programs
        '''
        USE_PARALLEL = self.args["use_parallel"]

        # Back up the original program, since the CompiledProgram cannot be saved
        self.origin_train_prog = self.train_main_prog
        if USE_PARALLEL:
            self.logger.info("Initialize parallel processes...")
            # Configure the parallel-training strategy.
            # This could be driven by configuration, but that would require
            # many changes, so it is hard-coded for now.
            build_strategy = fluid.BuildStrategy()
            build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
            # Build the parallel program
            self.train_main_prog = fluid.CompiledProgram(
                self.train_main_prog).with_data_parallel(
                    loss_name=self.train_loss.name,
                    places=self.get_data_run_places(self.args),
                    build_strategy=build_strategy)
            self.valid_main_prog = fluid.CompiledProgram(
                self.valid_main_prog).with_data_parallel(
                    share_vars_from=self.train_main_prog,
                    places=self.get_data_run_places(self.args),
                    build_strategy=build_strategy)
            self.logger.info("Parallel processes initialized.")