def _init_train_strategy(self, train_loss):
    """Build the optimization strategy for the training program.

    :param train_loss: loss variable of the training network
    :return: tuple of (optimizer, learning_rate)
    """
    lr = lr_strategy.get_strategy(self.args)
    opt = model.optimizer.get_optimizer(lr, self.args)

    if self.args['regularization'] != "L2":
        # No manual regularization requested: plain minimize.
        opt.minimize(train_loss)
        return opt, lr

    # Manual L2 weight decay: snapshot every parameter's value BEFORE the
    # optimizer update so the decay term uses pre-update weights.
    snapshots = dict()
    for p in self.train_main_prog.global_block().all_parameters():
        frozen = p * 1.0
        frozen.stop_gradient = True
        snapshots[p.name] = frozen

    _, param_grads = opt.minimize(train_loss)

    coeff = self.args['regularization_coeff']
    if coeff > 0:
        for p, g in param_grads:
            # Some parameters (e.g. biases / layer norm) are excluded from decay.
            if self._exclude_from_weight_decay(p.name):
                continue
            with p.block.program._optimized_guard([p, g]), \
                    fluid.framework.name_scope("weight_decay"):
                decayed = p - snapshots[p.name] * coeff * lr
                fluid.layers.assign(output=p, input=decayed)

    return opt, lr
def __init__(self, pretrain_data_reader, pretrain_vocab_size, args, logger):
    """Initialize the pretraining process.

    Builds the training program (model, optimizer, data loader) and, when
    ``use_parallel`` is enabled, compiles it for data-parallel execution.

    :param pretrain_data_reader: batch generator feeding the train data loader
    :param pretrain_vocab_size: vocabulary size passed to the pretrain model
    :param args: configuration object (provides TRAIN / MODEL_BUILD sections)
    :param logger: logger instance used for progress messages
    """
    self.args = args.get_config(args.TRAIN)
    self.logger = logger
    ''' Build the training program '''
    self.logger.info("Initializing training process...")
    self.train_main_prog = fluid.Program()
    self.train_startup_prog = fluid.Program()
    with fluid.program_guard(self.train_main_prog, self.train_startup_prog):
        # fluid.unique_name.guard() keeps parameter names deterministic so the
        # test program can share parameters with this one.
        with fluid.unique_name.guard():
            self.logger.info("Initializing training neural network...")
            # Network definition for the pretrain task.
            train_data_loader, qa_acc, mean_mask_lm_loss, loss = \
                classifier.create_model_for_pretrain(
                    args.get_config(args.MODEL_BUILD),
                    vocab_size=pretrain_vocab_size)
            self.logger.info("Training neural network initialized.")
            # Set up the training strategy (learning rate schedule + optimizer).
            self.logger.info("Setting training strategy...")
            learning_rate = lr_strategy.get_strategy(self.args)
            optimizer = model.optimizer.get_optimizer(
                learning_rate, self.args)
            optimizer.minimize(loss)
            self.logger.info("Training strategy has been set.")
            # Attach the dataset to the training data loader.
            train_data_loader.set_batch_generator(
                pretrain_data_reader,
                places=self.get_data_run_places(self.args))
    self.train_data_loader = train_data_loader
    self.optimizer = optimizer
    self.train_loss = loss
    self.qa_acc = qa_acc
    self.logger.info("Training process initialized.")
    ''' Parallelize the process '''
    USE_PARALLEL = self.args["use_parallel"]
    # Keep the original program: a CompiledProgram cannot be saved.
    self.origin_train_prog = self.train_main_prog
    if USE_PARALLEL:
        self.logger.info("Initialize parallel processes...")
        # Parallel-training strategy. Could be made configurable, but that
        # touches many places, so it is hard-coded for now.
        build_strategy = fluid.BuildStrategy()
        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
        # Compile the training program for data-parallel execution.
        self.train_main_prog = fluid.CompiledProgram(
            self.train_main_prog).with_data_parallel(
                loss_name=self.train_loss.name,
                places=self.get_data_run_places(self.args),
                build_strategy=build_strategy)
        # BUG FIX: the previous version also compiled self.valid_main_prog
        # here, but this constructor never builds a validation program, so
        # that attribute does not exist and use_parallel=True crashed with
        # AttributeError. The dangling compile step has been removed.
        self.logger.info("Parallel processes initialized.")
def __init__(self, train_data_reader, valid_data_reader, args, logger):
    """Initialize the training process.

    Builds a training program and a validation program that share parameters
    (via ``fluid.unique_name.guard``), wires both data readers in, and — when
    ``use_parallel`` is enabled — compiles both programs for data-parallel
    execution.

    :param train_data_reader: sample-list generator for the training loader
    :param valid_data_reader: sample-list generator for the validation loader
    :param args: configuration object (provides TRAIN / MODEL_BUILD sections)
    :param logger: logger instance used for progress messages
    """
    self.args = args.get_config(args.TRAIN)
    self.logger = logger
    ''' 创建训练过程 '''
    # --- Build the training program ---
    self.logger.info("Initializing training process...")
    self.train_main_prog = fluid.Program()
    self.train_startup_prog = fluid.Program()
    with fluid.program_guard(self.train_main_prog, self.train_startup_prog):
        # fluid.unique_name.guard() keeps parameter names deterministic so the
        # validation program below shares parameters with this one.
        with fluid.unique_name.guard():
            self.logger.info("Initializing training neural network...")
            # train_data_loader, train_loss = network(self.args, train=True)
            # Network definition.
            output = classifier.create_model_for_cls_merge(
                args.get_config(args.MODEL_BUILD), is_prediction=False)
            train_data_loader = output[0]
            train_loss = output[1]
            self.logger.info("Training neural network initialized.")
            # Set up the training strategy (learning rate schedule + optimizer).
            self.logger.info("Setting training strategy...")
            learning_rate = lr_strategy.get_strategy(self.args)
            optimizer = model.optimizer.get_optimizer(
                learning_rate, self.args)
            if self.args['regularization'] == "L2":
                # Manual L2 weight decay: snapshot parameter values BEFORE
                # minimize() so decay uses pre-update weights.
                param_list = dict()
                for param in self.train_main_prog.global_block(
                ).all_parameters():
                    param_list[param.name] = param * 1.0
                    param_list[param.name].stop_gradient = True
                _, param_grads = optimizer.minimize(train_loss)
                if self.args['regularization_coeff'] > 0:
                    for param, grad in param_grads:
                        # Some parameters (e.g. biases) are excluded from decay.
                        if self._exclude_from_weight_decay(param.name):
                            continue
                        with param.block.program._optimized_guard([
                            param, grad
                        ]), fluid.framework.name_scope("weight_decay"):
                            updated_param = param - param_list[
                                param.name] * self.args[
                                'regularization_coeff'] * learning_rate
                            fluid.layers.assign(output=param,
                                                input=updated_param)
            else:
                optimizer.minimize(train_loss)
            self.logger.info("Training strategy has been set.")
            # Attach the dataset to the training data loader.
            train_data_loader.set_sample_list_generator(
                train_data_reader,
                places=self.get_data_run_places(self.args))
    self.train_data_loader = train_data_loader
    self.optimizer = optimizer
    self.train_loss = train_loss
    self.logger.info("Training process initialized.")
    ''' 创建验证过程 '''
    # --- Build the validation program ---
    self.logger.info("Initializing validation process...")
    self.valid_main_prog = fluid.Program()
    self.valid_startup_prog = fluid.Program()
    with fluid.program_guard(self.valid_main_prog, self.valid_startup_prog):
        # fluid.unique_name.guard() shares parameters with the train program.
        with fluid.unique_name.guard():
            self.logger.info("Initializing validation neural network...")
            # valid_data_loader, valid_loss = network(self.args, train=False)
            # Network definition (same builder as training; no optimizer here).
            valid_data_loader, valid_loss, _, accuracy, _ = classifier.create_model_for_cls_merge(
                args.get_config(args.MODEL_BUILD), is_prediction=False)
            self.logger.info("Validation neural network initialized.")
            # Attach the dataset to the validation data loader.
            valid_data_loader.set_sample_list_generator(
                valid_data_reader,
                places=self.get_data_run_places(self.args))
    self.valid_data_loader = valid_data_loader
    self.valid_loss = valid_loss
    self.valid_accuracy = accuracy
    # Training-state bookkeeping (used to detect validation-loss stagnation).
    self.pre_epoch_valid_loss = float("inf")
    self.standstill_count = 0
    self.logger.info("Validation process initialized.")
    ''' 过程并行化 '''
    # --- Parallelize the process ---
    USE_PARALLEL = self.args["use_parallel"]
    # Keep the original program: a CompiledProgram cannot be saved.
    self.origin_train_prog = self.train_main_prog
    if USE_PARALLEL:
        self.logger.info("Initialize parallel processes...")
        # Parallel-training strategy. Could be made configurable, but that
        # touches many places, so it is hard-coded for now.
        build_strategy = fluid.BuildStrategy()
        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
        # Compile train and valid programs; the valid program shares variables
        # with the (already compiled) train program.
        self.train_main_prog = fluid.CompiledProgram(
            self.train_main_prog).with_data_parallel(
                loss_name=self.train_loss.name,
                places=self.get_data_run_places(self.args),
                build_strategy=build_strategy)
        self.valid_main_prog = fluid.CompiledProgram(
            self.valid_main_prog).with_data_parallel(
                share_vars_from=self.train_main_prog,
                places=self.get_data_run_places(self.args),
                build_strategy=build_strategy)
        self.logger.info("Parallel processes initialized.")