def _load_process(self, executor, main_prog):
    """
    Load model parameters before training and report the training state.

    Behaviour is driven by three entries of ``self.args``:

    * ``continue_train`` and ``read_checkpoint`` both truthy: a training
      snapshot is loaded from ``load_model_path`` and the counters stored
      in it (if any) are restored.
    * otherwise, ``load_model_path`` not equal to ``""``: the parameters at
      that path are loaded as a pre-trained model.
    * otherwise nothing is loaded and training starts from scratch.

    :param executor: executor handed to the ``model_utils`` loaders
    :param main_prog: program whose parameters are loaded
    :return: dict with the keys ``total_step``, ``total_epoch`` and
        ``step_in_epoch`` (defaults 0, 1 and 0)
    """
    resume_requested = self.args["continue_train"]
    params_location = self.args["load_model_path"]
    from_checkpoint = self.args["read_checkpoint"]

    # Defaults describe a run that starts from the very beginning.
    status = {'total_step': 0, 'total_epoch': 1, 'step_in_epoch': 0}

    if not (resume_requested and from_checkpoint):
        # Not resuming from a checkpoint: optionally warm-start from
        # pre-trained parameters, counters keep their defaults.
        if params_location != "":
            model_utils.load_model_params(exe=executor,
                                          program=main_prog,
                                          params_path=params_location)
            self.logger.info(
                "Pre-trained model file in {} has been loaded".format(
                    params_location))
        return status

    snapshot_info = model_utils.load_train_snapshot(executor, main_prog,
                                                    params_location)
    self.logger.info(
        "Model file in {} has been loaded".format(params_location))
    if snapshot_info:
        # Restore whatever counters the snapshot recorded.
        status['total_step'] = snapshot_info.get("total_step", 0)
        status['step_in_epoch'] = snapshot_info.get("step_in_epoch", 0)
        status['total_epoch'] = snapshot_info.get("epoch", 1)
        self.logger.info("Load train info: {}".format(snapshot_info))
    return status
def init_model(self):
    """
    Build the prediction network and load its parameters.

    Creates ``self.exe``, defines the network inside
    ``self.predict_program`` / ``self.predict_startup``, runs the startup
    program, loads the parameters found at ``self.args["load_model_path"]``
    and, when ``self.args["use_parallel"]`` is truthy, replaces
    ``self.predict_program`` with a data-parallel compiled program.

    :return: the ``(loader, probs, qas_id)`` triple produced by
        ``classifier.create_model_for_cls_merge``
    """
    params_dir = self.args["load_model_path"]
    self.logger.info("Initializing predict model...")

    run_places = TrainEngine.get_executor_run_places(self.args)
    self.exe = fluid.Executor(run_places)

    with fluid.program_guard(self.predict_program, self.predict_startup):
        # Define the network; the call yields the data loader and the
        # output variables used at prediction time.
        network_outputs = classifier.create_model_for_cls_merge(
            args=self.args_model_build, is_prediction=True)
        loader, probs, qas_id = network_outputs
        self.logger.info("Prediction neural network created.")

    self.logger.info("Prediction neural network parameter initialized.")
    # Run the startup program, then overwrite the freshly initialised
    # values with the stored model parameters.
    self.exe.run(self.predict_startup)
    load_model_params(self.exe, params_dir, self.predict_program)

    if self.args["use_parallel"]:
        # The reduce strategy is hard-coded for now.
        strategy = fluid.BuildStrategy()
        strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
        compiled = fluid.CompiledProgram(self.predict_program)
        self.predict_program = compiled.with_data_parallel(
            places=TrainEngine.get_data_run_places(self.args),
            build_strategy=strategy)

    self.logger.info("Finish initializing predict model!")
    return loader, probs, qas_id
def _load_process(self, executor, main_prog):
    """
    Load the model parameters stored at ``self.args["load_model_path"]``.

    The load is unconditional: the path is passed to
    ``model_utils.load_model_params`` as-is and a log line is written
    afterwards.

    :param executor: executor used to load the parameters
    :param main_prog: program whose parameters are loaded
    :return: None
        NOTE(review): no training-state dict is returned here - confirm
        that callers do not expect one.
    """
    params_location = self.args["load_model_path"]
    model_utils.load_model_params(params_path=params_location,
                                  program=main_prog,
                                  exe=executor)
    message = "Pre-trained model file in {} has been loaded".format(
        params_location)
    self.logger.info(message)
def _load_process(self, executor, main_prog):
    """
    Load the model parameters stored at ``self.args["load_model_path"]``.

    Only the path entry of ``self.args`` is consulted; the parameters are
    always loaded through ``model_utils.load_model_params`` and the load is
    logged.

    :param executor: executor used to load the parameters
    :param main_prog: program whose parameters are loaded
    :return: None
        NOTE(review): no training-state dict is produced by this variant -
        confirm that callers do not expect one.
    """
    source_path = self.args["load_model_path"]
    model_utils.load_model_params(params_path=source_path,
                                  program=main_prog,
                                  exe=executor)
    loaded_msg = "Pre-trained model file in {} has been loaded".format(
        source_path)
    self.logger.info(loaded_msg)
    return None
def train(self):
    """
    Initialise the training program and run a single training pass.

    Steps: create an executor, run ``self.train_startup_prog``, optionally
    load the parameters found at ``self.args["load_model_path"]`` (skipped
    when the path is ``""``) and then call the private training iterable
    once with all counters set to zero.

    Only ``load_model_path`` is read from ``self.args``; no epoch loop,
    validation, snapshotting or early stopping happens in this variant.

    :return: None
    """
    model_path = self.args["load_model_path"]

    # Create the executor and initialise the parameters.
    executor = fluid.Executor(self.get_executor_run_places(self.args))
    executor.run(self.train_startup_prog)

    if model_path != "":
        # Warm-start from the stored parameters.
        model_utils.load_model_params(exe=executor,
                                      program=self.origin_train_prog,
                                      params_path=model_path)
        self.logger.info(
            "Pre-trained model file in {} has been loaded".format(
                model_path))

    self.logger.info("Ready to train the model.Executing...")
    # Arguments: executor, total step, epoch id, step within the epoch.
    self.__run_train_iterable(executor, 0, 0, 0)
def train(self):
    """
    Run the training loop, saving a snapshot after every epoch.

    Values read from ``self.args``: ``app_name``, ``max_epoch``,
    ``continue_train`` and, depending on the mode, ``load_model_path``
    (resume) or ``pretrained_model_path`` (fresh run).

    * ``continue_train`` truthy: a training snapshot is loaded from
      ``load_model_path`` and the ``total_step``, ``step_in_epoch`` and
      ``epoch`` counters it contains are restored.
    * otherwise, if ``pretrained_model_path`` is not ``""``, those
      parameters are loaded as a warm start.

    ``max_epoch`` epochs are then executed. Epoch ids start at the restored
    epoch and increase by one per iteration, so snapshot names
    (``<app_name>_epoch<id>``) never collide after a resume. A final
    snapshot named ``<app_name>`` is written when the loop ends.

    :return: None
    """
    app_name = self.args["app_name"]
    max_epoch = self.args["max_epoch"]
    resume = self.args["continue_train"]

    # Create the executor and initialise the parameters.
    executor = fluid.Executor(self.get_executor_run_places(self.args))
    executor.run(self.train_startup_prog)

    total_step = 0
    step_in_epoch = 0
    total_epoch = 0

    if resume:
        model_path = self.args["load_model_path"]
        info = model_utils.load_train_snapshot(executor,
                                               self.origin_train_prog,
                                               model_path)
        self.logger.info(
            "Model file in {} has been loaded".format(model_path))
        if info:
            # Restore the counters (this must be an assignment; a
            # comparison would silently discard the stored step count).
            total_step = info.get("total_step", 0)
            step_in_epoch = info.get("step_in_epoch", 0)
            total_epoch = info.get("epoch", 0)
    else:
        pretrained_path = self.args["pretrained_model_path"]
        if pretrained_path != "":
            # First run with pre-trained parameters available.
            model_utils.load_model_params(exe=executor,
                                          program=self.origin_train_prog,
                                          params_path=pretrained_path)
            self.logger.info(
                "Pre-trained model file in {} has been loaded".format(
                    pretrained_path))

    self.logger.info("Ready to train the model.Executing...")

    # Run max_epoch epochs, numbering them from the restored epoch onwards
    # so that every iteration carries the resume offset.
    for epoch_id in range(total_epoch, total_epoch + max_epoch):
        total_step, loss = self.__run_train_iterable(
            executor, total_step, epoch_id, step_in_epoch)
        # Only the first epoch after a resume starts mid-epoch.
        step_in_epoch = 0
        self.logger.info(
            'Epoch {epoch} done, train mean loss is {loss}'.format(
                epoch=epoch_id, loss=loss))

        # Persist the state reached at the end of this epoch.
        info = {"total_step": total_step, "epoch": epoch_id}
        file_path = model_utils.save_train_snapshot(
            executor,
            self.origin_train_prog,
            file_name="{}_epoch{}".format(app_name, epoch_id),
            train_info=info)
        self.logger.info(
            "Snapshot of training process has been saved as folder {}".
            format(file_path))

    # Save the final model.
    file_path = model_utils.save_train_snapshot(executor,
                                                self.origin_train_prog,
                                                app_name)
    self.logger.info(
        "Training process completed. model saved in {}".format(file_path))
def train(self):
    """
    Run the full training loop with validation, snapshots and optional
    early stopping.

    Configuration comes from ``self.args``. When ``continue_train`` and
    ``read_checkpoint`` are both truthy, a training snapshot is loaded from
    ``load_model_path`` and its counters are restored; otherwise, a
    non-empty ``load_model_path`` is loaded as pre-trained parameters.

    Epochs run from the restored epoch id up to (not including)
    ``max_epoch``. After each epoch the model is validated, a snapshot
    named ``<app_name>_epoch<id>`` is saved and, if ``early_stopping`` is
    truthy, the early-stopping strategy is consulted with the negated
    validation accuracy. A final snapshot named ``<app_name>`` is written
    once the loop ends.

    :return: None
    """
    app_name = self.args["app_name"]
    max_epoch = self.args["max_epoch"]
    # Read for parity with the configuration contract; not used below.
    _snapshot_frequency = self.args["snapshot_frequency"]
    use_early_stopping = self.args["early_stopping"]
    if use_early_stopping:
        stop_threshold = self.args["early_stopping_threshold"]
        stop_patience = self.args["early_stopping_stand_times"]
    resume_requested = self.args["continue_train"]
    params_location = self.args["load_model_path"]
    from_checkpoint = self.args["read_checkpoint"]

    # Create the executor and initialise the parameters.
    run_places = self.get_executor_run_places(self.args)
    executor = fluid.Executor(run_places)
    executor.run(self.train_startup_prog)

    steps_done, resume_step, first_epoch = 0, 0, 0

    if resume_requested and from_checkpoint:
        snapshot_info = model_utils.load_train_snapshot(
            executor, self.origin_train_prog, params_location)
        self.logger.info(
            "Model file in {} has been loaded".format(params_location))
        if snapshot_info:
            steps_done = snapshot_info.get("total_step", 0)
            resume_step = snapshot_info.get("step_in_epoch", 0)
            first_epoch = snapshot_info.get("epoch", 0)
            self.logger.info("Load train info: {}".format(snapshot_info))
    elif params_location != "":
        # Fresh run that warm-starts from pre-trained parameters.
        model_utils.load_model_params(exe=executor,
                                      program=self.origin_train_prog,
                                      params_path=params_location)
        self.logger.info(
            "Pre-trained model file in {} has been loaded".format(
                params_location))

    self.logger.info("Ready to train the model.Executing...")

    for epoch_id in range(first_epoch, max_epoch):
        # One full pass over the training data.
        steps_done, mean_loss = self.__run_train_iterable(
            executor, steps_done, epoch_id, resume_step)
        # Later epochs always start from their first step.
        resume_step = 0
        self.logger.info(
            'Epoch {epoch} done, train mean loss is {loss}'.format(
                epoch=epoch_id, loss=mean_loss))

        # Evaluate on the validation set; only the accuracy is used.
        _, valid_acc = self.__valid(executor)
        self.logger.info(' Epoch {epoch} Validated'.format(epoch=epoch_id))

        # Persist the state reached at the end of this epoch.
        snapshot_name = "{}_epoch{}".format(app_name, epoch_id)
        epoch_state = {"total_step": steps_done, "epoch": epoch_id}
        saved_to = model_utils.save_train_snapshot(
            executor,
            self.origin_train_prog,
            file_name=snapshot_name,
            train_info=epoch_state)
        self.logger.info(
            "Snapshot of training process has been saved as folder {}".
            format(saved_to))

        # The strategy receives the negated accuracy as its score.
        if use_early_stopping and self.early_stopping_strategy(
                -valid_acc,
                threshold=stop_threshold,
                standstill_step=stop_patience):
            self.logger.info(
                "Performance improvement stalled, ending the training process"
            )
            break

    # Save the final model.
    saved_to = model_utils.save_train_snapshot(executor,
                                               self.origin_train_prog,
                                               app_name)
    self.logger.info(
        "Training process completed. model saved in {}".format(saved_to))