def eval_model(cls, master_gpu_id, model, eval_dataset, eval_batch_size=1, use_cuda=False, num_workers=1):
    model.eval()
    eval_dataloader = DataLoader(dataset=eval_dataset,
                                 pin_memory=use_cuda,
                                 batch_size=eval_batch_size,
                                 num_workers=num_workers,
                                 shuffle=False)

    predicted_probs = []
    true_labels = []
    batch_count = 1
    for batch in tqdm(eval_dataloader, unit="batch", ncols=100, desc="Evaluating process: "):
        labels = batch["label"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["label"]
        tokens = batch["tokens"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["tokens"]
        segment_ids = batch["segment_ids"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["segment_ids"]
        attention_mask = batch["attention_mask"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["attention_mask"]
        audio = batch["audio"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["audio"]

        with torch.no_grad():
            main_output, asr_output = model(tokens, segment_ids, attention_mask, audio)

        # Convert the model output to a list of class probabilities
        main_output = torch.softmax(main_output, dim=1).cpu().tolist()
        # Keep the positive-class probability
        prob = np.array(main_output)[:, 1]
        # Append this batch's positive-class predictions to the global list
        predicted_probs.extend(prob.tolist())
        # Append this batch's ground-truth labels to the global list
        true_labels.extend(labels.tolist())

        LoggerHelper.info("Batch: " + str(batch_count))
        batch_count += 1

    predicted_probs = [round(prob, 2) for prob in predicted_probs]
    precision, recall, _thresholds = precision_recall_curve(true_labels, predicted_probs)
    auc = roc_auc_score(true_labels, predicted_probs)
    logloss = log_loss(true_labels, predicted_probs)

    for i in range(len(_thresholds)):
        log_str_th = 'VAL => Thresholds: {0:>2}, Precision: {1:>7.2%}, Recall: {2:>7.2%}, F1: {3:>7.2%}'.format(
            _thresholds[i], precision[i], recall[i], f1_score(precision[i], recall[i]))
        LoggerHelper.info(log_str_th)
    LoggerHelper.info("AUC: " + str(auc))
    LoggerHelper.info("Logloss: " + str(logloss))

    return
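# Note: eval_model above calls f1_score(precision[i], recall[i]), which does not match
# sklearn.metrics.f1_score(y_true, y_pred). The project presumably provides its own helper that
# turns a single precision/recall pair into an F1 value; the hypothetical function below only
# sketches that assumed behaviour (harmonic mean) and is not confirmed by the source.
def f1_from_precision_recall(precision_value, recall_value):
    # Harmonic mean of one precision/recall pair; return 0 when both are 0 to avoid division by zero
    if precision_value + recall_value == 0:
        return 0.0
    return 2 * precision_value * recall_value / (precision_value + recall_value)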
def save_model(save_dir, model, epoch):
    """
    Save the model.
    :param save_dir: directory to save into
    :param model: model to save
    :param epoch: training epoch
    :return:
    """
    LoggerHelper.info("Save Model".center(60, "="))

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    model_name = 'Epoch_' + str(epoch) + '.model'
    save_path = os.path.join(save_dir, model_name)
    torch.save(model.state_dict(), save_path)

    LoggerHelper.info("Save Model Done".center(60, "="))

    return
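# Note: when prepare() wraps the model in torch.nn.DataParallel, model.state_dict() keys carry a
# 'module.' prefix, which is why the loading code elsewhere strips that prefix. The hypothetical
# helper below sketches one way to save the unwrapped weights instead; it is an assumption, not
# part of the original flow.
def save_unwrapped_state_dict(save_path, model):
    # Save the underlying module's weights so checkpoints load the same way
    # whether or not DataParallel was used
    state_dict = model.module.state_dict() if hasattr(model, 'module') else model.state_dict()
    torch.save(state_dict, save_path)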
def main(data):
    """
    Online main flow for the model service.
    :param data: request data used to build the prediction dataset
    :return: prediction results
    """
    print(data)

    ################
    #  Prediction  #
    ################
    LoggerHelper.info("Predicting".center(60, "="))

    dataset = dataset_builder(data, config)

    if function == 'probability':
        predict_results = predict.predict(master_gpu_id,
                                          model,
                                          dataset,
                                          config["predict_batch_size"],
                                          config["use_cuda"],
                                          config["predict_num_workers"])
        LoggerHelper.info("Predict Result: " + str(predict_results))
    elif function == 'score':
        predict_results = None
    else:
        predict_results = None

    LoggerHelper.info("Predicting Done".center(60, "="))

    return predict_results
def predict(master_gpu_id, model, predict_dataset, predict_batch_size=1, use_cuda=False, num_workers=1):
    """
    :param master_gpu_id:
    :param model:
    :param predict_dataset:
    :param predict_batch_size:
    :param use_cuda:
    :param num_workers:
    :return:
    """
    LoggerHelper.info("Start Predicting".center(60, "="))

    # Put the model into evaluation mode
    model.eval()

    # Build the prediction data loader
    predict_loader = DataLoader(dataset=predict_dataset,
                                pin_memory=use_cuda,
                                batch_size=predict_batch_size,
                                num_workers=num_workers,
                                shuffle=False)

    # Initialize the list of prediction results
    predicted_result_list = list()

    # Iterate over every batch of the prediction dataset
    current_batch_index = 0
    for batch in predict_loader:
        LoggerHelper.info("Batch: " + str(current_batch_index))
        current_batch_index += 1

        # tokens = batch["tokens"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["tokens"]
        # segment_ids = batch["segment_ids"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["segment_ids"]
        # attention_mask = batch["attention_mask"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["attention_mask"]
        # labels = batch["label"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["label"]
        #
        # # Get the model output
        # with torch.no_grad():
        #     _, logit = model(tokens,
        #                      token_type_ids=None,
        #                      attention_mask=attention_mask,
        #                      labels=labels)
        #
        # # Convert the model output to a list of class probabilities
        # logit = torch.softmax(logit, dim=1).cpu().tolist()
        # # Keep the positive-class probability
        # logit = np.array(logit)[:, 1]
        # predicted_result_list.extend(logit.tolist())

    LoggerHelper.info("Predicting Ends".center(60, "="))

    return predicted_result_list
def train_model(cls, master_gpu_id, model, optimizer, scheduler, data_loader, gradient_accumulation_steps, use_cuda):
    """
    :param master_gpu_id:
    :param model:
    :param optimizer:
    :param scheduler:
    :param data_loader:
    :param gradient_accumulation_steps:
    :param use_cuda:
    :return:
    """
    model.train()
    loss_criterion = nn.CrossEntropyLoss()

    total_loss = 0.0
    correct_sum = 0
    num_batch = data_loader.__len__()
    num_sample = data_loader.dataset.__len__()

    # for step, batch in enumerate(tqdm(data_loader, unit="batch", ncols=100, desc="Training process: ")):
    for step, batch in enumerate(data_loader):
        start_t = time.time()

        # Fetch the batch tensors and move them to the master GPU when CUDA is enabled
        labels = batch['label'].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch['label']
        tokens = batch['tokens'].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch['tokens']
        segment_ids = batch['segment_ids'].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch['segment_ids']
        attention_mask = batch["attention_mask"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["attention_mask"]

        # Get the model output
        output = model(tokens, segment_ids, attention_mask)

        # Compute the loss
        loss = loss_criterion(output, labels)
        if gradient_accumulation_steps > 1:
            loss /= gradient_accumulation_steps

        # Backpropagation
        loss.backward()

        if (step + 1) % gradient_accumulation_steps == 0:
            # Update the parameters
            optimizer.step()
            # Clear the gradients
            model.zero_grad()
            scheduler.step()

        loss_value = loss.item()
        _, top_index = output.topk(1)
        top_index = top_index.cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else top_index
        labels = labels.cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else labels
        correct_sum += (top_index.view(-1) == labels).sum().item()
        total_loss += loss_value

        cost_t = time.time() - start_t
        LoggerHelper.info("step: {}\tloss: {:.2f}\ttime(s): {:.2f}".format(step, loss, cost_t))

    LoggerHelper.info("Total Training Samples: " + str(num_sample))
    LoggerHelper.info("Correct Prediction: " + str(correct_sum))
    LoggerHelper.info("Error Rate: " + format(1 - (correct_sum / num_sample), "0.4f"))

    return total_loss / num_batch
def prepare(function, config_name, gpu_ids, saved_model, pretrain_model=''):
    """
    Model preparation: performs the setup needed for training, evaluation and prediction.
    :param function: name of the function to run
    :param config_name: configuration name
    :param gpu_ids: comma-separated string of GPU IDs
    :param saved_model: path of the saved model to load
    :param pretrain_model: optional pretrained model directory override
    :return:
    """
    ################
    #    Config    #
    ################
    # Load the config file matching the requested function
    config_file_path = os.path.join(base_dir, 'config') + '/' + config_name + '.' + function + '.yaml'
    with open(config_file_path, 'r') as conf_file:
        config = yaml.load(conf_file, Loader=yaml.FullLoader)

    # Set the model save path
    config['save_path'] = os.path.join(base_dir, 'saved_model', config['instance_name'])

    # Override the pretrained model directory if one was given
    if pretrain_model != '':
        config["pretrain_model_dir"] = pretrain_model

    ################
    #    Logging   #
    ################
    # Build the log file path
    log_file_path = os.path.join(base_dir, 'log', config['instance_name'] + '_' + str(function) + '.log')
    # Configure the logging system
    logger_config(log_file_name=log_file_path,
                  log_level=config['log_level'],
                  need_loghead=False,
                  timed_rotating=True)
    LoggerHelper.info("Loading HyperParameters".center(60, "="))
    LoggerHelper.info(config)
    LoggerHelper.info("Load HyperParameters Done".center(60, "="))

    ################
    #  GPU config  #
    ################
    # Whether to use the GPU, taken from the config file
    use_cuda = config["use_cuda"]
    # Master GPU ID, None by default
    master_gpu_id = None
    # GPU ID list, None by default
    gpu_id_list = None
    if gpu_ids:
        if len(gpu_ids) == 1:
            master_gpu_id = int(gpu_ids)
        else:
            gpu_id_list = [int(gpu_id) for gpu_id in gpu_ids.split(",")]
            master_gpu_id = gpu_id_list[0]

    ################
    #     Model    #
    ################
    # Initialize the model
    if function == 'probability':
        model = BertForSequenceClassification.from_pretrained(
            config["pretrain_model_dir"],
            num_labels=config["num_labels"],
            output_attentions=False,  # whether the model returns attention weights
            # output_hidden_states = False,  # whether the model returns all hidden states
        )
    elif function == 'score':
        model = None

    # Decide whether to use the GPU
    if use_cuda:
        # Check whether a master GPU ID was set
        if master_gpu_id is not None:
            # Load an existing model if one was given
            if saved_model:
                LoggerHelper.info("Loading Saved Model".center(60, "="))
                LoggerHelper.info("Load saved model from: " + saved_model)
                model.load_state_dict(torch.load(saved_model))
                LoggerHelper.info("Loading Saved Model Done".center(60, "="))

            LoggerHelper.info("GPU training or evaluating.")
            model = model.cuda(int(master_gpu_id))

            # Check whether multiple GPUs should be used
            if gpu_id_list is not None:
                LoggerHelper.info("Multiple GPU training or evaluating.")
                model = torch.nn.DataParallel(model, device_ids=gpu_id_list)
            else:
                LoggerHelper.info("Single GPU training or evaluating.")
    else:
        # Load an existing model onto the CPU if one was given
        if saved_model:
            LoggerHelper.info("Loading Saved Model".center(60, "="))
            LoggerHelper.info("Load saved model from: " + saved_model)
            model.load_state_dict(torch.load(saved_model, map_location='cpu'))
            LoggerHelper.info("Loading Saved Model Done".center(60, "="))

    return model, config, master_gpu_id
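# main(data) above relies on module-level globals (function, model, config, master_gpu_id) that are
# presumably initialized once when the service starts by calling prepare(). The hypothetical
# init_service below only sketches that assumed wiring; the name and argument values are
# illustrative and not part of the original code.
def init_service(config_name, gpu_ids, saved_model):
    global function, model, config, master_gpu_id
    function = 'probability'
    model, config, master_gpu_id = prepare(function, config_name, gpu_ids, saved_model)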
def train_model(cls, master_gpu_id, model, optimizer, scheduler, data_loader, gradient_accumulation_steps, use_cuda):
    model.train()
    loss_criterion = nn.CrossEntropyLoss()

    total_loss = 0.0
    correct_sum = 0
    num_batch = data_loader.__len__()
    num_sample = data_loader.dataset.__len__()

    # for step, batch in enumerate(tqdm(data_loader, unit="batch", ncols=100, desc="Training process: ")):
    for step, batch in enumerate(data_loader):
        start_t = time.time()

        labels = batch["label"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["label"]
        asr_label = batch["asr_label"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["asr_label"]
        tokens = batch['tokens'].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch['tokens']
        segment_ids = batch['segment_ids'].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch['segment_ids']
        attention_mask = batch["attention_mask"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["attention_mask"]
        audio = batch["audio"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["audio"]

        main_output, asr_output = model(tokens, segment_ids, attention_mask, audio)

        main_loss = loss_criterion(main_output, labels)
        asr_loss = loss_criterion(asr_output, asr_label)
        overall_loss = main_loss + asr_loss
        if gradient_accumulation_steps > 1:
            overall_loss /= gradient_accumulation_steps

        overall_loss.backward()

        if (step + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            model.zero_grad()
            scheduler.step()

        loss_value = overall_loss.item()
        _, top_index = main_output.topk(1)
        top_index = top_index.cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else top_index
        labels = labels.cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else labels
        correct_sum += (top_index.view(-1) == labels).sum().item()
        total_loss += loss_value

        cost_t = time.time() - start_t
        LoggerHelper.info("step: {}\tloss: {:.2f}\ttime(s): {:.2f}".format(step, overall_loss, cost_t))

    LoggerHelper.info("Total Training Samples: " + str(num_sample))
    LoggerHelper.info("Correct Prediction: " + str(correct_sum))
    LoggerHelper.info("Error Rate: " + format(1 - (correct_sum / num_sample), "0.4f"))

    return total_loss / num_batch
def main(runtime_config):
    """
    Offline main flow for the model environment.
    :param runtime_config: runtime configuration (function, task_type, mode, config_name, gpu_ids, saved_model)
    :return:
    """
    ################
    # Runtime args #
    ################
    # Read the runtime arguments
    # function      the function to run
    # task_type     the task type
    # mode          the run mode
    # config_name   the configuration name
    # gpu_ids       the GPU ID configuration
    # saved_model   path to the saved model parameters
    function = runtime_config.function
    task_type = runtime_config.task_type
    mode = runtime_config.mode
    config_name = runtime_config.config_name
    gpu_ids = runtime_config.gpu_ids
    saved_model = runtime_config.saved_model

    saved_model_list = list()
    if saved_model is not None:
        if os.path.isdir(saved_model):
            model_file_list = os.listdir(saved_model)
            for model_file in model_file_list:
                if 'model' in model_file:
                    saved_model_list.append(os.path.join(saved_model, model_file))
        else:
            saved_model_list.append(saved_model)
    else:
        saved_model_list.append(None)

    for saved_model in saved_model_list:
        # Run the model preparation step to obtain the config and the model
        model, dataset, config, master_gpu_id, optimizer, scheduler = prepare(
            function, task_type, config_name, gpu_ids, saved_model)

        # Enter the flow matching the run mode
        if mode == 'train':
            ################
            #   Training   #
            ################
            LoggerHelper.info("Training".center(60, "="))

            if len(saved_model_list) > 1:
                LoggerHelper.error("Multiple saved models given; a single initial model is required".center(60, "="))
                return

            if dataset[0] is not None:
                # Call the training function matching the requested function
                if function == 'probability':
                    if task_type == 'single_model_text_gate_merge':
                        from model_core.src.train_gate_merge import train_model
                        train_model(task_type,
                                    config["save_path"],
                                    master_gpu_id,
                                    model,
                                    optimizer,
                                    scheduler,
                                    config["epochs"],
                                    dataset[0],
                                    batch_size=config['train_batch_size'],
                                    gradient_accumulation_steps=config["gradient_accumulation_steps"],
                                    use_cuda=config['use_cuda'],
                                    num_workers=config['train_num_workers'],
                                    shuffle=config['train_shuffle'])
                    elif task_type == 'single_model_text_no_pretrain_embedding_share_and_gate_merge':
                        from model_core.src.train_gate_merge_text_no_pretrain import train_model
                        train_model(task_type,
                                    config["save_path"],
                                    master_gpu_id,
                                    model,
                                    optimizer,
                                    scheduler,
                                    config["epochs"],
                                    dataset[0],
                                    batch_size=config['train_batch_size'],
                                    gradient_accumulation_steps=config["gradient_accumulation_steps"],
                                    use_cuda=config['use_cuda'],
                                    num_workers=config['train_num_workers'],
                                    shuffle=config['train_shuffle'])
                    elif task_type == 'single_model_audio_lstm_text_gate_merge':
                        from model_core.src.train_audio_gate_merge import train_model
                        train_model(task_type,
                                    config["save_path"],
                                    master_gpu_id,
                                    model,
                                    optimizer,
                                    scheduler,
                                    config["epochs"],
                                    dataset[0],
                                    batch_size=config['train_batch_size'],
                                    gradient_accumulation_steps=config["gradient_accumulation_steps"],
                                    use_cuda=config['use_cuda'],
                                    num_workers=config['train_num_workers'],
                                    shuffle=config['train_shuffle'])
                    else:
                        train.train_model(task_type,
                                          config["save_path"],
                                          master_gpu_id,
                                          model,
                                          optimizer,
                                          scheduler,
                                          config["epochs"],
                                          dataset[0],
                                          batch_size=config['train_batch_size'],
                                          gradient_accumulation_steps=config["gradient_accumulation_steps"],
                                          use_cuda=config['use_cuda'],
                                          num_workers=config['train_num_workers'],
                                          shuffle=config['train_shuffle'])
                elif function == 'score':
                    pass

            LoggerHelper.info("Training Done".center(60, "="))
        elif mode == 'eval':
            ################
            #  Evaluation  #
            ################
            LoggerHelper.info("Evaluating".center(60, "="))

            if dataset[1] is not None:
                print(len(dataset[1]))
                if function == 'probability':
                    eval.eval_model(task_type,
                                    master_gpu_id,
                                    model,
                                    dataset[1],
                                    config["eval_batch_size"],
                                    config["use_cuda"],
                                    config["eval_num_workers"])
                elif function == 'score':
                    pass

            LoggerHelper.info("Evaluating Done".center(60, "="))
        elif mode == 'test':
            pass
        elif mode == 'predict':
            ################
            #  Prediction  #
            ################
            LoggerHelper.info("Predicting".center(60, "="))

            if len(saved_model_list) > 1:
                LoggerHelper.error("Multiple saved models given; a single initial model is required".center(60, "="))
                return

            if dataset[2] is not None and config['predict_result_save_path']:
                if function == 'probability':
                    predict_results = predict.predict(master_gpu_id,
                                                      model,
                                                      dataset[2],
                                                      config["predict_batch_size"],
                                                      config["use_cuda"],
                                                      config["predict_num_workers"])
                elif function == 'score':
                    predict_results = None
                else:
                    predict_results = None

                predict_result_list = np.array(predict_results)
                result = pd.DataFrame(predict_result_list)
                result.to_csv(config['predict_result_save_path'], index=False, header=False)

            LoggerHelper.info("Predicting Done".center(60, "="))

    # All processing finished
    LoggerHelper.info("All process finished.".center(60, "="))
def train_model(cls, master_gpu_id, model, optimizer, scheduler, data_loader, gradient_accumulation_steps, use_cuda):
    """
    :param master_gpu_id:
    :param model:
    :param optimizer:
    :param scheduler:
    :param data_loader:
    :param gradient_accumulation_steps:
    :param use_cuda:
    :return:
    """
    model.train()
    loss_criterion = nn.CrossEntropyLoss()
    loss_loss_criterion = nn.L1Loss()

    total_loss = 0.0
    correct_sum = 0
    num_batch = data_loader.__len__()
    num_sample = data_loader.dataset.__len__()

    # for step, batch in enumerate(tqdm(data_loader, unit="batch", ncols=100, desc="Training process: ")):
    for step, batch in enumerate(data_loader):
        start_t = time.time()

        # Fetch the labels, text and audio data and move them to the GPU when CUDA is enabled
        label_inputs = batch['label'].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch['label']
        asr_label = batch["asr_label"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["asr_label"]
        tokens = batch['tokens'].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch['tokens']
        segment_ids = batch['segment_ids'].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch['segment_ids']
        attention_mask = batch["attention_mask"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["attention_mask"]
        audio_inputs = batch['audio'].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch['audio']
        audio_length = batch['audio_length'].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch['audio_length']

        main_output, asr_output = model(tokens, segment_ids, attention_mask, audio_inputs, audio_length)

        # Compute the main-task loss and the ASR auxiliary loss
        main_loss = loss_criterion(main_output, label_inputs)
        asr_loss = loss_criterion(asr_output, asr_label)

        # Compute the weighted total loss
        weighted_loss = model.module.loss_weight(main_loss, asr_loss)

        # Average the weighted loss over the accumulation steps
        if gradient_accumulation_steps > 1:
            weighted_loss /= gradient_accumulation_steps

        # Record the losses of the first batch;
        # they are used later to estimate the per-task training rates
        if step == 0:
            initial_asr_loss = asr_loss.detach()
            initial_main_loss = main_loss.detach()

        # Clear the accumulated gradients at the start of each accumulation cycle
        if step % gradient_accumulation_steps == 0:
            model.zero_grad()

        # Backpropagate the loss without freeing the computation graph
        weighted_loss.backward(retain_graph=True)

        # Gradient of the weighted ASR loss w.r.t. the ASR sub-model (the three FC layers)
        asr_gradient = torch.autograd.grad(model.module.loss_weight.asr_model_weight * asr_loss,
                                           model.module.asr_model.parameters(),
                                           retain_graph=True,
                                           create_graph=True)
        # Weighted L2 norm of the gradient of the first FC Linear layer
        asr_norms = torch.norm(asr_gradient[0], 2)

        # Gradient of the weighted main loss w.r.t. the main sub-model (MobileNetV2)
        main_gradient = torch.autograd.grad(model.module.loss_weight.main_model_weight * main_loss,
                                            model.module.main_model.parameters(),
                                            retain_graph=True,
                                            create_graph=True)
        # Weighted L2 norm of the gradient of the first MobileNetV2 layer
        main_norms = torch.norm(main_gradient[0], 2)

        # Mean of the two gradient norms
        mean_norm = torch.div(torch.add(asr_norms, main_norms), 2)

        # Ratio of the current ASR loss to the first batch's ASR loss
        asr_loss_ratio = torch.div(asr_loss, initial_asr_loss)
        # Ratio of the current main loss to the first batch's main loss
        main_loss_ratio = torch.div(main_loss.data, initial_main_loss)
        mean_loss_ratio = torch.div(torch.add(asr_loss_ratio, main_loss_ratio), 2)

        # Relative training rate of the ASR task
        asr_train_rate = torch.div(asr_loss_ratio, mean_loss_ratio)
        # Relative training rate of the main task
        main_train_rate = torch.div(main_loss_ratio, mean_loss_ratio)

        # Todo: read the exponent from the hyper-parameter config
        asr_loss_target = mean_norm * (asr_train_rate) ** 0.16
        main_loss_target = mean_norm * (main_train_rate) ** 0.16
        asr_loss_target = asr_loss_target.detach()
        main_loss_target = main_loss_target.detach()

        optimizer[1].zero_grad()
        loss_sum = torch.add(loss_loss_criterion(asr_norms, asr_loss_target),
                             loss_loss_criterion(main_norms, main_loss_target))
        loss_sum.backward()
        optimizer[1].step()

        if (step + 1) % gradient_accumulation_steps == 0:
            optimizer[0].step()
            # scheduler.step()

        loss_value = weighted_loss.item()
        _, top_index = main_output.topk(1)
        top_index = top_index.cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else top_index
        labels = label_inputs.cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else label_inputs
        correct_sum += (top_index.view(-1) == labels).sum().item()
        total_loss += loss_value

        cost_t = time.time() - start_t
        LoggerHelper.info("step: {}\tloss: {:.2f}\ttime(s): {:.2f}".format(step, loss_value, cost_t))

        # Renormalize the two loss weights so that they sum to 2
        normalize_coef = 2 / torch.add(model.module.loss_weight.asr_model_weight.data,
                                       model.module.loss_weight.main_model_weight.data)
        model.module.loss_weight.asr_model_weight.data = model.module.loss_weight.asr_model_weight.data * normalize_coef
        model.module.loss_weight.main_model_weight.data = model.module.loss_weight.main_model_weight.data * normalize_coef
        LoggerHelper.info("asr loss weight: {}\tmain loss weight: {}".format(model.module.loss_weight.asr_model_weight.item(),
                                                                             model.module.loss_weight.main_model_weight.item()))

    LoggerHelper.info("Total Training Samples: " + str(num_sample))
    LoggerHelper.info("Correct Prediction: " + str(correct_sum))
    LoggerHelper.info("Error Rate: " + format(1 - (correct_sum / num_sample), "0.4f"))

    return total_loss / num_batch
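# train_model above assumes a LossWeight module exposed as model.module.loss_weight, holding two
# learnable scalars (asr_model_weight and main_model_weight) and returning the weighted sum of the
# two task losses. The actual implementation lives in model_core.src.models.loss_weight; the class
# below is only a minimal sketch of that assumed interface, not the project's real code.
class LossWeight(nn.Module):
    def __init__(self):
        super(LossWeight, self).__init__()
        # Learnable task weights, initialized to 1 as GradNorm-style weighting usually does
        self.main_model_weight = nn.Parameter(torch.ones(1))
        self.asr_model_weight = nn.Parameter(torch.ones(1))

    def forward(self, main_loss, asr_loss):
        # Weighted total loss used for the shared backward pass
        return self.main_model_weight * main_loss + self.asr_model_weight * asr_loss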
def train_model(cls, master_gpu_id, model, optimizer, scheduler, data_loader, gradient_accumulation_steps, use_cuda):
    """
    :param master_gpu_id:
    :param model:
    :param optimizer:
    :param scheduler:
    :param data_loader:
    :param gradient_accumulation_steps:
    :param use_cuda:
    :return:
    """
    model.train()
    loss_function = nn.CrossEntropyLoss()
    loss_loss_criterion = nn.L1Loss()

    total_loss = 0.0
    correct_sum = 0
    num_batch = data_loader.__len__()
    num_sample = data_loader.dataset.__len__()

    if hasattr(model, 'module'):
        current_model = model.module
    else:
        current_model = model

    for step, batch in enumerate(data_loader):
        # Record the start time
        start_t = time.time()

        # Unpack the batch
        text_inputs, label_inputs, asr_label_inputs = batch

        # Move the data to the right device
        # Label data
        label_inputs = label_inputs.cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else label_inputs
        asr_label_inputs = asr_label_inputs.cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else asr_label_inputs
        # Text token tensors and text lengths
        text_inputs, text_length = text_inputs
        text_inputs = text_inputs.cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else text_inputs
        text_length = text_length.cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else text_length

        # Get the model output
        prediction, auxiliary_prediction = model(text_inputs, text_length)

        # Compute the model losses
        main_loss = loss_function(prediction, label_inputs)
        auxiliary_loss = loss_function(auxiliary_prediction, asr_label_inputs)

        # Compute the weighted total loss
        weighted_loss = current_model.loss_weight(main_loss, auxiliary_loss)

        # Average the weighted loss over the accumulation steps
        if gradient_accumulation_steps > 1:
            weighted_loss /= gradient_accumulation_steps

        # Record the losses of the first batch;
        # they are used later to estimate the per-task training rates
        if step == 0:
            initial_auxiliary_loss = auxiliary_loss.detach()
            initial_main_loss = main_loss.detach()

        # Clear the accumulated gradients at the start of each accumulation cycle
        if step % gradient_accumulation_steps == 0:
            model.zero_grad()

        # Backpropagate the loss without freeing the computation graph
        weighted_loss.backward(retain_graph=True)

        text_embeddings_last_layer_params = list(current_model.text_embeddings.parameters())

        # Gradient of the weighted auxiliary (ASR) loss w.r.t. the last shared embedding parameters
        auxiliary_gradient = torch.autograd.grad(current_model.loss_weight.auxiliary_model_weight * auxiliary_loss,
                                                 text_embeddings_last_layer_params[-1],
                                                 retain_graph=True,
                                                 create_graph=True)
        # Weighted L2 norm of that gradient
        auxiliary_norms = torch.norm(auxiliary_gradient[0], 2)

        # Gradient of the weighted main loss w.r.t. the last shared embedding parameters
        main_gradient = torch.autograd.grad(current_model.loss_weight.main_model_weight * main_loss,
                                            text_embeddings_last_layer_params[-1],
                                            retain_graph=True,
                                            create_graph=True)
        # Weighted L2 norm of that gradient
        main_norms = torch.norm(main_gradient[0], 2)

        # Mean of the two gradient norms
        mean_norm = torch.div(torch.add(auxiliary_norms, main_norms), 2)

        # Ratio of the current auxiliary loss to the first batch's auxiliary loss
        auxiliary_loss_ratio = torch.div(auxiliary_loss, initial_auxiliary_loss)
        # Ratio of the current main loss to the first batch's main loss
        main_loss_ratio = torch.div(main_loss.data, initial_main_loss)
        mean_loss_ratio = torch.div(torch.add(auxiliary_loss_ratio, main_loss_ratio), 2)

        # Relative training rate of the auxiliary task
        auxiliary_train_rate = torch.div(auxiliary_loss_ratio, mean_loss_ratio)
        # Relative training rate of the main task
        main_train_rate = torch.div(main_loss_ratio, mean_loss_ratio)

        # Todo: read the exponent from the hyper-parameter config
        auxiliary_loss_target = mean_norm * (auxiliary_train_rate) ** 0.16
        main_loss_target = mean_norm * (main_train_rate) ** 0.16
        auxiliary_loss_target = auxiliary_loss_target.detach()
        main_loss_target = main_loss_target.detach()

        optimizer[1].zero_grad()
        loss_sum = torch.add(loss_loss_criterion(auxiliary_norms, auxiliary_loss_target),
                             loss_loss_criterion(main_norms, main_loss_target))
        loss_sum.backward()
        optimizer[1].step()

        if (step + 1) % gradient_accumulation_steps == 0:
            optimizer[0].step()
            # scheduler.step()

        loss_value = weighted_loss.item()
        _, top_index = prediction.topk(1)
        top_index = top_index.cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else top_index
        correct_sum += (top_index.view(-1) == label_inputs).sum().item()
        total_loss += loss_value

        cost_t = time.time() - start_t
        LoggerHelper.info("step: {}\tloss: {:.2f}\tmain_loss: {:.2f}\tasr_loss: {:.2f}\ttime(s): {:.2f}"
                          .format(step, loss_value, main_loss, auxiliary_loss, cost_t))

        # Renormalize the two loss weights so that they sum to 2
        normalize_coef = 2 / torch.add(current_model.loss_weight.auxiliary_model_weight.data,
                                       current_model.loss_weight.main_model_weight.data)
        current_model.loss_weight.auxiliary_model_weight.data = current_model.loss_weight.auxiliary_model_weight.data * normalize_coef
        current_model.loss_weight.main_model_weight.data = current_model.loss_weight.main_model_weight.data * normalize_coef
        LoggerHelper.info("asr loss weight: {}\tmain loss weight: {}".format(
            current_model.loss_weight.auxiliary_model_weight.item(),
            current_model.loss_weight.main_model_weight.item()))

    LoggerHelper.info("Total Training Samples: " + str(num_sample))
    LoggerHelper.info("Correct Prediction: " + str(correct_sum))
    LoggerHelper.info("Error Rate: " + format(1 - (correct_sum / num_sample), "0.4f"))

    return total_loss / num_batch
def prepare(function, task_type, config_name, gpu_ids, saved_model):
    """
    Model preparation: performs the setup needed for training, evaluation and prediction.
    :param function: name of the function to run
    :param task_type: task type
    :param config_name: configuration name
    :param gpu_ids: comma-separated string of GPU IDs
    :param saved_model: path of the saved model to load
    :return:
    """
    ################
    #    Config    #
    ################
    # Load the config file matching the requested function
    config_file_path = os.path.join(base_dir, 'config') + '/' + config_name + '.' + function + '.yaml'
    with open(config_file_path, 'r') as conf_file:
        config = yaml.load(conf_file, Loader=yaml.FullLoader)

    # Set the model save path
    config['save_path'] = os.path.join(base_dir, 'saved_model', config['instance_name'])

    ################
    #    Logging   #
    ################
    # Build the log file path
    log_file_path = os.path.join(base_dir, 'log', config['instance_name'] + '_' + str(function) + '.log')
    # Configure the logging system
    logger_config(log_file_name=log_file_path,
                  log_level=config['log_level'],
                  need_loghead=False,
                  timed_rotating=True)
    LoggerHelper.info("Loading HyperParameters".center(60, "="))
    LoggerHelper.info(config)
    LoggerHelper.info("Load HyperParameters Done".center(60, "="))

    ################
    #  GPU config  #
    ################
    # Whether to use the GPU, taken from the config file
    use_cuda = config["use_cuda"]
    # Master GPU ID, None by default
    master_gpu_id = None
    # GPU ID list, None by default
    gpu_id_list = None
    if gpu_ids:
        if len(gpu_ids) == 1:
            master_gpu_id = int(gpu_ids)
        else:
            gpu_id_list = [int(gpu_id) for gpu_id in gpu_ids.split(",")]
            master_gpu_id = gpu_id_list[0]

    ################
    #     Model    #
    ################
    # Initialize the model
    if function == 'probability':
        # Todo
        # Unify dynamic dispatch across tasks via a task router and a shared config file
        # task_router = TaskRouter()
        # task_dict =
        if task_type == 'single_model_audio':
            model = AudioSingleModel()
        elif task_type == 'single_model_audio_lstm':
            model = AudioSingleModelBasedOnLSTM(
                config_file_path=os.path.join(base_dir, 'config') + '/' + config['didi_multimodel_config_file'])
        elif task_type == 'single_model_audio_lstm_text_gate_merge':
            model = AudioSingleModelBasedOnLSTMTextGateMerge(
                config_file_path=os.path.join(base_dir, 'config') + '/' + config['didi_multimodel_config_file'],
                asr_pretrain_model=config['text_pretrain_model'],
                asr_embedding_dim=768,
                audio_embedding_dim=12288)
        elif task_type == 'single_model_audio_gate_merge':
            model = AudioSingleModelAndTextGateMerge(
                config['text_pretrain_model'],
                asr_embedding_dim=768,
                audio_pretrain_model=config['audio_pretrain_model'],
                audio_embedding_dim=1280)
        elif task_type == 'single_model_text_bert':
            model = TextSingleModelBasedOnBert(config['text_pretrain_model'])
            # model = BertForSequenceClassification.from_pretrained(
            #     config["text_pretrain_model_dir"],
            #     num_labels=config["num_labels"],
            #     output_attentions=False,  # whether the model returns attention weights
            #     # output_hidden_states = False,  # whether the model returns all hidden states
            # )
        elif task_type == 'single_model_text_gate_merge':
            model = TextSingleModelAndTextGateMerge(config['text_pretrain_model'],
                                                    asr_embedding_dim=768)
        elif task_type == 'multimodel_embedding_fuse_text_bert':
            model = EmbeddingShareMultimodel(config['text_pretrain_model'],
                                             asr_embedding_dim=768,
                                             audio_embedding_dim=1280)
        elif task_type == 'multimodel_feature_fuse_text_bert_gate_merge':
            model = OutputGateMergeMultimodel(config['text_pretrain_model'],
                                              asr_embedding_dim=768,
                                              audio_embedding_dim=1280)
        # elif task_type == 'multimodel_hybrid':
        #     model = EmbeddingShareAndOutputMergeMultimodel(config['text_pretrain_model'],
        #                                                    asr_embedding_dim=768,
        #                                                    audio_pretrain_model=config['audio_pretrain_model'],
        #                                                    audio_embedding_dim=1280)
        elif task_type == 'multimodel_didi':
            model = DiDiMultimodel(
                config_file_path=os.path.join(base_dir, 'config') + '/' + config['didi_multimodel_config_file'])
        elif task_type == 'multimodel_didi_pretrain':
            pass
            # model = DiDiMultimodelPretrain(
            #     config_file_path=os.path.join(base_dir, 'config') + '/' + config['didi_multimodel_config_file'],
            #     asr_pretrain_model=config['text_pretrain_model'])
        elif task_type == 'single_model_text_no_pretrain':
            model = TextSingleModelBasedNoPretrain(
                config_file_path=os.path.join(base_dir, 'config') + '/' + config['didi_multimodel_config_file'])
        elif task_type == 'single_model_text_no_pretrain_embedding_share_and_gate_merge':
            model = TextSingleModelBasedNoPretrainEmbeddingShareAndGateMerge(
                config_file_path=os.path.join(base_dir, 'config') + '/' + config['didi_multimodel_config_file'])
        elif task_type == 'single_model_audio_no_pretrain':
            model = AudioSingleModelNoPretrain(
                config_file_path=os.path.join(base_dir, 'config') + '/' + config['didi_multimodel_config_file'])
        elif task_type == 'single_model_audio_no_pretrain_gate_merge':
            model = AudioSingleModelNoPretrainGateMerge(
                config_file_path=os.path.join(base_dir, 'config') + '/' + config['didi_multimodel_config_file'])
        elif task_type == 'multimodel_didi_embedding_share':
            model = DiDiMultimodelEmbeddingShare(
                config_file_path=os.path.join(base_dir, 'config') + '/' + config['didi_multimodel_config_file'])
        elif task_type == 'multimodel_didi_embedding_share_and_output_merge':
            model = DiDiMultimodelEmbeddingShareAndOutputMerge(
                config_file_path=os.path.join(base_dir, 'config') + '/' + config['didi_multimodel_config_file'])
    elif function == 'score':
        model = None

    # Decide whether to use the GPU
    if use_cuda and master_gpu_id is not None:
        # Load an existing model if one was given
        if saved_model:
            LoggerHelper.info("Loading Saved Model".center(60, "="))
            LoggerHelper.info("Load saved model from: " + saved_model)
            # model.load_state_dict(torch.load(saved_model))
            model.load_state_dict({
                k.replace('module.', ''): v
                for k, v in torch.load(saved_model).items()
            })
            LoggerHelper.info("Loading Saved Model Done".center(60, "="))

        LoggerHelper.info("GPU training or evaluating.")
        model = model.cuda(int(master_gpu_id))

        # Check whether multiple GPUs should be used
        if gpu_id_list is not None:
            LoggerHelper.info("Multiple GPU training or evaluating.")
            model = torch.nn.DataParallel(model, device_ids=gpu_id_list)
        else:
            LoggerHelper.info("Single GPU training or evaluating.")
    else:
        # Load an existing model onto the CPU if one was given
        if saved_model:
            LoggerHelper.info("Loading Saved Model".center(60, "="))
            LoggerHelper.info("Load saved model from: " + saved_model)
            model.load_state_dict(torch.load(saved_model, map_location='cpu'))
            LoggerHelper.info("Loading Saved Model Done".center(60, "="))

    ################
    #     Data     #
    ################
    LoggerHelper.info("Loading Dataset".center(60, "="))

    train_dataset = None
    eval_dataset = None
    predict_dataset = None

    # Load the datasets matching the requested function
    if function == 'probability':
        if config['train_dataset_path'] and os.path.exists(config['train_dataset_path']):
            if task_type in ('multimodel_didi',
                             'multimodel_didi_embedding_share',
                             'multimodel_didi_embedding_share_and_output_merge',
                             'single_model_audio_no_pretrain_gate_merge'):
                glove = vocab.GloVe(name='6B', dim=300, cache=config['didi_multimodel_vocabulary_dict'])
                vocabulary_dict = glove.stoi
                train_dataset = DiDiDataset(data=config['train_dataset_path'],
                                            audio_dir=config['train_audio_dir'],
                                            vocabulary_dict=vocabulary_dict,
                                            audio_length=config['didi_multimodel_audio_length'])
            elif task_type in ('single_model_text_no_pretrain',
                               'single_model_text_no_pretrain_embedding_share_and_gate_merge'):
                glove = vocab.GloVe(name='6B', dim=300, cache=config['didi_multimodel_vocabulary_dict'])
                vocabulary_dict = glove.stoi
                train_dataset = DiDiDatasetText(data=config['train_dataset_path'],
                                                vocabulary_dict=vocabulary_dict)
            elif task_type == 'single_model_audio_no_pretrain':
                train_dataset = DiDiDatasetAudio(data=config['train_dataset_path'],
                                                 audio_dir=config['train_audio_dir'],
                                                 audio_length=config['didi_multimodel_audio_length'])
            else:
                train_dataset = FluencyDataset(data=config['train_dataset_path'],
                                               task_type=task_type,
                                               audio_dir=config['train_audio_dir'],
                                               max_seq_len=config['max_seq_len'],
                                               asr_pretrain_model=config['text_pretrain_model'],
                                               audio_pretrain_model=config['audio_pretrain_model'],
                                               predict=False,
                                               cache=config['cache'],
                                               temp_dir=config['temp_dir'])

        if config['eval_dataset_path'] and os.path.exists(config['eval_dataset_path']):
            if task_type in ('multimodel_didi',
                             'multimodel_didi_embedding_share',
                             'multimodel_didi_embedding_share_and_output_merge',
                             'single_model_audio_no_pretrain_gate_merge'):
                glove = vocab.GloVe(name='6B', dim=300, cache=config['didi_multimodel_vocabulary_dict'])
                vocabulary_dict = glove.stoi
                eval_dataset = DiDiDataset(data=config['eval_dataset_path'],
                                           audio_dir=config['eval_audio_dir'],
                                           vocabulary_dict=vocabulary_dict,
                                           audio_length=config['didi_multimodel_audio_length'])
            elif task_type in ('single_model_text_no_pretrain',
                               'single_model_text_no_pretrain_embedding_share_and_gate_merge'):
                glove = vocab.GloVe(name='6B', dim=300, cache=config['didi_multimodel_vocabulary_dict'])
                vocabulary_dict = glove.stoi
                eval_dataset = DiDiDatasetText(data=config['eval_dataset_path'],
                                               vocabulary_dict=vocabulary_dict)
            elif task_type == 'single_model_audio_no_pretrain':
                eval_dataset = DiDiDatasetAudio(data=config['eval_dataset_path'],
                                                audio_dir=config['train_audio_dir'],
                                                audio_length=config['didi_multimodel_audio_length'])
            else:
                eval_dataset = FluencyDataset(data=config['eval_dataset_path'],
                                              task_type=task_type,
                                              audio_dir=config['eval_audio_dir'],
                                              max_seq_len=config['max_seq_len'],
                                              asr_pretrain_model=config['text_pretrain_model'],
                                              audio_pretrain_model=config['audio_pretrain_model'],
                                              predict=False,
                                              cache=config['cache'],
                                              temp_dir=config['temp_dir'])

        if config['predict_dataset_path'] and os.path.exists(config['predict_dataset_path']):
            pass
            # predict_dataset = AudioFluencyDataset(data=config['predict_dataset_path'],
            #                                       audio_dir=config['predict_audio_path'],
            #                                       max_seq_len=config['max_seq_len'],
            #                                       audio_pretrain_model_dir=config['audio_pretrain_model_dir'],
            #                                       text_pretrain_model_dir=config['text_pretrain_model_dir'])
    elif function == 'score':
        pass

    LoggerHelper.info("Loading Dataset Done".center(60, "="))

    ################
    #   Optimizer  #
    ################
    optimizer = None
    if task_type == 'single_model_audio_gate_merge':
        pass
        # loss_params_id = list()
        # loss_params = list()
        #
        # from model_core.src.models.loss_weight import LossWeight
        # for m in model.modules():
        #     if isinstance(m, LossWeight):
        #         loss_params_id += list(map(id, m.parameters()))
        #         loss_params += m.parameters()
        #
        # base_params = list(filter(lambda p: id(p) not in loss_params_id, model.parameters()))
        #
        # base_optimizer = AdamW(base_params, lr=config['lr'])
        # loss_optimizer = AdamW(loss_params, lr=config['lr'])
        # optimizer = [base_optimizer, loss_optimizer]
    else:
        optimizer = AdamW(
            model.parameters(),
            lr=config['lr'],  # args.learning_rate - default is 5e-5
            # eps = 1e-8  # args.adam_epsilon - default is 1e-8
        )

    ################
    #   Scheduler  #
    ################
    scheduler = None
    if task_type == 'single_model_audio_gate_merge':
        pass
    elif task_type in ('single_model_text_no_pretrain_embedding_share_and_gate_merge',
                       'single_model_audio_no_pretrain_gate_merge',
                       'single_model_text_gate_merge',
                       'single_model_audio_lstm_text_gate_merge'):
        pass
    else:
        total_steps = None
        if train_dataset is not None:
            total_steps = train_dataset.__len__() * config["epochs"]
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=0,
                                                    num_training_steps=total_steps)

    return model, [train_dataset, eval_dataset, predict_dataset], config, master_gpu_id, optimizer, scheduler
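# The GradNorm-style training loops above index optimizer[0] (network parameters) and optimizer[1]
# (loss-weight parameters), but prepare() currently leaves that branch commented out. The
# hypothetical helper below only sketches the two-optimizer setup those loops imply, following the
# commented code above; it is not part of the original project.
def build_gradnorm_optimizers(model, lr):
    from model_core.src.models.loss_weight import LossWeight

    loss_param_ids = list()
    loss_params = list()
    for m in model.modules():
        if isinstance(m, LossWeight):
            loss_param_ids += list(map(id, m.parameters()))
            loss_params += list(m.parameters())

    # Every parameter that is not a loss weight belongs to the base network
    base_params = [p for p in model.parameters() if id(p) not in loss_param_ids]

    # optimizer[0] updates the network, optimizer[1] updates the loss weights
    return [AdamW(base_params, lr=lr), AdamW(loss_params, lr=lr)]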
def train_model(cls, master_gpu_id, model, optimizer, scheduler, data_loader, gradient_accumulation_steps, use_cuda):
    """
    :param master_gpu_id:
    :param model:
    :param optimizer:
    :param scheduler:
    :param data_loader:
    :param gradient_accumulation_steps:
    :param use_cuda:
    :return:
    """
    model.train()
    loss_function = nn.CrossEntropyLoss()

    total_loss = 0.0
    correct_sum = 0
    num_batch = data_loader.__len__()
    num_sample = data_loader.dataset.__len__()

    for step, batch in enumerate(data_loader):
        start_t = time.time()

        text_inputs, label_inputs, _ = batch
        label_inputs = label_inputs.cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else label_inputs
        text_inputs, text_length = text_inputs
        text_inputs = text_inputs.cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else text_inputs
        text_length = text_length.cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else text_length

        prediction = model(text_inputs, text_length)

        loss = loss_function(prediction, label_inputs)
        if gradient_accumulation_steps > 1:
            loss /= gradient_accumulation_steps

        loss.backward()

        if (step + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            model.zero_grad()
            scheduler.step()

        loss_value = loss.item()
        _, top_index = prediction.topk(1)
        top_index = top_index.cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else top_index
        labels = label_inputs.cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else label_inputs
        correct_sum += (top_index.view(-1) == labels).sum().item()
        total_loss += loss_value

        cost_t = time.time() - start_t
        LoggerHelper.info("step: {}\tloss: {:.2f}\ttime(s): {:.2f}".format(step, loss, cost_t))

    LoggerHelper.info("Total Training Samples: " + str(num_sample))
    LoggerHelper.info("Correct Prediction: " + str(correct_sum))
    LoggerHelper.info("Error Rate: " + format(1 - (correct_sum / num_sample), "0.4f"))

    return total_loss / num_batch
def train_model(task_type, model_save_path, master_gpu_id, model, optimizer, scheduler, epochs,
                train_dataset, batch_size, gradient_accumulation_steps=1, use_cuda=False,
                num_workers=1, shuffle=True):
    """
    Train the model.
    :param task_type:
    :param model_save_path:
    :param master_gpu_id:
    :param model:
    :param optimizer:
    :param scheduler:
    :param epochs:
    :param train_dataset:
    :param batch_size:
    :param gradient_accumulation_steps:
    :param use_cuda:
    :param num_workers:
    :param shuffle:
    :return:
    """
    LoggerHelper.info("Start Training".center(60, "="))

    if task_type in ('multimodel_didi',
                     'multimodel_didi_embedding_share',
                     'multimodel_didi_embedding_share_and_output_merge',
                     'single_model_audio_no_pretrain_gate_merge'):
        from model_core.src.data.didi_dataset import DiDiDataset
        train_loader = DataLoader(dataset=train_dataset,
                                  pin_memory=use_cuda,
                                  batch_size=batch_size,
                                  num_workers=num_workers,
                                  shuffle=shuffle,
                                  collate_fn=DiDiDataset.collate)
    elif task_type in ('single_model_text_no_pretrain',
                       'single_model_text_no_pretrain_embedding_share_and_gate_merge'):
        from model_core.src.data.didi_dataset_text import DiDiDatasetText
        train_loader = DataLoader(dataset=train_dataset,
                                  pin_memory=use_cuda,
                                  batch_size=batch_size,
                                  num_workers=num_workers,
                                  shuffle=shuffle,
                                  collate_fn=DiDiDatasetText.collate)
    elif task_type == 'single_model_audio_no_pretrain':
        from model_core.src.data.didi_dataset_audio import DiDiDatasetAudio
        train_loader = DataLoader(dataset=train_dataset,
                                  pin_memory=use_cuda,
                                  batch_size=batch_size,
                                  num_workers=num_workers,
                                  shuffle=shuffle,
                                  collate_fn=DiDiDatasetAudio.collate)
    else:
        train_loader = DataLoader(dataset=train_dataset,
                                  pin_memory=use_cuda,
                                  batch_size=batch_size,
                                  num_workers=num_workers,
                                  shuffle=shuffle)

    for epoch in range(1, epochs + 1):
        LoggerHelper.info("Training Epoch: " + str(epoch))

        # avg_loss = train_epoch(master_gpu_id,
        #                        model,
        #                        optimizer,
        #                        scheduler,
        #                        train_loader,
        #                        gradient_accumulation_steps,
        #                        use_cuda)
        if task_type == 'single_model_audio':
            avg_loss = AudioSingleModel.train_model(master_gpu_id, model, optimizer, scheduler,
                                                    train_loader, gradient_accumulation_steps, use_cuda)
            LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            save_model(model_save_path, model, epoch)
        elif task_type == 'single_model_audio_lstm':
            avg_loss = AudioSingleModelBasedOnLSTM.train_model(master_gpu_id, model, optimizer, scheduler,
                                                               train_loader, gradient_accumulation_steps, use_cuda)
            LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            save_model(model_save_path, model, epoch)
        elif task_type == 'single_model_audio_gate_merge':
            avg_loss = AudioSingleModelAndTextGateMerge.train_model(master_gpu_id, model, optimizer, scheduler,
                                                                    train_loader, gradient_accumulation_steps, use_cuda)
            LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            save_model(model_save_path, model, epoch)
        elif task_type == 'single_model_text_bert':
            avg_loss = TextSingleModelBasedOnBert.train_model(master_gpu_id, model, optimizer, scheduler,
                                                              train_loader, gradient_accumulation_steps, use_cuda)
            LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            save_model(model_save_path, model, epoch)
        elif task_type == 'single_model_text_gate_merge':
            avg_loss = TextSingleModelAndTextGateMerge.train_model(master_gpu_id, model, optimizer, scheduler,
                                                                   train_loader, gradient_accumulation_steps, use_cuda)
            LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            save_model(model_save_path, model[0], epoch)
        elif task_type == 'multimodel_embedding_fuse_text_bert':
            avg_loss = EmbeddingShareMultimodel.train_model(master_gpu_id, model, optimizer, scheduler,
                                                            train_loader, gradient_accumulation_steps, use_cuda)
            LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            save_model(model_save_path, model, epoch)
        elif task_type == 'multimodel_feature_fuse_text_bert_gate_merge':
            avg_loss = OutputGateMergeMultimodel.train_model(master_gpu_id, model, optimizer, scheduler,
                                                             train_loader, gradient_accumulation_steps, use_cuda)
            LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            save_model(model_save_path, model, epoch)
        elif task_type == 'multimodel_hybrid':
            pass
            # avg_loss = EmbeddingShareAndOutputMergeMultimodel.train_model(master_gpu_id,
            #                                                               model,
            #                                                               optimizer,
            #                                                               scheduler,
            #                                                               train_loader,
            #                                                               gradient_accumulation_steps,
            #                                                               use_cuda)
            #
            # LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            # save_model(model_save_path, model, epoch)
        elif task_type == 'multimodel_didi':
            avg_loss = DiDiMultimodel.train_model(master_gpu_id, model, optimizer, scheduler,
                                                  train_loader, gradient_accumulation_steps, use_cuda)
            LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            save_model(model_save_path, model, epoch)
        elif task_type == 'multimodel_didi_pretrain':
            pass
            # avg_loss = DiDiMultimodelPretrain.train_model(master_gpu_id,
            #                                               model,
            #                                               optimizer,
            #                                               scheduler,
            #                                               train_loader,
            #                                               gradient_accumulation_steps,
            #                                               use_cuda)
        elif task_type == 'single_model_text_no_pretrain':
            avg_loss = TextSingleModelBasedNoPretrain.train_model(master_gpu_id, model, optimizer, scheduler,
                                                                  train_loader, gradient_accumulation_steps, use_cuda)
            LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            save_model(model_save_path, model, epoch)
        elif task_type == 'single_model_text_no_pretrain_embedding_share_and_gate_merge':
            avg_loss = TextSingleModelBasedNoPretrainEmbeddingShareAndGateMerge.train_model(
                master_gpu_id, model, optimizer, scheduler, train_loader, gradient_accumulation_steps, use_cuda)
            LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            save_model(model_save_path, model, epoch)
        elif task_type == 'single_model_audio_no_pretrain':
            avg_loss = AudioSingleModelNoPretrain.train_model(master_gpu_id, model, optimizer, scheduler,
                                                              train_loader, gradient_accumulation_steps, use_cuda)
            LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            save_model(model_save_path, model, epoch)
        elif task_type == 'single_model_audio_no_pretrain_gate_merge':
            avg_loss = AudioSingleModelNoPretrainGateMerge.train_model(master_gpu_id, model, optimizer, scheduler,
                                                                       train_loader, gradient_accumulation_steps, use_cuda)
            LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            save_model(model_save_path, model, epoch)
        elif task_type == 'multimodel_didi_embedding_share':
            avg_loss = DiDiMultimodelEmbeddingShare.train_model(master_gpu_id, model, optimizer, scheduler,
                                                                train_loader, gradient_accumulation_steps, use_cuda)
            LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            save_model(model_save_path, model, epoch)
        elif task_type == 'multimodel_didi_embedding_share_and_output_merge':
            avg_loss = DiDiMultimodelEmbeddingShareAndOutputMerge.train_model(
                master_gpu_id, model, optimizer, scheduler, train_loader, gradient_accumulation_steps, use_cuda)
            LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            save_model(model_save_path, model, epoch)

    LoggerHelper.info("Training Done".center(60, "="))

    return
def train_model(cls, master_gpu_id, model, optimizer, scheduler, data_loader, gradient_accumulation_steps, use_cuda):
    model.train()
    loss_function = nn.CrossEntropyLoss()

    total_loss = 0.0
    correct_sum = 0
    num_batch = data_loader.__len__()
    num_sample = data_loader.dataset.__len__()

    for step, batch in enumerate(data_loader):
        start_t = time.time()

        labels = batch["label"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["label"]
        asr_label = batch["asr_label"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["asr_label"]
        tokens = batch['tokens'].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch['tokens']
        segment_ids = batch['segment_ids'].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch['segment_ids']
        attention_mask = batch["attention_mask"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["attention_mask"]
        tokens_length = batch['tokens_length'].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch['tokens_length']
        audio_inputs = batch["audio"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["audio"]
        audio_length = batch["audio_length"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["audio_length"]

        prediction = model(tokens, segment_ids, attention_mask, tokens_length, audio_inputs, audio_length)

        loss = loss_function(prediction, labels)
        if gradient_accumulation_steps > 1:
            loss /= gradient_accumulation_steps

        loss.backward()

        if (step + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            model.zero_grad()
            scheduler.step()

        loss_value = loss.item()
        _, top_index = prediction.topk(1)
        top_index = top_index.cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else top_index
        labels = labels.cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else labels
        correct_sum += (top_index.view(-1) == labels).sum().item()
        total_loss += loss_value

        cost_t = time.time() - start_t
        LoggerHelper.info("step: {}\tloss: {:.2f}\ttime(s): {:.2f}".format(step, loss, cost_t))

    LoggerHelper.info("Total Training Samples: " + str(num_sample))
    LoggerHelper.info("Correct Prediction: " + str(correct_sum))
    LoggerHelper.info("Error Rate: " + format(1 - (correct_sum / num_sample), "0.4f"))

    return total_loss / num_batch
def train_model(task_type, model_save_path, master_gpu_id, model, optimizer, scheduler, epochs,
                train_dataset, batch_size, gradient_accumulation_steps=1, use_cuda=False,
                num_workers=1, shuffle=True):
    """
    Train the model.
    :param task_type:
    :param model_save_path:
    :param master_gpu_id:
    :param model:
    :param optimizer:
    :param scheduler:
    :param epochs:
    :param train_dataset:
    :param batch_size:
    :param gradient_accumulation_steps:
    :param use_cuda:
    :param num_workers:
    :param shuffle:
    :return:
    """
    LoggerHelper.info("Start Training".center(60, "="))

    data_loader = DataLoader(dataset=train_dataset,
                             pin_memory=use_cuda,
                             batch_size=batch_size,
                             num_workers=num_workers,
                             shuffle=shuffle)

    model.train()
    loss_criterion = nn.CrossEntropyLoss()
    loss_loss_criterion = nn.L1Loss()

    num_batch = data_loader.__len__()
    num_sample = data_loader.dataset.__len__()

    device = torch.device('cuda:' + str(master_gpu_id) if use_cuda and master_gpu_id is not None else 'cpu')
    main_loss_weight = torch.tensor([1], dtype=torch.float64, requires_grad=True, device=device)
    auxiliary_loss_weight = torch.tensor([1], dtype=torch.float64, requires_grad=True, device=device)
    loss_params = [main_loss_weight, auxiliary_loss_weight]
    loss_optimizer = torch.optim.Adam(loss_params, lr=optimizer.state_dict()['param_groups'][0]['lr'])

    for epoch in range(1, epochs + 1):
        LoggerHelper.info("Training Epoch: " + str(epoch))

        total_loss = 0.0
        correct_sum = 0

        if hasattr(model, 'module'):
            current_model = model.module
        else:
            current_model = model

        # for step, batch in enumerate(tqdm(data_loader, unit="batch", ncols=100, desc="Training process: ")):
        for step, batch in enumerate(data_loader):
            start_t = time.time()

            labels = batch["label"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["label"]
            asr_label = batch["asr_label"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["asr_label"]
            tokens = batch['tokens'].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch['tokens']
            segment_ids = batch['segment_ids'].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch['segment_ids']
            attention_mask = batch["attention_mask"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["attention_mask"]
            audio_inputs = batch['audio'].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch['audio']
            audio_length = batch['audio_length'].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch['audio_length']

            main_output, asr_output = model(tokens, segment_ids, attention_mask, audio_inputs, audio_length)

            main_loss = loss_criterion(main_output, labels)
            auxiliary_loss = loss_criterion(asr_output, asr_label)

            # Compute the weighted total loss
            weighted_main_loss = loss_params[0] * main_loss
            weighted_auxiliary_loss = loss_params[1] * auxiliary_loss
            # weighted_loss = torch.div(torch.add(weighted_main_loss, weighted_auxiliary_loss), 2)
            weighted_loss = torch.div(torch.add(weighted_main_loss, weighted_auxiliary_loss), 2)

            # Average the weighted loss over the accumulation steps
            if gradient_accumulation_steps > 1:
                weighted_loss /= gradient_accumulation_steps

            # Record the losses of the first batch;
            # they are used later to estimate the per-task training rates
            if step == 0:
                initial_auxiliary_loss = auxiliary_loss.detach()
                initial_main_loss = main_loss.detach()

            # # Clear the accumulated gradients at the start of each accumulation cycle
            # if step % gradient_accumulation_steps == 0:
            #     optimizer.zero_grad()

            # Backpropagate the loss without freeing the computation graph
            optimizer.zero_grad()
            weighted_loss.backward(retain_graph=True)
            optimizer.step()

            asr_embedding_params = list(current_model.asr_embedding_model.parameters())

            # Gradient of the weighted auxiliary (ASR) loss w.r.t. the shared ASR embedding parameters
            auxiliary_gradient = torch.autograd.grad(weighted_auxiliary_loss,
                                                     asr_embedding_params[-2],
                                                     retain_graph=True,
                                                     create_graph=True)
            # Weighted L2 norm of that gradient
            auxiliary_norms = torch.norm(auxiliary_gradient[0], 2)

            # Gradient of the weighted main loss w.r.t. the shared ASR embedding parameters
            main_gradient = torch.autograd.grad(weighted_main_loss,
                                                asr_embedding_params[-2],
                                                retain_graph=True,
                                                create_graph=True)
            # Weighted L2 norm of that gradient
            main_norms = torch.norm(main_gradient[0], 2)

            # Mean of the two gradient norms
            mean_norm = torch.div(torch.add(auxiliary_norms, main_norms), 2)

            # Ratio of the current auxiliary loss to the first batch's auxiliary loss
            auxiliary_loss_ratio = torch.div(auxiliary_loss, initial_auxiliary_loss)
            # Ratio of the current main loss to the first batch's main loss
            main_loss_ratio = torch.div(main_loss, initial_main_loss)
            mean_loss_ratio = torch.div(torch.add(auxiliary_loss_ratio, main_loss_ratio), 2)

            # Relative training rate of the auxiliary task
            auxiliary_train_rate = torch.div(auxiliary_loss_ratio, mean_loss_ratio)
            # Relative training rate of the main task
            main_train_rate = torch.div(main_loss_ratio, mean_loss_ratio)

            # Todo: read the exponent from the hyper-parameter config
            auxiliary_loss_target = mean_norm * (auxiliary_train_rate) ** 0.12
            main_loss_target = mean_norm * (main_train_rate) ** 0.12
            auxiliary_loss_target = auxiliary_loss_target.detach()
            main_loss_target = main_loss_target.detach()

            loss_optimizer.zero_grad()
            loss_sum = torch.add(loss_loss_criterion(auxiliary_norms, auxiliary_loss_target),
                                 loss_loss_criterion(main_norms, main_loss_target))
            loss_sum.backward()
            loss_optimizer.step()

            # if (step + 1) % gradient_accumulation_steps == 0:
            #     optimizer.step()
            #     # scheduler.step()

            loss_value = weighted_loss.item()
            _, top_index = main_output.topk(1)
            top_index = top_index.cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else top_index
            labels = labels.cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else labels
            correct_sum += (top_index.view(-1) == labels).sum().item()
            total_loss += loss_value

            cost_t = time.time() - start_t
            LoggerHelper.info("step: {}\tloss: {:.2f}\tmain_loss: {:.2f}\tasr_loss: {:.2f}\ttime(s): {:.2f}"
                              .format(step, loss_value, main_loss.item(), auxiliary_loss.item(), cost_t))

            # Renormalize the loss weights so that they sum to 2; note that this rebinds loss_params
            # to tensors derived from the original weights, which loss_optimizer keeps updating
            coef = 2 / torch.add(main_loss_weight, auxiliary_loss_weight)
            loss_params = [coef * main_loss_weight, coef * auxiliary_loss_weight]
            LoggerHelper.info("main loss weight: {}\tauxiliary loss weight: {}".format(main_loss_weight.item(),
                                                                                       auxiliary_loss_weight.item()))

        LoggerHelper.info("Total Training Samples: " + str(num_sample))
        LoggerHelper.info("Correct Prediction: " + str(correct_sum))
        LoggerHelper.info("Error Rate: " + format(1 - (correct_sum / num_sample), "0.4f"))

        avg_loss = total_loss / num_batch
        LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))

        save_model(model_save_path, model, epoch)

    LoggerHelper.info("Training Done".center(60, "="))

    return