def eval_model(cls,
                   master_gpu_id,
                   model,
                   eval_dataset,
                   eval_batch_size=1,
                   use_cuda=False,
                   num_workers=1):
        model.eval()

        eval_dataloader = DataLoader(dataset=eval_dataset,
                                     pin_memory=use_cuda,
                                     batch_size=eval_batch_size,
                                     num_workers=num_workers,
                                     shuffle=False)

        predicted_probs = []
        true_labels = []

        batch_count = 1
        for batch in tqdm(eval_dataloader, unit="batch", ncols=100, desc="Evaluating process: "):
            labels = batch["label"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["label"]

            tokens = batch["tokens"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["tokens"]
            segment_ids = batch["segment_ids"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch[
                "segment_ids"]
            attention_mask = batch["attention_mask"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else \
            batch["attention_mask"]

            audio = batch["audio"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["audio"]

            with torch.no_grad():
                main_output, asr_output = model(tokens, segment_ids, attention_mask, audio)

                # Convert the model output to a list of class probabilities
                main_output = torch.softmax(main_output, dim=1).cpu().tolist()
                # Take the positive-class probability
                prob = np.array(main_output)[:, 1]
                # Append this batch's positive-class predictions to the global list
                predicted_probs.extend(prob.tolist())

                # Append this batch's ground-truth labels to the global list
                true_labels.extend(labels.tolist())

                LoggerHelper.info("Batch: " + str(batch_count))
                batch_count += 1

        predicted_probs = [round(prob, 2) for prob in predicted_probs]
        precision, recall, _thresholds = precision_recall_curve(true_labels, predicted_probs)
        auc = roc_auc_score(true_labels, predicted_probs)
        logloss = log_loss(true_labels, predicted_probs)
        for i in range(len(_thresholds)):
            log_str_th = 'VAL => Thresholds: {0:>2}, Precision: {1:>7.2%}, Recall: {2:>7.2%}, F1: {3:>7.2%}'.format(
                _thresholds[i], precision[i], recall[i], f1_score(precision[i], recall[i]))
            LoggerHelper.info(log_str_th)

        LoggerHelper.info("AUC: " + str(auc))
        LoggerHelper.info("Logloss: " + str(logloss))

        return
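Note: the f1_score call above receives a precision/recall pair rather than label arrays, so it is presumably a project-local helper and not sklearn's f1_score. A minimal sketch of such a helper (an assumption, not the project's actual code):

def f1_score(precision, recall):
    """Sketch of the assumed helper: harmonic mean of a precision/recall pair."""
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)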
Example #2
def save_model(save_dir, model, epoch):
    """
    保存模型
    :param save_dir:   保存路径
    :param model:       模型
    :param epoch:       训练Epoch
    :return:
    """
    LoggerHelper.info("Save Model".center(60, "="))

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    model_name = 'Epoch_' + str(epoch) + '.model'
    save_path = os.path.join(save_dir, model_name)
    torch.save(model.state_dict(), save_path)

    LoggerHelper.info("Save Model Done".center(60, "="))
    return
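A hedged usage sketch of save_model together with reloading the checkpoint; the directory and epoch number below are illustrative placeholders:

# Illustrative only: save after an epoch, then restore the weights later.
save_model("saved_model/demo_instance", model, epoch=3)
state_dict = torch.load("saved_model/demo_instance/Epoch_3.model", map_location="cpu")
model.load_state_dict(state_dict)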
Example #3
def main(data):
    """
    模型环境线上主流程
    :param runtime_config:
    :param call_type: 调用类型:single: 独立调用 interface: 接口调用
    :return:
    """
    print(data)
    ################
    #  Prediction  #
    ################
    LoggerHelper.info("Predicting".center(60, "="))

    dataset = dataset_builder(data, config)

    if function == 'probability':
        predict_results = predict.predict(master_gpu_id,
                                          model,
                                          dataset,
                                          config["predict_batch_size"],
                                          config["use_cuda"],
                                          config["predict_num_workers"])
        LoggerHelper.info("Predict Result: " + str(predict_results))
    elif function == 'score':
        predict_results = None
    else:
        predict_results = None

    LoggerHelper.info("Predicting Done".center(60, "="))

    return predict_results
Example #4
def predict(master_gpu_id,
            model,
            predict_dataset,
            predict_batch_size=1,
            use_cuda=False,
            num_workers=1):
    """

    :param master_gpu_id:
    :param model:
    :param predict_dataset:
    :param predict_batch_size:
    :param use_cuda:
    :param num_workers:
    :return:
    """
    LoggerHelper.info("Start Predicing".center(60, "="))

    # Put the model in evaluation mode
    model.eval()
    # Build the prediction data loader
    predict_loader = DataLoader(dataset=predict_dataset,
                                pin_memory=use_cuda,
                                batch_size=predict_batch_size,
                                num_workers=num_workers,
                                shuffle=False)

    # Initialize the list of predicted results
    predicted_result_list = list()

    # Iterate over each batch of the prediction dataset
    current_batch_index = 0
    for batch in predict_loader:
        LoggerHelper.info("Batch: " + str(current_batch_index))
        current_batch_index += 1
        #
        # tokens = batch["tokens"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["tokens"]
        # segment_ids = batch["segment_ids"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch[
        #     "segment_ids"]
        # attention_mask = batch["attention_mask"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else \
        # batch["attention_mask"]
        # labels = batch["label"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["label"]
        #
        # # Get model outputs
        # with torch.no_grad():
        #     _, logit = model(tokens,
        #                      token_type_ids=None,
        #                      attention_mask=attention_mask,
        #                      labels=labels)
        #
        # # Convert the model output to a list of class probabilities
        # logit = torch.softmax(logit, dim=1).cpu().tolist()
        # # Take the positive-class probability
        # logit = np.array(logit)[:, 1]
        # predicted_result_list.extend(logit.tolist())

    LoggerHelper.info("Predicting Ends".center(60, "="))

    return predicted_result_list
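The loop above currently collects nothing because the prediction body is commented out. A minimal sketch of a working variant, assuming the same batch keys and (tokens, segment_ids, attention_mask, audio) model signature used by eval_model above; the helper name is illustrative:

import torch

def predict_batches(master_gpu_id, model, predict_loader, use_cuda=False):
    """Sketch only: collect positive-class probabilities for every batch."""
    results = []
    model.eval()
    for batch in predict_loader:
        # Move a tensor to the master GPU when CUDA is enabled.
        def place(tensor):
            return tensor.cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else tensor

        with torch.no_grad():
            main_output, _ = model(place(batch["tokens"]),
                                   place(batch["segment_ids"]),
                                   place(batch["attention_mask"]),
                                   place(batch["audio"]))
        # Positive-class probability for each sample in the batch.
        results.extend(torch.softmax(main_output, dim=1)[:, 1].cpu().tolist())
    return results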
Example #5
    def train_model(cls,
                    master_gpu_id,
                    model,
                    optimizer,
                    scheduler,
                    data_loader,
                    gradient_accumulation_steps,
                    use_cuda):
        """

        :param master_gpu_id:
        :param model:
        :param optimizer:
        :param scheduler:
        :param data_loader:
        :param gradient_accumulation_steps:
        :param use_cuda:
        :return:
        """
        model.train()

        loss_criterion = nn.CrossEntropyLoss()

        total_loss = 0.0
        correct_sum = 0
        num_batch = data_loader.__len__()
        num_sample = data_loader.dataset.__len__()

        # for step, batch in enumerate(tqdm(data_loader, unit="batch", ncols=100, desc="Training process: ")):
        for step, batch in enumerate(data_loader):
            start_t = time.time()

            # Fetch the labels and inputs and move them to the GPU if configured
            labels = batch['label'].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch['label']

            tokens = batch['tokens'].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch['tokens']
            segment_ids = batch['segment_ids'].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch[
                'segment_ids']
            attention_mask = batch["attention_mask"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else \
            batch["attention_mask"]

            # Get the model output
            output = model(tokens, segment_ids, attention_mask)
            # Compute the loss
            loss = loss_criterion(output, labels)
            if gradient_accumulation_steps > 1:
                loss /= gradient_accumulation_steps
            # Backpropagate
            loss.backward()

            if (step + 1) % gradient_accumulation_steps == 0:
                # Update the parameters
                optimizer.step()
                # Clear the gradients
                model.zero_grad()
                scheduler.step()

            loss_value = loss.item()
            _, top_index = output.topk(1)
            top_index = top_index.cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else top_index
            labels = labels.cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else labels
            correct_sum += (top_index.view(-1) == labels).sum().item()
            total_loss += loss_value

            cost_t = time.time() - start_t
            LoggerHelper.info("step: {}\tloss: {:.2f}\ttime(s): {:.2f}".format(step, loss, cost_t))

        LoggerHelper.info("Total Training Samples: " + str(num_sample))
        LoggerHelper.info("Correct Prediction: " + str(correct_sum))
        LoggerHelper.info("Error Rate: " + format(1 - (correct_sum / num_sample), "0.4f"))

        return total_loss / num_batch
Example #6
def prepare(function, config_name, gpu_ids, saved_model, pretrain_model=''):
    """
    模型准备函数
    完成模型训练、验证和预测的各项准备工作
    :param function:        执行功能名称
    :param config_name:     配置名称
    :param gpu_ids:         GPU ID列表字符串(逗号分隔)
    :param saved_model:     模型保存目录名称
    :return:
    """
    ################
    #    Config    #
    ################

    # Load the configuration file corresponding to the given function
    config_file_path = os.path.join(
        base_dir, 'config') + '/' + config_name + '.' + function + '.yaml'
    with open(config_file_path, 'r') as conf_file:
        config = yaml.load(conf_file, Loader=yaml.FullLoader)

    # Set the model save path
    config['save_path'] = os.path.join(base_dir, 'saved_model',
                                       config['instance_name'])
    # Override the pretrained model directory if one was given
    if pretrain_model != '':
        config["pretrain_model_dir"] = pretrain_model

    ################
    #    Logging   #
    ################
    # Build the log file path
    log_file_path = os.path.join(
        base_dir, 'log',
        config['instance_name'] + '_' + str(function) + '.log')
    # Configure the logging system
    logger_config(log_file_name=log_file_path,
                  log_level=config['log_level'],
                  need_loghead=False,
                  timed_rotating=True)
    LoggerHelper.info("Loading HyperParameters".center(60, "="))
    LoggerHelper.info(config)
    LoggerHelper.info("Load HyperParameters Done".center(60, "="))

    ################
    #   GPU setup  #
    ################
    # Whether to use the GPU, from the config file
    use_cuda = config["use_cuda"]
    # Master GPU ID, initially unset
    master_gpu_id = None
    # GPU ID list, initially unset
    gpu_id_list = None
    if gpu_ids:
        if len(gpu_ids) == 1:
            master_gpu_id = int(gpu_ids)
        else:
            gpu_id_list = [int(gpu_id) for gpu_id in gpu_ids.split(",")]
            master_gpu_id = gpu_id_list[0]

    ################
    #     Model    #
    ################
    # Initialize the model
    if function == 'probability':
        model = BertForSequenceClassification.from_pretrained(
            config["pretrain_model_dir"],
            num_labels=config["num_labels"],
            output_attentions=False,  # whether the model returns attention weights
            # output_hidden_states = False, # whether the model returns all hidden states
        )
    elif function == 'score':
        model = None

    # Run on GPU if requested
    if use_cuda:
        # Check whether a master GPU ID was provided
        if master_gpu_id is not None:
            # Load a previously saved model if one was given
            if saved_model:
                LoggerHelper.info("Loading Saved Model".center(60, "="))
                LoggerHelper.info("Load saved model from: " + saved_model)
                model.load_state_dict(torch.load(saved_model))
                LoggerHelper.info("Loading Saved Model Done".center(60, "="))

            LoggerHelper.info("GPU training or evaluating.")
            model = model.cuda(int(master_gpu_id))
            # Use multiple GPUs if a GPU ID list was provided
            if gpu_id_list is not None:
                LoggerHelper.info("Multiple GPU training or evaluating.")
                model = torch.nn.DataParallel(model, device_ids=gpu_id_list)
            else:
                LoggerHelper.info("Single GPU training or evaluating.")
    else:
        # Load a previously saved model if one was given
        if saved_model:
            LoggerHelper.info("Loading Saved Model".center(60, "="))
            LoggerHelper.info("Load saved model from: " + saved_model)
            model.load_state_dict(torch.load(saved_model, map_location='cpu'))
            LoggerHelper.info("Loading Saved Model Done".center(60, "="))

    return model, config, master_gpu_id
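A hedged example of calling this prepare variant; the function name, config name and GPU string are placeholder values, and the expected config file would be config/<config_name>.<function>.yaml:

# Illustrative call; 'demo' and the GPU string are placeholders.
model, config, master_gpu_id = prepare(function='probability',
                                       config_name='demo',
                                       gpu_ids='0',
                                       saved_model='',
                                       pretrain_model='')
model.eval()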
Example #7
    def train_model(cls,
                    master_gpu_id,
                    model,
                    optimizer,
                    scheduler,
                    data_loader,
                    gradient_accumulation_steps,
                    use_cuda):
        model.train()

        loss_criterion = nn.CrossEntropyLoss()

        total_loss = 0.0
        correct_sum = 0
        num_batch = data_loader.__len__()
        num_sample = data_loader.dataset.__len__()

        # for step, batch in enumerate(tqdm(data_loader, unit="batch", ncols=100, desc="Training process: ")):
        for step, batch in enumerate(data_loader):
            start_t = time.time()

            labels = batch["label"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["label"]
            asr_label = batch["asr_label"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch[
                "asr_label"]

            tokens = batch['tokens'].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch['tokens']
            segment_ids = batch['segment_ids'].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch[
                'segment_ids']
            attention_mask = batch["attention_mask"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else \
            batch["attention_mask"]

            audio = batch["audio"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["audio"]

            main_output, asr_output = model(tokens, segment_ids, attention_mask, audio)

            main_loss = loss_criterion(main_output, labels)
            asr_loss = loss_criterion(asr_output, asr_label)

            overall_loss = main_loss + asr_loss

            if gradient_accumulation_steps > 1:
                overall_loss /= gradient_accumulation_steps

            overall_loss.backward()

            if (step + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                model.zero_grad()
                scheduler.step()

            loss_value = overall_loss.item()
            _, top_index = main_output.topk(1)
            top_index = top_index.cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else top_index
            labels = labels.cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else labels
            correct_sum += (top_index.view(-1) == labels).sum().item()
            total_loss += loss_value

            cost_t = time.time() - start_t
            LoggerHelper.info("step: {}\tloss: {:.2f}\ttime(s): {:.2f}".format(step, overall_loss, cost_t))

        LoggerHelper.info("Total Training Samples: " + str(num_sample))
        LoggerHelper.info("Correct Prediction: " + str(correct_sum))
        LoggerHelper.info("Error Rate: " + format(1 - (correct_sum / num_sample), "0.4f"))

        return total_loss / num_batch
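The training and evaluation loops above expect each batch to be a dict with label, asr_label, tokens, segment_ids, attention_mask and audio tensors. A minimal stand-in dataset producing that layout (all names, shapes and vocabulary sizes are assumptions for illustration only):

import torch
from torch.utils.data import Dataset

class ToyFluencyDataset(Dataset):
    """Hypothetical dataset mimicking the batch keys used by train_model/eval_model."""

    def __init__(self, num_samples=8, seq_len=32, audio_dim=128):
        self.num_samples = num_samples
        self.seq_len = seq_len
        self.audio_dim = audio_dim

    def __len__(self):
        return self.num_samples

    def __getitem__(self, index):
        return {
            "tokens": torch.randint(0, 1000, (self.seq_len,)),
            "segment_ids": torch.zeros(self.seq_len, dtype=torch.long),
            "attention_mask": torch.ones(self.seq_len, dtype=torch.long),
            "audio": torch.randn(self.audio_dim),
            "label": torch.tensor(index % 2),
            "asr_label": torch.tensor(index % 2),
        }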
Example #8
def main(runtime_config):
    """
    模型环境线下主流程
    :param runtime_config:
    :param call_type: 调用类型:single: 独立调用 interface: 接口调用
    :return:
    """
    ################
    # Runtime args #
    ################
    # Read the runtime arguments
    # function          function to run
    # task_type         task type
    # mode              run mode
    # config_name       configuration name
    # gpu_ids           GPU ID configuration
    # saved_model       path of the saved model weights
    function = runtime_config.function
    task_type = runtime_config.task_type
    mode = runtime_config.mode
    config_name = runtime_config.config_name
    gpu_ids = runtime_config.gpu_ids
    saved_model = runtime_config.saved_model

    saved_model_list = list()

    if saved_model is not None:
        if os.path.isdir(saved_model):
            model_file_list = os.listdir(saved_model)
            for model_file in model_file_list:
                if 'model' in model_file:
                    saved_model_list.append(
                        os.path.join(saved_model, model_file))
        else:
            saved_model_list.append(saved_model)
    else:
        saved_model_list.append(None)

    for saved_model in saved_model_list:
        # Run the model preparation step
        # to obtain the configuration and the model
        model, dataset, config, master_gpu_id, optimizer, scheduler = prepare(
            function, task_type, config_name, gpu_ids, saved_model)

        # Dispatch according to the run mode
        if mode == 'train':
            ################
            #   Training   #
            ################
            LoggerHelper.info("Training".center(60, "="))

            if len(saved_model_list) > 1:
                LoggerHelper.error("The initial model is die".center(60, "="))
                return

            if dataset[0] is not None:
                # Call the training function matching the given function
                if function == 'probability':
                    if task_type == 'single_model_text_gate_merge':
                        from model_core.src.train_gate_merge import train_model

                        train_model(task_type,
                                    config["save_path"],
                                    master_gpu_id,
                                    model,
                                    optimizer,
                                    scheduler,
                                    config["epochs"],
                                    dataset[0],
                                    batch_size=config['train_batch_size'],
                                    gradient_accumulation_steps=config[
                                        "gradient_accumulation_steps"],
                                    use_cuda=config['use_cuda'],
                                    num_workers=config['train_num_workers'],
                                    shuffle=config['train_shuffle'])

                    elif task_type == 'single_model_text_no_pretrain_embedding_share_and_gate_merge':
                        from model_core.src.train_gate_merge_text_no_pretrain import train_model

                        train_model(task_type,
                                    config["save_path"],
                                    master_gpu_id,
                                    model,
                                    optimizer,
                                    scheduler,
                                    config["epochs"],
                                    dataset[0],
                                    batch_size=config['train_batch_size'],
                                    gradient_accumulation_steps=config[
                                        "gradient_accumulation_steps"],
                                    use_cuda=config['use_cuda'],
                                    num_workers=config['train_num_workers'],
                                    shuffle=config['train_shuffle'])

                    elif task_type == 'single_model_audio_lstm_text_gate_merge':
                        from model_core.src.train_audio_gate_merge import train_model

                        train_model(task_type,
                                    config["save_path"],
                                    master_gpu_id,
                                    model,
                                    optimizer,
                                    scheduler,
                                    config["epochs"],
                                    dataset[0],
                                    batch_size=config['train_batch_size'],
                                    gradient_accumulation_steps=config[
                                        "gradient_accumulation_steps"],
                                    use_cuda=config['use_cuda'],
                                    num_workers=config['train_num_workers'],
                                    shuffle=config['train_shuffle'])

                    else:
                        train.train_model(
                            task_type,
                            config["save_path"],
                            master_gpu_id,
                            model,
                            optimizer,
                            scheduler,
                            config["epochs"],
                            dataset[0],
                            batch_size=config['train_batch_size'],
                            gradient_accumulation_steps=config[
                                "gradient_accumulation_steps"],
                            use_cuda=config['use_cuda'],
                            num_workers=config['train_num_workers'],
                            shuffle=config['train_shuffle'])
                elif function == 'score':
                    pass

            LoggerHelper.info("Training Done".center(60, "="))

        elif mode == 'eval':
            ################
            #  Evaluation  #
            ################
            LoggerHelper.info("Evaluating".center(60, "="))
            print(len(dataset[1]))
            if dataset[1] is not None:
                if function == 'probability':
                    eval.eval_model(task_type, master_gpu_id, model,
                                    dataset[1], config["eval_batch_size"],
                                    config["use_cuda"],
                                    config["eval_num_workers"])
                elif function == 'score':
                    pass

            LoggerHelper.info("Evaluating Done".center(60, "="))

        elif mode == 'test':
            pass

        elif mode == 'predict':
            ################
            #  Prediction  #
            ################
            LoggerHelper.info("Predicting".center(60, "="))

            if len(saved_model_list) > 1:
                LoggerHelper.error("The initial model is die".center(60, "="))
                return

            if dataset[2] is not None and config['predict_result_save_path']:
                if function == 'probability':
                    predict_results = predict.predict(
                        master_gpu_id, model, dataset[2],
                        config["predict_batch_size"], config["use_cuda"],
                        config["predict_num_workers"])
                elif function == 'score':
                    predict_results = None
                else:
                    predict_results = None

                predict_result_list = np.array(predict_results)
                result = pd.DataFrame(predict_result_list)
                result.to_csv(config['predict_result_save_path'],
                              index=False,
                              header=False)

            LoggerHelper.info("Predicting Done".center(60, "="))

        # All processing finished
        LoggerHelper.info("All process finished.".center(60, "="))
Example #9
    def train_model(cls,
                    master_gpu_id,
                    model,
                    optimizer,
                    scheduler,
                    data_loader,
                    gradient_accumulation_steps,
                    use_cuda):
        """

        :param master_gpu_id:
        :param model:
        :param optimizer:
        :param scheduler:
        :param data_loader:
        :param gradient_accumulation_steps:
        :param use_cuda:
        :return:
        """
        model.train()

        loss_criterion = nn.CrossEntropyLoss()
        loss_loss_criterion = nn.L1Loss()

        total_loss = 0.0
        correct_sum = 0
        num_batch = data_loader.__len__()
        num_sample = data_loader.dataset.__len__()

        # for step, batch in enumerate(tqdm(data_loader, unit="batch", ncols=100, desc="Training process: ")):
        for step, batch in enumerate(data_loader):
            start_t = time.time()

            # Fetch the labels and audio/text inputs and move them to the GPU if configured
            label_inputs = batch['label'].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch['label']
            asr_label = batch["asr_label"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["asr_label"]

            tokens = batch['tokens'].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch['tokens']
            segment_ids = batch['segment_ids'].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch[
                'segment_ids']
            attention_mask = batch["attention_mask"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["attention_mask"]

            audio_inputs = batch['audio'].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch['audio']
            audio_length = batch['audio_length'].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch['audio_length']

            main_output, asr_output = model(tokens, segment_ids, attention_mask, audio_inputs, audio_length)

            # Compute the main-model loss and the ASR loss
            main_loss = loss_criterion(main_output, label_inputs)
            asr_loss = loss_criterion(asr_output, asr_label)

            # Compute the weighted total loss
            weighted_loss = model.module.loss_weight(main_loss, asr_loss)
            # Average the weighted loss over the accumulation steps
            if gradient_accumulation_steps > 1:
                weighted_loss /= gradient_accumulation_steps

            # Record the losses of the first batch;
            # they are used later to compute the training-rate parameters
            if step == 0:
                initial_asr_loss = asr_loss.detach()
                initial_main_loss = main_loss.detach()

            # At the start of each accumulation cycle
            # clear the accumulated gradients
            if step % gradient_accumulation_steps == 0:
                model.zero_grad()

            # Backpropagate the loss without freeing the computation graph
            weighted_loss.backward(retain_graph=True)

            # Gradient of the weighted ASR loss w.r.t. the ASR branch parameters

            asr_gradient = torch.autograd.grad(model.module.loss_weight.asr_model_weight * asr_loss,
                                               model.module.asr_model.parameters(),
                                               retain_graph=True,
                                               create_graph=True)
            # L2 norm of the gradient w.r.t. the first parameter tensor of the ASR branch
            asr_norms = torch.norm(asr_gradient[0], 2)

            # Gradient of the weighted main loss w.r.t. the main branch parameters
            main_gradient = torch.autograd.grad(model.module.loss_weight.main_model_weight * main_loss,
                                                model.module.main_model.parameters(),
                                                retain_graph=True,
                                                create_graph=True)
            # L2 norm of the gradient w.r.t. the first parameter tensor of the main branch
            main_norms = torch.norm(main_gradient[0], 2)

            # Mean of the two gradient norms
            mean_norm = torch.div(torch.add(asr_norms, main_norms), 2)

            # Ratio of the current ASR loss to the first-batch ASR loss
            asr_loss_ratio = torch.div(asr_loss, initial_asr_loss)
            # Ratio of the current main loss to the first-batch main loss
            main_loss_ratio = torch.div(main_loss.data, initial_main_loss)
            mean_loss_ratio = torch.div(torch.add(asr_loss_ratio, main_loss_ratio), 2)

            # Current relative training rate of the ASR branch
            asr_train_rate = torch.div(asr_loss_ratio, mean_loss_ratio)
            # Current relative training rate of the main branch
            main_train_rate = torch.div(main_loss_ratio, mean_loss_ratio)

            # TODO: read the exponent (0.16) from the hyperparameter config
            asr_loss_target = mean_norm * (asr_train_rate) ** 0.16
            main_loss_target = mean_norm * (main_train_rate) ** 0.16
            asr_loss_target = asr_loss_target.detach()
            main_loss_target = main_loss_target.detach()

            optimizer[1].zero_grad()
            loss_sum = torch.add(loss_loss_criterion(asr_norms, asr_loss_target),
                                 loss_loss_criterion(main_norms, main_loss_target))
            loss_sum.backward()

            optimizer[1].step()

            if (step + 1) % gradient_accumulation_steps == 0:
                optimizer[0].step()
                # scheduler.step()

            loss_value = weighted_loss.item()
            _, top_index = main_output.topk(1)
            top_index = top_index.cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else top_index
            labels = label_inputs.cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else label_inputs
            correct_sum += (top_index.view(-1) == labels).sum().item()
            total_loss += loss_value

            cost_t = time.time() - start_t
            LoggerHelper.info("step: {}\tloss: {:.2f}\ttime(s): {:.2f}".format(step, loss_value, cost_t))

            normalize_coef = 2 / torch.add(model.module.loss_weight.asr_model_weight.data,
                                           model.module.loss_weight.main_model_weight.data)
            model.module.loss_weight.asr_model_weight.data = model.module.loss_weight.asr_model_weight.data * normalize_coef
            model.module.loss_weight.main_model_weight.data = model.module.loss_weight.main_model_weight.data * normalize_coef

            LoggerHelper.info("asr loss weight: {}\tmain loss weight: {}".format(model.module.loss_weight.asr_model_weight.item(), model.module.loss_weight.main_model_weight.item()))

        LoggerHelper.info("Total Training Samples: " + str(num_sample))
        LoggerHelper.info("Correct Prediction: " + str(correct_sum))
        LoggerHelper.info("Error Rate: " + format(1 - (correct_sum / num_sample), "0.4f"))

        return total_loss / num_batch
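model.module.loss_weight is called with the two task losses and exposes learnable asr_model_weight and main_model_weight scalars; the commented-out optimizer setup in Example #11 imports LossWeight from model_core.src.models.loss_weight. A minimal sketch of what such a module could look like (an assumption, not the project's actual implementation):

import torch
import torch.nn as nn

class LossWeight(nn.Module):
    """Sketch of a two-task loss-weighting module with learnable scalar weights."""

    def __init__(self):
        super().__init__()
        self.main_model_weight = nn.Parameter(torch.ones(1))
        self.asr_model_weight = nn.Parameter(torch.ones(1))

    def forward(self, main_loss, asr_loss):
        # Weighted sum of the two task losses.
        return self.main_model_weight * main_loss + self.asr_model_weight * asr_loss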
Example #10
    def train_model(cls, master_gpu_id, model, optimizer, scheduler,
                    data_loader, gradient_accumulation_steps, use_cuda):
        """

        :param master_gpu_id:
        :param model:
        :param optimizer:
        :param scheduler:
        :param data_loader:
        :param gradient_accumulation_steps:
        :param use_cuda:
        :return:
        """
        model.train()

        loss_function = nn.CrossEntropyLoss()
        loss_loss_criterion = nn.L1Loss()

        total_loss = 0.0
        correct_sum = 0
        num_batch = data_loader.__len__()
        num_sample = data_loader.dataset.__len__()

        if hasattr(model, 'module'):
            current_model = model.module
        else:
            current_model = model

        for step, batch in enumerate(data_loader):
            # Record the start time
            start_t = time.time()

            # Unpack the batch
            text_inputs, label_inputs, asr_label_inputs = batch
            # Move the tensors to the configured device
            # Label tensors
            label_inputs = label_inputs.cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else label_inputs
            asr_label_inputs = asr_label_inputs.cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else asr_label_inputs
            # Text token tensors and their lengths
            text_inputs, text_length = text_inputs
            text_inputs = text_inputs.cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else text_inputs
            text_length = text_length.cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else text_length

            # Get the model outputs
            prediction, auxiliary_prediction = model(text_inputs, text_length)
            # Compute the losses
            main_loss = loss_function(prediction, label_inputs)
            auxiliary_loss = loss_function(auxiliary_prediction,
                                           asr_label_inputs)

            # Compute the weighted total loss
            weighted_loss = current_model.loss_weight(main_loss,
                                                      auxiliary_loss)
            # Average the weighted loss over the accumulation steps
            if gradient_accumulation_steps > 1:
                weighted_loss /= gradient_accumulation_steps

            # Record the losses of the first batch;
            # they are used later to compute the training-rate parameters
            if step == 0:
                initial_auxiliary_loss = auxiliary_loss.detach()
                initial_main_loss = main_loss.detach()

            # At the start of each accumulation cycle
            # clear the accumulated gradients
            if step % gradient_accumulation_steps == 0:
                model.zero_grad()

            # Backpropagate the loss without freeing the computation graph
            weighted_loss.backward(retain_graph=True)

            text_embeddings_last_layer_params = list(
                current_model.text_embeddings.parameters())

            # Gradient of the weighted auxiliary (ASR) loss w.r.t. the shared text embeddings
            auxiliary_gradient = torch.autograd.grad(
                current_model.loss_weight.auxiliary_model_weight *
                auxiliary_loss,
                text_embeddings_last_layer_params[-1],
                retain_graph=True,
                create_graph=True)
            # L2 norm of that gradient
            auxiliary_norms = torch.norm(auxiliary_gradient[0], 2)

            # Gradient of the weighted main loss w.r.t. the shared text embeddings
            main_gradient = torch.autograd.grad(
                current_model.loss_weight.main_model_weight * main_loss,
                text_embeddings_last_layer_params[-1],
                retain_graph=True,
                create_graph=True)
            # L2 norm of that gradient
            main_norms = torch.norm(main_gradient[0], 2)

            # Mean of the two gradient norms
            mean_norm = torch.div(torch.add(auxiliary_norms, main_norms), 2)

            # Ratio of the current auxiliary loss to the first-batch auxiliary loss
            # (computed from the loss itself, as in the parallel ASR branch above)
            auxiliary_loss_ratio = torch.div(auxiliary_loss.data,
                                             initial_auxiliary_loss)
            # Ratio of the current main loss to the first-batch main loss
            main_loss_ratio = torch.div(main_loss.data, initial_main_loss)
            mean_loss_ratio = torch.div(
                torch.add(auxiliary_loss_ratio, main_loss_ratio), 2)

            # Current relative training rate of the auxiliary branch
            auxiliary_train_rate = torch.div(auxiliary_loss_ratio,
                                             mean_loss_ratio)
            # Current relative training rate of the main branch
            main_train_rate = torch.div(main_loss_ratio, mean_loss_ratio)

            # TODO: read the exponent (0.16) from the hyperparameter config
            auxiliary_loss_target = mean_norm * (auxiliary_train_rate)**0.16
            main_loss_target = mean_norm * (main_train_rate)**0.16
            auxiliary_loss_target = auxiliary_loss_target.detach()
            main_loss_target = main_loss_target.detach()

            optimizer[1].zero_grad()
            loss_sum = torch.add(
                loss_loss_criterion(auxiliary_norms, auxiliary_loss_target),
                loss_loss_criterion(main_norms, main_loss_target))
            loss_sum.backward()

            optimizer[1].step()

            if (step + 1) % gradient_accumulation_steps == 0:
                optimizer[0].step()
                #scheduler.step()

            loss_value = weighted_loss.item()
            _, top_index = prediction.topk(1)
            top_index = top_index.cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else top_index
            correct_sum += (top_index.view(-1) == label_inputs).sum().item()
            total_loss += loss_value

            cost_t = time.time() - start_t
            LoggerHelper.info(
                "step: {}\tloss: {:.2f}\tmain_loss: {:.2f}\tasr_loss: {:.2f}\ttime(s): {:.2f}"
                .format(step, loss_value, main_loss, auxiliary_loss, cost_t))

            normalize_coef = 2 / torch.add(
                current_model.loss_weight.auxiliary_model_weight.data,
                current_model.loss_weight.main_model_weight.data)
            current_model.loss_weight.auxiliary_model_weight.data = current_model.loss_weight.auxiliary_model_weight.data * normalize_coef
            current_model.loss_weight.main_model_weight.data = current_model.loss_weight.main_model_weight.data * normalize_coef

            LoggerHelper.info(
                "asr loss weight: {}\tmain loss weight: {}".format(
                    current_model.loss_weight.auxiliary_model_weight.item(),
                    current_model.loss_weight.main_model_weight.item()))

        LoggerHelper.info("Total Training Samples: " + str(num_sample))
        LoggerHelper.info("Correct Prediction: " + str(correct_sum))
        LoggerHelper.info("Error Rate: " +
                          format(1 - (correct_sum / num_sample), "0.4f"))

        return total_loss / num_batch
Example #11
def prepare(function, task_type, config_name, gpu_ids, saved_model):
    """
    模型准备函数
    完成模型训练、验证和预测的各项准备工作
    :param function:        执行功能名称
    :param config_name:     配置名称
    :param gpu_ids:         GPU ID列表字符串(逗号分隔)
    :param saved_model:     模型保存目录名称
    :return:
    """
    ################
    #    Config    #
    ################

    # Load the configuration file corresponding to the given function
    config_file_path = os.path.join(
        base_dir, 'config') + '/' + config_name + '.' + function + '.yaml'
    with open(config_file_path, 'r') as conf_file:
        config = yaml.load(conf_file, Loader=yaml.FullLoader)

    # Set the model save path
    config['save_path'] = os.path.join(base_dir, 'saved_model',
                                       config['instance_name'])

    ################
    #    Logging   #
    ################
    # Build the log file path
    log_file_path = os.path.join(
        base_dir, 'log',
        config['instance_name'] + '_' + str(function) + '.log')
    # Configure the logging system
    logger_config(log_file_name=log_file_path,
                  log_level=config['log_level'],
                  need_loghead=False,
                  timed_rotating=True)
    LoggerHelper.info("Loading HyperParameters".center(60, "="))
    LoggerHelper.info(config)
    LoggerHelper.info("Load HyperParameters Done".center(60, "="))

    ################
    #   GPU setup  #
    ################
    # Whether to use the GPU, from the config file
    use_cuda = config["use_cuda"]
    # Master GPU ID, initially unset
    master_gpu_id = None
    # GPU ID list, initially unset
    gpu_id_list = None
    if gpu_ids:
        if len(gpu_ids) == 1:
            master_gpu_id = int(gpu_ids)
        else:
            gpu_id_list = [int(gpu_id) for gpu_id in gpu_ids.split(",")]
            master_gpu_id = gpu_id_list[0]

    ################
    #     Model    #
    ################
    # Initialize the model
    if function == 'probability':
        # TODO
        # Dispatch to the right model dynamically via a task router and a unified config
        # task_router = TaskRouter()
        # task_dict =

        if task_type == 'single_model_audio':
            model = AudioSingleModel()

        elif task_type == 'single_model_audio_lstm':
            model = AudioSingleModelBasedOnLSTM(
                config_file_path=os.path.join(base_dir, 'config') + '/' +
                config['didi_multimodel_config_file'])

        elif task_type == 'single_model_audio_lstm_text_gate_merge':
            model = AudioSingleModelBasedOnLSTMTextGateMerge(
                config_file_path=os.path.join(base_dir, 'config') + '/' +
                config['didi_multimodel_config_file'],
                asr_pretrain_model=config['text_pretrain_model'],
                asr_embedding_dim=768,
                audio_embedding_dim=12288)

        elif task_type == 'single_model_audio_gate_merge':
            model = AudioSingleModelAndTextGateMerge(
                config['text_pretrain_model'],
                asr_embedding_dim=768,
                audio_pretrain_model=config['audio_pretrain_model'],
                audio_embedding_dim=1280)

        elif task_type == 'single_model_text_bert':
            model = TextSingleModelBasedOnBert(config['text_pretrain_model'])
            # model = BertForSequenceClassification.from_pretrained(config["text_pretrain_model_dir"],
            #                                                   num_labels=config["num_labels"],
            #                                                   output_attentions=False,  # whether the model returns attention weights
            #                                                   # output_hidden_states = False, # whether the model returns all hidden states
            #     )

        elif task_type == 'single_model_text_gate_merge':
            model = TextSingleModelAndTextGateMerge(
                config['text_pretrain_model'], asr_embedding_dim=768)

        elif task_type == 'multimodel_embedding_fuse_text_bert':
            model = EmbeddingShareMultimodel(config['text_pretrain_model'],
                                             asr_embedding_dim=768,
                                             audio_embedding_dim=1280)

        elif task_type == 'multimodel_feature_fuse_text_bert_gate_merge':
            model = OutputGateMergeMultimodel(config['text_pretrain_model'],
                                              asr_embedding_dim=768,
                                              audio_embedding_dim=1280)

        # elif task_type == 'multimodel_hybrid':
        #     model = EmbeddingShareAndOutputMergeMultimodel(config['text_pretrain_model'],
        #                                                    asr_embedding_dim=768,
        #                                                    audio_pretrain_model=config['audio_pretrain_model'],
        #                                                    audio_embedding_dim=1280)

        elif task_type == 'multimodel_didi':
            model = DiDiMultimodel(
                config_file_path=os.path.join(base_dir, 'config') + '/' +
                config['didi_multimodel_config_file'])

        elif task_type == 'multimodel_didi_pretrain':
            pass
            #model = DiDiMultimodelPretrain(config_file_path=os.path.join(base_dir, 'config') + '/' + config['didi_multimodel_config_file'],
            #asr_pretrain_model=config['text_pretrain_model'])

        elif task_type == 'single_model_text_no_pretrain':
            model = TextSingleModelBasedNoPretrain(
                config_file_path=os.path.join(base_dir, 'config') + '/' +
                config['didi_multimodel_config_file'])

        elif task_type == 'single_model_text_no_pretrain_embedding_share_and_gate_merge':
            model = TextSingleModelBasedNoPretrainEmbeddingShareAndGateMerge(
                config_file_path=os.path.join(base_dir, 'config') + '/' +
                config['didi_multimodel_config_file'])

        elif task_type == 'single_model_audio_no_pretrain':
            model = AudioSingleModelNoPretrain(
                config_file_path=os.path.join(base_dir, 'config') + '/' +
                config['didi_multimodel_config_file'])

        elif task_type == 'single_model_audio_no_pretrain_gate_merge':
            model = AudioSingleModelNoPretrainGateMerge(
                config_file_path=os.path.join(base_dir, 'config') + '/' +
                config['didi_multimodel_config_file'])

        elif task_type == 'multimodel_didi_embedding_share':
            model = DiDiMultimodelEmbeddingShare(
                config_file_path=os.path.join(base_dir, 'config') + '/' +
                config['didi_multimodel_config_file'])

        elif task_type == 'multimodel_didi_embedding_share_and_output_merge':
            model = DiDiMultimodelEmbeddingShareAndOutputMerge(
                config_file_path=os.path.join(base_dir, 'config') + '/' +
                config['didi_multimodel_config_file'])

    elif function == 'score':
        model = None

    # Run on GPU if requested and a master GPU ID was provided
    if use_cuda and master_gpu_id is not None:
        # Load a previously saved model if one was given
        if saved_model:
            LoggerHelper.info("Loading Saved Model".center(60, "="))
            LoggerHelper.info("Load saved model from: " + saved_model)
            # model.load_state_dict(torch.load(saved_model))
            model.load_state_dict({
                k.replace('module.', ''): v
                for k, v in torch.load(saved_model).items()
            })
            LoggerHelper.info("Loading Saved Model Done".center(60, "="))

        LoggerHelper.info("GPU training or evaluating.")
        model = model.cuda(int(master_gpu_id))
        # Use multiple GPUs if a GPU ID list was provided
        if gpu_id_list is not None:
            LoggerHelper.info("Multiple GPU training or evaluating.")
            model = torch.nn.DataParallel(model, device_ids=gpu_id_list)
        else:
            LoggerHelper.info("Single GPU training or evaluating.")
    else:
        # Load a previously saved model if one was given
        if saved_model:
            LoggerHelper.info("Loading Saved Model".center(60, "="))
            LoggerHelper.info("Load saved model from: " + saved_model)
            model.load_state_dict(torch.load(saved_model, map_location='cpu'))
            LoggerHelper.info("Loading Saved Model Done".center(60, "="))

    ################
    #    Dataset   #
    ################
    LoggerHelper.info("Loading Dataset".center(60, "="))
    train_dataset = None
    eval_dataset = None
    predict_dataset = None
    # Load the datasets corresponding to the given function
    if function == 'probability':
        if config['train_dataset_path'] and os.path.exists(
                config['train_dataset_path']):
            if task_type in (
                    'multimodel_didi', 'multimodel_didi_embedding_share',
                    'multimodel_didi_embedding_share_and_output_merge',
                    'single_model_audio_no_pretrain_gate_merge'):
                glove = vocab.GloVe(
                    name='6B',
                    dim=300,
                    cache=config['didi_multimodel_vocabulary_dict'])
                vocabulary_dict = glove.stoi

                train_dataset = DiDiDataset(
                    data=config['train_dataset_path'],
                    audio_dir=config['train_audio_dir'],
                    vocabulary_dict=vocabulary_dict,
                    audio_length=config['didi_multimodel_audio_length'])
            elif task_type in (
                    'single_model_text_no_pretrain',
                    'single_model_text_no_pretrain_embedding_share_and_gate_merge'
            ):
                glove = vocab.GloVe(
                    name='6B',
                    dim=300,
                    cache=config['didi_multimodel_vocabulary_dict'])
                vocabulary_dict = glove.stoi

                train_dataset = DiDiDatasetText(
                    data=config['train_dataset_path'],
                    vocabulary_dict=vocabulary_dict)

            elif task_type == 'single_model_audio_no_pretrain':
                train_dataset = DiDiDatasetAudio(
                    data=config['train_dataset_path'],
                    audio_dir=config['train_audio_dir'],
                    audio_length=config['didi_multimodel_audio_length'])

            else:
                train_dataset = FluencyDataset(
                    data=config['train_dataset_path'],
                    task_type=task_type,
                    audio_dir=config['train_audio_dir'],
                    max_seq_len=config['max_seq_len'],
                    asr_pretrain_model=config['text_pretrain_model'],
                    audio_pretrain_model=config['audio_pretrain_model'],
                    predict=False,
                    cache=config['cache'],
                    temp_dir=config['temp_dir'])

        if config['eval_dataset_path'] and os.path.exists(
                config['eval_dataset_path']):
            if task_type in (
                    'multimodel_didi', 'multimodel_didi_embedding_share',
                    'multimodel_didi_embedding_share_and_output_merge',
                    'single_model_audio_no_pretrain_gate_merge'):
                glove = vocab.GloVe(
                    name='6B',
                    dim=300,
                    cache=config['didi_multimodel_vocabulary_dict'])
                vocabulary_dict = glove.stoi

                eval_dataset = DiDiDataset(
                    data=config['eval_dataset_path'],
                    audio_dir=config['eval_audio_dir'],
                    vocabulary_dict=vocabulary_dict,
                    audio_length=config['didi_multimodel_audio_length'])

            elif task_type in (
                    'single_model_text_no_pretrain',
                    'single_model_text_no_pretrain_embedding_share_and_gate_merge'
            ):
                glove = vocab.GloVe(
                    name='6B',
                    dim=300,
                    cache=config['didi_multimodel_vocabulary_dict'])
                vocabulary_dict = glove.stoi

                eval_dataset = DiDiDatasetText(
                    data=config['eval_dataset_path'],
                    vocabulary_dict=vocabulary_dict)

            elif task_type == 'single_model_audio_no_pretrain':
                eval_dataset = DiDiDatasetAudio(
                    data=config['eval_dataset_path'],
                    audio_dir=config['eval_audio_dir'],
                    audio_length=config['didi_multimodel_audio_length'])

            else:
                eval_dataset = FluencyDataset(
                    data=config['eval_dataset_path'],
                    task_type=task_type,
                    audio_dir=config['eval_audio_dir'],
                    max_seq_len=config['max_seq_len'],
                    asr_pretrain_model=config['text_pretrain_model'],
                    audio_pretrain_model=config['audio_pretrain_model'],
                    predict=False,
                    cache=config['cache'],
                    temp_dir=config['temp_dir'])

        if config['predict_dataset_path'] and os.path.exists(
                config['predict_dataset_path']):
            pass
            # predict_dataset = AudioFluencyDataset(data=config['predict_dataset_path'],
            #                                       audio_dir=config['predict_audio_path'],
            #                                       max_seq_len=config['max_seq_len'],
            #                                       audio_pretrain_model_dir=config['audio_pretrain_model_dir'],
            #                                       text_pretrain_model_dir=config['text_pretrain_model_dir'])
    elif function == 'score':
        pass

    LoggerHelper.info("Loading Dataset Done".center(60, "="))

    ################
    #   Optimizer  #
    ################
    optimizer = None

    if task_type == 'single_model_audio_gate_merge':
        pass
        # loss_params_id = list()
        # loss_params = list()
        #
        # from model_core.src.models.loss_weight import LossWeight
        # for m in model.modules():
        #     if isinstance(m, LossWeight):
        #         loss_params_id += list(map(id, m.parameters()))
        #         loss_params += m.parameters()
        #
        # base_params = list(filter(lambda p: id(p) not in loss_params_id, model.parameters()))
        #
        # base_optimizer = AdamW(base_params, lr=config['lr'])
        # loss_optimizer = AdamW(loss_params, lr=config['lr'])
        # optimizer = [base_optimizer, loss_optimizer]

    else:
        optimizer = AdamW(
            model.parameters(),
            lr=config['lr'],  # args.learning_rate - default is 5e-5
            # eps = 1e-8 # args.adam_epsilon  - default is 1e-8
        )

    ################
    #   Scheduler  #
    ################
    scheduler = None

    if task_type == 'single_model_audio_gate_merge':
        pass

    elif task_type in (
            'single_model_text_no_pretrain_embedding_share_and_gate_merge',
            'single_model_audio_no_pretrain_gate_merge',
            'single_model_text_gate_merge',
            'single_model_audio_lstm_text_gate_merge'):
        pass

    else:
        total_steps = None
        if train_dataset is not None:
            total_steps = train_dataset.__len__() * config["epochs"]
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    return model, [train_dataset, eval_dataset, predict_dataset
                   ], config, master_gpu_id, optimizer, scheduler
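A hedged example of unpacking what this prepare variant returns (compare the call in the offline main flow of Example #8); all concrete argument values below are placeholders:

# Illustrative only; argument values are placeholders.
model, dataset, config, master_gpu_id, optimizer, scheduler = prepare(
    function='probability',
    task_type='single_model_text_bert',
    config_name='demo',
    gpu_ids='0',
    saved_model=None)
train_dataset, eval_dataset, predict_dataset = dataset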
Example #12
    def train_model(cls, master_gpu_id, model, optimizer, scheduler,
                    data_loader, gradient_accumulation_steps, use_cuda):
        """

        :param master_gpu_id:
        :param model:
        :param optimizer:
        :param scheduler:
        :param data_loader:
        :param gradient_accumulation_steps:
        :param use_cuda:
        :return:
        """
        model.train()

        loss_function = nn.CrossEntropyLoss()

        total_loss = 0.0
        correct_sum = 0
        num_batch = data_loader.__len__()
        num_sample = data_loader.dataset.__len__()

        for step, batch in enumerate(data_loader):
            start_t = time.time()

            text_inputs, label_inputs, _ = batch

            label_inputs = label_inputs.cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else label_inputs

            text_inputs, text_length = text_inputs
            text_inputs = text_inputs.cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else text_inputs
            text_length = text_length.cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else text_length

            prediction = model(text_inputs, text_length)
            loss = loss_function(prediction, label_inputs)

            if gradient_accumulation_steps > 1:
                loss /= gradient_accumulation_steps

            loss.backward()

            if (step + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                model.zero_grad()
                scheduler.step()

            loss_value = loss.item()
            _, top_index = prediction.topk(1)
            top_index = top_index.cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else top_index
            labels = label_inputs.cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else label_inputs
            correct_sum += (top_index.view(-1) == labels).sum().item()
            total_loss += loss_value

            cost_t = time.time() - start_t
            LoggerHelper.info("step: {}\tloss: {:.2f}\ttime(s): {:.2f}".format(
                step, loss, cost_t))

        LoggerHelper.info("Total Training Samples: " + str(num_sample))
        LoggerHelper.info("Correct Prediction: " + str(correct_sum))
        LoggerHelper.info("Error Rate: " +
                          format(1 - (correct_sum / num_sample), "0.4f"))

        return total_loss / num_batch
Example #13
def train_model(task_type,
                model_save_path,
                master_gpu_id,
                model,
                optimizer,
                scheduler,
                epochs,
                train_dataset,
                batch_size,
                gradient_accumulation_steps=1,
                use_cuda=False,
                num_workers=1,
                shuffle=True):
    """
    Model training entry point.
    :param task_type:
    :param model_save_path:
    :param master_gpu_id:
    :param model:
    :param optimizer:
    :param scheduler:
    :param epochs:
    :param train_dataset:
    :param batch_size:
    :param gradient_accumulation_steps:
    :param use_cuda:
    :param num_workers:
    :param shuffle:
    :return:
    """
    LoggerHelper.info("Start Training".center(60, "="))

    if task_type in ('multimodel_didi', 'multimodel_didi_embedding_share',
                     'multimodel_didi_embedding_share_and_output_merge',
                     'single_model_audio_no_pretrain_gate_merge'):
        from model_core.src.data.didi_dataset import DiDiDataset

        train_loader = DataLoader(dataset=train_dataset,
                                  pin_memory=use_cuda,
                                  batch_size=batch_size,
                                  num_workers=num_workers,
                                  shuffle=shuffle,
                                  collate_fn=DiDiDataset.collate)

    elif task_type in (
            'single_model_text_no_pretrain',
            'single_model_text_no_pretrain_embedding_share_and_gate_merge'):
        from model_core.src.data.didi_dataset_text import DiDiDatasetText

        train_loader = DataLoader(dataset=train_dataset,
                                  pin_memory=use_cuda,
                                  batch_size=batch_size,
                                  num_workers=num_workers,
                                  shuffle=shuffle,
                                  collate_fn=DiDiDatasetText.collate)

    elif task_type == 'single_model_audio_no_pretrain':
        from model_core.src.data.didi_dataset_audio import DiDiDatasetAudio

        train_loader = DataLoader(dataset=train_dataset,
                                  pin_memory=use_cuda,
                                  batch_size=batch_size,
                                  num_workers=num_workers,
                                  shuffle=shuffle,
                                  collate_fn=DiDiDatasetAudio.collate)

    else:
        train_loader = DataLoader(dataset=train_dataset,
                                  pin_memory=use_cuda,
                                  batch_size=batch_size,
                                  num_workers=num_workers,
                                  shuffle=shuffle)
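    # Note: DiDiDataset.collate, DiDiDatasetText.collate and DiDiDatasetAudio.collate
    # are project-specific collate functions (presumably padding variable-length
    # token / audio sequences into batch tensors); every other task type falls back
    # to the default collate_fn.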

    for epoch in range(1, epochs + 1):
        LoggerHelper.info("Training Epoch: " + str(epoch))

        # avg_loss = train_epoch(master_gpu_id,
        #                        model,
        #                        optimizer,
        #                        scheduler,
        #                        train_loader,
        #                        gradient_accumulation_steps,
        #                        use_cuda)

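        # Every branch below follows the same pattern: run one epoch through the
        # task-specific trainer's train_model(...), log the average loss, and save
        # a checkpoint; 'multimodel_hybrid' and 'multimodel_didi_pretrain' are
        # currently no-ops.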
        if task_type == 'single_model_audio':
            avg_loss = AudioSingleModel.train_model(
                master_gpu_id, model, optimizer, scheduler, train_loader,
                gradient_accumulation_steps, use_cuda)

            LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            save_model(model_save_path, model, epoch)

        elif task_type == 'single_model_audio_lstm':
            avg_loss = AudioSingleModelBasedOnLSTM.train_model(
                master_gpu_id, model, optimizer, scheduler, train_loader,
                gradient_accumulation_steps, use_cuda)

            LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            save_model(model_save_path, model, epoch)

        elif task_type == 'single_model_audio_gate_merge':
            avg_loss = AudioSingleModelAndTextGateMerge.train_model(
                master_gpu_id, model, optimizer, scheduler, train_loader,
                gradient_accumulation_steps, use_cuda)

            LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            save_model(model_save_path, model, epoch)

        elif task_type == 'single_model_text_bert':
            avg_loss = TextSingleModelBasedOnBert.train_model(
                master_gpu_id, model, optimizer, scheduler, train_loader,
                gradient_accumulation_steps, use_cuda)

            LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            save_model(model_save_path, model, epoch)

        elif task_type == 'single_model_text_gate_merge':
            avg_loss = TextSingleModelAndTextGateMerge.train_model(
                master_gpu_id, model, optimizer, scheduler, train_loader,
                gradient_accumulation_steps, use_cuda)

            LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            save_model(model_save_path, model[0], epoch)

        elif task_type == 'multimodel_embedding_fuse_text_bert':
            avg_loss = EmbeddingShareMultimodel.train_model(
                master_gpu_id, model, optimizer, scheduler, train_loader,
                gradient_accumulation_steps, use_cuda)

            LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            save_model(model_save_path, model, epoch)

        elif task_type == 'multimodel_feature_fuse_text_bert_gate_merge':
            avg_loss = OutputGateMergeMultimodel.train_model(
                master_gpu_id, model, optimizer, scheduler, train_loader,
                gradient_accumulation_steps, use_cuda)

            LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            save_model(model_save_path, model, epoch)

        elif task_type == 'multimodel_hybrid':
            pass
            # avg_loss = EmbeddingShareAndOutputMergeMultimodel.train_model(master_gpu_id,
            #                                                               model,
            #                                                               optimizer,
            #                                                               scheduler,
            #                                                               train_loader,
            #                                                               gradient_accumulation_steps,
            #                                                               use_cuda)
            #
            # LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            # save_model(model_save_path, model, epoch)

        elif task_type == 'multimodel_didi':
            avg_loss = DiDiMultimodel.train_model(master_gpu_id, model,
                                                  optimizer, scheduler,
                                                  train_loader,
                                                  gradient_accumulation_steps,
                                                  use_cuda)

            LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            save_model(model_save_path, model, epoch)

        elif task_type == 'multimodel_didi_pretrain':
            pass
            #avg_loss = DiDiMultimodelPretrain.train_model(master_gpu_id,
            #                                              model,
            #                                              optimizer,
            #                                              scheduler,
            #                                              train_loader,
            #                                              gradient_accumulation_steps,
            #                                              use_cuda)

        elif task_type == 'single_model_text_no_pretrain':
            avg_loss = TextSingleModelBasedNoPretrain.train_model(
                master_gpu_id, model, optimizer, scheduler, train_loader,
                gradient_accumulation_steps, use_cuda)

            LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            save_model(model_save_path, model, epoch)

        elif task_type == 'single_model_text_no_pretrain_embedding_share_and_gate_merge':
            avg_loss = TextSingleModelBasedNoPretrainEmbeddingShareAndGateMerge.train_model(
                master_gpu_id, model, optimizer, scheduler, train_loader,
                gradient_accumulation_steps, use_cuda)

            LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            save_model(model_save_path, model, epoch)

        elif task_type == 'single_model_audio_no_pretrain':
            avg_loss = AudioSingleModelNoPretrain.train_model(
                master_gpu_id, model, optimizer, scheduler, train_loader,
                gradient_accumulation_steps, use_cuda)

            LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            save_model(model_save_path, model, epoch)

        elif task_type == 'single_model_audio_no_pretrain_gate_merge':
            avg_loss = AudioSingleModelNoPretrainGateMerge.train_model(
                master_gpu_id, model, optimizer, scheduler, train_loader,
                gradient_accumulation_steps, use_cuda)

            LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            save_model(model_save_path, model, epoch)

        elif task_type == 'multimodel_didi_embedding_share':
            avg_loss = DiDiMultimodelEmbeddingShare.train_model(
                master_gpu_id, model, optimizer, scheduler, train_loader,
                gradient_accumulation_steps, use_cuda)

            LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            save_model(model_save_path, model, epoch)

        elif task_type == 'multimodel_didi_embedding_share_and_output_merge':
            avg_loss = DiDiMultimodelEmbeddingShareAndOutputMerge.train_model(
                master_gpu_id, model, optimizer, scheduler, train_loader,
                gradient_accumulation_steps, use_cuda)

            LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
            save_model(model_save_path, model, epoch)

    LoggerHelper.info("Training Done".center(60, "="))

    return
    def train_model(cls, master_gpu_id, model, optimizer, scheduler,
                    data_loader, gradient_accumulation_steps, use_cuda):
        model.train()

        loss_function = nn.CrossEntropyLoss()

        total_loss = 0.0
        correct_sum = 0
        num_batch = data_loader.__len__()
        num_sample = data_loader.dataset.__len__()

        for step, batch in enumerate(data_loader):
            start_t = time.time()

            labels = batch["label"].cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else batch["label"]
            asr_label = batch["asr_label"].cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else batch["asr_label"]

            tokens = batch['tokens'].cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else batch['tokens']
            segment_ids = batch['segment_ids'].cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else batch[
                'segment_ids']
            attention_mask = batch["attention_mask"].cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else batch[
                "attention_mask"]
            tokens_length = batch['tokens_length'].cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else batch[
                'tokens_length']

            audio_inputs = batch["audio"].cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else batch["audio"]
            audio_length = batch["audio_length"].cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else batch[
                "audio_length"]

            prediction = model(tokens, segment_ids, attention_mask,
                               tokens_length, audio_inputs, audio_length)
            loss = loss_function(prediction, labels)

            if gradient_accumulation_steps > 1:
                loss /= gradient_accumulation_steps

            loss.backward()

            if (step + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                model.zero_grad()
                scheduler.step()

            loss_value = loss.item()
            _, top_index = prediction.topk(1)
            top_index = top_index.cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else top_index
            labels = labels.cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else labels
            correct_sum += (top_index.view(-1) == labels).sum().item()
            total_loss += loss_value

            cost_t = time.time() - start_t
            LoggerHelper.info("step: {}\tloss: {:.2f}\ttime(s): {:.2f}".format(
                step, loss, cost_t))

        LoggerHelper.info("Total Training Samples: " + str(num_sample))
        LoggerHelper.info("Correct Prediction: " + str(correct_sum))
        LoggerHelper.info("Error Rate: " +
                          format(1 - (correct_sum / num_sample), "0.4f"))

        return total_loss / num_batch
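# Minimal sketch of the accuracy bookkeeping used above: topk(1) returns the index
# of the highest-scoring class per sample, which is compared against the labels.
# The tensors below are illustrative stand-ins.
import torch

predictions = torch.tensor([[0.2, 0.8],
                            [0.9, 0.1],
                            [0.4, 0.6]])
labels = torch.tensor([1, 0, 0])
_, top_index = predictions.topk(1)
correct_sum = (top_index.view(-1) == labels).sum().item()   # -> 2 correct predictions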
Example #15
def train_model(task_type,
                model_save_path,
                master_gpu_id,
                model,
                optimizer,
                scheduler,
                epochs,
                train_dataset,
                batch_size,
                gradient_accumulation_steps=1,
                use_cuda=False,
                num_workers=1,
                shuffle=True):
    """
    Model training
    :param task_type:
    :param model_save_path:
    :param master_gpu_id:
    :param model:
    :param optimizer:
    :param scheduler:
    :param epochs:
    :param train_dataset:
    :param batch_size:
    :param gradient_accumulation_steps:
    :param use_cuda:
    :param num_workers:
    :param shuffle:
    :return:
    """
    LoggerHelper.info("Start Training".center(60, "="))

    data_loader = DataLoader(dataset=train_dataset,
                             pin_memory=use_cuda,
                             batch_size=batch_size,
                             num_workers=num_workers,
                             shuffle=shuffle)

    model.train()

    loss_criterion = nn.CrossEntropyLoss()
    loss_loss_criterion = nn.L1Loss()

    num_batch = data_loader.__len__()
    num_sample = data_loader.dataset.__len__()

    device = torch.device('cuda:' + str(master_gpu_id)
                          if use_cuda and master_gpu_id is not None else 'cpu')
    main_loss_weight = torch.tensor([1],
                                    dtype=torch.float64,
                                    requires_grad=True,
                                    device=device)
    auxiliary_loss_weight = torch.tensor([1],
                                         dtype=torch.float64,
                                         requires_grad=True,
                                         device=device)

    loss_params = [main_loss_weight, auxiliary_loss_weight]
    loss_optimizer = torch.optim.Adam(
        loss_params, lr=optimizer.state_dict()['param_groups'][0]['lr'])
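    # main_loss_weight and auxiliary_loss_weight are learnable per-task loss weights;
    # loss_optimizer updates them each step so the two losses are balanced dynamically
    # (a GradNorm-style scheme, see the gradient-norm targets computed below).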

    for epoch in range(1, epochs + 1):
        LoggerHelper.info("Training Epoch: " + str(epoch))

        total_loss = 0.0
        correct_sum = 0

        if hasattr(model, 'module'):
            current_model = model.module
        else:
            current_model = model

        # for step, batch in enumerate(tqdm(data_loader, unit="batch", ncols=100, desc="Training process: ")):
        for step, batch in enumerate(data_loader):
            start_t = time.time()

            labels = batch["label"].cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else batch["label"]
            asr_label = batch["asr_label"].cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else batch["asr_label"]

            tokens = batch['tokens'].cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else batch['tokens']
            segment_ids = batch['segment_ids'].cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else batch[
                'segment_ids']
            attention_mask = batch["attention_mask"].cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else batch[
                "attention_mask"]

            audio_inputs = batch['audio'].cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else batch['audio']
            audio_length = batch['audio_length'].cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else batch[
                'audio_length']

            main_output, asr_output = model(tokens, segment_ids,
                                            attention_mask, audio_inputs,
                                            audio_length)

            main_loss = loss_criterion(main_output, labels)
            auxiliary_loss = loss_criterion(asr_output, asr_label)

            # Compute the weighted total loss
            weighted_main_loss = loss_params[0] * main_loss
            weighted_auxiliary_loss = loss_params[1] * auxiliary_loss
            weighted_loss = torch.div(
                torch.add(weighted_main_loss, weighted_auxiliary_loss), 2)
            # Average the weighted loss over the gradient-accumulation steps
            if gradient_accumulation_steps > 1:
                weighted_loss /= gradient_accumulation_steps

            # Record the loss of the first batch;
            # it is used later to compute the training-speed parameters
            if step == 0:
                initial_auxiliary_loss = auxiliary_loss.detach()
                initial_main_loss = main_loss.detach()

            # # Check whether the total accumulation step count has been reached;
            # # if so, clear the accumulated gradients
            # if step % gradient_accumulation_steps == 0:
            #     optimizer.zero_grad()

            # Backpropagate the loss without freeing the computation graph
            optimizer.zero_grad()
            weighted_loss.backward(retain_graph=True)
            optimizer.step()

            asr_embedding_params = list(
                current_model.asr_embedding_model.parameters())

            # Gradient of the ASR (auxiliary) loss w.r.t. the shared FC layer of the ASR embedding model
            auxiliary_gradient = torch.autograd.grad(weighted_auxiliary_loss,
                                                     asr_embedding_params[-2],
                                                     retain_graph=True,
                                                     create_graph=True)
            # Weighted L2 norm of that gradient
            auxiliary_norms = torch.norm(auxiliary_gradient[0], 2)

            # Gradient of the main loss w.r.t. the same shared layer
            main_gradient = torch.autograd.grad(weighted_main_loss,
                                                asr_embedding_params[-2],
                                                retain_graph=True,
                                                create_graph=True)
            # Weighted L2 norm of that gradient
            main_norms = torch.norm(main_gradient[0], 2)

            # Mean of the two gradient norms
            mean_norm = torch.div(torch.add(auxiliary_norms, main_norms), 2)

            # Ratio of the ASR model's current batch loss to its first-batch loss
            auxiliary_loss_ratio = torch.div(auxiliary_loss,
                                             initial_auxiliary_loss)
            # Ratio of the main model's current batch loss to its first-batch loss
            main_loss_ratio = torch.div(main_loss, initial_main_loss)
            mean_loss_ratio = torch.div(
                torch.add(auxiliary_loss_ratio, main_loss_ratio), 2)

            # Current training-rate parameter of the ASR model
            auxiliary_train_rate = torch.div(auxiliary_loss_ratio,
                                             mean_loss_ratio)
            # Current training-rate parameter of the main model
            main_train_rate = torch.div(main_loss_ratio, mean_loss_ratio)

            # TODO: read this hyperparameter (the exponent) from the configuration
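            # GradNorm-style target: the desired gradient norm for each task is
            # mean_norm * (train_rate ** alpha), with the exponent alpha hard-coded
            # to 0.12 below.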
            auxiliary_loss_target = mean_norm * (auxiliary_train_rate)**0.12
            main_loss_target = mean_norm * (main_train_rate)**0.12
            auxiliary_loss_target = auxiliary_loss_target.detach()
            main_loss_target = main_loss_target.detach()

            loss_optimizer.zero_grad()
            loss_sum = torch.add(
                loss_loss_criterion(auxiliary_norms, auxiliary_loss_target),
                loss_loss_criterion(main_norms, main_loss_target))
            loss_sum.backward()

            loss_optimizer.step()

            # if (step + 1) % gradient_accumulation_steps == 0:
            #     optimizer.step()
            #     # scheduler.step()

            loss_value = weighted_loss.item()
            _, top_index = main_output.topk(1)
            top_index = top_index.cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else top_index
            labels = labels.cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else labels
            correct_sum += (top_index.view(-1) == labels).sum().item()
            total_loss += loss_value

            cost_t = time.time() - start_t
            LoggerHelper.info(
                "step: {}\tloss: {:.2f}\tmain_loss: {:.2f}\tasr_loss: {:.2f}\ttime(s): {:.2f}"
                .format(step, loss_value, main_loss.item(),
                        auxiliary_loss.item(), cost_t))

            coef = 2 / torch.add(main_loss_weight, auxiliary_loss_weight)
            loss_params = [
                coef * main_loss_weight, coef * auxiliary_loss_weight
            ]

            LoggerHelper.info(
                "main loss weight: {}\tauxiliary loss weight: {}".format(
                    main_loss_weight.item(), auxiliary_loss_weight.item()))

        LoggerHelper.info("Total Training Samples: " + str(num_sample))
        LoggerHelper.info("Correct Prediction: " + str(correct_sum))
        LoggerHelper.info("Error Rate: " +
                          format(1 - (correct_sum / num_sample), "0.4f"))

        avg_loss = total_loss / num_batch

        LoggerHelper.info("Average Loss: " + format(avg_loss, "0.4f"))
        save_model(model_save_path, model, epoch)

    LoggerHelper.info("Training Done".center(60, "="))

    return
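# Self-contained sketch of the GradNorm-style loss balancing used in the example
# above: two learnable task-loss weights, gradient norms measured at a shared layer,
# targets of the form mean_norm * train_rate ** alpha, and renormalization of the
# weights so they sum to 2. The toy network, random data and alpha value are
# illustrative assumptions, not the original model.
import torch
import torch.nn as nn

def gradnorm_demo(alpha=0.12, steps=3):
    shared = nn.Linear(16, 16)
    head_main = nn.Linear(16, 2)
    head_aux = nn.Linear(16, 2)
    model_params = (list(shared.parameters()) + list(head_main.parameters())
                    + list(head_aux.parameters()))
    optimizer = torch.optim.Adam(model_params, lr=1e-3)

    # Learnable per-task loss weights and their own optimizer.
    w_main = torch.tensor([1.0], requires_grad=True)
    w_aux = torch.tensor([1.0], requires_grad=True)
    loss_optimizer = torch.optim.Adam([w_main, w_aux], lr=1e-3)
    ce, l1 = nn.CrossEntropyLoss(), nn.L1Loss()

    for step in range(steps):
        x = torch.randn(8, 16)
        y_main = torch.randint(0, 2, (8,))
        y_aux = torch.randint(0, 2, (8,))

        h = shared(x)
        loss_main = ce(head_main(h), y_main)
        loss_aux = ce(head_aux(h), y_aux)
        weighted = (w_main * loss_main + w_aux * loss_aux) / 2

        if step == 0:
            init_main, init_aux = loss_main.detach(), loss_aux.detach()

        optimizer.zero_grad()
        weighted.backward(retain_graph=True)

        # Gradient norm of each weighted task loss at the shared layer.
        g_aux = torch.autograd.grad(w_aux * loss_aux, shared.weight,
                                    retain_graph=True, create_graph=True)[0].norm(2)
        g_main = torch.autograd.grad(w_main * loss_main, shared.weight,
                                     retain_graph=True, create_graph=True)[0].norm(2)
        mean_norm = (g_aux + g_main) / 2

        # Loss ratios relative to the first batch and the GradNorm targets.
        ratio_aux = loss_aux.detach() / init_aux
        ratio_main = loss_main.detach() / init_main
        mean_ratio = (ratio_aux + ratio_main) / 2
        target_aux = (mean_norm * (ratio_aux / mean_ratio) ** alpha).detach()
        target_main = (mean_norm * (ratio_main / mean_ratio) ** alpha).detach()

        # Update the loss weights by pulling the gradient norms toward the targets
        # (this backward also deposits second-order terms into the model grads,
        # mirroring the original example).
        loss_optimizer.zero_grad()
        (l1(g_aux, target_aux) + l1(g_main, target_main)).backward()
        optimizer.step()
        loss_optimizer.step()

        # Renormalize the weights so they keep summing to 2.
        with torch.no_grad():
            coef = 2.0 / float(w_main + w_aux)
            w_main.mul_(coef)
            w_aux.mul_(coef)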